  /*
   *	linux/mm/mlock.c
   *
   *  (C) Copyright 1995 Linus Torvalds
   *  (C) Copyright 2002 Christoph Hellwig
   */
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>

#include "internal.h"

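/*
 * can_do_mlock() - may the calling task lock memory at all?
 * CAP_IPC_LOCK holders may always lock; everyone else needs a non-zero
 * RLIMIT_MEMLOCK (the amount actually locked is checked by the callers).
 */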
  int can_do_mlock(void)
  {
  	if (capable(CAP_IPC_LOCK))
  		return 1;
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return 1;
	return 0;
}
EXPORT_SYMBOL(can_do_mlock);

  /*
   * Mlocked pages are marked with PageMlocked() flag for efficient testing
   * in vmscan and, possibly, the fault path; and to support semi-accurate
   * statistics.
   *
   * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
   * be placed on the LRU "unevictable" list, rather than the [in]active lists.
   * The unevictable list is an LRU sibling list to the [in]active lists.
   * PageUnevictable is set to indicate the unevictable state.
   *
   * When lazy mlocking via vmscan, it is important to ensure that the
   * vma's VM_LOCKED status is not concurrently being modified, otherwise we
   * may have mlocked a page that is being munlocked. So lazy mlock must take
   * the mmap_sem for read, and verify that the vma really is locked
   * (see mm/rmap.c).
   */
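
/*
 * A sketch of that check (modelled on the mlock_vma_page() call site in
 * mm/rmap.c; not part of this file): the rmap walker lazily mlocks a page
 * only after re-verifying, under mmap_sem held for read, that the vma is
 * still VM_LOCKED:
 *
 *	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
 *		if (vma->vm_flags & VM_LOCKED)
 *			mlock_vma_page(page);
 *		up_read(&vma->vm_mm->mmap_sem);
 *	}
 */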
  
  /*
   *  LRU accounting for clear_page_mlock()
   */
  void __clear_page_mlock(struct page *page)
  {
  	VM_BUG_ON(!PageLocked(page));
  
  	if (!page->mapping) {	/* truncated ? */
  		return;
  	}

	dec_zone_page_state(page, NR_MLOCK);
	count_vm_event(UNEVICTABLE_PGCLEARED);
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race: the page has already been moved to
		 * an evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_event(UNEVICTABLE_PGSTRANDED);
	}
}
  
/*
 * Mark page as mlocked if not already.
 * If page is on the LRU, isolate it and put it back so that it is moved
 * to the unevictable list.
 */
  void mlock_vma_page(struct page *page)
  {
  	BUG_ON(!PageLocked(page));

	if (!TestSetPageMlocked(page)) {
		inc_zone_page_state(page, NR_MLOCK);
		count_vm_event(UNEVICTABLE_PGMLOCKED);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/**
 * munlock_vma_page - munlock a vma page
 * @page: page to be unlocked
 *
 * Called from munlock()/munmap() path with page supposedly on the LRU.
 * When we munlock a page, because the vma where we found the page is being
 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
 * page locked so that we can leave it on the unevictable lru list and not
 * bother vmscan with it.  However, to walk the page's rmap list in
 * try_to_munlock() we must isolate the page from the LRU.  If some other
 * task has removed the page from the LRU, we won't be able to do that.
 * So we clear the PageMlocked as we might not get another chance.  If we
 * can't isolate the page, we leave it for putback_lru_page() and vmscan
 * [page_referenced()/try_to_unmap()] to deal with.
 */
void munlock_vma_page(struct page *page)
{
	BUG_ON(!PageLocked(page));

  	if (TestClearPageMlocked(page)) {
  		dec_zone_page_state(page, NR_MLOCK);
  		if (!isolate_lru_page(page)) {
  			int ret = SWAP_AGAIN;
  
  			/*
  			 * Optimization: if the page was mapped just once,
  			 * that's our mapping and we don't need to check all the
  			 * other vmas.
  			 */
  			if (page_mapcount(page) > 1)
  				ret = try_to_munlock(page);
			/*
			 * Did try_to_munlock() succeed or punt?
			 */
			if (ret != SWAP_MLOCK)
  				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
  
  			putback_lru_page(page);
  		} else {
  			/*
  			 * Some other task has removed the page from the LRU.
  			 * putback_lru_page() will take care of removing the
  			 * page from the unevictable list, if necessary.
  			 * vmscan [page_referenced()] will move the page back
  			 * to the unevictable list if some other vma has it
  			 * mlocked.
  			 */
  			if (PageUnevictable(page))
  				count_vm_event(UNEVICTABLE_PGSTRANDED);
  			else
  				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
  		}
  	}
  }

/**
 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
   * @vma:   target vma
   * @start: start address
   * @end:   end address
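 * @nonblocking: if non-NULL, cleared by __get_user_pages() when it drops
 *               mmap_sem while faulting pages in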
 *
 * This takes care of making the pages present too.
 *
 * Returns the number of pages faulted in, or a negative error code on
 * failure.
 *
 * vma->vm_mm->mmap_sem must be held for at least read.
 */
  static long __mlock_vma_pages_range(struct vm_area_struct *vma,
				    unsigned long start, unsigned long end,
				    int *nonblocking)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long addr = start;
	int nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;
  
  	VM_BUG_ON(start & ~PAGE_MASK);
  	VM_BUG_ON(end   & ~PAGE_MASK);
  	VM_BUG_ON(start < vma->vm_start);
  	VM_BUG_ON(end   > vma->vm_end);
	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

  	gup_flags = FOLL_TOUCH | FOLL_MLOCK;
  	/*
  	 * We want to touch writable mappings with a write fault in order
  	 * to break COW, except for shared mappings because these don't COW
  	 * and we would not want to dirty them for nothing.
  	 */
  	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

  	/*
  	 * We want mlock to succeed for regions that have any permissions
  	 * other than PROT_NONE.
  	 */
  	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
  		gup_flags |= FOLL_FORCE;
  	return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
  				NULL, NULL, nonblocking);
  }
  
  /*
   * convert get_user_pages() return value to posix mlock() error
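 *
 * POSIX mandates ENOMEM for an unmapped part of the range and EAGAIN
 * when some or all of the memory could not be locked, hence the
 * remapping below.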
   */
  static int __mlock_posix_error_return(long retval)
  {
  	if (retval == -EFAULT)
  		retval = -ENOMEM;
  	else if (retval == -ENOMEM)
  		retval = -EAGAIN;
  	return retval;
  }

/**
 * mlock_vma_pages_range() - mlock pages in specified vma range.
 * @vma:   the vma containing the specified address range
 * @start: starting address in @vma to mlock
 * @end:   end address [+1] in @vma to mlock
   *
   * For mmap()/mremap()/expansion of mlocked vma.
   *
   * return 0 on success for "normal" vmas.
   *
   * return number of pages [> 0] to be removed from locked_vm on success
   * of "special" vmas.
 */
  long mlock_vma_pages_range(struct vm_area_struct *vma,
  			unsigned long start, unsigned long end)
  {
  	int nr_pages = (end - start) / PAGE_SIZE;
  	BUG_ON(!(vma->vm_flags & VM_LOCKED));
  
  	/*
  	 * filter unlockable vmas
  	 */
  	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
  		goto no_mlock;
  
  	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
  			is_vm_hugetlb_page(vma) ||
			vma == get_gate_vma(current->mm))) {

  		__mlock_vma_pages_range(vma, start, end, NULL);
  
  		/* Hide errors from mmap() and other callers */
  		return 0;
  	}
  
  	/*
  	 * User mapped kernel pages or huge pages:
  	 * make these pages present to populate the ptes, but
	 * fall through to reset VM_LOCKED--no need to unlock, and
	 * return nr_pages so these don't get counted against the task's
	 * locked limit.  Huge pages are already counted against the
	 * locked vm limit.
  	 */
  	make_pages_present(start, end);
  
  no_mlock:
  	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
	return nr_pages;		/* error or pages NOT mlocked */
}

  /*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma:   vma containing range to be munlock()ed.
 * @start: start address in @vma of the range
 * @end:   end of range in @vma.
 *
 * For mremap(), munmap() and exit().
   *
   * Called with @vma VM_LOCKED.
   *
   * Returns with VM_LOCKED cleared.  Callers must be prepared to
   * deal with this.
   *
   * We don't save and restore VM_LOCKED here because pages are
   * still on lru.  In unmap path, pages might be scanned by reclaim
   * and re-mlocked by try_to_{munlock|unmap} before we unmap and
   * free them.  This will result in freeing mlocked pages.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
  {
  	unsigned long addr;
  
  	lru_add_drain();
	vma->vm_flags &= ~VM_LOCKED;
  
  	for (addr = start; addr < end; addr += PAGE_SIZE) {
  		struct page *page;
  		/*
  		 * Although FOLL_DUMP is intended for get_dump_page(),
  		 * it just so happens that its special treatment of the
  		 * ZERO_PAGE (returning an error instead of doing get_page)
  		 * suits munlock very well (and if somehow an abnormal page
  		 * has sneaked into the range, we won't oops here: great).
  		 */
  		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
  		if (page && !IS_ERR(page)) {
			lock_page(page);
  			/*
  			 * Like in __mlock_vma_pages_range(),
  			 * because we lock page here and migration is
  			 * blocked by the elevated reference, we need
  			 * only check for file-cache page truncation.
  			 */
  			if (page->mapping)
  				munlock_vma_page(page);
  			unlock_page(page);
  			put_page(page);
  		}
  		cond_resched();
  	}
  }
  
  /*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
   *
   * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
   * munlock is a no-op.  However, for some special vmas, we go ahead and
   * populate the ptes via make_pages_present().
   *
   * For vmas that pass the filters, merge/split as appropriate.
   */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
  {
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);

  	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
  		goto out;	/* don't set VM_LOCKED,  don't count */
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
  			  vma->vm_file, pgoff, vma_policy(vma));
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  	if (start != vma->vm_start) {
  		ret = split_vma(mm, vma, start, 1);
  		if (ret)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		ret = split_vma(mm, vma, end, 0);
  		if (ret)
  			goto out;
  	}
  
  success:
  	/*
  	 * Keep track of amount of locked VM.
  	 */
  	nr_pages = (end - start) >> PAGE_SHIFT;
  	if (!lock)
  		nr_pages = -nr_pages;
  	mm->locked_vm += nr_pages;
  
  	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
	 */

	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
  	*prev = vma;
  	return ret;
  }
  
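/*
 * Apply @on (lock or unlock) to every vma intersecting [start, start+len),
 * splitting or merging vmas as needed via mlock_fixup().  Caller must hold
 * mmap_sem for write; start and len must already be page aligned.
 */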
  static int do_mlock(unsigned long start, size_t len, int on)
  {
  	unsigned long nstart, end, tmp;
  	struct vm_area_struct * vma, * prev;
  	int error;
  	VM_BUG_ON(start & ~PAGE_MASK);
  	VM_BUG_ON(len != PAGE_ALIGN(len));
  	end = start + len;
  	if (end < start)
  		return -EINVAL;
  	if (end == start)
  		return 0;
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (!vma || vma->vm_start > start)
  		return -ENOMEM;
  
  	if (start > vma->vm_start)
  		prev = vma;
  
  	for (nstart = start ; ; ) {
  		vm_flags_t newflags;
  
  		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
  
  		newflags = vma->vm_flags | VM_LOCKED;
  		if (!on)
  			newflags &= ~VM_LOCKED;
  
  		tmp = vma->vm_end;
  		if (tmp > end)
  			tmp = end;
  		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
  		if (error)
  			break;
  		nstart = tmp;
  		if (nstart < prev->vm_end)
  			nstart = prev->vm_end;
  		if (nstart >= end)
  			break;
  
  		vma = prev->vm_next;
  		if (!vma || vma->vm_start != nstart) {
  			error = -ENOMEM;
  			break;
  		}
  	}
  	return error;
  }
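/*
 * Fault in (and mlock) the pages in [start, start+len).  Takes mmap_sem
 * for read, retaking it whenever __mlock_vma_pages_range() dropped it.
 * With @ignore_errors set (the mlockall() case), per-vma failures are
 * skipped rather than aborting the walk.
 */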
  static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
  {
  	struct mm_struct *mm = current->mm;
  	unsigned long end, nstart, nend;
  	struct vm_area_struct *vma = NULL;
	int locked = 0;
  	int ret = 0;
  
  	VM_BUG_ON(start & ~PAGE_MASK);
  	VM_BUG_ON(len != PAGE_ALIGN(len));
  	end = start + len;
  	for (nstart = start; nstart < end; nstart = nend) {
  		/*
  		 * We want to fault in pages for [nstart; end) address range.
  		 * Find first corresponding VMA.
  		 */
  		if (!locked) {
  			locked = 1;
  			down_read(&mm->mmap_sem);
			vma = find_vma(mm, nstart);
  		} else if (nstart >= vma->vm_end)
  			vma = vma->vm_next;
  		if (!vma || vma->vm_start >= end)
  			break;
  		/*
  		 * Set [nstart; nend) to intersection of desired address
  		 * range with the first VMA. Also, skip undesirable VMA types.
  		 */
  		nend = min(end, vma->vm_end);
  		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
  			continue;
  		if (nstart < vma->vm_start)
  			nstart = vma->vm_start;
  		/*
  		 * Now fault in a range of pages. __mlock_vma_pages_range()
  		 * double checks the vma flags, so that it won't mlock pages
  		 * if the vma was already munlocked.
  		 */
  		ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
  		if (ret < 0) {
  			if (ignore_errors) {
  				ret = 0;
  				continue;	/* continue at next VMA */
  			}
  			ret = __mlock_posix_error_return(ret);
  			break;
  		}
  		nend = nstart + ret * PAGE_SIZE;
  		ret = 0;
	}
  	if (locked)
  		up_read(&mm->mmap_sem);
  	return ret;	/* 0 or negative error code */
  }
  SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
  {
  	unsigned long locked;
  	unsigned long lock_limit;
  	int error = -ENOMEM;
  
  	if (!can_do_mlock())
  		return -EPERM;
  	lru_add_drain_all();	/* flush pagevec */
  	down_write(&current->mm->mmap_sem);
  	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
  	start &= PAGE_MASK;
  
  	locked = len >> PAGE_SHIFT;
  	locked += current->mm->locked_vm;
  	lock_limit = rlimit(RLIMIT_MEMLOCK);
  	lock_limit >>= PAGE_SHIFT;
  
  	/* check against resource limits */
  	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
  		error = do_mlock(start, len, 1);
  	up_write(&current->mm->mmap_sem);
  	if (!error)
  		error = do_mlock_pages(start, len, 0);
  	return error;
  }
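
/*
 * Userspace view (a minimal sketch; buf and length are illustrative):
 * the syscall above rounds the range out to page boundaries, so locking
 * any byte of a page locks the whole page:
 *
 *	if (mlock(buf, length) != 0)
 *		perror("mlock");	(EPERM, EAGAIN or ENOMEM)
 *	...
 *	munlock(buf, length);
 */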
  SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
  {
  	int ret;
  
  	down_write(&current->mm->mmap_sem);
  	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
  	start &= PAGE_MASK;
  	ret = do_mlock(start, len, 0);
  	up_write(&current->mm->mmap_sem);
  	return ret;
  }
  
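/*
 * Set or clear VM_LOCKED on every current vma (MCL_CURRENT), and/or set
 * mm->def_flags so that future mappings are created locked (MCL_FUTURE).
 */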
  static int do_mlockall(int flags)
  {
  	struct vm_area_struct * vma, * prev = NULL;
  	unsigned int def_flags = 0;
  
  	if (flags & MCL_FUTURE)
  		def_flags = VM_LOCKED;
  	current->mm->def_flags = def_flags;
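	/* MCL_FUTURE alone only affects future mappings; skip current vmas */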
  	if (flags == MCL_FUTURE)
  		goto out;
  
  	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
  		vm_flags_t newflags;
  
  		newflags = vma->vm_flags | VM_LOCKED;
  		if (!(flags & MCL_CURRENT))
  			newflags &= ~VM_LOCKED;
  
  		/* Ignore errors */
  		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
  	}
  out:
  	return 0;
  }
  SYSCALL_DEFINE1(mlockall, int, flags)
  {
  	unsigned long lock_limit;
  	int ret = -EINVAL;
  
  	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
  		goto out;
  
  	ret = -EPERM;
  	if (!can_do_mlock())
  		goto out;
  	if (flags & MCL_CURRENT)
  		lru_add_drain_all();	/* flush pagevec */

  	down_write(&current->mm->mmap_sem);
  	lock_limit = rlimit(RLIMIT_MEMLOCK);
  	lock_limit >>= PAGE_SHIFT;
  
  	ret = -ENOMEM;
  	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
  	    capable(CAP_IPC_LOCK))
  		ret = do_mlockall(flags);
  	up_write(&current->mm->mmap_sem);
  	if (!ret && (flags & MCL_CURRENT)) {
  		/* Ignore errors */
  		do_mlock_pages(0, TASK_SIZE, 1);
  	}
  out:
  	return ret;
  }
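
/*
 * Typical use (a sketch): a real-time task pinning its entire address
 * space to avoid page faults:
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
 *		perror("mlockall");
 */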
  SYSCALL_DEFINE0(munlockall)
  {
  	int ret;
  
  	down_write(&current->mm->mmap_sem);
  	ret = do_mlockall(0);
  	up_write(&current->mm->mmap_sem);
  	return ret;
  }
  
  /*
   * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
   * shm segments) get accounted against the user_struct instead.
   */
  static DEFINE_SPINLOCK(shmlock_user_lock);
  
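/*
 * Charge @size (rounded up to whole pages) against @user's locked_shm,
 * refusing if that would exceed RLIMIT_MEMLOCK (CAP_IPC_LOCK or an
 * infinite limit bypasses the check).  Returns 1 if allowed, 0 if not.
 */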
  int user_shm_lock(size_t size, struct user_struct *user)
  {
  	unsigned long lock_limit, locked;
  	int allowed = 0;
  
  	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
  	lock_limit = rlimit(RLIMIT_MEMLOCK);
  	if (lock_limit == RLIM_INFINITY)
  		allowed = 1;
  	lock_limit >>= PAGE_SHIFT;
  	spin_lock(&shmlock_user_lock);
  	if (!allowed &&
  	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
  		goto out;
  	get_uid(user);
  	user->locked_shm += locked;
  	allowed = 1;
  out:
  	spin_unlock(&shmlock_user_lock);
  	return allowed;
  }
  
  void user_shm_unlock(size_t size, struct user_struct *user)
  {
  	spin_lock(&shmlock_user_lock);
  	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
  	spin_unlock(&shmlock_user_lock);
  	free_uid(user);
  }