Blame view

mm/madvise.c 14.2 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
05b743847   Prasanna Meda   [PATCH] madvise: ...
11
  #include <linux/mempolicy.h>
afcf938ee   Andi Kleen   HWPOISON: Add a m...
12
  #include <linux/page-isolation.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
  #include <linux/hugetlb.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
14
  #include <linux/falloc.h>
e8edc6e03   Alexey Dobriyan   Detach sched.h fr...
15
  #include <linux/sched.h>
f8af4da3b   Hugh Dickins   ksm: the mm inter...
16
  #include <linux/ksm.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
17
  #include <linux/fs.h>
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
18
  #include <linux/file.h>
1998cc048   Shaohua Li   mm: make madvise(...
19
20
21
  #include <linux/blkdev.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
  
  /*
0a27a14a6   Nick Piggin   mm: madvise avoid...
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_sem for writing. Others, which simply traverse vmas, need
   * to only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	/*
  	 * These are the explicitly listed exceptions that get by with
  	 * mmap_sem held for reading.  Anything not named here is treated
  	 * conservatively and asks for the write lock.
  	 */
  	if (behavior == MADV_REMOVE ||
  	    behavior == MADV_WILLNEED ||
  	    behavior == MADV_DONTNEED)
  		return 0;

  	return 1;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
43
44
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
45
  static long madvise_behavior(struct vm_area_struct *vma,
05b743847   Prasanna Meda   [PATCH] madvise: ...
46
47
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
  {
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
49
  	struct mm_struct *mm = vma->vm_mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
50
  	int error = 0;
05b743847   Prasanna Meda   [PATCH] madvise: ...
51
  	pgoff_t pgoff;
3866ea90d   Hugh Dickins   ksm: first tidy u...
52
  	unsigned long new_flags = vma->vm_flags;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
53
54
  
  	switch (behavior) {
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
55
56
57
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
58
  	case MADV_SEQUENTIAL:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
59
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
60
61
  		break;
  	case MADV_RANDOM:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
62
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
63
  		break;
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
64
65
66
67
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
3866ea90d   Hugh Dickins   ksm: first tidy u...
68
69
70
71
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
72
  		new_flags &= ~VM_DONTCOPY;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
73
  		break;
accb61fe7   Jason Baron   coredump: add VM_...
74
  	case MADV_DONTDUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
75
  		new_flags |= VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
76
77
  		break;
  	case MADV_DODUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
78
79
80
81
82
  		if (new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
83
  		break;
f8af4da3b   Hugh Dickins   ksm: the mm inter...
84
85
86
87
88
89
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out;
  		break;
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
90
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
91
  	case MADV_NOHUGEPAGE:
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
92
  		error = hugepage_madvise(vma, &new_flags, behavior);
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
93
94
95
  		if (error)
  			goto out;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
96
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
97
98
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
99
  		goto out;
05b743847   Prasanna Meda   [PATCH] madvise: ...
100
101
102
103
104
105
106
107
108
109
110
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  				vma->vm_file, pgoff, vma_policy(vma));
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
111
112
113
114
115
116
117
118
119
120
121
122
  
  	if (start != vma->vm_start) {
  		error = split_vma(mm, vma, start, 1);
  		if (error)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		error = split_vma(mm, vma, end, 0);
  		if (error)
  			goto out;
  	}
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
123
  success:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
124
125
126
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
127
  	vma->vm_flags = new_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
128
129
130
131
132
133
  
  out:
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  	return error;
  }
1998cc048   Shaohua Li   mm: make madvise(...
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
  #ifdef CONFIG_SWAP
  /*
   * pmd_entry callback installed by force_swapin_readahead().
   *
   * Steps through [start, end) one PAGE_SIZE at a time, reads the pte for
   * each address under the page table lock and, when the pte decodes to a
   * swap entry that is not a non_swap_entry(), calls
   * read_swap_cache_async() for it.  Any page returned has its reference
   * dropped again straight away.  walk->private carries the vma.
   *
   * Always returns 0.
   */
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	struct vm_area_struct *vma = walk->private;
  	unsigned long addr;

  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;

  	for (addr = start; addr != end; addr += PAGE_SIZE) {
  		spinlock_t *ptl;
  		pte_t *base;
  		pte_t pte;
  		swp_entry_t entry;
  		struct page *page;

  		/* Copy the pte while locked; the lock is not held below. */
  		base = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = base[(addr - start) / PAGE_SIZE];
  		pte_unmap_unlock(base, ptl);

  		/* Present, empty and file ptes carry no swap entry. */
  		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
  			continue;

  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;

  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
  					     vma, addr);
  		if (page)
  			page_cache_release(page);
  	}

  	return 0;
  }
  
  static void force_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end)
  {
  	struct mm_walk walk = {
  		.mm = vma->vm_mm,
  		.pmd_entry = swapin_walk_pmd_entry,
  		.private = vma,
  	};
  
  	walk_page_range(start, end, &walk);
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  
  /*
   * For every page-sized step of [start, end), look up the matching file
   * offset in @mapping.  If the lookup yields an exceptional radix-tree
   * entry, convert it to a swap entry and call read_swap_cache_async()
   * for it.  Whatever page reference is held at the end of a step is
   * released.  Finishes by draining the LRU add cache.
   */
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	unsigned long addr;

  	for (addr = start; addr < end; addr += PAGE_SIZE) {
  		pgoff_t index = vma->vm_pgoff +
  				((addr - vma->vm_start) >> PAGE_SHIFT);
  		struct page *page = find_get_page(mapping, index);

  		if (radix_tree_exceptional_entry(page)) {
  			swp_entry_t swap = radix_to_swp_entry(page);

  			page = read_swap_cache_async(swap,
  						     GFP_HIGHUSER_MOVABLE,
  						     NULL, 0);
  		}

  		/* Drop the reference from the lookup or from the swap read. */
  		if (page)
  			page_cache_release(page);
  	}

  	/* Push any new pages onto the LRU now */
  	lru_add_drain();
  }
  #endif		/* CONFIG_SWAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
211
212
213
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
214
215
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
216
217
218
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
1998cc048   Shaohua Li   mm: make madvise(...
219
220
221
222
223
224
225
226
227
228
229
  #ifdef CONFIG_SWAP
  	if (!file || mapping_cap_swap_backed(file->f_mapping)) {
  		*prev = vma;
  		if (!file)
  			force_swapin_readahead(vma, start, end);
  		else
  			force_shm_swapin_readahead(vma, start, end,
  						file->f_mapping);
  		return 0;
  	}
  #endif
1bef40032   Suzuki   [PATCH] madvise: ...
230
231
  	if (!file)
  		return -EBADF;
70688e4dd   Nick Piggin   xip: support non-...
232
  	if (file->f_mapping->a_ops->get_xip_mem) {
fe77ba6f4   Carsten Otte   [PATCH] xip: madv...
233
234
235
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
236
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
237
238
239
240
  	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
f7e839dd3   Wu Fengguang   readahead: move m...
241
  	force_page_cache_readahead(file->f_mapping, file, start, end - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
242
243
244
245
246
247
248
  	return 0;
  }
  
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
249
   * zap_page_range call sets things up for shrink_active_list to actually free
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
250
251
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
252
   * shrink_active_list to pick up before reclaiming other pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
253
254
255
256
257
258
259
260
261
262
263
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
264
265
  static long madvise_dontneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
266
267
  			     unsigned long start, unsigned long end)
  {
05b743847   Prasanna Meda   [PATCH] madvise: ...
268
  	*prev = vma;
6aab341e0   Linus Torvalds   mm: re-architect ...
269
  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
270
271
272
273
274
275
276
277
278
279
280
281
  		return -EINVAL;
  
  	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
  		struct zap_details details = {
  			.nonlinear_vma = vma,
  			.last_index = ULONG_MAX,
  		};
  		zap_page_range(vma, start, end - start, &details);
  	} else
  		zap_page_range(vma, start, end - start, NULL);
  	return 0;
  }
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
282
283
284
285
286
287
288
289
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
   *
   * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
   * Other filesystems return -ENOSYS.
   */
  static long madvise_remove(struct vm_area_struct *vma,
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
290
  				struct vm_area_struct **prev,
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
291
292
  				unsigned long start, unsigned long end)
  {
3f31d0757   Hugh Dickins   mm/fs: route MADV...
293
  	loff_t offset;
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
294
  	int error;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
295
  	struct file *f;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
296

90ed52ebe   Hugh Dickins   [PATCH] holepunch...
297
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
298

f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
299
300
  	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
  		return -EINVAL;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
301
302
303
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
304
305
  			return -EINVAL;
  	}
69cf0fac6   Hugh Dickins   [PATCH] Fix MADV_...
306
307
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
308
309
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
310

9ab4233dd   Andy Lutomirski   mm: Hold a file r...
311
312
313
314
315
316
317
  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
318
  	up_read(&current->mm->mmap_sem);
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
319
  	error = do_fallocate(f,
3f31d0757   Hugh Dickins   mm/fs: route MADV...
320
321
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
322
  	fput(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
323
  	down_read(&current->mm->mmap_sem);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
324
  	return error;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
325
  }
9893e49d6   Andi Kleen   HWPOISON: Add mad...
326
327
328
329
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
afcf938ee   Andi Kleen   HWPOISON: Add a m...
330
  static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
9893e49d6   Andi Kleen   HWPOISON: Add mad...
331
  {
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
332
  	struct page *p;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
333
334
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
335
336
  	for (; start < end; start += PAGE_SIZE <<
  				compound_order(compound_head(p))) {
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
337
338
339
  		int ret;
  
  		ret = get_user_pages_fast(start, 1, 0, &p);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
340
341
  		if (ret != 1)
  			return ret;
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
342

29b4eedee   Wanpeng Li   mm/hwpoison.c: fi...
343
344
345
346
  		if (PageHWPoison(p)) {
  			put_page(p);
  			continue;
  		}
afcf938ee   Andi Kleen   HWPOISON: Add a m...
347
  		if (bhv == MADV_SOFT_OFFLINE) {
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
348
349
  			pr_info("Soft offlining page %#lx at %#lx
  ",
afcf938ee   Andi Kleen   HWPOISON: Add a m...
350
351
352
  				page_to_pfn(p), start);
  			ret = soft_offline_page(p, MF_COUNT_INCREASED);
  			if (ret)
8302423b8   Wanpeng Li   mm/madvise.c: fix...
353
  				return ret;
afcf938ee   Andi Kleen   HWPOISON: Add a m...
354
355
  			continue;
  		}
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
356
357
  		pr_info("Injecting memory failure for page %#lx at %#lx
  ",
9893e49d6   Andi Kleen   HWPOISON: Add mad...
358
359
  		       page_to_pfn(p), start);
  		/* Ignore return value for now */
cd42f4a3b   Tony Luck   HWPOISON: Clean u...
360
  		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
361
  	}
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
362
  	return 0;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
363
364
  }
  #endif
165cd4023   suzuki   [PATCH] madvise()...
365
366
367
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
368
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
369
  	switch (behavior) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
370
  	case MADV_REMOVE:
3866ea90d   Hugh Dickins   ksm: first tidy u...
371
  		return madvise_remove(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
372
  	case MADV_WILLNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
373
  		return madvise_willneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
374
  	case MADV_DONTNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
375
  		return madvise_dontneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
376
  	default:
3866ea90d   Hugh Dickins   ksm: first tidy u...
377
  		return madvise_behavior(vma, prev, start, end, behavior);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
378
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
379
  }
75927af8b   Nick Piggin   mm: madvise(): co...
380
381
382
383
384
385
386
387
388
389
390
391
  static int
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
f8af4da3b   Hugh Dickins   ksm: the mm inter...
392
393
394
395
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
396
397
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
398
  	case MADV_NOHUGEPAGE:
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
399
  #endif
accb61fe7   Jason Baron   coredump: add VM_...
400
401
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
75927af8b   Nick Piggin   mm: madvise(): co...
402
403
404
405
406
407
  		return 1;
  
  	default:
  		return 0;
  	}
  }
3866ea90d   Hugh Dickins   ksm: first tidy u...
408

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the appli-
   *		cation will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
431
432
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
3866ea90d   Hugh Dickins   ksm: first tidy u...
433
434
435
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
f8af4da3b   Hugh Dickins   ksm: the mm inter...
436
437
438
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
439
440
441
442
443
444
445
446
447
448
449
450
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
3480b2574   Heiko Carstens   [CVE-2009-0029] S...
451
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	struct mm_struct *mm = current->mm;
  	struct vm_area_struct *vma, *prev;
  	struct blk_plug plug;
  	unsigned long end, chunk_end;
  	size_t len;
  	int need_write;
  	int unmapped_error = 0;
  	int error;

  #ifdef CONFIG_MEMORY_FAILURE
  	/* The poison/offline requests are handed off before any validation. */
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_hwpoison(behavior, start, start+len_in);
  #endif
  	if (!madvise_behavior_valid(behavior))
  		return -EINVAL;

  	/* start must be page aligned. */
  	if (start & ~PAGE_MASK)
  		return -EINVAL;

  	/* Round the length up to a whole number of pages. */
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return -EINVAL;

  	end = start + len;
  	if (end < start)
  		return -EINVAL;

  	/* An empty range is trivially successful. */
  	if (end == start)
  		return 0;

  	need_write = madvise_need_mmap_write(behavior);
  	if (need_write)
  		down_write(&mm->mmap_sem);
  	else
  		down_read(&mm->mmap_sem);

  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;

  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			goto out;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			/* Hole before this vma: remember it, skip over it. */
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				goto out;
  		}

  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		chunk_end = (end < vma->vm_end) ? end : vma->vm_end;

  		/* Here vma->vm_start <= start < chunk_end <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, chunk_end, behavior);
  		if (error)
  			goto out;

  		start = chunk_end;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;

  		error = unmapped_error;
  		if (start >= end)
  			goto out;

  		/* A NULL prev means madvise_remove dropped mmap_sem: look up again. */
  		vma = prev ? prev->vm_next : find_vma(mm, start);
  	}
  out:
  	blk_finish_plug(&plug);
  	if (need_write)
  		up_write(&mm->mmap_sem);
  	else
  		up_read(&mm->mmap_sem);

  	return error;
  }