Blame view

mm/madvise.c 13.9 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
05b743847   Prasanna Meda   [PATCH] madvise: ...
11
  #include <linux/mempolicy.h>
afcf938ee   Andi Kleen   HWPOISON: Add a m...
12
  #include <linux/page-isolation.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
  #include <linux/hugetlb.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
14
  #include <linux/falloc.h>
e8edc6e03   Alexey Dobriyan   Detach sched.h fr...
15
  #include <linux/sched.h>
f8af4da3b   Hugh Dickins   ksm: the mm inter...
16
  #include <linux/ksm.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
17
  #include <linux/fs.h>
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
18
  #include <linux/file.h>
1998cc048   Shaohua Li   mm: make madvise(...
19
  #include <linux/blkdev.h>
66114cad6   Tejun Heo   writeback: separa...
20
  #include <linux/backing-dev.h>
1998cc048   Shaohua Li   mm: make madvise(...
21
22
  #include <linux/swap.h>
  #include <linux/swapops.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23
24
  
  /*
   * Decide which mode of mmap_sem a given advice needs.
   *
   * Advice that ends up changing vma->vm_flags must hold mmap_sem for
   * writing; advice that only traverses vmas can make do with the read
   * side.  Returns 1 when the write lock is required, 0 otherwise.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	/* Only the explicitly listed advice values may run under the read lock. */
  	if (behavior == MADV_REMOVE ||
  	    behavior == MADV_WILLNEED ||
  	    behavior == MADV_DONTNEED)
  		return 0;

  	/* Be safe: everything else defaults to the write lock. */
  	return 1;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
43
44
45
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
46
  static long madvise_behavior(struct vm_area_struct *vma,
05b743847   Prasanna Meda   [PATCH] madvise: ...
47
48
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
  {
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
50
  	struct mm_struct *mm = vma->vm_mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
51
  	int error = 0;
05b743847   Prasanna Meda   [PATCH] madvise: ...
52
  	pgoff_t pgoff;
3866ea90d   Hugh Dickins   ksm: first tidy u...
53
  	unsigned long new_flags = vma->vm_flags;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
54
55
  
  	switch (behavior) {
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
56
57
58
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
59
  	case MADV_SEQUENTIAL:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
60
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
61
62
  		break;
  	case MADV_RANDOM:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
63
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
64
  		break;
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
65
66
67
68
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
3866ea90d   Hugh Dickins   ksm: first tidy u...
69
70
71
72
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
73
  		new_flags &= ~VM_DONTCOPY;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
74
  		break;
accb61fe7   Jason Baron   coredump: add VM_...
75
  	case MADV_DONTDUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
76
  		new_flags |= VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
77
78
  		break;
  	case MADV_DODUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
79
80
81
82
83
  		if (new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
84
  		break;
f8af4da3b   Hugh Dickins   ksm: the mm inter...
85
86
87
88
89
90
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out;
  		break;
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
91
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
92
  	case MADV_NOHUGEPAGE:
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
93
  		error = hugepage_madvise(vma, &new_flags, behavior);
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
94
95
96
  		if (error)
  			goto out;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
97
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
98
99
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
100
  		goto out;
05b743847   Prasanna Meda   [PATCH] madvise: ...
101
102
103
104
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
19a809afe   Andrea Arcangeli   userfaultfd: teac...
105
106
  			  vma->vm_file, pgoff, vma_policy(vma),
  			  vma->vm_userfaultfd_ctx);
05b743847   Prasanna Meda   [PATCH] madvise: ...
107
108
109
110
111
112
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
113
114
115
116
117
118
119
120
121
122
123
124
  
  	if (start != vma->vm_start) {
  		error = split_vma(mm, vma, start, 1);
  		if (error)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		error = split_vma(mm, vma, end, 0);
  		if (error)
  			goto out;
  	}
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
125
  success:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
126
127
128
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
129
  	vma->vm_flags = new_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
130
131
132
133
134
135
  
  out:
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  	return error;
  }
1998cc048   Shaohua Li   mm: make madvise(...
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
  #ifdef CONFIG_SWAP
  /*
   * pmd_entry callback for the page walk started by force_swapin_readahead().
   *
   * For every page-sized step in [start, end) the pte is sampled under the
   * page table lock; entries that decode to a real swap entry are handed to
   * read_swap_cache_async() and the returned page reference, if any, is
   * dropped again.  walk->private carries the vma.  Always returns 0.
   */
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	struct vm_area_struct *vma = walk->private;
  	unsigned long addr;
  	pte_t *ptep;

  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;

  	for (addr = start; addr != end; addr += PAGE_SIZE) {
  		spinlock_t *ptl;
  		swp_entry_t entry;
  		struct page *page;
  		pte_t pte;

  		/* Take a snapshot of the pte; the lock is not held across I/O. */
  		ptep = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = ptep[(addr - start) / PAGE_SIZE];
  		pte_unmap_unlock(ptep, ptl);

  		/* Only non-present, non-empty ptes can hold a swap entry. */
  		if (pte_none(pte) || pte_present(pte))
  			continue;

  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;

  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
  					     vma, addr);
  		if (page)
  			page_cache_release(page);
  	}

  	return 0;
  }
  
  static void force_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end)
  {
  	struct mm_walk walk = {
  		.mm = vma->vm_mm,
  		.pmd_entry = swapin_walk_pmd_entry,
  		.private = vma,
  	};
  
  	walk_page_range(start, end, &walk);
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	pgoff_t index;
  	struct page *page;
  	swp_entry_t swap;
  
  	for (; start < end; start += PAGE_SIZE) {
  		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
55231e5c8   Johannes Weiner   mm: madvise: fix ...
195
  		page = find_get_entry(mapping, index);
1998cc048   Shaohua Li   mm: make madvise(...
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
  		if (!radix_tree_exceptional_entry(page)) {
  			if (page)
  				page_cache_release(page);
  			continue;
  		}
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
  								NULL, 0);
  		if (page)
  			page_cache_release(page);
  	}
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
211
212
213
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
214
215
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
216
217
218
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
1998cc048   Shaohua Li   mm: make madvise(...
219
  #ifdef CONFIG_SWAP
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
220
  	if (!file) {
1998cc048   Shaohua Li   mm: make madvise(...
221
  		*prev = vma;
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
222
  		force_swapin_readahead(vma, start, end);
1998cc048   Shaohua Li   mm: make madvise(...
223
224
  		return 0;
  	}
1998cc048   Shaohua Li   mm: make madvise(...
225

97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
226
227
228
229
230
231
232
  	if (shmem_mapping(file->f_mapping)) {
  		*prev = vma;
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
1bef40032   Suzuki   [PATCH] madvise: ...
233
234
  	if (!file)
  		return -EBADF;
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
235
  #endif
1bef40032   Suzuki   [PATCH] madvise: ...
236

e748dcd09   Matthew Wilcox   vfs: remove get_x...
237
  	if (IS_DAX(file_inode(file))) {
fe77ba6f4   Carsten Otte   [PATCH] xip: madv...
238
239
240
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
241
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
242
243
244
245
  	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
f7e839dd3   Wu Fengguang   readahead: move m...
246
  	force_page_cache_readahead(file->f_mapping, file, start, end - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
247
248
249
250
251
252
253
  	return 0;
  }
  
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
254
   * zap_page_range call sets things up for shrink_active_list to actually free
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
255
256
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
257
   * shrink_active_list to pick up before reclaiming other pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
258
259
260
261
262
263
264
265
266
267
268
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
269
270
  static long madvise_dontneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
271
272
  			     unsigned long start, unsigned long end)
  {
05b743847   Prasanna Meda   [PATCH] madvise: ...
273
  	*prev = vma;
6aab341e0   Linus Torvalds   mm: re-architect ...
274
  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
275
  		return -EINVAL;
8a5f14a23   Kirill A. Shutemov   mm: drop support ...
276
  	zap_page_range(vma, start, end - start, NULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
277
278
  	return 0;
  }
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
279
280
281
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
282
283
   */
  static long madvise_remove(struct vm_area_struct *vma,
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
284
  				struct vm_area_struct **prev,
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
285
286
  				unsigned long start, unsigned long end)
  {
3f31d0757   Hugh Dickins   mm/fs: route MADV...
287
  	loff_t offset;
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
288
  	int error;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
289
  	struct file *f;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
290

90ed52ebe   Hugh Dickins   [PATCH] holepunch...
291
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
292

72079ba0d   Mike Kravetz   mm: madvise allow...
293
  	if (vma->vm_flags & VM_LOCKED)
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
294
  		return -EINVAL;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
295
296
297
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
298
299
  			return -EINVAL;
  	}
69cf0fac6   Hugh Dickins   [PATCH] Fix MADV_...
300
301
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
302
303
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
304

9ab4233dd   Andy Lutomirski   mm: Hold a file r...
305
306
307
308
309
310
311
  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
312
  	up_read(&current->mm->mmap_sem);
72c72bdf7   Anna Schumaker   VFS: Rename do_fa...
313
  	error = vfs_fallocate(f,
3f31d0757   Hugh Dickins   mm/fs: route MADV...
314
315
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
316
  	fput(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
317
  	down_read(&current->mm->mmap_sem);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
318
  	return error;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
319
  }
9893e49d6   Andi Kleen   HWPOISON: Add mad...
320
321
322
323
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
afcf938ee   Andi Kleen   HWPOISON: Add a m...
324
  static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
9893e49d6   Andi Kleen   HWPOISON: Add mad...
325
  {
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
326
  	struct page *p;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
327
328
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
329
330
  	for (; start < end; start += PAGE_SIZE <<
  				compound_order(compound_head(p))) {
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
331
332
333
  		int ret;
  
  		ret = get_user_pages_fast(start, 1, 0, &p);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
334
335
  		if (ret != 1)
  			return ret;
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
336

29b4eedee   Wanpeng Li   mm/hwpoison.c: fi...
337
338
339
340
  		if (PageHWPoison(p)) {
  			put_page(p);
  			continue;
  		}
afcf938ee   Andi Kleen   HWPOISON: Add a m...
341
  		if (bhv == MADV_SOFT_OFFLINE) {
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
342
343
  			pr_info("Soft offlining page %#lx at %#lx
  ",
afcf938ee   Andi Kleen   HWPOISON: Add a m...
344
345
346
  				page_to_pfn(p), start);
  			ret = soft_offline_page(p, MF_COUNT_INCREASED);
  			if (ret)
8302423b8   Wanpeng Li   mm/madvise.c: fix...
347
  				return ret;
afcf938ee   Andi Kleen   HWPOISON: Add a m...
348
349
  			continue;
  		}
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
350
351
  		pr_info("Injecting memory failure for page %#lx at %#lx
  ",
9893e49d6   Andi Kleen   HWPOISON: Add mad...
352
353
  		       page_to_pfn(p), start);
  		/* Ignore return value for now */
cd42f4a3b   Tony Luck   HWPOISON: Clean u...
354
  		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
355
  	}
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
356
  	return 0;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
357
358
  }
  #endif
165cd4023   suzuki   [PATCH] madvise()...
359
360
361
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
362
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
363
  	switch (behavior) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
364
  	case MADV_REMOVE:
3866ea90d   Hugh Dickins   ksm: first tidy u...
365
  		return madvise_remove(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
366
  	case MADV_WILLNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
367
  		return madvise_willneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
368
  	case MADV_DONTNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
369
  		return madvise_dontneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
370
  	default:
3866ea90d   Hugh Dickins   ksm: first tidy u...
371
  		return madvise_behavior(vma, prev, start, end, behavior);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
372
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
373
  }
1ecef9ed0   Nicholas Krause   mm/madvise.c: mak...
374
  static bool
75927af8b   Nick Piggin   mm: madvise(): co...
375
376
377
378
379
380
381
382
383
384
385
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
f8af4da3b   Hugh Dickins   ksm: the mm inter...
386
387
388
389
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
390
391
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
392
  	case MADV_NOHUGEPAGE:
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
393
  #endif
accb61fe7   Jason Baron   coredump: add VM_...
394
395
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
1ecef9ed0   Nicholas Krause   mm/madvise.c: mak...
396
  		return true;
75927af8b   Nick Piggin   mm: madvise(): co...
397
398
  
  	default:
1ecef9ed0   Nicholas Krause   mm/madvise.c: mak...
399
  		return false;
75927af8b   Nick Piggin   mm: madvise(): co...
400
401
  	}
  }
3866ea90d   Hugh Dickins   ksm: first tidy u...
402

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the appli-
   *		cation will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
425
426
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
3866ea90d   Hugh Dickins   ksm: first tidy u...
427
428
429
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
f8af4da3b   Hugh Dickins   ksm: the mm inter...
430
431
432
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
433
434
435
436
437
438
439
440
441
442
443
444
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
3480b2574   Heiko Carstens   [CVE-2009-0029] S...
445
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	struct vm_area_struct *vma, *prev;
  	unsigned long end, chunk_end;
  	struct blk_plug plug;
  	size_t len;
  	int unmapped_error = 0;
  	int error;
  	int write;

  #ifdef CONFIG_MEMORY_FAILURE
  	/* The poison/offline requests are handled before the checks below. */
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_hwpoison(behavior, start, start+len_in);
  #endif
  	if (!madvise_behavior_valid(behavior))
  		return -EINVAL;

  	/* start must be page aligned; the length is rounded up to a page. */
  	if (start & ~PAGE_MASK)
  		return -EINVAL;
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return -EINVAL;

  	end = start + len;
  	if (end < start)
  		return -EINVAL;

  	/* An empty range is a successful no-op. */
  	if (end == start)
  		return 0;

  	write = madvise_need_mmap_write(behavior);
  	if (write)
  		down_write(&current->mm->mmap_sem);
  	else
  		down_read(&current->mm->mmap_sem);

  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;

  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		if (!vma) {
  			error = -ENOMEM;
  			break;
  		}

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			/* Hole in front of this vma: remember it, skip it. */
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end) {
  				error = -ENOMEM;
  				break;
  			}
  		}

  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		chunk_end = end < vma->vm_end ? end : vma->vm_end;

  		/* Here vma->vm_start <= start < chunk_end <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, chunk_end, behavior);
  		if (error)
  			break;

  		start = chunk_end;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;

  		if (start >= end) {
  			/* Done; report -ENOMEM if any hole was skipped. */
  			error = unmapped_error;
  			break;
  		}

  		if (prev)
  			vma = prev->vm_next;
  		else	/* madvise_remove dropped mmap_sem */
  			vma = find_vma(current->mm, start);
  	}

  	blk_finish_plug(&plug);
  	if (write)
  		up_write(&current->mm->mmap_sem);
  	else
  		up_read(&current->mm->mmap_sem);

  	return error;
  }