Blame view

mm/madvise.c 14.1 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
05b743847   Prasanna Meda   [PATCH] madvise: ...
11
  #include <linux/mempolicy.h>
afcf938ee   Andi Kleen   HWPOISON: Add a m...
12
  #include <linux/page-isolation.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
  #include <linux/hugetlb.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
14
  #include <linux/falloc.h>
e8edc6e03   Alexey Dobriyan   Detach sched.h fr...
15
  #include <linux/sched.h>
f8af4da3b   Hugh Dickins   ksm: the mm inter...
16
  #include <linux/ksm.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
17
  #include <linux/fs.h>
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
18
  #include <linux/file.h>
1998cc048   Shaohua Li   mm: make madvise(...
19
20
21
  #include <linux/blkdev.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
  
  /*
0a27a14a6   Nick Piggin   mm: madvise avoid...
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_sem for writing. Others, which simply traverse vmas, need
   * to only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	/*
  	 * Only the behaviours named here are answered with 0; everything
  	 * else, including values this function has never heard of, gets
  	 * the conservative answer 1.
  	 */
  	if (behavior == MADV_REMOVE ||
  	    behavior == MADV_WILLNEED ||
  	    behavior == MADV_DONTNEED)
  		return 0;

  	return 1;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
43
44
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
45
  static long madvise_behavior(struct vm_area_struct *vma,
05b743847   Prasanna Meda   [PATCH] madvise: ...
46
47
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
  {
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
49
  	struct mm_struct *mm = vma->vm_mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
50
  	int error = 0;
05b743847   Prasanna Meda   [PATCH] madvise: ...
51
  	pgoff_t pgoff;
3866ea90d   Hugh Dickins   ksm: first tidy u...
52
  	unsigned long new_flags = vma->vm_flags;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
53
54
  
  	switch (behavior) {
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
55
56
57
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
58
  	case MADV_SEQUENTIAL:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
59
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
60
61
  		break;
  	case MADV_RANDOM:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
62
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
63
  		break;
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
64
65
66
67
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
3866ea90d   Hugh Dickins   ksm: first tidy u...
68
69
70
71
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
72
  		new_flags &= ~VM_DONTCOPY;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
73
  		break;
accb61fe7   Jason Baron   coredump: add VM_...
74
  	case MADV_DONTDUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
75
  		new_flags |= VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
76
77
  		break;
  	case MADV_DODUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
78
79
80
81
82
  		if (new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
83
  		break;
f8af4da3b   Hugh Dickins   ksm: the mm inter...
84
85
86
87
88
89
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out;
  		break;
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
90
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
91
  	case MADV_NOHUGEPAGE:
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
92
  		error = hugepage_madvise(vma, &new_flags, behavior);
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
93
94
95
  		if (error)
  			goto out;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
96
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
97
98
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
99
  		goto out;
05b743847   Prasanna Meda   [PATCH] madvise: ...
100
101
102
103
104
105
106
107
108
109
110
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  				vma->vm_file, pgoff, vma_policy(vma));
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
111
112
113
114
115
116
117
118
119
120
121
122
  
  	if (start != vma->vm_start) {
  		error = split_vma(mm, vma, start, 1);
  		if (error)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		error = split_vma(mm, vma, end, 0);
  		if (error)
  			goto out;
  	}
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
123
  success:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
124
125
126
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
127
  	vma->vm_flags = new_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
128
129
130
131
132
133
  
  out:
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  	return error;
  }
1998cc048   Shaohua Li   mm: make madvise(...
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
  #ifdef CONFIG_SWAP
  /*
   * pmd_entry callback for the page-table walk set up by
   * force_swapin_readahead(): for every pte in [start, end) that holds a
   * real swap entry, call read_swap_cache_async() on it.  The vma comes
   * from walk->private.  Always returns 0.
   */
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	struct vm_area_struct *vma = walk->private;
  	unsigned long addr;

  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;

  	for (addr = start; addr != end; addr += PAGE_SIZE) {
  		struct page *page;
  		swp_entry_t entry;
  		spinlock_t *ptl;
  		pte_t *ptep;
  		pte_t pte;

  		/*
  		 * Copy the pte while the page table is mapped and locked,
  		 * then release both before going any further.
  		 */
  		ptep = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = ptep[(addr - start) / PAGE_SIZE];
  		pte_unmap_unlock(ptep, ptl);

  		/* Only non-present, non-empty, non-file ptes can be swap. */
  		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
  			continue;

  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;

  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
  								vma, addr);
  		if (page)
  			page_cache_release(page);
  	}

  	return 0;
  }
  
  static void force_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end)
  {
  	struct mm_walk walk = {
  		.mm = vma->vm_mm,
  		.pmd_entry = swapin_walk_pmd_entry,
  		.private = vma,
  	};
  
  	walk_page_range(start, end, &walk);
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	pgoff_t index;
  	struct page *page;
  	swp_entry_t swap;
  
  	for (; start < end; start += PAGE_SIZE) {
  		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
55231e5c8   Johannes Weiner   mm: madvise: fix ...
194
  		page = find_get_entry(mapping, index);
1998cc048   Shaohua Li   mm: make madvise(...
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
  		if (!radix_tree_exceptional_entry(page)) {
  			if (page)
  				page_cache_release(page);
  			continue;
  		}
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
  								NULL, 0);
  		if (page)
  			page_cache_release(page);
  	}
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
210
211
212
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
213
214
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
215
216
217
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
1998cc048   Shaohua Li   mm: make madvise(...
218
219
220
221
222
223
224
225
226
227
228
  #ifdef CONFIG_SWAP
  	if (!file || mapping_cap_swap_backed(file->f_mapping)) {
  		*prev = vma;
  		if (!file)
  			force_swapin_readahead(vma, start, end);
  		else
  			force_shm_swapin_readahead(vma, start, end,
  						file->f_mapping);
  		return 0;
  	}
  #endif
1bef40032   Suzuki   [PATCH] madvise: ...
229
230
  	if (!file)
  		return -EBADF;
70688e4dd   Nick Piggin   xip: support non-...
231
  	if (file->f_mapping->a_ops->get_xip_mem) {
fe77ba6f4   Carsten Otte   [PATCH] xip: madv...
232
233
234
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
235
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
236
237
238
239
  	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
f7e839dd3   Wu Fengguang   readahead: move m...
240
  	force_page_cache_readahead(file->f_mapping, file, start, end - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
241
242
243
244
245
246
247
  	return 0;
  }
  
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
248
   * zap_page_range call sets things up for shrink_active_list to actually free
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
249
250
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
251
   * shrink_active_list to pick up before reclaiming other pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
252
253
254
255
256
257
258
259
260
261
262
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
263
264
  static long madvise_dontneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
265
266
  			     unsigned long start, unsigned long end)
  {
05b743847   Prasanna Meda   [PATCH] madvise: ...
267
  	*prev = vma;
6aab341e0   Linus Torvalds   mm: re-architect ...
268
  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
269
270
271
272
273
274
275
276
277
278
279
280
  		return -EINVAL;
  
  	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
  		struct zap_details details = {
  			.nonlinear_vma = vma,
  			.last_index = ULONG_MAX,
  		};
  		zap_page_range(vma, start, end - start, &details);
  	} else
  		zap_page_range(vma, start, end - start, NULL);
  	return 0;
  }
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
281
282
283
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
284
285
   */
  static long madvise_remove(struct vm_area_struct *vma,
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
286
  				struct vm_area_struct **prev,
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
287
288
  				unsigned long start, unsigned long end)
  {
3f31d0757   Hugh Dickins   mm/fs: route MADV...
289
  	loff_t offset;
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
290
  	int error;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
291
  	struct file *f;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
292

90ed52ebe   Hugh Dickins   [PATCH] holepunch...
293
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
294

f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
295
296
  	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
  		return -EINVAL;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
297
298
299
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
300
301
  			return -EINVAL;
  	}
69cf0fac6   Hugh Dickins   [PATCH] Fix MADV_...
302
303
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
304
305
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
306

9ab4233dd   Andy Lutomirski   mm: Hold a file r...
307
308
309
310
311
312
313
  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
314
  	up_read(&current->mm->mmap_sem);
72c72bdf7   Anna Schumaker   VFS: Rename do_fa...
315
  	error = vfs_fallocate(f,
3f31d0757   Hugh Dickins   mm/fs: route MADV...
316
317
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
318
  	fput(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
319
  	down_read(&current->mm->mmap_sem);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
320
  	return error;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
321
  }
9893e49d6   Andi Kleen   HWPOISON: Add mad...
322
323
324
325
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
afcf938ee   Andi Kleen   HWPOISON: Add a m...
326
  static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
9893e49d6   Andi Kleen   HWPOISON: Add mad...
327
  {
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
328
  	struct page *p;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
329
330
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
331
332
  	for (; start < end; start += PAGE_SIZE <<
  				compound_order(compound_head(p))) {
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
333
334
335
  		int ret;
  
  		ret = get_user_pages_fast(start, 1, 0, &p);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
336
337
  		if (ret != 1)
  			return ret;
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
338

29b4eedee   Wanpeng Li   mm/hwpoison.c: fi...
339
340
341
342
  		if (PageHWPoison(p)) {
  			put_page(p);
  			continue;
  		}
afcf938ee   Andi Kleen   HWPOISON: Add a m...
343
  		if (bhv == MADV_SOFT_OFFLINE) {
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
344
345
  			pr_info("Soft offlining page %#lx at %#lx
  ",
afcf938ee   Andi Kleen   HWPOISON: Add a m...
346
347
348
  				page_to_pfn(p), start);
  			ret = soft_offline_page(p, MF_COUNT_INCREASED);
  			if (ret)
8302423b8   Wanpeng Li   mm/madvise.c: fix...
349
  				return ret;
afcf938ee   Andi Kleen   HWPOISON: Add a m...
350
351
  			continue;
  		}
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
352
353
  		pr_info("Injecting memory failure for page %#lx at %#lx
  ",
9893e49d6   Andi Kleen   HWPOISON: Add mad...
354
355
  		       page_to_pfn(p), start);
  		/* Ignore return value for now */
cd42f4a3b   Tony Luck   HWPOISON: Clean u...
356
  		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
357
  	}
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
358
  	return 0;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
359
360
  }
  #endif
165cd4023   suzuki   [PATCH] madvise()...
361
362
363
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
364
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
365
  	switch (behavior) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
366
  	case MADV_REMOVE:
3866ea90d   Hugh Dickins   ksm: first tidy u...
367
  		return madvise_remove(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
368
  	case MADV_WILLNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
369
  		return madvise_willneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
370
  	case MADV_DONTNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
371
  		return madvise_dontneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
372
  	default:
3866ea90d   Hugh Dickins   ksm: first tidy u...
373
  		return madvise_behavior(vma, prev, start, end, behavior);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
374
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
375
  }
75927af8b   Nick Piggin   mm: madvise(): co...
376
377
378
379
380
381
382
383
384
385
386
387
  static int
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
f8af4da3b   Hugh Dickins   ksm: the mm inter...
388
389
390
391
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
392
393
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
394
  	case MADV_NOHUGEPAGE:
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
395
  #endif
accb61fe7   Jason Baron   coredump: add VM_...
396
397
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
75927af8b   Nick Piggin   mm: madvise(): co...
398
399
400
401
402
403
  		return 1;
  
  	default:
  		return 0;
  	}
  }
3866ea90d   Hugh Dickins   ksm: first tidy u...
404

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the appli-
   *		cation will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
427
428
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
3866ea90d   Hugh Dickins   ksm: first tidy u...
429
430
431
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
f8af4da3b   Hugh Dickins   ksm: the mm inter...
432
433
434
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
435
436
437
438
439
440
441
442
443
444
445
446
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
3480b2574   Heiko Carstens   [CVE-2009-0029] S...
447
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	struct mm_struct *mm;
  	struct vm_area_struct *vma, *prev;
  	struct blk_plug plug;
  	unsigned long end, chunk_end;
  	size_t len;
  	int need_write;
  	int hole_error = 0;
  	int error;

  #ifdef CONFIG_MEMORY_FAILURE
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_hwpoison(behavior, start, start+len_in);
  #endif
  	if (!madvise_behavior_valid(behavior))
  		return -EINVAL;

  	/* start must be page aligned; the length is rounded up to pages. */
  	if (start & ~PAGE_MASK)
  		return -EINVAL;
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return -EINVAL;

  	end = start + len;
  	if (end < start)
  		return -EINVAL;

  	/* An empty range is trivially successful. */
  	if (end == start)
  		return 0;

  	mm = current->mm;
  	need_write = madvise_need_mmap_write(behavior);
  	if (need_write)
  		down_write(&mm->mmap_sem);
  	else
  		down_read(&mm->mmap_sem);

  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;

  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			break;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			/* Skip the hole, but remember that we saw one. */
  			hole_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				break;
  		}

  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		chunk_end = (end < vma->vm_end) ? end : vma->vm_end;

  		/* Here vma->vm_start <= start < chunk_end <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, chunk_end, behavior);
  		if (error)
  			break;

  		start = chunk_end;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;

  		error = hole_error;
  		if (start >= end)
  			break;

  		/* A NULL prev means madvise_remove dropped mmap_sem. */
  		vma = prev ? prev->vm_next : find_vma(mm, start);
  	}

  	blk_finish_plug(&plug);
  	if (need_write)
  		up_write(&mm->mmap_sem);
  	else
  		up_read(&mm->mmap_sem);

  	return error;
  }