Blame view

mm/madvise.c 13.9 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
05b743847   Prasanna Meda   [PATCH] madvise: ...
11
  #include <linux/mempolicy.h>
afcf938ee   Andi Kleen   HWPOISON: Add a m...
12
  #include <linux/page-isolation.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
  #include <linux/hugetlb.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
14
  #include <linux/falloc.h>
e8edc6e03   Alexey Dobriyan   Detach sched.h fr...
15
  #include <linux/sched.h>
f8af4da3b   Hugh Dickins   ksm: the mm inter...
16
  #include <linux/ksm.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
17
  #include <linux/fs.h>
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
18
  #include <linux/file.h>
1998cc048   Shaohua Li   mm: make madvise(...
19
  #include <linux/blkdev.h>
66114cad6   Tejun Heo   writeback: separa...
20
  #include <linux/backing-dev.h>
1998cc048   Shaohua Li   mm: make madvise(...
21
22
  #include <linux/swap.h>
  #include <linux/swapops.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23
24
  
  /*
0a27a14a6   Nick Piggin   mm: madvise avoid...
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_sem for writing. Others, which simply traverse vmas, need
   * to only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	/*
  	 * Only the behaviours listed here report 0; every other value
  	 * conservatively reports 1, so exceptions must be added explicitly.
  	 */
  	if (behavior == MADV_REMOVE ||
  	    behavior == MADV_WILLNEED ||
  	    behavior == MADV_DONTNEED)
  		return 0;

  	return 1;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
43
44
45
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
46
  static long madvise_behavior(struct vm_area_struct *vma,
05b743847   Prasanna Meda   [PATCH] madvise: ...
47
48
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
  {
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
50
  	struct mm_struct *mm = vma->vm_mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
51
  	int error = 0;
05b743847   Prasanna Meda   [PATCH] madvise: ...
52
  	pgoff_t pgoff;
3866ea90d   Hugh Dickins   ksm: first tidy u...
53
  	unsigned long new_flags = vma->vm_flags;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
54
55
  
  	switch (behavior) {
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
56
57
58
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
59
  	case MADV_SEQUENTIAL:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
60
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
61
62
  		break;
  	case MADV_RANDOM:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
63
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
64
  		break;
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
65
66
67
68
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
3866ea90d   Hugh Dickins   ksm: first tidy u...
69
70
71
72
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
73
  		new_flags &= ~VM_DONTCOPY;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
74
  		break;
accb61fe7   Jason Baron   coredump: add VM_...
75
  	case MADV_DONTDUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
76
  		new_flags |= VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
77
78
  		break;
  	case MADV_DODUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
79
80
81
82
83
  		if (new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
84
  		break;
f8af4da3b   Hugh Dickins   ksm: the mm inter...
85
86
87
88
89
90
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out;
  		break;
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
91
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
92
  	case MADV_NOHUGEPAGE:
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
93
  		error = hugepage_madvise(vma, &new_flags, behavior);
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
94
95
96
  		if (error)
  			goto out;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
97
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
98
99
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
100
  		goto out;
05b743847   Prasanna Meda   [PATCH] madvise: ...
101
102
103
104
105
106
107
108
109
110
111
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  				vma->vm_file, pgoff, vma_policy(vma));
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
112
113
114
115
116
117
118
119
120
121
122
123
  
  	if (start != vma->vm_start) {
  		error = split_vma(mm, vma, start, 1);
  		if (error)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		error = split_vma(mm, vma, end, 0);
  		if (error)
  			goto out;
  	}
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
124
  success:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
125
126
127
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
128
  	vma->vm_flags = new_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
129
130
131
132
133
134
  
  out:
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  	return error;
  }
1998cc048   Shaohua Li   mm: make madvise(...
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
  #ifdef CONFIG_SWAP
  /*
   * Page-walk pmd callback: for every swap entry found in [start, end)
   * start an asynchronous swap-in via read_swap_cache_async().  The vma is
   * taken from walk->private.  Always returns 0.
   */
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	struct vm_area_struct *vma = walk->private;
  	unsigned long addr;

  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;

  	for (addr = start; addr != end; addr += PAGE_SIZE) {
  		spinlock_t *ptl;
  		pte_t *base;
  		pte_t cur;
  		swp_entry_t swp;
  		struct page *page;

  		/* Snapshot the pte under the page-table lock, then drop it. */
  		base = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		cur = base[(addr - start) / PAGE_SIZE];
  		pte_unmap_unlock(base, ptl);

  		/* Present and empty ptes have nothing to swap in. */
  		if (pte_none(cur) || pte_present(cur))
  			continue;

  		swp = pte_to_swp_entry(cur);
  		if (unlikely(non_swap_entry(swp)))
  			continue;

  		page = read_swap_cache_async(swp, GFP_HIGHUSER_MOVABLE,
  					     vma, addr);
  		if (page)
  			page_cache_release(page);
  	}

  	return 0;
  }
  
  static void force_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end)
  {
  	struct mm_walk walk = {
  		.mm = vma->vm_mm,
  		.pmd_entry = swapin_walk_pmd_entry,
  		.private = vma,
  	};
  
  	walk_page_range(start, end, &walk);
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	pgoff_t index;
  	struct page *page;
  	swp_entry_t swap;
  
  	for (; start < end; start += PAGE_SIZE) {
  		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
55231e5c8   Johannes Weiner   mm: madvise: fix ...
194
  		page = find_get_entry(mapping, index);
1998cc048   Shaohua Li   mm: make madvise(...
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
  		if (!radix_tree_exceptional_entry(page)) {
  			if (page)
  				page_cache_release(page);
  			continue;
  		}
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
  								NULL, 0);
  		if (page)
  			page_cache_release(page);
  	}
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
210
211
212
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
213
214
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
215
216
217
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
1998cc048   Shaohua Li   mm: make madvise(...
218
  #ifdef CONFIG_SWAP
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
219
  	if (!file) {
1998cc048   Shaohua Li   mm: make madvise(...
220
  		*prev = vma;
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
221
  		force_swapin_readahead(vma, start, end);
1998cc048   Shaohua Li   mm: make madvise(...
222
223
  		return 0;
  	}
1998cc048   Shaohua Li   mm: make madvise(...
224

97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
225
226
227
228
229
230
231
  	if (shmem_mapping(file->f_mapping)) {
  		*prev = vma;
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
1bef40032   Suzuki   [PATCH] madvise: ...
232
233
  	if (!file)
  		return -EBADF;
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
234
  #endif
1bef40032   Suzuki   [PATCH] madvise: ...
235

e748dcd09   Matthew Wilcox   vfs: remove get_x...
236
  	if (IS_DAX(file_inode(file))) {
fe77ba6f4   Carsten Otte   [PATCH] xip: madv...
237
238
239
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
240
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
241
242
243
244
  	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
f7e839dd3   Wu Fengguang   readahead: move m...
245
  	force_page_cache_readahead(file->f_mapping, file, start, end - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
246
247
248
249
250
251
252
  	return 0;
  }
  
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
253
   * zap_page_range call sets things up for shrink_active_list to actually free
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
254
255
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
256
   * shrink_active_list to pick up before reclaiming other pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
257
258
259
260
261
262
263
264
265
266
267
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
268
269
  static long madvise_dontneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
270
271
  			     unsigned long start, unsigned long end)
  {
05b743847   Prasanna Meda   [PATCH] madvise: ...
272
  	*prev = vma;
6aab341e0   Linus Torvalds   mm: re-architect ...
273
  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
274
  		return -EINVAL;
8a5f14a23   Kirill A. Shutemov   mm: drop support ...
275
  	zap_page_range(vma, start, end - start, NULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
276
277
  	return 0;
  }
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
278
279
280
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
281
282
   */
  static long madvise_remove(struct vm_area_struct *vma,
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
283
  				struct vm_area_struct **prev,
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
284
285
  				unsigned long start, unsigned long end)
  {
3f31d0757   Hugh Dickins   mm/fs: route MADV...
286
  	loff_t offset;
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
287
  	int error;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
288
  	struct file *f;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
289

90ed52ebe   Hugh Dickins   [PATCH] holepunch...
290
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
291

0661a3361   Kirill A. Shutemov   mm: remove rest u...
292
  	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
293
  		return -EINVAL;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
294
295
296
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
297
298
  			return -EINVAL;
  	}
69cf0fac6   Hugh Dickins   [PATCH] Fix MADV_...
299
300
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
301
302
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
303

9ab4233dd   Andy Lutomirski   mm: Hold a file r...
304
305
306
307
308
309
310
  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
311
  	up_read(&current->mm->mmap_sem);
72c72bdf7   Anna Schumaker   VFS: Rename do_fa...
312
  	error = vfs_fallocate(f,
3f31d0757   Hugh Dickins   mm/fs: route MADV...
313
314
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
315
  	fput(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
316
  	down_read(&current->mm->mmap_sem);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
317
  	return error;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
318
  }
9893e49d6   Andi Kleen   HWPOISON: Add mad...
319
320
321
322
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
afcf938ee   Andi Kleen   HWPOISON: Add a m...
323
  static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
9893e49d6   Andi Kleen   HWPOISON: Add mad...
324
  {
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
325
  	struct page *p;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
326
327
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
20cb6cab5   Wanpeng Li   mm/hwpoison: fix ...
328
329
  	for (; start < end; start += PAGE_SIZE <<
  				compound_order(compound_head(p))) {
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
330
331
332
  		int ret;
  
  		ret = get_user_pages_fast(start, 1, 0, &p);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
333
334
  		if (ret != 1)
  			return ret;
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
335

29b4eedee   Wanpeng Li   mm/hwpoison.c: fi...
336
337
338
339
  		if (PageHWPoison(p)) {
  			put_page(p);
  			continue;
  		}
afcf938ee   Andi Kleen   HWPOISON: Add a m...
340
  		if (bhv == MADV_SOFT_OFFLINE) {
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
341
342
  			pr_info("Soft offlining page %#lx at %#lx
  ",
afcf938ee   Andi Kleen   HWPOISON: Add a m...
343
344
345
  				page_to_pfn(p), start);
  			ret = soft_offline_page(p, MF_COUNT_INCREASED);
  			if (ret)
8302423b8   Wanpeng Li   mm/madvise.c: fix...
346
  				return ret;
afcf938ee   Andi Kleen   HWPOISON: Add a m...
347
348
  			continue;
  		}
b194b8cdb   Wanpeng Li   mm/hwpoison: add ...
349
350
  		pr_info("Injecting memory failure for page %#lx at %#lx
  ",
9893e49d6   Andi Kleen   HWPOISON: Add mad...
351
352
  		       page_to_pfn(p), start);
  		/* Ignore return value for now */
cd42f4a3b   Tony Luck   HWPOISON: Clean u...
353
  		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
354
  	}
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
355
  	return 0;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
356
357
  }
  #endif
165cd4023   suzuki   [PATCH] madvise()...
358
359
360
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
361
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
362
  	switch (behavior) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
363
  	case MADV_REMOVE:
3866ea90d   Hugh Dickins   ksm: first tidy u...
364
  		return madvise_remove(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
365
  	case MADV_WILLNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
366
  		return madvise_willneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
367
  	case MADV_DONTNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
368
  		return madvise_dontneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
369
  	default:
3866ea90d   Hugh Dickins   ksm: first tidy u...
370
  		return madvise_behavior(vma, prev, start, end, behavior);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
371
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
372
  }
75927af8b   Nick Piggin   mm: madvise(): co...
373
374
375
376
377
378
379
380
381
382
383
384
  static int
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
f8af4da3b   Hugh Dickins   ksm: the mm inter...
385
386
387
388
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
389
390
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
391
  	case MADV_NOHUGEPAGE:
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
392
  #endif
accb61fe7   Jason Baron   coredump: add VM_...
393
394
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
75927af8b   Nick Piggin   mm: madvise(): co...
395
396
397
398
399
400
  		return 1;
  
  	default:
  		return 0;
  	}
  }
3866ea90d   Hugh Dickins   ksm: first tidy u...
401

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the appli-
   *		cation will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
424
425
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
3866ea90d   Hugh Dickins   ksm: first tidy u...
426
427
428
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
f8af4da3b   Hugh Dickins   ksm: the mm inter...
429
430
431
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
432
433
434
435
436
437
438
439
440
441
442
443
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
3480b2574   Heiko Carstens   [CVE-2009-0029] S...
444
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	struct vm_area_struct *vma, *prev;
  	struct blk_plug plug;
  	unsigned long end, chunk_end;
  	size_t len;
  	int need_write;
  	int unmapped_error = 0;
  	int error = 0;

  #ifdef CONFIG_MEMORY_FAILURE
  	/* Error injection bypasses the vma walk entirely. */
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_hwpoison(behavior, start, start+len_in);
  #endif
  	if (!madvise_behavior_valid(behavior))
  		return -EINVAL;

  	/* start must be page aligned. */
  	if (start & ~PAGE_MASK)
  		return -EINVAL;

  	/* Round the length up to a whole number of pages. */
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return -EINVAL;

  	end = start + len;
  	if (end < start)
  		return -EINVAL;

  	/* An empty range is trivially successful. */
  	if (end == start)
  		return 0;

  	need_write = madvise_need_mmap_write(behavior);
  	if (need_write)
  		down_write(&current->mm->mmap_sem);
  	else
  		down_read(&current->mm->mmap_sem);

  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;

  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		if (!vma) {
  			error = -ENOMEM;
  			goto out;
  		}

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			/* Skip the hole, remembering that we saw one. */
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end) {
  				error = -ENOMEM;
  				goto out;
  			}
  		}

  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		chunk_end = (end < vma->vm_end) ? end : vma->vm_end;

  		/* Here vma->vm_start <= start < chunk_end <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, chunk_end, behavior);
  		if (error)
  			goto out;

  		start = chunk_end;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;

  		if (start >= end) {
  			error = unmapped_error;
  			goto out;
  		}

  		/* A NULL prev means madvise_remove dropped mmap_sem. */
  		vma = prev ? prev->vm_next : find_vma(current->mm, start);
  	}
  out:
  	blk_finish_plug(&plug);
  	if (need_write)
  		up_write(&current->mm->mmap_sem);
  	else
  		up_read(&current->mm->mmap_sem);

  	return error;
  }