  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
  #include <linux/mempolicy.h>
  #include <linux/page-isolation.h>
  #include <linux/hugetlb.h>
  #include <linux/falloc.h>
  #include <linux/sched.h>
  #include <linux/ksm.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/mmu_notifier.h>
  
  #include <asm/tlb.h>
  
  /*
   * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_FREE:
  		return 0;
  	default:
  		/* be safe, default to 1. list exceptions explicitly */
  		return 1;
  	}
  }
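
/*
 * A rough sketch of the caller pattern (illustrative only; the real use
 * is in SYSCALL_DEFINE3(madvise, ...) below, which takes the write lock
 * with the killable variant):
 *
 *	write = madvise_need_mmap_write(behavior);
 *	if (write)
 *		down_write(&current->mm->mmap_sem);
 *	else
 *		down_read(&current->mm->mmap_sem);
 */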
  
  /*
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
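
/*
 * An illustrative sketch of that split: advice applied to an interior
 * sub-range [start, end) leaves up to three vmas, and only the middle
 * one carries the new flags:
 *
 *	before:	[ vm_start ........................... vm_end ]
 *	                    ^start              ^end
 *	after:	[ old flags ][ new flags    ][ old flags ]
 */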
  static long madvise_behavior(struct vm_area_struct *vma,
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	int error = 0;
  	pgoff_t pgoff;
  	unsigned long new_flags = vma->vm_flags;
  
  	switch (behavior) {
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
  	case MADV_SEQUENTIAL:
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
  		break;
  	case MADV_RANDOM:
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
  		break;
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTCOPY;
  		break;
  	case MADV_DONTDUMP:
  		new_flags |= VM_DONTDUMP;
  		break;
  	case MADV_DODUMP:
  		if (new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
  		break;
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out;
  		break;
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  		error = hugepage_madvise(vma, &new_flags, behavior);
  		if (error)
  			goto out;
  		break;
  	}
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
  		goto out;
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  			  vma->vm_file, pgoff, vma_policy(vma),
  			  vma->vm_userfaultfd_ctx);
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
  
  	if (start != vma->vm_start) {
  		error = split_vma(mm, vma, start, 1);
  		if (error)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		error = split_vma(mm, vma, end, 0);
  		if (error)
  			goto out;
  	}
  success:
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
  	vma->vm_flags = new_flags;
  
  out:
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  	return error;
  }
  #ifdef CONFIG_SWAP
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	pte_t *orig_pte;
  	struct vm_area_struct *vma = walk->private;
  	unsigned long index;
  
  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;
  
  	for (index = start; index != end; index += PAGE_SIZE) {
  		pte_t pte;
  		swp_entry_t entry;
  		struct page *page;
  		spinlock_t *ptl;
  
  		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
  		pte_unmap_unlock(orig_pte, ptl);
  		if (pte_present(pte) || pte_none(pte))
  			continue;
  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;
  
  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
  								vma, index);
  		if (page)
  			put_page(page);
  	}
  
  	return 0;
  }
  
  static void force_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end)
  {
  	struct mm_walk walk = {
  		.mm = vma->vm_mm,
  		.pmd_entry = swapin_walk_pmd_entry,
  		.private = vma,
  	};
  
  	walk_page_range(start, end, &walk);
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	pgoff_t index;
  	struct page *page;
  	swp_entry_t swap;
  
  	for (; start < end; start += PAGE_SIZE) {
  		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  		page = find_get_entry(mapping, index);
  		if (!radix_tree_exceptional_entry(page)) {
  			if (page)
  				put_page(page);
  			continue;
  		}
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
  								NULL, 0);
  		if (page)
  			put_page(page);
  	}
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
  #ifdef CONFIG_SWAP
  	if (!file) {
  		*prev = vma;
  		force_swapin_readahead(vma, start, end);
  		return 0;
  	}

  	if (shmem_mapping(file->f_mapping)) {
  		*prev = vma;
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
  	if (!file)
  		return -EBADF;
  #endif

  	if (IS_DAX(file_inode(file))) {
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
  	*prev = vma;
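	/*
	 * Worked example of the offset conversion below (assuming 4K
	 * pages, i.e. PAGE_SHIFT == 12, with hypothetical values): for
	 * vm_start == 0x10000 and vm_pgoff == 100, start == 0x12000
	 * becomes page cache offset ((0x12000 - 0x10000) >> 12) + 100,
	 * i.e. 102.
	 */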
  	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	force_page_cache_readahead(file->f_mapping, file, start, end - start);
  	return 0;
  }
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
  	struct mmu_gather *tlb = walk->private;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	spinlock_t *ptl;
  	pte_t *orig_pte, *pte, ptent;
  	struct page *page;
  	int nr_swap = 0;
  	unsigned long next;
  
  	next = pmd_addr_end(addr, end);
  	if (pmd_trans_huge(*pmd))
  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
  			goto next;

  	if (pmd_trans_unstable(pmd))
  		return 0;
  
  	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  	arch_enter_lazy_mmu_mode();
  	for (; addr != end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  		if (pte_none(ptent))
  			continue;
		/*
		 * If the pte holds a swap entry, just clear the page
		 * table entry: swapping the page back in would be more
		 * expensive than (page allocation + zeroing).
		 */
  		if (!pte_present(ptent)) {
  			swp_entry_t entry;
  
  			entry = pte_to_swp_entry(ptent);
  			if (non_swap_entry(entry))
  				continue;
  			nr_swap--;
  			free_swap_and_cache(entry);
  			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
  			continue;
  		}
  
  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
		/*
		 * If the pmd isn't transhuge but the page is a THP and
		 * is owned by only this process, split it and
		 * deactivate all its pages.
		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				goto out;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				goto out;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				goto out;
  			}
  			put_page(page);
  			unlock_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (PageSwapCache(page) || PageDirty(page)) {
  			if (!trylock_page(page))
  				continue;
			/*
			 * If the page is shared with others, we cannot
			 * clear its PG_dirty bit.
			 */
  			if (page_mapcount(page) != 1) {
  				unlock_page(page);
  				continue;
  			}
  
  			if (PageSwapCache(page) && !try_to_free_swap(page)) {
  				unlock_page(page);
  				continue;
  			}
  
  			ClearPageDirty(page);
  			unlock_page(page);
  		}
  
  		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the
			 * TLB from set_pte_at() and tlb_remove_tlb_entry(),
			 * so for portability, re-install the pte as old
			 * and clean after clearing it.
			 */
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  
  			ptent = pte_mkold(ptent);
  			ptent = pte_mkclean(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			if (PageActive(page))
  				deactivate_page(page);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  	}
  out:
  	if (nr_swap) {
  		if (current->mm == mm)
  			sync_mm_rss(mm);
  
  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
  	}
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	cond_resched();
  next:
  	return 0;
  }
  
  static void madvise_free_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct mm_walk free_walk = {
  		.pmd_entry = madvise_free_pte_range,
  		.mm = vma->vm_mm,
  		.private = tlb,
  	};
  
  	tlb_start_vma(tlb, vma);
  	walk_page_range(addr, end, &free_walk);
  	tlb_end_vma(tlb, vma);
  }
  
  static int madvise_free_single_vma(struct vm_area_struct *vma,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	unsigned long start, end;
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
  		return -EINVAL;
  
	/* MADV_FREE only works for anonymous vmas at the moment */
  	if (!vma_is_anonymous(vma))
  		return -EINVAL;
  
  	start = max(vma->vm_start, start_addr);
  	if (start >= vma->vm_end)
  		return -EINVAL;
  	end = min(vma->vm_end, end_addr);
  	if (end <= vma->vm_start)
  		return -EINVAL;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start, end);
  	update_hiwater_rss(mm);
  
  	mmu_notifier_invalidate_range_start(mm, start, end);
  	madvise_free_page_range(&tlb, vma, start, end);
  	mmu_notifier_invalidate_range_end(mm, start, end);
  	tlb_finish_mmu(&tlb, start, end);
  
  	return 0;
  }
  
  static long madvise_free(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end)
  {
  	*prev = vma;
  	return madvise_free_single_vma(vma, start, end);
  }
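
/*
 * Illustrative userspace semantics (a sketch, not a normative statement):
 * after
 *
 *	madvise(buf, len, MADV_FREE);
 *
 * the kernel may reclaim the range's clean anonymous pages without
 * writing them to swap; a later store to a page cancels its lazy-free
 * state, while a load from a page that was already reclaimed observes
 * zero-filled memory.
 */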
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
   * zap_page_range call sets things up for shrink_active_list to actually free
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
   * shrink_active_list to pick up before reclaiming other pages.
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
  static long madvise_dontneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end)
  {
  	*prev = vma;
  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
  		return -EINVAL;
  	zap_page_range(vma, start, end - start, NULL);
  	return 0;
  }
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
   */
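/*
 * An equivalent-operation sketch: for a shared writable file mapping,
 * this behaves much like punching a hole in the backing file from
 * userspace:
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, length);
 */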
  static long madvise_remove(struct vm_area_struct *vma,
  				struct vm_area_struct **prev,
  				unsigned long start, unsigned long end)
  {
  	loff_t offset;
  	int error;
  	struct file *f;

  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

  	if (vma->vm_flags & VM_LOCKED)
  		return -EINVAL;
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
  	}
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
  	up_read(&current->mm->mmap_sem);
  	error = vfs_fallocate(f,
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
  	fput(f);
  	down_read(&current->mm->mmap_sem);
  	return error;
  }
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
  static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
  {
  	struct page *p;
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
  	for (; start < end; start += PAGE_SIZE <<
  				compound_order(compound_head(p))) {
  		int ret;
  
  		ret = get_user_pages_fast(start, 1, 0, &p);
  		if (ret != 1)
  			return ret;

  		if (PageHWPoison(p)) {
  			put_page(p);
  			continue;
  		}
  		if (bhv == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining page %#lx at %#lx\n",
  				page_to_pfn(p), start);
  			ret = soft_offline_page(p, MF_COUNT_INCREASED);
  			if (ret)
  				return ret;
  			continue;
  		}
		pr_info("Injecting memory failure for page %#lx at %#lx\n",
  		       page_to_pfn(p), start);
  		ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
  		if (ret)
  			return ret;
  	}
  	return 0;
  }
  #endif
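
/*
 * Illustrative test usage (a sketch; both forms require CAP_SYS_ADMIN
 * and CONFIG_MEMORY_FAILURE):
 *
 *	madvise(page, pagesize, MADV_HWPOISON);      injects a memory failure
 *	madvise(page, pagesize, MADV_SOFT_OFFLINE);  migrates, then offlines
 */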
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  		return madvise_remove(vma, prev, start, end);
  	case MADV_WILLNEED:
  		return madvise_willneed(vma, prev, start, end);
  	case MADV_FREE:
		/*
		 * XXX: In this implementation, MADV_FREE works like
		 * MADV_DONTNEED on a swapless system or when swap is full.
		 */
  		if (get_nr_swap_pages() > 0)
  			return madvise_free(vma, prev, start, end);
		/* fall through */
  	case MADV_DONTNEED:
  		return madvise_dontneed(vma, prev, start, end);
  	default:
  		return madvise_behavior(vma, prev, start, end, behavior);
  	}
  }
  static bool
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_FREE:
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  #endif
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
  		return true;
  
  	default:
  		return false;
  	}
  }

  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
   *  MADV_FREE - the application marks pages in the given range as lazy free,
   *		where actual purges are postponed until memory pressure happens.
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
   *  MADV_HWPOISON - trigger memory error handler as if the given memory range
   *		were corrupted by unrecoverable hardware memory failure.
   *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
   *  MADV_HUGEPAGE - the application wants to back the given range by transparent
   *		huge pages in the future. Existing pages might be coalesced and
   *		new pages might be allocated as THP.
   *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
   *		transparent huge pages so the existing pages will not be
   *		coalesced into THP and new pages will not be allocated as THP.
   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
   *		from being included in its core dump.
   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
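/*
 * Illustrative userspace usage (a sketch; "buf" and "len" are
 * hypothetical):
 *
 *	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_SEQUENTIAL);	- expect a linear scan
 *	...
 *	madvise(buf, len, MADV_DONTNEED);	- contents may now be dropped
 */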
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	unsigned long end, tmp;
  	struct vm_area_struct *vma, *prev;
  	int unmapped_error = 0;
  	int error = -EINVAL;
  	int write;
  	size_t len;
  	struct blk_plug plug;

  #ifdef CONFIG_MEMORY_FAILURE
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_hwpoison(behavior, start, start+len_in);
  #endif
  	if (!madvise_behavior_valid(behavior))
  		return error;
  	if (start & ~PAGE_MASK)
  		return error;
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
  
  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return error;
  
  	end = start + len;
  	if (end < start)
  		return error;
  
  	error = 0;
  	if (end == start)
  		return error;
  
  	write = madvise_need_mmap_write(behavior);
  	if (write) {
  		if (down_write_killable(&current->mm->mmap_sem))
  			return -EINTR;
  	} else {
  		down_read(&current->mm->mmap_sem);
  	}
  
  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;
  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			goto out;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				goto out;
  		}
  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		tmp = vma->vm_end;
  		if (end < tmp)
  			tmp = end;

  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, tmp, behavior);
  		if (error)
  			goto out;
  		start = tmp;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;
  		error = unmapped_error;
  		if (start >= end)
  			goto out;
  		if (prev)
  			vma = prev->vm_next;
  		else	/* madvise_remove dropped mmap_sem */
  			vma = find_vma(current->mm, start);
  	}
  out:
  	blk_finish_plug(&plug);
  	if (write)
  		up_write(&current->mm->mmap_sem);
  	else
  		up_read(&current->mm->mmap_sem);
  	return error;
  }