mm/madvise.c

  // SPDX-License-Identifier: GPL-2.0
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
  #include <linux/mempolicy.h>
  #include <linux/page-isolation.h>
  #include <linux/page_idle.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/hugetlb.h>
  #include <linux/falloc.h>
  #include <linux/fadvise.h>
  #include <linux/sched.h>
  #include <linux/ksm.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
  #include <linux/pagewalk.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/shmem_fs.h>
  #include <linux/mmu_notifier.h>
  
  #include <asm/tlb.h>

  #include "internal.h"
  struct madvise_walk_private {
  	struct mmu_gather *tlb;
  	bool pageout;
  };
  /*
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_sem for writing. Others, which simply traverse vmas, need
   * to only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  	case MADV_FREE:
  		return 0;
  	default:
  		/* be safe, default to 1. list exceptions explicitly */
  		return 1;
  	}
  }
  
  /*
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
  static long madvise_behavior(struct vm_area_struct *vma,
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	int error = 0;
  	pgoff_t pgoff;
  	unsigned long new_flags = vma->vm_flags;
  
  	switch (behavior) {
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
  	case MADV_SEQUENTIAL:
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
  		break;
  	case MADV_RANDOM:
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
  		break;
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTCOPY;
  		break;
  	case MADV_WIPEONFORK:
  		/* MADV_WIPEONFORK is only supported on anonymous memory. */
  		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags |= VM_WIPEONFORK;
  		break;
  	case MADV_KEEPONFORK:
  		new_flags &= ~VM_WIPEONFORK;
  		break;
  	case MADV_DONTDUMP:
  		new_flags |= VM_DONTDUMP;
  		break;
  	case MADV_DODUMP:
  		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
  		break;
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out_convert_errno;
  		break;
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  		error = hugepage_madvise(vma, &new_flags, behavior);
  		if (error)
  			goto out_convert_errno;
  		break;
  	}
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
  		goto out;
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  			  vma->vm_file, pgoff, vma_policy(vma),
  			  vma->vm_userfaultfd_ctx);
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
  
  	if (start != vma->vm_start) {
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
  			goto out;
  		}
  		error = __split_vma(mm, vma, start, 1);
  		if (error)
  			goto out_convert_errno;
  	}
  
  	if (end != vma->vm_end) {
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
  			goto out;
  		}
  		error = __split_vma(mm, vma, end, 0);
  		if (error)
  			goto out_convert_errno;
  	}
  success:
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
  	vma->vm_flags = new_flags;
  
  out_convert_errno:
  	/*
  	 * madvise() returns EAGAIN if kernel resources, such as
  	 * slab, are temporarily unavailable.
  	 */
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  out:
  	return error;
  }
  #ifdef CONFIG_SWAP
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	pte_t *orig_pte;
  	struct vm_area_struct *vma = walk->private;
  	unsigned long index;
  
  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;
  
  	for (index = start; index != end; index += PAGE_SIZE) {
  		pte_t pte;
  		swp_entry_t entry;
  		struct page *page;
  		spinlock_t *ptl;
  
  		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
  		pte_unmap_unlock(orig_pte, ptl);
  		if (pte_present(pte) || pte_none(pte))
  			continue;
  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;
  
  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
  							vma, index, false);
  		if (page)
  			put_page(page);
  	}
  
  	return 0;
  }
  static const struct mm_walk_ops swapin_walk_ops = {
  	.pmd_entry		= swapin_walk_pmd_entry,
  };
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	pgoff_t index;
  	struct page *page;
  	swp_entry_t swap;
  
  	for (; start < end; start += PAGE_SIZE) {
  		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  		page = find_get_entry(mapping, index);
  		if (!xa_is_value(page)) {
  			if (page)
  				put_page(page);
  			continue;
  		}
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
  							NULL, 0, false);
  		if (page)
  			put_page(page);
  	}
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
  	loff_t offset;

  	*prev = vma;
  #ifdef CONFIG_SWAP
  	if (!file) {
  		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
  		lru_add_drain(); /* Push any new pages onto the LRU now */
  		return 0;
  	}

  	if (shmem_mapping(file->f_mapping)) {
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
  	if (!file)
  		return -EBADF;
  #endif

  	if (IS_DAX(file_inode(file))) {
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
  	/*
  	 * Filesystem's fadvise may need to take various locks.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
  	get_file(file);
  	up_read(&current->mm->mmap_sem);
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
  	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
  	fput(file);
  	down_read(&current->mm->mmap_sem);
  	return 0;
  }
  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
  {
  	struct madvise_walk_private *private = walk->private;
  	struct mmu_gather *tlb = private->tlb;
  	bool pageout = private->pageout;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	pte_t *orig_pte, *pte, ptent;
  	spinlock_t *ptl;
  	struct page *page = NULL;
  	LIST_HEAD(page_list);
  
  	if (fatal_signal_pending(current))
  		return -EINTR;
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	if (pmd_trans_huge(*pmd)) {
  		pmd_t orig_pmd;
  		unsigned long next = pmd_addr_end(addr, end);
  
  		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  		ptl = pmd_trans_huge_lock(pmd, vma);
  		if (!ptl)
  			return 0;
  
  		orig_pmd = *pmd;
  		if (is_huge_zero_pmd(orig_pmd))
  			goto huge_unlock;
  
  		if (unlikely(!pmd_present(orig_pmd))) {
  			VM_BUG_ON(thp_migration_supported() &&
  					!is_pmd_migration_entry(orig_pmd));
  			goto huge_unlock;
  		}
  
  		page = pmd_page(orig_pmd);
  		if (next - addr != HPAGE_PMD_SIZE) {
  			int err;
  
  			if (page_mapcount(page) != 1)
  				goto huge_unlock;
  
  			get_page(page);
  			spin_unlock(ptl);
  			lock_page(page);
  			err = split_huge_page(page);
  			unlock_page(page);
  			put_page(page);
  			if (!err)
  				goto regular_page;
  			return 0;
  		}
  
  		if (pmd_young(orig_pmd)) {
  			pmdp_invalidate(vma, addr, pmd);
  			orig_pmd = pmd_mkold(orig_pmd);
  
  			set_pmd_at(mm, addr, pmd, orig_pmd);
  			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  		}
  		ClearPageReferenced(page);
  		test_and_clear_page_young(page);
  		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  huge_unlock:
  		spin_unlock(ptl);
  		if (pageout)
  			reclaim_pages(&page_list);
  		return 0;
  	}
  
  	if (pmd_trans_unstable(pmd))
  		return 0;
  regular_page:
  #endif
  	tlb_change_page_size(tlb, PAGE_SIZE);
  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr < end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  
  		if (pte_none(ptent))
  			continue;
  
  		if (!pte_present(ptent))
  			continue;
  
  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
  		/*
  		 * Creating a THP page is expensive, so split it only if we
  		 * are sure it's worth it. Split it if we are the only owner.
  		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				break;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				break;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				break;
  			}
  			unlock_page(page);
  			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (pte_young(ptent)) {
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  			ptent = pte_mkold(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  
  		/*
  		 * We are deactivating a page to accelerate its reclaim.
  		 * The VM can't reclaim the page unless we clear PG_young.
  		 * As a side effect, this confuses idle-page tracking,
  		 * which will miss the page's recent reference history.
  		 */
  		ClearPageReferenced(page);
  		test_and_clear_page_young(page);
  		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  	}
  
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	if (pageout)
  		reclaim_pages(&page_list);
  	cond_resched();
  
  	return 0;
  }
  
  static const struct mm_walk_ops cold_walk_ops = {
  	.pmd_entry = madvise_cold_or_pageout_pte_range,
  };
  
  static void madvise_cold_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = false,
  		.tlb = tlb,
  	};
  	tlb_start_vma(tlb, vma);
  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static long madvise_cold(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
  	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb, start_addr, end_addr);
  
  	return 0;
  }
  static void madvise_pageout_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = true,
  		.tlb = tlb,
  	};
  	tlb_start_vma(tlb, vma);
  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static inline bool can_do_pageout(struct vm_area_struct *vma)
  {
  	if (vma_is_anonymous(vma))
  		return true;
  	if (!vma->vm_file)
  		return false;
  	/*
  	 * Page out the pagecache only for non-anonymous mappings that
  	 * correspond to files the calling process could (if it tried) open
  	 * for writing; otherwise we'd be including shared non-exclusive
  	 * mappings, which opens a side channel.
  	 */
  	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
  		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
  }
  
  static long madvise_pageout(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	if (!can_do_pageout(vma))
  		return 0;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
  	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb, start_addr, end_addr);
  
  	return 0;
  }
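
  /*
   * Illustrative only, not part of the kernel source: a minimal userspace
   * sketch of how the two hints implemented above might be used together.
   * MADV_COLD only deactivates the pages (cheap; reclaim happens later
   * under memory pressure), while MADV_PAGEOUT reclaims them immediately.
   * "buf" and "len" are hypothetical.
   *
   *	madvise(buf, len, MADV_COLD);	  // mark as reclaim candidates
   *	madvise(buf, len, MADV_PAGEOUT);  // or: reclaim them right away
   */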
  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
  				unsigned long end, struct mm_walk *walk)
  
  {
  	struct mmu_gather *tlb = walk->private;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	spinlock_t *ptl;
  	pte_t *orig_pte, *pte, ptent;
  	struct page *page;
  	int nr_swap = 0;
  	unsigned long next;
  
  	next = pmd_addr_end(addr, end);
  	if (pmd_trans_huge(*pmd))
  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
  			goto next;

  	if (pmd_trans_unstable(pmd))
  		return 0;
  	tlb_change_page_size(tlb, PAGE_SIZE);
  	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr != end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  		if (pte_none(ptent))
  			continue;
  		/*
  		 * If the pte holds a swap entry, just clear the page table
  		 * entry to prevent swap-in, which is more expensive than
  		 * (page allocation + zeroing).
  		 */
  		if (!pte_present(ptent)) {
  			swp_entry_t entry;
  
  			entry = pte_to_swp_entry(ptent);
  			if (non_swap_entry(entry))
  				continue;
  			nr_swap--;
  			free_swap_and_cache(entry);
  			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
  			continue;
  		}

  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
  		/*
  		 * If pmd isn't transhuge but the page is THP and
  		 * is owned by only this process, split it and
  		 * deactivate all pages.
  		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				goto out;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				goto out;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				goto out;
  			}
  			unlock_page(page);
  			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (PageSwapCache(page) || PageDirty(page)) {
  			if (!trylock_page(page))
  				continue;
  			/*
  			 * If the page is shared with others, we can't clear
  			 * its PG_dirty flag.
  			 */
  			if (page_mapcount(page) != 1) {
  				unlock_page(page);
  				continue;
  			}
  
  			if (PageSwapCache(page) && !try_to_free_swap(page)) {
  				unlock_page(page);
  				continue;
  			}
  
  			ClearPageDirty(page);
  			unlock_page(page);
  		}
  
  		if (pte_young(ptent) || pte_dirty(ptent)) {
  			/*
  			 * Some architectures (e.g., PPC) don't update the TLB
  			 * with set_pte_at and tlb_remove_tlb_entry, so for
  			 * portability, remap the pte as old|clean after
  			 * clearing it.
  			 */
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  
  			ptent = pte_mkold(ptent);
  			ptent = pte_mkclean(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  		mark_page_lazyfree(page);
  	}
  out:
  	if (nr_swap) {
  		if (current->mm == mm)
  			sync_mm_rss(mm);
  
  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
  	}
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	cond_resched();
  next:
  	return 0;
  }
  static const struct mm_walk_ops madvise_free_walk_ops = {
  	.pmd_entry		= madvise_free_pte_range,
  };
  
  static int madvise_free_single_vma(struct vm_area_struct *vma,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_notifier_range range;
  	struct mmu_gather tlb;
  	/* MADV_FREE works for only anon vma at the moment */
  	if (!vma_is_anonymous(vma))
  		return -EINVAL;
  	range.start = max(vma->vm_start, start_addr);
  	if (range.start >= vma->vm_end)
  		return -EINVAL;
  	range.end = min(vma->vm_end, end_addr);
  	if (range.end <= vma->vm_start)
  		return -EINVAL;
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
  				range.start, range.end);
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, range.start, range.end);
  	update_hiwater_rss(mm);
  	mmu_notifier_invalidate_range_start(&range);
  	tlb_start_vma(&tlb, vma);
  	walk_page_range(vma->vm_mm, range.start, range.end,
  			&madvise_free_walk_ops, &tlb);
  	tlb_end_vma(&tlb, vma);
  	mmu_notifier_invalidate_range_end(&range);
  	tlb_finish_mmu(&tlb, range.start, range.end);
  
  	return 0;
  }
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
   * zap_page_range call sets things up for shrink_active_list to actually free
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
   * shrink_active_list to pick up before reclaiming other pages.
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
  					unsigned long start, unsigned long end)
  {
  	zap_page_range(vma, start, end - start);
  	return 0;
  }
  
  static long madvise_dontneed_free(struct vm_area_struct *vma,
  				  struct vm_area_struct **prev,
  				  unsigned long start, unsigned long end,
  				  int behavior)
  {
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  	if (!userfaultfd_remove(vma, start, end)) {
  		*prev = NULL; /* mmap_sem has been dropped, prev is stale */
  
  		down_read(&current->mm->mmap_sem);
  		vma = find_vma(current->mm, start);
  		if (!vma)
  			return -ENOMEM;
  		if (start < vma->vm_start) {
  			/*
  			 * This "vma" under revalidation is the one
  			 * with the lowest vma->vm_start where start
  			 * is also < vma->vm_end. If start <
  			 * vma->vm_start it means a hole materialized
  			 * in the user address space within the
  			 * virtual range passed to MADV_DONTNEED
  			 * or MADV_FREE.
  			 */
  			return -ENOMEM;
  		}
  		if (!can_madv_lru_vma(vma))
  			return -EINVAL;
  		if (end > vma->vm_end) {
  			/*
  			 * Don't fail if end > vma->vm_end. If the old
  			 * vma was split while the mmap_sem was
  			 * released the effect of the concurrent
  			 * operation may not cause madvise() to
  			 * have an undefined result. There may be an
  			 * adjacent next vma that we'll walk
  			 * next. userfaultfd_remove() will generate an
  			 * UFFD_EVENT_REMOVE repetition on the
  			 * end-vma->vm_end range, but the manager can
  			 * handle a repetition fine.
  			 */
  			end = vma->vm_end;
  		}
  		VM_WARN_ON(start >= end);
  	}
  
  	if (behavior == MADV_DONTNEED)
  		return madvise_dontneed_single_vma(vma, start, end);
  	else if (behavior == MADV_FREE)
  		return madvise_free_single_vma(vma, start, end);
  	else
  		return -EINVAL;
  }
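
  /*
   * Illustrative only, not part of the kernel source: a minimal userspace
   * sketch of the MADV_DONTNEED semantics implemented above. The dirty
   * anonymous page is discarded, and the next access faults in a fresh
   * zero-filled page. The 4096 length assumes a 4 KiB page size; all other
   * identifiers are standard libc/syscall names.
   *
   *	#include <sys/mman.h>
   *	#include <assert.h>
   *
   *	int main(void)
   *	{
   *		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
   *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   *		assert(p != MAP_FAILED);
   *		p[0] = 42;				// dirty the page
   *		assert(madvise(p, 4096, MADV_DONTNEED) == 0);
   *		assert(p[0] == 0);			// data was discarded
   *		return 0;
   *	}
   */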
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
   */
  static long madvise_remove(struct vm_area_struct *vma,
  				struct vm_area_struct **prev,
  				unsigned long start, unsigned long end)
  {
  	loff_t offset;
  	int error;
  	struct file *f;

  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

  	if (vma->vm_flags & VM_LOCKED)
  		return -EINVAL;
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
  		return -EINVAL;
  	}
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
  	if (userfaultfd_remove(vma, start, end)) {
  		/* mmap_sem was not released by userfaultfd_remove() */
  		up_read(&current->mm->mmap_sem);
  	}
  	error = vfs_fallocate(f,
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
  	fput(f);
  	down_read(&current->mm->mmap_sem);
  	return error;
  }
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
  static int madvise_inject_error(int behavior,
  		unsigned long start, unsigned long end)
  {
  	struct page *page;
  	struct zone *zone;
  	unsigned int order;

  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;

  
  	for (; start < end; start += PAGE_SIZE << order) {
  		unsigned long pfn;
  		int ret;
  		ret = get_user_pages_fast(start, 1, 0, &page);
  		if (ret != 1)
  			return ret;
  		pfn = page_to_pfn(page);

  		/*
  		 * When soft offlining hugepages, after migrating the page
  		 * we dissolve it, therefore in the second loop "page" will
  		 * no longer be a compound page, and order will be 0.
  		 */
  		order = compound_order(compound_head(page));
  		if (PageHWPoison(page)) {
  			put_page(page);
  			continue;
  		}
  
  		if (behavior == MADV_SOFT_OFFLINE) {
  			pr_info("Soft offlining pfn %#lx at process virtual address %#lx
  ",
  					pfn, start);
  
  			ret = soft_offline_page(page, MF_COUNT_INCREASED);
  			if (ret)
  				return ret;
  			continue;
  		}

  		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx
  ",
  				pfn, start);

  		/*
  		 * Drop the page reference taken by get_user_pages_fast(). In
  		 * the absence of MF_COUNT_INCREASED the memory_failure()
  		 * routine is responsible for pinning the page to prevent it
  		 * from being released back to the page allocator.
  		 */
  		put_page(page);
  		ret = memory_failure(pfn, 0);
  		if (ret)
  			return ret;
  	}
  
  	/* Ensure that all poisoned pages are removed from per-cpu lists */
  	for_each_populated_zone(zone)
  		drain_all_pages(zone);
  	return 0;
  }
  #endif
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  		return madvise_remove(vma, prev, start, end);
  	case MADV_WILLNEED:
  		return madvise_willneed(vma, prev, start, end);
  	case MADV_COLD:
  		return madvise_cold(vma, prev, start, end);
  	case MADV_PAGEOUT:
  		return madvise_pageout(vma, prev, start, end);
  	case MADV_FREE:
  	case MADV_DONTNEED:
  		return madvise_dontneed_free(vma, prev, start, end, behavior);
  	default:
  		return madvise_behavior(vma, prev, start, end, behavior);
  	}
  }
  static bool
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_FREE:
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  #endif
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
  	case MADV_WIPEONFORK:
  	case MADV_KEEPONFORK:
  #ifdef CONFIG_MEMORY_FAILURE
  	case MADV_SOFT_OFFLINE:
  	case MADV_HWPOISON:
  #endif
  		return true;
  
  	default:
  		return false;
  	}
  }

  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the
   *		application will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
   *  MADV_FREE - the application marks pages in the given range as lazy free,
   *		where actual purges are postponed until memory pressure happens.
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
   *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
   *              range after a fork.
   *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
   *  MADV_HWPOISON - trigger memory error handler as if the given memory range
   *		were corrupted by unrecoverable hardware memory failure.
   *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
   *  MADV_HUGEPAGE - the application wants to back the given range by transparent
   *		huge pages in the future. Existing pages might be coalesced and
   *		new pages might be allocated as THP.
   *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
   *		transparent huge pages so the existing pages will not be
   *		coalesced into THP and new pages will not be allocated as THP.
   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
   *		from being included in its core dump.
   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages,
   *		or the specified address range includes file, Huge TLB,
   *		MAP_SHARED or VM_PFNMAP range.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	unsigned long end, tmp;
  	struct vm_area_struct *vma, *prev;
  	int unmapped_error = 0;
  	int error = -EINVAL;
  	int write;
  	size_t len;
  	struct blk_plug plug;

  	start = untagged_addr(start);
  	if (!madvise_behavior_valid(behavior))
  		return error;
  	if (start & ~PAGE_MASK)
  		return error;
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
  
  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return error;
  
  	end = start + len;
  	if (end < start)
  		return error;
  
  	error = 0;
  	if (end == start)
  		return error;
  #ifdef CONFIG_MEMORY_FAILURE
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_inject_error(behavior, start, start + len_in);
  #endif
  	write = madvise_need_mmap_write(behavior);
  	if (write) {
  		if (down_write_killable(&current->mm->mmap_sem))
  			return -EINTR;
  	} else {
  		down_read(&current->mm->mmap_sem);
  	}
  
  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the handling in mlock, etc.
  	 */
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;
  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			goto out;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				goto out;
  		}
  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		tmp = vma->vm_end;
  		if (end < tmp)
  			tmp = end;

  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, tmp, behavior);
  		if (error)
  			goto out;
  		start = tmp;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;
  		error = unmapped_error;
  		if (start >= end)
  			goto out;
  		if (prev)
  			vma = prev->vm_next;
  		else	/* madvise_remove dropped mmap_sem */
  			vma = find_vma(current->mm, start);
  	}
  out:
  	blk_finish_plug(&plug);
  	if (write)
  		up_write(&current->mm->mmap_sem);
  	else
  		up_read(&current->mm->mmap_sem);
  	return error;
  }
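
  /*
   * Illustrative only, not part of the kernel source: a hedged userspace
   * sketch of the madvise(2) interface documented above. It maps a file,
   * hints sequential access plus asynchronous readahead, and checks the
   * return value as described in the "return values" list. "some_file" is
   * a hypothetical path; everything else is standard POSIX.
   *
   *	#include <sys/mman.h>
   *	#include <sys/stat.h>
   *	#include <fcntl.h>
   *	#include <stdio.h>
   *
   *	int main(void)
   *	{
   *		struct stat st;
   *		int fd = open("some_file", O_RDONLY);
   *
   *		if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
   *			return 1;
   *		char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
   *		if (p == MAP_FAILED)
   *			return 1;
   *		if (madvise(p, st.st_size, MADV_SEQUENTIAL) ||
   *		    madvise(p, st.st_size, MADV_WILLNEED))
   *			perror("madvise");	// e.g. EINVAL for a bad range
   *		return 0;
   *	}
   */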