mm/madvise.c

  // SPDX-License-Identifier: GPL-2.0
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
  #include <linux/mempolicy.h>
  #include <linux/page-isolation.h>
  #include <linux/page_idle.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/hugetlb.h>
  #include <linux/falloc.h>
  #include <linux/fadvise.h>
  #include <linux/sched.h>
  #include <linux/ksm.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
  #include <linux/pagewalk.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/shmem_fs.h>
  #include <linux/mmu_notifier.h>
  
  #include <asm/tlb.h>

  #include "internal.h"
  struct madvise_walk_private {
  	struct mmu_gather *tlb;
  	bool pageout;
  };
  /*
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_sem for writing. Others, which simply traverse vmas, need
 * only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  	case MADV_FREE:
  		return 0;
  	default:
  		/* be safe, default to 1. list exceptions explicitly */
  		return 1;
  	}
  }
  
  /*
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
  static long madvise_behavior(struct vm_area_struct *vma,
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	int error = 0;
  	pgoff_t pgoff;
  	unsigned long new_flags = vma->vm_flags;
  
  	switch (behavior) {
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
  	case MADV_SEQUENTIAL:
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
  		break;
  	case MADV_RANDOM:
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
  		break;
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTCOPY;
  		break;
  	case MADV_WIPEONFORK:
  		/* MADV_WIPEONFORK is only supported on anonymous memory. */
  		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags |= VM_WIPEONFORK;
  		break;
  	case MADV_KEEPONFORK:
  		new_flags &= ~VM_WIPEONFORK;
  		break;
  	case MADV_DONTDUMP:
  		new_flags |= VM_DONTDUMP;
  		break;
  	case MADV_DODUMP:
  		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
  		break;
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out_convert_errno;
  		break;
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  		error = hugepage_madvise(vma, &new_flags, behavior);
  		if (error)
  			goto out_convert_errno;
  		break;
  	}
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
  		goto out;
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  			  vma->vm_file, pgoff, vma_policy(vma),
  			  vma->vm_userfaultfd_ctx);
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
  
  	if (start != vma->vm_start) {
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
  			goto out;
  		}
  		error = __split_vma(mm, vma, start, 1);
  		if (error)
  			goto out_convert_errno;
  	}
  
  	if (end != vma->vm_end) {
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
  			goto out;
  		}
  		error = __split_vma(mm, vma, end, 0);
  		if (error)
  			goto out_convert_errno;
  	}
  success:
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
  	vma->vm_flags = new_flags;
  
  out_convert_errno:
  	/*
  	 * madvise() returns EAGAIN if kernel resources, such as
  	 * slab, are temporarily unavailable.
  	 */
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  out:
  	return error;
  }
  #ifdef CONFIG_SWAP
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	pte_t *orig_pte;
  	struct vm_area_struct *vma = walk->private;
  	unsigned long index;
  
  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;
  
  	for (index = start; index != end; index += PAGE_SIZE) {
  		pte_t pte;
  		swp_entry_t entry;
  		struct page *page;
  		spinlock_t *ptl;
  
  		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
  		pte_unmap_unlock(orig_pte, ptl);
  		if (pte_present(pte) || pte_none(pte))
  			continue;
  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;
  
  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
  							vma, index, false);
  		if (page)
  			put_page(page);
  	}
  
  	return 0;
  }
  static const struct mm_walk_ops swapin_walk_ops = {
  	.pmd_entry		= swapin_walk_pmd_entry,
  };
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	pgoff_t index;
  	struct page *page;
  	swp_entry_t swap;
  
  	for (; start < end; start += PAGE_SIZE) {
  		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  		page = find_get_entry(mapping, index);
  		if (!xa_is_value(page)) {
  			if (page)
  				put_page(page);
  			continue;
  		}
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
  							NULL, 0, false);
  		if (page)
  			put_page(page);
  	}
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
  	loff_t offset;

  	*prev = vma;
  #ifdef CONFIG_SWAP
  	if (!file) {
  		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
  		lru_add_drain(); /* Push any new pages onto the LRU now */
  		return 0;
  	}

  	if (shmem_mapping(file->f_mapping)) {
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
  	if (!file)
  		return -EBADF;
  #endif

  	if (IS_DAX(file_inode(file))) {
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
  	/*
  	 * Filesystem's fadvise may need to take various locks.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
  	get_file(file);
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
  	up_read(&current->mm->mmap_sem);
  	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
  	fput(file);
  	down_read(&current->mm->mmap_sem);
  	return 0;
  }
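
/*
 * Illustrative userspace sketch (not part of this file): MADV_WILLNEED
 * schedules readahead asynchronously and returns without waiting, so a
 * caller can overlap I/O with computation. "p", "off", "chunk" and
 * process_chunk() are made-up names for this example.
 *
 *	madvise(p + off + chunk, chunk, MADV_WILLNEED);	// prefetch next
 *	process_chunk(p + off, chunk);			// compute on current
 */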
  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
  {
  	struct madvise_walk_private *private = walk->private;
  	struct mmu_gather *tlb = private->tlb;
  	bool pageout = private->pageout;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	pte_t *orig_pte, *pte, ptent;
  	spinlock_t *ptl;
  	struct page *page = NULL;
  	LIST_HEAD(page_list);
  
  	if (fatal_signal_pending(current))
  		return -EINTR;
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	if (pmd_trans_huge(*pmd)) {
  		pmd_t orig_pmd;
  		unsigned long next = pmd_addr_end(addr, end);
  
  		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  		ptl = pmd_trans_huge_lock(pmd, vma);
  		if (!ptl)
  			return 0;
  
  		orig_pmd = *pmd;
  		if (is_huge_zero_pmd(orig_pmd))
  			goto huge_unlock;
  
  		if (unlikely(!pmd_present(orig_pmd))) {
  			VM_BUG_ON(thp_migration_supported() &&
  					!is_pmd_migration_entry(orig_pmd));
  			goto huge_unlock;
  		}
  
  		page = pmd_page(orig_pmd);
  
  		/* Do not interfere with other mappings of this page */
  		if (page_mapcount(page) != 1)
  			goto huge_unlock;
  		if (next - addr != HPAGE_PMD_SIZE) {
  			int err;
  			get_page(page);
  			spin_unlock(ptl);
  			lock_page(page);
  			err = split_huge_page(page);
  			unlock_page(page);
  			put_page(page);
  			if (!err)
  				goto regular_page;
  			return 0;
  		}
  
  		if (pmd_young(orig_pmd)) {
  			pmdp_invalidate(vma, addr, pmd);
  			orig_pmd = pmd_mkold(orig_pmd);
  
  			set_pmd_at(mm, addr, pmd, orig_pmd);
  			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  		}
  		ClearPageReferenced(page);
  		test_and_clear_page_young(page);
  		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  huge_unlock:
  		spin_unlock(ptl);
  		if (pageout)
  			reclaim_pages(&page_list);
  		return 0;
  	}
  regular_page:
  	if (pmd_trans_unstable(pmd))
  		return 0;
  #endif
  	tlb_change_page_size(tlb, PAGE_SIZE);
  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr < end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  
  		if (pte_none(ptent))
  			continue;
  
  		if (!pte_present(ptent))
  			continue;
  
  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it. Split it if we are the only owner.
		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				break;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				break;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				break;
  			}
  			unlock_page(page);
  			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  		/* Do not interfere with other mappings of this page */
  		if (page_mapcount(page) != 1)
  			continue;
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (pte_young(ptent)) {
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  			ptent = pte_mkold(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  
		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM cannot reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
  		ClearPageReferenced(page);
  		test_and_clear_page_young(page);
  		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  	}
  
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	if (pageout)
  		reclaim_pages(&page_list);
  	cond_resched();
  
  	return 0;
  }
  
  static const struct mm_walk_ops cold_walk_ops = {
  	.pmd_entry = madvise_cold_or_pageout_pte_range,
  };
  
  static void madvise_cold_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = false,
  		.tlb = tlb,
  	};
  	tlb_start_vma(tlb, vma);
  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static long madvise_cold(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
  	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb, start_addr, end_addr);
  
  	return 0;
  }
  static void madvise_pageout_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = true,
  		.tlb = tlb,
  	};
  	tlb_start_vma(tlb, vma);
  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static inline bool can_do_pageout(struct vm_area_struct *vma)
  {
  	if (vma_is_anonymous(vma))
  		return true;
  	if (!vma->vm_file)
  		return false;
	/*
	 * Page out pagecache only for non-anonymous mappings that correspond
	 * to files the calling process could open for writing (if it tried);
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
  	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
  		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
  }
  
  static long madvise_pageout(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	if (!can_do_pageout(vma))
  		return 0;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
  	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb, start_addr, end_addr);
  
  	return 0;
  }
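
/*
 * Illustrative userspace sketch (not part of this file): both hints above
 * are advisory. MADV_COLD only deactivates the range while MADV_PAGEOUT
 * asks for immediate reclaim; either may be a no-op. "buf" and "len"
 * describe a made-up page-aligned region the caller will not touch for a
 * while.
 *
 *	madvise(buf, len, MADV_COLD);		// demote in LRU, keep contents
 *	madvise(buf, len, MADV_PAGEOUT);	// reclaim now (swap or drop)
 */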
  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
  				unsigned long end, struct mm_walk *walk)
  
  {
  	struct mmu_gather *tlb = walk->private;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	spinlock_t *ptl;
  	pte_t *orig_pte, *pte, ptent;
  	struct page *page;
  	int nr_swap = 0;
  	unsigned long next;
  
  	next = pmd_addr_end(addr, end);
  	if (pmd_trans_huge(*pmd))
  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
  			goto next;

  	if (pmd_trans_unstable(pmd))
  		return 0;
  	tlb_change_page_size(tlb, PAGE_SIZE);
  	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr != end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  		if (pte_none(ptent))
  			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * to prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
  		if (!pte_present(ptent)) {
  			swp_entry_t entry;
  
  			entry = pte_to_swp_entry(ptent);
  			if (non_swap_entry(entry))
  				continue;
  			nr_swap--;
  			free_swap_and_cache(entry);
  			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
  			continue;
  		}

  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
		/*
		 * If the pmd isn't transhuge but the page is a THP
		 * owned only by this process, split it and
		 * deactivate all its pages.
		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				goto out;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				goto out;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				goto out;
  			}
  			unlock_page(page);
  			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (PageSwapCache(page) || PageDirty(page)) {
  			if (!trylock_page(page))
  				continue;
			/*
			 * If the page is shared with others, we cannot
			 * clear its PG_dirty bit.
			 */
  			if (page_mapcount(page) != 1) {
  				unlock_page(page);
  				continue;
  			}
  
  			if (PageSwapCache(page) && !try_to_free_swap(page)) {
  				unlock_page(page);
  				continue;
  			}
  
  			ClearPageDirty(page);
  			unlock_page(page);
  		}
  
  		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at and tlb_remove_tlb_entry, so for
			 * portability, remap the pte as old and clean
			 * after clearing it.
			 */
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  
  			ptent = pte_mkold(ptent);
  			ptent = pte_mkclean(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  		mark_page_lazyfree(page);
  	}
  out:
  	if (nr_swap) {
  		if (current->mm == mm)
  			sync_mm_rss(mm);
  
  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
  	}
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	cond_resched();
  next:
  	return 0;
  }
  static const struct mm_walk_ops madvise_free_walk_ops = {
  	.pmd_entry		= madvise_free_pte_range,
  };
  
  static int madvise_free_single_vma(struct vm_area_struct *vma,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_notifier_range range;
  	struct mmu_gather tlb;
  	/* MADV_FREE works for only anon vma at the moment */
  	if (!vma_is_anonymous(vma))
  		return -EINVAL;
  	range.start = max(vma->vm_start, start_addr);
  	if (range.start >= vma->vm_end)
  		return -EINVAL;
  	range.end = min(vma->vm_end, end_addr);
  	if (range.end <= vma->vm_start)
  		return -EINVAL;
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
  				range.start, range.end);
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, range.start, range.end);
  	update_hiwater_rss(mm);
  	mmu_notifier_invalidate_range_start(&range);
  	tlb_start_vma(&tlb, vma);
  	walk_page_range(vma->vm_mm, range.start, range.end,
  			&madvise_free_walk_ops, &tlb);
  	tlb_end_vma(&tlb, vma);
  	mmu_notifier_invalidate_range_end(&range);
  	tlb_finish_mmu(&tlb, range.start, range.end);
  
  	return 0;
  }
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
   * zap_page_range call sets things up for shrink_active_list to actually free
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
   * shrink_active_list to pick up before reclaiming other pages.
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
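
/*
 * Illustrative userspace sketch (not part of this file): on a private
 * anonymous mapping, MADV_DONTNEED discards the contents and the next
 * touch faults in a zero-filled page.
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 42;
 *	madvise(p, 4096, MADV_DONTNEED);
 *	assert(p[0] == 0);	// old data is gone, page reads as zeroes
 */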
  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
  					unsigned long start, unsigned long end)
  {
  	zap_page_range(vma, start, end - start);
  	return 0;
  }
  
  static long madvise_dontneed_free(struct vm_area_struct *vma,
  				  struct vm_area_struct **prev,
  				  unsigned long start, unsigned long end,
  				  int behavior)
  {
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  	if (!userfaultfd_remove(vma, start, end)) {
  		*prev = NULL; /* mmap_sem has been dropped, prev is stale */
  
  		down_read(&current->mm->mmap_sem);
  		vma = find_vma(current->mm, start);
  		if (!vma)
  			return -ENOMEM;
  		if (start < vma->vm_start) {
  			/*
  			 * This "vma" under revalidation is the one
  			 * with the lowest vma->vm_start where start
  			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
  			 * in the user address space within the
  			 * virtual range passed to MADV_DONTNEED
  			 * or MADV_FREE.
  			 */
  			return -ENOMEM;
  		}
  		if (!can_madv_lru_vma(vma))
  			return -EINVAL;
  		if (end > vma->vm_end) {
  			/*
  			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_sem was
  			 * released the effect of the concurrent
  			 * operation may not cause madvise() to
  			 * have an undefined result. There may be an
  			 * adjacent next vma that we'll walk
  			 * next. userfaultfd_remove() will generate an
  			 * UFFD_EVENT_REMOVE repetition on the
  			 * end-vma->vm_end range, but the manager can
  			 * handle a repetition fine.
  			 */
  			end = vma->vm_end;
  		}
  		VM_WARN_ON(start >= end);
  	}
  
  	if (behavior == MADV_DONTNEED)
  		return madvise_dontneed_single_vma(vma, start, end);
  	else if (behavior == MADV_FREE)
  		return madvise_free_single_vma(vma, start, end);
  	else
  		return -EINVAL;
  }
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
   */
  static long madvise_remove(struct vm_area_struct *vma,
  				struct vm_area_struct **prev,
  				unsigned long start, unsigned long end)
  {
  	loff_t offset;
  	int error;
  	struct file *f;

  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

  	if (vma->vm_flags & VM_LOCKED)
  		return -EINVAL;
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
  	}
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
  	if (userfaultfd_remove(vma, start, end)) {
  		/* mmap_sem was not released by userfaultfd_remove() */
  		up_read(&current->mm->mmap_sem);
  	}
  	error = vfs_fallocate(f,
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
  	fput(f);
  	down_read(&current->mm->mmap_sem);
  	return error;
  }
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
  static int madvise_inject_error(int behavior,
  		unsigned long start, unsigned long end)
  {
  	struct page *page;
  	struct zone *zone;
  	unsigned int order;

  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;

  
  	for (; start < end; start += PAGE_SIZE << order) {
  		unsigned long pfn;
  		int ret;
  		ret = get_user_pages_fast(start, 1, 0, &page);
  		if (ret != 1)
  			return ret;
  		pfn = page_to_pfn(page);

  		/*
  		 * When soft offlining hugepages, after migrating the page
  		 * we dissolve it, therefore in the second loop "page" will
  		 * no longer be a compound page, and order will be 0.
  		 */
  		order = compound_order(compound_head(page));
  		if (PageHWPoison(page)) {
  			put_page(page);
  			continue;
  		}
  
  		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
  					pfn, start);
  
  			ret = soft_offline_page(page, MF_COUNT_INCREASED);
  			if (ret)
  				return ret;
  			continue;
  		}

		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
  				pfn, start);

  		/*
  		 * Drop the page reference taken by get_user_pages_fast(). In
  		 * the absence of MF_COUNT_INCREASED the memory_failure()
  		 * routine is responsible for pinning the page to prevent it
  		 * from being released back to the page allocator.
  		 */
  		put_page(page);
  		ret = memory_failure(pfn, 0);
  		if (ret)
  			return ret;
  	}
  
  	/* Ensure that all poisoned pages are removed from per-cpu lists */
  	for_each_populated_zone(zone)
  		drain_all_pages(zone);
  	return 0;
  }
  #endif
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  		return madvise_remove(vma, prev, start, end);
  	case MADV_WILLNEED:
  		return madvise_willneed(vma, prev, start, end);
  	case MADV_COLD:
  		return madvise_cold(vma, prev, start, end);
  	case MADV_PAGEOUT:
  		return madvise_pageout(vma, prev, start, end);
  	case MADV_FREE:
  	case MADV_DONTNEED:
  		return madvise_dontneed_free(vma, prev, start, end, behavior);
  	default:
  		return madvise_behavior(vma, prev, start, end, behavior);
  	}
  }
  static bool
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_FREE:
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  #endif
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
  	case MADV_WIPEONFORK:
  	case MADV_KEEPONFORK:
  #ifdef CONFIG_MEMORY_FAILURE
  	case MADV_SOFT_OFFLINE:
  	case MADV_HWPOISON:
  #endif
  		return true;
  
  	default:
  		return false;
  	}
  }

  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
   *  MADV_FREE - the application marks pages in the given range as lazy free,
   *		where actual purges are postponed until memory pressure happens.
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
   *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
   *              range after a fork.
   *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
   *  MADV_HWPOISON - trigger memory error handler as if the given memory range
   *		were corrupted by unrecoverable hardware memory failure.
   *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
   *  MADV_HUGEPAGE - the application wants to back the given range by transparent
   *		huge pages in the future. Existing pages might be coalesced and
   *		new pages might be allocated as THP.
   *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
   *		transparent huge pages so the existing pages will not be
   *		coalesced into THP and new pages will not be allocated as THP.
   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
   *		from being included in its core dump.
   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages,
   *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
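
/*
 * Illustrative userspace sketch (not part of this file): a single linear
 * scan over a file mapping, bracketed by advice. "data.bin" and "len" are
 * made-up names; the advice only tunes readahead and never changes
 * correctness.
 *
 *	int fd = open("data.bin", O_RDONLY);
 *	char *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *
 *	madvise(p, len, MADV_SEQUENTIAL);	// aggressive readahead
 *	// ... read p[0..len) once, front to back ...
 *	madvise(p, len, MADV_NORMAL);		// restore default readahead
 */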
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	unsigned long end, tmp;
  	struct vm_area_struct *vma, *prev;
  	int unmapped_error = 0;
  	int error = -EINVAL;
  	int write;
  	size_t len;
  	struct blk_plug plug;

  	start = untagged_addr(start);
  	if (!madvise_behavior_valid(behavior))
  		return error;
  	if (start & ~PAGE_MASK)
  		return error;
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
  
  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return error;
  
  	end = start + len;
  	if (end < start)
  		return error;
  
  	error = 0;
  	if (end == start)
  		return error;
  #ifdef CONFIG_MEMORY_FAILURE
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_inject_error(behavior, start, start + len_in);
  #endif
  	write = madvise_need_mmap_write(behavior);
  	if (write) {
  		if (down_write_killable(&current->mm->mmap_sem))
  			return -EINTR;
  	} else {
  		down_read(&current->mm->mmap_sem);
  	}
  
  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;
  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			goto out;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				goto out;
  		}
  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		tmp = vma->vm_end;
  		if (end < tmp)
  			tmp = end;

  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, tmp, behavior);
  		if (error)
  			goto out;
  		start = tmp;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;
  		error = unmapped_error;
  		if (start >= end)
  			goto out;
  		if (prev)
  			vma = prev->vm_next;
  		else	/* madvise_remove dropped mmap_sem */
  			vma = find_vma(current->mm, start);
  	}
  out:
  	blk_finish_plug(&plug);
  	if (write)
  		up_write(&current->mm->mmap_sem);
  	else
  		up_read(&current->mm->mmap_sem);
  	return error;
  }