  // SPDX-License-Identifier: GPL-2.0
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
  #include <linux/mempolicy.h>
  #include <linux/page-isolation.h>
  #include <linux/page_idle.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/hugetlb.h>
  #include <linux/falloc.h>
  #include <linux/fadvise.h>
  #include <linux/sched.h>
  #include <linux/sched/mm.h>
  #include <linux/uio.h>
  #include <linux/ksm.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
  #include <linux/pagewalk.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/shmem_fs.h>
  #include <linux/mmu_notifier.h>
  
  #include <asm/tlb.h>

  #include "internal.h"
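
  /*
   * State shared with the pagewalk callbacks below: MADV_COLD and
   * MADV_PAGEOUT use the same walker, with ->pageout selecting whether
   * pages are merely deactivated (cold) or reclaimed right away.
   */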
  struct madvise_walk_private {
  	struct mmu_gather *tlb;
  	bool pageout;
  };
  /*
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_lock for writing. Others, which simply traverse vmas, need
   * to only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  	case MADV_FREE:
  	case MADV_POPULATE_READ:
  	case MADV_POPULATE_WRITE:
  		return 0;
  	default:
  		/* be safe, default to 1. list exceptions explicitly */
  		return 1;
  	}
  }
  
  /*
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
  static long madvise_behavior(struct vm_area_struct *vma,
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	int error = 0;
  	pgoff_t pgoff;
  	unsigned long new_flags = vma->vm_flags;
  
  	switch (behavior) {
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
  	case MADV_SEQUENTIAL:
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
  		break;
  	case MADV_RANDOM:
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
  		break;
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTCOPY;
  		break;
  	case MADV_WIPEONFORK:
  		/* MADV_WIPEONFORK is only supported on anonymous memory. */
  		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags |= VM_WIPEONFORK;
  		break;
  	case MADV_KEEPONFORK:
  		new_flags &= ~VM_WIPEONFORK;
  		break;
  	case MADV_DONTDUMP:
  		new_flags |= VM_DONTDUMP;
  		break;
  	case MADV_DODUMP:
  		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
  		break;
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
  		if (error)
  			goto out_convert_errno;
  		break;
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  		error = hugepage_madvise(vma, &new_flags, behavior);
  		if (error)
  			goto out_convert_errno;
  		break;
  	}
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
  		goto out;
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  			  vma->vm_file, pgoff, vma_policy(vma),
  			  vma->vm_userfaultfd_ctx);
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
  
  	if (start != vma->vm_start) {
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
  			goto out;
  		}
  		error = __split_vma(mm, vma, start, 1);
  		if (error)
  			goto out_convert_errno;
  	}
  
  	if (end != vma->vm_end) {
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
  			goto out;
  		}
  		error = __split_vma(mm, vma, end, 0);
  		if (error)
  			goto out_convert_errno;
  	}
  success:
  	/*
  	 * vm_flags is protected by the mmap_lock held in write mode.
  	 */
  	vma->vm_flags = new_flags;
  
  out_convert_errno:
  	/*
  	 * madvise() returns EAGAIN if kernel resources, such as
  	 * slab, are temporarily unavailable.
  	 */
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  out:
  	return error;
  }
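
  /*
   * MADV_WILLNEED on anonymous memory: walk the ptes and start
   * asynchronous swap-in, via read_swap_cache_async(), for every swap
   * entry found, so the pages are resident before the application
   * touches them.
   */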
  #ifdef CONFIG_SWAP
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	pte_t *orig_pte;
  	struct vm_area_struct *vma = walk->private;
  	unsigned long index;
  
  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;
  
  	for (index = start; index != end; index += PAGE_SIZE) {
  		pte_t pte;
  		swp_entry_t entry;
  		struct page *page;
  		spinlock_t *ptl;
  
  		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
  		pte_unmap_unlock(orig_pte, ptl);
  		if (pte_present(pte) || pte_none(pte))
  			continue;
  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;
  
  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
  							vma, index, false);
  		if (page)
  			put_page(page);
  	}
  
  	return 0;
  }
  static const struct mm_walk_ops swapin_walk_ops = {
  	.pmd_entry		= swapin_walk_pmd_entry,
  };
  
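  /*
   * MADV_WILLNEED on shmem: scan the mapping's xarray for swap entries
   * (stored as xarray values) and start asynchronous readahead for each,
   * dropping the RCU read lock around the actual swap-in.
   */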
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
  	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
  	struct page *page;

  	rcu_read_lock();
  	xas_for_each(&xas, page, end_index) {
  		swp_entry_t swap;

  		if (!xa_is_value(page))
  			continue;
  		xas_pause(&xas);
  		rcu_read_unlock();
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
  							NULL, 0, false);
  		if (page)
  			put_page(page);
  
  		rcu_read_lock();
  	}
  	rcu_read_unlock();
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct file *file = vma->vm_file;
  	loff_t offset;

  	*prev = vma;
  #ifdef CONFIG_SWAP
  	if (!file) {
  		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
  		lru_add_drain(); /* Push any new pages onto the LRU now */
  		return 0;
  	}

  	if (shmem_mapping(file->f_mapping)) {
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
  	if (!file)
  		return -EBADF;
  #endif

  	if (IS_DAX(file_inode(file))) {
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
  	/*
  	 * Filesystem's fadvise may need to take various locks.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_lock.
  	 */
  	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
  	get_file(file);
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
  	mmap_read_unlock(mm);
  	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
  	fput(file);
  	mmap_read_lock(mm);
  	return 0;
  }
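
  /*
   * Common walker for MADV_COLD and MADV_PAGEOUT: clear the young bit and
   * PG_referenced so that reclaim treats the pages as cold; with
   * ->pageout set, also isolate the pages and hand them to
   * reclaim_pages() immediately. A THP that only partially overlaps the
   * range is split first (when this process is its sole owner).
   */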
  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
  {
  	struct madvise_walk_private *private = walk->private;
  	struct mmu_gather *tlb = private->tlb;
  	bool pageout = private->pageout;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	pte_t *orig_pte, *pte, ptent;
  	spinlock_t *ptl;
  	struct page *page = NULL;
  	LIST_HEAD(page_list);
  
  	if (fatal_signal_pending(current))
  		return -EINTR;
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	if (pmd_trans_huge(*pmd)) {
  		pmd_t orig_pmd;
  		unsigned long next = pmd_addr_end(addr, end);
  
  		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  		ptl = pmd_trans_huge_lock(pmd, vma);
  		if (!ptl)
  			return 0;
  
  		orig_pmd = *pmd;
  		if (is_huge_zero_pmd(orig_pmd))
  			goto huge_unlock;
  
  		if (unlikely(!pmd_present(orig_pmd))) {
  			VM_BUG_ON(thp_migration_supported() &&
  					!is_pmd_migration_entry(orig_pmd));
  			goto huge_unlock;
  		}
  
  		page = pmd_page(orig_pmd);
  
  		/* Do not interfere with other mappings of this page */
  		if (page_mapcount(page) != 1)
  			goto huge_unlock;
  		if (next - addr != HPAGE_PMD_SIZE) {
  			int err;
  			get_page(page);
  			spin_unlock(ptl);
  			lock_page(page);
  			err = split_huge_page(page);
  			unlock_page(page);
  			put_page(page);
  			if (!err)
  				goto regular_page;
  			return 0;
  		}
  
  		if (pmd_young(orig_pmd)) {
  			pmdp_invalidate(vma, addr, pmd);
  			orig_pmd = pmd_mkold(orig_pmd);
  
  			set_pmd_at(mm, addr, pmd, orig_pmd);
  			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  		}
  		ClearPageReferenced(page);
  		test_and_clear_page_young(page);
  		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  huge_unlock:
  		spin_unlock(ptl);
  		if (pageout)
  			reclaim_pages(&page_list);
  		return 0;
  	}
  regular_page:
  	if (pmd_trans_unstable(pmd))
  		return 0;
  #endif
  	tlb_change_page_size(tlb, PAGE_SIZE);
  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr < end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  
  		if (pte_none(ptent))
  			continue;
  
  		if (!pte_present(ptent))
  			continue;
  
  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
  		/*
  		 * Creating a THP page is expensive, so split it only if we
  		 * are sure it's worth it: only when we are the sole owner.
  		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				break;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				break;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				break;
  			}
  			unlock_page(page);
  			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  		/* Do not interfere with other mappings of this page */
  		if (page_mapcount(page) != 1)
  			continue;
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (pte_young(ptent)) {
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  			ptent = pte_mkold(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  
  		/*
  		 * We are deactivating the page to accelerate its reclaim.
  		 * The VM cannot reclaim it unless we clear PG_young. As a
  		 * side effect, this confuses idle-page tracking, which will
  		 * miss the recent reference history.
  		 */
  		ClearPageReferenced(page);
  		test_and_clear_page_young(page);
  		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  	}
  
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	if (pageout)
  		reclaim_pages(&page_list);
  	cond_resched();
  
  	return 0;
  }
  
  static const struct mm_walk_ops cold_walk_ops = {
  	.pmd_entry = madvise_cold_or_pageout_pte_range,
  };
  
  static void madvise_cold_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = false,
  		.tlb = tlb,
  	};
  	tlb_start_vma(tlb, vma);
  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static long madvise_cold(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm);
  	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb);
  
  	return 0;
  }
  static void madvise_pageout_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = true,
  		.tlb = tlb,
  	};
  	tlb_start_vma(tlb, vma);
  	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static inline bool can_do_pageout(struct vm_area_struct *vma)
  {
  	if (vma_is_anonymous(vma))
  		return true;
  	if (!vma->vm_file)
  		return false;
  	/*
  	 * paging out pagecache only for non-anonymous mappings that correspond
  	 * to the files the calling process could (if tried) open for writing;
  	 * otherwise we'd be including shared non-exclusive mappings, which
  	 * opens a side channel.
  	 */
  	return inode_owner_or_capable(&init_user_ns,
  				      file_inode(vma->vm_file)) ||
  	       file_permission(vma->vm_file, MAY_WRITE) == 0;
  }
  
  static long madvise_pageout(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	if (!can_do_pageout(vma))
  		return 0;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm);
  	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb);
  
  	return 0;
  }
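
  /*
   * MADV_FREE walker: free swap entries outright, clear PG_dirty and the
   * pte dirty/young bits, and mark the pages lazyfree so that reclaim
   * may discard them (instead of swapping them out) unless they are
   * written to again first.
   */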
  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
  				unsigned long end, struct mm_walk *walk)
  
  {
  	struct mmu_gather *tlb = walk->private;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	spinlock_t *ptl;
  	pte_t *orig_pte, *pte, ptent;
  	struct page *page;
  	int nr_swap = 0;
  	unsigned long next;
  
  	next = pmd_addr_end(addr, end);
  	if (pmd_trans_huge(*pmd))
  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
  			goto next;

  	if (pmd_trans_unstable(pmd))
  		return 0;
  	tlb_change_page_size(tlb, PAGE_SIZE);
  	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr != end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  		if (pte_none(ptent))
  			continue;
  		/*
  		 * If the pte holds a swap entry, just clear the page table
  		 * entry: swapping the page back in would be more expensive
  		 * than (page allocation + zeroing).
  		 */
  		if (!pte_present(ptent)) {
  			swp_entry_t entry;
  
  			entry = pte_to_swp_entry(ptent);
  			if (non_swap_entry(entry))
  				continue;
  			nr_swap--;
  			free_swap_and_cache(entry);
  			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
  			continue;
  		}

  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
  		/*
  		 * If pmd isn't transhuge but the page is THP and
  		 * is owned by only this process, split it and
  		 * deactivate all pages.
  		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				goto out;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				goto out;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				goto out;
  			}
  			unlock_page(page);
  			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (PageSwapCache(page) || PageDirty(page)) {
  			if (!trylock_page(page))
  				continue;
  			/*
  			 * If the page is shared with others, we cannot clear
  			 * its PG_dirty bit.
  			 */
  			if (page_mapcount(page) != 1) {
  				unlock_page(page);
  				continue;
  			}
  
  			if (PageSwapCache(page) && !try_to_free_swap(page)) {
  				unlock_page(page);
  				continue;
  			}
  
  			ClearPageDirty(page);
  			unlock_page(page);
  		}
  
  		if (pte_young(ptent) || pte_dirty(ptent)) {
  			/*
  			 * Some architectures (e.g. PPC) don't update the TLB
  			 * with set_pte_at and tlb_remove_tlb_entry, so for
  			 * portability, remap the pte as old|clean after
  			 * clearing it.
  			 */
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  
  			ptent = pte_mkold(ptent);
  			ptent = pte_mkclean(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  		mark_page_lazyfree(page);
  	}
  out:
  	if (nr_swap) {
  		if (current->mm == mm)
  			sync_mm_rss(mm);
  
  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
  	}
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	cond_resched();
  next:
  	return 0;
  }
  static const struct mm_walk_ops madvise_free_walk_ops = {
  	.pmd_entry		= madvise_free_pte_range,
  };
  
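  /*
   * Apply MADV_FREE to a single anonymous vma: clamp the range to the
   * vma, set up an mmu_gather and an MMU_NOTIFY_CLEAR notifier range,
   * and run madvise_free_walk_ops over it.
   */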
  static int madvise_free_single_vma(struct vm_area_struct *vma,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_notifier_range range;
  	struct mmu_gather tlb;
  	/* MADV_FREE works for only anon vma at the moment */
  	if (!vma_is_anonymous(vma))
  		return -EINVAL;
  	range.start = max(vma->vm_start, start_addr);
  	if (range.start >= vma->vm_end)
  		return -EINVAL;
  	range.end = min(vma->vm_end, end_addr);
  	if (range.end <= vma->vm_start)
  		return -EINVAL;
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
  				range.start, range.end);
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm);
  	update_hiwater_rss(mm);
  	mmu_notifier_invalidate_range_start(&range);
  	tlb_start_vma(&tlb, vma);
  	walk_page_range(vma->vm_mm, range.start, range.end,
  			&madvise_free_walk_ops, &tlb);
  	tlb_end_vma(&tlb, vma);
  	mmu_notifier_invalidate_range_end(&range);
  	tlb_finish_mmu(&tlb);
  
  	return 0;
  }
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
   * zap_page_range call sets things up for shrink_active_list to actually free
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
   * shrink_active_list to pick up before reclaiming other pages.
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
  					unsigned long start, unsigned long end)
  {
  	zap_page_range(vma, start, end - start);
  	return 0;
  }
  
  static long madvise_dontneed_free(struct vm_area_struct *vma,
  				  struct vm_area_struct **prev,
  				  unsigned long start, unsigned long end,
  				  int behavior)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  	if (!userfaultfd_remove(vma, start, end)) {
  		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

  		mmap_read_lock(mm);
  		vma = find_vma(mm, start);
  		if (!vma)
  			return -ENOMEM;
  		if (start < vma->vm_start) {
  			/*
  			 * This "vma" under revalidation is the one
  			 * with the lowest vma->vm_start where start
  			 * is also < vma->vm_end. If start <
  			 * vma->vm_start it means a hole materialized
  			 * in the user address space within the
  			 * virtual range passed to MADV_DONTNEED
  			 * or MADV_FREE.
  			 */
  			return -ENOMEM;
  		}
  		if (!can_madv_lru_vma(vma))
  			return -EINVAL;
  		if (end > vma->vm_end) {
  			/*
  			 * Don't fail if end > vma->vm_end. If the old
  			 * vma was split while the mmap_lock was
  			 * released the effect of the concurrent
  			 * operation may not cause madvise() to
  			 * have an undefined result. There may be an
  			 * adjacent next vma that we'll walk
  			 * next. userfaultfd_remove() will generate an
  			 * UFFD_EVENT_REMOVE repetition on the
  			 * end-vma->vm_end range, but the manager can
  			 * handle a repetition fine.
  			 */
  			end = vma->vm_end;
  		}
  		VM_WARN_ON(start >= end);
  	}
  
  	if (behavior == MADV_DONTNEED)
  		return madvise_dontneed_single_vma(vma, start, end);
  	else if (behavior == MADV_FREE)
  		return madvise_free_single_vma(vma, start, end);
  	else
  		return -EINVAL;
  }
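
  /*
   * MADV_POPULATE_READ/WRITE: prefault page tables by calling
   * faultin_vma_page_range() in a loop, revalidating the vma whenever
   * mmap_lock had to be dropped, and translating the fault errors into
   * madvise() return values.
   */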
  static long madvise_populate(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end,
  			     int behavior)
  {
  	const bool write = behavior == MADV_POPULATE_WRITE;
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long tmp_end;
  	int locked = 1;
  	long pages;
  
  	*prev = vma;
  
  	while (start < end) {
  		/*
  		 * We might have temporarily dropped the lock. For example,
  		 * our VMA might have been split.
  		 */
  		if (!vma || start >= vma->vm_end) {
  			vma = find_vma(mm, start);
  			if (!vma || start < vma->vm_start)
  				return -ENOMEM;
  		}
  
  		tmp_end = min_t(unsigned long, end, vma->vm_end);
  		/* Populate (prefault) page tables readable/writable. */
  		pages = faultin_vma_page_range(vma, start, tmp_end, write,
  					       &locked);
  		if (!locked) {
  			mmap_read_lock(mm);
  			locked = 1;
  			*prev = NULL;
  			vma = NULL;
  		}
  		if (pages < 0) {
  			switch (pages) {
  			case -EINTR:
  				return -EINTR;
  			case -EINVAL: /* Incompatible mappings / permissions. */
  				return -EINVAL;
  			case -EHWPOISON:
  				return -EHWPOISON;
  			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
  				return -EFAULT;
  			default:
  				pr_warn_once("%s: unhandled return value: %ld\n",
  					     __func__, pages);
  				fallthrough;
  			case -ENOMEM:
  				return -ENOMEM;
  			}
  		}
  		start += pages * PAGE_SIZE;
  	}
  	return 0;
  }
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
   */
  static long madvise_remove(struct vm_area_struct *vma,
  				struct vm_area_struct **prev,
  				unsigned long start, unsigned long end)
  {
  	loff_t offset;
  	int error;
  	struct file *f;
  	struct mm_struct *mm = vma->vm_mm;

  	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

  	if (vma->vm_flags & VM_LOCKED)
  		return -EINVAL;
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
  		return -EINVAL;
  	}
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

  	/*
  	 * Filesystem's fallocate may need to take i_rwsem.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_lock.
  	 */
  	get_file(f);
  	if (userfaultfd_remove(vma, start, end)) {
  		/* mmap_lock was not released by userfaultfd_remove() */
  		mmap_read_unlock(mm);
  	}
  	error = vfs_fallocate(f,
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
  	fput(f);
  	mmap_read_lock(mm);
  	return error;
  }
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
  static int madvise_inject_error(int behavior,
  		unsigned long start, unsigned long end)
  {
  	unsigned long size;

  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;


  	for (; start < end; start += size) {
  		unsigned long pfn;
  		struct page *page;
  		int ret;
  		ret = get_user_pages_fast(start, 1, 0, &page);
  		if (ret != 1)
  			return ret;
  		pfn = page_to_pfn(page);

  		/*
  		 * When soft offlining hugepages, after migrating the page
  		 * we dissolve it, therefore in the second loop "page" will
  		 * no longer be a compound page.
  		 */
  		size = page_size(compound_head(page));

  		if (behavior == MADV_SOFT_OFFLINE) {
  			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
  				 pfn, start);
  			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
  		} else {
  			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
  				 pfn, start);
  			ret = memory_failure(pfn, MF_COUNT_INCREASED);
  			if (ret == -EOPNOTSUPP)
  				ret = 0;
  		}

  		if (ret)
  			return ret;
  	}

  	return 0;
  }
  #endif
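
  /*
   * Dispatch one behavior to its per-vma handler; the caller has already
   * clamped [start, end) to the vma.
   */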
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  		return madvise_remove(vma, prev, start, end);
  	case MADV_WILLNEED:
  		return madvise_willneed(vma, prev, start, end);
  	case MADV_COLD:
  		return madvise_cold(vma, prev, start, end);
  	case MADV_PAGEOUT:
  		return madvise_pageout(vma, prev, start, end);
  	case MADV_FREE:
  	case MADV_DONTNEED:
  		return madvise_dontneed_free(vma, prev, start, end, behavior);
  	case MADV_POPULATE_READ:
  	case MADV_POPULATE_WRITE:
  		return madvise_populate(vma, prev, start, end, behavior);
  	default:
  		return madvise_behavior(vma, prev, start, end, behavior);
  	}
  }
  static bool
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
  	case MADV_FREE:
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  	case MADV_POPULATE_READ:
  	case MADV_POPULATE_WRITE:
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
  	case MADV_NOHUGEPAGE:
  #endif
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
  	case MADV_WIPEONFORK:
  	case MADV_KEEPONFORK:
  #ifdef CONFIG_MEMORY_FAILURE
  	case MADV_SOFT_OFFLINE:
  	case MADV_HWPOISON:
  #endif
  		return true;
  
  	default:
  		return false;
  	}
  }

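  /*
   * Only the non-destructive subset of hints may be applied to another
   * process via process_madvise(), so that the caller cannot alter the
   * target's memory contents.
   */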
  static bool
  process_madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  	case MADV_WILLNEED:
  		return true;
  	default:
  		return false;
  	}
  }
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the appli-
   *		cation will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
   *  MADV_FREE - the application marks pages in the given range as lazy free,
   *		where actual purges are postponed until memory pressure happens.
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
   *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
   *              range after a fork.
   *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
   *  MADV_HWPOISON - trigger memory error handler as if the given memory range
   *		were corrupted by unrecoverable hardware memory failure.
   *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
   *  MADV_HUGEPAGE - the application wants to back the given range by transparent
   *		huge pages in the future. Existing pages might be coalesced and
   *		new pages might be allocated as THP.
   *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
   *		transparent huge pages so the existing pages will not be
   *		coalesced into THP and new pages will not be allocated as THP.
   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
   *		from being included in its core dump.
   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
   *  MADV_COLD - the application is not expected to use this memory soon,
   *		deactivate pages in this range so that they can be reclaimed
   *		easily if memory pressure happens.
   *  MADV_PAGEOUT - the application is not expected to use this memory soon,
   *		page out the pages in this range immediately.
   *  MADV_POPULATE_READ - populate (prefault) page tables readable by
   *		triggering read faults if required
   *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
   *		triggering write faults if required
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages,
   *		or the specified address range includes file, Huge TLB,
   *		MAP_SHARED or VMPFNMAP range.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
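   *
   * Example of userspace usage (illustrative only, not kernel code):
   *
   *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
   *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   *	madvise(buf, len, MADV_SEQUENTIAL);
   *	... access buf sequentially ...
   *	madvise(buf, len, MADV_DONTNEED);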
   */
  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
  {
  	unsigned long end, tmp;
  	struct vm_area_struct *vma, *prev;
  	int unmapped_error = 0;
  	int error = -EINVAL;
  	int write;
  	size_t len;
  	struct blk_plug plug;

  	start = untagged_addr(start);
  	if (!madvise_behavior_valid(behavior))
  		return error;
  	if (!PAGE_ALIGNED(start))
  		return error;
  	len = PAGE_ALIGN(len_in);
  
  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return error;
  
  	end = start + len;
  	if (end < start)
  		return error;
  
  	error = 0;
  	if (end == start)
  		return error;
  #ifdef CONFIG_MEMORY_FAILURE
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_inject_error(behavior, start, start + len_in);
  #endif
  	write = madvise_need_mmap_write(behavior);
  	if (write) {
  		if (mmap_write_lock_killable(mm))
  			return -EINTR;
  	} else {
  		mmap_read_lock(mm);
  	}
  
  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;
  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			goto out;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				goto out;
  		}
  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		tmp = vma->vm_end;
  		if (end < tmp)
  			tmp = end;

  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, tmp, behavior);
  		if (error)
  			goto out;
  		start = tmp;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;
  		error = unmapped_error;
  		if (start >= end)
  			goto out;
  		if (prev)
  			vma = prev->vm_next;
  		else	/* madvise_remove dropped mmap_lock */
  			vma = find_vma(mm, start);
  	}
  out:
  	blk_finish_plug(&plug);
  	if (write)
  		mmap_write_unlock(mm);
  	else
  		mmap_read_unlock(mm);

  	return error;
  }
  
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	return do_madvise(current->mm, start, len_in, behavior);
  }
  
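  /*
   * process_madvise(2): apply a (non-destructive) madvise hint to another
   * process's address space, identified by a pidfd. The caller must be
   * able to ptrace the target (PTRACE_MODE_READ) and needs CAP_SYS_NICE;
   * the return value is the number of bytes advised, or an error if
   * nothing could be advised at all.
   */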
  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
  		size_t, vlen, int, behavior, unsigned int, flags)
  {
  	ssize_t ret;
  	struct iovec iovstack[UIO_FASTIOV], iovec;
  	struct iovec *iov = iovstack;
  	struct iov_iter iter;
  	struct pid *pid;
  	struct task_struct *task;
  	struct mm_struct *mm;
  	size_t total_len;
  	unsigned int f_flags;
  
  	if (flags != 0) {
  		ret = -EINVAL;
  		goto out;
  	}
  
  	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
  	if (ret < 0)
  		goto out;
  
  	pid = pidfd_get_pid(pidfd, &f_flags);
  	if (IS_ERR(pid)) {
  		ret = PTR_ERR(pid);
  		goto free_iov;
  	}
  
  	task = get_pid_task(pid, PIDTYPE_PID);
  	if (!task) {
  		ret = -ESRCH;
  		goto put_pid;
  	}
  	if (!process_madvise_behavior_valid(behavior)) {
  		ret = -EINVAL;
  		goto release_task;
  	}
  	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
  	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
  	if (IS_ERR_OR_NULL(mm)) {
  		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
  		goto release_task;
  	}
  	/*
  	 * Require CAP_SYS_NICE for influencing process performance. Note that
  	 * only non-destructive hints are currently supported.
  	 */
  	if (!capable(CAP_SYS_NICE)) {
  		ret = -EPERM;
  		goto release_mm;
  	}
  	total_len = iov_iter_count(&iter);
  
  	while (iov_iter_count(&iter)) {
  		iovec = iov_iter_iovec(&iter);
  		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
  					iovec.iov_len, behavior);
  		if (ret < 0)
  			break;
  		iov_iter_advance(&iter, iovec.iov_len);
  	}
  	ret = (total_len - iov_iter_count(&iter)) ? : ret;

  release_mm:
  	mmput(mm);
  release_task:
  	put_task_struct(task);
  put_pid:
  	put_pid(pid);
  free_iov:
  	kfree(iov);
  out:
  	return ret;
  }