  // SPDX-License-Identifier: GPL-2.0
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
  #include <linux/mmu_notifier.h>
  
  #include <asm/tlb.h>

  #include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
  }
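
/*
 * Illustrative note (not from the original source): MADV_SEQUENTIAL
 * rewrites vma->vm_flags and therefore hits the default case above,
 * taking mmap_lock for writing, while MADV_DONTNEED only walks and zaps
 * page tables and gets away with mmap_read_lock(). See do_madvise().
 */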
  
  /*
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;
  
  	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}
success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
  	return error;
  }
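
/*
 * Illustrative example (not part of the original source): advice that
 * changes vm_flags on a sub-range forces a split. Assuming a 3-page
 * anonymous mapping at addr, after
 *
 *	madvise(addr + PAGE_SIZE, PAGE_SIZE, MADV_DONTFORK);
 *
 * the range is backed by three vmas with VM_DONTCOPY set only on the
 * middle one; vma_merge() above rejoins them if the advice is later
 * reverted with MADV_DOFORK.
 */
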
  #ifdef CONFIG_SWAP
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	pte_t *orig_pte;
  	struct vm_area_struct *vma = walk->private;
  	unsigned long index;
  
  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;
  
  	for (index = start; index != end; index += PAGE_SIZE) {
  		pte_t pte;
  		swp_entry_t entry;
  		struct page *page;
  		spinlock_t *ptl;
  
  		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
  		pte_unmap_unlock(orig_pte, ptl);
		if (pte_present(pte) || pte_none(pte))
  			continue;
  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;
  
  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
  	}
  
  	return 0;
  }
  static const struct mm_walk_ops swapin_walk_ops = {
  	.pmd_entry		= swapin_walk_pmd_entry,
  };
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

  	rcu_read_lock();
  	xas_for_each(&xas, page, end_index) {
  		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
  		xas_pause(&xas);
  		rcu_read_unlock();
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);
  
  		rcu_read_lock();
	}
  	rcu_read_unlock();
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
  			     unsigned long start, unsigned long end)
  {
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
  	if (!file) {
  		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
  		lru_add_drain(); /* Push any new pages onto the LRU now */
  		return 0;
  	}

  	if (shmem_mapping(file->f_mapping)) {
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
  	if (!file)
  		return -EBADF;
  #endif

  	if (IS_DAX(file_inode(file))) {
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
  	/*
  	 * Filesystem's fadvise may need to take various locks.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
  	get_file(file);
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
  	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
  	fput(file);
	mmap_read_lock(mm);
  	return 0;
  }
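
/*
 * Userspace sketch (illustrative, not part of the original source):
 * prefault a file-backed mapping before a latency-sensitive scan. The
 * readahead is asynchronous; madvise() may return before any I/O has
 * completed.
 *
 *	void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(map, len, MADV_WILLNEED);	// schedule readahead
 */
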
  static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
  {
  	struct madvise_walk_private *private = walk->private;
  	struct mmu_gather *tlb = private->tlb;
  	bool pageout = private->pageout;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	pte_t *orig_pte, *pte, ptent;
  	spinlock_t *ptl;
  	struct page *page = NULL;
  	LIST_HEAD(page_list);
  
  	if (fatal_signal_pending(current))
  		return -EINTR;
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	if (pmd_trans_huge(*pmd)) {
  		pmd_t orig_pmd;
  		unsigned long next = pmd_addr_end(addr, end);
  
  		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  		ptl = pmd_trans_huge_lock(pmd, vma);
  		if (!ptl)
  			return 0;
  
  		orig_pmd = *pmd;
  		if (is_huge_zero_pmd(orig_pmd))
  			goto huge_unlock;
  
  		if (unlikely(!pmd_present(orig_pmd))) {
  			VM_BUG_ON(thp_migration_supported() &&
  					!is_pmd_migration_entry(orig_pmd));
  			goto huge_unlock;
  		}
  
  		page = pmd_page(orig_pmd);
  
  		/* Do not interfere with other mappings of this page */
  		if (page_mapcount(page) != 1)
  			goto huge_unlock;
  		if (next - addr != HPAGE_PMD_SIZE) {
  			int err;
  			get_page(page);
  			spin_unlock(ptl);
  			lock_page(page);
  			err = split_huge_page(page);
  			unlock_page(page);
  			put_page(page);
  			if (!err)
  				goto regular_page;
  			return 0;
  		}
  
  		if (pmd_young(orig_pmd)) {
  			pmdp_invalidate(vma, addr, pmd);
  			orig_pmd = pmd_mkold(orig_pmd);
  
  			set_pmd_at(mm, addr, pmd, orig_pmd);
  			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
  		}
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  huge_unlock:
  		spin_unlock(ptl);
  		if (pageout)
  			reclaim_pages(&page_list);
  		return 0;
  	}
  regular_page:
  	if (pmd_trans_unstable(pmd))
  		return 0;
  #endif
  	tlb_change_page_size(tlb, PAGE_SIZE);
  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr < end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
  
  		if (pte_none(ptent))
  			continue;
  
  		if (!pte_present(ptent))
  			continue;
  
  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
  		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it. Split it if we are the only owner.
  		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				break;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				break;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				break;
  			}
  			unlock_page(page);
  			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  		/* Do not interfere with other mappings of this page */
  		if (page_mapcount(page) != 1)
  			continue;
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (pte_young(ptent)) {
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  			ptent = pte_mkold(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
  
		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM cannot reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the page's recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
  			if (!isolate_lru_page(page)) {
  				if (PageUnevictable(page))
  					putback_lru_page(page);
  				else
  					list_add(&page->lru, &page_list);
  			}
  		} else
  			deactivate_page(page);
  	}
  
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	if (pageout)
  		reclaim_pages(&page_list);
  	cond_resched();
  
  	return 0;
  }
  
  static const struct mm_walk_ops cold_walk_ops = {
  	.pmd_entry = madvise_cold_or_pageout_pte_range,
  };
  
  static void madvise_cold_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = false,
  		.tlb = tlb,
  	};
	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static long madvise_cold(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
  	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb, start_addr, end_addr);
  
  	return 0;
  }
  static void madvise_pageout_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct madvise_walk_private walk_private = {
  		.pageout = true,
  		.tlb = tlb,
  	};
	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
  	tlb_end_vma(tlb, vma);
  }
  
  static inline bool can_do_pageout(struct vm_area_struct *vma)
  {
  	if (vma_is_anonymous(vma))
  		return true;
  	if (!vma->vm_file)
  		return false;
  	/*
  	 * paging out pagecache only for non-anonymous mappings that correspond
  	 * to the files the calling process could (if tried) open for writing;
  	 * otherwise we'd be including shared non-exclusive mappings, which
  	 * opens a side channel.
  	 */
  	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
  		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
  }
  
  static long madvise_pageout(struct vm_area_struct *vma,
  			struct vm_area_struct **prev,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
  
  	*prev = vma;
  	if (!can_madv_lru_vma(vma))
  		return -EINVAL;
  
  	if (!can_do_pageout(vma))
  		return 0;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
  	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
  	tlb_finish_mmu(&tlb, start_addr, end_addr);
  
  	return 0;
  }
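
/*
 * Userspace sketch (illustrative, not part of the original source): an
 * idle cache can be demoted without unmapping it. MADV_COLD merely
 * deactivates the pages so reclaim prefers them later; MADV_PAGEOUT
 * reclaims them right away.
 *
 *	madvise(cache, cache_len, MADV_COLD);	 // cheap aging hint
 *	madvise(cache, cache_len, MADV_PAGEOUT); // immediate reclaim
 */
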
  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
  				unsigned long end, struct mm_walk *walk)
  
  {
  	struct mmu_gather *tlb = walk->private;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	spinlock_t *ptl;
  	pte_t *orig_pte, *pte, ptent;
  	struct page *page;
	int nr_swap = 0;
  	unsigned long next;
  
  	next = pmd_addr_end(addr, end);
  	if (pmd_trans_huge(*pmd))
  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
  			goto next;

  	if (pmd_trans_unstable(pmd))
  		return 0;
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
  	arch_enter_lazy_mmu_mode();
  	for (; addr != end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
		if (pte_none(ptent))
  			continue;
		/*
		 * If the pte holds a swap entry, just clear the page
		 * table entry: a later swap-in would be more expensive
		 * than (page allocation + zeroing).
		 */
  		if (!pte_present(ptent)) {
  			swp_entry_t entry;
  
  			entry = pte_to_swp_entry(ptent);
  			if (non_swap_entry(entry))
  				continue;
  			nr_swap--;
  			free_swap_and_cache(entry);
  			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
  			continue;
  		}

  		page = vm_normal_page(vma, addr, ptent);
  		if (!page)
  			continue;
  
  		/*
  		 * If pmd isn't transhuge but the page is THP and
  		 * is owned by only this process, split it and
  		 * deactivate all pages.
  		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				goto out;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				goto out;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				goto out;
  			}
			unlock_page(page);
			put_page(page);
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (PageSwapCache(page) || PageDirty(page)) {
  			if (!trylock_page(page))
  				continue;
			/*
			 * If the page is shared with others, we cannot
			 * clear its PG_dirty bit.
			 */
  			if (page_mapcount(page) != 1) {
  				unlock_page(page);
  				continue;
  			}
  
  			if (PageSwapCache(page) && !try_to_free_swap(page)) {
  				unlock_page(page);
  				continue;
  			}
  
  			ClearPageDirty(page);
  			unlock_page(page);
  		}
  
  		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the
			 * TLB with set_pte_at and tlb_remove_tlb_entry,
			 * so for portability, clear the pte first and
			 * then re-set it as old and clean.
			 */
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  
  			ptent = pte_mkold(ptent);
  			ptent = pte_mkclean(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
		mark_page_lazyfree(page);
  	}
  out:
  	if (nr_swap) {
  		if (current->mm == mm)
  			sync_mm_rss(mm);
  
  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
  	}
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	cond_resched();
next:
  	return 0;
  }
  static const struct mm_walk_ops madvise_free_walk_ops = {
  	.pmd_entry		= madvise_free_pte_range,
  };
  
  static int madvise_free_single_vma(struct vm_area_struct *vma,
  			unsigned long start_addr, unsigned long end_addr)
  {
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

  	/* MADV_FREE works for only anon vma at the moment */
  	if (!vma_is_anonymous(vma))
  		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
  	range.end = min(vma->vm_end, end_addr);
  	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, range.start, range.end);
  	update_hiwater_rss(mm);
  	mmu_notifier_invalidate_range_start(&range);
  	tlb_start_vma(&tlb, vma);
  	walk_page_range(vma->vm_mm, range.start, range.end,
  			&madvise_free_walk_ops, &tlb);
  	tlb_end_vma(&tlb, vma);
  	mmu_notifier_invalidate_range_end(&range);
  	tlb_finish_mmu(&tlb, range.start, range.end);
  
  	return 0;
  }
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
  					unsigned long start, unsigned long end)
  {
  	zap_page_range(vma, start, end - start);
  	return 0;
  }
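
/*
 * Userspace sketch (illustrative, not part of the original source):
 * MADV_DONTNEED versus MADV_FREE on an anonymous buffer. DONTNEED drops
 * the pages immediately, so the next read observes zero-fill; FREE only
 * marks them lazily freeable, and a write before reclaim cancels the
 * advice.
 *
 *	madvise(buf, len, MADV_DONTNEED); // reads now return zeroes
 *	madvise(buf, len, MADV_FREE);	  // freed only under memory pressure
 */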
  
  static long madvise_dontneed_free(struct vm_area_struct *vma,
  				  struct vm_area_struct **prev,
  				  unsigned long start, unsigned long end,
  				  int behavior)
{
  	struct mm_struct *mm = vma->vm_mm;
	*prev = vma;
  	if (!can_madv_lru_vma(vma))
		return -EINVAL;
  	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

  		mmap_read_lock(mm);
  		vma = find_vma(mm, start);
  		if (!vma)
  			return -ENOMEM;
  		if (start < vma->vm_start) {
  			/*
  			 * This "vma" under revalidation is the one
  			 * with the lowest vma->vm_start where start
  			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
  			 * in the user address space within the
  			 * virtual range passed to MADV_DONTNEED
  			 * or MADV_FREE.
  			 */
  			return -ENOMEM;
  		}
		if (!can_madv_lru_vma(vma))
  			return -EINVAL;
  		if (end > vma->vm_end) {
  			/*
  			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
  			 * released the effect of the concurrent
  			 * operation may not cause madvise() to
  			 * have an undefined result. There may be an
  			 * adjacent next vma that we'll walk
  			 * next. userfaultfd_remove() will generate an
  			 * UFFD_EVENT_REMOVE repetition on the
  			 * end-vma->vm_end range, but the manager can
  			 * handle a repetition fine.
  			 */
  			end = vma->vm_end;
  		}
  		VM_WARN_ON(start >= end);
  	}
  
  	if (behavior == MADV_DONTNEED)
  		return madvise_dontneed_single_vma(vma, start, end);
  	else if (behavior == MADV_FREE)
  		return madvise_free_single_vma(vma, start, end);
  	else
  		return -EINVAL;
  }

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
   */
  static long madvise_remove(struct vm_area_struct *vma,
  				struct vm_area_struct **prev,
  				unsigned long start, unsigned long end)
  {
	loff_t offset;
	int error;
	struct file *f;
  	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
  		return -EINVAL;
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
  	}
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
  	 */
  	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
  		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
  	}
  	error = vfs_fallocate(f,
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
	fput(f);
  	mmap_read_lock(mm);
	return error;
  }
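
/*
 * Userspace sketch (illustrative, not part of the original source): on
 * a shared, writable tmpfs mapping, MADV_REMOVE punches a hole in the
 * backing object, much like fallocate(FALLOC_FL_PUNCH_HOLE); later
 * faults in the range read back zeroes.
 *
 *	madvise(shm + off, len, MADV_REMOVE);
 */
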
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
  static int madvise_inject_error(int behavior,
  		unsigned long start, unsigned long end)
{
	struct zone *zone;
	unsigned long size;

  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
  		struct page *page;
		int ret;
  		ret = get_user_pages_fast(start, 1, 0, &page);
  		if (ret != 1)
  			return ret;
		pfn = page_to_pfn(page);

  		/*
  		 * When soft offlining hugepages, after migrating the page
  		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
  		 */
		size = page_size(compound_head(page));

  		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
  				 pfn, start);
  			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
  		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
  				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
  		}

  		if (ret)
  			return ret;
  	}
  
  	/* Ensure that all poisoned pages are removed from per-cpu lists */
  	for_each_populated_zone(zone)
  		drain_all_pages(zone);
	return 0;
  }
  #endif
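
/*
 * Debug sketch (illustrative, not part of the original source): with
 * CONFIG_MEMORY_FAILURE enabled and CAP_SYS_ADMIN, error injection from
 * userspace looks like:
 *
 *	madvise(page, pagesize, MADV_HWPOISON);	    // full poison handling
 *	madvise(page, pagesize, MADV_SOFT_OFFLINE); // migrate, then offline
 */
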
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
{
  	switch (behavior) {
	case MADV_REMOVE:
  		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
  		return madvise_willneed(vma, prev, start, end);
  	case MADV_COLD:
  		return madvise_cold(vma, prev, start, end);
  	case MADV_PAGEOUT:
  		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
  	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
  	default:
		return madvise_behavior(vma, prev, start, end, behavior);
  	}
}

  static bool
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
	case MADV_FREE:
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
  #endif
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
  	case MADV_WIPEONFORK:
  	case MADV_KEEPONFORK:
  #ifdef CONFIG_MEMORY_FAILURE
  	case MADV_SOFT_OFFLINE:
  	case MADV_HWPOISON:
  #endif
  		return true;
  
  	default:
  		return false;
  	}
  }

  static bool
  process_madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_COLD:
  	case MADV_PAGEOUT:
  		return true;
  	default:
  		return false;
  	}
  }
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
   *  MADV_FREE - the application marks pages in the given range as lazy free,
   *		where actual purges are postponed until memory pressure happens.
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
   *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
   *              range after a fork.
   *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
   *  MADV_HWPOISON - trigger memory error handler as if the given memory range
   *		were corrupted by unrecoverable hardware memory failure.
   *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
   *  MADV_HUGEPAGE - the application wants to back the given range by transparent
   *		huge pages in the future. Existing pages might be coalesced and
   *		new pages might be allocated as THP.
   *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
   *		transparent huge pages so the existing pages will not be
   *		coalesced into THP and new pages will not be allocated as THP.
   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
   *		from being included in its core dump.
   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
   *  MADV_COLD - the application is not expected to use this memory soon,
   *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
   *  MADV_PAGEOUT - the application is not expected to use this memory soon,
   *		page out the pages in this range immediately.
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages,
   *		or the specified address range includes file, Huge TLB,
   *		MAP_SHARED or VMPFNMAP range.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
  	unsigned long end, tmp;
  	struct vm_area_struct *vma, *prev;
  	int unmapped_error = 0;
  	int error = -EINVAL;
	int write;
  	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);
  	if (!madvise_behavior_valid(behavior))
  		return error;
	if (!PAGE_ALIGNED(start))
  		return error;
	len = PAGE_ALIGN(len_in);
  
  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
		return error;
  
  	end = start + len;
  	if (end < start)
		return error;
  
  	error = 0;
  	if (end == start)
		return error;

  #ifdef CONFIG_MEMORY_FAILURE
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_inject_error(behavior, start, start + len_in);
  #endif
	write = madvise_need_mmap_write(behavior);
  	if (write) {
		if (mmap_write_lock_killable(mm))
  			return -EINTR;
  	} else {
		mmap_read_lock(mm);
  	}
  
  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;
  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
			goto out;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
			if (start >= end)
  				goto out;
  		}
  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		tmp = vma->vm_end;
  		if (end < tmp)
  			tmp = end;

  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
  			goto out;
		start = tmp;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;
  		error = unmapped_error;
  		if (start >= end)
			goto out;
  		if (prev)
  			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
  			vma = find_vma(mm, start);
	}
  out:
  	blk_finish_plug(&plug);
  	if (write)
  		mmap_write_unlock(mm);
  	else
  		mmap_read_unlock(mm);

  	return error;
  }
  
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
	return do_madvise(current->mm, start, len_in, behavior);
  }
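
/*
 * Userspace sketch (illustrative, not part of the original source):
 * start must be page-aligned and len is rounded up to a page multiple,
 * per the checks in do_madvise() above.
 *
 *	if (madvise(addr, len, MADV_SEQUENTIAL) == -1)
 *		perror("madvise");
 */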
  
  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
  		size_t, vlen, int, behavior, unsigned int, flags)
  {
  	ssize_t ret;
  	struct iovec iovstack[UIO_FASTIOV], iovec;
  	struct iovec *iov = iovstack;
  	struct iov_iter iter;
  	struct pid *pid;
  	struct task_struct *task;
  	struct mm_struct *mm;
  	size_t total_len;
  	unsigned int f_flags;
  
  	if (flags != 0) {
  		ret = -EINVAL;
  		goto out;
  	}
  
  	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
  	if (ret < 0)
  		goto out;
  
  	pid = pidfd_get_pid(pidfd, &f_flags);
  	if (IS_ERR(pid)) {
  		ret = PTR_ERR(pid);
  		goto free_iov;
  	}
  
  	task = get_pid_task(pid, PIDTYPE_PID);
  	if (!task) {
  		ret = -ESRCH;
  		goto put_pid;
  	}
  	if (!process_madvise_behavior_valid(behavior)) {
  		ret = -EINVAL;
  		goto release_task;
  	}
  
  	mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
  	if (IS_ERR_OR_NULL(mm)) {
  		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
  		goto release_task;
  	}
  
  	total_len = iov_iter_count(&iter);
  
  	while (iov_iter_count(&iter)) {
  		iovec = iov_iter_iovec(&iter);
  		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
  					iovec.iov_len, behavior);
  		if (ret < 0)
  			break;
  		iov_iter_advance(&iter, iovec.iov_len);
  	}
  
  	if (ret == 0)
  		ret = total_len - iov_iter_count(&iter);
  
  	mmput(mm);
  release_task:
  	put_task_struct(task);
  put_pid:
  	put_pid(pid);
  free_iov:
  	kfree(iov);
  out:
  	return ret;
  }
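
/*
 * Userspace sketch (illustrative, not part of the original source):
 * driving process_madvise(2) through the raw syscall; at this point
 * there was no libc wrapper yet. On success the return value is the
 * number of bytes advised.
 *
 *	struct iovec iov = { .iov_base = addr, .iov_len = len };
 *	int pidfd = syscall(__NR_pidfd_open, pid, 0);
 *	ssize_t n = syscall(__NR_process_madvise, pidfd, &iov, 1,
 *			    MADV_PAGEOUT, 0);
 */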