Blame view

mm/madvise.c 22.6 KB
b24413180   Greg Kroah-Hartman   License cleanup: ...
1
  // SPDX-License-Identifier: GPL-2.0
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
9
10
11
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
05b743847   Prasanna Meda   [PATCH] madvise: ...
12
  #include <linux/mempolicy.h>
afcf938ee   Andi Kleen   HWPOISON: Add a m...
13
  #include <linux/page-isolation.h>
05ce77249   Pavel Emelyanov   userfaultfd: non-...
14
  #include <linux/userfaultfd_k.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
15
  #include <linux/hugetlb.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
16
  #include <linux/falloc.h>
e8edc6e03   Alexey Dobriyan   Detach sched.h fr...
17
  #include <linux/sched.h>
f8af4da3b   Hugh Dickins   ksm: the mm inter...
18
  #include <linux/ksm.h>
3f31d0757   Hugh Dickins   mm/fs: route MADV...
19
  #include <linux/fs.h>
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
20
  #include <linux/file.h>
1998cc048   Shaohua Li   mm: make madvise(...
21
  #include <linux/blkdev.h>
66114cad6   Tejun Heo   writeback: separa...
22
  #include <linux/backing-dev.h>
1998cc048   Shaohua Li   mm: make madvise(...
23
24
  #include <linux/swap.h>
  #include <linux/swapops.h>
3a4f8a0b3   Hugh Dickins   mm: remove shmem_...
25
  #include <linux/shmem_fs.h>
854e9ed09   Minchan Kim   mm: support madvi...
26
27
28
  #include <linux/mmu_notifier.h>
  
  #include <asm/tlb.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29

235190738   Kirill A. Shutemov   oom-reaper: use m...
30
  #include "internal.h"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
31
  /*
0a27a14a6   Nick Piggin   mm: madvise avoid...
32
33
34
35
36
37
38
39
40
41
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_sem for writing. Others, which simply traverse vmas, need
   * to only take it for reading.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	switch (behavior) {
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
854e9ed09   Minchan Kim   mm: support madvi...
42
  	case MADV_FREE:
0a27a14a6   Nick Piggin   mm: madvise avoid...
43
44
45
46
47
48
49
50
  		return 0;
  	default:
  		/* be safe, default to 1. list exceptions explicitly */
  		return 1;
  	}
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
51
52
53
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
54
  static long madvise_behavior(struct vm_area_struct *vma,
05b743847   Prasanna Meda   [PATCH] madvise: ...
55
56
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
57
  {
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
58
  	struct mm_struct *mm = vma->vm_mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
59
  	int error = 0;
05b743847   Prasanna Meda   [PATCH] madvise: ...
60
  	pgoff_t pgoff;
3866ea90d   Hugh Dickins   ksm: first tidy u...
61
  	unsigned long new_flags = vma->vm_flags;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
62
63
  
  	switch (behavior) {
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
64
65
66
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
67
  	case MADV_SEQUENTIAL:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
68
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
69
70
  		break;
  	case MADV_RANDOM:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
71
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
72
  		break;
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
73
74
75
76
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
3866ea90d   Hugh Dickins   ksm: first tidy u...
77
78
79
80
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			goto out;
  		}
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
81
  		new_flags &= ~VM_DONTCOPY;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
82
  		break;
d2cd9ede6   Rik van Riel   mm,fork: introduc...
83
84
85
86
87
88
89
90
91
92
93
  	case MADV_WIPEONFORK:
  		/* MADV_WIPEONFORK is only supported on anonymous memory. */
  		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags |= VM_WIPEONFORK;
  		break;
  	case MADV_KEEPONFORK:
  		new_flags &= ~VM_WIPEONFORK;
  		break;
accb61fe7   Jason Baron   coredump: add VM_...
94
  	case MADV_DONTDUMP:
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
95
  		new_flags |= VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
96
97
  		break;
  	case MADV_DODUMP:
dcc89aaf5   Daniel Black   mm: madvise(MADV_...
98
  		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
0103bd16f   Konstantin Khlebnikov   mm: prepare VM_DO...
99
100
101
102
  			error = -EINVAL;
  			goto out;
  		}
  		new_flags &= ~VM_DONTDUMP;
accb61fe7   Jason Baron   coredump: add VM_...
103
  		break;
f8af4da3b   Hugh Dickins   ksm: the mm inter...
104
105
106
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  		error = ksm_madvise(vma, start, end, behavior, &new_flags);
def5efe03   David Rientjes   mm, madvise: fail...
107
108
109
110
111
112
113
  		if (error) {
  			/*
  			 * madvise() returns EAGAIN if kernel resources, such as
  			 * slab, are temporarily unavailable.
  			 */
  			if (error == -ENOMEM)
  				error = -EAGAIN;
f8af4da3b   Hugh Dickins   ksm: the mm inter...
114
  			goto out;
def5efe03   David Rientjes   mm, madvise: fail...
115
  		}
f8af4da3b   Hugh Dickins   ksm: the mm inter...
116
  		break;
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
117
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
118
  	case MADV_NOHUGEPAGE:
60ab3244e   Andrea Arcangeli   thp: khugepaged: ...
119
  		error = hugepage_madvise(vma, &new_flags, behavior);
def5efe03   David Rientjes   mm, madvise: fail...
120
121
122
123
124
125
126
  		if (error) {
  			/*
  			 * madvise() returns EAGAIN if kernel resources, such as
  			 * slab, are temporarily unavailable.
  			 */
  			if (error == -ENOMEM)
  				error = -EAGAIN;
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
127
  			goto out;
def5efe03   David Rientjes   mm, madvise: fail...
128
  		}
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
129
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
130
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
131
132
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
133
  		goto out;
05b743847   Prasanna Meda   [PATCH] madvise: ...
134
135
136
137
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
19a809afe   Andrea Arcangeli   userfaultfd: teac...
138
139
  			  vma->vm_file, pgoff, vma_policy(vma),
  			  vma->vm_userfaultfd_ctx);
05b743847   Prasanna Meda   [PATCH] madvise: ...
140
141
142
143
144
145
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
146
147
  
  	if (start != vma->vm_start) {
def5efe03   David Rientjes   mm, madvise: fail...
148
149
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
150
  			goto out;
def5efe03   David Rientjes   mm, madvise: fail...
151
152
153
154
155
156
157
158
159
160
161
  		}
  		error = __split_vma(mm, vma, start, 1);
  		if (error) {
  			/*
  			 * madvise() returns EAGAIN if kernel resources, such as
  			 * slab, are temporarily unavailable.
  			 */
  			if (error == -ENOMEM)
  				error = -EAGAIN;
  			goto out;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
162
163
164
  	}
  
  	if (end != vma->vm_end) {
def5efe03   David Rientjes   mm, madvise: fail...
165
166
  		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
  			error = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
167
  			goto out;
def5efe03   David Rientjes   mm, madvise: fail...
168
169
170
171
172
173
174
175
176
177
178
  		}
  		error = __split_vma(mm, vma, end, 0);
  		if (error) {
  			/*
  			 * madvise() returns EAGAIN if kernel resources, such as
  			 * slab, are temporarily unavailable.
  			 */
  			if (error == -ENOMEM)
  				error = -EAGAIN;
  			goto out;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
179
  	}
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
180
  success:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
181
182
183
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
184
  	vma->vm_flags = new_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
185
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
186
187
  	return error;
  }
1998cc048   Shaohua Li   mm: make madvise(...
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
  #ifdef CONFIG_SWAP
  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
  	unsigned long end, struct mm_walk *walk)
  {
  	pte_t *orig_pte;
  	struct vm_area_struct *vma = walk->private;
  	unsigned long index;
  
  	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  		return 0;
  
  	for (index = start; index != end; index += PAGE_SIZE) {
  		pte_t pte;
  		swp_entry_t entry;
  		struct page *page;
  		spinlock_t *ptl;
  
  		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
  		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
  		pte_unmap_unlock(orig_pte, ptl);
0661a3361   Kirill A. Shutemov   mm: remove rest u...
208
  		if (pte_present(pte) || pte_none(pte))
1998cc048   Shaohua Li   mm: make madvise(...
209
210
211
212
213
214
  			continue;
  		entry = pte_to_swp_entry(pte);
  		if (unlikely(non_swap_entry(entry)))
  			continue;
  
  		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
23955622f   Shaohua Li   swap: add block i...
215
  							vma, index, false);
1998cc048   Shaohua Li   mm: make madvise(...
216
  		if (page)
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
217
  			put_page(page);
1998cc048   Shaohua Li   mm: make madvise(...
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
  	}
  
  	return 0;
  }
  
  static void force_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end)
  {
  	struct mm_walk walk = {
  		.mm = vma->vm_mm,
  		.pmd_entry = swapin_walk_pmd_entry,
  		.private = vma,
  	};
  
  	walk_page_range(start, end, &walk);
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
  		unsigned long start, unsigned long end,
  		struct address_space *mapping)
  {
  	pgoff_t index;
  	struct page *page;
  	swp_entry_t swap;
  
  	for (; start < end; start += PAGE_SIZE) {
  		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
55231e5c8   Johannes Weiner   mm: madvise: fix ...
247
  		page = find_get_entry(mapping, index);
1998cc048   Shaohua Li   mm: make madvise(...
248
249
  		if (!radix_tree_exceptional_entry(page)) {
  			if (page)
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
250
  				put_page(page);
1998cc048   Shaohua Li   mm: make madvise(...
251
252
253
254
  			continue;
  		}
  		swap = radix_to_swp_entry(page);
  		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
23955622f   Shaohua Li   swap: add block i...
255
  							NULL, 0, false);
1998cc048   Shaohua Li   mm: make madvise(...
256
  		if (page)
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
257
  			put_page(page);
1998cc048   Shaohua Li   mm: make madvise(...
258
259
260
261
262
  	}
  
  	lru_add_drain();	/* Push any new pages onto the LRU now */
  }
  #endif		/* CONFIG_SWAP */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
263
264
265
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
ec9bed9d3   Vladimir Cernov   mm/madvise.c: fix...
266
267
  static long madvise_willneed(struct vm_area_struct *vma,
  			     struct vm_area_struct **prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
268
269
270
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
8a0bb9eba   chenjie   mm/madvise.c: fix...
271
  	*prev = vma;
1998cc048   Shaohua Li   mm: make madvise(...
272
  #ifdef CONFIG_SWAP
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
273
  	if (!file) {
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
274
  		force_swapin_readahead(vma, start, end);
1998cc048   Shaohua Li   mm: make madvise(...
275
276
  		return 0;
  	}
1998cc048   Shaohua Li   mm: make madvise(...
277

97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
278
  	if (shmem_mapping(file->f_mapping)) {
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
279
280
281
282
283
  		force_shm_swapin_readahead(vma, start, end,
  					file->f_mapping);
  		return 0;
  	}
  #else
1bef40032   Suzuki   [PATCH] madvise: ...
284
285
  	if (!file)
  		return -EBADF;
97b713ba3   Christoph Hellwig   fs: kill BDI_CAP_...
286
  #endif
1bef40032   Suzuki   [PATCH] madvise: ...
287

e748dcd09   Matthew Wilcox   vfs: remove get_x...
288
  	if (IS_DAX(file_inode(file))) {
fe77ba6f4   Carsten Otte   [PATCH] xip: madv...
289
290
291
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
292
293
294
295
  	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
f7e839dd3   Wu Fengguang   readahead: move m...
296
  	force_page_cache_readahead(file->f_mapping, file, start, end - start);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
297
298
  	return 0;
  }
854e9ed09   Minchan Kim   mm: support madvi...
299
300
301
302
303
304
305
306
307
308
  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
  				unsigned long end, struct mm_walk *walk)
  
  {
  	struct mmu_gather *tlb = walk->private;
  	struct mm_struct *mm = tlb->mm;
  	struct vm_area_struct *vma = walk->vma;
  	spinlock_t *ptl;
  	pte_t *orig_pte, *pte, ptent;
  	struct page *page;
64b42bc1c   Minchan Kim   mm/madvise.c: fre...
309
  	int nr_swap = 0;
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
310
311
312
313
314
315
  	unsigned long next;
  
  	next = pmd_addr_end(addr, end);
  	if (pmd_trans_huge(*pmd))
  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
  			goto next;
854e9ed09   Minchan Kim   mm: support madvi...
316

854e9ed09   Minchan Kim   mm: support madvi...
317
318
  	if (pmd_trans_unstable(pmd))
  		return 0;
07e326610   Aneesh Kumar K.V   mm: add tlb_remov...
319
  	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
854e9ed09   Minchan Kim   mm: support madvi...
320
  	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
3ea277194   Mel Gorman   mm, mprotect: flu...
321
  	flush_tlb_batched_pending(mm);
854e9ed09   Minchan Kim   mm: support madvi...
322
323
324
  	arch_enter_lazy_mmu_mode();
  	for (; addr != end; pte++, addr += PAGE_SIZE) {
  		ptent = *pte;
64b42bc1c   Minchan Kim   mm/madvise.c: fre...
325
  		if (pte_none(ptent))
854e9ed09   Minchan Kim   mm: support madvi...
326
  			continue;
64b42bc1c   Minchan Kim   mm/madvise.c: fre...
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
  		/*
  		 * If the pte has swp_entry, just clear page table to
  		 * prevent swap-in which is more expensive rather than
  		 * (page allocation + zeroing).
  		 */
  		if (!pte_present(ptent)) {
  			swp_entry_t entry;
  
  			entry = pte_to_swp_entry(ptent);
  			if (non_swap_entry(entry))
  				continue;
  			nr_swap--;
  			free_swap_and_cache(entry);
  			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
  			continue;
  		}
854e9ed09   Minchan Kim   mm: support madvi...
343

df6ad6983   Jérôme Glisse   mm/device-public-...
344
  		page = _vm_normal_page(vma, addr, ptent, true);
854e9ed09   Minchan Kim   mm: support madvi...
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
  		if (!page)
  			continue;
  
  		/*
  		 * If pmd isn't transhuge but the page is THP and
  		 * is owned by only this process, split it and
  		 * deactivate all pages.
  		 */
  		if (PageTransCompound(page)) {
  			if (page_mapcount(page) != 1)
  				goto out;
  			get_page(page);
  			if (!trylock_page(page)) {
  				put_page(page);
  				goto out;
  			}
  			pte_unmap_unlock(orig_pte, ptl);
  			if (split_huge_page(page)) {
  				unlock_page(page);
  				put_page(page);
  				pte_offset_map_lock(mm, pmd, addr, &ptl);
  				goto out;
  			}
854e9ed09   Minchan Kim   mm: support madvi...
368
  			unlock_page(page);
263630e8d   Eric Biggers   mm/madvise.c: fix...
369
  			put_page(page);
854e9ed09   Minchan Kim   mm: support madvi...
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
  			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
  			pte--;
  			addr -= PAGE_SIZE;
  			continue;
  		}
  
  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
  
  		if (PageSwapCache(page) || PageDirty(page)) {
  			if (!trylock_page(page))
  				continue;
  			/*
  			 * If page is shared with others, we couldn't clear
  			 * PG_dirty of the page.
  			 */
  			if (page_mapcount(page) != 1) {
  				unlock_page(page);
  				continue;
  			}
  
  			if (PageSwapCache(page) && !try_to_free_swap(page)) {
  				unlock_page(page);
  				continue;
  			}
  
  			ClearPageDirty(page);
  			unlock_page(page);
  		}
  
  		if (pte_young(ptent) || pte_dirty(ptent)) {
  			/*
  			 * Some of architecture(ex, PPC) don't update TLB
  			 * with set_pte_at and tlb_remove_tlb_entry so for
  			 * the portability, remap the pte with old|clean
  			 * after pte clearing.
  			 */
  			ptent = ptep_get_and_clear_full(mm, addr, pte,
  							tlb->fullmm);
  
  			ptent = pte_mkold(ptent);
  			ptent = pte_mkclean(ptent);
  			set_pte_at(mm, addr, pte, ptent);
  			tlb_remove_tlb_entry(tlb, pte, addr);
  		}
802a3a92a   Shaohua Li   mm: reclaim MADV_...
414
  		mark_page_lazyfree(page);
854e9ed09   Minchan Kim   mm: support madvi...
415
416
  	}
  out:
64b42bc1c   Minchan Kim   mm/madvise.c: fre...
417
418
419
420
421
422
  	if (nr_swap) {
  		if (current->mm == mm)
  			sync_mm_rss(mm);
  
  		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
  	}
854e9ed09   Minchan Kim   mm: support madvi...
423
424
425
  	arch_leave_lazy_mmu_mode();
  	pte_unmap_unlock(orig_pte, ptl);
  	cond_resched();
b8d3c4c30   Minchan Kim   mm/huge_memory.c:...
426
  next:
854e9ed09   Minchan Kim   mm: support madvi...
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
  	return 0;
  }
  
  static void madvise_free_page_range(struct mmu_gather *tlb,
  			     struct vm_area_struct *vma,
  			     unsigned long addr, unsigned long end)
  {
  	struct mm_walk free_walk = {
  		.pmd_entry = madvise_free_pte_range,
  		.mm = vma->vm_mm,
  		.private = tlb,
  	};
  
  	tlb_start_vma(tlb, vma);
  	walk_page_range(addr, end, &free_walk);
  	tlb_end_vma(tlb, vma);
  }
  
  static int madvise_free_single_vma(struct vm_area_struct *vma,
  			unsigned long start_addr, unsigned long end_addr)
  {
  	unsigned long start, end;
  	struct mm_struct *mm = vma->vm_mm;
  	struct mmu_gather tlb;
854e9ed09   Minchan Kim   mm: support madvi...
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
  	/* MADV_FREE works for only anon vma at the moment */
  	if (!vma_is_anonymous(vma))
  		return -EINVAL;
  
  	start = max(vma->vm_start, start_addr);
  	if (start >= vma->vm_end)
  		return -EINVAL;
  	end = min(vma->vm_end, end_addr);
  	if (end <= vma->vm_start)
  		return -EINVAL;
  
  	lru_add_drain();
  	tlb_gather_mmu(&tlb, mm, start, end);
  	update_hiwater_rss(mm);
  
  	mmu_notifier_invalidate_range_start(mm, start, end);
  	madvise_free_page_range(&tlb, vma, start, end);
  	mmu_notifier_invalidate_range_end(mm, start, end);
  	tlb_finish_mmu(&tlb, start, end);
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
473
474
475
476
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
477
   * zap_page_range call sets things up for shrink_active_list to actually free
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
478
479
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
480
   * shrink_active_list to pick up before reclaiming other pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
481
482
483
484
485
486
487
488
489
490
491
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
230ca982b   Mike Rapoport   userfaultfd: non-...
492
493
494
495
496
497
498
499
500
501
502
  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
  					unsigned long start, unsigned long end)
  {
  	zap_page_range(vma, start, end - start);
  	return 0;
  }
  
  static long madvise_dontneed_free(struct vm_area_struct *vma,
  				  struct vm_area_struct **prev,
  				  unsigned long start, unsigned long end,
  				  int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
503
  {
05b743847   Prasanna Meda   [PATCH] madvise: ...
504
  	*prev = vma;
235190738   Kirill A. Shutemov   oom-reaper: use m...
505
  	if (!can_madv_dontneed_vma(vma))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
506
  		return -EINVAL;
70ccb92fd   Andrea Arcangeli   userfaultfd: non-...
507
508
509
510
511
512
513
514
515
516
517
518
519
520
  	if (!userfaultfd_remove(vma, start, end)) {
  		*prev = NULL; /* mmap_sem has been dropped, prev is stale */
  
  		down_read(&current->mm->mmap_sem);
  		vma = find_vma(current->mm, start);
  		if (!vma)
  			return -ENOMEM;
  		if (start < vma->vm_start) {
  			/*
  			 * This "vma" under revalidation is the one
  			 * with the lowest vma->vm_start where start
  			 * is also < vma->vm_end. If start <
  			 * vma->vm_start it means an hole materialized
  			 * in the user address space within the
230ca982b   Mike Rapoport   userfaultfd: non-...
521
522
  			 * virtual range passed to MADV_DONTNEED
  			 * or MADV_FREE.
70ccb92fd   Andrea Arcangeli   userfaultfd: non-...
523
524
525
526
527
528
529
530
531
532
  			 */
  			return -ENOMEM;
  		}
  		if (!can_madv_dontneed_vma(vma))
  			return -EINVAL;
  		if (end > vma->vm_end) {
  			/*
  			 * Don't fail if end > vma->vm_end. If the old
  			 * vma was splitted while the mmap_sem was
  			 * released the effect of the concurrent
230ca982b   Mike Rapoport   userfaultfd: non-...
533
  			 * operation may not cause madvise() to
70ccb92fd   Andrea Arcangeli   userfaultfd: non-...
534
535
536
537
538
539
540
541
542
543
544
  			 * have an undefined result. There may be an
  			 * adjacent next vma that we'll walk
  			 * next. userfaultfd_remove() will generate an
  			 * UFFD_EVENT_REMOVE repetition on the
  			 * end-vma->vm_end range, but the manager can
  			 * handle a repetition fine.
  			 */
  			end = vma->vm_end;
  		}
  		VM_WARN_ON(start >= end);
  	}
230ca982b   Mike Rapoport   userfaultfd: non-...
545
546
547
548
549
550
551
  
  	if (behavior == MADV_DONTNEED)
  		return madvise_dontneed_single_vma(vma, start, end);
  	else if (behavior == MADV_FREE)
  		return madvise_free_single_vma(vma, start, end);
  	else
  		return -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
552
  }
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
553
554
555
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
556
557
   */
  static long madvise_remove(struct vm_area_struct *vma,
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
558
  				struct vm_area_struct **prev,
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
559
560
  				unsigned long start, unsigned long end)
  {
3f31d0757   Hugh Dickins   mm/fs: route MADV...
561
  	loff_t offset;
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
562
  	int error;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
563
  	struct file *f;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
564

90ed52ebe   Hugh Dickins   [PATCH] holepunch...
565
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
566

72079ba0d   Mike Kravetz   mm: madvise allow...
567
  	if (vma->vm_flags & VM_LOCKED)
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
568
  		return -EINVAL;
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
569
570
571
  	f = vma->vm_file;
  
  	if (!f || !f->f_mapping || !f->f_mapping->host) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
572
573
  			return -EINVAL;
  	}
69cf0fac6   Hugh Dickins   [PATCH] Fix MADV_...
574
575
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
576
577
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
578

9ab4233dd   Andy Lutomirski   mm: Hold a file r...
579
580
581
582
583
584
585
  	/*
  	 * Filesystem's fallocate may need to take i_mutex.  We need to
  	 * explicitly grab a reference because the vma (and hence the
  	 * vma's reference to the file) can go away as soon as we drop
  	 * mmap_sem.
  	 */
  	get_file(f);
70ccb92fd   Andrea Arcangeli   userfaultfd: non-...
586
587
588
589
  	if (userfaultfd_remove(vma, start, end)) {
  		/* mmap_sem was not released by userfaultfd_remove() */
  		up_read(&current->mm->mmap_sem);
  	}
72c72bdf7   Anna Schumaker   VFS: Rename do_fa...
590
  	error = vfs_fallocate(f,
3f31d0757   Hugh Dickins   mm/fs: route MADV...
591
592
  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  				offset, end - start);
9ab4233dd   Andy Lutomirski   mm: Hold a file r...
593
  	fput(f);
0a27a14a6   Nick Piggin   mm: madvise avoid...
594
  	down_read(&current->mm->mmap_sem);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
595
  	return error;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
596
  }
9893e49d6   Andi Kleen   HWPOISON: Add mad...
597
598
599
600
  #ifdef CONFIG_MEMORY_FAILURE
  /*
   * Error injection support for memory error handling.
   */
97167a768   Anshuman Khandual   mm/madvise.c: cle...
601
602
  static int madvise_inject_error(int behavior,
  		unsigned long start, unsigned long end)
9893e49d6   Andi Kleen   HWPOISON: Add mad...
603
  {
97167a768   Anshuman Khandual   mm/madvise.c: cle...
604
  	struct page *page;
c461ad6a6   Mel Gorman   mm, madvise: ensu...
605
  	struct zone *zone;
19bfbe22f   Alexandru Moise   mm, hugetlb, soft...
606
  	unsigned int order;
97167a768   Anshuman Khandual   mm/madvise.c: cle...
607

9893e49d6   Andi Kleen   HWPOISON: Add mad...
608
609
  	if (!capable(CAP_SYS_ADMIN))
  		return -EPERM;
97167a768   Anshuman Khandual   mm/madvise.c: cle...
610

19bfbe22f   Alexandru Moise   mm, hugetlb, soft...
611
612
  
  	for (; start < end; start += PAGE_SIZE << order) {
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
613
  		int ret;
97167a768   Anshuman Khandual   mm/madvise.c: cle...
614
  		ret = get_user_pages_fast(start, 1, 0, &page);
9893e49d6   Andi Kleen   HWPOISON: Add mad...
615
616
  		if (ret != 1)
  			return ret;
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
617

19bfbe22f   Alexandru Moise   mm, hugetlb, soft...
618
619
620
621
622
623
  		/*
  		 * When soft offlining hugepages, after migrating the page
  		 * we dissolve it, therefore in the second loop "page" will
  		 * no longer be a compound page, and order will be 0.
  		 */
  		order = compound_order(compound_head(page));
97167a768   Anshuman Khandual   mm/madvise.c: cle...
624
625
  		if (PageHWPoison(page)) {
  			put_page(page);
29b4eedee   Wanpeng Li   mm/hwpoison.c: fi...
626
627
  			continue;
  		}
97167a768   Anshuman Khandual   mm/madvise.c: cle...
628
629
630
631
632
633
634
  
  		if (behavior == MADV_SOFT_OFFLINE) {
  			pr_info("Soft offlining pfn %#lx at process virtual address %#lx
  ",
  						page_to_pfn(page), start);
  
  			ret = soft_offline_page(page, MF_COUNT_INCREASED);
afcf938ee   Andi Kleen   HWPOISON: Add a m...
635
  			if (ret)
8302423b8   Wanpeng Li   mm/madvise.c: fix...
636
  				return ret;
afcf938ee   Andi Kleen   HWPOISON: Add a m...
637
638
  			continue;
  		}
97167a768   Anshuman Khandual   mm/madvise.c: cle...
639
640
641
642
643
  		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx
  ",
  						page_to_pfn(page), start);
  
  		ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
23a003bfd   Naoya Horiguchi   mm/madvise: pass ...
644
645
  		if (ret)
  			return ret;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
646
  	}
c461ad6a6   Mel Gorman   mm, madvise: ensu...
647
648
649
650
  
  	/* Ensure that all poisoned pages are removed from per-cpu lists */
  	for_each_populated_zone(zone)
  		drain_all_pages(zone);
325c4ef5c   Andrew Morton   mm/madvise.c:madv...
651
  	return 0;
9893e49d6   Andi Kleen   HWPOISON: Add mad...
652
653
  }
  #endif
165cd4023   suzuki   [PATCH] madvise()...
654
655
656
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
657
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
658
  	switch (behavior) {
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
659
  	case MADV_REMOVE:
3866ea90d   Hugh Dickins   ksm: first tidy u...
660
  		return madvise_remove(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
661
  	case MADV_WILLNEED:
3866ea90d   Hugh Dickins   ksm: first tidy u...
662
  		return madvise_willneed(vma, prev, start, end);
854e9ed09   Minchan Kim   mm: support madvi...
663
  	case MADV_FREE:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
664
  	case MADV_DONTNEED:
230ca982b   Mike Rapoport   userfaultfd: non-...
665
  		return madvise_dontneed_free(vma, prev, start, end, behavior);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
666
  	default:
3866ea90d   Hugh Dickins   ksm: first tidy u...
667
  		return madvise_behavior(vma, prev, start, end, behavior);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
668
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
669
  }
1ecef9ed0   Nicholas Krause   mm/madvise.c: mak...
670
  static bool
75927af8b   Nick Piggin   mm: madvise(): co...
671
672
673
674
675
676
677
678
679
680
681
  madvise_behavior_valid(int behavior)
  {
  	switch (behavior) {
  	case MADV_DOFORK:
  	case MADV_DONTFORK:
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
  	case MADV_REMOVE:
  	case MADV_WILLNEED:
  	case MADV_DONTNEED:
854e9ed09   Minchan Kim   mm: support madvi...
682
  	case MADV_FREE:
f8af4da3b   Hugh Dickins   ksm: the mm inter...
683
684
685
686
  #ifdef CONFIG_KSM
  	case MADV_MERGEABLE:
  	case MADV_UNMERGEABLE:
  #endif
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
687
688
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  	case MADV_HUGEPAGE:
a664b2d85   Andrea Arcangeli   thp: madvise(MADV...
689
  	case MADV_NOHUGEPAGE:
0af4e98b6   Andrea Arcangeli   thp: madvise(MADV...
690
  #endif
accb61fe7   Jason Baron   coredump: add VM_...
691
692
  	case MADV_DONTDUMP:
  	case MADV_DODUMP:
d2cd9ede6   Rik van Riel   mm,fork: introduc...
693
694
  	case MADV_WIPEONFORK:
  	case MADV_KEEPONFORK:
5e451be75   Anshuman Khandual   mm/madvise: move ...
695
696
697
698
  #ifdef CONFIG_MEMORY_FAILURE
  	case MADV_SOFT_OFFLINE:
  	case MADV_HWPOISON:
  #endif
1ecef9ed0   Nicholas Krause   mm/madvise.c: mak...
699
  		return true;
75927af8b   Nick Piggin   mm: madvise(): co...
700
701
  
  	default:
1ecef9ed0   Nicholas Krause   mm/madvise.c: mak...
702
  		return false;
75927af8b   Nick Piggin   mm: madvise(): co...
703
704
  	}
  }
3866ea90d   Hugh Dickins   ksm: first tidy u...
705

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the appli-
   *		cation will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
d7206a70a   Naoya Horiguchi   mm/madvise: updat...
728
729
   *  MADV_FREE - the application marks pages in the given range as lazy free,
   *		where actual purges are postponed until memory pressure happens.
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
730
731
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
3866ea90d   Hugh Dickins   ksm: first tidy u...
732
733
734
   *  MADV_DONTFORK - omit this area from child's address space when forking:
   *		typically, to avoid COWing pages pinned by get_user_pages().
   *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
c02c30093   Yang Shi   mm/madvise.c: add...
735
736
737
   *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
   *              range after a fork.
   *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
d7206a70a   Naoya Horiguchi   mm/madvise: updat...
738
739
740
   *  MADV_HWPOISON - trigger memory error handler as if the given memory range
   *		were corrupted by unrecoverable hardware memory failure.
   *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
f8af4da3b   Hugh Dickins   ksm: the mm inter...
741
742
743
   *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   *		this area with pages of identical content from other such areas.
   *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
d7206a70a   Naoya Horiguchi   mm/madvise: updat...
744
745
746
747
748
749
750
751
752
   *  MADV_HUGEPAGE - the application wants to back the given range by transparent
   *		huge pages in the future. Existing pages might be coalesced and
   *		new pages might be allocated as THP.
   *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
   *		transparent huge pages so the existing pages will not be
   *		coalesced into THP and new pages will not be allocated as THP.
   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
   *		from being included in its core dump.
   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
753
754
755
756
757
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
c02c30093   Yang Shi   mm/madvise.c: add...
758
759
760
   *		is attempting to release locked or shared pages,
   *		or the specified address range includes file, Huge TLB,
   *		MAP_SHARED or VM_PFNMAP range.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
761
762
763
764
765
766
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   */
3480b2574   Heiko Carstens   [CVE-2009-0029] S...
767
  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
  {
  	unsigned long end, tmp;
  	struct vm_area_struct *vma, *prev;
  	int unmapped_error = 0;
  	int error = -EINVAL;
  	int write;
  	size_t len;
  	struct blk_plug plug;
  
  	if (!madvise_behavior_valid(behavior))
  		return error;
  
  	/* start must be page-aligned */
  	if (start & ~PAGE_MASK)
  		return error;
  	/* round the requested length up to a whole number of pages */
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
  
  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return error;
  
  	end = start + len;
  	if (end < start)	/* range wraps around the address space */
  		return error;
  
  	error = 0;
  	if (end == start)	/* empty range is trivially successful */
  		return error;
  
  #ifdef CONFIG_MEMORY_FAILURE
  	/*
  	 * Error injection is handled before taking mmap_sem; note it is
  	 * passed the original, unrounded len_in.
  	 */
  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
  		return madvise_inject_error(behavior, start, start + len_in);
  #endif
  
  	/* Advice that modifies vm_flags needs mmap_sem held for write. */
  	write = madvise_need_mmap_write(behavior);
  	if (write) {
  		if (down_write_killable(&current->mm->mmap_sem))
  			return -EINTR;
  	} else {
  		down_read(&current->mm->mmap_sem);
  	}
  
  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;
  
  	/* Batch any block I/O (e.g. willneed readahead) issued below. */
  	blk_start_plug(&plug);
  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			goto out;
  
  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			/* gap before this vma: note it, then skip past it */
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				goto out;
  		}
  
  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		tmp = vma->vm_end;
  		if (end < tmp)
  			tmp = end;
  
  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, tmp, behavior);
  		if (error)
  			goto out;
  		start = tmp;
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;
  		/* a gap seen earlier is reported unless a real error beat it */
  		error = unmapped_error;
  		if (start >= end)
  			goto out;
  		if (prev)
  			vma = prev->vm_next;
  		else	/* madvise_remove dropped mmap_sem */
  			vma = find_vma(current->mm, start);
  	}
  out:
  	blk_finish_plug(&plug);
  	if (write)
  		up_write(&current->mm->mmap_sem);
  	else
  		up_read(&current->mm->mmap_sem);
  
  	return error;
  }