Blame view

mm/madvise.c 9.54 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	linux/mm/madvise.c
   *
   * Copyright (C) 1999  Linus Torvalds
   * Copyright (C) 2002  Christoph Hellwig
   */
  
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
05b743847   Prasanna Meda   [PATCH] madvise: ...
11
  #include <linux/mempolicy.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
  #include <linux/hugetlb.h>
e8edc6e03   Alexey Dobriyan   Detach sched.h fr...
13
  #include <linux/sched.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
14
15
  
  /*
   * Decide which mode of mmap_sem a given madvise() behaviour requires.
   *
   * Any behaviour which results in changes to the vma->vm_flags needs to
   * take mmap_sem for writing. Others, which simply traverse vmas, need
   * to only take it for reading.
   *
   * Returns 0 when the read lock is sufficient, 1 when the write lock is
   * required.
   */
  static int madvise_need_mmap_write(int behavior)
  {
  	/* The explicit list of behaviours that get by with the read lock. */
  	if (behavior == MADV_REMOVE ||
  	    behavior == MADV_WILLNEED ||
  	    behavior == MADV_DONTNEED)
  		return 0;

  	/* be safe, default to 1. list exceptions explicitly */
  	return 1;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
34
35
36
   * We can potentially split a vm area into separate
   * areas, each area with its own behavior.
   */
05b743847   Prasanna Meda   [PATCH] madvise: ...
37
38
39
  static long madvise_behavior(struct vm_area_struct * vma,
  		     struct vm_area_struct **prev,
  		     unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
40
41
42
  {
  	struct mm_struct * mm = vma->vm_mm;
  	int error = 0;
05b743847   Prasanna Meda   [PATCH] madvise: ...
43
  	pgoff_t pgoff;
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
44
  	int new_flags = vma->vm_flags;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
45
46
  
  	switch (behavior) {
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
47
48
49
  	case MADV_NORMAL:
  		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  		break;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
50
  	case MADV_SEQUENTIAL:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
51
  		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
52
53
  		break;
  	case MADV_RANDOM:
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
54
  		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
55
  		break;
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
56
57
58
59
60
  	case MADV_DONTFORK:
  		new_flags |= VM_DONTCOPY;
  		break;
  	case MADV_DOFORK:
  		new_flags &= ~VM_DONTCOPY;
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
61
62
  		break;
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
63
64
  	if (new_flags == vma->vm_flags) {
  		*prev = vma;
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
65
  		goto out;
05b743847   Prasanna Meda   [PATCH] madvise: ...
66
67
68
69
70
71
72
73
74
75
76
  	}
  
  	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  				vma->vm_file, pgoff, vma_policy(vma));
  	if (*prev) {
  		vma = *prev;
  		goto success;
  	}
  
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
77
78
79
80
81
82
83
84
85
86
87
88
  
  	if (start != vma->vm_start) {
  		error = split_vma(mm, vma, start, 1);
  		if (error)
  			goto out;
  	}
  
  	if (end != vma->vm_end) {
  		error = split_vma(mm, vma, end, 0);
  		if (error)
  			goto out;
  	}
836d5ffd3   Hugh Dickins   [PATCH] mm: fix m...
89
  success:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
90
91
92
  	/*
  	 * vm_flags is protected by the mmap_sem held in write mode.
  	 */
e798c6e87   Prasanna Meda   [PATCH] madvise: ...
93
  	vma->vm_flags = new_flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
94
95
96
97
98
99
100
101
102
103
104
  
  out:
  	if (error == -ENOMEM)
  		error = -EAGAIN;
  	return error;
  }
  
  /*
   * Schedule all required I/O operations.  Do not wait for completion.
   */
  static long madvise_willneed(struct vm_area_struct * vma,
05b743847   Prasanna Meda   [PATCH] madvise: ...
105
  			     struct vm_area_struct ** prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
106
107
108
  			     unsigned long start, unsigned long end)
  {
  	struct file *file = vma->vm_file;
1bef40032   Suzuki   [PATCH] madvise: ...
109
110
  	if (!file)
  		return -EBADF;
70688e4dd   Nick Piggin   xip: support non-...
111
  	if (file->f_mapping->a_ops->get_xip_mem) {
fe77ba6f4   Carsten Otte   [PATCH] xip: madv...
112
113
114
  		/* no bad return value, but ignore advice */
  		return 0;
  	}
05b743847   Prasanna Meda   [PATCH] madvise: ...
115
  	*prev = vma;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
116
117
118
119
120
121
122
123
124
125
126
127
128
129
  	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  	if (end > vma->vm_end)
  		end = vma->vm_end;
  	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  
  	force_page_cache_readahead(file->f_mapping,
  			file, start, max_sane_readahead(end - start));
  	return 0;
  }
  
  /*
   * Application no longer needs these pages.  If the pages are dirty,
   * it's OK to just throw them away.  The app will be more careful about
   * data it wants to keep.  Be sure to free swap resources too.  The
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
130
   * zap_page_range call sets things up for shrink_active_list to actually free
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
131
132
   * these pages later if no one else has touched them in the meantime,
   * although we could add these pages to a global reuse list for
7e6cbea39   Fernando Luis Vazquez Cao   madvise: update f...
133
   * shrink_active_list to pick up before reclaiming other pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
134
135
136
137
138
139
140
141
142
143
144
145
   *
   * NB: This interface discards data rather than pushes it out to swap,
   * as some implementations do.  This has performance implications for
   * applications like large transactional databases which want to discard
   * pages in anonymous maps after committing to backing store the data
   * that was kept in them.  There is no reason to write this data out to
   * the swap area if the application is discarding it.
   *
   * An interface that causes the system to free clean pages and flush
   * dirty pages is already available as msync(MS_INVALIDATE).
   */
  static long madvise_dontneed(struct vm_area_struct * vma,
05b743847   Prasanna Meda   [PATCH] madvise: ...
146
  			     struct vm_area_struct ** prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
147
148
  			     unsigned long start, unsigned long end)
  {
05b743847   Prasanna Meda   [PATCH] madvise: ...
149
  	*prev = vma;
6aab341e0   Linus Torvalds   mm: re-architect ...
150
  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
151
152
153
154
155
156
157
158
159
160
161
162
  		return -EINVAL;
  
  	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
  		struct zap_details details = {
  			.nonlinear_vma = vma,
  			.last_index = ULONG_MAX,
  		};
  		zap_page_range(vma, start, end - start, &details);
  	} else
  		zap_page_range(vma, start, end - start, NULL);
  	return 0;
  }
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
163
164
165
166
167
168
169
170
  /*
   * Application wants to free up the pages and associated backing store.
   * This is effectively punching a hole into the middle of a file.
   *
   * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
   * Other filesystems return -ENOSYS.
   */
  static long madvise_remove(struct vm_area_struct *vma,
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
171
  				struct vm_area_struct **prev,
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
172
173
174
  				unsigned long start, unsigned long end)
  {
  	struct address_space *mapping;
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
175
176
  	loff_t offset, endoff;
  	int error;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
177

90ed52ebe   Hugh Dickins   [PATCH] holepunch...
178
  	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
179

f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
180
181
182
183
184
185
186
  	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
  		return -EINVAL;
  
  	if (!vma->vm_file || !vma->vm_file->f_mapping
  		|| !vma->vm_file->f_mapping->host) {
  			return -EINVAL;
  	}
69cf0fac6   Hugh Dickins   [PATCH] Fix MADV_...
187
188
  	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
  		return -EACCES;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
189
190
191
192
193
194
  	mapping = vma->vm_file->f_mapping;
  
  	offset = (loff_t)(start - vma->vm_start)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
  	endoff = (loff_t)(end - vma->vm_start - 1)
  			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
195
196
  
  	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
0a27a14a6   Nick Piggin   mm: madvise avoid...
197
  	up_read(&current->mm->mmap_sem);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
198
  	error = vmtruncate_range(mapping->host, offset, endoff);
0a27a14a6   Nick Piggin   mm: madvise avoid...
199
  	down_read(&current->mm->mmap_sem);
90ed52ebe   Hugh Dickins   [PATCH] holepunch...
200
  	return error;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
201
  }
165cd4023   suzuki   [PATCH] madvise()...
202
203
204
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  		unsigned long start, unsigned long end, int behavior)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
205
  {
1bef40032   Suzuki   [PATCH] madvise: ...
206
  	long error;
165cd4023   suzuki   [PATCH] madvise()...
207

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
208
  	switch (behavior) {
f82256616   Michael S. Tsirkin   [PATCH] madvise M...
209
210
211
212
213
214
  	case MADV_DOFORK:
  		if (vma->vm_flags & VM_IO) {
  			error = -EINVAL;
  			break;
  		}
  	case MADV_DONTFORK:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
215
216
217
  	case MADV_NORMAL:
  	case MADV_SEQUENTIAL:
  	case MADV_RANDOM:
05b743847   Prasanna Meda   [PATCH] madvise: ...
218
  		error = madvise_behavior(vma, prev, start, end, behavior);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
219
  		break;
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
220
  	case MADV_REMOVE:
00e9fa2d6   Nick Piggin   [PATCH] mm: fix m...
221
  		error = madvise_remove(vma, prev, start, end);
f6b3ec238   Badari Pulavarty   [PATCH] madvise(M...
222
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
223
224
  
  	case MADV_WILLNEED:
05b743847   Prasanna Meda   [PATCH] madvise: ...
225
  		error = madvise_willneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
226
227
228
  		break;
  
  	case MADV_DONTNEED:
05b743847   Prasanna Meda   [PATCH] madvise: ...
229
  		error = madvise_dontneed(vma, prev, start, end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
230
231
232
233
234
235
  		break;
  
  	default:
  		error = -EINVAL;
  		break;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
  	return error;
  }
  
  /*
   * The madvise(2) system call.
   *
   * Applications can use madvise() to advise the kernel how it should
   * handle paging I/O in this VM area.  The idea is to help the kernel
   * use appropriate read-ahead and caching techniques.  The information
   * provided is advisory only, and can be safely disregarded by the
   * kernel without affecting the correct operation of the application.
   *
   * behavior values:
   *  MADV_NORMAL - the default behavior is to read clusters.  This
   *		results in some read-ahead and read-behind.
   *  MADV_RANDOM - the system should read the minimum amount of data
   *		on any access, since it is unlikely that the appli-
   *		cation will need more than what it asks for.
   *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   *		once, so they can be aggressively read ahead, and
   *		can be freed soon after they are accessed.
   *  MADV_WILLNEED - the application is notifying the system to read
   *		some pages ahead.
   *  MADV_DONTNEED - the application is finished with the given range,
   *		so the kernel can free resources associated with it.
   *  MADV_REMOVE - the application wants to free up the given range of
   *		pages and associated backing store.
   *
   * return values:
   *  zero    - success
   *  -EINVAL - start + len < 0, start is not page-aligned,
   *		"behavior" is not a valid value, or application
   *		is attempting to release locked or shared pages.
   *  -ENOMEM - addresses in the specified range are not currently
   *		mapped, or are outside the AS of the process.
   *  -EIO    - an I/O error occurred while paging in data.
   *  -EBADF  - map exists, but area maps something that isn't a file.
   *  -EAGAIN - a kernel resource was temporarily unavailable.
   *  -EACCES - MADV_REMOVE on a mapping that is not both shared and
   *		writable.
   */
  asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
  {
  	unsigned long end, tmp;
  	struct vm_area_struct *vma, *prev;
  	int unmapped_error = 0;
  	int error = -EINVAL;
  	int write;
  	size_t len;

  	/*
  	 * Check the arguments before taking mmap_sem: none of these tests
  	 * look at the address space, so a call that is rejected or has
  	 * nothing to do need not serialise against other users of the mm.
  	 */
  	if (start & ~PAGE_MASK)
  		return error;
  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

  	/* Check to see whether len was rounded up from small -ve to zero */
  	if (len_in && !len)
  		return error;

  	end = start + len;
  	if (end < start)
  		return error;

  	/* A zero-length request succeeds without doing anything. */
  	if (end == start)
  		return 0;

  	write = madvise_need_mmap_write(behavior);
  	if (write)
  		down_write(&current->mm->mmap_sem);
  	else
  		down_read(&current->mm->mmap_sem);

  	/*
  	 * If the interval [start,end) covers some unmapped address
  	 * ranges, just ignore them, but return -ENOMEM at the end.
  	 * - different from the way of handling in mlock etc.
  	 */
  	vma = find_vma_prev(current->mm, start, &prev);
  	if (vma && start > vma->vm_start)
  		prev = vma;

  	for (;;) {
  		/* Still start < end. */
  		error = -ENOMEM;
  		if (!vma)
  			goto out;

  		/* Here start < (end|vma->vm_end). */
  		if (start < vma->vm_start) {
  			/* Skip the hole, but remember to report it. */
  			unmapped_error = -ENOMEM;
  			start = vma->vm_start;
  			if (start >= end)
  				goto out;
  		}

  		/* Here vma->vm_start <= start < (end|vma->vm_end) */
  		tmp = vma->vm_end;
  		if (end < tmp)
  			tmp = end;

  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
  		error = madvise_vma(vma, &prev, start, tmp, behavior);
  		if (error)
  			goto out;
  		start = tmp;
  		/* prev may now end beyond tmp; continue from where it ends. */
  		if (prev && start < prev->vm_end)
  			start = prev->vm_end;
  		error = unmapped_error;
  		if (start >= end)
  			goto out;
  		if (prev)
  			vma = prev->vm_next;
  		else	/* madvise_remove dropped mmap_sem */
  			vma = find_vma(current->mm, start);
  	}
  out:
  	if (write)
  		up_write(&current->mm->mmap_sem);
  	else
  		up_read(&current->mm->mmap_sem);

  	return error;
  }