Commit c4ea95d7cd08d9ffd7fa75e6c5e0332d596dd11e

Authored by Daniel Forrest
Committed by Linus Torvalds
1 parent 2022b4d18a

mm: fix anon_vma_clone() error treatment

Andrew Morton noticed that the error return from anon_vma_clone() was
being dropped and replaced with -ENOMEM (which is not itself a bug
because the only error return value from anon_vma_clone() is -ENOMEM).

I did an audit of callers of anon_vma_clone() and discovered an actual
bug where the error return was being lost.  In __split_vma(), between
Linux 3.11 and 3.12 the code was changed so the err variable is used
before the call to anon_vma_clone() and the default initial value of
-ENOMEM is overwritten.  So a failure of anon_vma_clone() will return
success since err at this point is now zero.

Below is a patch which fixes this bug and also propagates the error
return value from anon_vma_clone() in all cases.

Fixes: ef0855d334e1 ("mm: mempolicy: turn vma_set_policy() into vma_dup_policy()")
Signed-off-by: Daniel Forrest <dan.forrest@ssec.wisc.edu>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Tim Hartrick <tim@edgecast.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>	[3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
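
The __split_vma() problem described above comes down to an error-code pattern. The following standalone C sketch is illustrative only (it is not the kernel code; split_region_buggy(), split_region_fixed() and the fake_* helpers are invented stand-ins): an err variable pre-seeded with -ENOMEM is overwritten by an earlier step, so a later anon_vma_clone()-style failure whose return value is never stored falls through with err == 0 and the function reports success. Capturing and propagating each callee's return value, as the patch does, removes the hazard.

/*
 * Standalone illustration of the error pattern described in the commit
 * message above.  Nothing here is kernel code: split_region_buggy(),
 * split_region_fixed() and the fake_* helpers are invented stand-ins.
 */
#include <errno.h>
#include <stdio.h>

static int fake_dup_policy(void)        /* stands in for the step that succeeds */
{
        return 0;
}

static int fake_anon_vma_clone(void)    /* stands in for the step that fails */
{
        return -ENOMEM;
}

/* Buggy shape: err is overwritten before the clone-style call, and the
 * clone's return value is tested but never stored, so the failure is lost. */
static int split_region_buggy(void)
{
        int err = -ENOMEM;              /* old "default" initial value */

        err = fake_dup_policy();        /* err is now 0 */
        if (err)
                goto out;

        if (fake_anon_vma_clone())      /* failure detected here... */
                goto out;               /* ...but err is still 0 */

        return 0;
out:
        return err;                     /* 0: failure reported as success */
}

/* Fixed shape: capture and propagate the callee's error code. */
static int split_region_fixed(void)
{
        int err;

        err = fake_dup_policy();
        if (err)
                goto out;

        err = fake_anon_vma_clone();
        if (err)
                goto out;

        return 0;
out:
        return err;                     /* -ENOMEM */
}

int main(void)
{
        printf("buggy returns %d, fixed returns %d\n",
               split_region_buggy(), split_region_fixed());
        return 0;
}

Compiled and run, this prints "buggy returns 0, fixed returns -12" (ENOMEM is 12 on Linux): the buggy shape silently turns the failure into success, which is the behaviour the patch removes by propagating anon_vma_clone()'s return value in every caller.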

Showing 2 changed files with 11 additions and 5 deletions (inline diff)

1 /* 1 /*
2 * mm/mmap.c 2 * mm/mmap.c
3 * 3 *
4 * Written by obz. 4 * Written by obz.
5 * 5 *
6 * Address space accounting code <alan@lxorguk.ukuu.org.uk> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 10
11 #include <linux/kernel.h> 11 #include <linux/kernel.h>
12 #include <linux/slab.h> 12 #include <linux/slab.h>
13 #include <linux/backing-dev.h> 13 #include <linux/backing-dev.h>
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/vmacache.h> 15 #include <linux/vmacache.h>
16 #include <linux/shm.h> 16 #include <linux/shm.h>
17 #include <linux/mman.h> 17 #include <linux/mman.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/syscalls.h> 20 #include <linux/syscalls.h>
21 #include <linux/capability.h> 21 #include <linux/capability.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/file.h> 23 #include <linux/file.h>
24 #include <linux/fs.h> 24 #include <linux/fs.h>
25 #include <linux/personality.h> 25 #include <linux/personality.h>
26 #include <linux/security.h> 26 #include <linux/security.h>
27 #include <linux/hugetlb.h> 27 #include <linux/hugetlb.h>
28 #include <linux/profile.h> 28 #include <linux/profile.h>
29 #include <linux/export.h> 29 #include <linux/export.h>
30 #include <linux/mount.h> 30 #include <linux/mount.h>
31 #include <linux/mempolicy.h> 31 #include <linux/mempolicy.h>
32 #include <linux/rmap.h> 32 #include <linux/rmap.h>
33 #include <linux/mmu_notifier.h> 33 #include <linux/mmu_notifier.h>
34 #include <linux/mmdebug.h> 34 #include <linux/mmdebug.h>
35 #include <linux/perf_event.h> 35 #include <linux/perf_event.h>
36 #include <linux/audit.h> 36 #include <linux/audit.h>
37 #include <linux/khugepaged.h> 37 #include <linux/khugepaged.h>
38 #include <linux/uprobes.h> 38 #include <linux/uprobes.h>
39 #include <linux/rbtree_augmented.h> 39 #include <linux/rbtree_augmented.h>
40 #include <linux/sched/sysctl.h> 40 #include <linux/sched/sysctl.h>
41 #include <linux/notifier.h> 41 #include <linux/notifier.h>
42 #include <linux/memory.h> 42 #include <linux/memory.h>
43 #include <linux/printk.h> 43 #include <linux/printk.h>
44 44
45 #include <asm/uaccess.h> 45 #include <asm/uaccess.h>
46 #include <asm/cacheflush.h> 46 #include <asm/cacheflush.h>
47 #include <asm/tlb.h> 47 #include <asm/tlb.h>
48 #include <asm/mmu_context.h> 48 #include <asm/mmu_context.h>
49 49
50 #include "internal.h" 50 #include "internal.h"
51 51
52 #ifndef arch_mmap_check 52 #ifndef arch_mmap_check
53 #define arch_mmap_check(addr, len, flags) (0) 53 #define arch_mmap_check(addr, len, flags) (0)
54 #endif 54 #endif
55 55
56 #ifndef arch_rebalance_pgtables 56 #ifndef arch_rebalance_pgtables
57 #define arch_rebalance_pgtables(addr, len) (addr) 57 #define arch_rebalance_pgtables(addr, len) (addr)
58 #endif 58 #endif
59 59
60 static void unmap_region(struct mm_struct *mm, 60 static void unmap_region(struct mm_struct *mm,
61 struct vm_area_struct *vma, struct vm_area_struct *prev, 61 struct vm_area_struct *vma, struct vm_area_struct *prev,
62 unsigned long start, unsigned long end); 62 unsigned long start, unsigned long end);
63 63
64 /* description of effects of mapping type and prot in current implementation. 64 /* description of effects of mapping type and prot in current implementation.
65 * this is due to the limited x86 page protection hardware. The expected 65 * this is due to the limited x86 page protection hardware. The expected
66 * behavior is in parens: 66 * behavior is in parens:
67 * 67 *
68 * map_type prot 68 * map_type prot
69 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC 69 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
71 * w: (no) no w: (no) no w: (yes) yes w: (no) no 71 * w: (no) no w: (no) no w: (yes) yes w: (no) no
72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
73 * 73 *
74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
75 * w: (no) no w: (no) no w: (copy) copy w: (no) no 75 * w: (no) no w: (no) no w: (copy) copy w: (no) no
76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
77 * 77 *
78 */ 78 */
79 pgprot_t protection_map[16] = { 79 pgprot_t protection_map[16] = {
80 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, 80 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
81 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 81 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
82 }; 82 };
83 83
84 pgprot_t vm_get_page_prot(unsigned long vm_flags) 84 pgprot_t vm_get_page_prot(unsigned long vm_flags)
85 { 85 {
86 return __pgprot(pgprot_val(protection_map[vm_flags & 86 return __pgprot(pgprot_val(protection_map[vm_flags &
87 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | 87 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
88 pgprot_val(arch_vm_get_page_prot(vm_flags))); 88 pgprot_val(arch_vm_get_page_prot(vm_flags)));
89 } 89 }
90 EXPORT_SYMBOL(vm_get_page_prot); 90 EXPORT_SYMBOL(vm_get_page_prot);
91 91
92 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) 92 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
93 { 93 {
94 return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); 94 return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
95 } 95 }
96 96
97 /* Update vma->vm_page_prot to reflect vma->vm_flags. */ 97 /* Update vma->vm_page_prot to reflect vma->vm_flags. */
98 void vma_set_page_prot(struct vm_area_struct *vma) 98 void vma_set_page_prot(struct vm_area_struct *vma)
99 { 99 {
100 unsigned long vm_flags = vma->vm_flags; 100 unsigned long vm_flags = vma->vm_flags;
101 101
102 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); 102 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
103 if (vma_wants_writenotify(vma)) { 103 if (vma_wants_writenotify(vma)) {
104 vm_flags &= ~VM_SHARED; 104 vm_flags &= ~VM_SHARED;
105 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, 105 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
106 vm_flags); 106 vm_flags);
107 } 107 }
108 } 108 }
109 109
110 110
111 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 111 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
112 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 112 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
113 unsigned long sysctl_overcommit_kbytes __read_mostly; 113 unsigned long sysctl_overcommit_kbytes __read_mostly;
114 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 114 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
115 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 115 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
116 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 116 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
117 /* 117 /*
118 * Make sure vm_committed_as in one cacheline and not cacheline shared with 118 * Make sure vm_committed_as in one cacheline and not cacheline shared with
119 * other variables. It can be updated by several CPUs frequently. 119 * other variables. It can be updated by several CPUs frequently.
120 */ 120 */
121 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 121 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
122 122
123 /* 123 /*
124 * The global memory commitment made in the system can be a metric 124 * The global memory commitment made in the system can be a metric
125 * that can be used to drive ballooning decisions when Linux is hosted 125 * that can be used to drive ballooning decisions when Linux is hosted
126 * as a guest. On Hyper-V, the host implements a policy engine for dynamically 126 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
127 * balancing memory across competing virtual machines that are hosted. 127 * balancing memory across competing virtual machines that are hosted.
128 * Several metrics drive this policy engine including the guest reported 128 * Several metrics drive this policy engine including the guest reported
129 * memory commitment. 129 * memory commitment.
130 */ 130 */
131 unsigned long vm_memory_committed(void) 131 unsigned long vm_memory_committed(void)
132 { 132 {
133 return percpu_counter_read_positive(&vm_committed_as); 133 return percpu_counter_read_positive(&vm_committed_as);
134 } 134 }
135 EXPORT_SYMBOL_GPL(vm_memory_committed); 135 EXPORT_SYMBOL_GPL(vm_memory_committed);
136 136
137 /* 137 /*
138 * Check that a process has enough memory to allocate a new virtual 138 * Check that a process has enough memory to allocate a new virtual
139 * mapping. 0 means there is enough memory for the allocation to 139 * mapping. 0 means there is enough memory for the allocation to
140 * succeed and -ENOMEM implies there is not. 140 * succeed and -ENOMEM implies there is not.
141 * 141 *
142 * We currently support three overcommit policies, which are set via the 142 * We currently support three overcommit policies, which are set via the
143 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 143 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
144 * 144 *
145 * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 145 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
146 * Additional code 2002 Jul 20 by Robert Love. 146 * Additional code 2002 Jul 20 by Robert Love.
147 * 147 *
148 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 148 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
149 * 149 *
150 * Note this is a helper function intended to be used by LSMs which 150 * Note this is a helper function intended to be used by LSMs which
151 * wish to use this logic. 151 * wish to use this logic.
152 */ 152 */
153 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 153 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
154 { 154 {
155 unsigned long free, allowed, reserve; 155 unsigned long free, allowed, reserve;
156 156
157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < 157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
158 -(s64)vm_committed_as_batch * num_online_cpus(), 158 -(s64)vm_committed_as_batch * num_online_cpus(),
159 "memory commitment underflow"); 159 "memory commitment underflow");
160 160
161 vm_acct_memory(pages); 161 vm_acct_memory(pages);
162 162
163 /* 163 /*
164 * Sometimes we want to use more memory than we have 164 * Sometimes we want to use more memory than we have
165 */ 165 */
166 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 166 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
167 return 0; 167 return 0;
168 168
169 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 169 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
170 free = global_page_state(NR_FREE_PAGES); 170 free = global_page_state(NR_FREE_PAGES);
171 free += global_page_state(NR_FILE_PAGES); 171 free += global_page_state(NR_FILE_PAGES);
172 172
173 /* 173 /*
174 * shmem pages shouldn't be counted as free in this 174 * shmem pages shouldn't be counted as free in this
175 * case, they can't be purged, only swapped out, and 175 * case, they can't be purged, only swapped out, and
176 * that won't affect the overall amount of available 176 * that won't affect the overall amount of available
177 * memory in the system. 177 * memory in the system.
178 */ 178 */
179 free -= global_page_state(NR_SHMEM); 179 free -= global_page_state(NR_SHMEM);
180 180
181 free += get_nr_swap_pages(); 181 free += get_nr_swap_pages();
182 182
183 /* 183 /*
184 * Any slabs which are created with the 184 * Any slabs which are created with the
185 * SLAB_RECLAIM_ACCOUNT flag claim to have contents 185 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
186 * which are reclaimable, under pressure. The dentry 186 * which are reclaimable, under pressure. The dentry
187 * cache and most inode caches should fall into this 187 * cache and most inode caches should fall into this
188 */ 188 */
189 free += global_page_state(NR_SLAB_RECLAIMABLE); 189 free += global_page_state(NR_SLAB_RECLAIMABLE);
190 190
191 /* 191 /*
192 * Leave reserved pages. The pages are not for anonymous pages. 192 * Leave reserved pages. The pages are not for anonymous pages.
193 */ 193 */
194 if (free <= totalreserve_pages) 194 if (free <= totalreserve_pages)
195 goto error; 195 goto error;
196 else 196 else
197 free -= totalreserve_pages; 197 free -= totalreserve_pages;
198 198
199 /* 199 /*
200 * Reserve some for root 200 * Reserve some for root
201 */ 201 */
202 if (!cap_sys_admin) 202 if (!cap_sys_admin)
203 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 203 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
204 204
205 if (free > pages) 205 if (free > pages)
206 return 0; 206 return 0;
207 207
208 goto error; 208 goto error;
209 } 209 }
210 210
211 allowed = vm_commit_limit(); 211 allowed = vm_commit_limit();
212 /* 212 /*
213 * Reserve some for root 213 * Reserve some for root
214 */ 214 */
215 if (!cap_sys_admin) 215 if (!cap_sys_admin)
216 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 216 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
217 217
218 /* 218 /*
219 * Don't let a single process grow so big a user can't recover 219 * Don't let a single process grow so big a user can't recover
220 */ 220 */
221 if (mm) { 221 if (mm) {
222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
223 allowed -= min(mm->total_vm / 32, reserve); 223 allowed -= min(mm->total_vm / 32, reserve);
224 } 224 }
225 225
226 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 226 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
227 return 0; 227 return 0;
228 error: 228 error:
229 vm_unacct_memory(pages); 229 vm_unacct_memory(pages);
230 230
231 return -ENOMEM; 231 return -ENOMEM;
232 } 232 }
233 233
234 /* 234 /*
235 * Requires inode->i_mapping->i_mmap_mutex 235 * Requires inode->i_mapping->i_mmap_mutex
236 */ 236 */
237 static void __remove_shared_vm_struct(struct vm_area_struct *vma, 237 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
238 struct file *file, struct address_space *mapping) 238 struct file *file, struct address_space *mapping)
239 { 239 {
240 if (vma->vm_flags & VM_DENYWRITE) 240 if (vma->vm_flags & VM_DENYWRITE)
241 atomic_inc(&file_inode(file)->i_writecount); 241 atomic_inc(&file_inode(file)->i_writecount);
242 if (vma->vm_flags & VM_SHARED) 242 if (vma->vm_flags & VM_SHARED)
243 mapping_unmap_writable(mapping); 243 mapping_unmap_writable(mapping);
244 244
245 flush_dcache_mmap_lock(mapping); 245 flush_dcache_mmap_lock(mapping);
246 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 246 if (unlikely(vma->vm_flags & VM_NONLINEAR))
247 list_del_init(&vma->shared.nonlinear); 247 list_del_init(&vma->shared.nonlinear);
248 else 248 else
249 vma_interval_tree_remove(vma, &mapping->i_mmap); 249 vma_interval_tree_remove(vma, &mapping->i_mmap);
250 flush_dcache_mmap_unlock(mapping); 250 flush_dcache_mmap_unlock(mapping);
251 } 251 }
252 252
253 /* 253 /*
254 * Unlink a file-based vm structure from its interval tree, to hide 254 * Unlink a file-based vm structure from its interval tree, to hide
255 * vma from rmap and vmtruncate before freeing its page tables. 255 * vma from rmap and vmtruncate before freeing its page tables.
256 */ 256 */
257 void unlink_file_vma(struct vm_area_struct *vma) 257 void unlink_file_vma(struct vm_area_struct *vma)
258 { 258 {
259 struct file *file = vma->vm_file; 259 struct file *file = vma->vm_file;
260 260
261 if (file) { 261 if (file) {
262 struct address_space *mapping = file->f_mapping; 262 struct address_space *mapping = file->f_mapping;
263 mutex_lock(&mapping->i_mmap_mutex); 263 mutex_lock(&mapping->i_mmap_mutex);
264 __remove_shared_vm_struct(vma, file, mapping); 264 __remove_shared_vm_struct(vma, file, mapping);
265 mutex_unlock(&mapping->i_mmap_mutex); 265 mutex_unlock(&mapping->i_mmap_mutex);
266 } 266 }
267 } 267 }
268 268
269 /* 269 /*
270 * Close a vm structure and free it, returning the next. 270 * Close a vm structure and free it, returning the next.
271 */ 271 */
272 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 272 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
273 { 273 {
274 struct vm_area_struct *next = vma->vm_next; 274 struct vm_area_struct *next = vma->vm_next;
275 275
276 might_sleep(); 276 might_sleep();
277 if (vma->vm_ops && vma->vm_ops->close) 277 if (vma->vm_ops && vma->vm_ops->close)
278 vma->vm_ops->close(vma); 278 vma->vm_ops->close(vma);
279 if (vma->vm_file) 279 if (vma->vm_file)
280 fput(vma->vm_file); 280 fput(vma->vm_file);
281 mpol_put(vma_policy(vma)); 281 mpol_put(vma_policy(vma));
282 kmem_cache_free(vm_area_cachep, vma); 282 kmem_cache_free(vm_area_cachep, vma);
283 return next; 283 return next;
284 } 284 }
285 285
286 static unsigned long do_brk(unsigned long addr, unsigned long len); 286 static unsigned long do_brk(unsigned long addr, unsigned long len);
287 287
288 SYSCALL_DEFINE1(brk, unsigned long, brk) 288 SYSCALL_DEFINE1(brk, unsigned long, brk)
289 { 289 {
290 unsigned long retval; 290 unsigned long retval;
291 unsigned long newbrk, oldbrk; 291 unsigned long newbrk, oldbrk;
292 struct mm_struct *mm = current->mm; 292 struct mm_struct *mm = current->mm;
293 unsigned long min_brk; 293 unsigned long min_brk;
294 bool populate; 294 bool populate;
295 295
296 down_write(&mm->mmap_sem); 296 down_write(&mm->mmap_sem);
297 297
298 #ifdef CONFIG_COMPAT_BRK 298 #ifdef CONFIG_COMPAT_BRK
299 /* 299 /*
300 * CONFIG_COMPAT_BRK can still be overridden by setting 300 * CONFIG_COMPAT_BRK can still be overridden by setting
301 * randomize_va_space to 2, which will still cause mm->start_brk 301 * randomize_va_space to 2, which will still cause mm->start_brk
302 * to be arbitrarily shifted 302 * to be arbitrarily shifted
303 */ 303 */
304 if (current->brk_randomized) 304 if (current->brk_randomized)
305 min_brk = mm->start_brk; 305 min_brk = mm->start_brk;
306 else 306 else
307 min_brk = mm->end_data; 307 min_brk = mm->end_data;
308 #else 308 #else
309 min_brk = mm->start_brk; 309 min_brk = mm->start_brk;
310 #endif 310 #endif
311 if (brk < min_brk) 311 if (brk < min_brk)
312 goto out; 312 goto out;
313 313
314 /* 314 /*
315 * Check against rlimit here. If this check is done later after the test 315 * Check against rlimit here. If this check is done later after the test
316 * of oldbrk with newbrk then it can escape the test and let the data 316 * of oldbrk with newbrk then it can escape the test and let the data
317 * segment grow beyond its set limit the in case where the limit is 317 * segment grow beyond its set limit the in case where the limit is
318 * not page aligned -Ram Gupta 318 * not page aligned -Ram Gupta
319 */ 319 */
320 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, 320 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
321 mm->end_data, mm->start_data)) 321 mm->end_data, mm->start_data))
322 goto out; 322 goto out;
323 323
324 newbrk = PAGE_ALIGN(brk); 324 newbrk = PAGE_ALIGN(brk);
325 oldbrk = PAGE_ALIGN(mm->brk); 325 oldbrk = PAGE_ALIGN(mm->brk);
326 if (oldbrk == newbrk) 326 if (oldbrk == newbrk)
327 goto set_brk; 327 goto set_brk;
328 328
329 /* Always allow shrinking brk. */ 329 /* Always allow shrinking brk. */
330 if (brk <= mm->brk) { 330 if (brk <= mm->brk) {
331 if (!do_munmap(mm, newbrk, oldbrk-newbrk)) 331 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
332 goto set_brk; 332 goto set_brk;
333 goto out; 333 goto out;
334 } 334 }
335 335
336 /* Check against existing mmap mappings. */ 336 /* Check against existing mmap mappings. */
337 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) 337 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
338 goto out; 338 goto out;
339 339
340 /* Ok, looks good - let it rip. */ 340 /* Ok, looks good - let it rip. */
341 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 341 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
342 goto out; 342 goto out;
343 343
344 set_brk: 344 set_brk:
345 mm->brk = brk; 345 mm->brk = brk;
346 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; 346 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
347 up_write(&mm->mmap_sem); 347 up_write(&mm->mmap_sem);
348 if (populate) 348 if (populate)
349 mm_populate(oldbrk, newbrk - oldbrk); 349 mm_populate(oldbrk, newbrk - oldbrk);
350 return brk; 350 return brk;
351 351
352 out: 352 out:
353 retval = mm->brk; 353 retval = mm->brk;
354 up_write(&mm->mmap_sem); 354 up_write(&mm->mmap_sem);
355 return retval; 355 return retval;
356 } 356 }
357 357
358 static long vma_compute_subtree_gap(struct vm_area_struct *vma) 358 static long vma_compute_subtree_gap(struct vm_area_struct *vma)
359 { 359 {
360 unsigned long max, subtree_gap; 360 unsigned long max, subtree_gap;
361 max = vma->vm_start; 361 max = vma->vm_start;
362 if (vma->vm_prev) 362 if (vma->vm_prev)
363 max -= vma->vm_prev->vm_end; 363 max -= vma->vm_prev->vm_end;
364 if (vma->vm_rb.rb_left) { 364 if (vma->vm_rb.rb_left) {
365 subtree_gap = rb_entry(vma->vm_rb.rb_left, 365 subtree_gap = rb_entry(vma->vm_rb.rb_left,
366 struct vm_area_struct, vm_rb)->rb_subtree_gap; 366 struct vm_area_struct, vm_rb)->rb_subtree_gap;
367 if (subtree_gap > max) 367 if (subtree_gap > max)
368 max = subtree_gap; 368 max = subtree_gap;
369 } 369 }
370 if (vma->vm_rb.rb_right) { 370 if (vma->vm_rb.rb_right) {
371 subtree_gap = rb_entry(vma->vm_rb.rb_right, 371 subtree_gap = rb_entry(vma->vm_rb.rb_right,
372 struct vm_area_struct, vm_rb)->rb_subtree_gap; 372 struct vm_area_struct, vm_rb)->rb_subtree_gap;
373 if (subtree_gap > max) 373 if (subtree_gap > max)
374 max = subtree_gap; 374 max = subtree_gap;
375 } 375 }
376 return max; 376 return max;
377 } 377 }
378 378
379 #ifdef CONFIG_DEBUG_VM_RB 379 #ifdef CONFIG_DEBUG_VM_RB
380 static int browse_rb(struct rb_root *root) 380 static int browse_rb(struct rb_root *root)
381 { 381 {
382 int i = 0, j, bug = 0; 382 int i = 0, j, bug = 0;
383 struct rb_node *nd, *pn = NULL; 383 struct rb_node *nd, *pn = NULL;
384 unsigned long prev = 0, pend = 0; 384 unsigned long prev = 0, pend = 0;
385 385
386 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 386 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
387 struct vm_area_struct *vma; 387 struct vm_area_struct *vma;
388 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 388 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
389 if (vma->vm_start < prev) { 389 if (vma->vm_start < prev) {
390 pr_emerg("vm_start %lx < prev %lx\n", 390 pr_emerg("vm_start %lx < prev %lx\n",
391 vma->vm_start, prev); 391 vma->vm_start, prev);
392 bug = 1; 392 bug = 1;
393 } 393 }
394 if (vma->vm_start < pend) { 394 if (vma->vm_start < pend) {
395 pr_emerg("vm_start %lx < pend %lx\n", 395 pr_emerg("vm_start %lx < pend %lx\n",
396 vma->vm_start, pend); 396 vma->vm_start, pend);
397 bug = 1; 397 bug = 1;
398 } 398 }
399 if (vma->vm_start > vma->vm_end) { 399 if (vma->vm_start > vma->vm_end) {
400 pr_emerg("vm_start %lx > vm_end %lx\n", 400 pr_emerg("vm_start %lx > vm_end %lx\n",
401 vma->vm_start, vma->vm_end); 401 vma->vm_start, vma->vm_end);
402 bug = 1; 402 bug = 1;
403 } 403 }
404 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 404 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
405 pr_emerg("free gap %lx, correct %lx\n", 405 pr_emerg("free gap %lx, correct %lx\n",
406 vma->rb_subtree_gap, 406 vma->rb_subtree_gap,
407 vma_compute_subtree_gap(vma)); 407 vma_compute_subtree_gap(vma));
408 bug = 1; 408 bug = 1;
409 } 409 }
410 i++; 410 i++;
411 pn = nd; 411 pn = nd;
412 prev = vma->vm_start; 412 prev = vma->vm_start;
413 pend = vma->vm_end; 413 pend = vma->vm_end;
414 } 414 }
415 j = 0; 415 j = 0;
416 for (nd = pn; nd; nd = rb_prev(nd)) 416 for (nd = pn; nd; nd = rb_prev(nd))
417 j++; 417 j++;
418 if (i != j) { 418 if (i != j) {
419 pr_emerg("backwards %d, forwards %d\n", j, i); 419 pr_emerg("backwards %d, forwards %d\n", j, i);
420 bug = 1; 420 bug = 1;
421 } 421 }
422 return bug ? -1 : i; 422 return bug ? -1 : i;
423 } 423 }
424 424
425 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) 425 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
426 { 426 {
427 struct rb_node *nd; 427 struct rb_node *nd;
428 428
429 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 429 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
430 struct vm_area_struct *vma; 430 struct vm_area_struct *vma;
431 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 431 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
432 VM_BUG_ON_VMA(vma != ignore && 432 VM_BUG_ON_VMA(vma != ignore &&
433 vma->rb_subtree_gap != vma_compute_subtree_gap(vma), 433 vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
434 vma); 434 vma);
435 } 435 }
436 } 436 }
437 437
438 static void validate_mm(struct mm_struct *mm) 438 static void validate_mm(struct mm_struct *mm)
439 { 439 {
440 int bug = 0; 440 int bug = 0;
441 int i = 0; 441 int i = 0;
442 unsigned long highest_address = 0; 442 unsigned long highest_address = 0;
443 struct vm_area_struct *vma = mm->mmap; 443 struct vm_area_struct *vma = mm->mmap;
444 444
445 while (vma) { 445 while (vma) {
446 struct anon_vma_chain *avc; 446 struct anon_vma_chain *avc;
447 447
448 vma_lock_anon_vma(vma); 448 vma_lock_anon_vma(vma);
449 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 449 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
450 anon_vma_interval_tree_verify(avc); 450 anon_vma_interval_tree_verify(avc);
451 vma_unlock_anon_vma(vma); 451 vma_unlock_anon_vma(vma);
452 highest_address = vma->vm_end; 452 highest_address = vma->vm_end;
453 vma = vma->vm_next; 453 vma = vma->vm_next;
454 i++; 454 i++;
455 } 455 }
456 if (i != mm->map_count) { 456 if (i != mm->map_count) {
457 pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); 457 pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
458 bug = 1; 458 bug = 1;
459 } 459 }
460 if (highest_address != mm->highest_vm_end) { 460 if (highest_address != mm->highest_vm_end) {
461 pr_emerg("mm->highest_vm_end %lx, found %lx\n", 461 pr_emerg("mm->highest_vm_end %lx, found %lx\n",
462 mm->highest_vm_end, highest_address); 462 mm->highest_vm_end, highest_address);
463 bug = 1; 463 bug = 1;
464 } 464 }
465 i = browse_rb(&mm->mm_rb); 465 i = browse_rb(&mm->mm_rb);
466 if (i != mm->map_count) { 466 if (i != mm->map_count) {
467 if (i != -1) 467 if (i != -1)
468 pr_emerg("map_count %d rb %d\n", mm->map_count, i); 468 pr_emerg("map_count %d rb %d\n", mm->map_count, i);
469 bug = 1; 469 bug = 1;
470 } 470 }
471 VM_BUG_ON_MM(bug, mm); 471 VM_BUG_ON_MM(bug, mm);
472 } 472 }
473 #else 473 #else
474 #define validate_mm_rb(root, ignore) do { } while (0) 474 #define validate_mm_rb(root, ignore) do { } while (0)
475 #define validate_mm(mm) do { } while (0) 475 #define validate_mm(mm) do { } while (0)
476 #endif 476 #endif
477 477
478 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, 478 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
479 unsigned long, rb_subtree_gap, vma_compute_subtree_gap) 479 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
480 480
481 /* 481 /*
482 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or 482 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
483 * vma->vm_prev->vm_end values changed, without modifying the vma's position 483 * vma->vm_prev->vm_end values changed, without modifying the vma's position
484 * in the rbtree. 484 * in the rbtree.
485 */ 485 */
486 static void vma_gap_update(struct vm_area_struct *vma) 486 static void vma_gap_update(struct vm_area_struct *vma)
487 { 487 {
488 /* 488 /*
489 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback 489 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
490 * function that does exacltly what we want. 490 * function that does exacltly what we want.
491 */ 491 */
492 vma_gap_callbacks_propagate(&vma->vm_rb, NULL); 492 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
493 } 493 }
494 494
495 static inline void vma_rb_insert(struct vm_area_struct *vma, 495 static inline void vma_rb_insert(struct vm_area_struct *vma,
496 struct rb_root *root) 496 struct rb_root *root)
497 { 497 {
498 /* All rb_subtree_gap values must be consistent prior to insertion */ 498 /* All rb_subtree_gap values must be consistent prior to insertion */
499 validate_mm_rb(root, NULL); 499 validate_mm_rb(root, NULL);
500 500
501 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 501 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
502 } 502 }
503 503
504 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) 504 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
505 { 505 {
506 /* 506 /*
507 * All rb_subtree_gap values must be consistent prior to erase, 507 * All rb_subtree_gap values must be consistent prior to erase,
508 * with the possible exception of the vma being erased. 508 * with the possible exception of the vma being erased.
509 */ 509 */
510 validate_mm_rb(root, vma); 510 validate_mm_rb(root, vma);
511 511
512 /* 512 /*
513 * Note rb_erase_augmented is a fairly large inline function, 513 * Note rb_erase_augmented is a fairly large inline function,
514 * so make sure we instantiate it only once with our desired 514 * so make sure we instantiate it only once with our desired
515 * augmented rbtree callbacks. 515 * augmented rbtree callbacks.
516 */ 516 */
517 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 517 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
518 } 518 }
519 519
520 /* 520 /*
521 * vma has some anon_vma assigned, and is already inserted on that 521 * vma has some anon_vma assigned, and is already inserted on that
522 * anon_vma's interval trees. 522 * anon_vma's interval trees.
523 * 523 *
524 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 524 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
525 * vma must be removed from the anon_vma's interval trees using 525 * vma must be removed from the anon_vma's interval trees using
526 * anon_vma_interval_tree_pre_update_vma(). 526 * anon_vma_interval_tree_pre_update_vma().
527 * 527 *
528 * After the update, the vma will be reinserted using 528 * After the update, the vma will be reinserted using
529 * anon_vma_interval_tree_post_update_vma(). 529 * anon_vma_interval_tree_post_update_vma().
530 * 530 *
531 * The entire update must be protected by exclusive mmap_sem and by 531 * The entire update must be protected by exclusive mmap_sem and by
532 * the root anon_vma's mutex. 532 * the root anon_vma's mutex.
533 */ 533 */
534 static inline void 534 static inline void
535 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 535 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
536 { 536 {
537 struct anon_vma_chain *avc; 537 struct anon_vma_chain *avc;
538 538
539 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 539 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
540 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 540 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
541 } 541 }
542 542
543 static inline void 543 static inline void
544 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 544 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
545 { 545 {
546 struct anon_vma_chain *avc; 546 struct anon_vma_chain *avc;
547 547
548 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 548 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
549 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 549 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
550 } 550 }
551 551
552 static int find_vma_links(struct mm_struct *mm, unsigned long addr, 552 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
553 unsigned long end, struct vm_area_struct **pprev, 553 unsigned long end, struct vm_area_struct **pprev,
554 struct rb_node ***rb_link, struct rb_node **rb_parent) 554 struct rb_node ***rb_link, struct rb_node **rb_parent)
555 { 555 {
556 struct rb_node **__rb_link, *__rb_parent, *rb_prev; 556 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
557 557
558 __rb_link = &mm->mm_rb.rb_node; 558 __rb_link = &mm->mm_rb.rb_node;
559 rb_prev = __rb_parent = NULL; 559 rb_prev = __rb_parent = NULL;
560 560
561 while (*__rb_link) { 561 while (*__rb_link) {
562 struct vm_area_struct *vma_tmp; 562 struct vm_area_struct *vma_tmp;
563 563
564 __rb_parent = *__rb_link; 564 __rb_parent = *__rb_link;
565 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 565 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
566 566
567 if (vma_tmp->vm_end > addr) { 567 if (vma_tmp->vm_end > addr) {
568 /* Fail if an existing vma overlaps the area */ 568 /* Fail if an existing vma overlaps the area */
569 if (vma_tmp->vm_start < end) 569 if (vma_tmp->vm_start < end)
570 return -ENOMEM; 570 return -ENOMEM;
571 __rb_link = &__rb_parent->rb_left; 571 __rb_link = &__rb_parent->rb_left;
572 } else { 572 } else {
573 rb_prev = __rb_parent; 573 rb_prev = __rb_parent;
574 __rb_link = &__rb_parent->rb_right; 574 __rb_link = &__rb_parent->rb_right;
575 } 575 }
576 } 576 }
577 577
578 *pprev = NULL; 578 *pprev = NULL;
579 if (rb_prev) 579 if (rb_prev)
580 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 580 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
581 *rb_link = __rb_link; 581 *rb_link = __rb_link;
582 *rb_parent = __rb_parent; 582 *rb_parent = __rb_parent;
583 return 0; 583 return 0;
584 } 584 }
585 585
586 static unsigned long count_vma_pages_range(struct mm_struct *mm, 586 static unsigned long count_vma_pages_range(struct mm_struct *mm,
587 unsigned long addr, unsigned long end) 587 unsigned long addr, unsigned long end)
588 { 588 {
589 unsigned long nr_pages = 0; 589 unsigned long nr_pages = 0;
590 struct vm_area_struct *vma; 590 struct vm_area_struct *vma;
591 591
592 /* Find first overlaping mapping */ 592 /* Find first overlaping mapping */
593 vma = find_vma_intersection(mm, addr, end); 593 vma = find_vma_intersection(mm, addr, end);
594 if (!vma) 594 if (!vma)
595 return 0; 595 return 0;
596 596
597 nr_pages = (min(end, vma->vm_end) - 597 nr_pages = (min(end, vma->vm_end) -
598 max(addr, vma->vm_start)) >> PAGE_SHIFT; 598 max(addr, vma->vm_start)) >> PAGE_SHIFT;
599 599
600 /* Iterate over the rest of the overlaps */ 600 /* Iterate over the rest of the overlaps */
601 for (vma = vma->vm_next; vma; vma = vma->vm_next) { 601 for (vma = vma->vm_next; vma; vma = vma->vm_next) {
602 unsigned long overlap_len; 602 unsigned long overlap_len;
603 603
604 if (vma->vm_start > end) 604 if (vma->vm_start > end)
605 break; 605 break;
606 606
607 overlap_len = min(end, vma->vm_end) - vma->vm_start; 607 overlap_len = min(end, vma->vm_end) - vma->vm_start;
608 nr_pages += overlap_len >> PAGE_SHIFT; 608 nr_pages += overlap_len >> PAGE_SHIFT;
609 } 609 }
610 610
611 return nr_pages; 611 return nr_pages;
612 } 612 }
613 613
614 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 614 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
615 struct rb_node **rb_link, struct rb_node *rb_parent) 615 struct rb_node **rb_link, struct rb_node *rb_parent)
616 { 616 {
617 /* Update tracking information for the gap following the new vma. */ 617 /* Update tracking information for the gap following the new vma. */
618 if (vma->vm_next) 618 if (vma->vm_next)
619 vma_gap_update(vma->vm_next); 619 vma_gap_update(vma->vm_next);
620 else 620 else
621 mm->highest_vm_end = vma->vm_end; 621 mm->highest_vm_end = vma->vm_end;
622 622
623 /* 623 /*
624 * vma->vm_prev wasn't known when we followed the rbtree to find the 624 * vma->vm_prev wasn't known when we followed the rbtree to find the
625 * correct insertion point for that vma. As a result, we could not 625 * correct insertion point for that vma. As a result, we could not
626 * update the vma vm_rb parents rb_subtree_gap values on the way down. 626 * update the vma vm_rb parents rb_subtree_gap values on the way down.
627 * So, we first insert the vma with a zero rb_subtree_gap value 627 * So, we first insert the vma with a zero rb_subtree_gap value
628 * (to be consistent with what we did on the way down), and then 628 * (to be consistent with what we did on the way down), and then
629 * immediately update the gap to the correct value. Finally we 629 * immediately update the gap to the correct value. Finally we
630 * rebalance the rbtree after all augmented values have been set. 630 * rebalance the rbtree after all augmented values have been set.
631 */ 631 */
632 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 632 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
633 vma->rb_subtree_gap = 0; 633 vma->rb_subtree_gap = 0;
634 vma_gap_update(vma); 634 vma_gap_update(vma);
635 vma_rb_insert(vma, &mm->mm_rb); 635 vma_rb_insert(vma, &mm->mm_rb);
636 } 636 }
637 637
638 static void __vma_link_file(struct vm_area_struct *vma) 638 static void __vma_link_file(struct vm_area_struct *vma)
639 { 639 {
640 struct file *file; 640 struct file *file;
641 641
642 file = vma->vm_file; 642 file = vma->vm_file;
643 if (file) { 643 if (file) {
644 struct address_space *mapping = file->f_mapping; 644 struct address_space *mapping = file->f_mapping;
645 645
646 if (vma->vm_flags & VM_DENYWRITE) 646 if (vma->vm_flags & VM_DENYWRITE)
647 atomic_dec(&file_inode(file)->i_writecount); 647 atomic_dec(&file_inode(file)->i_writecount);
648 if (vma->vm_flags & VM_SHARED) 648 if (vma->vm_flags & VM_SHARED)
649 atomic_inc(&mapping->i_mmap_writable); 649 atomic_inc(&mapping->i_mmap_writable);
650 650
651 flush_dcache_mmap_lock(mapping); 651 flush_dcache_mmap_lock(mapping);
652 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 652 if (unlikely(vma->vm_flags & VM_NONLINEAR))
653 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 653 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
654 else 654 else
655 vma_interval_tree_insert(vma, &mapping->i_mmap); 655 vma_interval_tree_insert(vma, &mapping->i_mmap);
656 flush_dcache_mmap_unlock(mapping); 656 flush_dcache_mmap_unlock(mapping);
657 } 657 }
658 } 658 }
659 659
660 static void 660 static void
661 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 661 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
662 struct vm_area_struct *prev, struct rb_node **rb_link, 662 struct vm_area_struct *prev, struct rb_node **rb_link,
663 struct rb_node *rb_parent) 663 struct rb_node *rb_parent)
664 { 664 {
665 __vma_link_list(mm, vma, prev, rb_parent); 665 __vma_link_list(mm, vma, prev, rb_parent);
666 __vma_link_rb(mm, vma, rb_link, rb_parent); 666 __vma_link_rb(mm, vma, rb_link, rb_parent);
667 } 667 }
668 668
669 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 669 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
670 struct vm_area_struct *prev, struct rb_node **rb_link, 670 struct vm_area_struct *prev, struct rb_node **rb_link,
671 struct rb_node *rb_parent) 671 struct rb_node *rb_parent)
672 { 672 {
673 struct address_space *mapping = NULL; 673 struct address_space *mapping = NULL;
674 674
675 if (vma->vm_file) { 675 if (vma->vm_file) {
676 mapping = vma->vm_file->f_mapping; 676 mapping = vma->vm_file->f_mapping;
677 mutex_lock(&mapping->i_mmap_mutex); 677 mutex_lock(&mapping->i_mmap_mutex);
678 } 678 }
679 679
680 __vma_link(mm, vma, prev, rb_link, rb_parent); 680 __vma_link(mm, vma, prev, rb_link, rb_parent);
681 __vma_link_file(vma); 681 __vma_link_file(vma);
682 682
683 if (mapping) 683 if (mapping)
684 mutex_unlock(&mapping->i_mmap_mutex); 684 mutex_unlock(&mapping->i_mmap_mutex);
685 685
686 mm->map_count++; 686 mm->map_count++;
687 validate_mm(mm); 687 validate_mm(mm);
688 } 688 }
689 689
690 /* 690 /*
691 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 691 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
692 * mm's list and rbtree. It has already been inserted into the interval tree. 692 * mm's list and rbtree. It has already been inserted into the interval tree.
693 */ 693 */
694 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 694 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
695 { 695 {
696 struct vm_area_struct *prev; 696 struct vm_area_struct *prev;
697 struct rb_node **rb_link, *rb_parent; 697 struct rb_node **rb_link, *rb_parent;
698 698
699 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 699 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
700 &prev, &rb_link, &rb_parent)) 700 &prev, &rb_link, &rb_parent))
701 BUG(); 701 BUG();
702 __vma_link(mm, vma, prev, rb_link, rb_parent); 702 __vma_link(mm, vma, prev, rb_link, rb_parent);
703 mm->map_count++; 703 mm->map_count++;
704 } 704 }
705 705
706 static inline void 706 static inline void
707 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 707 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
708 struct vm_area_struct *prev) 708 struct vm_area_struct *prev)
709 { 709 {
710 struct vm_area_struct *next; 710 struct vm_area_struct *next;
711 711
712 vma_rb_erase(vma, &mm->mm_rb); 712 vma_rb_erase(vma, &mm->mm_rb);
713 prev->vm_next = next = vma->vm_next; 713 prev->vm_next = next = vma->vm_next;
714 if (next) 714 if (next)
715 next->vm_prev = prev; 715 next->vm_prev = prev;
716 716
717 /* Kill the cache */ 717 /* Kill the cache */
718 vmacache_invalidate(mm); 718 vmacache_invalidate(mm);
719 } 719 }
720 720
721 /* 721 /*
722 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that 722 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
723 * is already present in an i_mmap tree without adjusting the tree. 723 * is already present in an i_mmap tree without adjusting the tree.
724 * The following helper function should be used when such adjustments 724 * The following helper function should be used when such adjustments
725 * are necessary. The "insert" vma (if any) is to be inserted 725 * are necessary. The "insert" vma (if any) is to be inserted
726 * before we drop the necessary locks. 726 * before we drop the necessary locks.
727 */ 727 */
728 int vma_adjust(struct vm_area_struct *vma, unsigned long start, 728 int vma_adjust(struct vm_area_struct *vma, unsigned long start,
729 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 729 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
730 { 730 {
731 struct mm_struct *mm = vma->vm_mm; 731 struct mm_struct *mm = vma->vm_mm;
732 struct vm_area_struct *next = vma->vm_next; 732 struct vm_area_struct *next = vma->vm_next;
733 struct vm_area_struct *importer = NULL; 733 struct vm_area_struct *importer = NULL;
734 struct address_space *mapping = NULL; 734 struct address_space *mapping = NULL;
735 struct rb_root *root = NULL; 735 struct rb_root *root = NULL;
736 struct anon_vma *anon_vma = NULL; 736 struct anon_vma *anon_vma = NULL;
737 struct file *file = vma->vm_file; 737 struct file *file = vma->vm_file;
738 bool start_changed = false, end_changed = false; 738 bool start_changed = false, end_changed = false;
739 long adjust_next = 0; 739 long adjust_next = 0;
740 int remove_next = 0; 740 int remove_next = 0;
741 741
742 if (next && !insert) { 742 if (next && !insert) {
743 struct vm_area_struct *exporter = NULL; 743 struct vm_area_struct *exporter = NULL;
744 744
745 if (end >= next->vm_end) { 745 if (end >= next->vm_end) {
746 /* 746 /*
747 * vma expands, overlapping all the next, and 747 * vma expands, overlapping all the next, and
748 * perhaps the one after too (mprotect case 6). 748 * perhaps the one after too (mprotect case 6).
749 */ 749 */
750 again: remove_next = 1 + (end > next->vm_end); 750 again: remove_next = 1 + (end > next->vm_end);
751 end = next->vm_end; 751 end = next->vm_end;
752 exporter = next; 752 exporter = next;
753 importer = vma; 753 importer = vma;
754 } else if (end > next->vm_start) { 754 } else if (end > next->vm_start) {
755 /* 755 /*
756 * vma expands, overlapping part of the next: 756 * vma expands, overlapping part of the next:
757 * mprotect case 5 shifting the boundary up. 757 * mprotect case 5 shifting the boundary up.
758 */ 758 */
759 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 759 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
760 exporter = next; 760 exporter = next;
761 importer = vma; 761 importer = vma;
762 } else if (end < vma->vm_end) { 762 } else if (end < vma->vm_end) {
763 /* 763 /*
764 * vma shrinks, and !insert tells it's not 764 * vma shrinks, and !insert tells it's not
765 * split_vma inserting another: so it must be 765 * split_vma inserting another: so it must be
766 * mprotect case 4 shifting the boundary down. 766 * mprotect case 4 shifting the boundary down.
767 */ 767 */
768 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); 768 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
769 exporter = vma; 769 exporter = vma;
770 importer = next; 770 importer = next;
771 } 771 }
772 772
773 /* 773 /*
774 * Easily overlooked: when mprotect shifts the boundary, 774 * Easily overlooked: when mprotect shifts the boundary,
775 * make sure the expanding vma has anon_vma set if the 775 * make sure the expanding vma has anon_vma set if the
776 * shrinking vma had, to cover any anon pages imported. 776 * shrinking vma had, to cover any anon pages imported.
777 */ 777 */
778 if (exporter && exporter->anon_vma && !importer->anon_vma) { 778 if (exporter && exporter->anon_vma && !importer->anon_vma) {
-779 		if (anon_vma_clone(importer, exporter))
-780 			return -ENOMEM;
+779 		int error;
+780
+781 		error = anon_vma_clone(importer, exporter);
+782 		if (error)
+783 			return error;
781 importer->anon_vma = exporter->anon_vma; 784 importer->anon_vma = exporter->anon_vma;
782 } 785 }
783 } 786 }
784 787
785 if (file) { 788 if (file) {
786 mapping = file->f_mapping; 789 mapping = file->f_mapping;
787 if (!(vma->vm_flags & VM_NONLINEAR)) { 790 if (!(vma->vm_flags & VM_NONLINEAR)) {
788 root = &mapping->i_mmap; 791 root = &mapping->i_mmap;
789 uprobe_munmap(vma, vma->vm_start, vma->vm_end); 792 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
790 793
791 if (adjust_next) 794 if (adjust_next)
792 uprobe_munmap(next, next->vm_start, 795 uprobe_munmap(next, next->vm_start,
793 next->vm_end); 796 next->vm_end);
794 } 797 }
795 798
796 mutex_lock(&mapping->i_mmap_mutex); 799 mutex_lock(&mapping->i_mmap_mutex);
797 if (insert) { 800 if (insert) {
798 /* 801 /*
799 * Put into interval tree now, so instantiated pages 802 * Put into interval tree now, so instantiated pages
800 * are visible to arm/parisc __flush_dcache_page 803 * are visible to arm/parisc __flush_dcache_page
801 * throughout; but we cannot insert into address 804 * throughout; but we cannot insert into address
802 * space until vma start or end is updated. 805 * space until vma start or end is updated.
803 */ 806 */
804 __vma_link_file(insert); 807 __vma_link_file(insert);
805 } 808 }
806 } 809 }
807 810
808 vma_adjust_trans_huge(vma, start, end, adjust_next); 811 vma_adjust_trans_huge(vma, start, end, adjust_next);
809 812
810 anon_vma = vma->anon_vma; 813 anon_vma = vma->anon_vma;
811 if (!anon_vma && adjust_next) 814 if (!anon_vma && adjust_next)
812 anon_vma = next->anon_vma; 815 anon_vma = next->anon_vma;
813 if (anon_vma) { 816 if (anon_vma) {
814 VM_BUG_ON_VMA(adjust_next && next->anon_vma && 817 VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
815 anon_vma != next->anon_vma, next); 818 anon_vma != next->anon_vma, next);
816 anon_vma_lock_write(anon_vma); 819 anon_vma_lock_write(anon_vma);
817 anon_vma_interval_tree_pre_update_vma(vma); 820 anon_vma_interval_tree_pre_update_vma(vma);
818 if (adjust_next) 821 if (adjust_next)
819 anon_vma_interval_tree_pre_update_vma(next); 822 anon_vma_interval_tree_pre_update_vma(next);
820 } 823 }
821 824
822 if (root) { 825 if (root) {
823 flush_dcache_mmap_lock(mapping); 826 flush_dcache_mmap_lock(mapping);
824 vma_interval_tree_remove(vma, root); 827 vma_interval_tree_remove(vma, root);
825 if (adjust_next) 828 if (adjust_next)
826 vma_interval_tree_remove(next, root); 829 vma_interval_tree_remove(next, root);
827 } 830 }
828 831
829 if (start != vma->vm_start) { 832 if (start != vma->vm_start) {
830 vma->vm_start = start; 833 vma->vm_start = start;
831 start_changed = true; 834 start_changed = true;
832 } 835 }
833 if (end != vma->vm_end) { 836 if (end != vma->vm_end) {
834 vma->vm_end = end; 837 vma->vm_end = end;
835 end_changed = true; 838 end_changed = true;
836 } 839 }
837 vma->vm_pgoff = pgoff; 840 vma->vm_pgoff = pgoff;
838 if (adjust_next) { 841 if (adjust_next) {
839 next->vm_start += adjust_next << PAGE_SHIFT; 842 next->vm_start += adjust_next << PAGE_SHIFT;
840 next->vm_pgoff += adjust_next; 843 next->vm_pgoff += adjust_next;
841 } 844 }
842 845
843 if (root) { 846 if (root) {
844 if (adjust_next) 847 if (adjust_next)
845 vma_interval_tree_insert(next, root); 848 vma_interval_tree_insert(next, root);
846 vma_interval_tree_insert(vma, root); 849 vma_interval_tree_insert(vma, root);
847 flush_dcache_mmap_unlock(mapping); 850 flush_dcache_mmap_unlock(mapping);
848 } 851 }
849 852
850 if (remove_next) { 853 if (remove_next) {
851 /* 854 /*
852 * vma_merge has merged next into vma, and needs 855 * vma_merge has merged next into vma, and needs
853 * us to remove next before dropping the locks. 856 * us to remove next before dropping the locks.
854 */ 857 */
855 __vma_unlink(mm, next, vma); 858 __vma_unlink(mm, next, vma);
856 if (file) 859 if (file)
857 __remove_shared_vm_struct(next, file, mapping); 860 __remove_shared_vm_struct(next, file, mapping);
858 } else if (insert) { 861 } else if (insert) {
859 /* 862 /*
860 * split_vma has split insert from vma, and needs 863 * split_vma has split insert from vma, and needs
861 * us to insert it before dropping the locks 864 * us to insert it before dropping the locks
862 * (it may either follow vma or precede it). 865 * (it may either follow vma or precede it).
863 */ 866 */
864 __insert_vm_struct(mm, insert); 867 __insert_vm_struct(mm, insert);
865 } else { 868 } else {
866 if (start_changed) 869 if (start_changed)
867 vma_gap_update(vma); 870 vma_gap_update(vma);
868 if (end_changed) { 871 if (end_changed) {
869 if (!next) 872 if (!next)
870 mm->highest_vm_end = end; 873 mm->highest_vm_end = end;
871 else if (!adjust_next) 874 else if (!adjust_next)
872 vma_gap_update(next); 875 vma_gap_update(next);
873 } 876 }
874 } 877 }
875 878
876 if (anon_vma) { 879 if (anon_vma) {
877 anon_vma_interval_tree_post_update_vma(vma); 880 anon_vma_interval_tree_post_update_vma(vma);
878 if (adjust_next) 881 if (adjust_next)
879 anon_vma_interval_tree_post_update_vma(next); 882 anon_vma_interval_tree_post_update_vma(next);
880 anon_vma_unlock_write(anon_vma); 883 anon_vma_unlock_write(anon_vma);
881 } 884 }
882 if (mapping) 885 if (mapping)
883 mutex_unlock(&mapping->i_mmap_mutex); 886 mutex_unlock(&mapping->i_mmap_mutex);
884 887
885 if (root) { 888 if (root) {
886 uprobe_mmap(vma); 889 uprobe_mmap(vma);
887 890
888 if (adjust_next) 891 if (adjust_next)
889 uprobe_mmap(next); 892 uprobe_mmap(next);
890 } 893 }
891 894
892 if (remove_next) { 895 if (remove_next) {
893 if (file) { 896 if (file) {
894 uprobe_munmap(next, next->vm_start, next->vm_end); 897 uprobe_munmap(next, next->vm_start, next->vm_end);
895 fput(file); 898 fput(file);
896 } 899 }
897 if (next->anon_vma) 900 if (next->anon_vma)
898 anon_vma_merge(vma, next); 901 anon_vma_merge(vma, next);
899 mm->map_count--; 902 mm->map_count--;
900 mpol_put(vma_policy(next)); 903 mpol_put(vma_policy(next));
901 kmem_cache_free(vm_area_cachep, next); 904 kmem_cache_free(vm_area_cachep, next);
902 /* 905 /*
903 * In mprotect's case 6 (see comments on vma_merge), 906 * In mprotect's case 6 (see comments on vma_merge),
904 * we must remove another next too. It would clutter 907 * we must remove another next too. It would clutter
905 * up the code too much to do both in one go. 908 * up the code too much to do both in one go.
906 */ 909 */
907 next = vma->vm_next; 910 next = vma->vm_next;
908 if (remove_next == 2) 911 if (remove_next == 2)
909 goto again; 912 goto again;
910 else if (next) 913 else if (next)
911 vma_gap_update(next); 914 vma_gap_update(next);
912 else 915 else
913 mm->highest_vm_end = end; 916 mm->highest_vm_end = end;
914 } 917 }
915 if (insert && file) 918 if (insert && file)
916 uprobe_mmap(insert); 919 uprobe_mmap(insert);
917 920
918 validate_mm(mm); 921 validate_mm(mm);
919 922
920 return 0; 923 return 0;
921 } 924 }
922 925
923 /* 926 /*
924 * If the vma has a ->close operation then the driver probably needs to release 927 * If the vma has a ->close operation then the driver probably needs to release
925 * per-vma resources, so we don't attempt to merge those. 928 * per-vma resources, so we don't attempt to merge those.
926 */ 929 */
927 static inline int is_mergeable_vma(struct vm_area_struct *vma, 930 static inline int is_mergeable_vma(struct vm_area_struct *vma,
928 struct file *file, unsigned long vm_flags) 931 struct file *file, unsigned long vm_flags)
929 { 932 {
930 /* 933 /*
931 * VM_SOFTDIRTY should not prevent from VMA merging, if we 934 * VM_SOFTDIRTY should not prevent from VMA merging, if we
932 * match the flags but dirty bit -- the caller should mark 935 * match the flags but dirty bit -- the caller should mark
933 * merged VMA as dirty. If dirty bit won't be excluded from 936 * merged VMA as dirty. If dirty bit won't be excluded from
934 * comparison, we increase pressue on the memory system forcing 937 * comparison, we increase pressue on the memory system forcing
935 * the kernel to generate new VMAs when old one could be 938 * the kernel to generate new VMAs when old one could be
936 * extended instead. 939 * extended instead.
937 */ 940 */
938 if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) 941 if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
939 return 0; 942 return 0;
940 if (vma->vm_file != file) 943 if (vma->vm_file != file)
941 return 0; 944 return 0;
942 if (vma->vm_ops && vma->vm_ops->close) 945 if (vma->vm_ops && vma->vm_ops->close)
943 return 0; 946 return 0;
944 return 1; 947 return 1;
945 } 948 }
946 949
947 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 950 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
948 struct anon_vma *anon_vma2, 951 struct anon_vma *anon_vma2,
949 struct vm_area_struct *vma) 952 struct vm_area_struct *vma)
950 { 953 {
951 /* 954 /*
952 * The list_is_singular() test is to avoid merging VMAs cloned from 955 * The list_is_singular() test is to avoid merging VMAs cloned from
953 * parents. This can improve scalability by reducing anon_vma lock contention. 956 * parents. This can improve scalability by reducing anon_vma lock contention.
954 */ 957 */
955 if ((!anon_vma1 || !anon_vma2) && (!vma || 958 if ((!anon_vma1 || !anon_vma2) && (!vma ||
956 list_is_singular(&vma->anon_vma_chain))) 959 list_is_singular(&vma->anon_vma_chain)))
957 return 1; 960 return 1;
958 return anon_vma1 == anon_vma2; 961 return anon_vma1 == anon_vma2;
959 } 962 }
960 963
961 /* 964 /*
962 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 965 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
963 * in front of (at a lower virtual address and file offset than) the vma. 966 * in front of (at a lower virtual address and file offset than) the vma.
964 * 967 *
965 * We cannot merge two vmas if they have differently assigned (non-NULL) 968 * We cannot merge two vmas if they have differently assigned (non-NULL)
966 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 969 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
967 * 970 *
968 * We don't check here for the merged mmap wrapping around the end of pagecache 971 * We don't check here for the merged mmap wrapping around the end of pagecache
969 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which 972 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
970 * wrap, nor mmaps which cover the final page at index -1UL. 973 * wrap, nor mmaps which cover the final page at index -1UL.
971 */ 974 */
972 static int 975 static int
973 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 976 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
974 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 977 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
975 { 978 {
976 if (is_mergeable_vma(vma, file, vm_flags) && 979 if (is_mergeable_vma(vma, file, vm_flags) &&
977 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 980 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
978 if (vma->vm_pgoff == vm_pgoff) 981 if (vma->vm_pgoff == vm_pgoff)
979 return 1; 982 return 1;
980 } 983 }
981 return 0; 984 return 0;
982 } 985 }
983 986
984 /* 987 /*
985 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 988 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
986 * beyond (at a higher virtual address and file offset than) the vma. 989 * beyond (at a higher virtual address and file offset than) the vma.
987 * 990 *
988 * We cannot merge two vmas if they have differently assigned (non-NULL) 991 * We cannot merge two vmas if they have differently assigned (non-NULL)
989 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 992 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
990 */ 993 */
991 static int 994 static int
992 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 995 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
993 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 996 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
994 { 997 {
995 if (is_mergeable_vma(vma, file, vm_flags) && 998 if (is_mergeable_vma(vma, file, vm_flags) &&
996 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 999 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
997 pgoff_t vm_pglen; 1000 pgoff_t vm_pglen;
998 vm_pglen = vma_pages(vma); 1001 vm_pglen = vma_pages(vma);
999 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 1002 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
1000 return 1; 1003 return 1;
1001 } 1004 }
1002 return 0; 1005 return 0;
1003 } 1006 }
1004 1007
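The two checks above come down to simple page arithmetic: a request can merge behind an existing vma only if it is adjacent both in the address space and in the file. A minimal stand-alone sketch of that rule follows; struct range, extends_after() and the hard-coded 4K PAGE_SHIFT are made up for illustration and are not part of the patch.

#include <stdio.h>

#define PAGE_SHIFT 12                           /* assumed 4K pages for the example */

struct range { unsigned long start, end, pgoff; };

/* The adjacency rule vma_merge() and can_vma_merge_after() apply together:
 * the new mapping must start exactly at prev->end, at the file offset one
 * past prev's last page. */
static int extends_after(const struct range *prev,
                         unsigned long addr, unsigned long pgoff)
{
        unsigned long pglen = (prev->end - prev->start) >> PAGE_SHIFT;

        return prev->end == addr && prev->pgoff + pglen == pgoff;
}

int main(void)
{
        struct range prev = { 0x10000, 0x13000, 8 };    /* 3 pages at pgoff 8 */

        printf("%d\n", extends_after(&prev, 0x13000, 11));     /* 1: seamless */
        printf("%d\n", extends_after(&prev, 0x13000, 12));     /* 0: hole in the file */
        return 0;
}
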
1005 /* 1008 /*
1006 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out 1009 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
1007 * whether that can be merged with its predecessor or its successor. 1010 * whether that can be merged with its predecessor or its successor.
1008 * Or both (it neatly fills a hole). 1011 * Or both (it neatly fills a hole).
1009 * 1012 *
1010 * In most cases - when called for mmap, brk or mremap - [addr,end) is 1013 * In most cases - when called for mmap, brk or mremap - [addr,end) is
1011 * certain not to be mapped by the time vma_merge is called; but when 1014 * certain not to be mapped by the time vma_merge is called; but when
1012 * called for mprotect, it is certain to be already mapped (either at 1015 * called for mprotect, it is certain to be already mapped (either at
1013 * an offset within prev, or at the start of next), and the flags of 1016 * an offset within prev, or at the start of next), and the flags of
1014 * this area are about to be changed to vm_flags - and the no-change 1017 * this area are about to be changed to vm_flags - and the no-change
1015 * case has already been eliminated. 1018 * case has already been eliminated.
1016 * 1019 *
1017 * The following mprotect cases have to be considered, where AAAA is 1020 * The following mprotect cases have to be considered, where AAAA is
1018 * the area passed down from mprotect_fixup, never extending beyond one 1021 * the area passed down from mprotect_fixup, never extending beyond one
1019 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: 1022 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
1020 * 1023 *
1021 * AAAA AAAA AAAA AAAA 1024 * AAAA AAAA AAAA AAAA
1022 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX 1025 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
1023 * cannot merge might become might become might become 1026 * cannot merge might become might become might become
1024 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or 1027 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
1025 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or 1028 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
1026 * mremap move: PPPPNNNNNNNN 8 1029 * mremap move: PPPPNNNNNNNN 8
1027 * AAAA 1030 * AAAA
1028 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 1031 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
1029 * might become case 1 below case 2 below case 3 below 1032 * might become case 1 below case 2 below case 3 below
1030 * 1033 *
1031 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: 1034 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
1032 * mprotect_fixup updates vm_flags & vm_page_prot on successful return. 1035 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
1033 */ 1036 */
1034 struct vm_area_struct *vma_merge(struct mm_struct *mm, 1037 struct vm_area_struct *vma_merge(struct mm_struct *mm,
1035 struct vm_area_struct *prev, unsigned long addr, 1038 struct vm_area_struct *prev, unsigned long addr,
1036 unsigned long end, unsigned long vm_flags, 1039 unsigned long end, unsigned long vm_flags,
1037 struct anon_vma *anon_vma, struct file *file, 1040 struct anon_vma *anon_vma, struct file *file,
1038 pgoff_t pgoff, struct mempolicy *policy) 1041 pgoff_t pgoff, struct mempolicy *policy)
1039 { 1042 {
1040 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1043 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1041 struct vm_area_struct *area, *next; 1044 struct vm_area_struct *area, *next;
1042 int err; 1045 int err;
1043 1046
1044 /* 1047 /*
1045 * We later require that vma->vm_flags == vm_flags, 1048 * We later require that vma->vm_flags == vm_flags,
1046 * so this tests vma->vm_flags & VM_SPECIAL, too. 1049 * so this tests vma->vm_flags & VM_SPECIAL, too.
1047 */ 1050 */
1048 if (vm_flags & VM_SPECIAL) 1051 if (vm_flags & VM_SPECIAL)
1049 return NULL; 1052 return NULL;
1050 1053
1051 if (prev) 1054 if (prev)
1052 next = prev->vm_next; 1055 next = prev->vm_next;
1053 else 1056 else
1054 next = mm->mmap; 1057 next = mm->mmap;
1055 area = next; 1058 area = next;
1056 if (next && next->vm_end == end) /* cases 6, 7, 8 */ 1059 if (next && next->vm_end == end) /* cases 6, 7, 8 */
1057 next = next->vm_next; 1060 next = next->vm_next;
1058 1061
1059 /* 1062 /*
1060 * Can it merge with the predecessor? 1063 * Can it merge with the predecessor?
1061 */ 1064 */
1062 if (prev && prev->vm_end == addr && 1065 if (prev && prev->vm_end == addr &&
1063 mpol_equal(vma_policy(prev), policy) && 1066 mpol_equal(vma_policy(prev), policy) &&
1064 can_vma_merge_after(prev, vm_flags, 1067 can_vma_merge_after(prev, vm_flags,
1065 anon_vma, file, pgoff)) { 1068 anon_vma, file, pgoff)) {
1066 /* 1069 /*
1067 * OK, it can. Can we now merge in the successor as well? 1070 * OK, it can. Can we now merge in the successor as well?
1068 */ 1071 */
1069 if (next && end == next->vm_start && 1072 if (next && end == next->vm_start &&
1070 mpol_equal(policy, vma_policy(next)) && 1073 mpol_equal(policy, vma_policy(next)) &&
1071 can_vma_merge_before(next, vm_flags, 1074 can_vma_merge_before(next, vm_flags,
1072 anon_vma, file, pgoff+pglen) && 1075 anon_vma, file, pgoff+pglen) &&
1073 is_mergeable_anon_vma(prev->anon_vma, 1076 is_mergeable_anon_vma(prev->anon_vma,
1074 next->anon_vma, NULL)) { 1077 next->anon_vma, NULL)) {
1075 /* cases 1, 6 */ 1078 /* cases 1, 6 */
1076 err = vma_adjust(prev, prev->vm_start, 1079 err = vma_adjust(prev, prev->vm_start,
1077 next->vm_end, prev->vm_pgoff, NULL); 1080 next->vm_end, prev->vm_pgoff, NULL);
1078 } else /* cases 2, 5, 7 */ 1081 } else /* cases 2, 5, 7 */
1079 err = vma_adjust(prev, prev->vm_start, 1082 err = vma_adjust(prev, prev->vm_start,
1080 end, prev->vm_pgoff, NULL); 1083 end, prev->vm_pgoff, NULL);
1081 if (err) 1084 if (err)
1082 return NULL; 1085 return NULL;
1083 khugepaged_enter_vma_merge(prev, vm_flags); 1086 khugepaged_enter_vma_merge(prev, vm_flags);
1084 return prev; 1087 return prev;
1085 } 1088 }
1086 1089
1087 /* 1090 /*
1088 * Can this new request be merged in front of next? 1091 * Can this new request be merged in front of next?
1089 */ 1092 */
1090 if (next && end == next->vm_start && 1093 if (next && end == next->vm_start &&
1091 mpol_equal(policy, vma_policy(next)) && 1094 mpol_equal(policy, vma_policy(next)) &&
1092 can_vma_merge_before(next, vm_flags, 1095 can_vma_merge_before(next, vm_flags,
1093 anon_vma, file, pgoff+pglen)) { 1096 anon_vma, file, pgoff+pglen)) {
1094 if (prev && addr < prev->vm_end) /* case 4 */ 1097 if (prev && addr < prev->vm_end) /* case 4 */
1095 err = vma_adjust(prev, prev->vm_start, 1098 err = vma_adjust(prev, prev->vm_start,
1096 addr, prev->vm_pgoff, NULL); 1099 addr, prev->vm_pgoff, NULL);
1097 else /* cases 3, 8 */ 1100 else /* cases 3, 8 */
1098 err = vma_adjust(area, addr, next->vm_end, 1101 err = vma_adjust(area, addr, next->vm_end,
1099 next->vm_pgoff - pglen, NULL); 1102 next->vm_pgoff - pglen, NULL);
1100 if (err) 1103 if (err)
1101 return NULL; 1104 return NULL;
1102 khugepaged_enter_vma_merge(area, vm_flags); 1105 khugepaged_enter_vma_merge(area, vm_flags);
1103 return area; 1106 return area;
1104 } 1107 }
1105 1108
1106 return NULL; 1109 return NULL;
1107 } 1110 }
1108 1111
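For a concrete feel of the cases sketched above, here is a small user-space illustration (not part of the patch) that watches merging through /proc/self/maps: mprotect() splits one anonymous mapping into three vmas, and restoring the protection lets vma_merge() fold them back into one. Addresses and any neighbouring vmas will of course vary per run.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void show_maps(const char *tag, void *base, size_t len)
{
        char line[256];
        FILE *f = fopen("/proc/self/maps", "r");

        printf("--- %s ---\n", tag);
        while (f && fgets(line, sizeof(line), f)) {
                unsigned long start, end;

                /* Print only the lines overlapping our test range. */
                if (sscanf(line, "%lx-%lx", &start, &end) == 2 &&
                    end > (unsigned long)base &&
                    start < (unsigned long)base + len)
                        fputs(line, stdout);
        }
        if (f)
                fclose(f);
}

int main(void)
{
        size_t page = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * page, PROT_READ,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        show_maps("one vma", p, 4 * page);
        mprotect(p + page, page, PROT_READ | PROT_WRITE);
        show_maps("split into three", p, 4 * page);
        mprotect(p + page, page, PROT_READ);
        show_maps("merged back into one", p, 4 * page);
        return 0;
}
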
1109 /* 1112 /*
1110 * Rough compatibility check to quickly see if it's even worth looking 1113 * Rough compatibility check to quickly see if it's even worth looking
1111 * at sharing an anon_vma. 1114 * at sharing an anon_vma.
1112 * 1115 *
1113 * They need to have the same vm_file, and the flags can only differ 1116 * They need to have the same vm_file, and the flags can only differ
1114 * in things that mprotect may change. 1117 * in things that mprotect may change.
1115 * 1118 *
1116 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1119 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1117 * we can merge the two vma's. For example, we refuse to merge a vma if 1120 * we can merge the two vma's. For example, we refuse to merge a vma if
1118 * there is a vm_ops->close() function, because that indicates that the 1121 * there is a vm_ops->close() function, because that indicates that the
1119 * driver is doing some kind of reference counting. But that doesn't 1122 * driver is doing some kind of reference counting. But that doesn't
1120 * really matter for the anon_vma sharing case. 1123 * really matter for the anon_vma sharing case.
1121 */ 1124 */
1122 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1125 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1123 { 1126 {
1124 return a->vm_end == b->vm_start && 1127 return a->vm_end == b->vm_start &&
1125 mpol_equal(vma_policy(a), vma_policy(b)) && 1128 mpol_equal(vma_policy(a), vma_policy(b)) &&
1126 a->vm_file == b->vm_file && 1129 a->vm_file == b->vm_file &&
1127 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && 1130 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
1128 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1131 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1129 } 1132 }
1130 1133
1131 /* 1134 /*
1132 * Do some basic sanity checking to see if we can re-use the anon_vma 1135 * Do some basic sanity checking to see if we can re-use the anon_vma
1133 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1136 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1134 * the same as 'old', the other will be the new one that is trying 1137 * the same as 'old', the other will be the new one that is trying
1135 * to share the anon_vma. 1138 * to share the anon_vma.
1136 * 1139 *
1137 * NOTE! This runs with mm_sem held for reading, so it is possible that 1140 * NOTE! This runs with mm_sem held for reading, so it is possible that
1138 * the anon_vma of 'old' is concurrently in the process of being set up 1141 * the anon_vma of 'old' is concurrently in the process of being set up
1139 * by another page fault trying to merge _that_. But that's ok: if it 1142 * by another page fault trying to merge _that_. But that's ok: if it
1140 * is being set up, that automatically means that it will be a singleton 1143 * is being set up, that automatically means that it will be a singleton
1141 * acceptable for merging, so we can do all of this optimistically. But 1144 * acceptable for merging, so we can do all of this optimistically. But
1142 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1145 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
1143 * 1146 *
1144 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1147 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1145 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1148 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1146 * is to return an anon_vma that is "complex" due to having gone through 1149 * is to return an anon_vma that is "complex" due to having gone through
1147 * a fork). 1150 * a fork).
1148 * 1151 *
1149 * We also make sure that the two vma's are compatible (adjacent, 1152 * We also make sure that the two vma's are compatible (adjacent,
1150 * and with the same memory policies). That's all stable, even with just 1153 * and with the same memory policies). That's all stable, even with just
1151 * a read lock on the mm_sem. 1154 * a read lock on the mm_sem.
1152 */ 1155 */
1153 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1156 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1154 { 1157 {
1155 if (anon_vma_compatible(a, b)) { 1158 if (anon_vma_compatible(a, b)) {
1156 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1159 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
1157 1160
1158 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1161 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1159 return anon_vma; 1162 return anon_vma;
1160 } 1163 }
1161 return NULL; 1164 return NULL;
1162 } 1165 }
1163 1166
1164 /* 1167 /*
1165 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1168 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1166 * neighbouring vmas for a suitable anon_vma, before it goes off 1169 * neighbouring vmas for a suitable anon_vma, before it goes off
1167 * to allocate a new anon_vma. It checks because a repetitive 1170 * to allocate a new anon_vma. It checks because a repetitive
1168 * sequence of mprotects and faults may otherwise lead to distinct 1171 * sequence of mprotects and faults may otherwise lead to distinct
1169 * anon_vmas being allocated, preventing vma merge in subsequent 1172 * anon_vmas being allocated, preventing vma merge in subsequent
1170 * mprotect. 1173 * mprotect.
1171 */ 1174 */
1172 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1175 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1173 { 1176 {
1174 struct anon_vma *anon_vma; 1177 struct anon_vma *anon_vma;
1175 struct vm_area_struct *near; 1178 struct vm_area_struct *near;
1176 1179
1177 near = vma->vm_next; 1180 near = vma->vm_next;
1178 if (!near) 1181 if (!near)
1179 goto try_prev; 1182 goto try_prev;
1180 1183
1181 anon_vma = reusable_anon_vma(near, vma, near); 1184 anon_vma = reusable_anon_vma(near, vma, near);
1182 if (anon_vma) 1185 if (anon_vma)
1183 return anon_vma; 1186 return anon_vma;
1184 try_prev: 1187 try_prev:
1185 near = vma->vm_prev; 1188 near = vma->vm_prev;
1186 if (!near) 1189 if (!near)
1187 goto none; 1190 goto none;
1188 1191
1189 anon_vma = reusable_anon_vma(near, near, vma); 1192 anon_vma = reusable_anon_vma(near, near, vma);
1190 if (anon_vma) 1193 if (anon_vma)
1191 return anon_vma; 1194 return anon_vma;
1192 none: 1195 none:
1193 /* 1196 /*
1194 * There's no absolute need to look only at touching neighbours: 1197 * There's no absolute need to look only at touching neighbours:
1195 * we could search further afield for "compatible" anon_vmas. 1198 * we could search further afield for "compatible" anon_vmas.
1196 * But it would probably just be a waste of time searching, 1199 * But it would probably just be a waste of time searching,
1197 * or lead to too many vmas hanging off the same anon_vma. 1200 * or lead to too many vmas hanging off the same anon_vma.
1198 * We're trying to allow mprotect remerging later on, 1201 * We're trying to allow mprotect remerging later on,
1199 * not trying to minimize memory used for anon_vmas. 1202 * not trying to minimize memory used for anon_vmas.
1200 */ 1203 */
1201 return NULL; 1204 return NULL;
1202 } 1205 }
1203 1206
1204 #ifdef CONFIG_PROC_FS 1207 #ifdef CONFIG_PROC_FS
1205 void vm_stat_account(struct mm_struct *mm, unsigned long flags, 1208 void vm_stat_account(struct mm_struct *mm, unsigned long flags,
1206 struct file *file, long pages) 1209 struct file *file, long pages)
1207 { 1210 {
1208 const unsigned long stack_flags 1211 const unsigned long stack_flags
1209 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 1212 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
1210 1213
1211 mm->total_vm += pages; 1214 mm->total_vm += pages;
1212 1215
1213 if (file) { 1216 if (file) {
1214 mm->shared_vm += pages; 1217 mm->shared_vm += pages;
1215 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 1218 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
1216 mm->exec_vm += pages; 1219 mm->exec_vm += pages;
1217 } else if (flags & stack_flags) 1220 } else if (flags & stack_flags)
1218 mm->stack_vm += pages; 1221 mm->stack_vm += pages;
1219 } 1222 }
1220 #endif /* CONFIG_PROC_FS */ 1223 #endif /* CONFIG_PROC_FS */
1221 1224
1222 /* 1225 /*
1223 * If a hint addr is less than mmap_min_addr change hint to be as 1226 * If a hint addr is less than mmap_min_addr change hint to be as
1224 * low as possible but still greater than mmap_min_addr 1227 * low as possible but still greater than mmap_min_addr
1225 */ 1228 */
1226 static inline unsigned long round_hint_to_min(unsigned long hint) 1229 static inline unsigned long round_hint_to_min(unsigned long hint)
1227 { 1230 {
1228 hint &= PAGE_MASK; 1231 hint &= PAGE_MASK;
1229 if (((void *)hint != NULL) && 1232 if (((void *)hint != NULL) &&
1230 (hint < mmap_min_addr)) 1233 (hint < mmap_min_addr))
1231 return PAGE_ALIGN(mmap_min_addr); 1234 return PAGE_ALIGN(mmap_min_addr);
1232 return hint; 1235 return hint;
1233 } 1236 }
1234 1237
1235 static inline int mlock_future_check(struct mm_struct *mm, 1238 static inline int mlock_future_check(struct mm_struct *mm,
1236 unsigned long flags, 1239 unsigned long flags,
1237 unsigned long len) 1240 unsigned long len)
1238 { 1241 {
1239 unsigned long locked, lock_limit; 1242 unsigned long locked, lock_limit;
1240 1243
1241 /* mlock MCL_FUTURE? */ 1244 /* mlock MCL_FUTURE? */
1242 if (flags & VM_LOCKED) { 1245 if (flags & VM_LOCKED) {
1243 locked = len >> PAGE_SHIFT; 1246 locked = len >> PAGE_SHIFT;
1244 locked += mm->locked_vm; 1247 locked += mm->locked_vm;
1245 lock_limit = rlimit(RLIMIT_MEMLOCK); 1248 lock_limit = rlimit(RLIMIT_MEMLOCK);
1246 lock_limit >>= PAGE_SHIFT; 1249 lock_limit >>= PAGE_SHIFT;
1247 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1250 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1248 return -EAGAIN; 1251 return -EAGAIN;
1249 } 1252 }
1250 return 0; 1253 return 0;
1251 } 1254 }
1252 1255
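mlock_future_check() is the usual RLIMIT_MEMLOCK bookkeeping done in pages. The user-space sketch below is only an illustration (not part of the patch) and ignores the mm->locked_vm pages the process has already charged; the 64 MB figure is an arbitrary hypothetical MAP_LOCKED request.

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
        struct rlimit rl;
        long page = sysconf(_SC_PAGESIZE);
        size_t request = 64UL * 1024 * 1024;    /* hypothetical MAP_LOCKED request */

        if (getrlimit(RLIMIT_MEMLOCK, &rl))
                return 1;

        /* The kernel compares page counts, not bytes: both sides are
         * shifted right by PAGE_SHIFT before the comparison. */
        unsigned long lock_limit = rl.rlim_cur / page;
        unsigned long locked = request / page;

        printf("a %zu byte locked mapping would %s without CAP_IPC_LOCK\n",
               request, locked > lock_limit ? "fail with -EAGAIN" : "fit");
        return 0;
}
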
1253 /* 1256 /*
1254 * The caller must hold down_write(&current->mm->mmap_sem). 1257 * The caller must hold down_write(&current->mm->mmap_sem).
1255 */ 1258 */
1256 1259
1257 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1260 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1258 unsigned long len, unsigned long prot, 1261 unsigned long len, unsigned long prot,
1259 unsigned long flags, unsigned long pgoff, 1262 unsigned long flags, unsigned long pgoff,
1260 unsigned long *populate) 1263 unsigned long *populate)
1261 { 1264 {
1262 struct mm_struct *mm = current->mm; 1265 struct mm_struct *mm = current->mm;
1263 vm_flags_t vm_flags; 1266 vm_flags_t vm_flags;
1264 1267
1265 *populate = 0; 1268 *populate = 0;
1266 1269
1267 /* 1270 /*
1268 * Does the application expect PROT_READ to imply PROT_EXEC? 1271 * Does the application expect PROT_READ to imply PROT_EXEC?
1269 * 1272 *
1270 * (the exception is when the underlying filesystem is noexec 1273 * (the exception is when the underlying filesystem is noexec
1271 * mounted, in which case we don't add PROT_EXEC.) 1274 * mounted, in which case we don't add PROT_EXEC.)
1272 */ 1275 */
1273 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) 1276 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1274 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) 1277 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1275 prot |= PROT_EXEC; 1278 prot |= PROT_EXEC;
1276 1279
1277 if (!len) 1280 if (!len)
1278 return -EINVAL; 1281 return -EINVAL;
1279 1282
1280 if (!(flags & MAP_FIXED)) 1283 if (!(flags & MAP_FIXED))
1281 addr = round_hint_to_min(addr); 1284 addr = round_hint_to_min(addr);
1282 1285
1283 /* Careful about overflows.. */ 1286 /* Careful about overflows.. */
1284 len = PAGE_ALIGN(len); 1287 len = PAGE_ALIGN(len);
1285 if (!len) 1288 if (!len)
1286 return -ENOMEM; 1289 return -ENOMEM;
1287 1290
1288 /* offset overflow? */ 1291 /* offset overflow? */
1289 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1292 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1290 return -EOVERFLOW; 1293 return -EOVERFLOW;
1291 1294
1292 /* Too many mappings? */ 1295 /* Too many mappings? */
1293 if (mm->map_count > sysctl_max_map_count) 1296 if (mm->map_count > sysctl_max_map_count)
1294 return -ENOMEM; 1297 return -ENOMEM;
1295 1298
1296 /* Obtain the address to map to. we verify (or select) it and ensure 1299 /* Obtain the address to map to. we verify (or select) it and ensure
1297 * that it represents a valid section of the address space. 1300 * that it represents a valid section of the address space.
1298 */ 1301 */
1299 addr = get_unmapped_area(file, addr, len, pgoff, flags); 1302 addr = get_unmapped_area(file, addr, len, pgoff, flags);
1300 if (addr & ~PAGE_MASK) 1303 if (addr & ~PAGE_MASK)
1301 return addr; 1304 return addr;
1302 1305
1303 /* Do simple checking here so the lower-level routines won't have 1306 /* Do simple checking here so the lower-level routines won't have
1304 * to. we assume access permissions have been handled by the open 1307 * to. we assume access permissions have been handled by the open
1305 * of the memory object, so we don't do any here. 1308 * of the memory object, so we don't do any here.
1306 */ 1309 */
1307 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1310 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1308 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1311 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1309 1312
1310 if (flags & MAP_LOCKED) 1313 if (flags & MAP_LOCKED)
1311 if (!can_do_mlock()) 1314 if (!can_do_mlock())
1312 return -EPERM; 1315 return -EPERM;
1313 1316
1314 if (mlock_future_check(mm, vm_flags, len)) 1317 if (mlock_future_check(mm, vm_flags, len))
1315 return -EAGAIN; 1318 return -EAGAIN;
1316 1319
1317 if (file) { 1320 if (file) {
1318 struct inode *inode = file_inode(file); 1321 struct inode *inode = file_inode(file);
1319 1322
1320 switch (flags & MAP_TYPE) { 1323 switch (flags & MAP_TYPE) {
1321 case MAP_SHARED: 1324 case MAP_SHARED:
1322 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1325 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
1323 return -EACCES; 1326 return -EACCES;
1324 1327
1325 /* 1328 /*
1326 * Make sure we don't allow writing to an append-only 1329 * Make sure we don't allow writing to an append-only
1327 * file.. 1330 * file..
1328 */ 1331 */
1329 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) 1332 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1330 return -EACCES; 1333 return -EACCES;
1331 1334
1332 /* 1335 /*
1333 * Make sure there are no mandatory locks on the file. 1336 * Make sure there are no mandatory locks on the file.
1334 */ 1337 */
1335 if (locks_verify_locked(file)) 1338 if (locks_verify_locked(file))
1336 return -EAGAIN; 1339 return -EAGAIN;
1337 1340
1338 vm_flags |= VM_SHARED | VM_MAYSHARE; 1341 vm_flags |= VM_SHARED | VM_MAYSHARE;
1339 if (!(file->f_mode & FMODE_WRITE)) 1342 if (!(file->f_mode & FMODE_WRITE))
1340 vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 1343 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1341 1344
1342 /* fall through */ 1345 /* fall through */
1343 case MAP_PRIVATE: 1346 case MAP_PRIVATE:
1344 if (!(file->f_mode & FMODE_READ)) 1347 if (!(file->f_mode & FMODE_READ))
1345 return -EACCES; 1348 return -EACCES;
1346 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 1349 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1347 if (vm_flags & VM_EXEC) 1350 if (vm_flags & VM_EXEC)
1348 return -EPERM; 1351 return -EPERM;
1349 vm_flags &= ~VM_MAYEXEC; 1352 vm_flags &= ~VM_MAYEXEC;
1350 } 1353 }
1351 1354
1352 if (!file->f_op->mmap) 1355 if (!file->f_op->mmap)
1353 return -ENODEV; 1356 return -ENODEV;
1354 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1357 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1355 return -EINVAL; 1358 return -EINVAL;
1356 break; 1359 break;
1357 1360
1358 default: 1361 default:
1359 return -EINVAL; 1362 return -EINVAL;
1360 } 1363 }
1361 } else { 1364 } else {
1362 switch (flags & MAP_TYPE) { 1365 switch (flags & MAP_TYPE) {
1363 case MAP_SHARED: 1366 case MAP_SHARED:
1364 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1367 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1365 return -EINVAL; 1368 return -EINVAL;
1366 /* 1369 /*
1367 * Ignore pgoff. 1370 * Ignore pgoff.
1368 */ 1371 */
1369 pgoff = 0; 1372 pgoff = 0;
1370 vm_flags |= VM_SHARED | VM_MAYSHARE; 1373 vm_flags |= VM_SHARED | VM_MAYSHARE;
1371 break; 1374 break;
1372 case MAP_PRIVATE: 1375 case MAP_PRIVATE:
1373 /* 1376 /*
1374 * Set pgoff according to addr for anon_vma. 1377 * Set pgoff according to addr for anon_vma.
1375 */ 1378 */
1376 pgoff = addr >> PAGE_SHIFT; 1379 pgoff = addr >> PAGE_SHIFT;
1377 break; 1380 break;
1378 default: 1381 default:
1379 return -EINVAL; 1382 return -EINVAL;
1380 } 1383 }
1381 } 1384 }
1382 1385
1383 /* 1386 /*
1384 * Set 'VM_NORESERVE' if we should not account for the 1387 * Set 'VM_NORESERVE' if we should not account for the
1385 * memory use of this mapping. 1388 * memory use of this mapping.
1386 */ 1389 */
1387 if (flags & MAP_NORESERVE) { 1390 if (flags & MAP_NORESERVE) {
1388 /* We honor MAP_NORESERVE if allowed to overcommit */ 1391 /* We honor MAP_NORESERVE if allowed to overcommit */
1389 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) 1392 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1390 vm_flags |= VM_NORESERVE; 1393 vm_flags |= VM_NORESERVE;
1391 1394
1392 /* hugetlb applies strict overcommit unless MAP_NORESERVE */ 1395 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1393 if (file && is_file_hugepages(file)) 1396 if (file && is_file_hugepages(file))
1394 vm_flags |= VM_NORESERVE; 1397 vm_flags |= VM_NORESERVE;
1395 } 1398 }
1396 1399
1397 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1400 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1398 if (!IS_ERR_VALUE(addr) && 1401 if (!IS_ERR_VALUE(addr) &&
1399 ((vm_flags & VM_LOCKED) || 1402 ((vm_flags & VM_LOCKED) ||
1400 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) 1403 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1401 *populate = len; 1404 *populate = len;
1402 return addr; 1405 return addr;
1403 } 1406 }
1404 1407
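The *populate out-parameter set at the end of do_mmap_pgoff() is what turns MAP_POPULATE (when MAP_NONBLOCK is not also given) and MAP_LOCKED into an up-front fault-in of the whole range. A rough user-space illustration, assuming Linux/glibc and not part of the patch: touching a MAP_POPULATE'd region afterwards should add close to zero minor faults.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
        struct rusage before, after;
        size_t len = 8UL * 1024 * 1024;
        size_t i;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        getrusage(RUSAGE_SELF, &before);
        for (i = 0; i < len; i += 4096)
                p[i] = 1;               /* range was pre-faulted: expect few or no faults */
        getrusage(RUSAGE_SELF, &after);

        printf("minor faults while touching: %ld\n",
               after.ru_minflt - before.ru_minflt);
        return 0;
}
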
1405 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1408 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1406 unsigned long, prot, unsigned long, flags, 1409 unsigned long, prot, unsigned long, flags,
1407 unsigned long, fd, unsigned long, pgoff) 1410 unsigned long, fd, unsigned long, pgoff)
1408 { 1411 {
1409 struct file *file = NULL; 1412 struct file *file = NULL;
1410 unsigned long retval = -EBADF; 1413 unsigned long retval = -EBADF;
1411 1414
1412 if (!(flags & MAP_ANONYMOUS)) { 1415 if (!(flags & MAP_ANONYMOUS)) {
1413 audit_mmap_fd(fd, flags); 1416 audit_mmap_fd(fd, flags);
1414 file = fget(fd); 1417 file = fget(fd);
1415 if (!file) 1418 if (!file)
1416 goto out; 1419 goto out;
1417 if (is_file_hugepages(file)) 1420 if (is_file_hugepages(file))
1418 len = ALIGN(len, huge_page_size(hstate_file(file))); 1421 len = ALIGN(len, huge_page_size(hstate_file(file)));
1419 retval = -EINVAL; 1422 retval = -EINVAL;
1420 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) 1423 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1421 goto out_fput; 1424 goto out_fput;
1422 } else if (flags & MAP_HUGETLB) { 1425 } else if (flags & MAP_HUGETLB) {
1423 struct user_struct *user = NULL; 1426 struct user_struct *user = NULL;
1424 struct hstate *hs; 1427 struct hstate *hs;
1425 1428
1426 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); 1429 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
1427 if (!hs) 1430 if (!hs)
1428 return -EINVAL; 1431 return -EINVAL;
1429 1432
1430 len = ALIGN(len, huge_page_size(hs)); 1433 len = ALIGN(len, huge_page_size(hs));
1431 /* 1434 /*
1432 * VM_NORESERVE is used because the reservations will be 1435 * VM_NORESERVE is used because the reservations will be
1433 * taken when vm_ops->mmap() is called 1436 * taken when vm_ops->mmap() is called
1434 * A dummy user value is used because we are not locking 1437 * A dummy user value is used because we are not locking
1435 * memory so no accounting is necessary 1438 * memory so no accounting is necessary
1436 */ 1439 */
1437 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, 1440 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1438 VM_NORESERVE, 1441 VM_NORESERVE,
1439 &user, HUGETLB_ANONHUGE_INODE, 1442 &user, HUGETLB_ANONHUGE_INODE,
1440 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1443 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1441 if (IS_ERR(file)) 1444 if (IS_ERR(file))
1442 return PTR_ERR(file); 1445 return PTR_ERR(file);
1443 } 1446 }
1444 1447
1445 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1448 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1446 1449
1447 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1450 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1448 out_fput: 1451 out_fput:
1449 if (file) 1452 if (file)
1450 fput(file); 1453 fput(file);
1451 out: 1454 out:
1452 return retval; 1455 return retval;
1453 } 1456 }
1454 1457
1455 #ifdef __ARCH_WANT_SYS_OLD_MMAP 1458 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1456 struct mmap_arg_struct { 1459 struct mmap_arg_struct {
1457 unsigned long addr; 1460 unsigned long addr;
1458 unsigned long len; 1461 unsigned long len;
1459 unsigned long prot; 1462 unsigned long prot;
1460 unsigned long flags; 1463 unsigned long flags;
1461 unsigned long fd; 1464 unsigned long fd;
1462 unsigned long offset; 1465 unsigned long offset;
1463 }; 1466 };
1464 1467
1465 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 1468 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1466 { 1469 {
1467 struct mmap_arg_struct a; 1470 struct mmap_arg_struct a;
1468 1471
1469 if (copy_from_user(&a, arg, sizeof(a))) 1472 if (copy_from_user(&a, arg, sizeof(a)))
1470 return -EFAULT; 1473 return -EFAULT;
1471 if (a.offset & ~PAGE_MASK) 1474 if (a.offset & ~PAGE_MASK)
1472 return -EINVAL; 1475 return -EINVAL;
1473 1476
1474 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1477 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1475 a.offset >> PAGE_SHIFT); 1478 a.offset >> PAGE_SHIFT);
1476 } 1479 }
1477 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1480 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1478 1481
1479 /* 1482 /*
1480 * Some shared mappings will want the pages marked read-only 1483 * Some shared mappings will want the pages marked read-only
1481 * to track write events. If so, we'll downgrade vm_page_prot 1484 * to track write events. If so, we'll downgrade vm_page_prot
1482 * to the private version (using protection_map[] without the 1485 * to the private version (using protection_map[] without the
1483 * VM_SHARED bit). 1486 * VM_SHARED bit).
1484 */ 1487 */
1485 int vma_wants_writenotify(struct vm_area_struct *vma) 1488 int vma_wants_writenotify(struct vm_area_struct *vma)
1486 { 1489 {
1487 vm_flags_t vm_flags = vma->vm_flags; 1490 vm_flags_t vm_flags = vma->vm_flags;
1488 1491
1489 /* If it was private or non-writable, the write bit is already clear */ 1492 /* If it was private or non-writable, the write bit is already clear */
1490 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1493 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1491 return 0; 1494 return 0;
1492 1495
1493 /* The backer wishes to know when pages are first written to? */ 1496 /* The backer wishes to know when pages are first written to? */
1494 if (vma->vm_ops && vma->vm_ops->page_mkwrite) 1497 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1495 return 1; 1498 return 1;
1496 1499
1497 /* The open routine did something to the protections that pgprot_modify 1500 /* The open routine did something to the protections that pgprot_modify
1498 * won't preserve? */ 1501 * won't preserve? */
1499 if (pgprot_val(vma->vm_page_prot) != 1502 if (pgprot_val(vma->vm_page_prot) !=
1500 pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) 1503 pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
1501 return 0; 1504 return 0;
1502 1505
1503 /* Do we need to track softdirty? */ 1506 /* Do we need to track softdirty? */
1504 if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) 1507 if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
1505 return 1; 1508 return 1;
1506 1509
1507 /* Specialty mapping? */ 1510 /* Specialty mapping? */
1508 if (vm_flags & VM_PFNMAP) 1511 if (vm_flags & VM_PFNMAP)
1509 return 0; 1512 return 0;
1510 1513
1511 /* Can the mapping track the dirty pages? */ 1514 /* Can the mapping track the dirty pages? */
1512 return vma->vm_file && vma->vm_file->f_mapping && 1515 return vma->vm_file && vma->vm_file->f_mapping &&
1513 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1516 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1514 } 1517 }
1515 1518
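vma_wants_writenotify() and the VM_SOFTDIRTY handling in this file exist so that first writes can be caught for the soft-dirty page tracker. Below is a user-space sketch of the interface being served, not part of the patch and assuming CONFIG_MEM_SOFT_DIRTY plus sufficient permission to read the pagemap flag bits: clear the bits through /proc/PID/clear_refs, write to a page, and bit 55 of the pagemap entry comes back set.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static int soft_dirty(void *addr)
{
        uint64_t entry = 0;
        long page = sysconf(_SC_PAGESIZE);
        off_t off = ((uintptr_t)addr / page) * sizeof(entry);
        int fd = open("/proc/self/pagemap", O_RDONLY);

        if (fd < 0)
                return -1;
        if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry))
                entry = 0;
        close(fd);
        return (entry >> 55) & 1;               /* bit 55: soft-dirty */
}

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int fd = open("/proc/self/clear_refs", O_WRONLY);

        if (p == MAP_FAILED || fd < 0)
                return 1;

        p[0] = 1;                               /* fault the page in */
        write(fd, "4", 1);                      /* "4" clears soft-dirty bits */
        printf("after clear: %d\n", soft_dirty(p));
        p[0] = 2;                               /* write fault sets soft-dirty again */
        printf("after write: %d\n", soft_dirty(p));
        close(fd);
        return 0;
}
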
1516 /* 1519 /*
1517 * We account for memory if it's a private writeable mapping, 1520 * We account for memory if it's a private writeable mapping,
1518 * not hugepages and VM_NORESERVE wasn't set. 1521 * not hugepages and VM_NORESERVE wasn't set.
1519 */ 1522 */
1520 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) 1523 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1521 { 1524 {
1522 /* 1525 /*
1523 * hugetlb has its own accounting separate from the core VM 1526 * hugetlb has its own accounting separate from the core VM
1524 * VM_HUGETLB may not be set yet so we cannot check for that flag. 1527 * VM_HUGETLB may not be set yet so we cannot check for that flag.
1525 */ 1528 */
1526 if (file && is_file_hugepages(file)) 1529 if (file && is_file_hugepages(file))
1527 return 0; 1530 return 0;
1528 1531
1529 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 1532 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1530 } 1533 }
1531 1534
1532 unsigned long mmap_region(struct file *file, unsigned long addr, 1535 unsigned long mmap_region(struct file *file, unsigned long addr,
1533 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) 1536 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1534 { 1537 {
1535 struct mm_struct *mm = current->mm; 1538 struct mm_struct *mm = current->mm;
1536 struct vm_area_struct *vma, *prev; 1539 struct vm_area_struct *vma, *prev;
1537 int error; 1540 int error;
1538 struct rb_node **rb_link, *rb_parent; 1541 struct rb_node **rb_link, *rb_parent;
1539 unsigned long charged = 0; 1542 unsigned long charged = 0;
1540 1543
1541 /* Check against address space limit. */ 1544 /* Check against address space limit. */
1542 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1545 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
1543 unsigned long nr_pages; 1546 unsigned long nr_pages;
1544 1547
1545 /* 1548 /*
1546 * MAP_FIXED may remove pages of mappings that intersects with 1549 * MAP_FIXED may remove pages of mappings that intersects with
1547 * requested mapping. Account for the pages it would unmap. 1550 * requested mapping. Account for the pages it would unmap.
1548 */ 1551 */
1549 if (!(vm_flags & MAP_FIXED)) 1552 if (!(vm_flags & MAP_FIXED))
1550 return -ENOMEM; 1553 return -ENOMEM;
1551 1554
1552 nr_pages = count_vma_pages_range(mm, addr, addr + len); 1555 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1553 1556
1554 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) 1557 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
1555 return -ENOMEM; 1558 return -ENOMEM;
1556 } 1559 }
1557 1560
1558 /* Clear old maps */ 1561 /* Clear old maps */
1559 error = -ENOMEM; 1562 error = -ENOMEM;
1560 munmap_back: 1563 munmap_back:
1561 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1564 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1562 if (do_munmap(mm, addr, len)) 1565 if (do_munmap(mm, addr, len))
1563 return -ENOMEM; 1566 return -ENOMEM;
1564 goto munmap_back; 1567 goto munmap_back;
1565 } 1568 }
1566 1569
1567 /* 1570 /*
1568 * Private writable mapping: check memory availability 1571 * Private writable mapping: check memory availability
1569 */ 1572 */
1570 if (accountable_mapping(file, vm_flags)) { 1573 if (accountable_mapping(file, vm_flags)) {
1571 charged = len >> PAGE_SHIFT; 1574 charged = len >> PAGE_SHIFT;
1572 if (security_vm_enough_memory_mm(mm, charged)) 1575 if (security_vm_enough_memory_mm(mm, charged))
1573 return -ENOMEM; 1576 return -ENOMEM;
1574 vm_flags |= VM_ACCOUNT; 1577 vm_flags |= VM_ACCOUNT;
1575 } 1578 }
1576 1579
1577 /* 1580 /*
1578 * Can we just expand an old mapping? 1581 * Can we just expand an old mapping?
1579 */ 1582 */
1580 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1583 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1581 if (vma) 1584 if (vma)
1582 goto out; 1585 goto out;
1583 1586
1584 /* 1587 /*
1585 * Determine the object being mapped and call the appropriate 1588 * Determine the object being mapped and call the appropriate
1586 * specific mapper. The address has already been validated, but 1589 * specific mapper. The address has already been validated, but
1587 * not unmapped; any overlapping maps have been removed from the list. 1590 * not unmapped; any overlapping maps have been removed from the list.
1588 */ 1591 */
1589 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 1592 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1590 if (!vma) { 1593 if (!vma) {
1591 error = -ENOMEM; 1594 error = -ENOMEM;
1592 goto unacct_error; 1595 goto unacct_error;
1593 } 1596 }
1594 1597
1595 vma->vm_mm = mm; 1598 vma->vm_mm = mm;
1596 vma->vm_start = addr; 1599 vma->vm_start = addr;
1597 vma->vm_end = addr + len; 1600 vma->vm_end = addr + len;
1598 vma->vm_flags = vm_flags; 1601 vma->vm_flags = vm_flags;
1599 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1602 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1600 vma->vm_pgoff = pgoff; 1603 vma->vm_pgoff = pgoff;
1601 INIT_LIST_HEAD(&vma->anon_vma_chain); 1604 INIT_LIST_HEAD(&vma->anon_vma_chain);
1602 1605
1603 if (file) { 1606 if (file) {
1604 if (vm_flags & VM_DENYWRITE) { 1607 if (vm_flags & VM_DENYWRITE) {
1605 error = deny_write_access(file); 1608 error = deny_write_access(file);
1606 if (error) 1609 if (error)
1607 goto free_vma; 1610 goto free_vma;
1608 } 1611 }
1609 if (vm_flags & VM_SHARED) { 1612 if (vm_flags & VM_SHARED) {
1610 error = mapping_map_writable(file->f_mapping); 1613 error = mapping_map_writable(file->f_mapping);
1611 if (error) 1614 if (error)
1612 goto allow_write_and_free_vma; 1615 goto allow_write_and_free_vma;
1613 } 1616 }
1614 1617
1615 /* ->mmap() can change vma->vm_file, but must guarantee that 1618 /* ->mmap() can change vma->vm_file, but must guarantee that
1616 * vma_link() below can deny write-access if VM_DENYWRITE is set 1619 * vma_link() below can deny write-access if VM_DENYWRITE is set
1617 * and map writably if VM_SHARED is set. This usually means the 1620 * and map writably if VM_SHARED is set. This usually means the
1618 * new file must not have been exposed to user-space, yet. 1621 * new file must not have been exposed to user-space, yet.
1619 */ 1622 */
1620 vma->vm_file = get_file(file); 1623 vma->vm_file = get_file(file);
1621 error = file->f_op->mmap(file, vma); 1624 error = file->f_op->mmap(file, vma);
1622 if (error) 1625 if (error)
1623 goto unmap_and_free_vma; 1626 goto unmap_and_free_vma;
1624 1627
1625 /* Can addr have changed?? 1628 /* Can addr have changed??
1626 * 1629 *
1627 * Answer: Yes, several device drivers can do it in their 1630 * Answer: Yes, several device drivers can do it in their
1628 * f_op->mmap method. -DaveM 1631 * f_op->mmap method. -DaveM
1629 * Bug: If addr is changed, prev, rb_link, rb_parent should 1632 * Bug: If addr is changed, prev, rb_link, rb_parent should
1630 * be updated for vma_link() 1633 * be updated for vma_link()
1631 */ 1634 */
1632 WARN_ON_ONCE(addr != vma->vm_start); 1635 WARN_ON_ONCE(addr != vma->vm_start);
1633 1636
1634 addr = vma->vm_start; 1637 addr = vma->vm_start;
1635 vm_flags = vma->vm_flags; 1638 vm_flags = vma->vm_flags;
1636 } else if (vm_flags & VM_SHARED) { 1639 } else if (vm_flags & VM_SHARED) {
1637 error = shmem_zero_setup(vma); 1640 error = shmem_zero_setup(vma);
1638 if (error) 1641 if (error)
1639 goto free_vma; 1642 goto free_vma;
1640 } 1643 }
1641 1644
1642 vma_link(mm, vma, prev, rb_link, rb_parent); 1645 vma_link(mm, vma, prev, rb_link, rb_parent);
1643 /* Once vma denies write, undo our temporary denial count */ 1646 /* Once vma denies write, undo our temporary denial count */
1644 if (file) { 1647 if (file) {
1645 if (vm_flags & VM_SHARED) 1648 if (vm_flags & VM_SHARED)
1646 mapping_unmap_writable(file->f_mapping); 1649 mapping_unmap_writable(file->f_mapping);
1647 if (vm_flags & VM_DENYWRITE) 1650 if (vm_flags & VM_DENYWRITE)
1648 allow_write_access(file); 1651 allow_write_access(file);
1649 } 1652 }
1650 file = vma->vm_file; 1653 file = vma->vm_file;
1651 out: 1654 out:
1652 perf_event_mmap(vma); 1655 perf_event_mmap(vma);
1653 1656
1654 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1657 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1655 if (vm_flags & VM_LOCKED) { 1658 if (vm_flags & VM_LOCKED) {
1656 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || 1659 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1657 vma == get_gate_vma(current->mm))) 1660 vma == get_gate_vma(current->mm)))
1658 mm->locked_vm += (len >> PAGE_SHIFT); 1661 mm->locked_vm += (len >> PAGE_SHIFT);
1659 else 1662 else
1660 vma->vm_flags &= ~VM_LOCKED; 1663 vma->vm_flags &= ~VM_LOCKED;
1661 } 1664 }
1662 1665
1663 if (file) 1666 if (file)
1664 uprobe_mmap(vma); 1667 uprobe_mmap(vma);
1665 1668
1666 /* 1669 /*
1667 * A new (or expanded) vma always gets soft-dirty status. 1670 * A new (or expanded) vma always gets soft-dirty status.
1668 * Otherwise the user-space soft-dirty page tracker won't 1671 * Otherwise the user-space soft-dirty page tracker won't
1669 * be able to distinguish the case where a vma area is unmapped 1672 * be able to distinguish the case where a vma area is unmapped
1670 * and then a new one is mapped in its place (which must be treated as 1673 * and then a new one is mapped in its place (which must be treated as
1671 * a completely new data area). 1674 * a completely new data area).
1672 */ 1675 */
1673 vma->vm_flags |= VM_SOFTDIRTY; 1676 vma->vm_flags |= VM_SOFTDIRTY;
1674 1677
1675 vma_set_page_prot(vma); 1678 vma_set_page_prot(vma);
1676 1679
1677 return addr; 1680 return addr;
1678 1681
1679 unmap_and_free_vma: 1682 unmap_and_free_vma:
1680 vma->vm_file = NULL; 1683 vma->vm_file = NULL;
1681 fput(file); 1684 fput(file);
1682 1685
1683 /* Undo any partial mapping done by a device driver. */ 1686 /* Undo any partial mapping done by a device driver. */
1684 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); 1687 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1685 charged = 0; 1688 charged = 0;
1686 if (vm_flags & VM_SHARED) 1689 if (vm_flags & VM_SHARED)
1687 mapping_unmap_writable(file->f_mapping); 1690 mapping_unmap_writable(file->f_mapping);
1688 allow_write_and_free_vma: 1691 allow_write_and_free_vma:
1689 if (vm_flags & VM_DENYWRITE) 1692 if (vm_flags & VM_DENYWRITE)
1690 allow_write_access(file); 1693 allow_write_access(file);
1691 free_vma: 1694 free_vma:
1692 kmem_cache_free(vm_area_cachep, vma); 1695 kmem_cache_free(vm_area_cachep, vma);
1693 unacct_error: 1696 unacct_error:
1694 if (charged) 1697 if (charged)
1695 vm_unacct_memory(charged); 1698 vm_unacct_memory(charged);
1696 return error; 1699 return error;
1697 } 1700 }
1698 1701
1699 unsigned long unmapped_area(struct vm_unmapped_area_info *info) 1702 unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1700 { 1703 {
1701 /* 1704 /*
1702 * We implement the search by looking for an rbtree node that 1705 * We implement the search by looking for an rbtree node that
1703 * immediately follows a suitable gap. That is, 1706 * immediately follows a suitable gap. That is,
1704 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; 1707 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1705 * - gap_end = vma->vm_start >= info->low_limit + length; 1708 * - gap_end = vma->vm_start >= info->low_limit + length;
1706 * - gap_end - gap_start >= length 1709 * - gap_end - gap_start >= length
1707 */ 1710 */
1708 1711
1709 struct mm_struct *mm = current->mm; 1712 struct mm_struct *mm = current->mm;
1710 struct vm_area_struct *vma; 1713 struct vm_area_struct *vma;
1711 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1714 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1712 1715
1713 /* Adjust search length to account for worst case alignment overhead */ 1716 /* Adjust search length to account for worst case alignment overhead */
1714 length = info->length + info->align_mask; 1717 length = info->length + info->align_mask;
1715 if (length < info->length) 1718 if (length < info->length)
1716 return -ENOMEM; 1719 return -ENOMEM;
1717 1720
1718 /* Adjust search limits by the desired length */ 1721 /* Adjust search limits by the desired length */
1719 if (info->high_limit < length) 1722 if (info->high_limit < length)
1720 return -ENOMEM; 1723 return -ENOMEM;
1721 high_limit = info->high_limit - length; 1724 high_limit = info->high_limit - length;
1722 1725
1723 if (info->low_limit > high_limit) 1726 if (info->low_limit > high_limit)
1724 return -ENOMEM; 1727 return -ENOMEM;
1725 low_limit = info->low_limit + length; 1728 low_limit = info->low_limit + length;
1726 1729
1727 /* Check if rbtree root looks promising */ 1730 /* Check if rbtree root looks promising */
1728 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1731 if (RB_EMPTY_ROOT(&mm->mm_rb))
1729 goto check_highest; 1732 goto check_highest;
1730 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1733 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1731 if (vma->rb_subtree_gap < length) 1734 if (vma->rb_subtree_gap < length)
1732 goto check_highest; 1735 goto check_highest;
1733 1736
1734 while (true) { 1737 while (true) {
1735 /* Visit left subtree if it looks promising */ 1738 /* Visit left subtree if it looks promising */
1736 gap_end = vma->vm_start; 1739 gap_end = vma->vm_start;
1737 if (gap_end >= low_limit && vma->vm_rb.rb_left) { 1740 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1738 struct vm_area_struct *left = 1741 struct vm_area_struct *left =
1739 rb_entry(vma->vm_rb.rb_left, 1742 rb_entry(vma->vm_rb.rb_left,
1740 struct vm_area_struct, vm_rb); 1743 struct vm_area_struct, vm_rb);
1741 if (left->rb_subtree_gap >= length) { 1744 if (left->rb_subtree_gap >= length) {
1742 vma = left; 1745 vma = left;
1743 continue; 1746 continue;
1744 } 1747 }
1745 } 1748 }
1746 1749
1747 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1750 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1748 check_current: 1751 check_current:
1749 /* Check if current node has a suitable gap */ 1752 /* Check if current node has a suitable gap */
1750 if (gap_start > high_limit) 1753 if (gap_start > high_limit)
1751 return -ENOMEM; 1754 return -ENOMEM;
1752 if (gap_end >= low_limit && gap_end - gap_start >= length) 1755 if (gap_end >= low_limit && gap_end - gap_start >= length)
1753 goto found; 1756 goto found;
1754 1757
1755 /* Visit right subtree if it looks promising */ 1758 /* Visit right subtree if it looks promising */
1756 if (vma->vm_rb.rb_right) { 1759 if (vma->vm_rb.rb_right) {
1757 struct vm_area_struct *right = 1760 struct vm_area_struct *right =
1758 rb_entry(vma->vm_rb.rb_right, 1761 rb_entry(vma->vm_rb.rb_right,
1759 struct vm_area_struct, vm_rb); 1762 struct vm_area_struct, vm_rb);
1760 if (right->rb_subtree_gap >= length) { 1763 if (right->rb_subtree_gap >= length) {
1761 vma = right; 1764 vma = right;
1762 continue; 1765 continue;
1763 } 1766 }
1764 } 1767 }
1765 1768
1766 /* Go back up the rbtree to find next candidate node */ 1769 /* Go back up the rbtree to find next candidate node */
1767 while (true) { 1770 while (true) {
1768 struct rb_node *prev = &vma->vm_rb; 1771 struct rb_node *prev = &vma->vm_rb;
1769 if (!rb_parent(prev)) 1772 if (!rb_parent(prev))
1770 goto check_highest; 1773 goto check_highest;
1771 vma = rb_entry(rb_parent(prev), 1774 vma = rb_entry(rb_parent(prev),
1772 struct vm_area_struct, vm_rb); 1775 struct vm_area_struct, vm_rb);
1773 if (prev == vma->vm_rb.rb_left) { 1776 if (prev == vma->vm_rb.rb_left) {
1774 gap_start = vma->vm_prev->vm_end; 1777 gap_start = vma->vm_prev->vm_end;
1775 gap_end = vma->vm_start; 1778 gap_end = vma->vm_start;
1776 goto check_current; 1779 goto check_current;
1777 } 1780 }
1778 } 1781 }
1779 } 1782 }
1780 1783
1781 check_highest: 1784 check_highest:
1782 /* Check highest gap, which does not precede any rbtree node */ 1785 /* Check highest gap, which does not precede any rbtree node */
1783 gap_start = mm->highest_vm_end; 1786 gap_start = mm->highest_vm_end;
1784 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ 1787 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1785 if (gap_start > high_limit) 1788 if (gap_start > high_limit)
1786 return -ENOMEM; 1789 return -ENOMEM;
1787 1790
1788 found: 1791 found:
1789 /* We found a suitable gap. Clip it with the original low_limit. */ 1792 /* We found a suitable gap. Clip it with the original low_limit. */
1790 if (gap_start < info->low_limit) 1793 if (gap_start < info->low_limit)
1791 gap_start = info->low_limit; 1794 gap_start = info->low_limit;
1792 1795
1793 /* Adjust gap address to the desired alignment */ 1796 /* Adjust gap address to the desired alignment */
1794 gap_start += (info->align_offset - gap_start) & info->align_mask; 1797 gap_start += (info->align_offset - gap_start) & info->align_mask;
1795 1798
1796 VM_BUG_ON(gap_start + info->length > info->high_limit); 1799 VM_BUG_ON(gap_start + info->length > info->high_limit);
1797 VM_BUG_ON(gap_start + info->length > gap_end); 1800 VM_BUG_ON(gap_start + info->length > gap_end);
1798 return gap_start; 1801 return gap_start;
1799 } 1802 }
1800 1803
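The final fix-up in unmapped_area() deserves a worked example: with align_mask of the form 2^n - 1, gap_start += (align_offset - gap_start) & align_mask rounds gap_start up to the next address congruent to align_offset modulo 2^n. The values below are made up purely for illustration.

#include <stdio.h>

int main(void)
{
        unsigned long gap_start    = 0x7f3a12345000UL;
        unsigned long align_mask   = 0x1fffffUL;        /* 2 MiB alignment - 1 */
        unsigned long align_offset = 0;

        /* Round up to the next 2 MiB boundary (offset 0). */
        gap_start += (align_offset - gap_start) & align_mask;
        printf("%#lx\n", gap_start);                    /* 0x7f3a12400000 */
        return 0;
}
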
1801 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) 1804 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1802 { 1805 {
1803 struct mm_struct *mm = current->mm; 1806 struct mm_struct *mm = current->mm;
1804 struct vm_area_struct *vma; 1807 struct vm_area_struct *vma;
1805 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1808 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1806 1809
1807 /* Adjust search length to account for worst case alignment overhead */ 1810 /* Adjust search length to account for worst case alignment overhead */
1808 length = info->length + info->align_mask; 1811 length = info->length + info->align_mask;
1809 if (length < info->length) 1812 if (length < info->length)
1810 return -ENOMEM; 1813 return -ENOMEM;
1811 1814
1812 /* 1815 /*
1813 * Adjust search limits by the desired length. 1816 * Adjust search limits by the desired length.
1814 * See implementation comment at top of unmapped_area(). 1817 * See implementation comment at top of unmapped_area().
1815 */ 1818 */
1816 gap_end = info->high_limit; 1819 gap_end = info->high_limit;
1817 if (gap_end < length) 1820 if (gap_end < length)
1818 return -ENOMEM; 1821 return -ENOMEM;
1819 high_limit = gap_end - length; 1822 high_limit = gap_end - length;
1820 1823
1821 if (info->low_limit > high_limit) 1824 if (info->low_limit > high_limit)
1822 return -ENOMEM; 1825 return -ENOMEM;
1823 low_limit = info->low_limit + length; 1826 low_limit = info->low_limit + length;
1824 1827
1825 /* Check highest gap, which does not precede any rbtree node */ 1828 /* Check highest gap, which does not precede any rbtree node */
1826 gap_start = mm->highest_vm_end; 1829 gap_start = mm->highest_vm_end;
1827 if (gap_start <= high_limit) 1830 if (gap_start <= high_limit)
1828 goto found_highest; 1831 goto found_highest;
1829 1832
1830 /* Check if rbtree root looks promising */ 1833 /* Check if rbtree root looks promising */
1831 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1834 if (RB_EMPTY_ROOT(&mm->mm_rb))
1832 return -ENOMEM; 1835 return -ENOMEM;
1833 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1836 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1834 if (vma->rb_subtree_gap < length) 1837 if (vma->rb_subtree_gap < length)
1835 return -ENOMEM; 1838 return -ENOMEM;
1836 1839
1837 while (true) { 1840 while (true) {
1838 /* Visit right subtree if it looks promising */ 1841 /* Visit right subtree if it looks promising */
1839 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1842 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1840 if (gap_start <= high_limit && vma->vm_rb.rb_right) { 1843 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1841 struct vm_area_struct *right = 1844 struct vm_area_struct *right =
1842 rb_entry(vma->vm_rb.rb_right, 1845 rb_entry(vma->vm_rb.rb_right,
1843 struct vm_area_struct, vm_rb); 1846 struct vm_area_struct, vm_rb);
1844 if (right->rb_subtree_gap >= length) { 1847 if (right->rb_subtree_gap >= length) {
1845 vma = right; 1848 vma = right;
1846 continue; 1849 continue;
1847 } 1850 }
1848 } 1851 }
1849 1852
1850 check_current: 1853 check_current:
1851 /* Check if current node has a suitable gap */ 1854 /* Check if current node has a suitable gap */
1852 gap_end = vma->vm_start; 1855 gap_end = vma->vm_start;
1853 if (gap_end < low_limit) 1856 if (gap_end < low_limit)
1854 return -ENOMEM; 1857 return -ENOMEM;
1855 if (gap_start <= high_limit && gap_end - gap_start >= length) 1858 if (gap_start <= high_limit && gap_end - gap_start >= length)
1856 goto found; 1859 goto found;
1857 1860
1858 /* Visit left subtree if it looks promising */ 1861 /* Visit left subtree if it looks promising */
1859 if (vma->vm_rb.rb_left) { 1862 if (vma->vm_rb.rb_left) {
1860 struct vm_area_struct *left = 1863 struct vm_area_struct *left =
1861 rb_entry(vma->vm_rb.rb_left, 1864 rb_entry(vma->vm_rb.rb_left,
1862 struct vm_area_struct, vm_rb); 1865 struct vm_area_struct, vm_rb);
1863 if (left->rb_subtree_gap >= length) { 1866 if (left->rb_subtree_gap >= length) {
1864 vma = left; 1867 vma = left;
1865 continue; 1868 continue;
1866 } 1869 }
1867 } 1870 }
1868 1871
1869 /* Go back up the rbtree to find next candidate node */ 1872 /* Go back up the rbtree to find next candidate node */
1870 while (true) { 1873 while (true) {
1871 struct rb_node *prev = &vma->vm_rb; 1874 struct rb_node *prev = &vma->vm_rb;
1872 if (!rb_parent(prev)) 1875 if (!rb_parent(prev))
1873 return -ENOMEM; 1876 return -ENOMEM;
1874 vma = rb_entry(rb_parent(prev), 1877 vma = rb_entry(rb_parent(prev),
1875 struct vm_area_struct, vm_rb); 1878 struct vm_area_struct, vm_rb);
1876 if (prev == vma->vm_rb.rb_right) { 1879 if (prev == vma->vm_rb.rb_right) {
1877 gap_start = vma->vm_prev ? 1880 gap_start = vma->vm_prev ?
1878 vma->vm_prev->vm_end : 0; 1881 vma->vm_prev->vm_end : 0;
1879 goto check_current; 1882 goto check_current;
1880 } 1883 }
1881 } 1884 }
1882 } 1885 }
1883 1886
1884 found: 1887 found:
1885 /* We found a suitable gap. Clip it with the original high_limit. */ 1888 /* We found a suitable gap. Clip it with the original high_limit. */
1886 if (gap_end > info->high_limit) 1889 if (gap_end > info->high_limit)
1887 gap_end = info->high_limit; 1890 gap_end = info->high_limit;
1888 1891
1889 found_highest: 1892 found_highest:
1890 /* Compute highest gap address at the desired alignment */ 1893 /* Compute highest gap address at the desired alignment */
1891 gap_end -= info->length; 1894 gap_end -= info->length;
1892 gap_end -= (gap_end - info->align_offset) & info->align_mask; 1895 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1893 1896
1894 VM_BUG_ON(gap_end < info->low_limit); 1897 VM_BUG_ON(gap_end < info->low_limit);
1895 VM_BUG_ON(gap_end < gap_start); 1898 VM_BUG_ON(gap_end < gap_start);
1896 return gap_end; 1899 return gap_end;
1897 } 1900 }
1898 1901
1899 /* Get an address range which is currently unmapped. 1902 /* Get an address range which is currently unmapped.
1900 * For shmat() with addr=0. 1903 * For shmat() with addr=0.
1901 * 1904 *
1902 * Ugly calling convention alert: 1905 * Ugly calling convention alert:
1903 * Return value with the low bits set means error value, 1906 * Return value with the low bits set means error value,
1904 * ie 1907 * ie
1905 * if (ret & ~PAGE_MASK) 1908 * if (ret & ~PAGE_MASK)
1906 * error = ret; 1909 * error = ret;
1907 * 1910 *
1908 * This function "knows" that -ENOMEM has the bits set. 1911 * This function "knows" that -ENOMEM has the bits set.
1909 */ 1912 */
1910 #ifndef HAVE_ARCH_UNMAPPED_AREA 1913 #ifndef HAVE_ARCH_UNMAPPED_AREA
1911 unsigned long 1914 unsigned long
1912 arch_get_unmapped_area(struct file *filp, unsigned long addr, 1915 arch_get_unmapped_area(struct file *filp, unsigned long addr,
1913 unsigned long len, unsigned long pgoff, unsigned long flags) 1916 unsigned long len, unsigned long pgoff, unsigned long flags)
1914 { 1917 {
1915 struct mm_struct *mm = current->mm; 1918 struct mm_struct *mm = current->mm;
1916 struct vm_area_struct *vma; 1919 struct vm_area_struct *vma;
1917 struct vm_unmapped_area_info info; 1920 struct vm_unmapped_area_info info;
1918 1921
1919 if (len > TASK_SIZE - mmap_min_addr) 1922 if (len > TASK_SIZE - mmap_min_addr)
1920 return -ENOMEM; 1923 return -ENOMEM;
1921 1924
1922 if (flags & MAP_FIXED) 1925 if (flags & MAP_FIXED)
1923 return addr; 1926 return addr;
1924 1927
1925 if (addr) { 1928 if (addr) {
1926 addr = PAGE_ALIGN(addr); 1929 addr = PAGE_ALIGN(addr);
1927 vma = find_vma(mm, addr); 1930 vma = find_vma(mm, addr);
1928 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 1931 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1929 (!vma || addr + len <= vma->vm_start)) 1932 (!vma || addr + len <= vma->vm_start))
1930 return addr; 1933 return addr;
1931 } 1934 }
1932 1935
1933 info.flags = 0; 1936 info.flags = 0;
1934 info.length = len; 1937 info.length = len;
1935 info.low_limit = mm->mmap_base; 1938 info.low_limit = mm->mmap_base;
1936 info.high_limit = TASK_SIZE; 1939 info.high_limit = TASK_SIZE;
1937 info.align_mask = 0; 1940 info.align_mask = 0;
1938 return vm_unmapped_area(&info); 1941 return vm_unmapped_area(&info);
1939 } 1942 }
1940 #endif 1943 #endif
1941 1944
1942 /* 1945 /*
1943 * This mmap-allocator allocates new areas top-down from below the 1946 * This mmap-allocator allocates new areas top-down from below the
1944 * stack's low limit (the base): 1947 * stack's low limit (the base):
1945 */ 1948 */
1946 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 1949 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1947 unsigned long 1950 unsigned long
1948 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 1951 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1949 const unsigned long len, const unsigned long pgoff, 1952 const unsigned long len, const unsigned long pgoff,
1950 const unsigned long flags) 1953 const unsigned long flags)
1951 { 1954 {
1952 struct vm_area_struct *vma; 1955 struct vm_area_struct *vma;
1953 struct mm_struct *mm = current->mm; 1956 struct mm_struct *mm = current->mm;
1954 unsigned long addr = addr0; 1957 unsigned long addr = addr0;
1955 struct vm_unmapped_area_info info; 1958 struct vm_unmapped_area_info info;
1956 1959
1957 /* requested length too big for entire address space */ 1960 /* requested length too big for entire address space */
1958 if (len > TASK_SIZE - mmap_min_addr) 1961 if (len > TASK_SIZE - mmap_min_addr)
1959 return -ENOMEM; 1962 return -ENOMEM;
1960 1963
1961 if (flags & MAP_FIXED) 1964 if (flags & MAP_FIXED)
1962 return addr; 1965 return addr;
1963 1966
1964 /* requesting a specific address */ 1967 /* requesting a specific address */
1965 if (addr) { 1968 if (addr) {
1966 addr = PAGE_ALIGN(addr); 1969 addr = PAGE_ALIGN(addr);
1967 vma = find_vma(mm, addr); 1970 vma = find_vma(mm, addr);
1968 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 1971 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1969 (!vma || addr + len <= vma->vm_start)) 1972 (!vma || addr + len <= vma->vm_start))
1970 return addr; 1973 return addr;
1971 } 1974 }
1972 1975
1973 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 1976 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1974 info.length = len; 1977 info.length = len;
1975 info.low_limit = max(PAGE_SIZE, mmap_min_addr); 1978 info.low_limit = max(PAGE_SIZE, mmap_min_addr);
1976 info.high_limit = mm->mmap_base; 1979 info.high_limit = mm->mmap_base;
1977 info.align_mask = 0; 1980 info.align_mask = 0;
1978 addr = vm_unmapped_area(&info); 1981 addr = vm_unmapped_area(&info);
1979 1982
1980 /* 1983 /*
1981 * A failed mmap() very likely causes application failure, 1984 * A failed mmap() very likely causes application failure,
1982 * so fall back to the bottom-up function here. This scenario 1985 * so fall back to the bottom-up function here. This scenario
1983 * can happen with large stack limits and large mmap() 1986 * can happen with large stack limits and large mmap()
1984 * allocations. 1987 * allocations.
1985 */ 1988 */
1986 if (addr & ~PAGE_MASK) { 1989 if (addr & ~PAGE_MASK) {
1987 VM_BUG_ON(addr != -ENOMEM); 1990 VM_BUG_ON(addr != -ENOMEM);
1988 info.flags = 0; 1991 info.flags = 0;
1989 info.low_limit = TASK_UNMAPPED_BASE; 1992 info.low_limit = TASK_UNMAPPED_BASE;
1990 info.high_limit = TASK_SIZE; 1993 info.high_limit = TASK_SIZE;
1991 addr = vm_unmapped_area(&info); 1994 addr = vm_unmapped_area(&info);
1992 } 1995 }
1993 1996
1994 return addr; 1997 return addr;
1995 } 1998 }
1996 #endif 1999 #endif
1997 2000
1998 unsigned long 2001 unsigned long
1999 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 2002 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
2000 unsigned long pgoff, unsigned long flags) 2003 unsigned long pgoff, unsigned long flags)
2001 { 2004 {
2002 unsigned long (*get_area)(struct file *, unsigned long, 2005 unsigned long (*get_area)(struct file *, unsigned long,
2003 unsigned long, unsigned long, unsigned long); 2006 unsigned long, unsigned long, unsigned long);
2004 2007
2005 unsigned long error = arch_mmap_check(addr, len, flags); 2008 unsigned long error = arch_mmap_check(addr, len, flags);
2006 if (error) 2009 if (error)
2007 return error; 2010 return error;
2008 2011
2009 /* Careful about overflows.. */ 2012 /* Careful about overflows.. */
2010 if (len > TASK_SIZE) 2013 if (len > TASK_SIZE)
2011 return -ENOMEM; 2014 return -ENOMEM;
2012 2015
2013 get_area = current->mm->get_unmapped_area; 2016 get_area = current->mm->get_unmapped_area;
2014 if (file && file->f_op->get_unmapped_area) 2017 if (file && file->f_op->get_unmapped_area)
2015 get_area = file->f_op->get_unmapped_area; 2018 get_area = file->f_op->get_unmapped_area;
2016 addr = get_area(file, addr, len, pgoff, flags); 2019 addr = get_area(file, addr, len, pgoff, flags);
2017 if (IS_ERR_VALUE(addr)) 2020 if (IS_ERR_VALUE(addr))
2018 return addr; 2021 return addr;
2019 2022
2020 if (addr > TASK_SIZE - len) 2023 if (addr > TASK_SIZE - len)
2021 return -ENOMEM; 2024 return -ENOMEM;
2022 if (addr & ~PAGE_MASK) 2025 if (addr & ~PAGE_MASK)
2023 return -EINVAL; 2026 return -EINVAL;
2024 2027
2025 addr = arch_rebalance_pgtables(addr, len); 2028 addr = arch_rebalance_pgtables(addr, len);
2026 error = security_mmap_addr(addr); 2029 error = security_mmap_addr(addr);
2027 return error ? error : addr; 2030 return error ? error : addr;
2028 } 2031 }
2029 2032
2030 EXPORT_SYMBOL(get_unmapped_area); 2033 EXPORT_SYMBOL(get_unmapped_area);
2031 2034
2032 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 2035 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
2033 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 2036 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2034 { 2037 {
2035 struct rb_node *rb_node; 2038 struct rb_node *rb_node;
2036 struct vm_area_struct *vma; 2039 struct vm_area_struct *vma;
2037 2040
2038 /* Check the cache first. */ 2041 /* Check the cache first. */
2039 vma = vmacache_find(mm, addr); 2042 vma = vmacache_find(mm, addr);
2040 if (likely(vma)) 2043 if (likely(vma))
2041 return vma; 2044 return vma;
2042 2045
2043 rb_node = mm->mm_rb.rb_node; 2046 rb_node = mm->mm_rb.rb_node;
2044 vma = NULL; 2047 vma = NULL;
2045 2048
2046 while (rb_node) { 2049 while (rb_node) {
2047 struct vm_area_struct *tmp; 2050 struct vm_area_struct *tmp;
2048 2051
2049 tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2052 tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2050 2053
2051 if (tmp->vm_end > addr) { 2054 if (tmp->vm_end > addr) {
2052 vma = tmp; 2055 vma = tmp;
2053 if (tmp->vm_start <= addr) 2056 if (tmp->vm_start <= addr)
2054 break; 2057 break;
2055 rb_node = rb_node->rb_left; 2058 rb_node = rb_node->rb_left;
2056 } else 2059 } else
2057 rb_node = rb_node->rb_right; 2060 rb_node = rb_node->rb_right;
2058 } 2061 }
2059 2062
2060 if (vma) 2063 if (vma)
2061 vmacache_update(addr, vma); 2064 vmacache_update(addr, vma);
2062 return vma; 2065 return vma;
2063 } 2066 }
2064 2067
2065 EXPORT_SYMBOL(find_vma); 2068 EXPORT_SYMBOL(find_vma);
2066 2069
2067 /* 2070 /*
2068 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 2071 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
2069 */ 2072 */
2070 struct vm_area_struct * 2073 struct vm_area_struct *
2071 find_vma_prev(struct mm_struct *mm, unsigned long addr, 2074 find_vma_prev(struct mm_struct *mm, unsigned long addr,
2072 struct vm_area_struct **pprev) 2075 struct vm_area_struct **pprev)
2073 { 2076 {
2074 struct vm_area_struct *vma; 2077 struct vm_area_struct *vma;
2075 2078
2076 vma = find_vma(mm, addr); 2079 vma = find_vma(mm, addr);
2077 if (vma) { 2080 if (vma) {
2078 *pprev = vma->vm_prev; 2081 *pprev = vma->vm_prev;
2079 } else { 2082 } else {
2080 struct rb_node *rb_node = mm->mm_rb.rb_node; 2083 struct rb_node *rb_node = mm->mm_rb.rb_node;
2081 *pprev = NULL; 2084 *pprev = NULL;
2082 while (rb_node) { 2085 while (rb_node) {
2083 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2086 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2084 rb_node = rb_node->rb_right; 2087 rb_node = rb_node->rb_right;
2085 } 2088 }
2086 } 2089 }
2087 return vma; 2090 return vma;
2088 } 2091 }
2089 2092
2090 /* 2093 /*
2091 * Verify that the stack growth is acceptable and 2094 * Verify that the stack growth is acceptable and
2092 * update accounting. This is shared with both the 2095 * update accounting. This is shared with both the
2093 * grow-up and grow-down cases. 2096 * grow-up and grow-down cases.
2094 */ 2097 */
2095 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) 2098 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
2096 { 2099 {
2097 struct mm_struct *mm = vma->vm_mm; 2100 struct mm_struct *mm = vma->vm_mm;
2098 struct rlimit *rlim = current->signal->rlim; 2101 struct rlimit *rlim = current->signal->rlim;
2099 unsigned long new_start; 2102 unsigned long new_start;
2100 2103
2101 /* address space limit tests */ 2104 /* address space limit tests */
2102 if (!may_expand_vm(mm, grow)) 2105 if (!may_expand_vm(mm, grow))
2103 return -ENOMEM; 2106 return -ENOMEM;
2104 2107
2105 /* Stack limit test */ 2108 /* Stack limit test */
2106 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2109 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2107 return -ENOMEM; 2110 return -ENOMEM;
2108 2111
2109 /* mlock limit tests */ 2112 /* mlock limit tests */
2110 if (vma->vm_flags & VM_LOCKED) { 2113 if (vma->vm_flags & VM_LOCKED) {
2111 unsigned long locked; 2114 unsigned long locked;
2112 unsigned long limit; 2115 unsigned long limit;
2113 locked = mm->locked_vm + grow; 2116 locked = mm->locked_vm + grow;
2114 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2117 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2115 limit >>= PAGE_SHIFT; 2118 limit >>= PAGE_SHIFT;
2116 if (locked > limit && !capable(CAP_IPC_LOCK)) 2119 if (locked > limit && !capable(CAP_IPC_LOCK))
2117 return -ENOMEM; 2120 return -ENOMEM;
2118 } 2121 }
2119 2122
2120 /* Check to ensure the stack will not grow into a hugetlb-only region */ 2123 /* Check to ensure the stack will not grow into a hugetlb-only region */
2121 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : 2124 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2122 vma->vm_end - size; 2125 vma->vm_end - size;
2123 if (is_hugepage_only_range(vma->vm_mm, new_start, size)) 2126 if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2124 return -EFAULT; 2127 return -EFAULT;
2125 2128
2126 /* 2129 /*
2127 * Overcommit.. This must be the final test, as it will 2130 * Overcommit.. This must be the final test, as it will
2128 * update security statistics. 2131 * update security statistics.
2129 */ 2132 */
2130 if (security_vm_enough_memory_mm(mm, grow)) 2133 if (security_vm_enough_memory_mm(mm, grow))
2131 return -ENOMEM; 2134 return -ENOMEM;
2132 2135
2133 /* Ok, everything looks good - let it rip */ 2136 /* Ok, everything looks good - let it rip */
2134 if (vma->vm_flags & VM_LOCKED) 2137 if (vma->vm_flags & VM_LOCKED)
2135 mm->locked_vm += grow; 2138 mm->locked_vm += grow;
2136 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 2139 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
2137 return 0; 2140 return 0;
2138 } 2141 }
2139 2142
2140 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) 2143 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2141 /* 2144 /*
2142 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 2145 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2143 * vma is the last one with address > vma->vm_end. Have to extend vma. 2146 * vma is the last one with address > vma->vm_end. Have to extend vma.
2144 */ 2147 */
2145 int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2148 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2146 { 2149 {
2147 int error; 2150 int error;
2148 2151
2149 if (!(vma->vm_flags & VM_GROWSUP)) 2152 if (!(vma->vm_flags & VM_GROWSUP))
2150 return -EFAULT; 2153 return -EFAULT;
2151 2154
2152 /* 2155 /*
2153 * We must make sure the anon_vma is allocated 2156 * We must make sure the anon_vma is allocated
2154 * so that the anon_vma locking is not a noop. 2157 * so that the anon_vma locking is not a noop.
2155 */ 2158 */
2156 if (unlikely(anon_vma_prepare(vma))) 2159 if (unlikely(anon_vma_prepare(vma)))
2157 return -ENOMEM; 2160 return -ENOMEM;
2158 vma_lock_anon_vma(vma); 2161 vma_lock_anon_vma(vma);
2159 2162
2160 /* 2163 /*
2161 * vma->vm_start/vm_end cannot change under us because the caller 2164 * vma->vm_start/vm_end cannot change under us because the caller
2162 * is required to hold the mmap_sem in read mode. We need the 2165 * is required to hold the mmap_sem in read mode. We need the
2163 * anon_vma lock to serialize against concurrent expand_stacks. 2166 * anon_vma lock to serialize against concurrent expand_stacks.
2164 * Also guard against wrapping around to address 0. 2167 * Also guard against wrapping around to address 0.
2165 */ 2168 */
2166 if (address < PAGE_ALIGN(address+4)) 2169 if (address < PAGE_ALIGN(address+4))
2167 address = PAGE_ALIGN(address+4); 2170 address = PAGE_ALIGN(address+4);
2168 else { 2171 else {
2169 vma_unlock_anon_vma(vma); 2172 vma_unlock_anon_vma(vma);
2170 return -ENOMEM; 2173 return -ENOMEM;
2171 } 2174 }
2172 error = 0; 2175 error = 0;
2173 2176
2174 /* Somebody else might have raced and expanded it already */ 2177 /* Somebody else might have raced and expanded it already */
2175 if (address > vma->vm_end) { 2178 if (address > vma->vm_end) {
2176 unsigned long size, grow; 2179 unsigned long size, grow;
2177 2180
2178 size = address - vma->vm_start; 2181 size = address - vma->vm_start;
2179 grow = (address - vma->vm_end) >> PAGE_SHIFT; 2182 grow = (address - vma->vm_end) >> PAGE_SHIFT;
2180 2183
2181 error = -ENOMEM; 2184 error = -ENOMEM;
2182 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2185 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2183 error = acct_stack_growth(vma, size, grow); 2186 error = acct_stack_growth(vma, size, grow);
2184 if (!error) { 2187 if (!error) {
2185 /* 2188 /*
2186 * vma_gap_update() doesn't support concurrent 2189 * vma_gap_update() doesn't support concurrent
2187 * updates, but we only hold a shared mmap_sem 2190 * updates, but we only hold a shared mmap_sem
2188 * lock here, so we need to protect against 2191 * lock here, so we need to protect against
2189 * concurrent vma expansions. 2192 * concurrent vma expansions.
2190 * vma_lock_anon_vma() doesn't help here, as 2193 * vma_lock_anon_vma() doesn't help here, as
2191 * we don't guarantee that all growable vmas 2194 * we don't guarantee that all growable vmas
2192 * in a mm share the same root anon vma. 2195 * in a mm share the same root anon vma.
2193 * So, we reuse mm->page_table_lock to guard 2196 * So, we reuse mm->page_table_lock to guard
2194 * against concurrent vma expansions. 2197 * against concurrent vma expansions.
2195 */ 2198 */
2196 spin_lock(&vma->vm_mm->page_table_lock); 2199 spin_lock(&vma->vm_mm->page_table_lock);
2197 anon_vma_interval_tree_pre_update_vma(vma); 2200 anon_vma_interval_tree_pre_update_vma(vma);
2198 vma->vm_end = address; 2201 vma->vm_end = address;
2199 anon_vma_interval_tree_post_update_vma(vma); 2202 anon_vma_interval_tree_post_update_vma(vma);
2200 if (vma->vm_next) 2203 if (vma->vm_next)
2201 vma_gap_update(vma->vm_next); 2204 vma_gap_update(vma->vm_next);
2202 else 2205 else
2203 vma->vm_mm->highest_vm_end = address; 2206 vma->vm_mm->highest_vm_end = address;
2204 spin_unlock(&vma->vm_mm->page_table_lock); 2207 spin_unlock(&vma->vm_mm->page_table_lock);
2205 2208
2206 perf_event_mmap(vma); 2209 perf_event_mmap(vma);
2207 } 2210 }
2208 } 2211 }
2209 } 2212 }
2210 vma_unlock_anon_vma(vma); 2213 vma_unlock_anon_vma(vma);
2211 khugepaged_enter_vma_merge(vma, vma->vm_flags); 2214 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2212 validate_mm(vma->vm_mm); 2215 validate_mm(vma->vm_mm);
2213 return error; 2216 return error;
2214 } 2217 }
2215 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 2218 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2216 2219
2217 /* 2220 /*
2218 * vma is the first one with address < vma->vm_start. Have to extend vma. 2221 * vma is the first one with address < vma->vm_start. Have to extend vma.
2219 */ 2222 */
2220 int expand_downwards(struct vm_area_struct *vma, 2223 int expand_downwards(struct vm_area_struct *vma,
2221 unsigned long address) 2224 unsigned long address)
2222 { 2225 {
2223 int error; 2226 int error;
2224 2227
2225 /* 2228 /*
2226 * We must make sure the anon_vma is allocated 2229 * We must make sure the anon_vma is allocated
2227 * so that the anon_vma locking is not a noop. 2230 * so that the anon_vma locking is not a noop.
2228 */ 2231 */
2229 if (unlikely(anon_vma_prepare(vma))) 2232 if (unlikely(anon_vma_prepare(vma)))
2230 return -ENOMEM; 2233 return -ENOMEM;
2231 2234
2232 address &= PAGE_MASK; 2235 address &= PAGE_MASK;
2233 error = security_mmap_addr(address); 2236 error = security_mmap_addr(address);
2234 if (error) 2237 if (error)
2235 return error; 2238 return error;
2236 2239
2237 vma_lock_anon_vma(vma); 2240 vma_lock_anon_vma(vma);
2238 2241
2239 /* 2242 /*
2240 * vma->vm_start/vm_end cannot change under us because the caller 2243 * vma->vm_start/vm_end cannot change under us because the caller
2241 * is required to hold the mmap_sem in read mode. We need the 2244 * is required to hold the mmap_sem in read mode. We need the
2242 * anon_vma lock to serialize against concurrent expand_stacks. 2245 * anon_vma lock to serialize against concurrent expand_stacks.
2243 */ 2246 */
2244 2247
2245 /* Somebody else might have raced and expanded it already */ 2248 /* Somebody else might have raced and expanded it already */
2246 if (address < vma->vm_start) { 2249 if (address < vma->vm_start) {
2247 unsigned long size, grow; 2250 unsigned long size, grow;
2248 2251
2249 size = vma->vm_end - address; 2252 size = vma->vm_end - address;
2250 grow = (vma->vm_start - address) >> PAGE_SHIFT; 2253 grow = (vma->vm_start - address) >> PAGE_SHIFT;
2251 2254
2252 error = -ENOMEM; 2255 error = -ENOMEM;
2253 if (grow <= vma->vm_pgoff) { 2256 if (grow <= vma->vm_pgoff) {
2254 error = acct_stack_growth(vma, size, grow); 2257 error = acct_stack_growth(vma, size, grow);
2255 if (!error) { 2258 if (!error) {
2256 /* 2259 /*
2257 * vma_gap_update() doesn't support concurrent 2260 * vma_gap_update() doesn't support concurrent
2258 * updates, but we only hold a shared mmap_sem 2261 * updates, but we only hold a shared mmap_sem
2259 * lock here, so we need to protect against 2262 * lock here, so we need to protect against
2260 * concurrent vma expansions. 2263 * concurrent vma expansions.
2261 * vma_lock_anon_vma() doesn't help here, as 2264 * vma_lock_anon_vma() doesn't help here, as
2262 * we don't guarantee that all growable vmas 2265 * we don't guarantee that all growable vmas
2263 * in a mm share the same root anon vma. 2266 * in a mm share the same root anon vma.
2264 * So, we reuse mm->page_table_lock to guard 2267 * So, we reuse mm->page_table_lock to guard
2265 * against concurrent vma expansions. 2268 * against concurrent vma expansions.
2266 */ 2269 */
2267 spin_lock(&vma->vm_mm->page_table_lock); 2270 spin_lock(&vma->vm_mm->page_table_lock);
2268 anon_vma_interval_tree_pre_update_vma(vma); 2271 anon_vma_interval_tree_pre_update_vma(vma);
2269 vma->vm_start = address; 2272 vma->vm_start = address;
2270 vma->vm_pgoff -= grow; 2273 vma->vm_pgoff -= grow;
2271 anon_vma_interval_tree_post_update_vma(vma); 2274 anon_vma_interval_tree_post_update_vma(vma);
2272 vma_gap_update(vma); 2275 vma_gap_update(vma);
2273 spin_unlock(&vma->vm_mm->page_table_lock); 2276 spin_unlock(&vma->vm_mm->page_table_lock);
2274 2277
2275 perf_event_mmap(vma); 2278 perf_event_mmap(vma);
2276 } 2279 }
2277 } 2280 }
2278 } 2281 }
2279 vma_unlock_anon_vma(vma); 2282 vma_unlock_anon_vma(vma);
2280 khugepaged_enter_vma_merge(vma, vma->vm_flags); 2283 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2281 validate_mm(vma->vm_mm); 2284 validate_mm(vma->vm_mm);
2282 return error; 2285 return error;
2283 } 2286 }
2284 2287
2285 /* 2288 /*
2286 * Note how expand_stack() refuses to expand the stack all the way to 2289 * Note how expand_stack() refuses to expand the stack all the way to
2287 * abut the next virtual mapping, *unless* that mapping itself is also 2290 * abut the next virtual mapping, *unless* that mapping itself is also
2288 * a stack mapping. We want to leave room for a guard page, after all 2291 * a stack mapping. We want to leave room for a guard page, after all
2289 * (the guard page itself is not added here, that is done by the 2292 * (the guard page itself is not added here, that is done by the
2290 * actual page faulting logic) 2293 * actual page faulting logic)
2291 * 2294 *
2292 * This matches the behavior of the guard page logic (see mm/memory.c: 2295 * This matches the behavior of the guard page logic (see mm/memory.c:
2293 * check_stack_guard_page()), which only allows the guard page to be 2296 * check_stack_guard_page()), which only allows the guard page to be
2294 * removed under these circumstances. 2297 * removed under these circumstances.
2295 */ 2298 */
2296 #ifdef CONFIG_STACK_GROWSUP 2299 #ifdef CONFIG_STACK_GROWSUP
2297 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2300 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2298 { 2301 {
2299 struct vm_area_struct *next; 2302 struct vm_area_struct *next;
2300 2303
2301 address &= PAGE_MASK; 2304 address &= PAGE_MASK;
2302 next = vma->vm_next; 2305 next = vma->vm_next;
2303 if (next && next->vm_start == address + PAGE_SIZE) { 2306 if (next && next->vm_start == address + PAGE_SIZE) {
2304 if (!(next->vm_flags & VM_GROWSUP)) 2307 if (!(next->vm_flags & VM_GROWSUP))
2305 return -ENOMEM; 2308 return -ENOMEM;
2306 } 2309 }
2307 return expand_upwards(vma, address); 2310 return expand_upwards(vma, address);
2308 } 2311 }
2309 2312
2310 struct vm_area_struct * 2313 struct vm_area_struct *
2311 find_extend_vma(struct mm_struct *mm, unsigned long addr) 2314 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2312 { 2315 {
2313 struct vm_area_struct *vma, *prev; 2316 struct vm_area_struct *vma, *prev;
2314 2317
2315 addr &= PAGE_MASK; 2318 addr &= PAGE_MASK;
2316 vma = find_vma_prev(mm, addr, &prev); 2319 vma = find_vma_prev(mm, addr, &prev);
2317 if (vma && (vma->vm_start <= addr)) 2320 if (vma && (vma->vm_start <= addr))
2318 return vma; 2321 return vma;
2319 if (!prev || expand_stack(prev, addr)) 2322 if (!prev || expand_stack(prev, addr))
2320 return NULL; 2323 return NULL;
2321 if (prev->vm_flags & VM_LOCKED) 2324 if (prev->vm_flags & VM_LOCKED)
2322 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2325 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2323 return prev; 2326 return prev;
2324 } 2327 }
2325 #else 2328 #else
2326 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2329 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2327 { 2330 {
2328 struct vm_area_struct *prev; 2331 struct vm_area_struct *prev;
2329 2332
2330 address &= PAGE_MASK; 2333 address &= PAGE_MASK;
2331 prev = vma->vm_prev; 2334 prev = vma->vm_prev;
2332 if (prev && prev->vm_end == address) { 2335 if (prev && prev->vm_end == address) {
2333 if (!(prev->vm_flags & VM_GROWSDOWN)) 2336 if (!(prev->vm_flags & VM_GROWSDOWN))
2334 return -ENOMEM; 2337 return -ENOMEM;
2335 } 2338 }
2336 return expand_downwards(vma, address); 2339 return expand_downwards(vma, address);
2337 } 2340 }
2338 2341
2339 struct vm_area_struct * 2342 struct vm_area_struct *
2340 find_extend_vma(struct mm_struct *mm, unsigned long addr) 2343 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2341 { 2344 {
2342 struct vm_area_struct *vma; 2345 struct vm_area_struct *vma;
2343 unsigned long start; 2346 unsigned long start;
2344 2347
2345 addr &= PAGE_MASK; 2348 addr &= PAGE_MASK;
2346 vma = find_vma(mm, addr); 2349 vma = find_vma(mm, addr);
2347 if (!vma) 2350 if (!vma)
2348 return NULL; 2351 return NULL;
2349 if (vma->vm_start <= addr) 2352 if (vma->vm_start <= addr)
2350 return vma; 2353 return vma;
2351 if (!(vma->vm_flags & VM_GROWSDOWN)) 2354 if (!(vma->vm_flags & VM_GROWSDOWN))
2352 return NULL; 2355 return NULL;
2353 start = vma->vm_start; 2356 start = vma->vm_start;
2354 if (expand_stack(vma, addr)) 2357 if (expand_stack(vma, addr))
2355 return NULL; 2358 return NULL;
2356 if (vma->vm_flags & VM_LOCKED) 2359 if (vma->vm_flags & VM_LOCKED)
2357 __mlock_vma_pages_range(vma, addr, start, NULL); 2360 __mlock_vma_pages_range(vma, addr, start, NULL);
2358 return vma; 2361 return vma;
2359 } 2362 }
2360 #endif 2363 #endif
2361 2364
2362 /* 2365 /*
2363 * Ok - we have the memory areas we should free on the vma list, 2366 * Ok - we have the memory areas we should free on the vma list,
2364 * so release them, and do the vma updates. 2367 * so release them, and do the vma updates.
2365 * 2368 *
2366 * Called with the mm semaphore held. 2369 * Called with the mm semaphore held.
2367 */ 2370 */
2368 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 2371 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2369 { 2372 {
2370 unsigned long nr_accounted = 0; 2373 unsigned long nr_accounted = 0;
2371 2374
2372 /* Update high watermark before we lower total_vm */ 2375 /* Update high watermark before we lower total_vm */
2373 update_hiwater_vm(mm); 2376 update_hiwater_vm(mm);
2374 do { 2377 do {
2375 long nrpages = vma_pages(vma); 2378 long nrpages = vma_pages(vma);
2376 2379
2377 if (vma->vm_flags & VM_ACCOUNT) 2380 if (vma->vm_flags & VM_ACCOUNT)
2378 nr_accounted += nrpages; 2381 nr_accounted += nrpages;
2379 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 2382 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
2380 vma = remove_vma(vma); 2383 vma = remove_vma(vma);
2381 } while (vma); 2384 } while (vma);
2382 vm_unacct_memory(nr_accounted); 2385 vm_unacct_memory(nr_accounted);
2383 validate_mm(mm); 2386 validate_mm(mm);
2384 } 2387 }
2385 2388
2386 /* 2389 /*
2387 * Get rid of page table information in the indicated region. 2390 * Get rid of page table information in the indicated region.
2388 * 2391 *
2389 * Called with the mm semaphore held. 2392 * Called with the mm semaphore held.
2390 */ 2393 */
2391 static void unmap_region(struct mm_struct *mm, 2394 static void unmap_region(struct mm_struct *mm,
2392 struct vm_area_struct *vma, struct vm_area_struct *prev, 2395 struct vm_area_struct *vma, struct vm_area_struct *prev,
2393 unsigned long start, unsigned long end) 2396 unsigned long start, unsigned long end)
2394 { 2397 {
2395 struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; 2398 struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2396 struct mmu_gather tlb; 2399 struct mmu_gather tlb;
2397 2400
2398 lru_add_drain(); 2401 lru_add_drain();
2399 tlb_gather_mmu(&tlb, mm, start, end); 2402 tlb_gather_mmu(&tlb, mm, start, end);
2400 update_hiwater_rss(mm); 2403 update_hiwater_rss(mm);
2401 unmap_vmas(&tlb, vma, start, end); 2404 unmap_vmas(&tlb, vma, start, end);
2402 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 2405 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2403 next ? next->vm_start : USER_PGTABLES_CEILING); 2406 next ? next->vm_start : USER_PGTABLES_CEILING);
2404 tlb_finish_mmu(&tlb, start, end); 2407 tlb_finish_mmu(&tlb, start, end);
2405 } 2408 }
2406 2409
2407 /* 2410 /*
2408 * Create a list of vma's touched by the unmap, removing them from the mm's 2411 * Create a list of vma's touched by the unmap, removing them from the mm's
2409 * vma list as we go.. 2412 * vma list as we go..
2410 */ 2413 */
2411 static void 2414 static void
2412 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, 2415 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2413 struct vm_area_struct *prev, unsigned long end) 2416 struct vm_area_struct *prev, unsigned long end)
2414 { 2417 {
2415 struct vm_area_struct **insertion_point; 2418 struct vm_area_struct **insertion_point;
2416 struct vm_area_struct *tail_vma = NULL; 2419 struct vm_area_struct *tail_vma = NULL;
2417 2420
2418 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2421 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2419 vma->vm_prev = NULL; 2422 vma->vm_prev = NULL;
2420 do { 2423 do {
2421 vma_rb_erase(vma, &mm->mm_rb); 2424 vma_rb_erase(vma, &mm->mm_rb);
2422 mm->map_count--; 2425 mm->map_count--;
2423 tail_vma = vma; 2426 tail_vma = vma;
2424 vma = vma->vm_next; 2427 vma = vma->vm_next;
2425 } while (vma && vma->vm_start < end); 2428 } while (vma && vma->vm_start < end);
2426 *insertion_point = vma; 2429 *insertion_point = vma;
2427 if (vma) { 2430 if (vma) {
2428 vma->vm_prev = prev; 2431 vma->vm_prev = prev;
2429 vma_gap_update(vma); 2432 vma_gap_update(vma);
2430 } else 2433 } else
2431 mm->highest_vm_end = prev ? prev->vm_end : 0; 2434 mm->highest_vm_end = prev ? prev->vm_end : 0;
2432 tail_vma->vm_next = NULL; 2435 tail_vma->vm_next = NULL;
2433 2436
2434 /* Kill the cache */ 2437 /* Kill the cache */
2435 vmacache_invalidate(mm); 2438 vmacache_invalidate(mm);
2436 } 2439 }
2437 2440
2438 /* 2441 /*
2439 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2442 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
2440 * munmap path where it doesn't make sense to fail. 2443 * munmap path where it doesn't make sense to fail.
2441 */ 2444 */
2442 static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 2445 static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2443 unsigned long addr, int new_below) 2446 unsigned long addr, int new_below)
2444 { 2447 {
2445 struct vm_area_struct *new; 2448 struct vm_area_struct *new;
2446 int err = -ENOMEM; 2449 int err = -ENOMEM;
2447 2450
2448 if (is_vm_hugetlb_page(vma) && (addr & 2451 if (is_vm_hugetlb_page(vma) && (addr &
2449 ~(huge_page_mask(hstate_vma(vma))))) 2452 ~(huge_page_mask(hstate_vma(vma)))))
2450 return -EINVAL; 2453 return -EINVAL;
2451 2454
2452 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2455 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2453 if (!new) 2456 if (!new)
2454 goto out_err; 2457 goto out_err;
2455 2458
2456 /* most fields are the same, copy all, and then fixup */ 2459 /* most fields are the same, copy all, and then fixup */
2457 *new = *vma; 2460 *new = *vma;
2458 2461
2459 INIT_LIST_HEAD(&new->anon_vma_chain); 2462 INIT_LIST_HEAD(&new->anon_vma_chain);
2460 2463
2461 if (new_below) 2464 if (new_below)
2462 new->vm_end = addr; 2465 new->vm_end = addr;
2463 else { 2466 else {
2464 new->vm_start = addr; 2467 new->vm_start = addr;
2465 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2468 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2466 } 2469 }
2467 2470
2468 err = vma_dup_policy(vma, new); 2471 err = vma_dup_policy(vma, new);
2469 if (err) 2472 if (err)
2470 goto out_free_vma; 2473 goto out_free_vma;
2471 2474
2472 if (anon_vma_clone(new, vma)) 2475 err = anon_vma_clone(new, vma);
2476 if (err)
2473 goto out_free_mpol; 2477 goto out_free_mpol;
2474 2478
2475 if (new->vm_file) 2479 if (new->vm_file)
2476 get_file(new->vm_file); 2480 get_file(new->vm_file);
2477 2481
2478 if (new->vm_ops && new->vm_ops->open) 2482 if (new->vm_ops && new->vm_ops->open)
2479 new->vm_ops->open(new); 2483 new->vm_ops->open(new);
2480 2484
2481 if (new_below) 2485 if (new_below)
2482 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 2486 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2483 ((addr - new->vm_start) >> PAGE_SHIFT), new); 2487 ((addr - new->vm_start) >> PAGE_SHIFT), new);
2484 else 2488 else
2485 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 2489 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2486 2490
2487 /* Success. */ 2491 /* Success. */
2488 if (!err) 2492 if (!err)
2489 return 0; 2493 return 0;
2490 2494
2491 /* Clean everything up if vma_adjust failed. */ 2495 /* Clean everything up if vma_adjust failed. */
2492 if (new->vm_ops && new->vm_ops->close) 2496 if (new->vm_ops && new->vm_ops->close)
2493 new->vm_ops->close(new); 2497 new->vm_ops->close(new);
2494 if (new->vm_file) 2498 if (new->vm_file)
2495 fput(new->vm_file); 2499 fput(new->vm_file);
2496 unlink_anon_vmas(new); 2500 unlink_anon_vmas(new);
2497 out_free_mpol: 2501 out_free_mpol:
2498 mpol_put(vma_policy(new)); 2502 mpol_put(vma_policy(new));
2499 out_free_vma: 2503 out_free_vma:
2500 kmem_cache_free(vm_area_cachep, new); 2504 kmem_cache_free(vm_area_cachep, new);
2501 out_err: 2505 out_err:
2502 return err; 2506 return err;
2503 } 2507 }
2504 2508
2505 /* 2509 /*
2506 * Split a vma into two pieces at address 'addr', a new vma is allocated 2510 * Split a vma into two pieces at address 'addr', a new vma is allocated
2507 * either for the first part or the tail. 2511 * either for the first part or the tail.
2508 */ 2512 */
2509 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 2513 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2510 unsigned long addr, int new_below) 2514 unsigned long addr, int new_below)
2511 { 2515 {
2512 if (mm->map_count >= sysctl_max_map_count) 2516 if (mm->map_count >= sysctl_max_map_count)
2513 return -ENOMEM; 2517 return -ENOMEM;
2514 2518
2515 return __split_vma(mm, vma, addr, new_below); 2519 return __split_vma(mm, vma, addr, new_below);
2516 } 2520 }
2517 2521
2518 /* Munmap is split into 2 main parts -- this part which finds 2522 /* Munmap is split into 2 main parts -- this part which finds
2519 * what needs doing, and the areas themselves, which do the 2523 * what needs doing, and the areas themselves, which do the
2520 * work. This now handles partial unmappings. 2524 * work. This now handles partial unmappings.
2521 * Jeremy Fitzhardinge <jeremy@goop.org> 2525 * Jeremy Fitzhardinge <jeremy@goop.org>
2522 */ 2526 */
2523 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 2527 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2524 { 2528 {
2525 unsigned long end; 2529 unsigned long end;
2526 struct vm_area_struct *vma, *prev, *last; 2530 struct vm_area_struct *vma, *prev, *last;
2527 2531
2528 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2532 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2529 return -EINVAL; 2533 return -EINVAL;
2530 2534
2531 len = PAGE_ALIGN(len); 2535 len = PAGE_ALIGN(len);
2532 if (len == 0) 2536 if (len == 0)
2533 return -EINVAL; 2537 return -EINVAL;
2534 2538
2535 /* Find the first overlapping VMA */ 2539 /* Find the first overlapping VMA */
2536 vma = find_vma(mm, start); 2540 vma = find_vma(mm, start);
2537 if (!vma) 2541 if (!vma)
2538 return 0; 2542 return 0;
2539 prev = vma->vm_prev; 2543 prev = vma->vm_prev;
2540 /* we have start < vma->vm_end */ 2544 /* we have start < vma->vm_end */
2541 2545
2542 /* if it doesn't overlap, we have nothing.. */ 2546 /* if it doesn't overlap, we have nothing.. */
2543 end = start + len; 2547 end = start + len;
2544 if (vma->vm_start >= end) 2548 if (vma->vm_start >= end)
2545 return 0; 2549 return 0;
2546 2550
2547 /* 2551 /*
2548 * If we need to split any vma, do it now to save pain later. 2552 * If we need to split any vma, do it now to save pain later.
2549 * 2553 *
2550 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 2554 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2551 * unmapped vm_area_struct will remain in use: so lower split_vma 2555 * unmapped vm_area_struct will remain in use: so lower split_vma
2552 * places tmp vma above, and higher split_vma places tmp vma below. 2556 * places tmp vma above, and higher split_vma places tmp vma below.
2553 */ 2557 */
2554 if (start > vma->vm_start) { 2558 if (start > vma->vm_start) {
2555 int error; 2559 int error;
2556 2560
2557 /* 2561 /*
2558 * Make sure that map_count on return from munmap() will 2562 * Make sure that map_count on return from munmap() will
2559 * not exceed its limit; but let map_count go just above 2563 * not exceed its limit; but let map_count go just above
2560 * its limit temporarily, to help free resources as expected. 2564 * its limit temporarily, to help free resources as expected.
2561 */ 2565 */
2562 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 2566 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2563 return -ENOMEM; 2567 return -ENOMEM;
2564 2568
2565 error = __split_vma(mm, vma, start, 0); 2569 error = __split_vma(mm, vma, start, 0);
2566 if (error) 2570 if (error)
2567 return error; 2571 return error;
2568 prev = vma; 2572 prev = vma;
2569 } 2573 }
2570 2574
2571 /* Does it split the last one? */ 2575 /* Does it split the last one? */
2572 last = find_vma(mm, end); 2576 last = find_vma(mm, end);
2573 if (last && end > last->vm_start) { 2577 if (last && end > last->vm_start) {
2574 int error = __split_vma(mm, last, end, 1); 2578 int error = __split_vma(mm, last, end, 1);
2575 if (error) 2579 if (error)
2576 return error; 2580 return error;
2577 } 2581 }
2578 vma = prev ? prev->vm_next : mm->mmap; 2582 vma = prev ? prev->vm_next : mm->mmap;
2579 2583
2580 /* 2584 /*
2581 * unlock any mlock()ed ranges before detaching vmas 2585 * unlock any mlock()ed ranges before detaching vmas
2582 */ 2586 */
2583 if (mm->locked_vm) { 2587 if (mm->locked_vm) {
2584 struct vm_area_struct *tmp = vma; 2588 struct vm_area_struct *tmp = vma;
2585 while (tmp && tmp->vm_start < end) { 2589 while (tmp && tmp->vm_start < end) {
2586 if (tmp->vm_flags & VM_LOCKED) { 2590 if (tmp->vm_flags & VM_LOCKED) {
2587 mm->locked_vm -= vma_pages(tmp); 2591 mm->locked_vm -= vma_pages(tmp);
2588 munlock_vma_pages_all(tmp); 2592 munlock_vma_pages_all(tmp);
2589 } 2593 }
2590 tmp = tmp->vm_next; 2594 tmp = tmp->vm_next;
2591 } 2595 }
2592 } 2596 }
2593 2597
2594 /* 2598 /*
2595 * Remove the vma's, and unmap the actual pages 2599 * Remove the vma's, and unmap the actual pages
2596 */ 2600 */
2597 detach_vmas_to_be_unmapped(mm, vma, prev, end); 2601 detach_vmas_to_be_unmapped(mm, vma, prev, end);
2598 unmap_region(mm, vma, prev, start, end); 2602 unmap_region(mm, vma, prev, start, end);
2599 2603
2600 /* Fix up all other VM information */ 2604 /* Fix up all other VM information */
2601 remove_vma_list(mm, vma); 2605 remove_vma_list(mm, vma);
2602 2606
2603 return 0; 2607 return 0;
2604 } 2608 }
2605 2609
2606 int vm_munmap(unsigned long start, size_t len) 2610 int vm_munmap(unsigned long start, size_t len)
2607 { 2611 {
2608 int ret; 2612 int ret;
2609 struct mm_struct *mm = current->mm; 2613 struct mm_struct *mm = current->mm;
2610 2614
2611 down_write(&mm->mmap_sem); 2615 down_write(&mm->mmap_sem);
2612 ret = do_munmap(mm, start, len); 2616 ret = do_munmap(mm, start, len);
2613 up_write(&mm->mmap_sem); 2617 up_write(&mm->mmap_sem);
2614 return ret; 2618 return ret;
2615 } 2619 }
2616 EXPORT_SYMBOL(vm_munmap); 2620 EXPORT_SYMBOL(vm_munmap);
2617 2621
2618 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 2622 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2619 { 2623 {
2620 profile_munmap(addr); 2624 profile_munmap(addr);
2621 return vm_munmap(addr, len); 2625 return vm_munmap(addr, len);
2622 } 2626 }
2623 2627
2624 static inline void verify_mm_writelocked(struct mm_struct *mm) 2628 static inline void verify_mm_writelocked(struct mm_struct *mm)
2625 { 2629 {
2626 #ifdef CONFIG_DEBUG_VM 2630 #ifdef CONFIG_DEBUG_VM
2627 if (unlikely(down_read_trylock(&mm->mmap_sem))) { 2631 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2628 WARN_ON(1); 2632 WARN_ON(1);
2629 up_read(&mm->mmap_sem); 2633 up_read(&mm->mmap_sem);
2630 } 2634 }
2631 #endif 2635 #endif
2632 } 2636 }
2633 2637
2634 /* 2638 /*
2635 * this is really a simplified "do_mmap". it only handles 2639 * this is really a simplified "do_mmap". it only handles
2636 * anonymous maps. eventually we may be able to do some 2640 * anonymous maps. eventually we may be able to do some
2637 * brk-specific accounting here. 2641 * brk-specific accounting here.
2638 */ 2642 */
2639 static unsigned long do_brk(unsigned long addr, unsigned long len) 2643 static unsigned long do_brk(unsigned long addr, unsigned long len)
2640 { 2644 {
2641 struct mm_struct *mm = current->mm; 2645 struct mm_struct *mm = current->mm;
2642 struct vm_area_struct *vma, *prev; 2646 struct vm_area_struct *vma, *prev;
2643 unsigned long flags; 2647 unsigned long flags;
2644 struct rb_node **rb_link, *rb_parent; 2648 struct rb_node **rb_link, *rb_parent;
2645 pgoff_t pgoff = addr >> PAGE_SHIFT; 2649 pgoff_t pgoff = addr >> PAGE_SHIFT;
2646 int error; 2650 int error;
2647 2651
2648 len = PAGE_ALIGN(len); 2652 len = PAGE_ALIGN(len);
2649 if (!len) 2653 if (!len)
2650 return addr; 2654 return addr;
2651 2655
2652 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2656 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2653 2657
2654 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2658 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2655 if (error & ~PAGE_MASK) 2659 if (error & ~PAGE_MASK)
2656 return error; 2660 return error;
2657 2661
2658 error = mlock_future_check(mm, mm->def_flags, len); 2662 error = mlock_future_check(mm, mm->def_flags, len);
2659 if (error) 2663 if (error)
2660 return error; 2664 return error;
2661 2665
2662 /* 2666 /*
2663 * mm->mmap_sem is required to protect against another thread 2667 * mm->mmap_sem is required to protect against another thread
2664 * changing the mappings in case we sleep. 2668 * changing the mappings in case we sleep.
2665 */ 2669 */
2666 verify_mm_writelocked(mm); 2670 verify_mm_writelocked(mm);
2667 2671
2668 /* 2672 /*
2669 * Clear old maps. this also does some error checking for us 2673 * Clear old maps. this also does some error checking for us
2670 */ 2674 */
2671 munmap_back: 2675 munmap_back:
2672 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2676 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2673 if (do_munmap(mm, addr, len)) 2677 if (do_munmap(mm, addr, len))
2674 return -ENOMEM; 2678 return -ENOMEM;
2675 goto munmap_back; 2679 goto munmap_back;
2676 } 2680 }
2677 2681
2678 /* Check against address space limits *after* clearing old maps... */ 2682 /* Check against address space limits *after* clearing old maps... */
2679 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 2683 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
2680 return -ENOMEM; 2684 return -ENOMEM;
2681 2685
2682 if (mm->map_count > sysctl_max_map_count) 2686 if (mm->map_count > sysctl_max_map_count)
2683 return -ENOMEM; 2687 return -ENOMEM;
2684 2688
2685 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2689 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2686 return -ENOMEM; 2690 return -ENOMEM;
2687 2691
2688 /* Can we just expand an old private anonymous mapping? */ 2692 /* Can we just expand an old private anonymous mapping? */
2689 vma = vma_merge(mm, prev, addr, addr + len, flags, 2693 vma = vma_merge(mm, prev, addr, addr + len, flags,
2690 NULL, NULL, pgoff, NULL); 2694 NULL, NULL, pgoff, NULL);
2691 if (vma) 2695 if (vma)
2692 goto out; 2696 goto out;
2693 2697
2694 /* 2698 /*
2695 * create a vma struct for an anonymous mapping 2699 * create a vma struct for an anonymous mapping
2696 */ 2700 */
2697 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2701 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2698 if (!vma) { 2702 if (!vma) {
2699 vm_unacct_memory(len >> PAGE_SHIFT); 2703 vm_unacct_memory(len >> PAGE_SHIFT);
2700 return -ENOMEM; 2704 return -ENOMEM;
2701 } 2705 }
2702 2706
2703 INIT_LIST_HEAD(&vma->anon_vma_chain); 2707 INIT_LIST_HEAD(&vma->anon_vma_chain);
2704 vma->vm_mm = mm; 2708 vma->vm_mm = mm;
2705 vma->vm_start = addr; 2709 vma->vm_start = addr;
2706 vma->vm_end = addr + len; 2710 vma->vm_end = addr + len;
2707 vma->vm_pgoff = pgoff; 2711 vma->vm_pgoff = pgoff;
2708 vma->vm_flags = flags; 2712 vma->vm_flags = flags;
2709 vma->vm_page_prot = vm_get_page_prot(flags); 2713 vma->vm_page_prot = vm_get_page_prot(flags);
2710 vma_link(mm, vma, prev, rb_link, rb_parent); 2714 vma_link(mm, vma, prev, rb_link, rb_parent);
2711 out: 2715 out:
2712 perf_event_mmap(vma); 2716 perf_event_mmap(vma);
2713 mm->total_vm += len >> PAGE_SHIFT; 2717 mm->total_vm += len >> PAGE_SHIFT;
2714 if (flags & VM_LOCKED) 2718 if (flags & VM_LOCKED)
2715 mm->locked_vm += (len >> PAGE_SHIFT); 2719 mm->locked_vm += (len >> PAGE_SHIFT);
2716 vma->vm_flags |= VM_SOFTDIRTY; 2720 vma->vm_flags |= VM_SOFTDIRTY;
2717 return addr; 2721 return addr;
2718 } 2722 }
2719 2723
2720 unsigned long vm_brk(unsigned long addr, unsigned long len) 2724 unsigned long vm_brk(unsigned long addr, unsigned long len)
2721 { 2725 {
2722 struct mm_struct *mm = current->mm; 2726 struct mm_struct *mm = current->mm;
2723 unsigned long ret; 2727 unsigned long ret;
2724 bool populate; 2728 bool populate;
2725 2729
2726 down_write(&mm->mmap_sem); 2730 down_write(&mm->mmap_sem);
2727 ret = do_brk(addr, len); 2731 ret = do_brk(addr, len);
2728 populate = ((mm->def_flags & VM_LOCKED) != 0); 2732 populate = ((mm->def_flags & VM_LOCKED) != 0);
2729 up_write(&mm->mmap_sem); 2733 up_write(&mm->mmap_sem);
2730 if (populate) 2734 if (populate)
2731 mm_populate(addr, len); 2735 mm_populate(addr, len);
2732 return ret; 2736 return ret;
2733 } 2737 }
2734 EXPORT_SYMBOL(vm_brk); 2738 EXPORT_SYMBOL(vm_brk);
2735 2739
2736 /* Release all mmaps. */ 2740 /* Release all mmaps. */
2737 void exit_mmap(struct mm_struct *mm) 2741 void exit_mmap(struct mm_struct *mm)
2738 { 2742 {
2739 struct mmu_gather tlb; 2743 struct mmu_gather tlb;
2740 struct vm_area_struct *vma; 2744 struct vm_area_struct *vma;
2741 unsigned long nr_accounted = 0; 2745 unsigned long nr_accounted = 0;
2742 2746
2743 /* mm's last user has gone, and its about to be pulled down */ 2747 /* mm's last user has gone, and its about to be pulled down */
2744 mmu_notifier_release(mm); 2748 mmu_notifier_release(mm);
2745 2749
2746 if (mm->locked_vm) { 2750 if (mm->locked_vm) {
2747 vma = mm->mmap; 2751 vma = mm->mmap;
2748 while (vma) { 2752 while (vma) {
2749 if (vma->vm_flags & VM_LOCKED) 2753 if (vma->vm_flags & VM_LOCKED)
2750 munlock_vma_pages_all(vma); 2754 munlock_vma_pages_all(vma);
2751 vma = vma->vm_next; 2755 vma = vma->vm_next;
2752 } 2756 }
2753 } 2757 }
2754 2758
2755 arch_exit_mmap(mm); 2759 arch_exit_mmap(mm);
2756 2760
2757 vma = mm->mmap; 2761 vma = mm->mmap;
2758 if (!vma) /* Can happen if dup_mmap() received an OOM */ 2762 if (!vma) /* Can happen if dup_mmap() received an OOM */
2759 return; 2763 return;
2760 2764
2761 lru_add_drain(); 2765 lru_add_drain();
2762 flush_cache_mm(mm); 2766 flush_cache_mm(mm);
2763 tlb_gather_mmu(&tlb, mm, 0, -1); 2767 tlb_gather_mmu(&tlb, mm, 0, -1);
2764 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2768 /* update_hiwater_rss(mm) here? but nobody should be looking */
2765 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2769 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2766 unmap_vmas(&tlb, vma, 0, -1); 2770 unmap_vmas(&tlb, vma, 0, -1);
2767 2771
2768 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); 2772 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2769 tlb_finish_mmu(&tlb, 0, -1); 2773 tlb_finish_mmu(&tlb, 0, -1);
2770 2774
2771 /* 2775 /*
2772 * Walk the list again, actually closing and freeing it, 2776 * Walk the list again, actually closing and freeing it,
2773 * with preemption enabled, without holding any MM locks. 2777 * with preemption enabled, without holding any MM locks.
2774 */ 2778 */
2775 while (vma) { 2779 while (vma) {
2776 if (vma->vm_flags & VM_ACCOUNT) 2780 if (vma->vm_flags & VM_ACCOUNT)
2777 nr_accounted += vma_pages(vma); 2781 nr_accounted += vma_pages(vma);
2778 vma = remove_vma(vma); 2782 vma = remove_vma(vma);
2779 } 2783 }
2780 vm_unacct_memory(nr_accounted); 2784 vm_unacct_memory(nr_accounted);
2781 2785
2782 WARN_ON(atomic_long_read(&mm->nr_ptes) > 2786 WARN_ON(atomic_long_read(&mm->nr_ptes) >
2783 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2787 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2784 } 2788 }
2785 2789
2786 /* Insert vm structure into process list sorted by address 2790 /* Insert vm structure into process list sorted by address
2787 * and into the inode's i_mmap tree. If vm_file is non-NULL 2791 * and into the inode's i_mmap tree. If vm_file is non-NULL
2788 * then i_mmap_mutex is taken here. 2792 * then i_mmap_mutex is taken here.
2789 */ 2793 */
2790 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 2794 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2791 { 2795 {
2792 struct vm_area_struct *prev; 2796 struct vm_area_struct *prev;
2793 struct rb_node **rb_link, *rb_parent; 2797 struct rb_node **rb_link, *rb_parent;
2794 2798
2795 /* 2799 /*
2796 * The vm_pgoff of a purely anonymous vma should be irrelevant 2800 * The vm_pgoff of a purely anonymous vma should be irrelevant
2797 * until its first write fault, when page's anon_vma and index 2801 * until its first write fault, when page's anon_vma and index
2798 * are set. But now set the vm_pgoff it will almost certainly 2802 * are set. But now set the vm_pgoff it will almost certainly
2799 * end up with (unless mremap moves it elsewhere before that 2803 * end up with (unless mremap moves it elsewhere before that
2800 * first wfault), so /proc/pid/maps tells a consistent story. 2804 * first wfault), so /proc/pid/maps tells a consistent story.
2801 * 2805 *
2802 * By setting it to reflect the virtual start address of the 2806 * By setting it to reflect the virtual start address of the
2803 * vma, merges and splits can happen in a seamless way, just 2807 * vma, merges and splits can happen in a seamless way, just
2804 * using the existing file pgoff checks and manipulations. 2808 * using the existing file pgoff checks and manipulations.
2805 * Similarly in do_mmap_pgoff and in do_brk. 2809 * Similarly in do_mmap_pgoff and in do_brk.
2806 */ 2810 */
2807 if (!vma->vm_file) { 2811 if (!vma->vm_file) {
2808 BUG_ON(vma->anon_vma); 2812 BUG_ON(vma->anon_vma);
2809 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2813 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2810 } 2814 }
2811 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 2815 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2812 &prev, &rb_link, &rb_parent)) 2816 &prev, &rb_link, &rb_parent))
2813 return -ENOMEM; 2817 return -ENOMEM;
2814 if ((vma->vm_flags & VM_ACCOUNT) && 2818 if ((vma->vm_flags & VM_ACCOUNT) &&
2815 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2819 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2816 return -ENOMEM; 2820 return -ENOMEM;
2817 2821
2818 vma_link(mm, vma, prev, rb_link, rb_parent); 2822 vma_link(mm, vma, prev, rb_link, rb_parent);
2819 return 0; 2823 return 0;
2820 } 2824 }
2821 2825
2822 /* 2826 /*
2823 * Copy the vma structure to a new location in the same mm, 2827 * Copy the vma structure to a new location in the same mm,
2824 * prior to moving page table entries, to effect an mremap move. 2828 * prior to moving page table entries, to effect an mremap move.
2825 */ 2829 */
2826 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2830 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2827 unsigned long addr, unsigned long len, pgoff_t pgoff, 2831 unsigned long addr, unsigned long len, pgoff_t pgoff,
2828 bool *need_rmap_locks) 2832 bool *need_rmap_locks)
2829 { 2833 {
2830 struct vm_area_struct *vma = *vmap; 2834 struct vm_area_struct *vma = *vmap;
2831 unsigned long vma_start = vma->vm_start; 2835 unsigned long vma_start = vma->vm_start;
2832 struct mm_struct *mm = vma->vm_mm; 2836 struct mm_struct *mm = vma->vm_mm;
2833 struct vm_area_struct *new_vma, *prev; 2837 struct vm_area_struct *new_vma, *prev;
2834 struct rb_node **rb_link, *rb_parent; 2838 struct rb_node **rb_link, *rb_parent;
2835 bool faulted_in_anon_vma = true; 2839 bool faulted_in_anon_vma = true;
2836 2840
2837 /* 2841 /*
2838 * If anonymous vma has not yet been faulted, update new pgoff 2842 * If anonymous vma has not yet been faulted, update new pgoff
2839 * to match new location, to increase its chance of merging. 2843 * to match new location, to increase its chance of merging.
2840 */ 2844 */
2841 if (unlikely(!vma->vm_file && !vma->anon_vma)) { 2845 if (unlikely(!vma->vm_file && !vma->anon_vma)) {
2842 pgoff = addr >> PAGE_SHIFT; 2846 pgoff = addr >> PAGE_SHIFT;
2843 faulted_in_anon_vma = false; 2847 faulted_in_anon_vma = false;
2844 } 2848 }
2845 2849
2846 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) 2850 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2847 return NULL; /* should never get here */ 2851 return NULL; /* should never get here */
2848 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2852 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2849 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2853 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2850 if (new_vma) { 2854 if (new_vma) {
2851 /* 2855 /*
2852 * Source vma may have been merged into new_vma 2856 * Source vma may have been merged into new_vma
2853 */ 2857 */
2854 if (unlikely(vma_start >= new_vma->vm_start && 2858 if (unlikely(vma_start >= new_vma->vm_start &&
2855 vma_start < new_vma->vm_end)) { 2859 vma_start < new_vma->vm_end)) {
2856 /* 2860 /*
2857 * The only way we can get a vma_merge with 2861 * The only way we can get a vma_merge with
2858 * self during an mremap is if the vma hasn't 2862 * self during an mremap is if the vma hasn't
2859 * been faulted in yet and we were allowed to 2863 * been faulted in yet and we were allowed to
2860 * reset the dst vma->vm_pgoff to the 2864 * reset the dst vma->vm_pgoff to the
2861 * destination address of the mremap to allow 2865 * destination address of the mremap to allow
2862 * the merge to happen. mremap must change the 2866 * the merge to happen. mremap must change the
2863 * vm_pgoff linearity between src and dst vmas 2867 * vm_pgoff linearity between src and dst vmas
2864 * (in turn preventing a vma_merge) to be 2868 * (in turn preventing a vma_merge) to be
2865 * safe. It is only safe to keep the vm_pgoff 2869 * safe. It is only safe to keep the vm_pgoff
2866 * linear if there are no pages mapped yet. 2870 * linear if there are no pages mapped yet.
2867 */ 2871 */
2868 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); 2872 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
2869 *vmap = vma = new_vma; 2873 *vmap = vma = new_vma;
2870 } 2874 }
2871 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2875 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2872 } else { 2876 } else {
2873 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2877 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2874 if (new_vma) { 2878 if (new_vma) {
2875 *new_vma = *vma; 2879 *new_vma = *vma;
2876 new_vma->vm_start = addr; 2880 new_vma->vm_start = addr;
2877 new_vma->vm_end = addr + len; 2881 new_vma->vm_end = addr + len;
2878 new_vma->vm_pgoff = pgoff; 2882 new_vma->vm_pgoff = pgoff;
2879 if (vma_dup_policy(vma, new_vma)) 2883 if (vma_dup_policy(vma, new_vma))
2880 goto out_free_vma; 2884 goto out_free_vma;
2881 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2885 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2882 if (anon_vma_clone(new_vma, vma)) 2886 if (anon_vma_clone(new_vma, vma))
2883 goto out_free_mempol; 2887 goto out_free_mempol;
2884 if (new_vma->vm_file) 2888 if (new_vma->vm_file)
2885 get_file(new_vma->vm_file); 2889 get_file(new_vma->vm_file);
2886 if (new_vma->vm_ops && new_vma->vm_ops->open) 2890 if (new_vma->vm_ops && new_vma->vm_ops->open)
2887 new_vma->vm_ops->open(new_vma); 2891 new_vma->vm_ops->open(new_vma);
2888 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2892 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2889 *need_rmap_locks = false; 2893 *need_rmap_locks = false;
2890 } 2894 }
2891 } 2895 }
2892 return new_vma; 2896 return new_vma;
2893 2897
2894 out_free_mempol: 2898 out_free_mempol:
2895 mpol_put(vma_policy(new_vma)); 2899 mpol_put(vma_policy(new_vma));
2896 out_free_vma: 2900 out_free_vma:
2897 kmem_cache_free(vm_area_cachep, new_vma); 2901 kmem_cache_free(vm_area_cachep, new_vma);
2898 return NULL; 2902 return NULL;
2899 } 2903 }
2900 2904
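For context on the caller side, copy_vma() reports every failure as a NULL pointer. A hedged sketch of roughly how mremap's move_vma() (in mm/mremap.c; simplified and renamed here) consumes it, collapsing any internal error, including a failed anon_vma_clone(), into -ENOMEM:

static unsigned long hypothetical_move(struct vm_area_struct *vma,
		unsigned long new_addr, unsigned long new_len,
		pgoff_t new_pgoff)
{
	struct vm_area_struct *new_vma;
	bool need_rmap_locks;

	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma)
		return -ENOMEM;		/* all copy_vma() failures look alike */

	/* ... move the page tables, then unmap the old range ... */
	return new_addr;
}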
2901 /* 2905 /*
2902 * Return true if the calling process may expand its vm space by the passed 2906 * Return true if the calling process may expand its vm space by the passed
2903 * number of pages 2907 * number of pages
2904 */ 2908 */
2905 int may_expand_vm(struct mm_struct *mm, unsigned long npages) 2909 int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2906 { 2910 {
2907 unsigned long cur = mm->total_vm; /* pages */ 2911 unsigned long cur = mm->total_vm; /* pages */
2908 unsigned long lim; 2912 unsigned long lim;
2909 2913
2910 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; 2914 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2911 2915
2912 if (cur + npages > lim) 2916 if (cur + npages > lim)
2913 return 0; 2917 return 0;
2914 return 1; 2918 return 1;
2915 } 2919 }
2916 2920
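A hedged sketch of how a hypothetical in-kernel caller would use may_expand_vm() (the real call sites are elsewhere in mm/mmap.c and mm/mremap.c; the function name below is made up): check the limit first, then account the pages:

static int hypothetical_grow(struct mm_struct *mm, unsigned long len)
{
	unsigned long npages = len >> PAGE_SHIFT;

	if (!may_expand_vm(mm, npages))
		return -ENOMEM;		/* would exceed RLIMIT_AS */

	mm->total_vm += npages;		/* account the newly mapped pages */
	return 0;
}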
2917 static int special_mapping_fault(struct vm_area_struct *vma, 2921 static int special_mapping_fault(struct vm_area_struct *vma,
2918 struct vm_fault *vmf); 2922 struct vm_fault *vmf);
2919 2923
2920 /* 2924 /*
2921 * Having a close hook prevents vma merging regardless of flags. 2925 * Having a close hook prevents vma merging regardless of flags.
2922 */ 2926 */
2923 static void special_mapping_close(struct vm_area_struct *vma) 2927 static void special_mapping_close(struct vm_area_struct *vma)
2924 { 2928 {
2925 } 2929 }
2926 2930
2927 static const char *special_mapping_name(struct vm_area_struct *vma) 2931 static const char *special_mapping_name(struct vm_area_struct *vma)
2928 { 2932 {
2929 return ((struct vm_special_mapping *)vma->vm_private_data)->name; 2933 return ((struct vm_special_mapping *)vma->vm_private_data)->name;
2930 } 2934 }
2931 2935
2932 static const struct vm_operations_struct special_mapping_vmops = { 2936 static const struct vm_operations_struct special_mapping_vmops = {
2933 .close = special_mapping_close, 2937 .close = special_mapping_close,
2934 .fault = special_mapping_fault, 2938 .fault = special_mapping_fault,
2935 .name = special_mapping_name, 2939 .name = special_mapping_name,
2936 }; 2940 };
2937 2941
2938 static const struct vm_operations_struct legacy_special_mapping_vmops = { 2942 static const struct vm_operations_struct legacy_special_mapping_vmops = {
2939 .close = special_mapping_close, 2943 .close = special_mapping_close,
2940 .fault = special_mapping_fault, 2944 .fault = special_mapping_fault,
2941 }; 2945 };
2942 2946
2943 static int special_mapping_fault(struct vm_area_struct *vma, 2947 static int special_mapping_fault(struct vm_area_struct *vma,
2944 struct vm_fault *vmf) 2948 struct vm_fault *vmf)
2945 { 2949 {
2946 pgoff_t pgoff; 2950 pgoff_t pgoff;
2947 struct page **pages; 2951 struct page **pages;
2948 2952
2949 /* 2953 /*
2950 * special mappings have no vm_file, and in that case, the mm 2954 * special mappings have no vm_file, and in that case, the mm
2951 * uses vm_pgoff internally. So we have to subtract it from here. 2955 * uses vm_pgoff internally. So we have to subtract it from here.
2952 * We are allowed to do this because we are the mm; do not copy 2956 * We are allowed to do this because we are the mm; do not copy
2953 * this code into drivers! 2957 * this code into drivers!
2954 */ 2958 */
2955 pgoff = vmf->pgoff - vma->vm_pgoff; 2959 pgoff = vmf->pgoff - vma->vm_pgoff;
2956 2960
2957 if (vma->vm_ops == &legacy_special_mapping_vmops) 2961 if (vma->vm_ops == &legacy_special_mapping_vmops)
2958 pages = vma->vm_private_data; 2962 pages = vma->vm_private_data;
2959 else 2963 else
2960 pages = ((struct vm_special_mapping *)vma->vm_private_data)-> 2964 pages = ((struct vm_special_mapping *)vma->vm_private_data)->
2961 pages; 2965 pages;
2962 2966
2963 for (; pgoff && *pages; ++pages) 2967 for (; pgoff && *pages; ++pages)
2964 pgoff--; 2968 pgoff--;
2965 2969
2966 if (*pages) { 2970 if (*pages) {
2967 struct page *page = *pages; 2971 struct page *page = *pages;
2968 get_page(page); 2972 get_page(page);
2969 vmf->page = page; 2973 vmf->page = page;
2970 return 0; 2974 return 0;
2971 } 2975 }
2972 2976
2973 return VM_FAULT_SIGBUS; 2977 return VM_FAULT_SIGBUS;
2974 } 2978 }
2975 2979
2976 static struct vm_area_struct *__install_special_mapping( 2980 static struct vm_area_struct *__install_special_mapping(
2977 struct mm_struct *mm, 2981 struct mm_struct *mm,
2978 unsigned long addr, unsigned long len, 2982 unsigned long addr, unsigned long len,
2979 unsigned long vm_flags, const struct vm_operations_struct *ops, 2983 unsigned long vm_flags, const struct vm_operations_struct *ops,
2980 void *priv) 2984 void *priv)
2981 { 2985 {
2982 int ret; 2986 int ret;
2983 struct vm_area_struct *vma; 2987 struct vm_area_struct *vma;
2984 2988
2985 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2989 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2986 if (unlikely(vma == NULL)) 2990 if (unlikely(vma == NULL))
2987 return ERR_PTR(-ENOMEM); 2991 return ERR_PTR(-ENOMEM);
2988 2992
2989 INIT_LIST_HEAD(&vma->anon_vma_chain); 2993 INIT_LIST_HEAD(&vma->anon_vma_chain);
2990 vma->vm_mm = mm; 2994 vma->vm_mm = mm;
2991 vma->vm_start = addr; 2995 vma->vm_start = addr;
2992 vma->vm_end = addr + len; 2996 vma->vm_end = addr + len;
2993 2997
2994 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; 2998 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
2995 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2999 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2996 3000
2997 vma->vm_ops = ops; 3001 vma->vm_ops = ops;
2998 vma->vm_private_data = priv; 3002 vma->vm_private_data = priv;
2999 3003
3000 ret = insert_vm_struct(mm, vma); 3004 ret = insert_vm_struct(mm, vma);
3001 if (ret) 3005 if (ret)
3002 goto out; 3006 goto out;
3003 3007
3004 mm->total_vm += len >> PAGE_SHIFT; 3008 mm->total_vm += len >> PAGE_SHIFT;
3005 3009
3006 perf_event_mmap(vma); 3010 perf_event_mmap(vma);
3007 3011
3008 return vma; 3012 return vma;
3009 3013
3010 out: 3014 out:
3011 kmem_cache_free(vm_area_cachep, vma); 3015 kmem_cache_free(vm_area_cachep, vma);
3012 return ERR_PTR(ret); 3016 return ERR_PTR(ret);
3013 } 3017 }
3014 3018
3015 /* 3019 /*
3016 * Called with mm->mmap_sem held for writing. 3020 * Called with mm->mmap_sem held for writing.
3017 * Insert a new vma covering the given region, with the given flags. 3021 * Insert a new vma covering the given region, with the given flags.
3018 * Its pages are supplied by the given array of struct page *. 3022 * Its pages are supplied by the given array of struct page *.
3019 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. 3023 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
3020 * The region past the last page supplied will always produce SIGBUS. 3024 * The region past the last page supplied will always produce SIGBUS.
3021 * The array pointer and the pages it points to are assumed to stay alive 3025 * The array pointer and the pages it points to are assumed to stay alive
3022 * for as long as this mapping might exist. 3026 * for as long as this mapping might exist.
3023 */ 3027 */
3024 struct vm_area_struct *_install_special_mapping( 3028 struct vm_area_struct *_install_special_mapping(
3025 struct mm_struct *mm, 3029 struct mm_struct *mm,
3026 unsigned long addr, unsigned long len, 3030 unsigned long addr, unsigned long len,
3027 unsigned long vm_flags, const struct vm_special_mapping *spec) 3031 unsigned long vm_flags, const struct vm_special_mapping *spec)
3028 { 3032 {
3029 return __install_special_mapping(mm, addr, len, vm_flags, 3033 return __install_special_mapping(mm, addr, len, vm_flags,
3030 &special_mapping_vmops, (void *)spec); 3034 &special_mapping_vmops, (void *)spec);
3031 } 3035 }
3032 3036
3033 int install_special_mapping(struct mm_struct *mm, 3037 int install_special_mapping(struct mm_struct *mm,
3034 unsigned long addr, unsigned long len, 3038 unsigned long addr, unsigned long len,
3035 unsigned long vm_flags, struct page **pages) 3039 unsigned long vm_flags, struct page **pages)
3036 { 3040 {
3037 struct vm_area_struct *vma = __install_special_mapping( 3041 struct vm_area_struct *vma = __install_special_mapping(
3038 mm, addr, len, vm_flags, &legacy_special_mapping_vmops, 3042 mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
3039 (void *)pages); 3043 (void *)pages);
3040 3044
3041 return PTR_ERR_OR_ZERO(vma); 3045 return PTR_ERR_OR_ZERO(vma);
3042 } 3046 }
3043 3047
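A minimal sketch of installing a one-page special mapping through the newer interface, assuming the caller holds mm->mmap_sem for writing and has filled in the backing page elsewhere (the structure and mapping name below are purely illustrative):

static struct page *my_pages[2];	/* [0] set up elsewhere, [1] stays NULL */

static const struct vm_special_mapping my_mapping = {
	.name  = "[my_special]",
	.pages = my_pages,
};

static int map_my_page(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	vma = _install_special_mapping(mm, addr, PAGE_SIZE,
				       VM_READ | VM_MAYREAD,
				       &my_mapping);
	return PTR_ERR_OR_ZERO(vma);
}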
3044 static DEFINE_MUTEX(mm_all_locks_mutex); 3048 static DEFINE_MUTEX(mm_all_locks_mutex);
3045 3049
3046 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 3050 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
3047 { 3051 {
3048 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 3052 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3049 /* 3053 /*
3050 * The LSB of head.next can't change from under us 3054 * The LSB of head.next can't change from under us
3051 * because we hold the mm_all_locks_mutex. 3055 * because we hold the mm_all_locks_mutex.
3052 */ 3056 */
3053 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); 3057 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
3054 /* 3058 /*
3055 * We can safely modify head.next after taking the 3059 * We can safely modify head.next after taking the
3056 * anon_vma->root->rwsem. If some other vma in this mm shares 3060 * anon_vma->root->rwsem. If some other vma in this mm shares
3057 * the same anon_vma we won't take it again. 3061 * the same anon_vma we won't take it again.
3058 * 3062 *
3059 * No need of atomic instructions here, head.next 3063 * No need of atomic instructions here, head.next
3060 * can't change from under us thanks to the 3064 * can't change from under us thanks to the
3061 * anon_vma->root->rwsem. 3065 * anon_vma->root->rwsem.
3062 */ 3066 */
3063 if (__test_and_set_bit(0, (unsigned long *) 3067 if (__test_and_set_bit(0, (unsigned long *)
3064 &anon_vma->root->rb_root.rb_node)) 3068 &anon_vma->root->rb_root.rb_node))
3065 BUG(); 3069 BUG();
3066 } 3070 }
3067 } 3071 }
3068 3072
3069 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 3073 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3070 { 3074 {
3071 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3075 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3072 /* 3076 /*
3073 * AS_MM_ALL_LOCKS can't change from under us because 3077 * AS_MM_ALL_LOCKS can't change from under us because
3074 * we hold the mm_all_locks_mutex. 3078 * we hold the mm_all_locks_mutex.
3075 * 3079 *
3076 * Operations on ->flags have to be atomic because 3080 * Operations on ->flags have to be atomic because
3077 * even if AS_MM_ALL_LOCKS is stable thanks to the 3081 * even if AS_MM_ALL_LOCKS is stable thanks to the
3078 * mm_all_locks_mutex, there may be other cpus 3082 * mm_all_locks_mutex, there may be other cpus
3079 * changing other bitflags in parallel to us. 3083 * changing other bitflags in parallel to us.
3080 */ 3084 */
3081 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 3085 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3082 BUG(); 3086 BUG();
3083 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); 3087 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
3084 } 3088 }
3085 } 3089 }
3086 3090
3087 /* 3091 /*
3088 * This operation locks against the VM for all pte/vma/mm related 3092 * This operation locks against the VM for all pte/vma/mm related
3089 * operations that could ever happen on a certain mm. This includes 3093 * operations that could ever happen on a certain mm. This includes
3090 * vmtruncate, try_to_unmap, and all page faults. 3094 * vmtruncate, try_to_unmap, and all page faults.
3091 * 3095 *
3092 * The caller must take the mmap_sem in write mode before calling 3096 * The caller must take the mmap_sem in write mode before calling
3093 * mm_take_all_locks(). The caller isn't allowed to release the 3097 * mm_take_all_locks(). The caller isn't allowed to release the
3094 * mmap_sem until mm_drop_all_locks() returns. 3098 * mmap_sem until mm_drop_all_locks() returns.
3095 * 3099 *
3096 * mmap_sem in write mode is required in order to block all operations 3100 * mmap_sem in write mode is required in order to block all operations
3097 * that could modify pagetables and free pages without need of 3101 * that could modify pagetables and free pages without need of
3098 * altering the vma layout (for example populate_range() with 3102 * altering the vma layout (for example populate_range() with
3099 * nonlinear vmas). It's also needed in write mode to prevent new 3103 * nonlinear vmas). It's also needed in write mode to prevent new
3100 * anon_vmas from being associated with existing vmas. 3104 * anon_vmas from being associated with existing vmas.
3101 * 3105 *
3102 * A single task can't take more than one mm_take_all_locks() in a row 3106 * A single task can't take more than one mm_take_all_locks() in a row
3103 * or it would deadlock. 3107 * or it would deadlock.
3104 * 3108 *
3105 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 3109 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
3106 * mapping->flags avoid taking the same lock twice, if more than one 3110 * mapping->flags avoid taking the same lock twice, if more than one
3107 * vma in this mm is backed by the same anon_vma or address_space. 3111 * vma in this mm is backed by the same anon_vma or address_space.
3108 * 3112 *
3109 * We can take all the locks in random order because the VM code 3113 * We can take all the locks in random order because the VM code
3110 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never 3114 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
3111 * takes more than one of them in a row. Secondly we're protected 3115 * takes more than one of them in a row. Secondly we're protected
3112 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3116 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3113 * 3117 *
3114 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations 3118 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
3115 * that may have to take thousands of locks. 3119 * that may have to take thousands of locks.
3116 * 3120 *
3117 * mm_take_all_locks() can fail if it's interrupted by signals. 3121 * mm_take_all_locks() can fail if it's interrupted by signals.
3118 */ 3122 */
3119 int mm_take_all_locks(struct mm_struct *mm) 3123 int mm_take_all_locks(struct mm_struct *mm)
3120 { 3124 {
3121 struct vm_area_struct *vma; 3125 struct vm_area_struct *vma;
3122 struct anon_vma_chain *avc; 3126 struct anon_vma_chain *avc;
3123 3127
3124 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3128 BUG_ON(down_read_trylock(&mm->mmap_sem));
3125 3129
3126 mutex_lock(&mm_all_locks_mutex); 3130 mutex_lock(&mm_all_locks_mutex);
3127 3131
3128 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3132 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3129 if (signal_pending(current)) 3133 if (signal_pending(current))
3130 goto out_unlock; 3134 goto out_unlock;
3131 if (vma->vm_file && vma->vm_file->f_mapping) 3135 if (vma->vm_file && vma->vm_file->f_mapping)
3132 vm_lock_mapping(mm, vma->vm_file->f_mapping); 3136 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3133 } 3137 }
3134 3138
3135 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3139 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3136 if (signal_pending(current)) 3140 if (signal_pending(current))
3137 goto out_unlock; 3141 goto out_unlock;
3138 if (vma->anon_vma) 3142 if (vma->anon_vma)
3139 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3143 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3140 vm_lock_anon_vma(mm, avc->anon_vma); 3144 vm_lock_anon_vma(mm, avc->anon_vma);
3141 } 3145 }
3142 3146
3143 return 0; 3147 return 0;
3144 3148
3145 out_unlock: 3149 out_unlock:
3146 mm_drop_all_locks(mm); 3150 mm_drop_all_locks(mm);
3147 return -EINTR; 3151 return -EINTR;
3148 } 3152 }
3149 3153
3150 static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 3154 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3151 { 3155 {
3152 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 3156 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3153 /* 3157 /*
3154 * The LSB of head.next can't change to 0 from under 3158 * The LSB of head.next can't change to 0 from under
3155 * us because we hold the mm_all_locks_mutex. 3159 * us because we hold the mm_all_locks_mutex.
3156 * 3160 *
3157 * We must however clear the bitflag before unlocking 3161 * We must however clear the bitflag before unlocking
3158 * the vma so the users using the anon_vma->rb_root will 3162 * the vma so the users using the anon_vma->rb_root will
3159 * never see our bitflag. 3163 * never see our bitflag.
3160 * 3164 *
3161 * No need of atomic instructions here, head.next 3165 * No need of atomic instructions here, head.next
3162 * can't change from under us until we release the 3166 * can't change from under us until we release the
3163 * anon_vma->root->rwsem. 3167 * anon_vma->root->rwsem.
3164 */ 3168 */
3165 if (!__test_and_clear_bit(0, (unsigned long *) 3169 if (!__test_and_clear_bit(0, (unsigned long *)
3166 &anon_vma->root->rb_root.rb_node)) 3170 &anon_vma->root->rb_root.rb_node))
3167 BUG(); 3171 BUG();
3168 anon_vma_unlock_write(anon_vma); 3172 anon_vma_unlock_write(anon_vma);
3169 } 3173 }
3170 } 3174 }
3171 3175
3172 static void vm_unlock_mapping(struct address_space *mapping) 3176 static void vm_unlock_mapping(struct address_space *mapping)
3173 { 3177 {
3174 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3178 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3175 /* 3179 /*
3176 * AS_MM_ALL_LOCKS can't change to 0 from under us 3180 * AS_MM_ALL_LOCKS can't change to 0 from under us
3177 * because we hold the mm_all_locks_mutex. 3181 * because we hold the mm_all_locks_mutex.
3178 */ 3182 */
3179 mutex_unlock(&mapping->i_mmap_mutex); 3183 mutex_unlock(&mapping->i_mmap_mutex);
3180 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3184 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3181 &mapping->flags)) 3185 &mapping->flags))
3182 BUG(); 3186 BUG();
3183 } 3187 }
3184 } 3188 }
3185 3189
3186 /* 3190 /*
3187 * The mmap_sem cannot be released by the caller until 3191 * The mmap_sem cannot be released by the caller until
3188 * mm_drop_all_locks() returns. 3192 * mm_drop_all_locks() returns.
3189 */ 3193 */
3190 void mm_drop_all_locks(struct mm_struct *mm) 3194 void mm_drop_all_locks(struct mm_struct *mm)
3191 { 3195 {
3192 struct vm_area_struct *vma; 3196 struct vm_area_struct *vma;
3193 struct anon_vma_chain *avc; 3197 struct anon_vma_chain *avc;
3194 3198
3195 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3199 BUG_ON(down_read_trylock(&mm->mmap_sem));
3196 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 3200 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3197 3201
3198 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3202 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3199 if (vma->anon_vma) 3203 if (vma->anon_vma)
3200 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3204 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3201 vm_unlock_anon_vma(avc->anon_vma); 3205 vm_unlock_anon_vma(avc->anon_vma);
3202 if (vma->vm_file && vma->vm_file->f_mapping) 3206 if (vma->vm_file && vma->vm_file->f_mapping)
3203 vm_unlock_mapping(vma->vm_file->f_mapping); 3207 vm_unlock_mapping(vma->vm_file->f_mapping);
3204 } 3208 }
3205 3209
3206 mutex_unlock(&mm_all_locks_mutex); 3210 mutex_unlock(&mm_all_locks_mutex);
3207 } 3211 }
3208 3212
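A hedged sketch of the intended usage pattern for the pair above (mmu_notifier registration does something along these lines): take mmap_sem for writing, take all the locks, do the work, then drop everything before releasing mmap_sem:

static int hypothetical_under_all_locks(struct mm_struct *mm)
{
	int ret;

	down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);
	if (ret)
		goto out;	/* -EINTR: interrupted by a signal */

	/* ... safely walk every vma, anon_vma and mapping of this mm ... */

	mm_drop_all_locks(mm);
out:
	up_write(&mm->mmap_sem);
	return ret;
}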
3209 /* 3213 /*
3210 * initialise the VMA slab 3214 * initialise the VMA slab
3211 */ 3215 */
3212 void __init mmap_init(void) 3216 void __init mmap_init(void)
3213 { 3217 {
3214 int ret; 3218 int ret;
3215 3219
3216 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 3220 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3217 VM_BUG_ON(ret); 3221 VM_BUG_ON(ret);
3218 } 3222 }
3219 3223
3220 /* 3224 /*
3221 * Initialise sysctl_user_reserve_kbytes. 3225 * Initialise sysctl_user_reserve_kbytes.
3222 * 3226 *
3223 * This is intended to keep a user from starting a single memory-hogging 3227 * This is intended to keep a user from starting a single memory-hogging
3224 * process that leaves them unable to recover (kill the hog) in OVERCOMMIT_NEVER 3228 * process that leaves them unable to recover (kill the hog) in OVERCOMMIT_NEVER
3225 * mode. 3229 * mode.
3226 * 3230 *
3227 * The default value is min(3% of free memory, 128MB) 3231 * The default value is min(3% of free memory, 128MB)
3228 * 128MB is enough to recover with sshd/login, bash, and top/kill. 3232 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3229 */ 3233 */
3230 static int init_user_reserve(void) 3234 static int init_user_reserve(void)
3231 { 3235 {
3232 unsigned long free_kbytes; 3236 unsigned long free_kbytes;
3233 3237
3234 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3238 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3235 3239
3236 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3240 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3237 return 0; 3241 return 0;
3238 } 3242 }
3239 subsys_initcall(init_user_reserve); 3243 subsys_initcall(init_user_reserve);
3240 3244
3241 /* 3245 /*
3242 * Initialise sysctl_admin_reserve_kbytes. 3246 * Initialise sysctl_admin_reserve_kbytes.
3243 * 3247 *
3244 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin 3248 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3245 * to log in and kill a memory hogging process. 3249 * to log in and kill a memory hogging process.
3246 * 3250 *
3247 * Systems with more than 256MB will reserve 8MB, enough to recover 3251 * Systems with more than 256MB will reserve 8MB, enough to recover
3248 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will 3252 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3249 * only reserve 3% of free pages by default. 3253 * only reserve 3% of free pages by default.
3250 */ 3254 */
3251 static int init_admin_reserve(void) 3255 static int init_admin_reserve(void)
3252 { 3256 {
3253 unsigned long free_kbytes; 3257 unsigned long free_kbytes;
3254 3258
3255 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3259 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3256 3260
3257 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3261 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3258 return 0; 3262 return 0;
3259 } 3263 }
3260 subsys_initcall(init_admin_reserve); 3264 subsys_initcall(init_admin_reserve);
3261 3265
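To make the two initialisers above concrete: free_kbytes / 32 is roughly 3% of free memory, and the clamps work out to 1UL << 17 = 131072 kB = 128 MB for the user reserve and 1UL << 13 = 8192 kB = 8 MB for the admin reserve. On a machine with, say, 8 GB free, 3% would be about 256 MB, so both reserves end up capped at their respective maxima.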
3262 /* 3266 /*
3263 * Reinitialise user and admin reserves if memory is added or removed. 3267 * Reinitialise user and admin reserves if memory is added or removed.
3264 * 3268 *
3265 * The default user reserve max is 128MB, and the default max for the 3269 * The default user reserve max is 128MB, and the default max for the
3266 * admin reserve is 8MB. These are usually, but not always, enough to 3270 * admin reserve is 8MB. These are usually, but not always, enough to
3267 * enable recovery from a memory hogging process using login/sshd, a shell, 3271 * enable recovery from a memory hogging process using login/sshd, a shell,
3268 * and tools like top. It may make sense to increase or even disable the 3272 * and tools like top. It may make sense to increase or even disable the
3269 * reserve depending on the existence of swap or variations in the recovery 3273 * reserve depending on the existence of swap or variations in the recovery
3270 * tools. So, the admin may have changed them. 3274 * tools. So, the admin may have changed them.
3271 * 3275 *
3272 * If memory is added and the reserves have been eliminated or increased above 3276 * If memory is added and the reserves have been eliminated or increased above
3273 * the default max, then we'll trust the admin. 3277 * the default max, then we'll trust the admin.
3274 * 3278 *
3275 * If memory is removed and there isn't enough free memory, then we 3279 * If memory is removed and there isn't enough free memory, then we
3276 * need to reset the reserves. 3280 * need to reset the reserves.
3277 * 3281 *
3278 * Otherwise keep the reserve set by the admin. 3282 * Otherwise keep the reserve set by the admin.
3279 */ 3283 */
3280 static int reserve_mem_notifier(struct notifier_block *nb, 3284 static int reserve_mem_notifier(struct notifier_block *nb,
3281 unsigned long action, void *data) 3285 unsigned long action, void *data)
3282 { 3286 {
3283 unsigned long tmp, free_kbytes; 3287 unsigned long tmp, free_kbytes;
3284 3288
3285 switch (action) { 3289 switch (action) {
3286 case MEM_ONLINE: 3290 case MEM_ONLINE:
3287 /* Default max is 128MB. Leave alone if modified by operator. */ 3291 /* Default max is 128MB. Leave alone if modified by operator. */
3288 tmp = sysctl_user_reserve_kbytes; 3292 tmp = sysctl_user_reserve_kbytes;
3289 if (0 < tmp && tmp < (1UL << 17)) 3293 if (0 < tmp && tmp < (1UL << 17))
3290 init_user_reserve(); 3294 init_user_reserve();
3291 3295
3292 /* Default max is 8MB. Leave alone if modified by operator. */ 3296 /* Default max is 8MB. Leave alone if modified by operator. */
3293 tmp = sysctl_admin_reserve_kbytes; 3297 tmp = sysctl_admin_reserve_kbytes;
3294 if (0 < tmp && tmp < (1UL << 13)) 3298 if (0 < tmp && tmp < (1UL << 13))
3295 init_admin_reserve(); 3299 init_admin_reserve();
3296 3300
3297 break; 3301 break;
3298 case MEM_OFFLINE: 3302 case MEM_OFFLINE:
3299 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3303 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3300 3304
3301 if (sysctl_user_reserve_kbytes > free_kbytes) { 3305 if (sysctl_user_reserve_kbytes > free_kbytes) {
3302 init_user_reserve(); 3306 init_user_reserve();
3303 pr_info("vm.user_reserve_kbytes reset to %lu\n", 3307 pr_info("vm.user_reserve_kbytes reset to %lu\n",
3304 sysctl_user_reserve_kbytes); 3308 sysctl_user_reserve_kbytes);
3305 } 3309 }
3306 3310
3307 if (sysctl_admin_reserve_kbytes > free_kbytes) { 3311 if (sysctl_admin_reserve_kbytes > free_kbytes) {
3308 init_admin_reserve(); 3312 init_admin_reserve();
3309 pr_info("vm.admin_reserve_kbytes reset to %lu\n", 3313 pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3310 sysctl_admin_reserve_kbytes); 3314 sysctl_admin_reserve_kbytes);
3311 } 3315 }
3312 break; 3316 break;
3313 default: 3317 default:
3314 break; 3318 break;
3315 } 3319 }
3316 return NOTIFY_OK; 3320 return NOTIFY_OK;
3317 } 3321 }
3318 3322
3319 static struct notifier_block reserve_mem_nb = { 3323 static struct notifier_block reserve_mem_nb = {
3320 .notifier_call = reserve_mem_notifier, 3324 .notifier_call = reserve_mem_notifier,
3321 }; 3325 };
3322 3326
3323 static int __meminit init_reserve_notifier(void) 3327 static int __meminit init_reserve_notifier(void)
3324 { 3328 {
3325 if (register_hotmemory_notifier(&reserve_mem_nb)) 3329 if (register_hotmemory_notifier(&reserve_mem_nb))
3326 pr_err("Failed registering memory add/remove notifier for admin reserve\n"); 3330 pr_err("Failed registering memory add/remove notifier for admin reserve\n");
3327 3331
3328 return 0; 3332 return 0;
3329 } 3333 }
3330 subsys_initcall(init_reserve_notifier); 3334 subsys_initcall(init_reserve_notifier);
3331 3335
1 /* 1 /*
2 * mm/rmap.c - physical to virtual reverse mappings 2 * mm/rmap.c - physical to virtual reverse mappings
3 * 3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL). 5 * Released under the General Public License (GPL).
6 * 6 *
7 * Simple, low overhead reverse mapping scheme. 7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible. 8 * Please try to keep this thing as modular as possible.
9 * 9 *
10 * Provides methods for unmapping each kind of mapped page: 10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and 11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode. 12 * the file methods track pages belonging to an inode.
13 * 13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins 2003, 2004 17 * Contributions by Hugh Dickins 2003, 2004
18 */ 18 */
19 19
20 /* 20 /*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->rwsem 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
31 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
32 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within bdi.wb->list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
44 44
45 #include <linux/mm.h> 45 #include <linux/mm.h>
46 #include <linux/pagemap.h> 46 #include <linux/pagemap.h>
47 #include <linux/swap.h> 47 #include <linux/swap.h>
48 #include <linux/swapops.h> 48 #include <linux/swapops.h>
49 #include <linux/slab.h> 49 #include <linux/slab.h>
50 #include <linux/init.h> 50 #include <linux/init.h>
51 #include <linux/ksm.h> 51 #include <linux/ksm.h>
52 #include <linux/rmap.h> 52 #include <linux/rmap.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/export.h> 54 #include <linux/export.h>
55 #include <linux/memcontrol.h> 55 #include <linux/memcontrol.h>
56 #include <linux/mmu_notifier.h> 56 #include <linux/mmu_notifier.h>
57 #include <linux/migrate.h> 57 #include <linux/migrate.h>
58 #include <linux/hugetlb.h> 58 #include <linux/hugetlb.h>
59 #include <linux/backing-dev.h> 59 #include <linux/backing-dev.h>
60 60
61 #include <asm/tlbflush.h> 61 #include <asm/tlbflush.h>
62 62
63 #include "internal.h" 63 #include "internal.h"
64 64
65 static struct kmem_cache *anon_vma_cachep; 65 static struct kmem_cache *anon_vma_cachep;
66 static struct kmem_cache *anon_vma_chain_cachep; 66 static struct kmem_cache *anon_vma_chain_cachep;
67 67
68 static inline struct anon_vma *anon_vma_alloc(void) 68 static inline struct anon_vma *anon_vma_alloc(void)
69 { 69 {
70 struct anon_vma *anon_vma; 70 struct anon_vma *anon_vma;
71 71
72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
73 if (anon_vma) { 73 if (anon_vma) {
74 atomic_set(&anon_vma->refcount, 1); 74 atomic_set(&anon_vma->refcount, 1);
75 /* 75 /*
76 * Initialise the anon_vma root to point to itself. If called 76 * Initialise the anon_vma root to point to itself. If called
77 * from fork, the root will be reset to the parent's anon_vma. 77 * from fork, the root will be reset to the parent's anon_vma.
78 */ 78 */
79 anon_vma->root = anon_vma; 79 anon_vma->root = anon_vma;
80 } 80 }
81 81
82 return anon_vma; 82 return anon_vma;
83 } 83 }
84 84
85 static inline void anon_vma_free(struct anon_vma *anon_vma) 85 static inline void anon_vma_free(struct anon_vma *anon_vma)
86 { 86 {
87 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88 88
89 /* 89 /*
90 * Synchronize against page_lock_anon_vma_read() such that 90 * Synchronize against page_lock_anon_vma_read() such that
91 * we can safely hold the lock without the anon_vma getting 91 * we can safely hold the lock without the anon_vma getting
92 * freed. 92 * freed.
93 * 93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from 94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by 95 * put_anon_vma() against the acquire barrier implied by
96 * down_read_trylock() from page_lock_anon_vma_read(). This orders: 96 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
97 * 97 *
98 * page_lock_anon_vma_read() VS put_anon_vma() 98 * page_lock_anon_vma_read() VS put_anon_vma()
99 * down_read_trylock() atomic_dec_and_test() 99 * down_read_trylock() atomic_dec_and_test()
100 * LOCK MB 100 * LOCK MB
101 * atomic_read() rwsem_is_locked() 101 * atomic_read() rwsem_is_locked()
102 * 102 *
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 might_sleep(); 106 might_sleep();
107 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 107 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
108 anon_vma_lock_write(anon_vma); 108 anon_vma_lock_write(anon_vma);
109 anon_vma_unlock_write(anon_vma); 109 anon_vma_unlock_write(anon_vma);
110 } 110 }
111 111
112 kmem_cache_free(anon_vma_cachep, anon_vma); 112 kmem_cache_free(anon_vma_cachep, anon_vma);
113 } 113 }
114 114
115 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) 115 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
116 { 116 {
117 return kmem_cache_alloc(anon_vma_chain_cachep, gfp); 117 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
118 } 118 }
119 119
120 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 120 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
121 { 121 {
122 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 122 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
123 } 123 }
124 124
125 static void anon_vma_chain_link(struct vm_area_struct *vma, 125 static void anon_vma_chain_link(struct vm_area_struct *vma,
126 struct anon_vma_chain *avc, 126 struct anon_vma_chain *avc,
127 struct anon_vma *anon_vma) 127 struct anon_vma *anon_vma)
128 { 128 {
129 avc->vma = vma; 129 avc->vma = vma;
130 avc->anon_vma = anon_vma; 130 avc->anon_vma = anon_vma;
131 list_add(&avc->same_vma, &vma->anon_vma_chain); 131 list_add(&avc->same_vma, &vma->anon_vma_chain);
132 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); 132 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
133 } 133 }
134 134
135 /** 135 /**
136 * anon_vma_prepare - attach an anon_vma to a memory region 136 * anon_vma_prepare - attach an anon_vma to a memory region
137 * @vma: the memory region in question 137 * @vma: the memory region in question
138 * 138 *
139 * This makes sure the memory mapping described by 'vma' has 139 * This makes sure the memory mapping described by 'vma' has
140 * an 'anon_vma' attached to it, so that we can associate the 140 * an 'anon_vma' attached to it, so that we can associate the
141 * anonymous pages mapped into it with that anon_vma. 141 * anonymous pages mapped into it with that anon_vma.
142 * 142 *
143 * The common case will be that we already have one, but if 143 * The common case will be that we already have one, but if
144 * not we either need to find an adjacent mapping that we 144 * not we either need to find an adjacent mapping that we
145 * can re-use the anon_vma from (very common when the only 145 * can re-use the anon_vma from (very common when the only
146 * reason for splitting a vma has been mprotect()), or we 146 * reason for splitting a vma has been mprotect()), or we
147 * allocate a new one. 147 * allocate a new one.
148 * 148 *
149 * Anon-vma allocations are very subtle, because we may have 149 * Anon-vma allocations are very subtle, because we may have
150 * optimistically looked up an anon_vma in page_lock_anon_vma_read() 150 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
151 * and that may actually touch the spinlock even in the newly 151 * and that may actually touch the spinlock even in the newly
152 * allocated vma (it depends on RCU to make sure that the 152 * allocated vma (it depends on RCU to make sure that the
153 * anon_vma isn't actually destroyed). 153 * anon_vma isn't actually destroyed).
154 * 154 *
155 * As a result, we need to do proper anon_vma locking even 155 * As a result, we need to do proper anon_vma locking even
156 * for the new allocation. At the same time, we do not want 156 * for the new allocation. At the same time, we do not want
157 * to do any locking for the common case of already having 157 * to do any locking for the common case of already having
158 * an anon_vma. 158 * an anon_vma.
159 * 159 *
160 * This must be called with the mmap_sem held for reading. 160 * This must be called with the mmap_sem held for reading.
161 */ 161 */
162 int anon_vma_prepare(struct vm_area_struct *vma) 162 int anon_vma_prepare(struct vm_area_struct *vma)
163 { 163 {
164 struct anon_vma *anon_vma = vma->anon_vma; 164 struct anon_vma *anon_vma = vma->anon_vma;
165 struct anon_vma_chain *avc; 165 struct anon_vma_chain *avc;
166 166
167 might_sleep(); 167 might_sleep();
168 if (unlikely(!anon_vma)) { 168 if (unlikely(!anon_vma)) {
169 struct mm_struct *mm = vma->vm_mm; 169 struct mm_struct *mm = vma->vm_mm;
170 struct anon_vma *allocated; 170 struct anon_vma *allocated;
171 171
172 avc = anon_vma_chain_alloc(GFP_KERNEL); 172 avc = anon_vma_chain_alloc(GFP_KERNEL);
173 if (!avc) 173 if (!avc)
174 goto out_enomem; 174 goto out_enomem;
175 175
176 anon_vma = find_mergeable_anon_vma(vma); 176 anon_vma = find_mergeable_anon_vma(vma);
177 allocated = NULL; 177 allocated = NULL;
178 if (!anon_vma) { 178 if (!anon_vma) {
179 anon_vma = anon_vma_alloc(); 179 anon_vma = anon_vma_alloc();
180 if (unlikely(!anon_vma)) 180 if (unlikely(!anon_vma))
181 goto out_enomem_free_avc; 181 goto out_enomem_free_avc;
182 allocated = anon_vma; 182 allocated = anon_vma;
183 } 183 }
184 184
185 anon_vma_lock_write(anon_vma); 185 anon_vma_lock_write(anon_vma);
186 /* page_table_lock to protect against threads */ 186 /* page_table_lock to protect against threads */
187 spin_lock(&mm->page_table_lock); 187 spin_lock(&mm->page_table_lock);
188 if (likely(!vma->anon_vma)) { 188 if (likely(!vma->anon_vma)) {
189 vma->anon_vma = anon_vma; 189 vma->anon_vma = anon_vma;
190 anon_vma_chain_link(vma, avc, anon_vma); 190 anon_vma_chain_link(vma, avc, anon_vma);
191 allocated = NULL; 191 allocated = NULL;
192 avc = NULL; 192 avc = NULL;
193 } 193 }
194 spin_unlock(&mm->page_table_lock); 194 spin_unlock(&mm->page_table_lock);
195 anon_vma_unlock_write(anon_vma); 195 anon_vma_unlock_write(anon_vma);
196 196
197 if (unlikely(allocated)) 197 if (unlikely(allocated))
198 put_anon_vma(allocated); 198 put_anon_vma(allocated);
199 if (unlikely(avc)) 199 if (unlikely(avc))
200 anon_vma_chain_free(avc); 200 anon_vma_chain_free(avc);
201 } 201 }
202 return 0; 202 return 0;
203 203
204 out_enomem_free_avc: 204 out_enomem_free_avc:
205 anon_vma_chain_free(avc); 205 anon_vma_chain_free(avc);
206 out_enomem: 206 out_enomem:
207 return -ENOMEM; 207 return -ENOMEM;
208 } 208 }
209 209
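A hedged sketch of the caller pattern (the real callers are the anonymous-fault paths in mm/memory.c; the function name here is made up): make sure the vma has an anon_vma before the first anonymous page is mapped into it:

static int hypothetical_anon_fault(struct vm_area_struct *vma,
				   unsigned long address)
{
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;	/* -ENOMEM from the prepare step */

	/* ... allocate a page, then page_add_new_anon_rmap(page, vma, address) ... */
	return 0;
}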
210 /* 210 /*
211 * This is a useful helper function for locking the anon_vma root as 211 * This is a useful helper function for locking the anon_vma root as
212 * we traverse the vma->anon_vma_chain, looping over anon_vma's that 212 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
213 * have the same vma. 213 * have the same vma.
214 * 214 *
215 * Such anon_vma's should have the same root, so you'd expect to see 215 * Such anon_vma's should have the same root, so you'd expect to see
216 * just a single mutex_lock for the whole traversal. 216 * just a single mutex_lock for the whole traversal.
217 */ 217 */
218 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) 218 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
219 { 219 {
220 struct anon_vma *new_root = anon_vma->root; 220 struct anon_vma *new_root = anon_vma->root;
221 if (new_root != root) { 221 if (new_root != root) {
222 if (WARN_ON_ONCE(root)) 222 if (WARN_ON_ONCE(root))
223 up_write(&root->rwsem); 223 up_write(&root->rwsem);
224 root = new_root; 224 root = new_root;
225 down_write(&root->rwsem); 225 down_write(&root->rwsem);
226 } 226 }
227 return root; 227 return root;
228 } 228 }
229 229
230 static inline void unlock_anon_vma_root(struct anon_vma *root) 230 static inline void unlock_anon_vma_root(struct anon_vma *root)
231 { 231 {
232 if (root) 232 if (root)
233 up_write(&root->rwsem); 233 up_write(&root->rwsem);
234 } 234 }
235 235
236 /* 236 /*
237 * Attach the anon_vmas from src to dst. 237 * Attach the anon_vmas from src to dst.
238 * Returns 0 on success, -ENOMEM on failure. 238 * Returns 0 on success, -ENOMEM on failure.
239 */ 239 */
240 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 240 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
241 { 241 {
242 struct anon_vma_chain *avc, *pavc; 242 struct anon_vma_chain *avc, *pavc;
243 struct anon_vma *root = NULL; 243 struct anon_vma *root = NULL;
244 244
245 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 245 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
246 struct anon_vma *anon_vma; 246 struct anon_vma *anon_vma;
247 247
248 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); 248 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
249 if (unlikely(!avc)) { 249 if (unlikely(!avc)) {
250 unlock_anon_vma_root(root); 250 unlock_anon_vma_root(root);
251 root = NULL; 251 root = NULL;
252 avc = anon_vma_chain_alloc(GFP_KERNEL); 252 avc = anon_vma_chain_alloc(GFP_KERNEL);
253 if (!avc) 253 if (!avc)
254 goto enomem_failure; 254 goto enomem_failure;
255 } 255 }
256 anon_vma = pavc->anon_vma; 256 anon_vma = pavc->anon_vma;
257 root = lock_anon_vma_root(root, anon_vma); 257 root = lock_anon_vma_root(root, anon_vma);
258 anon_vma_chain_link(dst, avc, anon_vma); 258 anon_vma_chain_link(dst, avc, anon_vma);
259 } 259 }
260 unlock_anon_vma_root(root); 260 unlock_anon_vma_root(root);
261 return 0; 261 return 0;
262 262
263 enomem_failure: 263 enomem_failure:
264 unlink_anon_vmas(dst); 264 unlink_anon_vmas(dst);
265 return -ENOMEM; 265 return -ENOMEM;
266 } 266 }
267 267
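This return value is what the commit is about: callers must hand it back rather than assume -ENOMEM. A minimal sketch of the caller shape the change enforces (mirroring the fixed __split_vma()/anon_vma_fork() call sites; the function name below is purely illustrative):

static int hypothetical_clone_caller(struct vm_area_struct *dst,
				     struct vm_area_struct *src)
{
	int err;

	err = anon_vma_clone(dst, src);
	if (err)
		return err;	/* propagate, don't overwrite with -ENOMEM */

	/* ... continue setting up "dst" ... */
	return 0;
}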
268 /* 268 /*
269 * Attach vma to its own anon_vma, as well as to the anon_vmas that 269 * Attach vma to its own anon_vma, as well as to the anon_vmas that
270 * the corresponding VMA in the parent process is attached to. 270 * the corresponding VMA in the parent process is attached to.
271 * Returns 0 on success, non-zero on failure. 271 * Returns 0 on success, non-zero on failure.
272 */ 272 */
273 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) 273 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
274 { 274 {
275 struct anon_vma_chain *avc; 275 struct anon_vma_chain *avc;
276 struct anon_vma *anon_vma; 276 struct anon_vma *anon_vma;
277 int error;
277 278
278 /* Don't bother if the parent process has no anon_vma here. */ 279 /* Don't bother if the parent process has no anon_vma here. */
279 if (!pvma->anon_vma) 280 if (!pvma->anon_vma)
280 return 0; 281 return 0;
281 282
282 /* 283 /*
283 * First, attach the new VMA to the parent VMA's anon_vmas, 284 * First, attach the new VMA to the parent VMA's anon_vmas,
284 * so rmap can find non-COWed pages in child processes. 285 * so rmap can find non-COWed pages in child processes.
285 */ 286 */
286 if (anon_vma_clone(vma, pvma)) 287 error = anon_vma_clone(vma, pvma);
287 return -ENOMEM; 288 if (error)
289 return error;
288 290
289 /* Then add our own anon_vma. */ 291 /* Then add our own anon_vma. */
290 anon_vma = anon_vma_alloc(); 292 anon_vma = anon_vma_alloc();
291 if (!anon_vma) 293 if (!anon_vma)
292 goto out_error; 294 goto out_error;
293 avc = anon_vma_chain_alloc(GFP_KERNEL); 295 avc = anon_vma_chain_alloc(GFP_KERNEL);
294 if (!avc) 296 if (!avc)
295 goto out_error_free_anon_vma; 297 goto out_error_free_anon_vma;
296 298
297 /* 299 /*
298 * The root anon_vma's spinlock is the lock actually used when we 300 * The root anon_vma's spinlock is the lock actually used when we
299 * lock any of the anon_vmas in this anon_vma tree. 301 * lock any of the anon_vmas in this anon_vma tree.
300 */ 302 */
301 anon_vma->root = pvma->anon_vma->root; 303 anon_vma->root = pvma->anon_vma->root;
302 /* 304 /*
303 * With refcounts, an anon_vma can stay around longer than the 305 * With refcounts, an anon_vma can stay around longer than the
304 * process it belongs to. The root anon_vma needs to be pinned until 306 * process it belongs to. The root anon_vma needs to be pinned until
305 * this anon_vma is freed, because the lock lives in the root. 307 * this anon_vma is freed, because the lock lives in the root.
306 */ 308 */
307 get_anon_vma(anon_vma->root); 309 get_anon_vma(anon_vma->root);
308 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 310 /* Mark this anon_vma as the one where our new (COWed) pages go. */
309 vma->anon_vma = anon_vma; 311 vma->anon_vma = anon_vma;
310 anon_vma_lock_write(anon_vma); 312 anon_vma_lock_write(anon_vma);
311 anon_vma_chain_link(vma, avc, anon_vma); 313 anon_vma_chain_link(vma, avc, anon_vma);
312 anon_vma_unlock_write(anon_vma); 314 anon_vma_unlock_write(anon_vma);
313 315
314 return 0; 316 return 0;
315 317
316 out_error_free_anon_vma: 318 out_error_free_anon_vma:
317 put_anon_vma(anon_vma); 319 put_anon_vma(anon_vma);
318 out_error: 320 out_error:
319 unlink_anon_vmas(vma); 321 unlink_anon_vmas(vma);
320 return -ENOMEM; 322 return -ENOMEM;
321 } 323 }
322 324
323 void unlink_anon_vmas(struct vm_area_struct *vma) 325 void unlink_anon_vmas(struct vm_area_struct *vma)
324 { 326 {
325 struct anon_vma_chain *avc, *next; 327 struct anon_vma_chain *avc, *next;
326 struct anon_vma *root = NULL; 328 struct anon_vma *root = NULL;
327 329
328 /* 330 /*
329 * Unlink each anon_vma chained to the VMA. This list is ordered 331 * Unlink each anon_vma chained to the VMA. This list is ordered
330 * from newest to oldest, ensuring the root anon_vma gets freed last. 332 * from newest to oldest, ensuring the root anon_vma gets freed last.
331 */ 333 */
332 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 334 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
333 struct anon_vma *anon_vma = avc->anon_vma; 335 struct anon_vma *anon_vma = avc->anon_vma;
334 336
335 root = lock_anon_vma_root(root, anon_vma); 337 root = lock_anon_vma_root(root, anon_vma);
336 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); 338 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
337 339
338 /* 340 /*
339 * Leave empty anon_vmas on the list - we'll need 341 * Leave empty anon_vmas on the list - we'll need
340 * to free them outside the lock. 342 * to free them outside the lock.
341 */ 343 */
342 if (RB_EMPTY_ROOT(&anon_vma->rb_root)) 344 if (RB_EMPTY_ROOT(&anon_vma->rb_root))
343 continue; 345 continue;
344 346
345 list_del(&avc->same_vma); 347 list_del(&avc->same_vma);
346 anon_vma_chain_free(avc); 348 anon_vma_chain_free(avc);
347 } 349 }
348 unlock_anon_vma_root(root); 350 unlock_anon_vma_root(root);
349 351
350 /* 352 /*
351 * Iterate the list once more, it now only contains empty and unlinked 353 * Iterate the list once more, it now only contains empty and unlinked
352 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 354 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
353 * needing to write-acquire the anon_vma->root->rwsem. 355 * needing to write-acquire the anon_vma->root->rwsem.
354 */ 356 */
355 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 357 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
356 struct anon_vma *anon_vma = avc->anon_vma; 358 struct anon_vma *anon_vma = avc->anon_vma;
357 359
358 put_anon_vma(anon_vma); 360 put_anon_vma(anon_vma);
359 361
360 list_del(&avc->same_vma); 362 list_del(&avc->same_vma);
361 anon_vma_chain_free(avc); 363 anon_vma_chain_free(avc);
362 } 364 }
363 } 365 }
364 366
365 static void anon_vma_ctor(void *data) 367 static void anon_vma_ctor(void *data)
366 { 368 {
367 struct anon_vma *anon_vma = data; 369 struct anon_vma *anon_vma = data;
368 370
369 init_rwsem(&anon_vma->rwsem); 371 init_rwsem(&anon_vma->rwsem);
370 atomic_set(&anon_vma->refcount, 0); 372 atomic_set(&anon_vma->refcount, 0);
371 anon_vma->rb_root = RB_ROOT; 373 anon_vma->rb_root = RB_ROOT;
372 } 374 }
373 375
374 void __init anon_vma_init(void) 376 void __init anon_vma_init(void)
375 { 377 {
376 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 378 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
377 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 379 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
378 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); 380 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
379 } 381 }
380 382
381 /* 383 /*
382 * Getting a lock on a stable anon_vma from a page off the LRU is tricky! 384 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
383 * 385 *
384 * Since there is no serialization whatsoever against page_remove_rmap() 386 * Since there is no serialization whatsoever against page_remove_rmap()
385 * the best this function can do is return a locked anon_vma that might 387 * the best this function can do is return a locked anon_vma that might
386 * have been relevant to this page. 388 * have been relevant to this page.
387 * 389 *
388 * The page might have been remapped to a different anon_vma or the anon_vma 390 * The page might have been remapped to a different anon_vma or the anon_vma
389 * returned may already be freed (and even reused). 391 * returned may already be freed (and even reused).
390 * 392 *
391 * In case it was remapped to a different anon_vma, the new anon_vma will be a 393 * In case it was remapped to a different anon_vma, the new anon_vma will be a
392 * child of the old anon_vma, and the anon_vma lifetime rules will therefore 394 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
393 * ensure that any anon_vma obtained from the page will still be valid for as 395 * ensure that any anon_vma obtained from the page will still be valid for as
394 * long as we observe page_mapped() [ hence all those page_mapped() tests ]. 396 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
395 * 397 *
396 * All users of this function must be very careful when walking the anon_vma 398 * All users of this function must be very careful when walking the anon_vma
397 * chain and verify that the page in question is indeed mapped in it 399 * chain and verify that the page in question is indeed mapped in it
398 * [ something equivalent to page_mapped_in_vma() ]. 400 * [ something equivalent to page_mapped_in_vma() ].
399 * 401 *
400 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() 402 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
401 * that the anon_vma pointer from page->mapping is valid if there is a 403 * that the anon_vma pointer from page->mapping is valid if there is a
402 * mapcount, we can dereference the anon_vma after observing those. 404 * mapcount, we can dereference the anon_vma after observing those.
403 */ 405 */
404 struct anon_vma *page_get_anon_vma(struct page *page) 406 struct anon_vma *page_get_anon_vma(struct page *page)
405 { 407 {
406 struct anon_vma *anon_vma = NULL; 408 struct anon_vma *anon_vma = NULL;
407 unsigned long anon_mapping; 409 unsigned long anon_mapping;
408 410
409 rcu_read_lock(); 411 rcu_read_lock();
410 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 412 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
411 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 413 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
412 goto out; 414 goto out;
413 if (!page_mapped(page)) 415 if (!page_mapped(page))
414 goto out; 416 goto out;
415 417
416 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 418 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
417 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 419 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
418 anon_vma = NULL; 420 anon_vma = NULL;
419 goto out; 421 goto out;
420 } 422 }
421 423
422 /* 424 /*
423 * If this page is still mapped, then its anon_vma cannot have been 425 * If this page is still mapped, then its anon_vma cannot have been
424 * freed. But if it has been unmapped, we have no security against the 426 * freed. But if it has been unmapped, we have no security against the
425 * anon_vma structure being freed and reused (for another anon_vma: 427 * anon_vma structure being freed and reused (for another anon_vma:
426 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() 428 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
427 * above cannot corrupt). 429 * above cannot corrupt).
428 */ 430 */
429 if (!page_mapped(page)) { 431 if (!page_mapped(page)) {
430 rcu_read_unlock(); 432 rcu_read_unlock();
431 put_anon_vma(anon_vma); 433 put_anon_vma(anon_vma);
432 return NULL; 434 return NULL;
433 } 435 }
434 out: 436 out:
435 rcu_read_unlock(); 437 rcu_read_unlock();
436 438
437 return anon_vma; 439 return anon_vma;
438 } 440 }
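
As an illustration of the rules spelled out in the comment above, a minimal hypothetical caller sketch (the helper name is invented): take the reference, re-check page_mapped() while using the anon_vma, then drop the reference.

static bool page_has_live_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = page_get_anon_vma(page);
	bool live = false;

	if (!anon_vma)
		return false;

	/*
	 * The reference only keeps the anon_vma structure alive; the page
	 * may have been unmapped meanwhile, so re-check page_mapped()
	 * before trusting anything found through the anon_vma.
	 */
	if (page_mapped(page))
		live = true;

	put_anon_vma(anon_vma);
	return live;
}
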
439 441
440 /* 442 /*
441 * Similar to page_get_anon_vma() except it locks the anon_vma. 443 * Similar to page_get_anon_vma() except it locks the anon_vma.
442 * 444 *
443 * It's a little more complex as it tries to keep the fast path to a single 445 * It's a little more complex as it tries to keep the fast path to a single
444 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 446 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
445 * reference like with page_get_anon_vma() and then block on the mutex. 447 * reference like with page_get_anon_vma() and then block on the mutex.
446 */ 448 */
447 struct anon_vma *page_lock_anon_vma_read(struct page *page) 449 struct anon_vma *page_lock_anon_vma_read(struct page *page)
448 { 450 {
449 struct anon_vma *anon_vma = NULL; 451 struct anon_vma *anon_vma = NULL;
450 struct anon_vma *root_anon_vma; 452 struct anon_vma *root_anon_vma;
451 unsigned long anon_mapping; 453 unsigned long anon_mapping;
452 454
453 rcu_read_lock(); 455 rcu_read_lock();
454 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 456 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
455 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 457 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
456 goto out; 458 goto out;
457 if (!page_mapped(page)) 459 if (!page_mapped(page))
458 goto out; 460 goto out;
459 461
460 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 462 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
461 root_anon_vma = ACCESS_ONCE(anon_vma->root); 463 root_anon_vma = ACCESS_ONCE(anon_vma->root);
462 if (down_read_trylock(&root_anon_vma->rwsem)) { 464 if (down_read_trylock(&root_anon_vma->rwsem)) {
463 /* 465 /*
464 * If the page is still mapped, then this anon_vma is still 466 * If the page is still mapped, then this anon_vma is still
465 * its anon_vma, and holding the mutex ensures that it will 467 * its anon_vma, and holding the mutex ensures that it will
466 * not go away, see anon_vma_free(). 468 * not go away, see anon_vma_free().
467 */ 469 */
468 if (!page_mapped(page)) { 470 if (!page_mapped(page)) {
469 up_read(&root_anon_vma->rwsem); 471 up_read(&root_anon_vma->rwsem);
470 anon_vma = NULL; 472 anon_vma = NULL;
471 } 473 }
472 goto out; 474 goto out;
473 } 475 }
474 476
475 /* trylock failed, we have to sleep */ 477 /* trylock failed, we have to sleep */
476 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 478 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
477 anon_vma = NULL; 479 anon_vma = NULL;
478 goto out; 480 goto out;
479 } 481 }
480 482
481 if (!page_mapped(page)) { 483 if (!page_mapped(page)) {
482 rcu_read_unlock(); 484 rcu_read_unlock();
483 put_anon_vma(anon_vma); 485 put_anon_vma(anon_vma);
484 return NULL; 486 return NULL;
485 } 487 }
486 488
487 /* we pinned the anon_vma, it's safe to sleep */ 489 /* we pinned the anon_vma, it's safe to sleep */
488 rcu_read_unlock(); 490 rcu_read_unlock();
489 anon_vma_lock_read(anon_vma); 491 anon_vma_lock_read(anon_vma);
490 492
491 if (atomic_dec_and_test(&anon_vma->refcount)) { 493 if (atomic_dec_and_test(&anon_vma->refcount)) {
492 /* 494 /*
493 * Oops, we held the last refcount, release the lock 495 * Oops, we held the last refcount, release the lock
494 * and bail -- can't simply use put_anon_vma() because 496 * and bail -- can't simply use put_anon_vma() because
495 * we'll deadlock on the anon_vma_lock_write() recursion. 497 * we'll deadlock on the anon_vma_lock_write() recursion.
496 */ 498 */
497 anon_vma_unlock_read(anon_vma); 499 anon_vma_unlock_read(anon_vma);
498 __put_anon_vma(anon_vma); 500 __put_anon_vma(anon_vma);
499 anon_vma = NULL; 501 anon_vma = NULL;
500 } 502 }
501 503
502 return anon_vma; 504 return anon_vma;
503 505
504 out: 506 out:
505 rcu_read_unlock(); 507 rcu_read_unlock();
506 return anon_vma; 508 return anon_vma;
507 } 509 }
508 510
509 void page_unlock_anon_vma_read(struct anon_vma *anon_vma) 511 void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
510 { 512 {
511 anon_vma_unlock_read(anon_vma); 513 anon_vma_unlock_read(anon_vma);
512 } 514 }
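
A hypothetical sketch of the usual bracket around the read lock (the function name is invented): lock, inspect the anon_vma while it cannot be unlinked, then unlock.

static void with_locked_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = page_lock_anon_vma_read(page);

	if (!anon_vma)
		return;	/* not anon, already unmapped, or being freed */

	/*
	 * With the root rwsem held for read, the anon_vma and its
	 * interval tree stay stable; walk anon_vma->rb_root here.
	 */

	page_unlock_anon_vma_read(anon_vma);
}
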
513 515
514 /* 516 /*
515 * At what user virtual address is page expected in @vma? 517 * At what user virtual address is page expected in @vma?
516 */ 518 */
517 static inline unsigned long 519 static inline unsigned long
518 __vma_address(struct page *page, struct vm_area_struct *vma) 520 __vma_address(struct page *page, struct vm_area_struct *vma)
519 { 521 {
520 pgoff_t pgoff = page_to_pgoff(page); 522 pgoff_t pgoff = page_to_pgoff(page);
521 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 523 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
522 } 524 }
523 525
524 inline unsigned long 526 inline unsigned long
525 vma_address(struct page *page, struct vm_area_struct *vma) 527 vma_address(struct page *page, struct vm_area_struct *vma)
526 { 528 {
527 unsigned long address = __vma_address(page, vma); 529 unsigned long address = __vma_address(page, vma);
528 530
529 /* page should be within @vma mapping range */ 531 /* page should be within @vma mapping range */
530 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 532 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
531 533
532 return address; 534 return address;
533 } 535 }
534 536
535 /* 537 /*
536 * At what user virtual address is page expected in vma? 538 * At what user virtual address is page expected in vma?
537 * Caller should check the page is actually part of the vma. 539 * Caller should check the page is actually part of the vma.
538 */ 540 */
539 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 541 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
540 { 542 {
541 unsigned long address; 543 unsigned long address;
542 if (PageAnon(page)) { 544 if (PageAnon(page)) {
543 struct anon_vma *page__anon_vma = page_anon_vma(page); 545 struct anon_vma *page__anon_vma = page_anon_vma(page);
544 /* 546 /*
545 * Note: swapoff's unuse_vma() is more efficient with this 547 * Note: swapoff's unuse_vma() is more efficient with this
546 * check, and needs it to match anon_vma when KSM is active. 548 * check, and needs it to match anon_vma when KSM is active.
547 */ 549 */
548 if (!vma->anon_vma || !page__anon_vma || 550 if (!vma->anon_vma || !page__anon_vma ||
549 vma->anon_vma->root != page__anon_vma->root) 551 vma->anon_vma->root != page__anon_vma->root)
550 return -EFAULT; 552 return -EFAULT;
551 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 553 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
552 if (!vma->vm_file || 554 if (!vma->vm_file ||
553 vma->vm_file->f_mapping != page->mapping) 555 vma->vm_file->f_mapping != page->mapping)
554 return -EFAULT; 556 return -EFAULT;
555 } else 557 } else
556 return -EFAULT; 558 return -EFAULT;
557 address = __vma_address(page, vma); 559 address = __vma_address(page, vma);
558 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) 560 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
559 return -EFAULT; 561 return -EFAULT;
560 return address; 562 return address;
561 } 563 }
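
A worked example of the linear-mapping arithmetic used by __vma_address() above, with illustrative numbers and 4 KiB pages assumed:

/*
 * vma->vm_start       = 0x7f0000000000
 * vma->vm_pgoff       = 0x10    (file page offset of vm_start)
 * page_to_pgoff(page) = 0x13    (file page offset of the page)
 *
 * address = vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT)
 *         = 0x7f0000000000 + (0x3 << 12)
 *         = 0x7f0000003000
 */
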
562 564
563 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) 565 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
564 { 566 {
565 pgd_t *pgd; 567 pgd_t *pgd;
566 pud_t *pud; 568 pud_t *pud;
567 pmd_t *pmd = NULL; 569 pmd_t *pmd = NULL;
568 pmd_t pmde; 570 pmd_t pmde;
569 571
570 pgd = pgd_offset(mm, address); 572 pgd = pgd_offset(mm, address);
571 if (!pgd_present(*pgd)) 573 if (!pgd_present(*pgd))
572 goto out; 574 goto out;
573 575
574 pud = pud_offset(pgd, address); 576 pud = pud_offset(pgd, address);
575 if (!pud_present(*pud)) 577 if (!pud_present(*pud))
576 goto out; 578 goto out;
577 579
578 pmd = pmd_offset(pud, address); 580 pmd = pmd_offset(pud, address);
579 /* 581 /*
580 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() 582 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
581 * without holding anon_vma lock for write. So when looking for a 583 * without holding anon_vma lock for write. So when looking for a
582 * genuine pmde (in which to find pte), test present and !THP together. 584 * genuine pmde (in which to find pte), test present and !THP together.
583 */ 585 */
584 pmde = ACCESS_ONCE(*pmd); 586 pmde = ACCESS_ONCE(*pmd);
585 if (!pmd_present(pmde) || pmd_trans_huge(pmde)) 587 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
586 pmd = NULL; 588 pmd = NULL;
587 out: 589 out:
588 return pmd; 590 return pmd;
589 } 591 }
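
Callers of mm_find_pmd() typically go on to map and lock the pte that covers the address, as try_to_unmap_cluster() does further down in this file. A hypothetical helper sketching that step:

static pte_t *map_locked_pte(struct mm_struct *mm, unsigned long address,
			     spinlock_t **ptlp)
{
	pmd_t *pmd = mm_find_pmd(mm, address);

	if (!pmd)
		return NULL;

	/* The caller must pte_unmap_unlock(pte, *ptlp) when done. */
	return pte_offset_map_lock(mm, pmd, address, ptlp);
}
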
590 592
591 /* 593 /*
592 * Check that @page is mapped at @address into @mm. 594 * Check that @page is mapped at @address into @mm.
593 * 595 *
594 * If @sync is false, page_check_address may perform a racy check to avoid 596 * If @sync is false, page_check_address may perform a racy check to avoid
595 * the page table lock when the pte is not present (helpful when reclaiming 597 * the page table lock when the pte is not present (helpful when reclaiming
596 * highly shared pages). 598 * highly shared pages).
597 * 599 *
598 * On success returns with pte mapped and locked. 600 * On success returns with pte mapped and locked.
599 */ 601 */
600 pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 602 pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
601 unsigned long address, spinlock_t **ptlp, int sync) 603 unsigned long address, spinlock_t **ptlp, int sync)
602 { 604 {
603 pmd_t *pmd; 605 pmd_t *pmd;
604 pte_t *pte; 606 pte_t *pte;
605 spinlock_t *ptl; 607 spinlock_t *ptl;
606 608
607 if (unlikely(PageHuge(page))) { 609 if (unlikely(PageHuge(page))) {
608 /* when pud is not present, pte will be NULL */ 610 /* when pud is not present, pte will be NULL */
609 pte = huge_pte_offset(mm, address); 611 pte = huge_pte_offset(mm, address);
610 if (!pte) 612 if (!pte)
611 return NULL; 613 return NULL;
612 614
613 ptl = huge_pte_lockptr(page_hstate(page), mm, pte); 615 ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
614 goto check; 616 goto check;
615 } 617 }
616 618
617 pmd = mm_find_pmd(mm, address); 619 pmd = mm_find_pmd(mm, address);
618 if (!pmd) 620 if (!pmd)
619 return NULL; 621 return NULL;
620 622
621 pte = pte_offset_map(pmd, address); 623 pte = pte_offset_map(pmd, address);
622 /* Make a quick check before getting the lock */ 624 /* Make a quick check before getting the lock */
623 if (!sync && !pte_present(*pte)) { 625 if (!sync && !pte_present(*pte)) {
624 pte_unmap(pte); 626 pte_unmap(pte);
625 return NULL; 627 return NULL;
626 } 628 }
627 629
628 ptl = pte_lockptr(mm, pmd); 630 ptl = pte_lockptr(mm, pmd);
629 check: 631 check:
630 spin_lock(ptl); 632 spin_lock(ptl);
631 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 633 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
632 *ptlp = ptl; 634 *ptlp = ptl;
633 return pte; 635 return pte;
634 } 636 }
635 pte_unmap_unlock(pte, ptl); 637 pte_unmap_unlock(pte, ptl);
636 return NULL; 638 return NULL;
637 } 639 }
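
Most users reach this through the page_check_address() inline wrapper (declared in include/linux/rmap.h) and release the pte with pte_unmap_unlock(), as the functions below do. A hypothetical example:

static bool page_pte_is_dirty(struct page *page, struct mm_struct *mm,
			      unsigned long address)
{
	spinlock_t *ptl;
	pte_t *pte;
	bool dirty;

	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		return false;	/* page is not mapped at @address in @mm */

	dirty = pte_dirty(*pte);
	pte_unmap_unlock(pte, ptl);
	return dirty;
}
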
638 640
639 /** 641 /**
640 * page_mapped_in_vma - check whether a page is really mapped in a VMA 642 * page_mapped_in_vma - check whether a page is really mapped in a VMA
641 * @page: the page to test 643 * @page: the page to test
642 * @vma: the VMA to test 644 * @vma: the VMA to test
643 * 645 *
644 * Returns 1 if the page is mapped into the page tables of the VMA, 0 646 * Returns 1 if the page is mapped into the page tables of the VMA, 0
645 * if the page is not mapped into the page tables of this VMA. Only 647 * if the page is not mapped into the page tables of this VMA. Only
646 * valid for normal file or anonymous VMAs. 648 * valid for normal file or anonymous VMAs.
647 */ 649 */
648 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 650 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
649 { 651 {
650 unsigned long address; 652 unsigned long address;
651 pte_t *pte; 653 pte_t *pte;
652 spinlock_t *ptl; 654 spinlock_t *ptl;
653 655
654 address = __vma_address(page, vma); 656 address = __vma_address(page, vma);
655 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) 657 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
656 return 0; 658 return 0;
657 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 659 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
658 if (!pte) /* the page is not in this mm */ 660 if (!pte) /* the page is not in this mm */
659 return 0; 661 return 0;
660 pte_unmap_unlock(pte, ptl); 662 pte_unmap_unlock(pte, ptl);
661 663
662 return 1; 664 return 1;
663 } 665 }
664 666
665 struct page_referenced_arg { 667 struct page_referenced_arg {
666 int mapcount; 668 int mapcount;
667 int referenced; 669 int referenced;
668 unsigned long vm_flags; 670 unsigned long vm_flags;
669 struct mem_cgroup *memcg; 671 struct mem_cgroup *memcg;
670 }; 672 };
671 /* 673 /*
672 * arg: page_referenced_arg will be passed 674 * arg: page_referenced_arg will be passed
673 */ 675 */
674 static int page_referenced_one(struct page *page, struct vm_area_struct *vma, 676 static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
675 unsigned long address, void *arg) 677 unsigned long address, void *arg)
676 { 678 {
677 struct mm_struct *mm = vma->vm_mm; 679 struct mm_struct *mm = vma->vm_mm;
678 spinlock_t *ptl; 680 spinlock_t *ptl;
679 int referenced = 0; 681 int referenced = 0;
680 struct page_referenced_arg *pra = arg; 682 struct page_referenced_arg *pra = arg;
681 683
682 if (unlikely(PageTransHuge(page))) { 684 if (unlikely(PageTransHuge(page))) {
683 pmd_t *pmd; 685 pmd_t *pmd;
684 686
685 /* 687 /*
686 * rmap might return false positives; we must filter 688 * rmap might return false positives; we must filter
687 * these out using page_check_address_pmd(). 689 * these out using page_check_address_pmd().
688 */ 690 */
689 pmd = page_check_address_pmd(page, mm, address, 691 pmd = page_check_address_pmd(page, mm, address,
690 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 692 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
691 if (!pmd) 693 if (!pmd)
692 return SWAP_AGAIN; 694 return SWAP_AGAIN;
693 695
694 if (vma->vm_flags & VM_LOCKED) { 696 if (vma->vm_flags & VM_LOCKED) {
695 spin_unlock(ptl); 697 spin_unlock(ptl);
696 pra->vm_flags |= VM_LOCKED; 698 pra->vm_flags |= VM_LOCKED;
697 return SWAP_FAIL; /* To break the loop */ 699 return SWAP_FAIL; /* To break the loop */
698 } 700 }
699 701
700 /* go ahead even if the pmd is pmd_trans_splitting() */ 702 /* go ahead even if the pmd is pmd_trans_splitting() */
701 if (pmdp_clear_flush_young_notify(vma, address, pmd)) 703 if (pmdp_clear_flush_young_notify(vma, address, pmd))
702 referenced++; 704 referenced++;
703 spin_unlock(ptl); 705 spin_unlock(ptl);
704 } else { 706 } else {
705 pte_t *pte; 707 pte_t *pte;
706 708
707 /* 709 /*
708 * rmap might return false positives; we must filter 710 * rmap might return false positives; we must filter
709 * these out using page_check_address(). 711 * these out using page_check_address().
710 */ 712 */
711 pte = page_check_address(page, mm, address, &ptl, 0); 713 pte = page_check_address(page, mm, address, &ptl, 0);
712 if (!pte) 714 if (!pte)
713 return SWAP_AGAIN; 715 return SWAP_AGAIN;
714 716
715 if (vma->vm_flags & VM_LOCKED) { 717 if (vma->vm_flags & VM_LOCKED) {
716 pte_unmap_unlock(pte, ptl); 718 pte_unmap_unlock(pte, ptl);
717 pra->vm_flags |= VM_LOCKED; 719 pra->vm_flags |= VM_LOCKED;
718 return SWAP_FAIL; /* To break the loop */ 720 return SWAP_FAIL; /* To break the loop */
719 } 721 }
720 722
721 if (ptep_clear_flush_young_notify(vma, address, pte)) { 723 if (ptep_clear_flush_young_notify(vma, address, pte)) {
722 /* 724 /*
723 * Don't treat a reference through a sequentially read 725 * Don't treat a reference through a sequentially read
724 * mapping as such. If the page has been used in 726 * mapping as such. If the page has been used in
725 * another mapping, we will catch it; if this other 727 * another mapping, we will catch it; if this other
726 * mapping is already gone, the unmap path will have 728 * mapping is already gone, the unmap path will have
727 * set PG_referenced or activated the page. 729 * set PG_referenced or activated the page.
728 */ 730 */
729 if (likely(!(vma->vm_flags & VM_SEQ_READ))) 731 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
730 referenced++; 732 referenced++;
731 } 733 }
732 pte_unmap_unlock(pte, ptl); 734 pte_unmap_unlock(pte, ptl);
733 } 735 }
734 736
735 if (referenced) { 737 if (referenced) {
736 pra->referenced++; 738 pra->referenced++;
737 pra->vm_flags |= vma->vm_flags; 739 pra->vm_flags |= vma->vm_flags;
738 } 740 }
739 741
740 pra->mapcount--; 742 pra->mapcount--;
741 if (!pra->mapcount) 743 if (!pra->mapcount)
742 return SWAP_SUCCESS; /* To break the loop */ 744 return SWAP_SUCCESS; /* To break the loop */
743 745
744 return SWAP_AGAIN; 746 return SWAP_AGAIN;
745 } 747 }
746 748
747 static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) 749 static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
748 { 750 {
749 struct page_referenced_arg *pra = arg; 751 struct page_referenced_arg *pra = arg;
750 struct mem_cgroup *memcg = pra->memcg; 752 struct mem_cgroup *memcg = pra->memcg;
751 753
752 if (!mm_match_cgroup(vma->vm_mm, memcg)) 754 if (!mm_match_cgroup(vma->vm_mm, memcg))
753 return true; 755 return true;
754 756
755 return false; 757 return false;
756 } 758 }
757 759
758 /** 760 /**
759 * page_referenced - test if the page was referenced 761 * page_referenced - test if the page was referenced
760 * @page: the page to test 762 * @page: the page to test
761 * @is_locked: caller holds lock on the page 763 * @is_locked: caller holds lock on the page
762 * @memcg: target memory cgroup 764 * @memcg: target memory cgroup
763 * @vm_flags: collect encountered vma->vm_flags which actually referenced the page 765 * @vm_flags: collect encountered vma->vm_flags which actually referenced the page
764 * 766 *
765 * Quick test_and_clear_referenced for all mappings to a page, 767 * Quick test_and_clear_referenced for all mappings to a page,
766 * returns the number of ptes which referenced the page. 768 * returns the number of ptes which referenced the page.
767 */ 769 */
768 int page_referenced(struct page *page, 770 int page_referenced(struct page *page,
769 int is_locked, 771 int is_locked,
770 struct mem_cgroup *memcg, 772 struct mem_cgroup *memcg,
771 unsigned long *vm_flags) 773 unsigned long *vm_flags)
772 { 774 {
773 int ret; 775 int ret;
774 int we_locked = 0; 776 int we_locked = 0;
775 struct page_referenced_arg pra = { 777 struct page_referenced_arg pra = {
776 .mapcount = page_mapcount(page), 778 .mapcount = page_mapcount(page),
777 .memcg = memcg, 779 .memcg = memcg,
778 }; 780 };
779 struct rmap_walk_control rwc = { 781 struct rmap_walk_control rwc = {
780 .rmap_one = page_referenced_one, 782 .rmap_one = page_referenced_one,
781 .arg = (void *)&pra, 783 .arg = (void *)&pra,
782 .anon_lock = page_lock_anon_vma_read, 784 .anon_lock = page_lock_anon_vma_read,
783 }; 785 };
784 786
785 *vm_flags = 0; 787 *vm_flags = 0;
786 if (!page_mapped(page)) 788 if (!page_mapped(page))
787 return 0; 789 return 0;
788 790
789 if (!page_rmapping(page)) 791 if (!page_rmapping(page))
790 return 0; 792 return 0;
791 793
792 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 794 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
793 we_locked = trylock_page(page); 795 we_locked = trylock_page(page);
794 if (!we_locked) 796 if (!we_locked)
795 return 1; 797 return 1;
796 } 798 }
797 799
798 /* 800 /*
799 * If we are reclaiming on behalf of a cgroup, skip 801 * If we are reclaiming on behalf of a cgroup, skip
800 * counting on behalf of references from different 802 * counting on behalf of references from different
801 * cgroups 803 * cgroups
802 */ 804 */
803 if (memcg) { 805 if (memcg) {
804 rwc.invalid_vma = invalid_page_referenced_vma; 806 rwc.invalid_vma = invalid_page_referenced_vma;
805 } 807 }
806 808
807 ret = rmap_walk(page, &rwc); 809 ret = rmap_walk(page, &rwc);
808 *vm_flags = pra.vm_flags; 810 *vm_flags = pra.vm_flags;
809 811
810 if (we_locked) 812 if (we_locked)
811 unlock_page(page); 813 unlock_page(page);
812 814
813 return pra.referenced; 815 return pra.referenced;
814 } 816 }
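
A sketch of how a reclaim-style caller might consume page_referenced() and the vm_flags it collects; the helper is hypothetical, the real policy lives in mm/vmscan.c:

static bool page_was_referenced(struct page *page, struct mem_cgroup *memcg)
{
	unsigned long vm_flags;
	int referenced;

	referenced = page_referenced(page, 0, memcg, &vm_flags);

	/* An mlocked (VM_LOCKED) mapping was encountered: keep the page. */
	if (vm_flags & VM_LOCKED)
		return true;

	return referenced > 0;
}
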
815 817
816 static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, 818 static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
817 unsigned long address, void *arg) 819 unsigned long address, void *arg)
818 { 820 {
819 struct mm_struct *mm = vma->vm_mm; 821 struct mm_struct *mm = vma->vm_mm;
820 pte_t *pte; 822 pte_t *pte;
821 spinlock_t *ptl; 823 spinlock_t *ptl;
822 int ret = 0; 824 int ret = 0;
823 int *cleaned = arg; 825 int *cleaned = arg;
824 826
825 pte = page_check_address(page, mm, address, &ptl, 1); 827 pte = page_check_address(page, mm, address, &ptl, 1);
826 if (!pte) 828 if (!pte)
827 goto out; 829 goto out;
828 830
829 if (pte_dirty(*pte) || pte_write(*pte)) { 831 if (pte_dirty(*pte) || pte_write(*pte)) {
830 pte_t entry; 832 pte_t entry;
831 833
832 flush_cache_page(vma, address, pte_pfn(*pte)); 834 flush_cache_page(vma, address, pte_pfn(*pte));
833 entry = ptep_clear_flush(vma, address, pte); 835 entry = ptep_clear_flush(vma, address, pte);
834 entry = pte_wrprotect(entry); 836 entry = pte_wrprotect(entry);
835 entry = pte_mkclean(entry); 837 entry = pte_mkclean(entry);
836 set_pte_at(mm, address, pte, entry); 838 set_pte_at(mm, address, pte, entry);
837 ret = 1; 839 ret = 1;
838 } 840 }
839 841
840 pte_unmap_unlock(pte, ptl); 842 pte_unmap_unlock(pte, ptl);
841 843
842 if (ret) { 844 if (ret) {
843 mmu_notifier_invalidate_page(mm, address); 845 mmu_notifier_invalidate_page(mm, address);
844 (*cleaned)++; 846 (*cleaned)++;
845 } 847 }
846 out: 848 out:
847 return SWAP_AGAIN; 849 return SWAP_AGAIN;
848 } 850 }
849 851
850 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) 852 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
851 { 853 {
852 if (vma->vm_flags & VM_SHARED) 854 if (vma->vm_flags & VM_SHARED)
853 return false; 855 return false;
854 856
855 return true; 857 return true;
856 } 858 }
857 859
858 int page_mkclean(struct page *page) 860 int page_mkclean(struct page *page)
859 { 861 {
860 int cleaned = 0; 862 int cleaned = 0;
861 struct address_space *mapping; 863 struct address_space *mapping;
862 struct rmap_walk_control rwc = { 864 struct rmap_walk_control rwc = {
863 .arg = (void *)&cleaned, 865 .arg = (void *)&cleaned,
864 .rmap_one = page_mkclean_one, 866 .rmap_one = page_mkclean_one,
865 .invalid_vma = invalid_mkclean_vma, 867 .invalid_vma = invalid_mkclean_vma,
866 }; 868 };
867 869
868 BUG_ON(!PageLocked(page)); 870 BUG_ON(!PageLocked(page));
869 871
870 if (!page_mapped(page)) 872 if (!page_mapped(page))
871 return 0; 873 return 0;
872 874
873 mapping = page_mapping(page); 875 mapping = page_mapping(page);
874 if (!mapping) 876 if (!mapping)
875 return 0; 877 return 0;
876 878
877 rmap_walk(page, &rwc); 879 rmap_walk(page, &rwc);
878 880
879 return cleaned; 881 return cleaned;
880 } 882 }
881 EXPORT_SYMBOL_GPL(page_mkclean); 883 EXPORT_SYMBOL_GPL(page_mkclean);
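
page_referenced() and page_mkclean() both follow the same pattern: fill in an rmap_walk_control with an rmap_one callback and an arg cookie, then let rmap_walk() visit every mapping of the page. A hypothetical minimal user of that pattern (the page must be locked, as in page_mkclean() above):

struct mapping_count_arg {
	int nr;
};

static int count_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	struct mapping_count_arg *mca = arg;

	mca->nr++;
	return SWAP_AGAIN;	/* keep walking the remaining mappings */
}

static int count_mappings(struct page *page)
{
	struct mapping_count_arg mca = { .nr = 0 };
	struct rmap_walk_control rwc = {
		.rmap_one = count_one_mapping,
		.arg = (void *)&mca,
	};

	rmap_walk(page, &rwc);
	return mca.nr;
}
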
882 884
883 /** 885 /**
884 * page_move_anon_rmap - move a page to our anon_vma 886 * page_move_anon_rmap - move a page to our anon_vma
885 * @page: the page to move to our anon_vma 887 * @page: the page to move to our anon_vma
886 * @vma: the vma the page belongs to 888 * @vma: the vma the page belongs to
887 * @address: the user virtual address mapped 889 * @address: the user virtual address mapped
888 * 890 *
889 * When a page belongs exclusively to one process after a COW event, 891 * When a page belongs exclusively to one process after a COW event,
890 * that page can be moved into the anon_vma that belongs to just that 892 * that page can be moved into the anon_vma that belongs to just that
891 * process, so the rmap code will not search the parent or sibling 893 * process, so the rmap code will not search the parent or sibling
892 * processes. 894 * processes.
893 */ 895 */
894 void page_move_anon_rmap(struct page *page, 896 void page_move_anon_rmap(struct page *page,
895 struct vm_area_struct *vma, unsigned long address) 897 struct vm_area_struct *vma, unsigned long address)
896 { 898 {
897 struct anon_vma *anon_vma = vma->anon_vma; 899 struct anon_vma *anon_vma = vma->anon_vma;
898 900
899 VM_BUG_ON_PAGE(!PageLocked(page), page); 901 VM_BUG_ON_PAGE(!PageLocked(page), page);
900 VM_BUG_ON_VMA(!anon_vma, vma); 902 VM_BUG_ON_VMA(!anon_vma, vma);
901 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); 903 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
902 904
903 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 905 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
904 page->mapping = (struct address_space *) anon_vma; 906 page->mapping = (struct address_space *) anon_vma;
905 } 907 }
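
For reference, the page->mapping encoding manipulated here, and again in __page_set_anon_rmap() below, packs an anon_vma pointer and the PAGE_MAPPING_ANON tag into a single field:

/*
 * set:  page->mapping = (struct address_space *)((void *)anon_vma + PAGE_MAPPING_ANON);
 * test: PageAnon(page) checks (unsigned long)page->mapping & PAGE_MAPPING_ANON
 * read: anon_vma = (struct anon_vma *)((unsigned long)page->mapping - PAGE_MAPPING_ANON);
 */
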
906 908
907 /** 909 /**
908 * __page_set_anon_rmap - set up new anonymous rmap 910 * __page_set_anon_rmap - set up new anonymous rmap
909 * @page: Page to add to rmap 911 * @page: Page to add to rmap
910 * @vma: VM area to add page to. 912 * @vma: VM area to add page to.
911 * @address: User virtual address of the mapping 913 * @address: User virtual address of the mapping
912 * @exclusive: the page is exclusively owned by the current process 914 * @exclusive: the page is exclusively owned by the current process
913 */ 915 */
914 static void __page_set_anon_rmap(struct page *page, 916 static void __page_set_anon_rmap(struct page *page,
915 struct vm_area_struct *vma, unsigned long address, int exclusive) 917 struct vm_area_struct *vma, unsigned long address, int exclusive)
916 { 918 {
917 struct anon_vma *anon_vma = vma->anon_vma; 919 struct anon_vma *anon_vma = vma->anon_vma;
918 920
919 BUG_ON(!anon_vma); 921 BUG_ON(!anon_vma);
920 922
921 if (PageAnon(page)) 923 if (PageAnon(page))
922 return; 924 return;
923 925
924 /* 926 /*
925 * If the page isn't exclusively mapped into this vma, 927 * If the page isn't exclusively mapped into this vma,
926 * we must use the _oldest_ possible anon_vma for the 928 * we must use the _oldest_ possible anon_vma for the
927 * page mapping! 929 * page mapping!
928 */ 930 */
929 if (!exclusive) 931 if (!exclusive)
930 anon_vma = anon_vma->root; 932 anon_vma = anon_vma->root;
931 933
932 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 934 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
933 page->mapping = (struct address_space *) anon_vma; 935 page->mapping = (struct address_space *) anon_vma;
934 page->index = linear_page_index(vma, address); 936 page->index = linear_page_index(vma, address);
935 } 937 }
936 938
937 /** 939 /**
938 * __page_check_anon_rmap - sanity check anonymous rmap addition 940 * __page_check_anon_rmap - sanity check anonymous rmap addition
939 * @page: the page to add the mapping to 941 * @page: the page to add the mapping to
940 * @vma: the vm area in which the mapping is added 942 * @vma: the vm area in which the mapping is added
941 * @address: the user virtual address mapped 943 * @address: the user virtual address mapped
942 */ 944 */
943 static void __page_check_anon_rmap(struct page *page, 945 static void __page_check_anon_rmap(struct page *page,
944 struct vm_area_struct *vma, unsigned long address) 946 struct vm_area_struct *vma, unsigned long address)
945 { 947 {
946 #ifdef CONFIG_DEBUG_VM 948 #ifdef CONFIG_DEBUG_VM
947 /* 949 /*
948 * The page's anon-rmap details (mapping and index) are guaranteed to 950 * The page's anon-rmap details (mapping and index) are guaranteed to
949 * be set up correctly at this point. 951 * be set up correctly at this point.
950 * 952 *
951 * We have exclusion against page_add_anon_rmap because the caller 953 * We have exclusion against page_add_anon_rmap because the caller
952 * always holds the page locked, except if called from page_dup_rmap, 954 * always holds the page locked, except if called from page_dup_rmap,
953 * in which case the page is already known to be set up. 955 * in which case the page is already known to be set up.
954 * 956 *
955 * We have exclusion against page_add_new_anon_rmap because those pages 957 * We have exclusion against page_add_new_anon_rmap because those pages
956 * are initially only visible via the pagetables, and the pte is locked 958 * are initially only visible via the pagetables, and the pte is locked
957 * over the call to page_add_new_anon_rmap. 959 * over the call to page_add_new_anon_rmap.
958 */ 960 */
959 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); 961 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
960 BUG_ON(page->index != linear_page_index(vma, address)); 962 BUG_ON(page->index != linear_page_index(vma, address));
961 #endif 963 #endif
962 } 964 }
963 965
964 /** 966 /**
965 * page_add_anon_rmap - add pte mapping to an anonymous page 967 * page_add_anon_rmap - add pte mapping to an anonymous page
966 * @page: the page to add the mapping to 968 * @page: the page to add the mapping to
967 * @vma: the vm area in which the mapping is added 969 * @vma: the vm area in which the mapping is added
968 * @address: the user virtual address mapped 970 * @address: the user virtual address mapped
969 * 971 *
970 * The caller needs to hold the pte lock, and the page must be locked in 972 * The caller needs to hold the pte lock, and the page must be locked in
971 * the anon_vma case: to serialize mapping,index checking after setting, 973 * the anon_vma case: to serialize mapping,index checking after setting,
972 * and to ensure that PageAnon is not being upgraded racily to PageKsm 974 * and to ensure that PageAnon is not being upgraded racily to PageKsm
973 * (but PageKsm is never downgraded to PageAnon). 975 * (but PageKsm is never downgraded to PageAnon).
974 */ 976 */
975 void page_add_anon_rmap(struct page *page, 977 void page_add_anon_rmap(struct page *page,
976 struct vm_area_struct *vma, unsigned long address) 978 struct vm_area_struct *vma, unsigned long address)
977 { 979 {
978 do_page_add_anon_rmap(page, vma, address, 0); 980 do_page_add_anon_rmap(page, vma, address, 0);
979 } 981 }
980 982
981 /* 983 /*
982 * Special version of the above for do_swap_page, which often runs 984 * Special version of the above for do_swap_page, which often runs
983 * into pages that are exclusively owned by the current process. 985 * into pages that are exclusively owned by the current process.
984 * Everybody else should continue to use page_add_anon_rmap above. 986 * Everybody else should continue to use page_add_anon_rmap above.
985 */ 987 */
986 void do_page_add_anon_rmap(struct page *page, 988 void do_page_add_anon_rmap(struct page *page,
987 struct vm_area_struct *vma, unsigned long address, int exclusive) 989 struct vm_area_struct *vma, unsigned long address, int exclusive)
988 { 990 {
989 int first = atomic_inc_and_test(&page->_mapcount); 991 int first = atomic_inc_and_test(&page->_mapcount);
990 if (first) { 992 if (first) {
991 /* 993 /*
992 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 994 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
993 * these counters are not modified in interrupt context, and 995 * these counters are not modified in interrupt context, and
994 * pte lock(a spinlock) is held, which implies preemption 996 * pte lock(a spinlock) is held, which implies preemption
995 * disabled. 997 * disabled.
996 */ 998 */
997 if (PageTransHuge(page)) 999 if (PageTransHuge(page))
998 __inc_zone_page_state(page, 1000 __inc_zone_page_state(page,
999 NR_ANON_TRANSPARENT_HUGEPAGES); 1001 NR_ANON_TRANSPARENT_HUGEPAGES);
1000 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1002 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1001 hpage_nr_pages(page)); 1003 hpage_nr_pages(page));
1002 } 1004 }
1003 if (unlikely(PageKsm(page))) 1005 if (unlikely(PageKsm(page)))
1004 return; 1006 return;
1005 1007
1006 VM_BUG_ON_PAGE(!PageLocked(page), page); 1008 VM_BUG_ON_PAGE(!PageLocked(page), page);
1007 /* address might be in next vma when migration races vma_adjust */ 1009 /* address might be in next vma when migration races vma_adjust */
1008 if (first) 1010 if (first)
1009 __page_set_anon_rmap(page, vma, address, exclusive); 1011 __page_set_anon_rmap(page, vma, address, exclusive);
1010 else 1012 else
1011 __page_check_anon_rmap(page, vma, address); 1013 __page_check_anon_rmap(page, vma, address);
1012 } 1014 }
1013 1015
1014 /** 1016 /**
1015 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 1017 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
1016 * @page: the page to add the mapping to 1018 * @page: the page to add the mapping to
1017 * @vma: the vm area in which the mapping is added 1019 * @vma: the vm area in which the mapping is added
1018 * @address: the user virtual address mapped 1020 * @address: the user virtual address mapped
1019 * 1021 *
1020 * Same as page_add_anon_rmap but must only be called on *new* pages. 1022 * Same as page_add_anon_rmap but must only be called on *new* pages.
1021 * This means the inc-and-test can be bypassed. 1023 * This means the inc-and-test can be bypassed.
1022 * Page does not have to be locked. 1024 * Page does not have to be locked.
1023 */ 1025 */
1024 void page_add_new_anon_rmap(struct page *page, 1026 void page_add_new_anon_rmap(struct page *page,
1025 struct vm_area_struct *vma, unsigned long address) 1027 struct vm_area_struct *vma, unsigned long address)
1026 { 1028 {
1027 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 1029 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1028 SetPageSwapBacked(page); 1030 SetPageSwapBacked(page);
1029 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1031 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
1030 if (PageTransHuge(page)) 1032 if (PageTransHuge(page))
1031 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1033 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1032 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1034 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1033 hpage_nr_pages(page)); 1035 hpage_nr_pages(page));
1034 __page_set_anon_rmap(page, vma, address, 1); 1036 __page_set_anon_rmap(page, vma, address, 1);
1035 } 1037 }
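
A heavily simplified fault-path sketch showing where page_add_new_anon_rmap() fits; memcg charging, pte installation, LRU insertion, locking and error handling are all omitted, and the helper name is invented:

static struct page *install_new_anon_page(struct vm_area_struct *vma,
					  unsigned long address)
{
	struct page *page;

	/* Make sure vma->anon_vma exists before linking a page to it. */
	if (unlikely(anon_vma_prepare(vma)))
		return NULL;

	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		return NULL;

	/* A brand-new page is not visible to anyone else yet: no page lock needed. */
	page_add_new_anon_rmap(page, vma, address);

	/* ...the real fault path now sets the pte and puts the page on the LRU... */
	return page;
}
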
1036 1038
1037 /** 1039 /**
1038 * page_add_file_rmap - add pte mapping to a file page 1040 * page_add_file_rmap - add pte mapping to a file page
1039 * @page: the page to add the mapping to 1041 * @page: the page to add the mapping to
1040 * 1042 *
1041 * The caller needs to hold the pte lock. 1043 * The caller needs to hold the pte lock.
1042 */ 1044 */
1043 void page_add_file_rmap(struct page *page) 1045 void page_add_file_rmap(struct page *page)
1044 { 1046 {
1045 struct mem_cgroup *memcg; 1047 struct mem_cgroup *memcg;
1046 unsigned long flags; 1048 unsigned long flags;
1047 bool locked; 1049 bool locked;
1048 1050
1049 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1051 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
1050 if (atomic_inc_and_test(&page->_mapcount)) { 1052 if (atomic_inc_and_test(&page->_mapcount)) {
1051 __inc_zone_page_state(page, NR_FILE_MAPPED); 1053 __inc_zone_page_state(page, NR_FILE_MAPPED);
1052 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1054 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1053 } 1055 }
1054 mem_cgroup_end_page_stat(memcg, locked, flags); 1056 mem_cgroup_end_page_stat(memcg, locked, flags);
1055 } 1057 }
1056 1058
1057 static void page_remove_file_rmap(struct page *page) 1059 static void page_remove_file_rmap(struct page *page)
1058 { 1060 {
1059 struct mem_cgroup *memcg; 1061 struct mem_cgroup *memcg;
1060 unsigned long flags; 1062 unsigned long flags;
1061 bool locked; 1063 bool locked;
1062 1064
1063 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1065 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
1064 1066
1065 /* page still mapped by someone else? */ 1067 /* page still mapped by someone else? */
1066 if (!atomic_add_negative(-1, &page->_mapcount)) 1068 if (!atomic_add_negative(-1, &page->_mapcount))
1067 goto out; 1069 goto out;
1068 1070
1069 /* Hugepages are not counted in NR_FILE_MAPPED for now. */ 1071 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1070 if (unlikely(PageHuge(page))) 1072 if (unlikely(PageHuge(page)))
1071 goto out; 1073 goto out;
1072 1074
1073 /* 1075 /*
1074 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1076 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1075 * these counters are not modified in interrupt context, and 1077 * these counters are not modified in interrupt context, and
1076 * pte lock(a spinlock) is held, which implies preemption disabled. 1078 * pte lock(a spinlock) is held, which implies preemption disabled.
1077 */ 1079 */
1078 __dec_zone_page_state(page, NR_FILE_MAPPED); 1080 __dec_zone_page_state(page, NR_FILE_MAPPED);
1079 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1081 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1080 1082
1081 if (unlikely(PageMlocked(page))) 1083 if (unlikely(PageMlocked(page)))
1082 clear_page_mlock(page); 1084 clear_page_mlock(page);
1083 out: 1085 out:
1084 mem_cgroup_end_page_stat(memcg, locked, flags); 1086 mem_cgroup_end_page_stat(memcg, locked, flags);
1085 } 1087 }
1086 1088
1087 /** 1089 /**
1088 * page_remove_rmap - take down pte mapping from a page 1090 * page_remove_rmap - take down pte mapping from a page
1089 * @page: page to remove mapping from 1091 * @page: page to remove mapping from
1090 * 1092 *
1091 * The caller needs to hold the pte lock. 1093 * The caller needs to hold the pte lock.
1092 */ 1094 */
1093 void page_remove_rmap(struct page *page) 1095 void page_remove_rmap(struct page *page)
1094 { 1096 {
1095 if (!PageAnon(page)) { 1097 if (!PageAnon(page)) {
1096 page_remove_file_rmap(page); 1098 page_remove_file_rmap(page);
1097 return; 1099 return;
1098 } 1100 }
1099 1101
1100 /* page still mapped by someone else? */ 1102 /* page still mapped by someone else? */
1101 if (!atomic_add_negative(-1, &page->_mapcount)) 1103 if (!atomic_add_negative(-1, &page->_mapcount))
1102 return; 1104 return;
1103 1105
1104 /* Hugepages are not counted in NR_ANON_PAGES for now. */ 1106 /* Hugepages are not counted in NR_ANON_PAGES for now. */
1105 if (unlikely(PageHuge(page))) 1107 if (unlikely(PageHuge(page)))
1106 return; 1108 return;
1107 1109
1108 /* 1110 /*
1109 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1111 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1110 * these counters are not modified in interrupt context, and 1112 * these counters are not modified in interrupt context, and
1111 * pte lock(a spinlock) is held, which implies preemption disabled. 1113 * pte lock(a spinlock) is held, which implies preemption disabled.
1112 */ 1114 */
1113 if (PageTransHuge(page)) 1115 if (PageTransHuge(page))
1114 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1116 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1115 1117
1116 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1118 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1117 -hpage_nr_pages(page)); 1119 -hpage_nr_pages(page));
1118 1120
1119 if (unlikely(PageMlocked(page))) 1121 if (unlikely(PageMlocked(page)))
1120 clear_page_mlock(page); 1122 clear_page_mlock(page);
1121 1123
1122 /* 1124 /*
1123 * It would be tidy to reset the PageAnon mapping here, 1125 * It would be tidy to reset the PageAnon mapping here,
1124 * but that might overwrite a racing page_add_anon_rmap 1126 * but that might overwrite a racing page_add_anon_rmap
1125 * which increments mapcount after us but sets mapping 1127 * which increments mapcount after us but sets mapping
1126 * before us: so leave the reset to free_hot_cold_page, 1128 * before us: so leave the reset to free_hot_cold_page,
1127 * and remember that it's only reliable while mapped. 1129 * and remember that it's only reliable while mapped.
1128 * Leaving it set also helps swapoff to reinstate ptes 1130 * Leaving it set also helps swapoff to reinstate ptes
1129 * faster for those pages still in swapcache. 1131 * faster for those pages still in swapcache.
1130 */ 1132 */
1131 } 1133 }
1132 1134
1133 /* 1135 /*
1134 * @arg: an enum ttu_flags value is passed via this argument 1136 * @arg: an enum ttu_flags value is passed via this argument
1135 */ 1137 */
1136 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1138 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1137 unsigned long address, void *arg) 1139 unsigned long address, void *arg)
1138 { 1140 {
1139 struct mm_struct *mm = vma->vm_mm; 1141 struct mm_struct *mm = vma->vm_mm;
1140 pte_t *pte; 1142 pte_t *pte;
1141 pte_t pteval; 1143 pte_t pteval;
1142 spinlock_t *ptl; 1144 spinlock_t *ptl;
1143 int ret = SWAP_AGAIN; 1145 int ret = SWAP_AGAIN;
1144 enum ttu_flags flags = (enum ttu_flags)arg; 1146 enum ttu_flags flags = (enum ttu_flags)arg;
1145 1147
1146 pte = page_check_address(page, mm, address, &ptl, 0); 1148 pte = page_check_address(page, mm, address, &ptl, 0);
1147 if (!pte) 1149 if (!pte)
1148 goto out; 1150 goto out;
1149 1151
1150 /* 1152 /*
1151 * If the page is mlock()d, we cannot swap it out. 1153 * If the page is mlock()d, we cannot swap it out.
1152 * If it's recently referenced (perhaps page_referenced 1154 * If it's recently referenced (perhaps page_referenced
1153 * skipped over this mm) then we should reactivate it. 1155 * skipped over this mm) then we should reactivate it.
1154 */ 1156 */
1155 if (!(flags & TTU_IGNORE_MLOCK)) { 1157 if (!(flags & TTU_IGNORE_MLOCK)) {
1156 if (vma->vm_flags & VM_LOCKED) 1158 if (vma->vm_flags & VM_LOCKED)
1157 goto out_mlock; 1159 goto out_mlock;
1158 1160
1159 if (flags & TTU_MUNLOCK) 1161 if (flags & TTU_MUNLOCK)
1160 goto out_unmap; 1162 goto out_unmap;
1161 } 1163 }
1162 if (!(flags & TTU_IGNORE_ACCESS)) { 1164 if (!(flags & TTU_IGNORE_ACCESS)) {
1163 if (ptep_clear_flush_young_notify(vma, address, pte)) { 1165 if (ptep_clear_flush_young_notify(vma, address, pte)) {
1164 ret = SWAP_FAIL; 1166 ret = SWAP_FAIL;
1165 goto out_unmap; 1167 goto out_unmap;
1166 } 1168 }
1167 } 1169 }
1168 1170
1169 /* Nuke the page table entry. */ 1171 /* Nuke the page table entry. */
1170 flush_cache_page(vma, address, page_to_pfn(page)); 1172 flush_cache_page(vma, address, page_to_pfn(page));
1171 pteval = ptep_clear_flush(vma, address, pte); 1173 pteval = ptep_clear_flush(vma, address, pte);
1172 1174
1173 /* Move the dirty bit to the physical page now the pte is gone. */ 1175 /* Move the dirty bit to the physical page now the pte is gone. */
1174 if (pte_dirty(pteval)) 1176 if (pte_dirty(pteval))
1175 set_page_dirty(page); 1177 set_page_dirty(page);
1176 1178
1177 /* Update high watermark before we lower rss */ 1179 /* Update high watermark before we lower rss */
1178 update_hiwater_rss(mm); 1180 update_hiwater_rss(mm);
1179 1181
1180 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1182 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1181 if (!PageHuge(page)) { 1183 if (!PageHuge(page)) {
1182 if (PageAnon(page)) 1184 if (PageAnon(page))
1183 dec_mm_counter(mm, MM_ANONPAGES); 1185 dec_mm_counter(mm, MM_ANONPAGES);
1184 else 1186 else
1185 dec_mm_counter(mm, MM_FILEPAGES); 1187 dec_mm_counter(mm, MM_FILEPAGES);
1186 } 1188 }
1187 set_pte_at(mm, address, pte, 1189 set_pte_at(mm, address, pte,
1188 swp_entry_to_pte(make_hwpoison_entry(page))); 1190 swp_entry_to_pte(make_hwpoison_entry(page)));
1189 } else if (pte_unused(pteval)) { 1191 } else if (pte_unused(pteval)) {
1190 /* 1192 /*
1191 * The guest indicated that the page content is of no 1193 * The guest indicated that the page content is of no
1192 * interest anymore. Simply discard the pte, vmscan 1194 * interest anymore. Simply discard the pte, vmscan
1193 * will take care of the rest. 1195 * will take care of the rest.
1194 */ 1196 */
1195 if (PageAnon(page)) 1197 if (PageAnon(page))
1196 dec_mm_counter(mm, MM_ANONPAGES); 1198 dec_mm_counter(mm, MM_ANONPAGES);
1197 else 1199 else
1198 dec_mm_counter(mm, MM_FILEPAGES); 1200 dec_mm_counter(mm, MM_FILEPAGES);
1199 } else if (PageAnon(page)) { 1201 } else if (PageAnon(page)) {
1200 swp_entry_t entry = { .val = page_private(page) }; 1202 swp_entry_t entry = { .val = page_private(page) };
1201 pte_t swp_pte; 1203 pte_t swp_pte;
1202 1204
1203 if (PageSwapCache(page)) { 1205 if (PageSwapCache(page)) {
1204 /* 1206 /*
1205 * Store the swap location in the pte. 1207 * Store the swap location in the pte.
1206 * See handle_pte_fault() ... 1208 * See handle_pte_fault() ...
1207 */ 1209 */
1208 if (swap_duplicate(entry) < 0) { 1210 if (swap_duplicate(entry) < 0) {
1209 set_pte_at(mm, address, pte, pteval); 1211 set_pte_at(mm, address, pte, pteval);
1210 ret = SWAP_FAIL; 1212 ret = SWAP_FAIL;
1211 goto out_unmap; 1213 goto out_unmap;
1212 } 1214 }
1213 if (list_empty(&mm->mmlist)) { 1215 if (list_empty(&mm->mmlist)) {
1214 spin_lock(&mmlist_lock); 1216 spin_lock(&mmlist_lock);
1215 if (list_empty(&mm->mmlist)) 1217 if (list_empty(&mm->mmlist))
1216 list_add(&mm->mmlist, &init_mm.mmlist); 1218 list_add(&mm->mmlist, &init_mm.mmlist);
1217 spin_unlock(&mmlist_lock); 1219 spin_unlock(&mmlist_lock);
1218 } 1220 }
1219 dec_mm_counter(mm, MM_ANONPAGES); 1221 dec_mm_counter(mm, MM_ANONPAGES);
1220 inc_mm_counter(mm, MM_SWAPENTS); 1222 inc_mm_counter(mm, MM_SWAPENTS);
1221 } else if (IS_ENABLED(CONFIG_MIGRATION)) { 1223 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
1222 /* 1224 /*
1223 * Store the pfn of the page in a special migration 1225 * Store the pfn of the page in a special migration
1224 * pte. do_swap_page() will wait until the migration 1226 * pte. do_swap_page() will wait until the migration
1225 * pte is removed and then restart fault handling. 1227 * pte is removed and then restart fault handling.
1226 */ 1228 */
1227 BUG_ON(!(flags & TTU_MIGRATION)); 1229 BUG_ON(!(flags & TTU_MIGRATION));
1228 entry = make_migration_entry(page, pte_write(pteval)); 1230 entry = make_migration_entry(page, pte_write(pteval));
1229 } 1231 }
1230 swp_pte = swp_entry_to_pte(entry); 1232 swp_pte = swp_entry_to_pte(entry);
1231 if (pte_soft_dirty(pteval)) 1233 if (pte_soft_dirty(pteval))
1232 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1234 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1233 set_pte_at(mm, address, pte, swp_pte); 1235 set_pte_at(mm, address, pte, swp_pte);
1234 BUG_ON(pte_file(*pte)); 1236 BUG_ON(pte_file(*pte));
1235 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1237 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1236 (flags & TTU_MIGRATION)) { 1238 (flags & TTU_MIGRATION)) {
1237 /* Establish migration entry for a file page */ 1239 /* Establish migration entry for a file page */
1238 swp_entry_t entry; 1240 swp_entry_t entry;
1239 entry = make_migration_entry(page, pte_write(pteval)); 1241 entry = make_migration_entry(page, pte_write(pteval));
1240 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1242 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1241 } else 1243 } else
1242 dec_mm_counter(mm, MM_FILEPAGES); 1244 dec_mm_counter(mm, MM_FILEPAGES);
1243 1245
1244 page_remove_rmap(page); 1246 page_remove_rmap(page);
1245 page_cache_release(page); 1247 page_cache_release(page);
1246 1248
1247 out_unmap: 1249 out_unmap:
1248 pte_unmap_unlock(pte, ptl); 1250 pte_unmap_unlock(pte, ptl);
1249 if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) 1251 if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
1250 mmu_notifier_invalidate_page(mm, address); 1252 mmu_notifier_invalidate_page(mm, address);
1251 out: 1253 out:
1252 return ret; 1254 return ret;
1253 1255
1254 out_mlock: 1256 out_mlock:
1255 pte_unmap_unlock(pte, ptl); 1257 pte_unmap_unlock(pte, ptl);
1256 1258
1257 1259
1258 /* 1260 /*
1259 * We need mmap_sem locking; otherwise the VM_LOCKED check gives an 1261 * We need mmap_sem locking; otherwise the VM_LOCKED check gives an
1260 * unstable, racy result. Also, we can't wait here because 1262 * unstable, racy result. Also, we can't wait here because
1261 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. 1263 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
1262 * If the trylock fails, the page remains on the evictable LRU and 1264 * If the trylock fails, the page remains on the evictable LRU and
1263 * vmscan can later retry moving it to the unevictable LRU if the 1265 * vmscan can later retry moving it to the unevictable LRU if the
1264 * page is actually mlocked. 1266 * page is actually mlocked.
1265 */ 1267 */
1266 if (down_read_trylock(&vma->vm_mm->mmap_sem)) { 1268 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1267 if (vma->vm_flags & VM_LOCKED) { 1269 if (vma->vm_flags & VM_LOCKED) {
1268 mlock_vma_page(page); 1270 mlock_vma_page(page);
1269 ret = SWAP_MLOCK; 1271 ret = SWAP_MLOCK;
1270 } 1272 }
1271 up_read(&vma->vm_mm->mmap_sem); 1273 up_read(&vma->vm_mm->mmap_sem);
1272 } 1274 }
1273 return ret; 1275 return ret;
1274 } 1276 }
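
For context, vmscan-style callers reach this through try_to_unmap() and act on the SWAP_* result codes; a hypothetical sketch, assuming the page is already locked by the caller:

static bool unmap_for_reclaim(struct page *page)
{
	switch (try_to_unmap(page, TTU_UNMAP)) {
	case SWAP_SUCCESS:
		return true;	/* every pte is gone, the page can be freed */
	case SWAP_MLOCK:
		return false;	/* mlocked somewhere, keep the page */
	case SWAP_AGAIN:
	case SWAP_FAIL:
	default:
		return false;	/* still mapped or recently referenced */
	}
}
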
1275 1277
1276 /* 1278 /*
1277 * objrmap doesn't work for nonlinear VMAs because the assumption that 1279 * objrmap doesn't work for nonlinear VMAs because the assumption that
1278 * offset-into-file correlates with offset-into-virtual-addresses does not hold. 1280 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
1279 * Consequently, given a particular page and its ->index, we cannot locate the 1281 * Consequently, given a particular page and its ->index, we cannot locate the
1280 * ptes which are mapping that page without an exhaustive linear search. 1282 * ptes which are mapping that page without an exhaustive linear search.
1281 * 1283 *
1282 * So what this code does is a mini "virtual scan" of each nonlinear VMA which 1284 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
1283 * maps the file to which the target page belongs. The ->vm_private_data field 1285 * maps the file to which the target page belongs. The ->vm_private_data field
1284 * holds the current cursor into that scan. Successive searches will circulate 1286 * holds the current cursor into that scan. Successive searches will circulate
1285 * around the vma's virtual address space. 1287 * around the vma's virtual address space.
1286 * 1288 *
1287 * So as more replacement pressure is applied to the pages in a nonlinear VMA, 1289 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
1288 * more scanning pressure is placed against them as well. Eventually pages 1290 * more scanning pressure is placed against them as well. Eventually pages
1289 * will become fully unmapped and are eligible for eviction. 1291 * will become fully unmapped and are eligible for eviction.
1290 * 1292 *
1291 * For very sparsely populated VMAs this is a little inefficient - chances are 1293 * For very sparsely populated VMAs this is a little inefficient - chances are
1292 * there won't be many ptes located within the scan cluster. In this case 1294 * there won't be many ptes located within the scan cluster. In this case
1293 * maybe we could scan further - to the end of the pte page, perhaps. 1295 * maybe we could scan further - to the end of the pte page, perhaps.
1294 * 1296 *
1295 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can 1297 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
1296 * acquire it without blocking. If vma locked, mlock the pages in the cluster, 1298 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
1297 * rather than unmapping them. If we encounter the "check_page" that vmscan is 1299 * rather than unmapping them. If we encounter the "check_page" that vmscan is
1298 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. 1300 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
1299 */ 1301 */
1300 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 1302 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
1301 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 1303 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
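
Illustrative arithmetic for the cluster macros above, assuming 4 KiB pages and a 2 MiB PMD:

/*
 * CLUSTER_SIZE = min(32 * 0x1000, 0x200000) = 0x20000 (128 KiB)
 * CLUSTER_MASK = ~0x1ffff
 *
 * (vma->vm_start + cursor) = 0x23456  ->  cluster start = 0x20000
 * end = 0x20000 + CLUSTER_SIZE = 0x40000, then clamped to the vma bounds.
 */
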
1302 1304
1303 static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, 1305 static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1304 struct vm_area_struct *vma, struct page *check_page) 1306 struct vm_area_struct *vma, struct page *check_page)
1305 { 1307 {
1306 struct mm_struct *mm = vma->vm_mm; 1308 struct mm_struct *mm = vma->vm_mm;
1307 pmd_t *pmd; 1309 pmd_t *pmd;
1308 pte_t *pte; 1310 pte_t *pte;
1309 pte_t pteval; 1311 pte_t pteval;
1310 spinlock_t *ptl; 1312 spinlock_t *ptl;
1311 struct page *page; 1313 struct page *page;
1312 unsigned long address; 1314 unsigned long address;
1313 unsigned long mmun_start; /* For mmu_notifiers */ 1315 unsigned long mmun_start; /* For mmu_notifiers */
1314 unsigned long mmun_end; /* For mmu_notifiers */ 1316 unsigned long mmun_end; /* For mmu_notifiers */
1315 unsigned long end; 1317 unsigned long end;
1316 int ret = SWAP_AGAIN; 1318 int ret = SWAP_AGAIN;
1317 int locked_vma = 0; 1319 int locked_vma = 0;
1318 1320
1319 address = (vma->vm_start + cursor) & CLUSTER_MASK; 1321 address = (vma->vm_start + cursor) & CLUSTER_MASK;
1320 end = address + CLUSTER_SIZE; 1322 end = address + CLUSTER_SIZE;
1321 if (address < vma->vm_start) 1323 if (address < vma->vm_start)
1322 address = vma->vm_start; 1324 address = vma->vm_start;
1323 if (end > vma->vm_end) 1325 if (end > vma->vm_end)
1324 end = vma->vm_end; 1326 end = vma->vm_end;
1325 1327
1326 pmd = mm_find_pmd(mm, address); 1328 pmd = mm_find_pmd(mm, address);
1327 if (!pmd) 1329 if (!pmd)
1328 return ret; 1330 return ret;
1329 1331
1330 mmun_start = address; 1332 mmun_start = address;
1331 mmun_end = end; 1333 mmun_end = end;
1332 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1334 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1333 1335
1334 /* 1336 /*
1335 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1337 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1336 * keep the sem while scanning the cluster for mlocking pages. 1338 * keep the sem while scanning the cluster for mlocking pages.
1337 */ 1339 */
1338 if (down_read_trylock(&vma->vm_mm->mmap_sem)) { 1340 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1339 locked_vma = (vma->vm_flags & VM_LOCKED); 1341 locked_vma = (vma->vm_flags & VM_LOCKED);
1340 if (!locked_vma) 1342 if (!locked_vma)
1341 up_read(&vma->vm_mm->mmap_sem); /* don't need it */ 1343 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
1342 } 1344 }
1343 1345
1344 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 1346 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1345 1347
1346 /* Update high watermark before we lower rss */ 1348 /* Update high watermark before we lower rss */
1347 update_hiwater_rss(mm); 1349 update_hiwater_rss(mm);
1348 1350
1349 for (; address < end; pte++, address += PAGE_SIZE) { 1351 for (; address < end; pte++, address += PAGE_SIZE) {
1350 if (!pte_present(*pte)) 1352 if (!pte_present(*pte))
1351 continue; 1353 continue;
1352 page = vm_normal_page(vma, address, *pte); 1354 page = vm_normal_page(vma, address, *pte);
1353 BUG_ON(!page || PageAnon(page)); 1355 BUG_ON(!page || PageAnon(page));
1354 1356
1355 if (locked_vma) { 1357 if (locked_vma) {
1356 if (page == check_page) { 1358 if (page == check_page) {
1357 /* we know we have check_page locked */ 1359 /* we know we have check_page locked */
1358 mlock_vma_page(page); 1360 mlock_vma_page(page);
1359 ret = SWAP_MLOCK; 1361 ret = SWAP_MLOCK;
1360 } else if (trylock_page(page)) { 1362 } else if (trylock_page(page)) {
1361 /* 1363 /*
1362 * If we can lock the page, perform mlock. 1364 * If we can lock the page, perform mlock.
1363 * Otherwise leave the page alone, it will be 1365 * Otherwise leave the page alone, it will be
1364 * encountered again later. 1366 * encountered again later.
1365 */ 1367 */
1366 mlock_vma_page(page); 1368 mlock_vma_page(page);
1367 unlock_page(page); 1369 unlock_page(page);
1368 } 1370 }
1369 continue; /* don't unmap */ 1371 continue; /* don't unmap */
1370 } 1372 }
1371 1373
1372 /* 1374 /*
1373 * No need for _notify because we're within an 1375 * No need for _notify because we're within an
1374 * mmu_notifier_invalidate_range_{start|end} scope. 1376 * mmu_notifier_invalidate_range_{start|end} scope.
1375 */ 1377 */
1376 if (ptep_clear_flush_young(vma, address, pte)) 1378 if (ptep_clear_flush_young(vma, address, pte))
1377 continue; 1379 continue;
1378 1380
1379 /* Nuke the page table entry. */ 1381 /* Nuke the page table entry. */
1380 flush_cache_page(vma, address, pte_pfn(*pte)); 1382 flush_cache_page(vma, address, pte_pfn(*pte));
1381 pteval = ptep_clear_flush(vma, address, pte); 1383 pteval = ptep_clear_flush(vma, address, pte);
1382 1384
1383 /* If nonlinear, store the file page offset in the pte. */ 1385 /* If nonlinear, store the file page offset in the pte. */
1384 if (page->index != linear_page_index(vma, address)) { 1386 if (page->index != linear_page_index(vma, address)) {
1385 pte_t ptfile = pgoff_to_pte(page->index); 1387 pte_t ptfile = pgoff_to_pte(page->index);
1386 if (pte_soft_dirty(pteval)) 1388 if (pte_soft_dirty(pteval))
1387 ptfile = pte_file_mksoft_dirty(ptfile); 1389 ptfile = pte_file_mksoft_dirty(ptfile);
1388 set_pte_at(mm, address, pte, ptfile); 1390 set_pte_at(mm, address, pte, ptfile);
1389 } 1391 }
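		/*
		 * Preserving page->index in a file pte here lets a later
		 * fault at this address find the same file offset again;
		 * for a nonlinear mapping that offset cannot be recomputed
		 * from vma->vm_pgoff and the faulting address alone.
		 */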
1390 1392
1391 /* Move the dirty bit to the physical page now the pte is gone. */ 1393 /* Move the dirty bit to the physical page now the pte is gone. */
1392 if (pte_dirty(pteval)) 1394 if (pte_dirty(pteval))
1393 set_page_dirty(page); 1395 set_page_dirty(page);
1394 1396
1395 page_remove_rmap(page); 1397 page_remove_rmap(page);
1396 page_cache_release(page); 1398 page_cache_release(page);
1397 dec_mm_counter(mm, MM_FILEPAGES); 1399 dec_mm_counter(mm, MM_FILEPAGES);
1398 (*mapcount)--; 1400 (*mapcount)--;
1399 } 1401 }
1400 pte_unmap_unlock(pte - 1, ptl); 1402 pte_unmap_unlock(pte - 1, ptl);
1401 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1403 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1402 if (locked_vma) 1404 if (locked_vma)
1403 up_read(&vma->vm_mm->mmap_sem); 1405 up_read(&vma->vm_mm->mmap_sem);
1404 return ret; 1406 return ret;
1405 } 1407 }
1406 1408
1407 static int try_to_unmap_nonlinear(struct page *page, 1409 static int try_to_unmap_nonlinear(struct page *page,
1408 struct address_space *mapping, void *arg) 1410 struct address_space *mapping, void *arg)
1409 { 1411 {
1410 struct vm_area_struct *vma; 1412 struct vm_area_struct *vma;
1411 int ret = SWAP_AGAIN; 1413 int ret = SWAP_AGAIN;
1412 unsigned long cursor; 1414 unsigned long cursor;
1413 unsigned long max_nl_cursor = 0; 1415 unsigned long max_nl_cursor = 0;
1414 unsigned long max_nl_size = 0; 1416 unsigned long max_nl_size = 0;
1415 unsigned int mapcount; 1417 unsigned int mapcount;
1416 1418
1417 list_for_each_entry(vma, 1419 list_for_each_entry(vma,
1418 &mapping->i_mmap_nonlinear, shared.nonlinear) { 1420 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1419 1421
1420 cursor = (unsigned long) vma->vm_private_data; 1422 cursor = (unsigned long) vma->vm_private_data;
1421 if (cursor > max_nl_cursor) 1423 if (cursor > max_nl_cursor)
1422 max_nl_cursor = cursor; 1424 max_nl_cursor = cursor;
1423 cursor = vma->vm_end - vma->vm_start; 1425 cursor = vma->vm_end - vma->vm_start;
1424 if (cursor > max_nl_size) 1426 if (cursor > max_nl_size)
1425 max_nl_size = cursor; 1427 max_nl_size = cursor;
1426 } 1428 }
1427 1429
1428 if (max_nl_size == 0) { /* all nonlinears locked or reserved? */ 1430 if (max_nl_size == 0) { /* all nonlinears locked or reserved? */
1429 return SWAP_FAIL; 1431 return SWAP_FAIL;
1430 } 1432 }
1431 1433
1432 /* 1434 /*
1433 * We don't try to search for this page in the nonlinear vmas, 1435 * We don't try to search for this page in the nonlinear vmas,
1434 * and page_referenced wouldn't have found it anyway. Instead 1436 * and page_referenced wouldn't have found it anyway. Instead
1435 * just walk the nonlinear vmas trying to age and unmap some. 1437 * just walk the nonlinear vmas trying to age and unmap some.
1436 * The mapcount of the page we came in with is irrelevant, 1438 * The mapcount of the page we came in with is irrelevant,
1437 * but even so, use it as a guide to how hard we should try. 1439 * but even so, use it as a guide to how hard we should try.
1438 */ 1440 */
1439 mapcount = page_mapcount(page); 1441 mapcount = page_mapcount(page);
1440 if (!mapcount) 1442 if (!mapcount)
1441 return ret; 1443 return ret;
1442 1444
1443 cond_resched(); 1445 cond_resched();
1444 1446
1445 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1447 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1446 if (max_nl_cursor == 0) 1448 if (max_nl_cursor == 0)
1447 max_nl_cursor = CLUSTER_SIZE; 1449 max_nl_cursor = CLUSTER_SIZE;
1448 1450
1449 do { 1451 do {
1450 list_for_each_entry(vma, 1452 list_for_each_entry(vma,
1451 &mapping->i_mmap_nonlinear, shared.nonlinear) { 1453 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1452 1454
1453 cursor = (unsigned long) vma->vm_private_data; 1455 cursor = (unsigned long) vma->vm_private_data;
1454 while (cursor < max_nl_cursor && 1456 while (cursor < max_nl_cursor &&
1455 cursor < vma->vm_end - vma->vm_start) { 1457 cursor < vma->vm_end - vma->vm_start) {
1456 if (try_to_unmap_cluster(cursor, &mapcount, 1458 if (try_to_unmap_cluster(cursor, &mapcount,
1457 vma, page) == SWAP_MLOCK) 1459 vma, page) == SWAP_MLOCK)
1458 ret = SWAP_MLOCK; 1460 ret = SWAP_MLOCK;
1459 cursor += CLUSTER_SIZE; 1461 cursor += CLUSTER_SIZE;
1460 vma->vm_private_data = (void *) cursor; 1462 vma->vm_private_data = (void *) cursor;
1461 if ((int)mapcount <= 0) 1463 if ((int)mapcount <= 0)
1462 return ret; 1464 return ret;
1463 } 1465 }
1464 vma->vm_private_data = (void *) max_nl_cursor; 1466 vma->vm_private_data = (void *) max_nl_cursor;
1465 } 1467 }
1466 cond_resched(); 1468 cond_resched();
1467 max_nl_cursor += CLUSTER_SIZE; 1469 max_nl_cursor += CLUSTER_SIZE;
1468 } while (max_nl_cursor <= max_nl_size); 1470 } while (max_nl_cursor <= max_nl_size);
1469 1471
1470 /* 1472 /*
1471 * Don't loop forever (perhaps all the remaining pages are 1473 * Don't loop forever (perhaps all the remaining pages are
1472 * in locked vmas). Reset the cursor on all unreserved nonlinear 1474 * in locked vmas). Reset the cursor on all unreserved nonlinear
1473 * vmas, forgetting which ones it had fallen behind on. 1475 * vmas, forgetting which ones it had fallen behind on.
1474 */ 1476 */
1475 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) 1477 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1476 vma->vm_private_data = NULL; 1478 vma->vm_private_data = NULL;
1477 1479
1478 return ret; 1480 return ret;
1479 } 1481 }
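
/*
 * Cursor ratchet, by (hypothetical) example with the 128 KiB
 * CLUSTER_SIZE from above: for a single 1 MiB nonlinear vma whose
 * saved cursor is 0, max_nl_size rounds to 1 MiB and max_nl_cursor
 * starts at 128 KiB.  The first pass over the list unmaps one
 * cluster and parks the vma's cursor at 128 KiB; each further pass
 * raises max_nl_cursor by another 128 KiB and resumes from the
 * parked cursor, so successive passes (and successive calls, via
 * vm_private_data) walk forward through the vma instead of
 * rescanning its start, until mapcount drops to zero or the whole
 * range has been covered.
 */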
1480 1482
1481 bool is_vma_temporary_stack(struct vm_area_struct *vma) 1483 bool is_vma_temporary_stack(struct vm_area_struct *vma)
1482 { 1484 {
1483 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1485 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1484 1486
1485 if (!maybe_stack) 1487 if (!maybe_stack)
1486 return false; 1488 return false;
1487 1489
1488 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == 1490 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1489 VM_STACK_INCOMPLETE_SETUP) 1491 VM_STACK_INCOMPLETE_SETUP)
1490 return true; 1492 return true;
1491 1493
1492 return false; 1494 return false;
1493 } 1495 }
1494 1496
1495 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) 1497 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1496 { 1498 {
1497 return is_vma_temporary_stack(vma); 1499 return is_vma_temporary_stack(vma);
1498 } 1500 }
1499 1501
1500 static int page_not_mapped(struct page *page) 1502 static int page_not_mapped(struct page *page)
1501 { 1503 {
1502 return !page_mapped(page); 1504 return !page_mapped(page);
1503 } 1505 }
1504 1506
1505 /** 1507 /**
1506 * try_to_unmap - try to remove all page table mappings to a page 1508 * try_to_unmap - try to remove all page table mappings to a page
1507 * @page: the page to get unmapped 1509 * @page: the page to get unmapped
1508 * @flags: action and flags 1510 * @flags: action and flags
1509 * 1511 *
1510 * Tries to remove all the page table entries which are mapping this 1512 * Tries to remove all the page table entries which are mapping this
1511 * page, used in the pageout path. Caller must hold the page lock. 1513 * page, used in the pageout path. Caller must hold the page lock.
1512 * Return values are: 1514 * Return values are:
1513 * 1515 *
1514 * SWAP_SUCCESS - we succeeded in removing all mappings 1516 * SWAP_SUCCESS - we succeeded in removing all mappings
1515 * SWAP_AGAIN - we missed a mapping, try again later 1517 * SWAP_AGAIN - we missed a mapping, try again later
1516 * SWAP_FAIL - the page is unswappable 1518 * SWAP_FAIL - the page is unswappable
1517 * SWAP_MLOCK - page is mlocked. 1519 * SWAP_MLOCK - page is mlocked.
1518 */ 1520 */
1519 int try_to_unmap(struct page *page, enum ttu_flags flags) 1521 int try_to_unmap(struct page *page, enum ttu_flags flags)
1520 { 1522 {
1521 int ret; 1523 int ret;
1522 struct rmap_walk_control rwc = { 1524 struct rmap_walk_control rwc = {
1523 .rmap_one = try_to_unmap_one, 1525 .rmap_one = try_to_unmap_one,
1524 .arg = (void *)flags, 1526 .arg = (void *)flags,
1525 .done = page_not_mapped, 1527 .done = page_not_mapped,
1526 .file_nonlinear = try_to_unmap_nonlinear, 1528 .file_nonlinear = try_to_unmap_nonlinear,
1527 .anon_lock = page_lock_anon_vma_read, 1529 .anon_lock = page_lock_anon_vma_read,
1528 }; 1530 };
1529 1531
1530 VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); 1532 VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
1531 1533
1532 /* 1534 /*
1533 * During exec, a temporary VMA is set up and later moved. 1535 * During exec, a temporary VMA is set up and later moved.
1534 * The VMA is moved under the anon_vma lock but not the 1536 * The VMA is moved under the anon_vma lock but not the
1535 * page tables, leading to a race where migration cannot 1537 * page tables, leading to a race where migration cannot
1536 * find the migration ptes. Rather than increasing the 1538 * find the migration ptes. Rather than increasing the
1537 * locking requirements of exec(), migration skips 1539 * locking requirements of exec(), migration skips
1538 * temporary VMAs until after exec() completes. 1540 * temporary VMAs until after exec() completes.
1539 */ 1541 */
1540 if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) 1542 if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
1541 rwc.invalid_vma = invalid_migration_vma; 1543 rwc.invalid_vma = invalid_migration_vma;
1542 1544
1543 ret = rmap_walk(page, &rwc); 1545 ret = rmap_walk(page, &rwc);
1544 1546
1545 if (ret != SWAP_MLOCK && !page_mapped(page)) 1547 if (ret != SWAP_MLOCK && !page_mapped(page))
1546 ret = SWAP_SUCCESS; 1548 ret = SWAP_SUCCESS;
1547 return ret; 1549 return ret;
1548 } 1550 }
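
/*
 * Illustrative (hypothetical) caller, sketching how the return values
 * documented above map onto reclaim decisions; the helper name and
 * its boolean contract are invented for this example and are not part
 * of this file.  The page is assumed to be locked by the caller, as
 * required above.
 */
static inline bool reclaim_unmap_sketch(struct page *page)
{
	switch (try_to_unmap(page, TTU_UNMAP)) {
	case SWAP_SUCCESS:
		return true;		/* all ptes gone, page can be freed */
	case SWAP_MLOCK:
		return false;		/* mlocked: belongs on the unevictable list */
	case SWAP_AGAIN:
	case SWAP_FAIL:
	default:
		return false;		/* keep the page mapped for now */
	}
}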
1549 1551
1550 /** 1552 /**
1551 * try_to_munlock - try to munlock a page 1553 * try_to_munlock - try to munlock a page
1552 * @page: the page to be munlocked 1554 * @page: the page to be munlocked
1553 * 1555 *
1554 * Called from munlock code. Checks all of the VMAs mapping the page 1556 * Called from munlock code. Checks all of the VMAs mapping the page
1555 * to make sure nobody else has this page mlocked. The page will be 1557 * to make sure nobody else has this page mlocked. The page will be
1556 * returned with PG_mlocked cleared if no other vmas have it mlocked. 1558 * returned with PG_mlocked cleared if no other vmas have it mlocked.
1557 * 1559 *
1558 * Return values are: 1560 * Return values are:
1559 * 1561 *
1560 * SWAP_AGAIN - no vma is holding page mlocked, or, 1562 * SWAP_AGAIN - no vma is holding page mlocked, or,
1561 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem 1563 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1562 * SWAP_FAIL - page cannot be located at present 1564 * SWAP_FAIL - page cannot be located at present
1563 * SWAP_MLOCK - page is now mlocked. 1565 * SWAP_MLOCK - page is now mlocked.
1564 */ 1566 */
1565 int try_to_munlock(struct page *page) 1567 int try_to_munlock(struct page *page)
1566 { 1568 {
1567 int ret; 1569 int ret;
1568 struct rmap_walk_control rwc = { 1570 struct rmap_walk_control rwc = {
1569 .rmap_one = try_to_unmap_one, 1571 .rmap_one = try_to_unmap_one,
1570 .arg = (void *)TTU_MUNLOCK, 1572 .arg = (void *)TTU_MUNLOCK,
1571 .done = page_not_mapped, 1573 .done = page_not_mapped,
1572 /* 1574 /*
1573 * We don't bother to try to find the munlocked page in 1575 * We don't bother to try to find the munlocked page in
1574 * nonlinears. It's costly. Instead, later, page reclaim logic 1576 * nonlinears. It's costly. Instead, later, page reclaim logic
1575 * may call try_to_unmap() and recover PG_mlocked lazily. 1577 * may call try_to_unmap() and recover PG_mlocked lazily.
1576 */ 1578 */
1577 .file_nonlinear = NULL, 1579 .file_nonlinear = NULL,
1578 .anon_lock = page_lock_anon_vma_read, 1580 .anon_lock = page_lock_anon_vma_read,
1579 1581
1580 }; 1582 };
1581 1583
1582 VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); 1584 VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
1583 1585
1584 ret = rmap_walk(page, &rwc); 1586 ret = rmap_walk(page, &rwc);
1585 return ret; 1587 return ret;
1586 } 1588 }
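
/*
 * Illustrative (hypothetical) helper around the interface above; the
 * name is invented for this example.  The caller is assumed to hold
 * the page lock and to have isolated the page from the LRU, as the
 * VM_BUG_ON in try_to_munlock() insists.
 */
static inline bool page_still_mlocked_sketch(struct page *page)
{
	/* SWAP_MLOCK means some vma re-mlocked the page during the walk */
	return try_to_munlock(page) == SWAP_MLOCK;
}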
1587 1589
1588 void __put_anon_vma(struct anon_vma *anon_vma) 1590 void __put_anon_vma(struct anon_vma *anon_vma)
1589 { 1591 {
1590 struct anon_vma *root = anon_vma->root; 1592 struct anon_vma *root = anon_vma->root;
1591 1593
1592 anon_vma_free(anon_vma); 1594 anon_vma_free(anon_vma);
1593 if (root != anon_vma && atomic_dec_and_test(&root->refcount)) 1595 if (root != anon_vma && atomic_dec_and_test(&root->refcount))
1594 anon_vma_free(root); 1596 anon_vma_free(root);
1595 } 1597 }
1596 1598
1597 static struct anon_vma *rmap_walk_anon_lock(struct page *page, 1599 static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1598 struct rmap_walk_control *rwc) 1600 struct rmap_walk_control *rwc)
1599 { 1601 {
1600 struct anon_vma *anon_vma; 1602 struct anon_vma *anon_vma;
1601 1603
1602 if (rwc->anon_lock) 1604 if (rwc->anon_lock)
1603 return rwc->anon_lock(page); 1605 return rwc->anon_lock(page);
1604 1606
1605 /* 1607 /*
1606 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 1608 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1607 * because that depends on page_mapped(); but not all its usages 1609 * because that depends on page_mapped(); but not all its usages
1608 * are holding mmap_sem. Users without mmap_sem are required to 1610 * are holding mmap_sem. Users without mmap_sem are required to
1609 * take a reference count to prevent the anon_vma from disappearing. 1611 * take a reference count to prevent the anon_vma from disappearing.
1610 */ 1612 */
1611 anon_vma = page_anon_vma(page); 1613 anon_vma = page_anon_vma(page);
1612 if (!anon_vma) 1614 if (!anon_vma)
1613 return NULL; 1615 return NULL;
1614 1616
1615 anon_vma_lock_read(anon_vma); 1617 anon_vma_lock_read(anon_vma);
1616 return anon_vma; 1618 return anon_vma;
1617 } 1619 }
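
/*
 * Illustrative (hypothetical) pattern for the "reference count"
 * requirement noted above; the function name is invented for this
 * example.  A caller without mmap_sem pins the anon_vma first and
 * drops the pin with put_anon_vma(), which ends up in
 * __put_anon_vma() above once the refcount reaches zero.
 */
static inline void walk_pinned_anon_vma_sketch(struct page *page)
{
	struct anon_vma *anon_vma = page_get_anon_vma(page);	/* takes a reference */

	if (!anon_vma)
		return;
	anon_vma_lock_read(anon_vma);
	/* ... walk the interval tree, as rmap_walk_anon() does below ... */
	anon_vma_unlock_read(anon_vma);
	put_anon_vma(anon_vma);
}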
1618 1620
1619 /* 1621 /*
1620 * rmap_walk_anon - do something to an anonymous page using the object-based 1622 * rmap_walk_anon - do something to an anonymous page using the object-based
1621 * rmap method 1623 * rmap method
1622 * @page: the page to be handled 1624 * @page: the page to be handled
1623 * @rwc: control variable according to each walk type 1625 * @rwc: control variable according to each walk type
1624 * 1626 *
1625 * Find all the mappings of a page using the mapping pointer and the vma chains 1627 * Find all the mappings of a page using the mapping pointer and the vma chains
1626 * contained in the anon_vma struct it points to. 1628 * contained in the anon_vma struct it points to.
1627 * 1629 *
1628 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1630 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1629 * where the page was found will be held for write. So, we won't recheck 1631 * where the page was found will be held for write. So, we won't recheck
1630 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1632 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1631 * LOCKED. 1633 * LOCKED.
1632 */ 1634 */
1633 static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) 1635 static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1634 { 1636 {
1635 struct anon_vma *anon_vma; 1637 struct anon_vma *anon_vma;
1636 pgoff_t pgoff = page_to_pgoff(page); 1638 pgoff_t pgoff = page_to_pgoff(page);
1637 struct anon_vma_chain *avc; 1639 struct anon_vma_chain *avc;
1638 int ret = SWAP_AGAIN; 1640 int ret = SWAP_AGAIN;
1639 1641
1640 anon_vma = rmap_walk_anon_lock(page, rwc); 1642 anon_vma = rmap_walk_anon_lock(page, rwc);
1641 if (!anon_vma) 1643 if (!anon_vma)
1642 return ret; 1644 return ret;
1643 1645
1644 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1646 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1645 struct vm_area_struct *vma = avc->vma; 1647 struct vm_area_struct *vma = avc->vma;
1646 unsigned long address = vma_address(page, vma); 1648 unsigned long address = vma_address(page, vma);
1647 1649
1648 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1650 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1649 continue; 1651 continue;
1650 1652
1651 ret = rwc->rmap_one(page, vma, address, rwc->arg); 1653 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1652 if (ret != SWAP_AGAIN) 1654 if (ret != SWAP_AGAIN)
1653 break; 1655 break;
1654 if (rwc->done && rwc->done(page)) 1656 if (rwc->done && rwc->done(page))
1655 break; 1657 break;
1656 } 1658 }
1657 anon_vma_unlock_read(anon_vma); 1659 anon_vma_unlock_read(anon_vma);
1658 return ret; 1660 return ret;
1659 } 1661 }
1660 1662
1661 /* 1663 /*
1662 * rmap_walk_file - do something to a file page using the object-based rmap method 1664 * rmap_walk_file - do something to a file page using the object-based rmap method
1663 * @page: the page to be handled 1665 * @page: the page to be handled
1664 * @rwc: control variable according to each walk type 1666 * @rwc: control variable according to each walk type
1665 * 1667 *
1666 * Find all the mappings of a page using the mapping pointer and the vma chains 1668 * Find all the mappings of a page using the mapping pointer and the vma chains
1667 * contained in the address_space struct it points to. 1669 * contained in the address_space struct it points to.
1668 * 1670 *
1669 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1671 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1670 * where the page was found will be held for write. So, we won't recheck 1672 * where the page was found will be held for write. So, we won't recheck
1671 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1673 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1672 * LOCKED. 1674 * LOCKED.
1673 */ 1675 */
1674 static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) 1676 static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1675 { 1677 {
1676 struct address_space *mapping = page->mapping; 1678 struct address_space *mapping = page->mapping;
1677 pgoff_t pgoff = page_to_pgoff(page); 1679 pgoff_t pgoff = page_to_pgoff(page);
1678 struct vm_area_struct *vma; 1680 struct vm_area_struct *vma;
1679 int ret = SWAP_AGAIN; 1681 int ret = SWAP_AGAIN;
1680 1682
1681 /* 1683 /*
1682 * The page lock not only makes sure that page->mapping cannot 1684 * The page lock not only makes sure that page->mapping cannot
1683 * suddenly be NULLified by truncation, it makes sure that the 1685 * suddenly be NULLified by truncation, it makes sure that the
1684 * structure at mapping cannot be freed and reused yet, 1686 * structure at mapping cannot be freed and reused yet,
1685 * so we can safely take mapping->i_mmap_mutex. 1687 * so we can safely take mapping->i_mmap_mutex.
1686 */ 1688 */
1687 VM_BUG_ON_PAGE(!PageLocked(page), page); 1689 VM_BUG_ON_PAGE(!PageLocked(page), page);
1688 1690
1689 if (!mapping) 1691 if (!mapping)
1690 return ret; 1692 return ret;
1691 mutex_lock(&mapping->i_mmap_mutex); 1693 mutex_lock(&mapping->i_mmap_mutex);
1692 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1694 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1693 unsigned long address = vma_address(page, vma); 1695 unsigned long address = vma_address(page, vma);
1694 1696
1695 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1697 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1696 continue; 1698 continue;
1697 1699
1698 ret = rwc->rmap_one(page, vma, address, rwc->arg); 1700 ret = rwc->rmap_one(page, vma, address, rwc->arg);
1699 if (ret != SWAP_AGAIN) 1701 if (ret != SWAP_AGAIN)
1700 goto done; 1702 goto done;
1701 if (rwc->done && rwc->done(page)) 1703 if (rwc->done && rwc->done(page))
1702 goto done; 1704 goto done;
1703 } 1705 }
1704 1706
1705 if (!rwc->file_nonlinear) 1707 if (!rwc->file_nonlinear)
1706 goto done; 1708 goto done;
1707 1709
1708 if (list_empty(&mapping->i_mmap_nonlinear)) 1710 if (list_empty(&mapping->i_mmap_nonlinear))
1709 goto done; 1711 goto done;
1710 1712
1711 ret = rwc->file_nonlinear(page, mapping, rwc->arg); 1713 ret = rwc->file_nonlinear(page, mapping, rwc->arg);
1712 1714
1713 done: 1715 done:
1714 mutex_unlock(&mapping->i_mmap_mutex); 1716 mutex_unlock(&mapping->i_mmap_mutex);
1715 return ret; 1717 return ret;
1716 } 1718 }
1717 1719
1718 int rmap_walk(struct page *page, struct rmap_walk_control *rwc) 1720 int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
1719 { 1721 {
1720 if (unlikely(PageKsm(page))) 1722 if (unlikely(PageKsm(page)))
1721 return rmap_walk_ksm(page, rwc); 1723 return rmap_walk_ksm(page, rwc);
1722 else if (PageAnon(page)) 1724 else if (PageAnon(page))
1723 return rmap_walk_anon(page, rwc); 1725 return rmap_walk_anon(page, rwc);
1724 else 1726 else
1725 return rmap_walk_file(page, rwc); 1727 return rmap_walk_file(page, rwc);
1726 } 1728 }
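
/*
 * Illustrative (hypothetical) walker built on rmap_walk(); both
 * function names are invented for this example.  The page is assumed
 * to be locked by the caller (rmap_walk_file() insists on that), and
 * every mapping found is reported to the callback, which returns
 * SWAP_AGAIN so the walk continues.
 */
static int count_one_mapping_sketch(struct page *page,
		struct vm_area_struct *vma, unsigned long address, void *arg)
{
	int *nr = arg;

	(*nr)++;			/* one more vma maps this page */
	return SWAP_AGAIN;		/* keep walking */
}

static inline int count_mappings_sketch(struct page *page)
{
	int nr = 0;
	struct rmap_walk_control rwc = {
		.rmap_one = count_one_mapping_sketch,
		.arg = &nr,
	};

	rmap_walk(page, &rwc);
	return nr;
}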
1727 1729
1728 #ifdef CONFIG_HUGETLB_PAGE 1730 #ifdef CONFIG_HUGETLB_PAGE
1729 /* 1731 /*
1730 * The following three functions are for anonymous (private mapped) hugepages. 1732 * The following three functions are for anonymous (private mapped) hugepages.
1731 * Unlike common anonymous pages, anonymous hugepages have no accounting code 1733 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1732 * and no lru code, because we handle hugepages differently from common pages. 1734 * and no lru code, because we handle hugepages differently from common pages.
1733 */ 1735 */
1734 static void __hugepage_set_anon_rmap(struct page *page, 1736 static void __hugepage_set_anon_rmap(struct page *page,
1735 struct vm_area_struct *vma, unsigned long address, int exclusive) 1737 struct vm_area_struct *vma, unsigned long address, int exclusive)
1736 { 1738 {
1737 struct anon_vma *anon_vma = vma->anon_vma; 1739 struct anon_vma *anon_vma = vma->anon_vma;
1738 1740
1739 BUG_ON(!anon_vma); 1741 BUG_ON(!anon_vma);
1740 1742
1741 if (PageAnon(page)) 1743 if (PageAnon(page))
1742 return; 1744 return;
1743 if (!exclusive) 1745 if (!exclusive)
1744 anon_vma = anon_vma->root; 1746 anon_vma = anon_vma->root;
1745 1747
1746 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1748 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1747 page->mapping = (struct address_space *) anon_vma; 1749 page->mapping = (struct address_space *) anon_vma;
1748 page->index = linear_page_index(vma, address); 1750 page->index = linear_page_index(vma, address);
1749 } 1751 }
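
/*
 * The pointer arithmetic above is the usual anon rmap encoding: the
 * anon_vma pointer is stored in page->mapping with the low
 * PAGE_MAPPING_ANON bit set (anon_vma is at least pointer-aligned, so
 * bit 0 is free).  For example, an anon_vma at 0xffff880012345680
 * would be stored as 0xffff880012345681, which is what PageAnon()
 * tests for.  The address is made up for illustration.
 */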
1750 1752
1751 void hugepage_add_anon_rmap(struct page *page, 1753 void hugepage_add_anon_rmap(struct page *page,
1752 struct vm_area_struct *vma, unsigned long address) 1754 struct vm_area_struct *vma, unsigned long address)
1753 { 1755 {
1754 struct anon_vma *anon_vma = vma->anon_vma; 1756 struct anon_vma *anon_vma = vma->anon_vma;
1755 int first; 1757 int first;
1756 1758
1757 BUG_ON(!PageLocked(page)); 1759 BUG_ON(!PageLocked(page));
1758 BUG_ON(!anon_vma); 1760 BUG_ON(!anon_vma);
1759 /* address might be in next vma when migration races vma_adjust */ 1761 /* address might be in next vma when migration races vma_adjust */
1760 first = atomic_inc_and_test(&page->_mapcount); 1762 first = atomic_inc_and_test(&page->_mapcount);
1761 if (first) 1763 if (first)
1762 __hugepage_set_anon_rmap(page, vma, address, 0); 1764 __hugepage_set_anon_rmap(page, vma, address, 0);
1763 } 1765 }
1764 1766
1765 void hugepage_add_new_anon_rmap(struct page *page, 1767 void hugepage_add_new_anon_rmap(struct page *page,
1766 struct vm_area_struct *vma, unsigned long address) 1768 struct vm_area_struct *vma, unsigned long address)
1767 { 1769 {
1768 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1770 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1769 atomic_set(&page->_mapcount, 0); 1771 atomic_set(&page->_mapcount, 0);
1770 __hugepage_set_anon_rmap(page, vma, address, 1); 1772 __hugepage_set_anon_rmap(page, vma, address, 1);
1771 } 1773 }
1772 #endif /* CONFIG_HUGETLB_PAGE */ 1774 #endif /* CONFIG_HUGETLB_PAGE */
1773 1775