Commit 690eac53daff34169a4d74fc7bfbd388c4896abb

Authored by Linus Torvalds
1 parent 4850d37d3a

mm: Don't count the stack guard page towards RLIMIT_STACK

Commit fee7e49d4514 ("mm: propagate error from stack expansion even for
guard page") made sure that we return the error properly for stack
growth conditions.  It also theorized that counting the guard page
towards the stack limit might break something, but also said "Let's see
if anybody notices".

Somebody did notice.  Apparently android-x86 sets the stack limit very
close to the limit indeed, and including the guard page in the rlimit
check causes the android 'zygote' process problems.

So this adds the (fairly trivial) code to make the stack rlimit check be
against the actual real stack size, rather than the size of the vma that
includes the guard page.

Reported-and-tested-by: Chih-Wei Huang <cwhuang@android-x86.org>
Cc: Jay Foad <jay.foad@gmail.com>
Cc: stable@kernel.org  # to match back-porting of fee7e49d4514
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
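The failure mode is easiest to see with concrete numbers. Below is a small user-space toy model of the old and new checks (not kernel code); the 8 MiB limit and 4 KiB page size are illustrative assumptions, since the commit message does not give android-x86's actual values.

	#include <stdio.h>
	#include <stdbool.h>

	#define PAGE_SIZE 4096UL

	/* Old behaviour: the guard page is part of the stack vma and gets counted. */
	static bool old_check(unsigned long vma_size, unsigned long rlim_cur)
	{
		return vma_size <= rlim_cur;
	}

	/* New behaviour: subtract the guard page before comparing against the limit. */
	static bool new_check(unsigned long vma_size, unsigned long rlim_cur)
	{
		return vma_size - PAGE_SIZE <= rlim_cur;
	}

	int main(void)
	{
		unsigned long limit = 8UL << 20;         /* hypothetical RLIMIT_STACK: 8 MiB */
		unsigned long stack = 8UL << 20;         /* stack grown right up to that limit */
		unsigned long vma   = stack + PAGE_SIZE; /* vma size still includes the guard page */

		printf("old check allows growth: %d\n", old_check(vma, limit)); /* prints 0 */
		printf("new check allows growth: %d\n", new_check(vma, limit)); /* prints 1 */
		return 0;
	}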

Showing 1 changed file with 5 additions and 2 deletions
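The changed hunk itself lives in acct_stack_growth(), further down mm/mmap.c than the context reproduced below. Going by the description above, the fix presumably amounts to something like the following sketch; treat the actual_size temporary and the ACCESS_ONCE() usage as assumptions about the code of that era rather than a quote of the hunk.

	/* Stack limit test: compare the real stack size, not the vma size
	 * that still includes the guard page of a GROWSUP/GROWSDOWN vma. */
	actual_size = size;
	if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
		actual_size -= PAGE_SIZE;
	if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
		return -ENOMEM;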

1 /* 1 /*
2 * mm/mmap.c 2 * mm/mmap.c
3 * 3 *
4 * Written by obz. 4 * Written by obz.
5 * 5 *
6 * Address space accounting code <alan@lxorguk.ukuu.org.uk> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 10
11 #include <linux/kernel.h> 11 #include <linux/kernel.h>
12 #include <linux/slab.h> 12 #include <linux/slab.h>
13 #include <linux/backing-dev.h> 13 #include <linux/backing-dev.h>
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/vmacache.h> 15 #include <linux/vmacache.h>
16 #include <linux/shm.h> 16 #include <linux/shm.h>
17 #include <linux/mman.h> 17 #include <linux/mman.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/syscalls.h> 20 #include <linux/syscalls.h>
21 #include <linux/capability.h> 21 #include <linux/capability.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/file.h> 23 #include <linux/file.h>
24 #include <linux/fs.h> 24 #include <linux/fs.h>
25 #include <linux/personality.h> 25 #include <linux/personality.h>
26 #include <linux/security.h> 26 #include <linux/security.h>
27 #include <linux/hugetlb.h> 27 #include <linux/hugetlb.h>
28 #include <linux/profile.h> 28 #include <linux/profile.h>
29 #include <linux/export.h> 29 #include <linux/export.h>
30 #include <linux/mount.h> 30 #include <linux/mount.h>
31 #include <linux/mempolicy.h> 31 #include <linux/mempolicy.h>
32 #include <linux/rmap.h> 32 #include <linux/rmap.h>
33 #include <linux/mmu_notifier.h> 33 #include <linux/mmu_notifier.h>
34 #include <linux/mmdebug.h> 34 #include <linux/mmdebug.h>
35 #include <linux/perf_event.h> 35 #include <linux/perf_event.h>
36 #include <linux/audit.h> 36 #include <linux/audit.h>
37 #include <linux/khugepaged.h> 37 #include <linux/khugepaged.h>
38 #include <linux/uprobes.h> 38 #include <linux/uprobes.h>
39 #include <linux/rbtree_augmented.h> 39 #include <linux/rbtree_augmented.h>
40 #include <linux/sched/sysctl.h> 40 #include <linux/sched/sysctl.h>
41 #include <linux/notifier.h> 41 #include <linux/notifier.h>
42 #include <linux/memory.h> 42 #include <linux/memory.h>
43 #include <linux/printk.h> 43 #include <linux/printk.h>
44 44
45 #include <asm/uaccess.h> 45 #include <asm/uaccess.h>
46 #include <asm/cacheflush.h> 46 #include <asm/cacheflush.h>
47 #include <asm/tlb.h> 47 #include <asm/tlb.h>
48 #include <asm/mmu_context.h> 48 #include <asm/mmu_context.h>
49 49
50 #include "internal.h" 50 #include "internal.h"
51 51
52 #ifndef arch_mmap_check 52 #ifndef arch_mmap_check
53 #define arch_mmap_check(addr, len, flags) (0) 53 #define arch_mmap_check(addr, len, flags) (0)
54 #endif 54 #endif
55 55
56 #ifndef arch_rebalance_pgtables 56 #ifndef arch_rebalance_pgtables
57 #define arch_rebalance_pgtables(addr, len) (addr) 57 #define arch_rebalance_pgtables(addr, len) (addr)
58 #endif 58 #endif
59 59
60 static void unmap_region(struct mm_struct *mm, 60 static void unmap_region(struct mm_struct *mm,
61 struct vm_area_struct *vma, struct vm_area_struct *prev, 61 struct vm_area_struct *vma, struct vm_area_struct *prev,
62 unsigned long start, unsigned long end); 62 unsigned long start, unsigned long end);
63 63
64 /* description of effects of mapping type and prot in current implementation. 64 /* description of effects of mapping type and prot in current implementation.
65 * this is due to the limited x86 page protection hardware. The expected 65 * this is due to the limited x86 page protection hardware. The expected
66 * behavior is in parens: 66 * behavior is in parens:
67 * 67 *
68 * map_type prot 68 * map_type prot
69 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC 69 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
71 * w: (no) no w: (no) no w: (yes) yes w: (no) no 71 * w: (no) no w: (no) no w: (yes) yes w: (no) no
72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
73 * 73 *
74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
75 * w: (no) no w: (no) no w: (copy) copy w: (no) no 75 * w: (no) no w: (no) no w: (copy) copy w: (no) no
76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
77 * 77 *
78 */ 78 */
79 pgprot_t protection_map[16] = { 79 pgprot_t protection_map[16] = {
80 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, 80 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
81 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 81 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
82 }; 82 };
83 83
84 pgprot_t vm_get_page_prot(unsigned long vm_flags) 84 pgprot_t vm_get_page_prot(unsigned long vm_flags)
85 { 85 {
86 return __pgprot(pgprot_val(protection_map[vm_flags & 86 return __pgprot(pgprot_val(protection_map[vm_flags &
87 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | 87 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
88 pgprot_val(arch_vm_get_page_prot(vm_flags))); 88 pgprot_val(arch_vm_get_page_prot(vm_flags)));
89 } 89 }
90 EXPORT_SYMBOL(vm_get_page_prot); 90 EXPORT_SYMBOL(vm_get_page_prot);
91 91
92 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) 92 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
93 { 93 {
94 return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); 94 return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
95 } 95 }
96 96
97 /* Update vma->vm_page_prot to reflect vma->vm_flags. */ 97 /* Update vma->vm_page_prot to reflect vma->vm_flags. */
98 void vma_set_page_prot(struct vm_area_struct *vma) 98 void vma_set_page_prot(struct vm_area_struct *vma)
99 { 99 {
100 unsigned long vm_flags = vma->vm_flags; 100 unsigned long vm_flags = vma->vm_flags;
101 101
102 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); 102 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
103 if (vma_wants_writenotify(vma)) { 103 if (vma_wants_writenotify(vma)) {
104 vm_flags &= ~VM_SHARED; 104 vm_flags &= ~VM_SHARED;
105 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, 105 vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
106 vm_flags); 106 vm_flags);
107 } 107 }
108 } 108 }
109 109
110 110
111 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 111 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
112 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 112 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
113 unsigned long sysctl_overcommit_kbytes __read_mostly; 113 unsigned long sysctl_overcommit_kbytes __read_mostly;
114 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 114 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
115 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 115 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
116 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 116 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
117 /* 117 /*
118 * Make sure vm_committed_as in one cacheline and not cacheline shared with 118 * Make sure vm_committed_as in one cacheline and not cacheline shared with
119 * other variables. It can be updated by several CPUs frequently. 119 * other variables. It can be updated by several CPUs frequently.
120 */ 120 */
121 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 121 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
122 122
123 /* 123 /*
124 * The global memory commitment made in the system can be a metric 124 * The global memory commitment made in the system can be a metric
125 * that can be used to drive ballooning decisions when Linux is hosted 125 * that can be used to drive ballooning decisions when Linux is hosted
126 * as a guest. On Hyper-V, the host implements a policy engine for dynamically 126 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
127 * balancing memory across competing virtual machines that are hosted. 127 * balancing memory across competing virtual machines that are hosted.
128 * Several metrics drive this policy engine including the guest reported 128 * Several metrics drive this policy engine including the guest reported
129 * memory commitment. 129 * memory commitment.
130 */ 130 */
131 unsigned long vm_memory_committed(void) 131 unsigned long vm_memory_committed(void)
132 { 132 {
133 return percpu_counter_read_positive(&vm_committed_as); 133 return percpu_counter_read_positive(&vm_committed_as);
134 } 134 }
135 EXPORT_SYMBOL_GPL(vm_memory_committed); 135 EXPORT_SYMBOL_GPL(vm_memory_committed);
136 136
137 /* 137 /*
138 * Check that a process has enough memory to allocate a new virtual 138 * Check that a process has enough memory to allocate a new virtual
139 * mapping. 0 means there is enough memory for the allocation to 139 * mapping. 0 means there is enough memory for the allocation to
140 * succeed and -ENOMEM implies there is not. 140 * succeed and -ENOMEM implies there is not.
141 * 141 *
142 * We currently support three overcommit policies, which are set via the 142 * We currently support three overcommit policies, which are set via the
143 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 143 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
144 * 144 *
145 * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 145 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
146 * Additional code 2002 Jul 20 by Robert Love. 146 * Additional code 2002 Jul 20 by Robert Love.
147 * 147 *
148 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 148 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
149 * 149 *
150 * Note this is a helper function intended to be used by LSMs which 150 * Note this is a helper function intended to be used by LSMs which
151 * wish to use this logic. 151 * wish to use this logic.
152 */ 152 */
153 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 153 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
154 { 154 {
155 unsigned long free, allowed, reserve; 155 unsigned long free, allowed, reserve;
156 156
157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < 157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
158 -(s64)vm_committed_as_batch * num_online_cpus(), 158 -(s64)vm_committed_as_batch * num_online_cpus(),
159 "memory commitment underflow"); 159 "memory commitment underflow");
160 160
161 vm_acct_memory(pages); 161 vm_acct_memory(pages);
162 162
163 /* 163 /*
164 * Sometimes we want to use more memory than we have 164 * Sometimes we want to use more memory than we have
165 */ 165 */
166 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 166 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
167 return 0; 167 return 0;
168 168
169 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 169 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
170 free = global_page_state(NR_FREE_PAGES); 170 free = global_page_state(NR_FREE_PAGES);
171 free += global_page_state(NR_FILE_PAGES); 171 free += global_page_state(NR_FILE_PAGES);
172 172
173 /* 173 /*
174 * shmem pages shouldn't be counted as free in this 174 * shmem pages shouldn't be counted as free in this
175 * case, they can't be purged, only swapped out, and 175 * case, they can't be purged, only swapped out, and
176 * that won't affect the overall amount of available 176 * that won't affect the overall amount of available
177 * memory in the system. 177 * memory in the system.
178 */ 178 */
179 free -= global_page_state(NR_SHMEM); 179 free -= global_page_state(NR_SHMEM);
180 180
181 free += get_nr_swap_pages(); 181 free += get_nr_swap_pages();
182 182
183 /* 183 /*
184 * Any slabs which are created with the 184 * Any slabs which are created with the
185 * SLAB_RECLAIM_ACCOUNT flag claim to have contents 185 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
186 * which are reclaimable, under pressure. The dentry 186 * which are reclaimable, under pressure. The dentry
187 * cache and most inode caches should fall into this 187 * cache and most inode caches should fall into this
188 */ 188 */
189 free += global_page_state(NR_SLAB_RECLAIMABLE); 189 free += global_page_state(NR_SLAB_RECLAIMABLE);
190 190
191 /* 191 /*
192 * Leave reserved pages. The pages are not for anonymous pages. 192 * Leave reserved pages. The pages are not for anonymous pages.
193 */ 193 */
194 if (free <= totalreserve_pages) 194 if (free <= totalreserve_pages)
195 goto error; 195 goto error;
196 else 196 else
197 free -= totalreserve_pages; 197 free -= totalreserve_pages;
198 198
199 /* 199 /*
200 * Reserve some for root 200 * Reserve some for root
201 */ 201 */
202 if (!cap_sys_admin) 202 if (!cap_sys_admin)
203 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 203 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
204 204
205 if (free > pages) 205 if (free > pages)
206 return 0; 206 return 0;
207 207
208 goto error; 208 goto error;
209 } 209 }
210 210
211 allowed = vm_commit_limit(); 211 allowed = vm_commit_limit();
212 /* 212 /*
213 * Reserve some for root 213 * Reserve some for root
214 */ 214 */
215 if (!cap_sys_admin) 215 if (!cap_sys_admin)
216 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 216 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
217 217
218 /* 218 /*
219 * Don't let a single process grow so big a user can't recover 219 * Don't let a single process grow so big a user can't recover
220 */ 220 */
221 if (mm) { 221 if (mm) {
222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
223 allowed -= min(mm->total_vm / 32, reserve); 223 allowed -= min(mm->total_vm / 32, reserve);
224 } 224 }
225 225
226 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 226 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
227 return 0; 227 return 0;
228 error: 228 error:
229 vm_unacct_memory(pages); 229 vm_unacct_memory(pages);
230 230
231 return -ENOMEM; 231 return -ENOMEM;
232 } 232 }
233 233
234 /* 234 /*
235 * Requires inode->i_mapping->i_mmap_rwsem 235 * Requires inode->i_mapping->i_mmap_rwsem
236 */ 236 */
237 static void __remove_shared_vm_struct(struct vm_area_struct *vma, 237 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
238 struct file *file, struct address_space *mapping) 238 struct file *file, struct address_space *mapping)
239 { 239 {
240 if (vma->vm_flags & VM_DENYWRITE) 240 if (vma->vm_flags & VM_DENYWRITE)
241 atomic_inc(&file_inode(file)->i_writecount); 241 atomic_inc(&file_inode(file)->i_writecount);
242 if (vma->vm_flags & VM_SHARED) 242 if (vma->vm_flags & VM_SHARED)
243 mapping_unmap_writable(mapping); 243 mapping_unmap_writable(mapping);
244 244
245 flush_dcache_mmap_lock(mapping); 245 flush_dcache_mmap_lock(mapping);
246 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 246 if (unlikely(vma->vm_flags & VM_NONLINEAR))
247 list_del_init(&vma->shared.nonlinear); 247 list_del_init(&vma->shared.nonlinear);
248 else 248 else
249 vma_interval_tree_remove(vma, &mapping->i_mmap); 249 vma_interval_tree_remove(vma, &mapping->i_mmap);
250 flush_dcache_mmap_unlock(mapping); 250 flush_dcache_mmap_unlock(mapping);
251 } 251 }
252 252
253 /* 253 /*
254 * Unlink a file-based vm structure from its interval tree, to hide 254 * Unlink a file-based vm structure from its interval tree, to hide
255 * vma from rmap and vmtruncate before freeing its page tables. 255 * vma from rmap and vmtruncate before freeing its page tables.
256 */ 256 */
257 void unlink_file_vma(struct vm_area_struct *vma) 257 void unlink_file_vma(struct vm_area_struct *vma)
258 { 258 {
259 struct file *file = vma->vm_file; 259 struct file *file = vma->vm_file;
260 260
261 if (file) { 261 if (file) {
262 struct address_space *mapping = file->f_mapping; 262 struct address_space *mapping = file->f_mapping;
263 i_mmap_lock_write(mapping); 263 i_mmap_lock_write(mapping);
264 __remove_shared_vm_struct(vma, file, mapping); 264 __remove_shared_vm_struct(vma, file, mapping);
265 i_mmap_unlock_write(mapping); 265 i_mmap_unlock_write(mapping);
266 } 266 }
267 } 267 }
268 268
269 /* 269 /*
270 * Close a vm structure and free it, returning the next. 270 * Close a vm structure and free it, returning the next.
271 */ 271 */
272 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 272 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
273 { 273 {
274 struct vm_area_struct *next = vma->vm_next; 274 struct vm_area_struct *next = vma->vm_next;
275 275
276 might_sleep(); 276 might_sleep();
277 if (vma->vm_ops && vma->vm_ops->close) 277 if (vma->vm_ops && vma->vm_ops->close)
278 vma->vm_ops->close(vma); 278 vma->vm_ops->close(vma);
279 if (vma->vm_file) 279 if (vma->vm_file)
280 fput(vma->vm_file); 280 fput(vma->vm_file);
281 mpol_put(vma_policy(vma)); 281 mpol_put(vma_policy(vma));
282 kmem_cache_free(vm_area_cachep, vma); 282 kmem_cache_free(vm_area_cachep, vma);
283 return next; 283 return next;
284 } 284 }
285 285
286 static unsigned long do_brk(unsigned long addr, unsigned long len); 286 static unsigned long do_brk(unsigned long addr, unsigned long len);
287 287
288 SYSCALL_DEFINE1(brk, unsigned long, brk) 288 SYSCALL_DEFINE1(brk, unsigned long, brk)
289 { 289 {
290 unsigned long retval; 290 unsigned long retval;
291 unsigned long newbrk, oldbrk; 291 unsigned long newbrk, oldbrk;
292 struct mm_struct *mm = current->mm; 292 struct mm_struct *mm = current->mm;
293 unsigned long min_brk; 293 unsigned long min_brk;
294 bool populate; 294 bool populate;
295 295
296 down_write(&mm->mmap_sem); 296 down_write(&mm->mmap_sem);
297 297
298 #ifdef CONFIG_COMPAT_BRK 298 #ifdef CONFIG_COMPAT_BRK
299 /* 299 /*
300 * CONFIG_COMPAT_BRK can still be overridden by setting 300 * CONFIG_COMPAT_BRK can still be overridden by setting
301 * randomize_va_space to 2, which will still cause mm->start_brk 301 * randomize_va_space to 2, which will still cause mm->start_brk
302 * to be arbitrarily shifted 302 * to be arbitrarily shifted
303 */ 303 */
304 if (current->brk_randomized) 304 if (current->brk_randomized)
305 min_brk = mm->start_brk; 305 min_brk = mm->start_brk;
306 else 306 else
307 min_brk = mm->end_data; 307 min_brk = mm->end_data;
308 #else 308 #else
309 min_brk = mm->start_brk; 309 min_brk = mm->start_brk;
310 #endif 310 #endif
311 if (brk < min_brk) 311 if (brk < min_brk)
312 goto out; 312 goto out;
313 313
314 /* 314 /*
315 * Check against rlimit here. If this check is done later after the test 315 * Check against rlimit here. If this check is done later after the test
316 * of oldbrk with newbrk then it can escape the test and let the data 316 * of oldbrk with newbrk then it can escape the test and let the data
317 * segment grow beyond its set limit the in case where the limit is 317 * segment grow beyond its set limit the in case where the limit is
318 * not page aligned -Ram Gupta 318 * not page aligned -Ram Gupta
319 */ 319 */
320 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, 320 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
321 mm->end_data, mm->start_data)) 321 mm->end_data, mm->start_data))
322 goto out; 322 goto out;
323 323
324 newbrk = PAGE_ALIGN(brk); 324 newbrk = PAGE_ALIGN(brk);
325 oldbrk = PAGE_ALIGN(mm->brk); 325 oldbrk = PAGE_ALIGN(mm->brk);
326 if (oldbrk == newbrk) 326 if (oldbrk == newbrk)
327 goto set_brk; 327 goto set_brk;
328 328
329 /* Always allow shrinking brk. */ 329 /* Always allow shrinking brk. */
330 if (brk <= mm->brk) { 330 if (brk <= mm->brk) {
331 if (!do_munmap(mm, newbrk, oldbrk-newbrk)) 331 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
332 goto set_brk; 332 goto set_brk;
333 goto out; 333 goto out;
334 } 334 }
335 335
336 /* Check against existing mmap mappings. */ 336 /* Check against existing mmap mappings. */
337 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) 337 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
338 goto out; 338 goto out;
339 339
340 /* Ok, looks good - let it rip. */ 340 /* Ok, looks good - let it rip. */
341 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 341 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
342 goto out; 342 goto out;
343 343
344 set_brk: 344 set_brk:
345 mm->brk = brk; 345 mm->brk = brk;
346 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; 346 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
347 up_write(&mm->mmap_sem); 347 up_write(&mm->mmap_sem);
348 if (populate) 348 if (populate)
349 mm_populate(oldbrk, newbrk - oldbrk); 349 mm_populate(oldbrk, newbrk - oldbrk);
350 return brk; 350 return brk;
351 351
352 out: 352 out:
353 retval = mm->brk; 353 retval = mm->brk;
354 up_write(&mm->mmap_sem); 354 up_write(&mm->mmap_sem);
355 return retval; 355 return retval;
356 } 356 }
357 357
358 static long vma_compute_subtree_gap(struct vm_area_struct *vma) 358 static long vma_compute_subtree_gap(struct vm_area_struct *vma)
359 { 359 {
360 unsigned long max, subtree_gap; 360 unsigned long max, subtree_gap;
361 max = vma->vm_start; 361 max = vma->vm_start;
362 if (vma->vm_prev) 362 if (vma->vm_prev)
363 max -= vma->vm_prev->vm_end; 363 max -= vma->vm_prev->vm_end;
364 if (vma->vm_rb.rb_left) { 364 if (vma->vm_rb.rb_left) {
365 subtree_gap = rb_entry(vma->vm_rb.rb_left, 365 subtree_gap = rb_entry(vma->vm_rb.rb_left,
366 struct vm_area_struct, vm_rb)->rb_subtree_gap; 366 struct vm_area_struct, vm_rb)->rb_subtree_gap;
367 if (subtree_gap > max) 367 if (subtree_gap > max)
368 max = subtree_gap; 368 max = subtree_gap;
369 } 369 }
370 if (vma->vm_rb.rb_right) { 370 if (vma->vm_rb.rb_right) {
371 subtree_gap = rb_entry(vma->vm_rb.rb_right, 371 subtree_gap = rb_entry(vma->vm_rb.rb_right,
372 struct vm_area_struct, vm_rb)->rb_subtree_gap; 372 struct vm_area_struct, vm_rb)->rb_subtree_gap;
373 if (subtree_gap > max) 373 if (subtree_gap > max)
374 max = subtree_gap; 374 max = subtree_gap;
375 } 375 }
376 return max; 376 return max;
377 } 377 }
378 378
379 #ifdef CONFIG_DEBUG_VM_RB 379 #ifdef CONFIG_DEBUG_VM_RB
380 static int browse_rb(struct rb_root *root) 380 static int browse_rb(struct rb_root *root)
381 { 381 {
382 int i = 0, j, bug = 0; 382 int i = 0, j, bug = 0;
383 struct rb_node *nd, *pn = NULL; 383 struct rb_node *nd, *pn = NULL;
384 unsigned long prev = 0, pend = 0; 384 unsigned long prev = 0, pend = 0;
385 385
386 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 386 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
387 struct vm_area_struct *vma; 387 struct vm_area_struct *vma;
388 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 388 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
389 if (vma->vm_start < prev) { 389 if (vma->vm_start < prev) {
390 pr_emerg("vm_start %lx < prev %lx\n", 390 pr_emerg("vm_start %lx < prev %lx\n",
391 vma->vm_start, prev); 391 vma->vm_start, prev);
392 bug = 1; 392 bug = 1;
393 } 393 }
394 if (vma->vm_start < pend) { 394 if (vma->vm_start < pend) {
395 pr_emerg("vm_start %lx < pend %lx\n", 395 pr_emerg("vm_start %lx < pend %lx\n",
396 vma->vm_start, pend); 396 vma->vm_start, pend);
397 bug = 1; 397 bug = 1;
398 } 398 }
399 if (vma->vm_start > vma->vm_end) { 399 if (vma->vm_start > vma->vm_end) {
400 pr_emerg("vm_start %lx > vm_end %lx\n", 400 pr_emerg("vm_start %lx > vm_end %lx\n",
401 vma->vm_start, vma->vm_end); 401 vma->vm_start, vma->vm_end);
402 bug = 1; 402 bug = 1;
403 } 403 }
404 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 404 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
405 pr_emerg("free gap %lx, correct %lx\n", 405 pr_emerg("free gap %lx, correct %lx\n",
406 vma->rb_subtree_gap, 406 vma->rb_subtree_gap,
407 vma_compute_subtree_gap(vma)); 407 vma_compute_subtree_gap(vma));
408 bug = 1; 408 bug = 1;
409 } 409 }
410 i++; 410 i++;
411 pn = nd; 411 pn = nd;
412 prev = vma->vm_start; 412 prev = vma->vm_start;
413 pend = vma->vm_end; 413 pend = vma->vm_end;
414 } 414 }
415 j = 0; 415 j = 0;
416 for (nd = pn; nd; nd = rb_prev(nd)) 416 for (nd = pn; nd; nd = rb_prev(nd))
417 j++; 417 j++;
418 if (i != j) { 418 if (i != j) {
419 pr_emerg("backwards %d, forwards %d\n", j, i); 419 pr_emerg("backwards %d, forwards %d\n", j, i);
420 bug = 1; 420 bug = 1;
421 } 421 }
422 return bug ? -1 : i; 422 return bug ? -1 : i;
423 } 423 }
424 424
425 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) 425 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
426 { 426 {
427 struct rb_node *nd; 427 struct rb_node *nd;
428 428
429 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 429 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
430 struct vm_area_struct *vma; 430 struct vm_area_struct *vma;
431 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 431 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
432 VM_BUG_ON_VMA(vma != ignore && 432 VM_BUG_ON_VMA(vma != ignore &&
433 vma->rb_subtree_gap != vma_compute_subtree_gap(vma), 433 vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
434 vma); 434 vma);
435 } 435 }
436 } 436 }
437 437
438 static void validate_mm(struct mm_struct *mm) 438 static void validate_mm(struct mm_struct *mm)
439 { 439 {
440 int bug = 0; 440 int bug = 0;
441 int i = 0; 441 int i = 0;
442 unsigned long highest_address = 0; 442 unsigned long highest_address = 0;
443 struct vm_area_struct *vma = mm->mmap; 443 struct vm_area_struct *vma = mm->mmap;
444 444
445 while (vma) { 445 while (vma) {
446 struct anon_vma_chain *avc; 446 struct anon_vma_chain *avc;
447 447
448 vma_lock_anon_vma(vma); 448 vma_lock_anon_vma(vma);
449 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 449 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
450 anon_vma_interval_tree_verify(avc); 450 anon_vma_interval_tree_verify(avc);
451 vma_unlock_anon_vma(vma); 451 vma_unlock_anon_vma(vma);
452 highest_address = vma->vm_end; 452 highest_address = vma->vm_end;
453 vma = vma->vm_next; 453 vma = vma->vm_next;
454 i++; 454 i++;
455 } 455 }
456 if (i != mm->map_count) { 456 if (i != mm->map_count) {
457 pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); 457 pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
458 bug = 1; 458 bug = 1;
459 } 459 }
460 if (highest_address != mm->highest_vm_end) { 460 if (highest_address != mm->highest_vm_end) {
461 pr_emerg("mm->highest_vm_end %lx, found %lx\n", 461 pr_emerg("mm->highest_vm_end %lx, found %lx\n",
462 mm->highest_vm_end, highest_address); 462 mm->highest_vm_end, highest_address);
463 bug = 1; 463 bug = 1;
464 } 464 }
465 i = browse_rb(&mm->mm_rb); 465 i = browse_rb(&mm->mm_rb);
466 if (i != mm->map_count) { 466 if (i != mm->map_count) {
467 if (i != -1) 467 if (i != -1)
468 pr_emerg("map_count %d rb %d\n", mm->map_count, i); 468 pr_emerg("map_count %d rb %d\n", mm->map_count, i);
469 bug = 1; 469 bug = 1;
470 } 470 }
471 VM_BUG_ON_MM(bug, mm); 471 VM_BUG_ON_MM(bug, mm);
472 } 472 }
473 #else 473 #else
474 #define validate_mm_rb(root, ignore) do { } while (0) 474 #define validate_mm_rb(root, ignore) do { } while (0)
475 #define validate_mm(mm) do { } while (0) 475 #define validate_mm(mm) do { } while (0)
476 #endif 476 #endif
477 477
478 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, 478 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
479 unsigned long, rb_subtree_gap, vma_compute_subtree_gap) 479 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
480 480
481 /* 481 /*
482 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or 482 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
483 * vma->vm_prev->vm_end values changed, without modifying the vma's position 483 * vma->vm_prev->vm_end values changed, without modifying the vma's position
484 * in the rbtree. 484 * in the rbtree.
485 */ 485 */
486 static void vma_gap_update(struct vm_area_struct *vma) 486 static void vma_gap_update(struct vm_area_struct *vma)
487 { 487 {
488 /* 488 /*
489 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback 489 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
490 * function that does exacltly what we want. 490 * function that does exacltly what we want.
491 */ 491 */
492 vma_gap_callbacks_propagate(&vma->vm_rb, NULL); 492 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
493 } 493 }
494 494
495 static inline void vma_rb_insert(struct vm_area_struct *vma, 495 static inline void vma_rb_insert(struct vm_area_struct *vma,
496 struct rb_root *root) 496 struct rb_root *root)
497 { 497 {
498 /* All rb_subtree_gap values must be consistent prior to insertion */ 498 /* All rb_subtree_gap values must be consistent prior to insertion */
499 validate_mm_rb(root, NULL); 499 validate_mm_rb(root, NULL);
500 500
501 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 501 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
502 } 502 }
503 503
504 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) 504 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
505 { 505 {
506 /* 506 /*
507 * All rb_subtree_gap values must be consistent prior to erase, 507 * All rb_subtree_gap values must be consistent prior to erase,
508 * with the possible exception of the vma being erased. 508 * with the possible exception of the vma being erased.
509 */ 509 */
510 validate_mm_rb(root, vma); 510 validate_mm_rb(root, vma);
511 511
512 /* 512 /*
513 * Note rb_erase_augmented is a fairly large inline function, 513 * Note rb_erase_augmented is a fairly large inline function,
514 * so make sure we instantiate it only once with our desired 514 * so make sure we instantiate it only once with our desired
515 * augmented rbtree callbacks. 515 * augmented rbtree callbacks.
516 */ 516 */
517 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 517 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
518 } 518 }
519 519
520 /* 520 /*
521 * vma has some anon_vma assigned, and is already inserted on that 521 * vma has some anon_vma assigned, and is already inserted on that
522 * anon_vma's interval trees. 522 * anon_vma's interval trees.
523 * 523 *
524 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 524 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
525 * vma must be removed from the anon_vma's interval trees using 525 * vma must be removed from the anon_vma's interval trees using
526 * anon_vma_interval_tree_pre_update_vma(). 526 * anon_vma_interval_tree_pre_update_vma().
527 * 527 *
528 * After the update, the vma will be reinserted using 528 * After the update, the vma will be reinserted using
529 * anon_vma_interval_tree_post_update_vma(). 529 * anon_vma_interval_tree_post_update_vma().
530 * 530 *
531 * The entire update must be protected by exclusive mmap_sem and by 531 * The entire update must be protected by exclusive mmap_sem and by
532 * the root anon_vma's mutex. 532 * the root anon_vma's mutex.
533 */ 533 */
534 static inline void 534 static inline void
535 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 535 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
536 { 536 {
537 struct anon_vma_chain *avc; 537 struct anon_vma_chain *avc;
538 538
539 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 539 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
540 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 540 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
541 } 541 }
542 542
543 static inline void 543 static inline void
544 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 544 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
545 { 545 {
546 struct anon_vma_chain *avc; 546 struct anon_vma_chain *avc;
547 547
548 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 548 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
549 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 549 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
550 } 550 }
551 551
552 static int find_vma_links(struct mm_struct *mm, unsigned long addr, 552 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
553 unsigned long end, struct vm_area_struct **pprev, 553 unsigned long end, struct vm_area_struct **pprev,
554 struct rb_node ***rb_link, struct rb_node **rb_parent) 554 struct rb_node ***rb_link, struct rb_node **rb_parent)
555 { 555 {
556 struct rb_node **__rb_link, *__rb_parent, *rb_prev; 556 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
557 557
558 __rb_link = &mm->mm_rb.rb_node; 558 __rb_link = &mm->mm_rb.rb_node;
559 rb_prev = __rb_parent = NULL; 559 rb_prev = __rb_parent = NULL;
560 560
561 while (*__rb_link) { 561 while (*__rb_link) {
562 struct vm_area_struct *vma_tmp; 562 struct vm_area_struct *vma_tmp;
563 563
564 __rb_parent = *__rb_link; 564 __rb_parent = *__rb_link;
565 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 565 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
566 566
567 if (vma_tmp->vm_end > addr) { 567 if (vma_tmp->vm_end > addr) {
568 /* Fail if an existing vma overlaps the area */ 568 /* Fail if an existing vma overlaps the area */
569 if (vma_tmp->vm_start < end) 569 if (vma_tmp->vm_start < end)
570 return -ENOMEM; 570 return -ENOMEM;
571 __rb_link = &__rb_parent->rb_left; 571 __rb_link = &__rb_parent->rb_left;
572 } else { 572 } else {
573 rb_prev = __rb_parent; 573 rb_prev = __rb_parent;
574 __rb_link = &__rb_parent->rb_right; 574 __rb_link = &__rb_parent->rb_right;
575 } 575 }
576 } 576 }
577 577
578 *pprev = NULL; 578 *pprev = NULL;
579 if (rb_prev) 579 if (rb_prev)
580 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 580 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
581 *rb_link = __rb_link; 581 *rb_link = __rb_link;
582 *rb_parent = __rb_parent; 582 *rb_parent = __rb_parent;
583 return 0; 583 return 0;
584 } 584 }
585 585
586 static unsigned long count_vma_pages_range(struct mm_struct *mm, 586 static unsigned long count_vma_pages_range(struct mm_struct *mm,
587 unsigned long addr, unsigned long end) 587 unsigned long addr, unsigned long end)
588 { 588 {
589 unsigned long nr_pages = 0; 589 unsigned long nr_pages = 0;
590 struct vm_area_struct *vma; 590 struct vm_area_struct *vma;
591 591
592 /* Find first overlaping mapping */ 592 /* Find first overlaping mapping */
593 vma = find_vma_intersection(mm, addr, end); 593 vma = find_vma_intersection(mm, addr, end);
594 if (!vma) 594 if (!vma)
595 return 0; 595 return 0;
596 596
597 nr_pages = (min(end, vma->vm_end) - 597 nr_pages = (min(end, vma->vm_end) -
598 max(addr, vma->vm_start)) >> PAGE_SHIFT; 598 max(addr, vma->vm_start)) >> PAGE_SHIFT;
599 599
600 /* Iterate over the rest of the overlaps */ 600 /* Iterate over the rest of the overlaps */
601 for (vma = vma->vm_next; vma; vma = vma->vm_next) { 601 for (vma = vma->vm_next; vma; vma = vma->vm_next) {
602 unsigned long overlap_len; 602 unsigned long overlap_len;
603 603
604 if (vma->vm_start > end) 604 if (vma->vm_start > end)
605 break; 605 break;
606 606
607 overlap_len = min(end, vma->vm_end) - vma->vm_start; 607 overlap_len = min(end, vma->vm_end) - vma->vm_start;
608 nr_pages += overlap_len >> PAGE_SHIFT; 608 nr_pages += overlap_len >> PAGE_SHIFT;
609 } 609 }
610 610
611 return nr_pages; 611 return nr_pages;
612 } 612 }
613 613
614 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 614 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
615 struct rb_node **rb_link, struct rb_node *rb_parent) 615 struct rb_node **rb_link, struct rb_node *rb_parent)
616 { 616 {
617 /* Update tracking information for the gap following the new vma. */ 617 /* Update tracking information for the gap following the new vma. */
618 if (vma->vm_next) 618 if (vma->vm_next)
619 vma_gap_update(vma->vm_next); 619 vma_gap_update(vma->vm_next);
620 else 620 else
621 mm->highest_vm_end = vma->vm_end; 621 mm->highest_vm_end = vma->vm_end;
622 622
623 /* 623 /*
624 * vma->vm_prev wasn't known when we followed the rbtree to find the 624 * vma->vm_prev wasn't known when we followed the rbtree to find the
625 * correct insertion point for that vma. As a result, we could not 625 * correct insertion point for that vma. As a result, we could not
626 * update the vma vm_rb parents rb_subtree_gap values on the way down. 626 * update the vma vm_rb parents rb_subtree_gap values on the way down.
627 * So, we first insert the vma with a zero rb_subtree_gap value 627 * So, we first insert the vma with a zero rb_subtree_gap value
628 * (to be consistent with what we did on the way down), and then 628 * (to be consistent with what we did on the way down), and then
629 * immediately update the gap to the correct value. Finally we 629 * immediately update the gap to the correct value. Finally we
630 * rebalance the rbtree after all augmented values have been set. 630 * rebalance the rbtree after all augmented values have been set.
631 */ 631 */
632 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 632 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
633 vma->rb_subtree_gap = 0; 633 vma->rb_subtree_gap = 0;
634 vma_gap_update(vma); 634 vma_gap_update(vma);
635 vma_rb_insert(vma, &mm->mm_rb); 635 vma_rb_insert(vma, &mm->mm_rb);
636 } 636 }
637 637
638 static void __vma_link_file(struct vm_area_struct *vma) 638 static void __vma_link_file(struct vm_area_struct *vma)
639 { 639 {
640 struct file *file; 640 struct file *file;
641 641
642 file = vma->vm_file; 642 file = vma->vm_file;
643 if (file) { 643 if (file) {
644 struct address_space *mapping = file->f_mapping; 644 struct address_space *mapping = file->f_mapping;
645 645
646 if (vma->vm_flags & VM_DENYWRITE) 646 if (vma->vm_flags & VM_DENYWRITE)
647 atomic_dec(&file_inode(file)->i_writecount); 647 atomic_dec(&file_inode(file)->i_writecount);
648 if (vma->vm_flags & VM_SHARED) 648 if (vma->vm_flags & VM_SHARED)
649 atomic_inc(&mapping->i_mmap_writable); 649 atomic_inc(&mapping->i_mmap_writable);
650 650
651 flush_dcache_mmap_lock(mapping); 651 flush_dcache_mmap_lock(mapping);
652 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 652 if (unlikely(vma->vm_flags & VM_NONLINEAR))
653 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 653 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
654 else 654 else
655 vma_interval_tree_insert(vma, &mapping->i_mmap); 655 vma_interval_tree_insert(vma, &mapping->i_mmap);
656 flush_dcache_mmap_unlock(mapping); 656 flush_dcache_mmap_unlock(mapping);
657 } 657 }
658 } 658 }
659 659
660 static void 660 static void
661 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 661 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
662 struct vm_area_struct *prev, struct rb_node **rb_link, 662 struct vm_area_struct *prev, struct rb_node **rb_link,
663 struct rb_node *rb_parent) 663 struct rb_node *rb_parent)
664 { 664 {
665 __vma_link_list(mm, vma, prev, rb_parent); 665 __vma_link_list(mm, vma, prev, rb_parent);
666 __vma_link_rb(mm, vma, rb_link, rb_parent); 666 __vma_link_rb(mm, vma, rb_link, rb_parent);
667 } 667 }
668 668
669 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 669 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
670 struct vm_area_struct *prev, struct rb_node **rb_link, 670 struct vm_area_struct *prev, struct rb_node **rb_link,
671 struct rb_node *rb_parent) 671 struct rb_node *rb_parent)
672 { 672 {
673 struct address_space *mapping = NULL; 673 struct address_space *mapping = NULL;
674 674
675 if (vma->vm_file) { 675 if (vma->vm_file) {
676 mapping = vma->vm_file->f_mapping; 676 mapping = vma->vm_file->f_mapping;
677 i_mmap_lock_write(mapping); 677 i_mmap_lock_write(mapping);
678 } 678 }
679 679
680 __vma_link(mm, vma, prev, rb_link, rb_parent); 680 __vma_link(mm, vma, prev, rb_link, rb_parent);
681 __vma_link_file(vma); 681 __vma_link_file(vma);
682 682
683 if (mapping) 683 if (mapping)
684 i_mmap_unlock_write(mapping); 684 i_mmap_unlock_write(mapping);
685 685
686 mm->map_count++; 686 mm->map_count++;
687 validate_mm(mm); 687 validate_mm(mm);
688 } 688 }
689 689
690 /* 690 /*
691 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 691 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
692 * mm's list and rbtree. It has already been inserted into the interval tree. 692 * mm's list and rbtree. It has already been inserted into the interval tree.
693 */ 693 */
694 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 694 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
695 { 695 {
696 struct vm_area_struct *prev; 696 struct vm_area_struct *prev;
697 struct rb_node **rb_link, *rb_parent; 697 struct rb_node **rb_link, *rb_parent;
698 698
699 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 699 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
700 &prev, &rb_link, &rb_parent)) 700 &prev, &rb_link, &rb_parent))
701 BUG(); 701 BUG();
702 __vma_link(mm, vma, prev, rb_link, rb_parent); 702 __vma_link(mm, vma, prev, rb_link, rb_parent);
703 mm->map_count++; 703 mm->map_count++;
704 } 704 }
705 705
706 static inline void 706 static inline void
707 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 707 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
708 struct vm_area_struct *prev) 708 struct vm_area_struct *prev)
709 { 709 {
710 struct vm_area_struct *next; 710 struct vm_area_struct *next;
711 711
712 vma_rb_erase(vma, &mm->mm_rb); 712 vma_rb_erase(vma, &mm->mm_rb);
713 prev->vm_next = next = vma->vm_next; 713 prev->vm_next = next = vma->vm_next;
714 if (next) 714 if (next)
715 next->vm_prev = prev; 715 next->vm_prev = prev;
716 716
717 /* Kill the cache */ 717 /* Kill the cache */
718 vmacache_invalidate(mm); 718 vmacache_invalidate(mm);
719 } 719 }
720 720
721 /* 721 /*
722 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that 722 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
723 * is already present in an i_mmap tree without adjusting the tree. 723 * is already present in an i_mmap tree without adjusting the tree.
724 * The following helper function should be used when such adjustments 724 * The following helper function should be used when such adjustments
725 * are necessary. The "insert" vma (if any) is to be inserted 725 * are necessary. The "insert" vma (if any) is to be inserted
726 * before we drop the necessary locks. 726 * before we drop the necessary locks.
727 */ 727 */
728 int vma_adjust(struct vm_area_struct *vma, unsigned long start, 728 int vma_adjust(struct vm_area_struct *vma, unsigned long start,
729 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 729 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
730 { 730 {
731 struct mm_struct *mm = vma->vm_mm; 731 struct mm_struct *mm = vma->vm_mm;
732 struct vm_area_struct *next = vma->vm_next; 732 struct vm_area_struct *next = vma->vm_next;
733 struct vm_area_struct *importer = NULL; 733 struct vm_area_struct *importer = NULL;
734 struct address_space *mapping = NULL; 734 struct address_space *mapping = NULL;
735 struct rb_root *root = NULL; 735 struct rb_root *root = NULL;
736 struct anon_vma *anon_vma = NULL; 736 struct anon_vma *anon_vma = NULL;
737 struct file *file = vma->vm_file; 737 struct file *file = vma->vm_file;
738 bool start_changed = false, end_changed = false; 738 bool start_changed = false, end_changed = false;
739 long adjust_next = 0; 739 long adjust_next = 0;
740 int remove_next = 0; 740 int remove_next = 0;
741 741
742 if (next && !insert) { 742 if (next && !insert) {
743 struct vm_area_struct *exporter = NULL; 743 struct vm_area_struct *exporter = NULL;
744 744
745 if (end >= next->vm_end) { 745 if (end >= next->vm_end) {
746 /* 746 /*
747 * vma expands, overlapping all the next, and 747 * vma expands, overlapping all the next, and
748 * perhaps the one after too (mprotect case 6). 748 * perhaps the one after too (mprotect case 6).
749 */ 749 */
750 again: remove_next = 1 + (end > next->vm_end); 750 again: remove_next = 1 + (end > next->vm_end);
751 end = next->vm_end; 751 end = next->vm_end;
752 exporter = next; 752 exporter = next;
753 importer = vma; 753 importer = vma;
754 } else if (end > next->vm_start) { 754 } else if (end > next->vm_start) {
755 /* 755 /*
756 * vma expands, overlapping part of the next: 756 * vma expands, overlapping part of the next:
757 * mprotect case 5 shifting the boundary up. 757 * mprotect case 5 shifting the boundary up.
758 */ 758 */
759 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 759 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
760 exporter = next; 760 exporter = next;
761 importer = vma; 761 importer = vma;
762 } else if (end < vma->vm_end) { 762 } else if (end < vma->vm_end) {
763 /* 763 /*
764 * vma shrinks, and !insert tells it's not 764 * vma shrinks, and !insert tells it's not
765 * split_vma inserting another: so it must be 765 * split_vma inserting another: so it must be
766 * mprotect case 4 shifting the boundary down. 766 * mprotect case 4 shifting the boundary down.
767 */ 767 */
768 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); 768 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
769 exporter = vma; 769 exporter = vma;
770 importer = next; 770 importer = next;
771 } 771 }
772 772
773 /* 773 /*
774 * Easily overlooked: when mprotect shifts the boundary, 774 * Easily overlooked: when mprotect shifts the boundary,
775 * make sure the expanding vma has anon_vma set if the 775 * make sure the expanding vma has anon_vma set if the
776 * shrinking vma had, to cover any anon pages imported. 776 * shrinking vma had, to cover any anon pages imported.
777 */ 777 */
778 if (exporter && exporter->anon_vma && !importer->anon_vma) { 778 if (exporter && exporter->anon_vma && !importer->anon_vma) {
779 int error; 779 int error;
780 780
781 error = anon_vma_clone(importer, exporter); 781 error = anon_vma_clone(importer, exporter);
782 if (error) 782 if (error)
783 return error; 783 return error;
784 importer->anon_vma = exporter->anon_vma; 784 importer->anon_vma = exporter->anon_vma;
785 } 785 }
786 } 786 }
787 787
788 if (file) { 788 if (file) {
789 mapping = file->f_mapping; 789 mapping = file->f_mapping;
790 if (!(vma->vm_flags & VM_NONLINEAR)) { 790 if (!(vma->vm_flags & VM_NONLINEAR)) {
791 root = &mapping->i_mmap; 791 root = &mapping->i_mmap;
792 uprobe_munmap(vma, vma->vm_start, vma->vm_end); 792 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
793 793
794 if (adjust_next) 794 if (adjust_next)
795 uprobe_munmap(next, next->vm_start, 795 uprobe_munmap(next, next->vm_start,
796 next->vm_end); 796 next->vm_end);
797 } 797 }
798 798
799 i_mmap_lock_write(mapping); 799 i_mmap_lock_write(mapping);
800 if (insert) { 800 if (insert) {
801 /* 801 /*
802 * Put into interval tree now, so instantiated pages 802 * Put into interval tree now, so instantiated pages
803 * are visible to arm/parisc __flush_dcache_page 803 * are visible to arm/parisc __flush_dcache_page
804 * throughout; but we cannot insert into address 804 * throughout; but we cannot insert into address
805 * space until vma start or end is updated. 805 * space until vma start or end is updated.
806 */ 806 */
807 __vma_link_file(insert); 807 __vma_link_file(insert);
808 } 808 }
809 } 809 }
810 810
811 vma_adjust_trans_huge(vma, start, end, adjust_next); 811 vma_adjust_trans_huge(vma, start, end, adjust_next);
812 812
813 anon_vma = vma->anon_vma; 813 anon_vma = vma->anon_vma;
814 if (!anon_vma && adjust_next) 814 if (!anon_vma && adjust_next)
815 anon_vma = next->anon_vma; 815 anon_vma = next->anon_vma;
816 if (anon_vma) { 816 if (anon_vma) {
817 VM_BUG_ON_VMA(adjust_next && next->anon_vma && 817 VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
818 anon_vma != next->anon_vma, next); 818 anon_vma != next->anon_vma, next);
819 anon_vma_lock_write(anon_vma); 819 anon_vma_lock_write(anon_vma);
820 anon_vma_interval_tree_pre_update_vma(vma); 820 anon_vma_interval_tree_pre_update_vma(vma);
821 if (adjust_next) 821 if (adjust_next)
822 anon_vma_interval_tree_pre_update_vma(next); 822 anon_vma_interval_tree_pre_update_vma(next);
823 } 823 }
824 824
825 if (root) { 825 if (root) {
826 flush_dcache_mmap_lock(mapping); 826 flush_dcache_mmap_lock(mapping);
827 vma_interval_tree_remove(vma, root); 827 vma_interval_tree_remove(vma, root);
828 if (adjust_next) 828 if (adjust_next)
829 vma_interval_tree_remove(next, root); 829 vma_interval_tree_remove(next, root);
830 } 830 }
831 831
832 if (start != vma->vm_start) { 832 if (start != vma->vm_start) {
833 vma->vm_start = start; 833 vma->vm_start = start;
834 start_changed = true; 834 start_changed = true;
835 } 835 }
836 if (end != vma->vm_end) { 836 if (end != vma->vm_end) {
837 vma->vm_end = end; 837 vma->vm_end = end;
838 end_changed = true; 838 end_changed = true;
839 } 839 }
840 vma->vm_pgoff = pgoff; 840 vma->vm_pgoff = pgoff;
841 if (adjust_next) { 841 if (adjust_next) {
842 next->vm_start += adjust_next << PAGE_SHIFT; 842 next->vm_start += adjust_next << PAGE_SHIFT;
843 next->vm_pgoff += adjust_next; 843 next->vm_pgoff += adjust_next;
844 } 844 }
845 845
846 if (root) { 846 if (root) {
847 if (adjust_next) 847 if (adjust_next)
848 vma_interval_tree_insert(next, root); 848 vma_interval_tree_insert(next, root);
849 vma_interval_tree_insert(vma, root); 849 vma_interval_tree_insert(vma, root);
850 flush_dcache_mmap_unlock(mapping); 850 flush_dcache_mmap_unlock(mapping);
851 } 851 }
852 852
853 if (remove_next) { 853 if (remove_next) {
854 /* 854 /*
855 * vma_merge has merged next into vma, and needs 855 * vma_merge has merged next into vma, and needs
856 * us to remove next before dropping the locks. 856 * us to remove next before dropping the locks.
857 */ 857 */
858 __vma_unlink(mm, next, vma); 858 __vma_unlink(mm, next, vma);
859 if (file) 859 if (file)
860 __remove_shared_vm_struct(next, file, mapping); 860 __remove_shared_vm_struct(next, file, mapping);
861 } else if (insert) { 861 } else if (insert) {
862 /* 862 /*
863 * split_vma has split insert from vma, and needs 863 * split_vma has split insert from vma, and needs
864 * us to insert it before dropping the locks 864 * us to insert it before dropping the locks
865 * (it may either follow vma or precede it). 865 * (it may either follow vma or precede it).
866 */ 866 */
867 __insert_vm_struct(mm, insert); 867 __insert_vm_struct(mm, insert);
868 } else { 868 } else {
869 if (start_changed) 869 if (start_changed)
870 vma_gap_update(vma); 870 vma_gap_update(vma);
871 if (end_changed) { 871 if (end_changed) {
872 if (!next) 872 if (!next)
873 mm->highest_vm_end = end; 873 mm->highest_vm_end = end;
874 else if (!adjust_next) 874 else if (!adjust_next)
875 vma_gap_update(next); 875 vma_gap_update(next);
876 } 876 }
877 } 877 }
878 878
879 if (anon_vma) { 879 if (anon_vma) {
880 anon_vma_interval_tree_post_update_vma(vma); 880 anon_vma_interval_tree_post_update_vma(vma);
881 if (adjust_next) 881 if (adjust_next)
882 anon_vma_interval_tree_post_update_vma(next); 882 anon_vma_interval_tree_post_update_vma(next);
883 anon_vma_unlock_write(anon_vma); 883 anon_vma_unlock_write(anon_vma);
884 } 884 }
885 if (mapping) 885 if (mapping)
886 i_mmap_unlock_write(mapping); 886 i_mmap_unlock_write(mapping);
887 887
888 if (root) { 888 if (root) {
889 uprobe_mmap(vma); 889 uprobe_mmap(vma);
890 890
891 if (adjust_next) 891 if (adjust_next)
892 uprobe_mmap(next); 892 uprobe_mmap(next);
893 } 893 }
894 894
895 if (remove_next) { 895 if (remove_next) {
896 if (file) { 896 if (file) {
897 uprobe_munmap(next, next->vm_start, next->vm_end); 897 uprobe_munmap(next, next->vm_start, next->vm_end);
898 fput(file); 898 fput(file);
899 } 899 }
900 if (next->anon_vma) 900 if (next->anon_vma)
901 anon_vma_merge(vma, next); 901 anon_vma_merge(vma, next);
902 mm->map_count--; 902 mm->map_count--;
903 mpol_put(vma_policy(next)); 903 mpol_put(vma_policy(next));
904 kmem_cache_free(vm_area_cachep, next); 904 kmem_cache_free(vm_area_cachep, next);
905 /* 905 /*
906 * In mprotect's case 6 (see comments on vma_merge), 906 * In mprotect's case 6 (see comments on vma_merge),
907 * we must remove another next too. It would clutter 907 * we must remove another next too. It would clutter
908 * up the code too much to do both in one go. 908 * up the code too much to do both in one go.
909 */ 909 */
910 next = vma->vm_next; 910 next = vma->vm_next;
911 if (remove_next == 2) 911 if (remove_next == 2)
912 goto again; 912 goto again;
913 else if (next) 913 else if (next)
914 vma_gap_update(next); 914 vma_gap_update(next);
915 else 915 else
916 mm->highest_vm_end = end; 916 mm->highest_vm_end = end;
917 } 917 }
918 if (insert && file) 918 if (insert && file)
919 uprobe_mmap(insert); 919 uprobe_mmap(insert);
920 920
921 validate_mm(mm); 921 validate_mm(mm);
922 922
923 return 0; 923 return 0;
924 } 924 }
925 925
926 /* 926 /*
927 * If the vma has a ->close operation then the driver probably needs to release 927 * If the vma has a ->close operation then the driver probably needs to release
928 * per-vma resources, so we don't attempt to merge those. 928 * per-vma resources, so we don't attempt to merge those.
929 */ 929 */
930 static inline int is_mergeable_vma(struct vm_area_struct *vma, 930 static inline int is_mergeable_vma(struct vm_area_struct *vma,
931 struct file *file, unsigned long vm_flags) 931 struct file *file, unsigned long vm_flags)
932 { 932 {
933 /* 933 /*
934 * VM_SOFTDIRTY should not prevent from VMA merging, if we 934 * VM_SOFTDIRTY should not prevent from VMA merging, if we
935 * match the flags but dirty bit -- the caller should mark 935 * match the flags but dirty bit -- the caller should mark
936 * merged VMA as dirty. If dirty bit won't be excluded from 936 * merged VMA as dirty. If dirty bit won't be excluded from
937 * comparison, we increase pressue on the memory system forcing 937 * comparison, we increase pressue on the memory system forcing
938 * the kernel to generate new VMAs when old one could be 938 * the kernel to generate new VMAs when old one could be
939 * extended instead. 939 * extended instead.
940 */ 940 */
941 if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) 941 if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
942 return 0; 942 return 0;
943 if (vma->vm_file != file) 943 if (vma->vm_file != file)
944 return 0; 944 return 0;
945 if (vma->vm_ops && vma->vm_ops->close) 945 if (vma->vm_ops && vma->vm_ops->close)
946 return 0; 946 return 0;
947 return 1; 947 return 1;
948 } 948 }
949 949
950 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 950 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
951 struct anon_vma *anon_vma2, 951 struct anon_vma *anon_vma2,
952 struct vm_area_struct *vma) 952 struct vm_area_struct *vma)
953 { 953 {
954 /* 954 /*
955 * The list_is_singular() test is to avoid merging VMA cloned from 955 * The list_is_singular() test is to avoid merging VMA cloned from
956 * parents. This can improve scalability caused by anon_vma lock. 956 * parents. This can improve scalability caused by anon_vma lock.
957 */ 957 */
958 if ((!anon_vma1 || !anon_vma2) && (!vma || 958 if ((!anon_vma1 || !anon_vma2) && (!vma ||
959 list_is_singular(&vma->anon_vma_chain))) 959 list_is_singular(&vma->anon_vma_chain)))
960 return 1; 960 return 1;
961 return anon_vma1 == anon_vma2; 961 return anon_vma1 == anon_vma2;
962 } 962 }
963 963
964 /* 964 /*
965 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 965 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
966 * in front of (at a lower virtual address and file offset than) the vma. 966 * in front of (at a lower virtual address and file offset than) the vma.
967 * 967 *
968 * We cannot merge two vmas if they have differently assigned (non-NULL) 968 * We cannot merge two vmas if they have differently assigned (non-NULL)
969 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 969 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
970 * 970 *
971 * We don't check here for the merged mmap wrapping around the end of pagecache 971 * We don't check here for the merged mmap wrapping around the end of pagecache
972 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmaps which 972 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmaps which
973 * wrap, nor mmaps which cover the final page at index -1UL. 973 * wrap, nor mmaps which cover the final page at index -1UL.
974 */ 974 */
975 static int 975 static int
976 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 976 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
977 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 977 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
978 { 978 {
979 if (is_mergeable_vma(vma, file, vm_flags) && 979 if (is_mergeable_vma(vma, file, vm_flags) &&
980 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 980 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
981 if (vma->vm_pgoff == vm_pgoff) 981 if (vma->vm_pgoff == vm_pgoff)
982 return 1; 982 return 1;
983 } 983 }
984 return 0; 984 return 0;
985 } 985 }
986 986
987 /* 987 /*
988 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 988 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
989 * beyond (at a higher virtual address and file offset than) the vma. 989 * beyond (at a higher virtual address and file offset than) the vma.
990 * 990 *
991 * We cannot merge two vmas if they have differently assigned (non-NULL) 991 * We cannot merge two vmas if they have differently assigned (non-NULL)
992 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 992 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
993 */ 993 */
994 static int 994 static int
995 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 995 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
996 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 996 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
997 { 997 {
998 if (is_mergeable_vma(vma, file, vm_flags) && 998 if (is_mergeable_vma(vma, file, vm_flags) &&
999 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 999 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1000 pgoff_t vm_pglen; 1000 pgoff_t vm_pglen;
1001 vm_pglen = vma_pages(vma); 1001 vm_pglen = vma_pages(vma);
1002 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 1002 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
1003 return 1; 1003 return 1;
1004 } 1004 }
1005 return 0; 1005 return 0;
1006 } 1006 }
1007 1007
1008 /* 1008 /*
1009 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out 1009 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
1010 * whether that can be merged with its predecessor or its successor. 1010 * whether that can be merged with its predecessor or its successor.
1011 * Or both (it neatly fills a hole). 1011 * Or both (it neatly fills a hole).
1012 * 1012 *
1013 * In most cases - when called for mmap, brk or mremap - [addr,end) is 1013 * In most cases - when called for mmap, brk or mremap - [addr,end) is
1014 * certain not to be mapped by the time vma_merge is called; but when 1014 * certain not to be mapped by the time vma_merge is called; but when
1015 * called for mprotect, it is certain to be already mapped (either at 1015 * called for mprotect, it is certain to be already mapped (either at
1016 * an offset within prev, or at the start of next), and the flags of 1016 * an offset within prev, or at the start of next), and the flags of
1017 * this area are about to be changed to vm_flags - and the no-change 1017 * this area are about to be changed to vm_flags - and the no-change
1018 * case has already been eliminated. 1018 * case has already been eliminated.
1019 * 1019 *
1020 * The following mprotect cases have to be considered, where AAAA is 1020 * The following mprotect cases have to be considered, where AAAA is
1021 * the area passed down from mprotect_fixup, never extending beyond one 1021 * the area passed down from mprotect_fixup, never extending beyond one
1022 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: 1022 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
1023 * 1023 *
1024 * AAAA AAAA AAAA AAAA 1024 * AAAA AAAA AAAA AAAA
1025 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX 1025 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
1026 * cannot merge might become might become might become 1026 * cannot merge might become might become might become
1027 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or 1027 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
1028 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or 1028 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
1029 * mremap move: PPPPNNNNNNNN 8 1029 * mremap move: PPPPNNNNNNNN 8
1030 * AAAA 1030 * AAAA
1031 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 1031 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
1032 * might become case 1 below case 2 below case 3 below 1032 * might become case 1 below case 2 below case 3 below
1033 * 1033 *
1034 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: 1034 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
1035 * mprotect_fixup updates vm_flags & vm_page_prot on successful return. 1035 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
1036 */ 1036 */
1037 struct vm_area_struct *vma_merge(struct mm_struct *mm, 1037 struct vm_area_struct *vma_merge(struct mm_struct *mm,
1038 struct vm_area_struct *prev, unsigned long addr, 1038 struct vm_area_struct *prev, unsigned long addr,
1039 unsigned long end, unsigned long vm_flags, 1039 unsigned long end, unsigned long vm_flags,
1040 struct anon_vma *anon_vma, struct file *file, 1040 struct anon_vma *anon_vma, struct file *file,
1041 pgoff_t pgoff, struct mempolicy *policy) 1041 pgoff_t pgoff, struct mempolicy *policy)
1042 { 1042 {
1043 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1043 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1044 struct vm_area_struct *area, *next; 1044 struct vm_area_struct *area, *next;
1045 int err; 1045 int err;
1046 1046
1047 /* 1047 /*
1048 * We later require that vma->vm_flags == vm_flags, 1048 * We later require that vma->vm_flags == vm_flags,
1049 * so this tests vma->vm_flags & VM_SPECIAL, too. 1049 * so this tests vma->vm_flags & VM_SPECIAL, too.
1050 */ 1050 */
1051 if (vm_flags & VM_SPECIAL) 1051 if (vm_flags & VM_SPECIAL)
1052 return NULL; 1052 return NULL;
1053 1053
1054 if (prev) 1054 if (prev)
1055 next = prev->vm_next; 1055 next = prev->vm_next;
1056 else 1056 else
1057 next = mm->mmap; 1057 next = mm->mmap;
1058 area = next; 1058 area = next;
1059 if (next && next->vm_end == end) /* cases 6, 7, 8 */ 1059 if (next && next->vm_end == end) /* cases 6, 7, 8 */
1060 next = next->vm_next; 1060 next = next->vm_next;
1061 1061
1062 /* 1062 /*
1063 * Can it merge with the predecessor? 1063 * Can it merge with the predecessor?
1064 */ 1064 */
1065 if (prev && prev->vm_end == addr && 1065 if (prev && prev->vm_end == addr &&
1066 mpol_equal(vma_policy(prev), policy) && 1066 mpol_equal(vma_policy(prev), policy) &&
1067 can_vma_merge_after(prev, vm_flags, 1067 can_vma_merge_after(prev, vm_flags,
1068 anon_vma, file, pgoff)) { 1068 anon_vma, file, pgoff)) {
1069 /* 1069 /*
1070 * OK, it can. Can we now merge in the successor as well? 1070 * OK, it can. Can we now merge in the successor as well?
1071 */ 1071 */
1072 if (next && end == next->vm_start && 1072 if (next && end == next->vm_start &&
1073 mpol_equal(policy, vma_policy(next)) && 1073 mpol_equal(policy, vma_policy(next)) &&
1074 can_vma_merge_before(next, vm_flags, 1074 can_vma_merge_before(next, vm_flags,
1075 anon_vma, file, pgoff+pglen) && 1075 anon_vma, file, pgoff+pglen) &&
1076 is_mergeable_anon_vma(prev->anon_vma, 1076 is_mergeable_anon_vma(prev->anon_vma,
1077 next->anon_vma, NULL)) { 1077 next->anon_vma, NULL)) {
1078 /* cases 1, 6 */ 1078 /* cases 1, 6 */
1079 err = vma_adjust(prev, prev->vm_start, 1079 err = vma_adjust(prev, prev->vm_start,
1080 next->vm_end, prev->vm_pgoff, NULL); 1080 next->vm_end, prev->vm_pgoff, NULL);
1081 } else /* cases 2, 5, 7 */ 1081 } else /* cases 2, 5, 7 */
1082 err = vma_adjust(prev, prev->vm_start, 1082 err = vma_adjust(prev, prev->vm_start,
1083 end, prev->vm_pgoff, NULL); 1083 end, prev->vm_pgoff, NULL);
1084 if (err) 1084 if (err)
1085 return NULL; 1085 return NULL;
1086 khugepaged_enter_vma_merge(prev, vm_flags); 1086 khugepaged_enter_vma_merge(prev, vm_flags);
1087 return prev; 1087 return prev;
1088 } 1088 }
1089 1089
1090 /* 1090 /*
1091 * Can this new request be merged in front of next? 1091 * Can this new request be merged in front of next?
1092 */ 1092 */
1093 if (next && end == next->vm_start && 1093 if (next && end == next->vm_start &&
1094 mpol_equal(policy, vma_policy(next)) && 1094 mpol_equal(policy, vma_policy(next)) &&
1095 can_vma_merge_before(next, vm_flags, 1095 can_vma_merge_before(next, vm_flags,
1096 anon_vma, file, pgoff+pglen)) { 1096 anon_vma, file, pgoff+pglen)) {
1097 if (prev && addr < prev->vm_end) /* case 4 */ 1097 if (prev && addr < prev->vm_end) /* case 4 */
1098 err = vma_adjust(prev, prev->vm_start, 1098 err = vma_adjust(prev, prev->vm_start,
1099 addr, prev->vm_pgoff, NULL); 1099 addr, prev->vm_pgoff, NULL);
1100 else /* cases 3, 8 */ 1100 else /* cases 3, 8 */
1101 err = vma_adjust(area, addr, next->vm_end, 1101 err = vma_adjust(area, addr, next->vm_end,
1102 next->vm_pgoff - pglen, NULL); 1102 next->vm_pgoff - pglen, NULL);
1103 if (err) 1103 if (err)
1104 return NULL; 1104 return NULL;
1105 khugepaged_enter_vma_merge(area, vm_flags); 1105 khugepaged_enter_vma_merge(area, vm_flags);
1106 return area; 1106 return area;
1107 } 1107 }
1108 1108
1109 return NULL; 1109 return NULL;
1110 } 1110 }
1111 1111
1112 /* 1112 /*
1113 * Rough compatibility check to quickly see if it's even worth looking 1113 * Rough compatibility check to quickly see if it's even worth looking
1114 * at sharing an anon_vma. 1114 * at sharing an anon_vma.
1115 * 1115 *
1116 * They need to have the same vm_file, and the flags can only differ 1116 * They need to have the same vm_file, and the flags can only differ
1117 * in things that mprotect may change. 1117 * in things that mprotect may change.
1118 * 1118 *
1119 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1119 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1120 * we can merge the two vma's. For example, we refuse to merge a vma if 1120 * we can merge the two vma's. For example, we refuse to merge a vma if
1121 * there is a vm_ops->close() function, because that indicates that the 1121 * there is a vm_ops->close() function, because that indicates that the
1122 * driver is doing some kind of reference counting. But that doesn't 1122 * driver is doing some kind of reference counting. But that doesn't
1123 * really matter for the anon_vma sharing case. 1123 * really matter for the anon_vma sharing case.
1124 */ 1124 */
1125 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1125 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1126 { 1126 {
1127 return a->vm_end == b->vm_start && 1127 return a->vm_end == b->vm_start &&
1128 mpol_equal(vma_policy(a), vma_policy(b)) && 1128 mpol_equal(vma_policy(a), vma_policy(b)) &&
1129 a->vm_file == b->vm_file && 1129 a->vm_file == b->vm_file &&
1130 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && 1130 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
1131 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1131 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1132 } 1132 }
1133 1133
1134 /* 1134 /*
1135 * Do some basic sanity checking to see if we can re-use the anon_vma 1135 * Do some basic sanity checking to see if we can re-use the anon_vma
1136 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1136 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1137 * the same as 'old', the other will be the new one that is trying 1137 * the same as 'old', the other will be the new one that is trying
1138 * to share the anon_vma. 1138 * to share the anon_vma.
1139 * 1139 *
1140 * NOTE! This runs with mm_sem held for reading, so it is possible that 1140 * NOTE! This runs with mm_sem held for reading, so it is possible that
1141 * the anon_vma of 'old' is concurrently in the process of being set up 1141 * the anon_vma of 'old' is concurrently in the process of being set up
1142 * by another page fault trying to merge _that_. But that's ok: if it 1142 * by another page fault trying to merge _that_. But that's ok: if it
1143 * is being set up, that automatically means that it will be a singleton 1143 * is being set up, that automatically means that it will be a singleton
1144 * acceptable for merging, so we can do all of this optimistically. But 1144 * acceptable for merging, so we can do all of this optimistically. But
1145 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1145 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
1146 * 1146 *
1147 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1147 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1148 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1148 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1149 * is to return an anon_vma that is "complex" due to having gone through 1149 * is to return an anon_vma that is "complex" due to having gone through
1150 * a fork). 1150 * a fork).
1151 * 1151 *
1152 * We also make sure that the two vma's are compatible (adjacent, 1152 * We also make sure that the two vma's are compatible (adjacent,
1153 * and with the same memory policies). That's all stable, even with just 1153 * and with the same memory policies). That's all stable, even with just
1154 * a read lock on the mm_sem. 1154 * a read lock on the mm_sem.
1155 */ 1155 */
1156 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1156 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1157 { 1157 {
1158 if (anon_vma_compatible(a, b)) { 1158 if (anon_vma_compatible(a, b)) {
1159 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1159 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
1160 1160
1161 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1161 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1162 return anon_vma; 1162 return anon_vma;
1163 } 1163 }
1164 return NULL; 1164 return NULL;
1165 } 1165 }
1166 1166
1167 /* 1167 /*
1168 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1168 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1169 * neighbouring vmas for a suitable anon_vma, before it goes off 1169 * neighbouring vmas for a suitable anon_vma, before it goes off
1170 * to allocate a new anon_vma. It checks because a repetitive 1170 * to allocate a new anon_vma. It checks because a repetitive
1171 * sequence of mprotects and faults may otherwise lead to distinct 1171 * sequence of mprotects and faults may otherwise lead to distinct
1172 * anon_vmas being allocated, preventing vma merge in subsequent 1172 * anon_vmas being allocated, preventing vma merge in subsequent
1173 * mprotect. 1173 * mprotect.
1174 */ 1174 */
1175 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1175 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1176 { 1176 {
1177 struct anon_vma *anon_vma; 1177 struct anon_vma *anon_vma;
1178 struct vm_area_struct *near; 1178 struct vm_area_struct *near;
1179 1179
1180 near = vma->vm_next; 1180 near = vma->vm_next;
1181 if (!near) 1181 if (!near)
1182 goto try_prev; 1182 goto try_prev;
1183 1183
1184 anon_vma = reusable_anon_vma(near, vma, near); 1184 anon_vma = reusable_anon_vma(near, vma, near);
1185 if (anon_vma) 1185 if (anon_vma)
1186 return anon_vma; 1186 return anon_vma;
1187 try_prev: 1187 try_prev:
1188 near = vma->vm_prev; 1188 near = vma->vm_prev;
1189 if (!near) 1189 if (!near)
1190 goto none; 1190 goto none;
1191 1191
1192 anon_vma = reusable_anon_vma(near, near, vma); 1192 anon_vma = reusable_anon_vma(near, near, vma);
1193 if (anon_vma) 1193 if (anon_vma)
1194 return anon_vma; 1194 return anon_vma;
1195 none: 1195 none:
1196 /* 1196 /*
1197 * There's no absolute need to look only at touching neighbours: 1197 * There's no absolute need to look only at touching neighbours:
1198 * we could search further afield for "compatible" anon_vmas. 1198 * we could search further afield for "compatible" anon_vmas.
1199 * But it would probably just be a waste of time searching, 1199 * But it would probably just be a waste of time searching,
1200 * or lead to too many vmas hanging off the same anon_vma. 1200 * or lead to too many vmas hanging off the same anon_vma.
1201 * We're trying to allow mprotect remerging later on, 1201 * We're trying to allow mprotect remerging later on,
1202 * not trying to minimize memory used for anon_vmas. 1202 * not trying to minimize memory used for anon_vmas.
1203 */ 1203 */
1204 return NULL; 1204 return NULL;
1205 } 1205 }
1206 1206
1207 #ifdef CONFIG_PROC_FS 1207 #ifdef CONFIG_PROC_FS
1208 void vm_stat_account(struct mm_struct *mm, unsigned long flags, 1208 void vm_stat_account(struct mm_struct *mm, unsigned long flags,
1209 struct file *file, long pages) 1209 struct file *file, long pages)
1210 { 1210 {
1211 const unsigned long stack_flags 1211 const unsigned long stack_flags
1212 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 1212 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
1213 1213
1214 mm->total_vm += pages; 1214 mm->total_vm += pages;
1215 1215
1216 if (file) { 1216 if (file) {
1217 mm->shared_vm += pages; 1217 mm->shared_vm += pages;
1218 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 1218 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
1219 mm->exec_vm += pages; 1219 mm->exec_vm += pages;
1220 } else if (flags & stack_flags) 1220 } else if (flags & stack_flags)
1221 mm->stack_vm += pages; 1221 mm->stack_vm += pages;
1222 } 1222 }
1223 #endif /* CONFIG_PROC_FS */ 1223 #endif /* CONFIG_PROC_FS */
1224 1224
1225 /* 1225 /*
1226 * If a hint addr is less than mmap_min_addr change hint to be as 1226 * If a hint addr is less than mmap_min_addr change hint to be as
1227 * low as possible but still greater than mmap_min_addr 1227 * low as possible but still greater than mmap_min_addr
1228 */ 1228 */
1229 static inline unsigned long round_hint_to_min(unsigned long hint) 1229 static inline unsigned long round_hint_to_min(unsigned long hint)
1230 { 1230 {
1231 hint &= PAGE_MASK; 1231 hint &= PAGE_MASK;
1232 if (((void *)hint != NULL) && 1232 if (((void *)hint != NULL) &&
1233 (hint < mmap_min_addr)) 1233 (hint < mmap_min_addr))
1234 return PAGE_ALIGN(mmap_min_addr); 1234 return PAGE_ALIGN(mmap_min_addr);
1235 return hint; 1235 return hint;
1236 } 1236 }
1237 1237
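/*
 * Check whether mlocking @len more bytes would push this mm over
 * RLIMIT_MEMLOCK.  Only relevant when the mapping will be VM_LOCKED;
 * tasks with CAP_IPC_LOCK are exempt.  Returns 0 if the mapping may
 * proceed, -EAGAIN otherwise.
 */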
1238 static inline int mlock_future_check(struct mm_struct *mm, 1238 static inline int mlock_future_check(struct mm_struct *mm,
1239 unsigned long flags, 1239 unsigned long flags,
1240 unsigned long len) 1240 unsigned long len)
1241 { 1241 {
1242 unsigned long locked, lock_limit; 1242 unsigned long locked, lock_limit;
1243 1243
1244 /* mlock MCL_FUTURE? */ 1244 /* mlock MCL_FUTURE? */
1245 if (flags & VM_LOCKED) { 1245 if (flags & VM_LOCKED) {
1246 locked = len >> PAGE_SHIFT; 1246 locked = len >> PAGE_SHIFT;
1247 locked += mm->locked_vm; 1247 locked += mm->locked_vm;
1248 lock_limit = rlimit(RLIMIT_MEMLOCK); 1248 lock_limit = rlimit(RLIMIT_MEMLOCK);
1249 lock_limit >>= PAGE_SHIFT; 1249 lock_limit >>= PAGE_SHIFT;
1250 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1250 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1251 return -EAGAIN; 1251 return -EAGAIN;
1252 } 1252 }
1253 return 0; 1253 return 0;
1254 } 1254 }
1255 1255
1256 /* 1256 /*
1257 * The caller must hold down_write(&current->mm->mmap_sem). 1257 * The caller must hold down_write(&current->mm->mmap_sem).
1258 */ 1258 */
1259 1259
1260 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1260 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1261 unsigned long len, unsigned long prot, 1261 unsigned long len, unsigned long prot,
1262 unsigned long flags, unsigned long pgoff, 1262 unsigned long flags, unsigned long pgoff,
1263 unsigned long *populate) 1263 unsigned long *populate)
1264 { 1264 {
1265 struct mm_struct *mm = current->mm; 1265 struct mm_struct *mm = current->mm;
1266 vm_flags_t vm_flags; 1266 vm_flags_t vm_flags;
1267 1267
1268 *populate = 0; 1268 *populate = 0;
1269 1269
1270 /* 1270 /*
1271 * Does the application expect PROT_READ to imply PROT_EXEC? 1271 * Does the application expect PROT_READ to imply PROT_EXEC?
1272 * 1272 *
1273 * (the exception is when the underlying filesystem is noexec 1273 * (the exception is when the underlying filesystem is noexec
1274 * mounted, in which case we don't add PROT_EXEC.) 1274 * mounted, in which case we don't add PROT_EXEC.)
1275 */ 1275 */
1276 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) 1276 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1277 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) 1277 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1278 prot |= PROT_EXEC; 1278 prot |= PROT_EXEC;
1279 1279
1280 if (!len) 1280 if (!len)
1281 return -EINVAL; 1281 return -EINVAL;
1282 1282
1283 if (!(flags & MAP_FIXED)) 1283 if (!(flags & MAP_FIXED))
1284 addr = round_hint_to_min(addr); 1284 addr = round_hint_to_min(addr);
1285 1285
1286 /* Careful about overflows.. */ 1286 /* Careful about overflows.. */
1287 len = PAGE_ALIGN(len); 1287 len = PAGE_ALIGN(len);
1288 if (!len) 1288 if (!len)
1289 return -ENOMEM; 1289 return -ENOMEM;
1290 1290
1291 /* offset overflow? */ 1291 /* offset overflow? */
1292 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1292 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1293 return -EOVERFLOW; 1293 return -EOVERFLOW;
1294 1294
1295 /* Too many mappings? */ 1295 /* Too many mappings? */
1296 if (mm->map_count > sysctl_max_map_count) 1296 if (mm->map_count > sysctl_max_map_count)
1297 return -ENOMEM; 1297 return -ENOMEM;
1298 1298
1299 /* Obtain the address to map to. We verify (or select) it and ensure 1299 /* Obtain the address to map to. We verify (or select) it and ensure
1300 * that it represents a valid section of the address space. 1300 * that it represents a valid section of the address space.
1301 */ 1301 */
1302 addr = get_unmapped_area(file, addr, len, pgoff, flags); 1302 addr = get_unmapped_area(file, addr, len, pgoff, flags);
1303 if (addr & ~PAGE_MASK) 1303 if (addr & ~PAGE_MASK)
1304 return addr; 1304 return addr;
1305 1305
1306 /* Do simple checking here so the lower-level routines won't have 1306 /* Do simple checking here so the lower-level routines won't have
1307 * to. We assume access permissions have been handled by the open 1307 * to. We assume access permissions have been handled by the open
1308 * of the memory object, so we don't do any here. 1308 * of the memory object, so we don't do any here.
1309 */ 1309 */
1310 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1310 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1311 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1311 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1312 1312
1313 if (flags & MAP_LOCKED) 1313 if (flags & MAP_LOCKED)
1314 if (!can_do_mlock()) 1314 if (!can_do_mlock())
1315 return -EPERM; 1315 return -EPERM;
1316 1316
1317 if (mlock_future_check(mm, vm_flags, len)) 1317 if (mlock_future_check(mm, vm_flags, len))
1318 return -EAGAIN; 1318 return -EAGAIN;
1319 1319
1320 if (file) { 1320 if (file) {
1321 struct inode *inode = file_inode(file); 1321 struct inode *inode = file_inode(file);
1322 1322
1323 switch (flags & MAP_TYPE) { 1323 switch (flags & MAP_TYPE) {
1324 case MAP_SHARED: 1324 case MAP_SHARED:
1325 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1325 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
1326 return -EACCES; 1326 return -EACCES;
1327 1327
1328 /* 1328 /*
1329 * Make sure we don't allow writing to an append-only 1329 * Make sure we don't allow writing to an append-only
1330 * file.. 1330 * file..
1331 */ 1331 */
1332 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) 1332 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1333 return -EACCES; 1333 return -EACCES;
1334 1334
1335 /* 1335 /*
1336 * Make sure there are no mandatory locks on the file. 1336 * Make sure there are no mandatory locks on the file.
1337 */ 1337 */
1338 if (locks_verify_locked(file)) 1338 if (locks_verify_locked(file))
1339 return -EAGAIN; 1339 return -EAGAIN;
1340 1340
1341 vm_flags |= VM_SHARED | VM_MAYSHARE; 1341 vm_flags |= VM_SHARED | VM_MAYSHARE;
1342 if (!(file->f_mode & FMODE_WRITE)) 1342 if (!(file->f_mode & FMODE_WRITE))
1343 vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 1343 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1344 1344
1345 /* fall through */ 1345 /* fall through */
1346 case MAP_PRIVATE: 1346 case MAP_PRIVATE:
1347 if (!(file->f_mode & FMODE_READ)) 1347 if (!(file->f_mode & FMODE_READ))
1348 return -EACCES; 1348 return -EACCES;
1349 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 1349 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1350 if (vm_flags & VM_EXEC) 1350 if (vm_flags & VM_EXEC)
1351 return -EPERM; 1351 return -EPERM;
1352 vm_flags &= ~VM_MAYEXEC; 1352 vm_flags &= ~VM_MAYEXEC;
1353 } 1353 }
1354 1354
1355 if (!file->f_op->mmap) 1355 if (!file->f_op->mmap)
1356 return -ENODEV; 1356 return -ENODEV;
1357 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1357 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1358 return -EINVAL; 1358 return -EINVAL;
1359 break; 1359 break;
1360 1360
1361 default: 1361 default:
1362 return -EINVAL; 1362 return -EINVAL;
1363 } 1363 }
1364 } else { 1364 } else {
1365 switch (flags & MAP_TYPE) { 1365 switch (flags & MAP_TYPE) {
1366 case MAP_SHARED: 1366 case MAP_SHARED:
1367 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1367 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1368 return -EINVAL; 1368 return -EINVAL;
1369 /* 1369 /*
1370 * Ignore pgoff. 1370 * Ignore pgoff.
1371 */ 1371 */
1372 pgoff = 0; 1372 pgoff = 0;
1373 vm_flags |= VM_SHARED | VM_MAYSHARE; 1373 vm_flags |= VM_SHARED | VM_MAYSHARE;
1374 break; 1374 break;
1375 case MAP_PRIVATE: 1375 case MAP_PRIVATE:
1376 /* 1376 /*
1377 * Set pgoff according to addr for anon_vma. 1377 * Set pgoff according to addr for anon_vma.
1378 */ 1378 */
1379 pgoff = addr >> PAGE_SHIFT; 1379 pgoff = addr >> PAGE_SHIFT;
1380 break; 1380 break;
1381 default: 1381 default:
1382 return -EINVAL; 1382 return -EINVAL;
1383 } 1383 }
1384 } 1384 }
1385 1385
1386 /* 1386 /*
1387 * Set 'VM_NORESERVE' if we should not account for the 1387 * Set 'VM_NORESERVE' if we should not account for the
1388 * memory use of this mapping. 1388 * memory use of this mapping.
1389 */ 1389 */
1390 if (flags & MAP_NORESERVE) { 1390 if (flags & MAP_NORESERVE) {
1391 /* We honor MAP_NORESERVE if allowed to overcommit */ 1391 /* We honor MAP_NORESERVE if allowed to overcommit */
1392 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) 1392 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1393 vm_flags |= VM_NORESERVE; 1393 vm_flags |= VM_NORESERVE;
1394 1394
1395 /* hugetlb applies strict overcommit unless MAP_NORESERVE */ 1395 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1396 if (file && is_file_hugepages(file)) 1396 if (file && is_file_hugepages(file))
1397 vm_flags |= VM_NORESERVE; 1397 vm_flags |= VM_NORESERVE;
1398 } 1398 }
1399 1399
1400 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1400 addr = mmap_region(file, addr, len, vm_flags, pgoff);
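	/*
	 * Ask the caller to prefault the range when the mapping is
	 * VM_LOCKED, or when MAP_POPULATE was requested without
	 * MAP_NONBLOCK.
	 */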
1401 if (!IS_ERR_VALUE(addr) && 1401 if (!IS_ERR_VALUE(addr) &&
1402 ((vm_flags & VM_LOCKED) || 1402 ((vm_flags & VM_LOCKED) ||
1403 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) 1403 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1404 *populate = len; 1404 *populate = len;
1405 return addr; 1405 return addr;
1406 } 1406 }
1407 1407
1408 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1408 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1409 unsigned long, prot, unsigned long, flags, 1409 unsigned long, prot, unsigned long, flags,
1410 unsigned long, fd, unsigned long, pgoff) 1410 unsigned long, fd, unsigned long, pgoff)
1411 { 1411 {
1412 struct file *file = NULL; 1412 struct file *file = NULL;
1413 unsigned long retval = -EBADF; 1413 unsigned long retval = -EBADF;
1414 1414
1415 if (!(flags & MAP_ANONYMOUS)) { 1415 if (!(flags & MAP_ANONYMOUS)) {
1416 audit_mmap_fd(fd, flags); 1416 audit_mmap_fd(fd, flags);
1417 file = fget(fd); 1417 file = fget(fd);
1418 if (!file) 1418 if (!file)
1419 goto out; 1419 goto out;
1420 if (is_file_hugepages(file)) 1420 if (is_file_hugepages(file))
1421 len = ALIGN(len, huge_page_size(hstate_file(file))); 1421 len = ALIGN(len, huge_page_size(hstate_file(file)));
1422 retval = -EINVAL; 1422 retval = -EINVAL;
1423 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) 1423 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1424 goto out_fput; 1424 goto out_fput;
1425 } else if (flags & MAP_HUGETLB) { 1425 } else if (flags & MAP_HUGETLB) {
1426 struct user_struct *user = NULL; 1426 struct user_struct *user = NULL;
1427 struct hstate *hs; 1427 struct hstate *hs;
1428 1428
1429 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); 1429 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
1430 if (!hs) 1430 if (!hs)
1431 return -EINVAL; 1431 return -EINVAL;
1432 1432
1433 len = ALIGN(len, huge_page_size(hs)); 1433 len = ALIGN(len, huge_page_size(hs));
1434 /* 1434 /*
1435 * VM_NORESERVE is used because the reservations will be 1435 * VM_NORESERVE is used because the reservations will be
1436 * taken when vm_ops->mmap() is called 1436 * taken when vm_ops->mmap() is called
1437 * A dummy user value is used because we are not locking 1437 * A dummy user value is used because we are not locking
1438 * memory so no accounting is necessary 1438 * memory so no accounting is necessary
1439 */ 1439 */
1440 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, 1440 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1441 VM_NORESERVE, 1441 VM_NORESERVE,
1442 &user, HUGETLB_ANONHUGE_INODE, 1442 &user, HUGETLB_ANONHUGE_INODE,
1443 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1443 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1444 if (IS_ERR(file)) 1444 if (IS_ERR(file))
1445 return PTR_ERR(file); 1445 return PTR_ERR(file);
1446 } 1446 }
1447 1447
1448 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1448 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1449 1449
1450 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1450 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1451 out_fput: 1451 out_fput:
1452 if (file) 1452 if (file)
1453 fput(file); 1453 fput(file);
1454 out: 1454 out:
1455 return retval; 1455 return retval;
1456 } 1456 }
1457 1457
1458 #ifdef __ARCH_WANT_SYS_OLD_MMAP 1458 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1459 struct mmap_arg_struct { 1459 struct mmap_arg_struct {
1460 unsigned long addr; 1460 unsigned long addr;
1461 unsigned long len; 1461 unsigned long len;
1462 unsigned long prot; 1462 unsigned long prot;
1463 unsigned long flags; 1463 unsigned long flags;
1464 unsigned long fd; 1464 unsigned long fd;
1465 unsigned long offset; 1465 unsigned long offset;
1466 }; 1466 };
1467 1467
1468 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 1468 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1469 { 1469 {
1470 struct mmap_arg_struct a; 1470 struct mmap_arg_struct a;
1471 1471
1472 if (copy_from_user(&a, arg, sizeof(a))) 1472 if (copy_from_user(&a, arg, sizeof(a)))
1473 return -EFAULT; 1473 return -EFAULT;
1474 if (a.offset & ~PAGE_MASK) 1474 if (a.offset & ~PAGE_MASK)
1475 return -EINVAL; 1475 return -EINVAL;
1476 1476
1477 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1477 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1478 a.offset >> PAGE_SHIFT); 1478 a.offset >> PAGE_SHIFT);
1479 } 1479 }
1480 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1480 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1481 1481
1482 /* 1482 /*
1483 * Some shared mappings will want the pages marked read-only 1483 * Some shared mappings will want the pages marked read-only
1484 * to track write events. If so, we'll downgrade vm_page_prot 1484 * to track write events. If so, we'll downgrade vm_page_prot
1485 * to the private version (using protection_map[] without the 1485 * to the private version (using protection_map[] without the
1486 * VM_SHARED bit). 1486 * VM_SHARED bit).
1487 */ 1487 */
1488 int vma_wants_writenotify(struct vm_area_struct *vma) 1488 int vma_wants_writenotify(struct vm_area_struct *vma)
1489 { 1489 {
1490 vm_flags_t vm_flags = vma->vm_flags; 1490 vm_flags_t vm_flags = vma->vm_flags;
1491 1491
1492 /* If it was private or non-writable, the write bit is already clear */ 1492 /* If it was private or non-writable, the write bit is already clear */
1493 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1493 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1494 return 0; 1494 return 0;
1495 1495
1496 /* The backer wishes to know when pages are first written to? */ 1496 /* The backer wishes to know when pages are first written to? */
1497 if (vma->vm_ops && vma->vm_ops->page_mkwrite) 1497 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1498 return 1; 1498 return 1;
1499 1499
1500 /* The open routine did something to the protections that pgprot_modify 1500 /* The open routine did something to the protections that pgprot_modify
1501 * won't preserve? */ 1501 * won't preserve? */
1502 if (pgprot_val(vma->vm_page_prot) != 1502 if (pgprot_val(vma->vm_page_prot) !=
1503 pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) 1503 pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
1504 return 0; 1504 return 0;
1505 1505
1506 /* Do we need to track softdirty? */ 1506 /* Do we need to track softdirty? */
1507 if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) 1507 if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
1508 return 1; 1508 return 1;
1509 1509
1510 /* Specialty mapping? */ 1510 /* Specialty mapping? */
1511 if (vm_flags & VM_PFNMAP) 1511 if (vm_flags & VM_PFNMAP)
1512 return 0; 1512 return 0;
1513 1513
1514 /* Can the mapping track the dirty pages? */ 1514 /* Can the mapping track the dirty pages? */
1515 return vma->vm_file && vma->vm_file->f_mapping && 1515 return vma->vm_file && vma->vm_file->f_mapping &&
1516 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1516 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1517 } 1517 }
1518 1518
1519 /* 1519 /*
1520 * We account for memory if it's a private writeable mapping, 1520 * We account for memory if it's a private writeable mapping,
1521 * not hugepages and VM_NORESERVE wasn't set. 1521 * not hugepages and VM_NORESERVE wasn't set.
1522 */ 1522 */
1523 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) 1523 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1524 { 1524 {
1525 /* 1525 /*
1526 * hugetlb has its own accounting separate from the core VM 1526 * hugetlb has its own accounting separate from the core VM
1527 * VM_HUGETLB may not be set yet so we cannot check for that flag. 1527 * VM_HUGETLB may not be set yet so we cannot check for that flag.
1528 */ 1528 */
1529 if (file && is_file_hugepages(file)) 1529 if (file && is_file_hugepages(file))
1530 return 0; 1530 return 0;
1531 1531
1532 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 1532 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1533 } 1533 }
1534 1534
1535 unsigned long mmap_region(struct file *file, unsigned long addr, 1535 unsigned long mmap_region(struct file *file, unsigned long addr,
1536 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) 1536 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1537 { 1537 {
1538 struct mm_struct *mm = current->mm; 1538 struct mm_struct *mm = current->mm;
1539 struct vm_area_struct *vma, *prev; 1539 struct vm_area_struct *vma, *prev;
1540 int error; 1540 int error;
1541 struct rb_node **rb_link, *rb_parent; 1541 struct rb_node **rb_link, *rb_parent;
1542 unsigned long charged = 0; 1542 unsigned long charged = 0;
1543 1543
1544 /* Check against address space limit. */ 1544 /* Check against address space limit. */
1545 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1545 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
1546 unsigned long nr_pages; 1546 unsigned long nr_pages;
1547 1547
1548 /* 1548 /*
1549 * MAP_FIXED may remove pages of mappings that intersect with 1549 * MAP_FIXED may remove pages of mappings that intersect with
1550 * requested mapping. Account for the pages it would unmap. 1550 * requested mapping. Account for the pages it would unmap.
1551 */ 1551 */
1552 if (!(vm_flags & MAP_FIXED)) 1552 if (!(vm_flags & MAP_FIXED))
1553 return -ENOMEM; 1553 return -ENOMEM;
1554 1554
1555 nr_pages = count_vma_pages_range(mm, addr, addr + len); 1555 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1556 1556
1557 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) 1557 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
1558 return -ENOMEM; 1558 return -ENOMEM;
1559 } 1559 }
1560 1560
1561 /* Clear old maps */ 1561 /* Clear old maps */
1562 error = -ENOMEM; 1562 error = -ENOMEM;
1563 munmap_back: 1563 munmap_back:
1564 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1564 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1565 if (do_munmap(mm, addr, len)) 1565 if (do_munmap(mm, addr, len))
1566 return -ENOMEM; 1566 return -ENOMEM;
1567 goto munmap_back; 1567 goto munmap_back;
1568 } 1568 }
1569 1569
1570 /* 1570 /*
1571 * Private writable mapping: check memory availability 1571 * Private writable mapping: check memory availability
1572 */ 1572 */
1573 if (accountable_mapping(file, vm_flags)) { 1573 if (accountable_mapping(file, vm_flags)) {
1574 charged = len >> PAGE_SHIFT; 1574 charged = len >> PAGE_SHIFT;
1575 if (security_vm_enough_memory_mm(mm, charged)) 1575 if (security_vm_enough_memory_mm(mm, charged))
1576 return -ENOMEM; 1576 return -ENOMEM;
1577 vm_flags |= VM_ACCOUNT; 1577 vm_flags |= VM_ACCOUNT;
1578 } 1578 }
1579 1579
1580 /* 1580 /*
1581 * Can we just expand an old mapping? 1581 * Can we just expand an old mapping?
1582 */ 1582 */
1583 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1583 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1584 if (vma) 1584 if (vma)
1585 goto out; 1585 goto out;
1586 1586
1587 /* 1587 /*
1588 * Determine the object being mapped and call the appropriate 1588 * Determine the object being mapped and call the appropriate
1589 * specific mapper. The address has already been validated, but 1589 * specific mapper. The address has already been validated, but
1590 * not unmapped, though the maps are removed from the list. 1590 * not unmapped, though the maps are removed from the list.
1591 */ 1591 */
1592 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 1592 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1593 if (!vma) { 1593 if (!vma) {
1594 error = -ENOMEM; 1594 error = -ENOMEM;
1595 goto unacct_error; 1595 goto unacct_error;
1596 } 1596 }
1597 1597
1598 vma->vm_mm = mm; 1598 vma->vm_mm = mm;
1599 vma->vm_start = addr; 1599 vma->vm_start = addr;
1600 vma->vm_end = addr + len; 1600 vma->vm_end = addr + len;
1601 vma->vm_flags = vm_flags; 1601 vma->vm_flags = vm_flags;
1602 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1602 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1603 vma->vm_pgoff = pgoff; 1603 vma->vm_pgoff = pgoff;
1604 INIT_LIST_HEAD(&vma->anon_vma_chain); 1604 INIT_LIST_HEAD(&vma->anon_vma_chain);
1605 1605
1606 if (file) { 1606 if (file) {
1607 if (vm_flags & VM_DENYWRITE) { 1607 if (vm_flags & VM_DENYWRITE) {
1608 error = deny_write_access(file); 1608 error = deny_write_access(file);
1609 if (error) 1609 if (error)
1610 goto free_vma; 1610 goto free_vma;
1611 } 1611 }
1612 if (vm_flags & VM_SHARED) { 1612 if (vm_flags & VM_SHARED) {
1613 error = mapping_map_writable(file->f_mapping); 1613 error = mapping_map_writable(file->f_mapping);
1614 if (error) 1614 if (error)
1615 goto allow_write_and_free_vma; 1615 goto allow_write_and_free_vma;
1616 } 1616 }
1617 1617
1618 /* ->mmap() can change vma->vm_file, but must guarantee that 1618 /* ->mmap() can change vma->vm_file, but must guarantee that
1619 * vma_link() below can deny write-access if VM_DENYWRITE is set 1619 * vma_link() below can deny write-access if VM_DENYWRITE is set
1620 * and map writably if VM_SHARED is set. This usually means the 1620 * and map writably if VM_SHARED is set. This usually means the
1621 * new file must not have been exposed to user-space, yet. 1621 * new file must not have been exposed to user-space, yet.
1622 */ 1622 */
1623 vma->vm_file = get_file(file); 1623 vma->vm_file = get_file(file);
1624 error = file->f_op->mmap(file, vma); 1624 error = file->f_op->mmap(file, vma);
1625 if (error) 1625 if (error)
1626 goto unmap_and_free_vma; 1626 goto unmap_and_free_vma;
1627 1627
1628 /* Can addr have changed?? 1628 /* Can addr have changed??
1629 * 1629 *
1630 * Answer: Yes, several device drivers can do it in their 1630 * Answer: Yes, several device drivers can do it in their
1631 * f_op->mmap method. -DaveM 1631 * f_op->mmap method. -DaveM
1632 * Bug: If addr is changed, prev, rb_link, rb_parent should 1632 * Bug: If addr is changed, prev, rb_link, rb_parent should
1633 * be updated for vma_link() 1633 * be updated for vma_link()
1634 */ 1634 */
1635 WARN_ON_ONCE(addr != vma->vm_start); 1635 WARN_ON_ONCE(addr != vma->vm_start);
1636 1636
1637 addr = vma->vm_start; 1637 addr = vma->vm_start;
1638 vm_flags = vma->vm_flags; 1638 vm_flags = vma->vm_flags;
1639 } else if (vm_flags & VM_SHARED) { 1639 } else if (vm_flags & VM_SHARED) {
1640 error = shmem_zero_setup(vma); 1640 error = shmem_zero_setup(vma);
1641 if (error) 1641 if (error)
1642 goto free_vma; 1642 goto free_vma;
1643 } 1643 }
1644 1644
1645 vma_link(mm, vma, prev, rb_link, rb_parent); 1645 vma_link(mm, vma, prev, rb_link, rb_parent);
1646 /* Once vma denies write, undo our temporary denial count */ 1646 /* Once vma denies write, undo our temporary denial count */
1647 if (file) { 1647 if (file) {
1648 if (vm_flags & VM_SHARED) 1648 if (vm_flags & VM_SHARED)
1649 mapping_unmap_writable(file->f_mapping); 1649 mapping_unmap_writable(file->f_mapping);
1650 if (vm_flags & VM_DENYWRITE) 1650 if (vm_flags & VM_DENYWRITE)
1651 allow_write_access(file); 1651 allow_write_access(file);
1652 } 1652 }
1653 file = vma->vm_file; 1653 file = vma->vm_file;
1654 out: 1654 out:
1655 perf_event_mmap(vma); 1655 perf_event_mmap(vma);
1656 1656
1657 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1657 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1658 if (vm_flags & VM_LOCKED) { 1658 if (vm_flags & VM_LOCKED) {
1659 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || 1659 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1660 vma == get_gate_vma(current->mm))) 1660 vma == get_gate_vma(current->mm)))
1661 mm->locked_vm += (len >> PAGE_SHIFT); 1661 mm->locked_vm += (len >> PAGE_SHIFT);
1662 else 1662 else
1663 vma->vm_flags &= ~VM_LOCKED; 1663 vma->vm_flags &= ~VM_LOCKED;
1664 } 1664 }
1665 1665
1666 if (file) 1666 if (file)
1667 uprobe_mmap(vma); 1667 uprobe_mmap(vma);
1668 1668
1669 /* 1669 /*
1670 * A new (or expanded) vma always gets soft-dirty status. 1670 * A new (or expanded) vma always gets soft-dirty status.
1671 * Otherwise the user-space soft-dirty page tracker won't 1671 * Otherwise the user-space soft-dirty page tracker won't
1672 * be able to distinguish the case where a vma area is unmapped 1672 * be able to distinguish the case where a vma area is unmapped
1673 * and then mapped again in place (which must be treated as 1673 * and then mapped again in place (which must be treated as
1674 * a completely new data area). 1674 * a completely new data area).
1675 */ 1675 */
1676 vma->vm_flags |= VM_SOFTDIRTY; 1676 vma->vm_flags |= VM_SOFTDIRTY;
1677 1677
1678 vma_set_page_prot(vma); 1678 vma_set_page_prot(vma);
1679 1679
1680 return addr; 1680 return addr;
1681 1681
1682 unmap_and_free_vma: 1682 unmap_and_free_vma:
1683 vma->vm_file = NULL; 1683 vma->vm_file = NULL;
1684 fput(file); 1684 fput(file);
1685 1685
1686 /* Undo any partial mapping done by a device driver. */ 1686 /* Undo any partial mapping done by a device driver. */
1687 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); 1687 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1688 charged = 0; 1688 charged = 0;
1689 if (vm_flags & VM_SHARED) 1689 if (vm_flags & VM_SHARED)
1690 mapping_unmap_writable(file->f_mapping); 1690 mapping_unmap_writable(file->f_mapping);
1691 allow_write_and_free_vma: 1691 allow_write_and_free_vma:
1692 if (vm_flags & VM_DENYWRITE) 1692 if (vm_flags & VM_DENYWRITE)
1693 allow_write_access(file); 1693 allow_write_access(file);
1694 free_vma: 1694 free_vma:
1695 kmem_cache_free(vm_area_cachep, vma); 1695 kmem_cache_free(vm_area_cachep, vma);
1696 unacct_error: 1696 unacct_error:
1697 if (charged) 1697 if (charged)
1698 vm_unacct_memory(charged); 1698 vm_unacct_memory(charged);
1699 return error; 1699 return error;
1700 } 1700 }
1701 1701
1702 unsigned long unmapped_area(struct vm_unmapped_area_info *info) 1702 unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1703 { 1703 {
1704 /* 1704 /*
1705 * We implement the search by looking for an rbtree node that 1705 * We implement the search by looking for an rbtree node that
1706 * immediately follows a suitable gap. That is, 1706 * immediately follows a suitable gap. That is,
1707 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; 1707 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1708 * - gap_end = vma->vm_start >= info->low_limit + length; 1708 * - gap_end = vma->vm_start >= info->low_limit + length;
1709 * - gap_end - gap_start >= length 1709 * - gap_end - gap_start >= length
1710 */ 1710 */
1711 1711
1712 struct mm_struct *mm = current->mm; 1712 struct mm_struct *mm = current->mm;
1713 struct vm_area_struct *vma; 1713 struct vm_area_struct *vma;
1714 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1714 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1715 1715
1716 /* Adjust search length to account for worst case alignment overhead */ 1716 /* Adjust search length to account for worst case alignment overhead */
1717 length = info->length + info->align_mask; 1717 length = info->length + info->align_mask;
1718 if (length < info->length) 1718 if (length < info->length)
1719 return -ENOMEM; 1719 return -ENOMEM;
1720 1720
1721 /* Adjust search limits by the desired length */ 1721 /* Adjust search limits by the desired length */
1722 if (info->high_limit < length) 1722 if (info->high_limit < length)
1723 return -ENOMEM; 1723 return -ENOMEM;
1724 high_limit = info->high_limit - length; 1724 high_limit = info->high_limit - length;
1725 1725
1726 if (info->low_limit > high_limit) 1726 if (info->low_limit > high_limit)
1727 return -ENOMEM; 1727 return -ENOMEM;
1728 low_limit = info->low_limit + length; 1728 low_limit = info->low_limit + length;
1729 1729
1730 /* Check if rbtree root looks promising */ 1730 /* Check if rbtree root looks promising */
1731 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1731 if (RB_EMPTY_ROOT(&mm->mm_rb))
1732 goto check_highest; 1732 goto check_highest;
1733 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1733 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1734 if (vma->rb_subtree_gap < length) 1734 if (vma->rb_subtree_gap < length)
1735 goto check_highest; 1735 goto check_highest;
1736 1736
1737 while (true) { 1737 while (true) {
1738 /* Visit left subtree if it looks promising */ 1738 /* Visit left subtree if it looks promising */
1739 gap_end = vma->vm_start; 1739 gap_end = vma->vm_start;
1740 if (gap_end >= low_limit && vma->vm_rb.rb_left) { 1740 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1741 struct vm_area_struct *left = 1741 struct vm_area_struct *left =
1742 rb_entry(vma->vm_rb.rb_left, 1742 rb_entry(vma->vm_rb.rb_left,
1743 struct vm_area_struct, vm_rb); 1743 struct vm_area_struct, vm_rb);
1744 if (left->rb_subtree_gap >= length) { 1744 if (left->rb_subtree_gap >= length) {
1745 vma = left; 1745 vma = left;
1746 continue; 1746 continue;
1747 } 1747 }
1748 } 1748 }
1749 1749
1750 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1750 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1751 check_current: 1751 check_current:
1752 /* Check if current node has a suitable gap */ 1752 /* Check if current node has a suitable gap */
1753 if (gap_start > high_limit) 1753 if (gap_start > high_limit)
1754 return -ENOMEM; 1754 return -ENOMEM;
1755 if (gap_end >= low_limit && gap_end - gap_start >= length) 1755 if (gap_end >= low_limit && gap_end - gap_start >= length)
1756 goto found; 1756 goto found;
1757 1757
1758 /* Visit right subtree if it looks promising */ 1758 /* Visit right subtree if it looks promising */
1759 if (vma->vm_rb.rb_right) { 1759 if (vma->vm_rb.rb_right) {
1760 struct vm_area_struct *right = 1760 struct vm_area_struct *right =
1761 rb_entry(vma->vm_rb.rb_right, 1761 rb_entry(vma->vm_rb.rb_right,
1762 struct vm_area_struct, vm_rb); 1762 struct vm_area_struct, vm_rb);
1763 if (right->rb_subtree_gap >= length) { 1763 if (right->rb_subtree_gap >= length) {
1764 vma = right; 1764 vma = right;
1765 continue; 1765 continue;
1766 } 1766 }
1767 } 1767 }
1768 1768
1769 /* Go back up the rbtree to find next candidate node */ 1769 /* Go back up the rbtree to find next candidate node */
1770 while (true) { 1770 while (true) {
1771 struct rb_node *prev = &vma->vm_rb; 1771 struct rb_node *prev = &vma->vm_rb;
1772 if (!rb_parent(prev)) 1772 if (!rb_parent(prev))
1773 goto check_highest; 1773 goto check_highest;
1774 vma = rb_entry(rb_parent(prev), 1774 vma = rb_entry(rb_parent(prev),
1775 struct vm_area_struct, vm_rb); 1775 struct vm_area_struct, vm_rb);
1776 if (prev == vma->vm_rb.rb_left) { 1776 if (prev == vma->vm_rb.rb_left) {
1777 gap_start = vma->vm_prev->vm_end; 1777 gap_start = vma->vm_prev->vm_end;
1778 gap_end = vma->vm_start; 1778 gap_end = vma->vm_start;
1779 goto check_current; 1779 goto check_current;
1780 } 1780 }
1781 } 1781 }
1782 } 1782 }
1783 1783
1784 check_highest: 1784 check_highest:
1785 /* Check highest gap, which does not precede any rbtree node */ 1785 /* Check highest gap, which does not precede any rbtree node */
1786 gap_start = mm->highest_vm_end; 1786 gap_start = mm->highest_vm_end;
1787 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ 1787 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1788 if (gap_start > high_limit) 1788 if (gap_start > high_limit)
1789 return -ENOMEM; 1789 return -ENOMEM;
1790 1790
1791 found: 1791 found:
1792 /* We found a suitable gap. Clip it with the original low_limit. */ 1792 /* We found a suitable gap. Clip it with the original low_limit. */
1793 if (gap_start < info->low_limit) 1793 if (gap_start < info->low_limit)
1794 gap_start = info->low_limit; 1794 gap_start = info->low_limit;
1795 1795
1796 /* Adjust gap address to the desired alignment */ 1796 /* Adjust gap address to the desired alignment */
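	/*
	 * With align_mask == alignment - 1, this rounds gap_start up to
	 * the lowest address >= gap_start whose low bits match
	 * align_offset.
	 */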
1797 gap_start += (info->align_offset - gap_start) & info->align_mask; 1797 gap_start += (info->align_offset - gap_start) & info->align_mask;
1798 1798
1799 VM_BUG_ON(gap_start + info->length > info->high_limit); 1799 VM_BUG_ON(gap_start + info->length > info->high_limit);
1800 VM_BUG_ON(gap_start + info->length > gap_end); 1800 VM_BUG_ON(gap_start + info->length > gap_end);
1801 return gap_start; 1801 return gap_start;
1802 } 1802 }
1803 1803
1804 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) 1804 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1805 { 1805 {
1806 struct mm_struct *mm = current->mm; 1806 struct mm_struct *mm = current->mm;
1807 struct vm_area_struct *vma; 1807 struct vm_area_struct *vma;
1808 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1808 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1809 1809
1810 /* Adjust search length to account for worst case alignment overhead */ 1810 /* Adjust search length to account for worst case alignment overhead */
1811 length = info->length + info->align_mask; 1811 length = info->length + info->align_mask;
1812 if (length < info->length) 1812 if (length < info->length)
1813 return -ENOMEM; 1813 return -ENOMEM;
1814 1814
1815 /* 1815 /*
1816 * Adjust search limits by the desired length. 1816 * Adjust search limits by the desired length.
1817 * See implementation comment at top of unmapped_area(). 1817 * See implementation comment at top of unmapped_area().
1818 */ 1818 */
1819 gap_end = info->high_limit; 1819 gap_end = info->high_limit;
1820 if (gap_end < length) 1820 if (gap_end < length)
1821 return -ENOMEM; 1821 return -ENOMEM;
1822 high_limit = gap_end - length; 1822 high_limit = gap_end - length;
1823 1823
1824 if (info->low_limit > high_limit) 1824 if (info->low_limit > high_limit)
1825 return -ENOMEM; 1825 return -ENOMEM;
1826 low_limit = info->low_limit + length; 1826 low_limit = info->low_limit + length;
1827 1827
1828 /* Check highest gap, which does not precede any rbtree node */ 1828 /* Check highest gap, which does not precede any rbtree node */
1829 gap_start = mm->highest_vm_end; 1829 gap_start = mm->highest_vm_end;
1830 if (gap_start <= high_limit) 1830 if (gap_start <= high_limit)
1831 goto found_highest; 1831 goto found_highest;
1832 1832
1833 /* Check if rbtree root looks promising */ 1833 /* Check if rbtree root looks promising */
1834 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1834 if (RB_EMPTY_ROOT(&mm->mm_rb))
1835 return -ENOMEM; 1835 return -ENOMEM;
1836 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1836 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1837 if (vma->rb_subtree_gap < length) 1837 if (vma->rb_subtree_gap < length)
1838 return -ENOMEM; 1838 return -ENOMEM;
1839 1839
1840 while (true) { 1840 while (true) {
1841 /* Visit right subtree if it looks promising */ 1841 /* Visit right subtree if it looks promising */
1842 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1842 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1843 if (gap_start <= high_limit && vma->vm_rb.rb_right) { 1843 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1844 struct vm_area_struct *right = 1844 struct vm_area_struct *right =
1845 rb_entry(vma->vm_rb.rb_right, 1845 rb_entry(vma->vm_rb.rb_right,
1846 struct vm_area_struct, vm_rb); 1846 struct vm_area_struct, vm_rb);
1847 if (right->rb_subtree_gap >= length) { 1847 if (right->rb_subtree_gap >= length) {
1848 vma = right; 1848 vma = right;
1849 continue; 1849 continue;
1850 } 1850 }
1851 } 1851 }
1852 1852
1853 check_current: 1853 check_current:
1854 /* Check if current node has a suitable gap */ 1854 /* Check if current node has a suitable gap */
1855 gap_end = vma->vm_start; 1855 gap_end = vma->vm_start;
1856 if (gap_end < low_limit) 1856 if (gap_end < low_limit)
1857 return -ENOMEM; 1857 return -ENOMEM;
1858 if (gap_start <= high_limit && gap_end - gap_start >= length) 1858 if (gap_start <= high_limit && gap_end - gap_start >= length)
1859 goto found; 1859 goto found;
1860 1860
1861 /* Visit left subtree if it looks promising */ 1861 /* Visit left subtree if it looks promising */
1862 if (vma->vm_rb.rb_left) { 1862 if (vma->vm_rb.rb_left) {
1863 struct vm_area_struct *left = 1863 struct vm_area_struct *left =
1864 rb_entry(vma->vm_rb.rb_left, 1864 rb_entry(vma->vm_rb.rb_left,
1865 struct vm_area_struct, vm_rb); 1865 struct vm_area_struct, vm_rb);
1866 if (left->rb_subtree_gap >= length) { 1866 if (left->rb_subtree_gap >= length) {
1867 vma = left; 1867 vma = left;
1868 continue; 1868 continue;
1869 } 1869 }
1870 } 1870 }
1871 1871
1872 /* Go back up the rbtree to find next candidate node */ 1872 /* Go back up the rbtree to find next candidate node */
1873 while (true) { 1873 while (true) {
1874 struct rb_node *prev = &vma->vm_rb; 1874 struct rb_node *prev = &vma->vm_rb;
1875 if (!rb_parent(prev)) 1875 if (!rb_parent(prev))
1876 return -ENOMEM; 1876 return -ENOMEM;
1877 vma = rb_entry(rb_parent(prev), 1877 vma = rb_entry(rb_parent(prev),
1878 struct vm_area_struct, vm_rb); 1878 struct vm_area_struct, vm_rb);
1879 if (prev == vma->vm_rb.rb_right) { 1879 if (prev == vma->vm_rb.rb_right) {
1880 gap_start = vma->vm_prev ? 1880 gap_start = vma->vm_prev ?
1881 vma->vm_prev->vm_end : 0; 1881 vma->vm_prev->vm_end : 0;
1882 goto check_current; 1882 goto check_current;
1883 } 1883 }
1884 } 1884 }
1885 } 1885 }
1886 1886
1887 found: 1887 found:
1888 /* We found a suitable gap. Clip it with the original high_limit. */ 1888 /* We found a suitable gap. Clip it with the original high_limit. */
1889 if (gap_end > info->high_limit) 1889 if (gap_end > info->high_limit)
1890 gap_end = info->high_limit; 1890 gap_end = info->high_limit;
1891 1891
1892 found_highest: 1892 found_highest:
1893 /* Compute highest gap address at the desired alignment */ 1893 /* Compute highest gap address at the desired alignment */
1894 gap_end -= info->length; 1894 gap_end -= info->length;
1895 gap_end -= (gap_end - info->align_offset) & info->align_mask; 1895 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1896 1896
1897 VM_BUG_ON(gap_end < info->low_limit); 1897 VM_BUG_ON(gap_end < info->low_limit);
1898 VM_BUG_ON(gap_end < gap_start); 1898 VM_BUG_ON(gap_end < gap_start);
1899 return gap_end; 1899 return gap_end;
1900 } 1900 }
1901 1901
1902 /* Get an address range which is currently unmapped. 1902 /* Get an address range which is currently unmapped.
1903 * For shmat() with addr=0. 1903 * For shmat() with addr=0.
1904 * 1904 *
1905 * Ugly calling convention alert: 1905 * Ugly calling convention alert:
1906 * Return value with the low bits set means error value, 1906 * Return value with the low bits set means error value,
1907 * ie 1907 * ie
1908 * if (ret & ~PAGE_MASK) 1908 * if (ret & ~PAGE_MASK)
1909 * error = ret; 1909 * error = ret;
1910 * 1910 *
1911 * This function "knows" that -ENOMEM has the bits set. 1911 * This function "knows" that -ENOMEM has the bits set.
1912 */ 1912 */
1913 #ifndef HAVE_ARCH_UNMAPPED_AREA 1913 #ifndef HAVE_ARCH_UNMAPPED_AREA
1914 unsigned long 1914 unsigned long
1915 arch_get_unmapped_area(struct file *filp, unsigned long addr, 1915 arch_get_unmapped_area(struct file *filp, unsigned long addr,
1916 unsigned long len, unsigned long pgoff, unsigned long flags) 1916 unsigned long len, unsigned long pgoff, unsigned long flags)
1917 { 1917 {
1918 struct mm_struct *mm = current->mm; 1918 struct mm_struct *mm = current->mm;
1919 struct vm_area_struct *vma; 1919 struct vm_area_struct *vma;
1920 struct vm_unmapped_area_info info; 1920 struct vm_unmapped_area_info info;
1921 1921
1922 if (len > TASK_SIZE - mmap_min_addr) 1922 if (len > TASK_SIZE - mmap_min_addr)
1923 return -ENOMEM; 1923 return -ENOMEM;
1924 1924
1925 if (flags & MAP_FIXED) 1925 if (flags & MAP_FIXED)
1926 return addr; 1926 return addr;
1927 1927
1928 if (addr) { 1928 if (addr) {
1929 addr = PAGE_ALIGN(addr); 1929 addr = PAGE_ALIGN(addr);
1930 vma = find_vma(mm, addr); 1930 vma = find_vma(mm, addr);
1931 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 1931 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1932 (!vma || addr + len <= vma->vm_start)) 1932 (!vma || addr + len <= vma->vm_start))
1933 return addr; 1933 return addr;
1934 } 1934 }
1935 1935
1936 info.flags = 0; 1936 info.flags = 0;
1937 info.length = len; 1937 info.length = len;
1938 info.low_limit = mm->mmap_base; 1938 info.low_limit = mm->mmap_base;
1939 info.high_limit = TASK_SIZE; 1939 info.high_limit = TASK_SIZE;
1940 info.align_mask = 0; 1940 info.align_mask = 0;
1941 return vm_unmapped_area(&info); 1941 return vm_unmapped_area(&info);
1942 } 1942 }
1943 #endif 1943 #endif
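
The "ugly calling convention" documented above is what the later callers in this file rely on. A minimal caller-side sketch (illustrative only; pick_address() is a made-up helper and the caller is assumed to hold mmap_sem):

/* Illustrative sketch, not from mm/mmap.c: a page-aligned return value
 * is an address, a value with low bits set is a negative errno. */
#include <linux/mm.h>

static int pick_address(struct file *filp, unsigned long len,
			unsigned long *out)
{
	unsigned long addr = get_unmapped_area(filp, 0, len, 0, 0);

	if (addr & ~PAGE_MASK)		/* low bits set: error value */
		return (int)addr;	/* e.g. -ENOMEM */
	*out = addr;			/* usable page-aligned address */
	return 0;
}
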
1944 1944
1945 /* 1945 /*
1946 * This mmap-allocator allocates new areas top-down from below the 1946 * This mmap-allocator allocates new areas top-down from below the
1947 * stack's low limit (the base): 1947 * stack's low limit (the base):
1948 */ 1948 */
1949 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 1949 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1950 unsigned long 1950 unsigned long
1951 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 1951 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1952 const unsigned long len, const unsigned long pgoff, 1952 const unsigned long len, const unsigned long pgoff,
1953 const unsigned long flags) 1953 const unsigned long flags)
1954 { 1954 {
1955 struct vm_area_struct *vma; 1955 struct vm_area_struct *vma;
1956 struct mm_struct *mm = current->mm; 1956 struct mm_struct *mm = current->mm;
1957 unsigned long addr = addr0; 1957 unsigned long addr = addr0;
1958 struct vm_unmapped_area_info info; 1958 struct vm_unmapped_area_info info;
1959 1959
1960 /* requested length too big for entire address space */ 1960 /* requested length too big for entire address space */
1961 if (len > TASK_SIZE - mmap_min_addr) 1961 if (len > TASK_SIZE - mmap_min_addr)
1962 return -ENOMEM; 1962 return -ENOMEM;
1963 1963
1964 if (flags & MAP_FIXED) 1964 if (flags & MAP_FIXED)
1965 return addr; 1965 return addr;
1966 1966
1967 /* requesting a specific address */ 1967 /* requesting a specific address */
1968 if (addr) { 1968 if (addr) {
1969 addr = PAGE_ALIGN(addr); 1969 addr = PAGE_ALIGN(addr);
1970 vma = find_vma(mm, addr); 1970 vma = find_vma(mm, addr);
1971 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 1971 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1972 (!vma || addr + len <= vma->vm_start)) 1972 (!vma || addr + len <= vma->vm_start))
1973 return addr; 1973 return addr;
1974 } 1974 }
1975 1975
1976 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 1976 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1977 info.length = len; 1977 info.length = len;
1978 info.low_limit = max(PAGE_SIZE, mmap_min_addr); 1978 info.low_limit = max(PAGE_SIZE, mmap_min_addr);
1979 info.high_limit = mm->mmap_base; 1979 info.high_limit = mm->mmap_base;
1980 info.align_mask = 0; 1980 info.align_mask = 0;
1981 addr = vm_unmapped_area(&info); 1981 addr = vm_unmapped_area(&info);
1982 1982
1983 /* 1983 /*
1984 * A failed mmap() very likely causes application failure, 1984 * A failed mmap() very likely causes application failure,
1985 * so fall back to the bottom-up function here. This scenario 1985 * so fall back to the bottom-up function here. This scenario
1986 * can happen with large stack limits and large mmap() 1986 * can happen with large stack limits and large mmap()
1987 * allocations. 1987 * allocations.
1988 */ 1988 */
1989 if (addr & ~PAGE_MASK) { 1989 if (addr & ~PAGE_MASK) {
1990 VM_BUG_ON(addr != -ENOMEM); 1990 VM_BUG_ON(addr != -ENOMEM);
1991 info.flags = 0; 1991 info.flags = 0;
1992 info.low_limit = TASK_UNMAPPED_BASE; 1992 info.low_limit = TASK_UNMAPPED_BASE;
1993 info.high_limit = TASK_SIZE; 1993 info.high_limit = TASK_SIZE;
1994 addr = vm_unmapped_area(&info); 1994 addr = vm_unmapped_area(&info);
1995 } 1995 }
1996 1996
1997 return addr; 1997 return addr;
1998 } 1998 }
1999 #endif 1999 #endif
2000 2000
2001 unsigned long 2001 unsigned long
2002 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 2002 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
2003 unsigned long pgoff, unsigned long flags) 2003 unsigned long pgoff, unsigned long flags)
2004 { 2004 {
2005 unsigned long (*get_area)(struct file *, unsigned long, 2005 unsigned long (*get_area)(struct file *, unsigned long,
2006 unsigned long, unsigned long, unsigned long); 2006 unsigned long, unsigned long, unsigned long);
2007 2007
2008 unsigned long error = arch_mmap_check(addr, len, flags); 2008 unsigned long error = arch_mmap_check(addr, len, flags);
2009 if (error) 2009 if (error)
2010 return error; 2010 return error;
2011 2011
2012 /* Careful about overflows.. */ 2012 /* Careful about overflows.. */
2013 if (len > TASK_SIZE) 2013 if (len > TASK_SIZE)
2014 return -ENOMEM; 2014 return -ENOMEM;
2015 2015
2016 get_area = current->mm->get_unmapped_area; 2016 get_area = current->mm->get_unmapped_area;
2017 if (file && file->f_op->get_unmapped_area) 2017 if (file && file->f_op->get_unmapped_area)
2018 get_area = file->f_op->get_unmapped_area; 2018 get_area = file->f_op->get_unmapped_area;
2019 addr = get_area(file, addr, len, pgoff, flags); 2019 addr = get_area(file, addr, len, pgoff, flags);
2020 if (IS_ERR_VALUE(addr)) 2020 if (IS_ERR_VALUE(addr))
2021 return addr; 2021 return addr;
2022 2022
2023 if (addr > TASK_SIZE - len) 2023 if (addr > TASK_SIZE - len)
2024 return -ENOMEM; 2024 return -ENOMEM;
2025 if (addr & ~PAGE_MASK) 2025 if (addr & ~PAGE_MASK)
2026 return -EINVAL; 2026 return -EINVAL;
2027 2027
2028 addr = arch_rebalance_pgtables(addr, len); 2028 addr = arch_rebalance_pgtables(addr, len);
2029 error = security_mmap_addr(addr); 2029 error = security_mmap_addr(addr);
2030 return error ? error : addr; 2030 return error ? error : addr;
2031 } 2031 }
2032 2032
2033 EXPORT_SYMBOL(get_unmapped_area); 2033 EXPORT_SYMBOL(get_unmapped_area);
2034 2034
2035 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 2035 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
2036 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 2036 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2037 { 2037 {
2038 struct rb_node *rb_node; 2038 struct rb_node *rb_node;
2039 struct vm_area_struct *vma; 2039 struct vm_area_struct *vma;
2040 2040
2041 /* Check the cache first. */ 2041 /* Check the cache first. */
2042 vma = vmacache_find(mm, addr); 2042 vma = vmacache_find(mm, addr);
2043 if (likely(vma)) 2043 if (likely(vma))
2044 return vma; 2044 return vma;
2045 2045
2046 rb_node = mm->mm_rb.rb_node; 2046 rb_node = mm->mm_rb.rb_node;
2047 vma = NULL; 2047 vma = NULL;
2048 2048
2049 while (rb_node) { 2049 while (rb_node) {
2050 struct vm_area_struct *tmp; 2050 struct vm_area_struct *tmp;
2051 2051
2052 tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2052 tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2053 2053
2054 if (tmp->vm_end > addr) { 2054 if (tmp->vm_end > addr) {
2055 vma = tmp; 2055 vma = tmp;
2056 if (tmp->vm_start <= addr) 2056 if (tmp->vm_start <= addr)
2057 break; 2057 break;
2058 rb_node = rb_node->rb_left; 2058 rb_node = rb_node->rb_left;
2059 } else 2059 } else
2060 rb_node = rb_node->rb_right; 2060 rb_node = rb_node->rb_right;
2061 } 2061 }
2062 2062
2063 if (vma) 2063 if (vma)
2064 vmacache_update(addr, vma); 2064 vmacache_update(addr, vma);
2065 return vma; 2065 return vma;
2066 } 2066 }
2067 2067
2068 EXPORT_SYMBOL(find_vma); 2068 EXPORT_SYMBOL(find_vma);
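
As the comment above says, find_vma() returns the first vma satisfying addr < vm_end, which may lie entirely above addr. A short illustrative sketch (not code from this file; addr_is_mapped() is a made-up helper, and the caller is assumed to hold mmap_sem) of the containment test that implies:

/* Illustrative sketch: the returned vma may start above addr, so a
 * containment check still needs the vm_start comparison. */
#include <linux/mm.h>

static bool addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = find_vma(mm, addr);	/* mmap_sem held */

	return vma && vma->vm_start <= addr;
}
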
2069 2069
2070 /* 2070 /*
2071 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 2071 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
2072 */ 2072 */
2073 struct vm_area_struct * 2073 struct vm_area_struct *
2074 find_vma_prev(struct mm_struct *mm, unsigned long addr, 2074 find_vma_prev(struct mm_struct *mm, unsigned long addr,
2075 struct vm_area_struct **pprev) 2075 struct vm_area_struct **pprev)
2076 { 2076 {
2077 struct vm_area_struct *vma; 2077 struct vm_area_struct *vma;
2078 2078
2079 vma = find_vma(mm, addr); 2079 vma = find_vma(mm, addr);
2080 if (vma) { 2080 if (vma) {
2081 *pprev = vma->vm_prev; 2081 *pprev = vma->vm_prev;
2082 } else { 2082 } else {
2083 struct rb_node *rb_node = mm->mm_rb.rb_node; 2083 struct rb_node *rb_node = mm->mm_rb.rb_node;
2084 *pprev = NULL; 2084 *pprev = NULL;
2085 while (rb_node) { 2085 while (rb_node) {
2086 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2086 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2087 rb_node = rb_node->rb_right; 2087 rb_node = rb_node->rb_right;
2088 } 2088 }
2089 } 2089 }
2090 return vma; 2090 return vma;
2091 } 2091 }
2092 2092
2093 /* 2093 /*
2094 * Verify that the stack growth is acceptable and 2094 * Verify that the stack growth is acceptable and
2095 * update accounting. This is shared with both the 2095 * update accounting. This is shared with both the
2096 * grow-up and grow-down cases. 2096 * grow-up and grow-down cases.
2097 */ 2097 */
2098 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) 2098 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
2099 { 2099 {
2100 struct mm_struct *mm = vma->vm_mm; 2100 struct mm_struct *mm = vma->vm_mm;
2101 struct rlimit *rlim = current->signal->rlim; 2101 struct rlimit *rlim = current->signal->rlim;
2102 unsigned long new_start; 2102 unsigned long new_start, actual_size;
2103 2103
2104 /* address space limit tests */ 2104 /* address space limit tests */
2105 if (!may_expand_vm(mm, grow)) 2105 if (!may_expand_vm(mm, grow))
2106 return -ENOMEM; 2106 return -ENOMEM;
2107 2107
2108 /* Stack limit test */ 2108 /* Stack limit test */
2109 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2109 actual_size = size;
2110 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
2111 actual_size -= PAGE_SIZE;
2112 if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2110 return -ENOMEM; 2113 return -ENOMEM;
2111 2114
2112 /* mlock limit tests */ 2115 /* mlock limit tests */
2113 if (vma->vm_flags & VM_LOCKED) { 2116 if (vma->vm_flags & VM_LOCKED) {
2114 unsigned long locked; 2117 unsigned long locked;
2115 unsigned long limit; 2118 unsigned long limit;
2116 locked = mm->locked_vm + grow; 2119 locked = mm->locked_vm + grow;
2117 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2120 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2118 limit >>= PAGE_SHIFT; 2121 limit >>= PAGE_SHIFT;
2119 if (locked > limit && !capable(CAP_IPC_LOCK)) 2122 if (locked > limit && !capable(CAP_IPC_LOCK))
2120 return -ENOMEM; 2123 return -ENOMEM;
2121 } 2124 }
2122 2125
2123 /* Check to ensure the stack will not grow into a hugetlb-only region */ 2126 /* Check to ensure the stack will not grow into a hugetlb-only region */
2124 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : 2127 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2125 vma->vm_end - size; 2128 vma->vm_end - size;
2126 if (is_hugepage_only_range(vma->vm_mm, new_start, size)) 2129 if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2127 return -EFAULT; 2130 return -EFAULT;
2128 2131
2129 /* 2132 /*
2130 * Overcommit.. This must be the final test, as it will 2133 * Overcommit.. This must be the final test, as it will
2131 * update security statistics. 2134 * update security statistics.
2132 */ 2135 */
2133 if (security_vm_enough_memory_mm(mm, grow)) 2136 if (security_vm_enough_memory_mm(mm, grow))
2134 return -ENOMEM; 2137 return -ENOMEM;
2135 2138
2136 /* Ok, everything looks good - let it rip */ 2139 /* Ok, everything looks good - let it rip */
2137 if (vma->vm_flags & VM_LOCKED) 2140 if (vma->vm_flags & VM_LOCKED)
2138 mm->locked_vm += grow; 2141 mm->locked_vm += grow;
2139 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 2142 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
2140 return 0; 2143 return 0;
2141 } 2144 }
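
The hunk above carries the functional change in this diff: for a vma with VM_GROWSUP or VM_GROWSDOWN set, actual_size drops one page before being compared against RLIMIT_STACK. A hypothetical userspace sketch (not part of this commit; the 1 MiB limit, the 64 KiB frames and the recursion depth are arbitrary assumptions) that drives this check by growing the stack under a deliberately tight limit:

/* Hypothetical demo: each fault that has to extend the stack vma on a
 * grows-down architecture reaches acct_stack_growth() above; once the
 * test against rlim_cur fails, expansion stops and the process is
 * likely killed with SIGSEGV. */
#include <stdio.h>
#include <string.h>
#include <sys/resource.h>

static long grow(int depth)
{
	char pad[64 * 1024];			/* force a large stack frame */

	memset(pad, 1, sizeof(pad));		/* touch every page of it */
	if (depth <= 0)
		return pad[0];
	return grow(depth - 1) + pad[sizeof(pad) - 1];	/* defeats tail-call opt */
}

int main(void)
{
	struct rlimit rl = { .rlim_cur = 1 << 20, .rlim_max = 1 << 20 };

	if (setrlimit(RLIMIT_STACK, &rl))	/* tighten the stack limit */
		perror("setrlimit");
	printf("%ld\n", grow(32));		/* ~2 MiB of frames: likely SIGSEGV */
	return 0;
}
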
2142 2145
2143 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) 2146 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2144 /* 2147 /*
2145 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 2148 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2146 * vma is the last one with address > vma->vm_end. Have to extend vma. 2149 * vma is the last one with address > vma->vm_end. Have to extend vma.
2147 */ 2150 */
2148 int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2151 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2149 { 2152 {
2150 int error; 2153 int error;
2151 2154
2152 if (!(vma->vm_flags & VM_GROWSUP)) 2155 if (!(vma->vm_flags & VM_GROWSUP))
2153 return -EFAULT; 2156 return -EFAULT;
2154 2157
2155 /* 2158 /*
2156 * We must make sure the anon_vma is allocated 2159 * We must make sure the anon_vma is allocated
2157 * so that the anon_vma locking is not a noop. 2160 * so that the anon_vma locking is not a noop.
2158 */ 2161 */
2159 if (unlikely(anon_vma_prepare(vma))) 2162 if (unlikely(anon_vma_prepare(vma)))
2160 return -ENOMEM; 2163 return -ENOMEM;
2161 vma_lock_anon_vma(vma); 2164 vma_lock_anon_vma(vma);
2162 2165
2163 /* 2166 /*
2164 * vma->vm_start/vm_end cannot change under us because the caller 2167 * vma->vm_start/vm_end cannot change under us because the caller
2165 * is required to hold the mmap_sem in read mode. We need the 2168 * is required to hold the mmap_sem in read mode. We need the
2166 * anon_vma lock to serialize against concurrent expand_stacks. 2169 * anon_vma lock to serialize against concurrent expand_stacks.
2167 * Also guard against wrapping around to address 0. 2170 * Also guard against wrapping around to address 0.
2168 */ 2171 */
2169 if (address < PAGE_ALIGN(address+4)) 2172 if (address < PAGE_ALIGN(address+4))
2170 address = PAGE_ALIGN(address+4); 2173 address = PAGE_ALIGN(address+4);
2171 else { 2174 else {
2172 vma_unlock_anon_vma(vma); 2175 vma_unlock_anon_vma(vma);
2173 return -ENOMEM; 2176 return -ENOMEM;
2174 } 2177 }
2175 error = 0; 2178 error = 0;
2176 2179
2177 /* Somebody else might have raced and expanded it already */ 2180 /* Somebody else might have raced and expanded it already */
2178 if (address > vma->vm_end) { 2181 if (address > vma->vm_end) {
2179 unsigned long size, grow; 2182 unsigned long size, grow;
2180 2183
2181 size = address - vma->vm_start; 2184 size = address - vma->vm_start;
2182 grow = (address - vma->vm_end) >> PAGE_SHIFT; 2185 grow = (address - vma->vm_end) >> PAGE_SHIFT;
2183 2186
2184 error = -ENOMEM; 2187 error = -ENOMEM;
2185 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2188 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2186 error = acct_stack_growth(vma, size, grow); 2189 error = acct_stack_growth(vma, size, grow);
2187 if (!error) { 2190 if (!error) {
2188 /* 2191 /*
2189 * vma_gap_update() doesn't support concurrent 2192 * vma_gap_update() doesn't support concurrent
2190 * updates, but we only hold a shared mmap_sem 2193 * updates, but we only hold a shared mmap_sem
2191 * lock here, so we need to protect against 2194 * lock here, so we need to protect against
2192 * concurrent vma expansions. 2195 * concurrent vma expansions.
2193 * vma_lock_anon_vma() doesn't help here, as 2196 * vma_lock_anon_vma() doesn't help here, as
2194 * we don't guarantee that all growable vmas 2197 * we don't guarantee that all growable vmas
2195 * in a mm share the same root anon vma. 2198 * in a mm share the same root anon vma.
2196 * So, we reuse mm->page_table_lock to guard 2199 * So, we reuse mm->page_table_lock to guard
2197 * against concurrent vma expansions. 2200 * against concurrent vma expansions.
2198 */ 2201 */
2199 spin_lock(&vma->vm_mm->page_table_lock); 2202 spin_lock(&vma->vm_mm->page_table_lock);
2200 anon_vma_interval_tree_pre_update_vma(vma); 2203 anon_vma_interval_tree_pre_update_vma(vma);
2201 vma->vm_end = address; 2204 vma->vm_end = address;
2202 anon_vma_interval_tree_post_update_vma(vma); 2205 anon_vma_interval_tree_post_update_vma(vma);
2203 if (vma->vm_next) 2206 if (vma->vm_next)
2204 vma_gap_update(vma->vm_next); 2207 vma_gap_update(vma->vm_next);
2205 else 2208 else
2206 vma->vm_mm->highest_vm_end = address; 2209 vma->vm_mm->highest_vm_end = address;
2207 spin_unlock(&vma->vm_mm->page_table_lock); 2210 spin_unlock(&vma->vm_mm->page_table_lock);
2208 2211
2209 perf_event_mmap(vma); 2212 perf_event_mmap(vma);
2210 } 2213 }
2211 } 2214 }
2212 } 2215 }
2213 vma_unlock_anon_vma(vma); 2216 vma_unlock_anon_vma(vma);
2214 khugepaged_enter_vma_merge(vma, vma->vm_flags); 2217 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2215 validate_mm(vma->vm_mm); 2218 validate_mm(vma->vm_mm);
2216 return error; 2219 return error;
2217 } 2220 }
2218 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 2221 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2219 2222
2220 /* 2223 /*
2221 * vma is the first one with address < vma->vm_start. Have to extend vma. 2224 * vma is the first one with address < vma->vm_start. Have to extend vma.
2222 */ 2225 */
2223 int expand_downwards(struct vm_area_struct *vma, 2226 int expand_downwards(struct vm_area_struct *vma,
2224 unsigned long address) 2227 unsigned long address)
2225 { 2228 {
2226 int error; 2229 int error;
2227 2230
2228 /* 2231 /*
2229 * We must make sure the anon_vma is allocated 2232 * We must make sure the anon_vma is allocated
2230 * so that the anon_vma locking is not a noop. 2233 * so that the anon_vma locking is not a noop.
2231 */ 2234 */
2232 if (unlikely(anon_vma_prepare(vma))) 2235 if (unlikely(anon_vma_prepare(vma)))
2233 return -ENOMEM; 2236 return -ENOMEM;
2234 2237
2235 address &= PAGE_MASK; 2238 address &= PAGE_MASK;
2236 error = security_mmap_addr(address); 2239 error = security_mmap_addr(address);
2237 if (error) 2240 if (error)
2238 return error; 2241 return error;
2239 2242
2240 vma_lock_anon_vma(vma); 2243 vma_lock_anon_vma(vma);
2241 2244
2242 /* 2245 /*
2243 * vma->vm_start/vm_end cannot change under us because the caller 2246 * vma->vm_start/vm_end cannot change under us because the caller
2244 * is required to hold the mmap_sem in read mode. We need the 2247 * is required to hold the mmap_sem in read mode. We need the
2245 * anon_vma lock to serialize against concurrent expand_stacks. 2248 * anon_vma lock to serialize against concurrent expand_stacks.
2246 */ 2249 */
2247 2250
2248 /* Somebody else might have raced and expanded it already */ 2251 /* Somebody else might have raced and expanded it already */
2249 if (address < vma->vm_start) { 2252 if (address < vma->vm_start) {
2250 unsigned long size, grow; 2253 unsigned long size, grow;
2251 2254
2252 size = vma->vm_end - address; 2255 size = vma->vm_end - address;
2253 grow = (vma->vm_start - address) >> PAGE_SHIFT; 2256 grow = (vma->vm_start - address) >> PAGE_SHIFT;
2254 2257
2255 error = -ENOMEM; 2258 error = -ENOMEM;
2256 if (grow <= vma->vm_pgoff) { 2259 if (grow <= vma->vm_pgoff) {
2257 error = acct_stack_growth(vma, size, grow); 2260 error = acct_stack_growth(vma, size, grow);
2258 if (!error) { 2261 if (!error) {
2259 /* 2262 /*
2260 * vma_gap_update() doesn't support concurrent 2263 * vma_gap_update() doesn't support concurrent
2261 * updates, but we only hold a shared mmap_sem 2264 * updates, but we only hold a shared mmap_sem
2262 * lock here, so we need to protect against 2265 * lock here, so we need to protect against
2263 * concurrent vma expansions. 2266 * concurrent vma expansions.
2264 * vma_lock_anon_vma() doesn't help here, as 2267 * vma_lock_anon_vma() doesn't help here, as
2265 * we don't guarantee that all growable vmas 2268 * we don't guarantee that all growable vmas
2266 * in a mm share the same root anon vma. 2269 * in a mm share the same root anon vma.
2267 * So, we reuse mm->page_table_lock to guard 2270 * So, we reuse mm->page_table_lock to guard
2268 * against concurrent vma expansions. 2271 * against concurrent vma expansions.
2269 */ 2272 */
2270 spin_lock(&vma->vm_mm->page_table_lock); 2273 spin_lock(&vma->vm_mm->page_table_lock);
2271 anon_vma_interval_tree_pre_update_vma(vma); 2274 anon_vma_interval_tree_pre_update_vma(vma);
2272 vma->vm_start = address; 2275 vma->vm_start = address;
2273 vma->vm_pgoff -= grow; 2276 vma->vm_pgoff -= grow;
2274 anon_vma_interval_tree_post_update_vma(vma); 2277 anon_vma_interval_tree_post_update_vma(vma);
2275 vma_gap_update(vma); 2278 vma_gap_update(vma);
2276 spin_unlock(&vma->vm_mm->page_table_lock); 2279 spin_unlock(&vma->vm_mm->page_table_lock);
2277 2280
2278 perf_event_mmap(vma); 2281 perf_event_mmap(vma);
2279 } 2282 }
2280 } 2283 }
2281 } 2284 }
2282 vma_unlock_anon_vma(vma); 2285 vma_unlock_anon_vma(vma);
2283 khugepaged_enter_vma_merge(vma, vma->vm_flags); 2286 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2284 validate_mm(vma->vm_mm); 2287 validate_mm(vma->vm_mm);
2285 return error; 2288 return error;
2286 } 2289 }
2287 2290
2288 /* 2291 /*
2289 * Note how expand_stack() refuses to expand the stack all the way to 2292 * Note how expand_stack() refuses to expand the stack all the way to
2290 * abut the next virtual mapping, *unless* that mapping itself is also 2293 * abut the next virtual mapping, *unless* that mapping itself is also
2291 * a stack mapping. We want to leave room for a guard page, after all 2294 * a stack mapping. We want to leave room for a guard page, after all
2292 * (the guard page itself is not added here, that is done by the 2295 * (the guard page itself is not added here, that is done by the
2293 * actual page faulting logic) 2296 * actual page faulting logic)
2294 * 2297 *
2295 * This matches the behavior of the guard page logic (see mm/memory.c: 2298 * This matches the behavior of the guard page logic (see mm/memory.c:
2296 * check_stack_guard_page()), which only allows the guard page to be 2299 * check_stack_guard_page()), which only allows the guard page to be
2297 * removed under these circumstances. 2300 * removed under these circumstances.
2298 */ 2301 */
2299 #ifdef CONFIG_STACK_GROWSUP 2302 #ifdef CONFIG_STACK_GROWSUP
2300 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2303 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2301 { 2304 {
2302 struct vm_area_struct *next; 2305 struct vm_area_struct *next;
2303 2306
2304 address &= PAGE_MASK; 2307 address &= PAGE_MASK;
2305 next = vma->vm_next; 2308 next = vma->vm_next;
2306 if (next && next->vm_start == address + PAGE_SIZE) { 2309 if (next && next->vm_start == address + PAGE_SIZE) {
2307 if (!(next->vm_flags & VM_GROWSUP)) 2310 if (!(next->vm_flags & VM_GROWSUP))
2308 return -ENOMEM; 2311 return -ENOMEM;
2309 } 2312 }
2310 return expand_upwards(vma, address); 2313 return expand_upwards(vma, address);
2311 } 2314 }
2312 2315
2313 struct vm_area_struct * 2316 struct vm_area_struct *
2314 find_extend_vma(struct mm_struct *mm, unsigned long addr) 2317 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2315 { 2318 {
2316 struct vm_area_struct *vma, *prev; 2319 struct vm_area_struct *vma, *prev;
2317 2320
2318 addr &= PAGE_MASK; 2321 addr &= PAGE_MASK;
2319 vma = find_vma_prev(mm, addr, &prev); 2322 vma = find_vma_prev(mm, addr, &prev);
2320 if (vma && (vma->vm_start <= addr)) 2323 if (vma && (vma->vm_start <= addr))
2321 return vma; 2324 return vma;
2322 if (!prev || expand_stack(prev, addr)) 2325 if (!prev || expand_stack(prev, addr))
2323 return NULL; 2326 return NULL;
2324 if (prev->vm_flags & VM_LOCKED) 2327 if (prev->vm_flags & VM_LOCKED)
2325 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2328 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2326 return prev; 2329 return prev;
2327 } 2330 }
2328 #else 2331 #else
2329 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2332 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2330 { 2333 {
2331 struct vm_area_struct *prev; 2334 struct vm_area_struct *prev;
2332 2335
2333 address &= PAGE_MASK; 2336 address &= PAGE_MASK;
2334 prev = vma->vm_prev; 2337 prev = vma->vm_prev;
2335 if (prev && prev->vm_end == address) { 2338 if (prev && prev->vm_end == address) {
2336 if (!(prev->vm_flags & VM_GROWSDOWN)) 2339 if (!(prev->vm_flags & VM_GROWSDOWN))
2337 return -ENOMEM; 2340 return -ENOMEM;
2338 } 2341 }
2339 return expand_downwards(vma, address); 2342 return expand_downwards(vma, address);
2340 } 2343 }
2341 2344
2342 struct vm_area_struct * 2345 struct vm_area_struct *
2343 find_extend_vma(struct mm_struct *mm, unsigned long addr) 2346 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2344 { 2347 {
2345 struct vm_area_struct *vma; 2348 struct vm_area_struct *vma;
2346 unsigned long start; 2349 unsigned long start;
2347 2350
2348 addr &= PAGE_MASK; 2351 addr &= PAGE_MASK;
2349 vma = find_vma(mm, addr); 2352 vma = find_vma(mm, addr);
2350 if (!vma) 2353 if (!vma)
2351 return NULL; 2354 return NULL;
2352 if (vma->vm_start <= addr) 2355 if (vma->vm_start <= addr)
2353 return vma; 2356 return vma;
2354 if (!(vma->vm_flags & VM_GROWSDOWN)) 2357 if (!(vma->vm_flags & VM_GROWSDOWN))
2355 return NULL; 2358 return NULL;
2356 start = vma->vm_start; 2359 start = vma->vm_start;
2357 if (expand_stack(vma, addr)) 2360 if (expand_stack(vma, addr))
2358 return NULL; 2361 return NULL;
2359 if (vma->vm_flags & VM_LOCKED) 2362 if (vma->vm_flags & VM_LOCKED)
2360 __mlock_vma_pages_range(vma, addr, start, NULL); 2363 __mlock_vma_pages_range(vma, addr, start, NULL);
2361 return vma; 2364 return vma;
2362 } 2365 }
2363 #endif 2366 #endif
2364 2367
2365 EXPORT_SYMBOL_GPL(find_extend_vma); 2368 EXPORT_SYMBOL_GPL(find_extend_vma);
2366 2369
2367 /* 2370 /*
2368 * Ok - we have the memory areas we should free on the vma list, 2371 * Ok - we have the memory areas we should free on the vma list,
2369 * so release them, and do the vma updates. 2372 * so release them, and do the vma updates.
2370 * 2373 *
2371 * Called with the mm semaphore held. 2374 * Called with the mm semaphore held.
2372 */ 2375 */
2373 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 2376 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2374 { 2377 {
2375 unsigned long nr_accounted = 0; 2378 unsigned long nr_accounted = 0;
2376 2379
2377 /* Update high watermark before we lower total_vm */ 2380 /* Update high watermark before we lower total_vm */
2378 update_hiwater_vm(mm); 2381 update_hiwater_vm(mm);
2379 do { 2382 do {
2380 long nrpages = vma_pages(vma); 2383 long nrpages = vma_pages(vma);
2381 2384
2382 if (vma->vm_flags & VM_ACCOUNT) 2385 if (vma->vm_flags & VM_ACCOUNT)
2383 nr_accounted += nrpages; 2386 nr_accounted += nrpages;
2384 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 2387 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
2385 vma = remove_vma(vma); 2388 vma = remove_vma(vma);
2386 } while (vma); 2389 } while (vma);
2387 vm_unacct_memory(nr_accounted); 2390 vm_unacct_memory(nr_accounted);
2388 validate_mm(mm); 2391 validate_mm(mm);
2389 } 2392 }
2390 2393
2391 /* 2394 /*
2392 * Get rid of page table information in the indicated region. 2395 * Get rid of page table information in the indicated region.
2393 * 2396 *
2394 * Called with the mm semaphore held. 2397 * Called with the mm semaphore held.
2395 */ 2398 */
2396 static void unmap_region(struct mm_struct *mm, 2399 static void unmap_region(struct mm_struct *mm,
2397 struct vm_area_struct *vma, struct vm_area_struct *prev, 2400 struct vm_area_struct *vma, struct vm_area_struct *prev,
2398 unsigned long start, unsigned long end) 2401 unsigned long start, unsigned long end)
2399 { 2402 {
2400 struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; 2403 struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2401 struct mmu_gather tlb; 2404 struct mmu_gather tlb;
2402 2405
2403 lru_add_drain(); 2406 lru_add_drain();
2404 tlb_gather_mmu(&tlb, mm, start, end); 2407 tlb_gather_mmu(&tlb, mm, start, end);
2405 update_hiwater_rss(mm); 2408 update_hiwater_rss(mm);
2406 unmap_vmas(&tlb, vma, start, end); 2409 unmap_vmas(&tlb, vma, start, end);
2407 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 2410 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2408 next ? next->vm_start : USER_PGTABLES_CEILING); 2411 next ? next->vm_start : USER_PGTABLES_CEILING);
2409 tlb_finish_mmu(&tlb, start, end); 2412 tlb_finish_mmu(&tlb, start, end);
2410 } 2413 }
2411 2414
2412 /* 2415 /*
2413 * Create a list of vma's touched by the unmap, removing them from the mm's 2416 * Create a list of vma's touched by the unmap, removing them from the mm's
2414 * vma list as we go.. 2417 * vma list as we go..
2415 */ 2418 */
2416 static void 2419 static void
2417 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, 2420 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2418 struct vm_area_struct *prev, unsigned long end) 2421 struct vm_area_struct *prev, unsigned long end)
2419 { 2422 {
2420 struct vm_area_struct **insertion_point; 2423 struct vm_area_struct **insertion_point;
2421 struct vm_area_struct *tail_vma = NULL; 2424 struct vm_area_struct *tail_vma = NULL;
2422 2425
2423 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2426 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2424 vma->vm_prev = NULL; 2427 vma->vm_prev = NULL;
2425 do { 2428 do {
2426 vma_rb_erase(vma, &mm->mm_rb); 2429 vma_rb_erase(vma, &mm->mm_rb);
2427 mm->map_count--; 2430 mm->map_count--;
2428 tail_vma = vma; 2431 tail_vma = vma;
2429 vma = vma->vm_next; 2432 vma = vma->vm_next;
2430 } while (vma && vma->vm_start < end); 2433 } while (vma && vma->vm_start < end);
2431 *insertion_point = vma; 2434 *insertion_point = vma;
2432 if (vma) { 2435 if (vma) {
2433 vma->vm_prev = prev; 2436 vma->vm_prev = prev;
2434 vma_gap_update(vma); 2437 vma_gap_update(vma);
2435 } else 2438 } else
2436 mm->highest_vm_end = prev ? prev->vm_end : 0; 2439 mm->highest_vm_end = prev ? prev->vm_end : 0;
2437 tail_vma->vm_next = NULL; 2440 tail_vma->vm_next = NULL;
2438 2441
2439 /* Kill the cache */ 2442 /* Kill the cache */
2440 vmacache_invalidate(mm); 2443 vmacache_invalidate(mm);
2441 } 2444 }
2442 2445
2443 /* 2446 /*
2444 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2447 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
2445 * munmap path where it doesn't make sense to fail. 2448 * munmap path where it doesn't make sense to fail.
2446 */ 2449 */
2447 static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 2450 static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2448 unsigned long addr, int new_below) 2451 unsigned long addr, int new_below)
2449 { 2452 {
2450 struct vm_area_struct *new; 2453 struct vm_area_struct *new;
2451 int err = -ENOMEM; 2454 int err = -ENOMEM;
2452 2455
2453 if (is_vm_hugetlb_page(vma) && (addr & 2456 if (is_vm_hugetlb_page(vma) && (addr &
2454 ~(huge_page_mask(hstate_vma(vma))))) 2457 ~(huge_page_mask(hstate_vma(vma)))))
2455 return -EINVAL; 2458 return -EINVAL;
2456 2459
2457 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2460 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2458 if (!new) 2461 if (!new)
2459 goto out_err; 2462 goto out_err;
2460 2463
2461 /* most fields are the same, copy all, and then fixup */ 2464 /* most fields are the same, copy all, and then fixup */
2462 *new = *vma; 2465 *new = *vma;
2463 2466
2464 INIT_LIST_HEAD(&new->anon_vma_chain); 2467 INIT_LIST_HEAD(&new->anon_vma_chain);
2465 2468
2466 if (new_below) 2469 if (new_below)
2467 new->vm_end = addr; 2470 new->vm_end = addr;
2468 else { 2471 else {
2469 new->vm_start = addr; 2472 new->vm_start = addr;
2470 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2473 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2471 } 2474 }
2472 2475
2473 err = vma_dup_policy(vma, new); 2476 err = vma_dup_policy(vma, new);
2474 if (err) 2477 if (err)
2475 goto out_free_vma; 2478 goto out_free_vma;
2476 2479
2477 err = anon_vma_clone(new, vma); 2480 err = anon_vma_clone(new, vma);
2478 if (err) 2481 if (err)
2479 goto out_free_mpol; 2482 goto out_free_mpol;
2480 2483
2481 if (new->vm_file) 2484 if (new->vm_file)
2482 get_file(new->vm_file); 2485 get_file(new->vm_file);
2483 2486
2484 if (new->vm_ops && new->vm_ops->open) 2487 if (new->vm_ops && new->vm_ops->open)
2485 new->vm_ops->open(new); 2488 new->vm_ops->open(new);
2486 2489
2487 if (new_below) 2490 if (new_below)
2488 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 2491 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2489 ((addr - new->vm_start) >> PAGE_SHIFT), new); 2492 ((addr - new->vm_start) >> PAGE_SHIFT), new);
2490 else 2493 else
2491 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 2494 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2492 2495
2493 /* Success. */ 2496 /* Success. */
2494 if (!err) 2497 if (!err)
2495 return 0; 2498 return 0;
2496 2499
2497 /* Clean everything up if vma_adjust failed. */ 2500 /* Clean everything up if vma_adjust failed. */
2498 if (new->vm_ops && new->vm_ops->close) 2501 if (new->vm_ops && new->vm_ops->close)
2499 new->vm_ops->close(new); 2502 new->vm_ops->close(new);
2500 if (new->vm_file) 2503 if (new->vm_file)
2501 fput(new->vm_file); 2504 fput(new->vm_file);
2502 unlink_anon_vmas(new); 2505 unlink_anon_vmas(new);
2503 out_free_mpol: 2506 out_free_mpol:
2504 mpol_put(vma_policy(new)); 2507 mpol_put(vma_policy(new));
2505 out_free_vma: 2508 out_free_vma:
2506 kmem_cache_free(vm_area_cachep, new); 2509 kmem_cache_free(vm_area_cachep, new);
2507 out_err: 2510 out_err:
2508 return err; 2511 return err;
2509 } 2512 }
2510 2513
2511 /* 2514 /*
2512 * Split a vma into two pieces at address 'addr', a new vma is allocated 2515 * Split a vma into two pieces at address 'addr', a new vma is allocated
2513 * either for the first part or the tail. 2516 * either for the first part or the tail.
2514 */ 2517 */
2515 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 2518 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2516 unsigned long addr, int new_below) 2519 unsigned long addr, int new_below)
2517 { 2520 {
2518 if (mm->map_count >= sysctl_max_map_count) 2521 if (mm->map_count >= sysctl_max_map_count)
2519 return -ENOMEM; 2522 return -ENOMEM;
2520 2523
2521 return __split_vma(mm, vma, addr, new_below); 2524 return __split_vma(mm, vma, addr, new_below);
2522 } 2525 }
2523 2526
2524 /* Munmap is split into 2 main parts -- this part which finds 2527 /* Munmap is split into 2 main parts -- this part which finds
2525 * what needs doing, and the areas themselves, which do the 2528 * what needs doing, and the areas themselves, which do the
2526 * work. This now handles partial unmappings. 2529 * work. This now handles partial unmappings.
2527 * Jeremy Fitzhardinge <jeremy@goop.org> 2530 * Jeremy Fitzhardinge <jeremy@goop.org>
2528 */ 2531 */
2529 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 2532 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2530 { 2533 {
2531 unsigned long end; 2534 unsigned long end;
2532 struct vm_area_struct *vma, *prev, *last; 2535 struct vm_area_struct *vma, *prev, *last;
2533 2536
2534 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2537 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2535 return -EINVAL; 2538 return -EINVAL;
2536 2539
2537 len = PAGE_ALIGN(len); 2540 len = PAGE_ALIGN(len);
2538 if (len == 0) 2541 if (len == 0)
2539 return -EINVAL; 2542 return -EINVAL;
2540 2543
2541 /* Find the first overlapping VMA */ 2544 /* Find the first overlapping VMA */
2542 vma = find_vma(mm, start); 2545 vma = find_vma(mm, start);
2543 if (!vma) 2546 if (!vma)
2544 return 0; 2547 return 0;
2545 prev = vma->vm_prev; 2548 prev = vma->vm_prev;
2546 /* we have start < vma->vm_end */ 2549 /* we have start < vma->vm_end */
2547 2550
2548 /* if it doesn't overlap, we have nothing.. */ 2551 /* if it doesn't overlap, we have nothing.. */
2549 end = start + len; 2552 end = start + len;
2550 if (vma->vm_start >= end) 2553 if (vma->vm_start >= end)
2551 return 0; 2554 return 0;
2552 2555
2553 /* 2556 /*
2554 * If we need to split any vma, do it now to save pain later. 2557 * If we need to split any vma, do it now to save pain later.
2555 * 2558 *
2556 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 2559 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2557 * unmapped vm_area_struct will remain in use: so lower split_vma 2560 * unmapped vm_area_struct will remain in use: so lower split_vma
2558 * places tmp vma above, and higher split_vma places tmp vma below. 2561 * places tmp vma above, and higher split_vma places tmp vma below.
2559 */ 2562 */
2560 if (start > vma->vm_start) { 2563 if (start > vma->vm_start) {
2561 int error; 2564 int error;
2562 2565
2563 /* 2566 /*
2564 * Make sure that map_count on return from munmap() will 2567 * Make sure that map_count on return from munmap() will
2565 * not exceed its limit; but let map_count go just above 2568 * not exceed its limit; but let map_count go just above
2566 * its limit temporarily, to help free resources as expected. 2569 * its limit temporarily, to help free resources as expected.
2567 */ 2570 */
2568 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 2571 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2569 return -ENOMEM; 2572 return -ENOMEM;
2570 2573
2571 error = __split_vma(mm, vma, start, 0); 2574 error = __split_vma(mm, vma, start, 0);
2572 if (error) 2575 if (error)
2573 return error; 2576 return error;
2574 prev = vma; 2577 prev = vma;
2575 } 2578 }
2576 2579
2577 /* Does it split the last one? */ 2580 /* Does it split the last one? */
2578 last = find_vma(mm, end); 2581 last = find_vma(mm, end);
2579 if (last && end > last->vm_start) { 2582 if (last && end > last->vm_start) {
2580 int error = __split_vma(mm, last, end, 1); 2583 int error = __split_vma(mm, last, end, 1);
2581 if (error) 2584 if (error)
2582 return error; 2585 return error;
2583 } 2586 }
2584 vma = prev ? prev->vm_next : mm->mmap; 2587 vma = prev ? prev->vm_next : mm->mmap;
2585 2588
2586 /* 2589 /*
2587 * unlock any mlock()ed ranges before detaching vmas 2590 * unlock any mlock()ed ranges before detaching vmas
2588 */ 2591 */
2589 if (mm->locked_vm) { 2592 if (mm->locked_vm) {
2590 struct vm_area_struct *tmp = vma; 2593 struct vm_area_struct *tmp = vma;
2591 while (tmp && tmp->vm_start < end) { 2594 while (tmp && tmp->vm_start < end) {
2592 if (tmp->vm_flags & VM_LOCKED) { 2595 if (tmp->vm_flags & VM_LOCKED) {
2593 mm->locked_vm -= vma_pages(tmp); 2596 mm->locked_vm -= vma_pages(tmp);
2594 munlock_vma_pages_all(tmp); 2597 munlock_vma_pages_all(tmp);
2595 } 2598 }
2596 tmp = tmp->vm_next; 2599 tmp = tmp->vm_next;
2597 } 2600 }
2598 } 2601 }
2599 2602
2600 /* 2603 /*
2601 * Remove the vma's, and unmap the actual pages 2604 * Remove the vma's, and unmap the actual pages
2602 */ 2605 */
2603 detach_vmas_to_be_unmapped(mm, vma, prev, end); 2606 detach_vmas_to_be_unmapped(mm, vma, prev, end);
2604 unmap_region(mm, vma, prev, start, end); 2607 unmap_region(mm, vma, prev, start, end);
2605 2608
2606 arch_unmap(mm, vma, start, end); 2609 arch_unmap(mm, vma, start, end);
2607 2610
2608 /* Fix up all other VM information */ 2611 /* Fix up all other VM information */
2609 remove_vma_list(mm, vma); 2612 remove_vma_list(mm, vma);
2610 2613
2611 return 0; 2614 return 0;
2612 } 2615 }
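
A hypothetical userspace demo (not part of this commit) of the partial-unmap handling above: punching a hole in the middle of a three-page anonymous mapping forces the __split_vma() path and leaves two vmas behind, which can be seen by diffing /proc/self/maps before and after.

/* Hypothetical demo: unmap the middle page of a three-page mapping. */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (munmap(p + page, page))		/* partial unmap: splits the vma */
		return 1;
	printf("kept [%p,%p) and [%p,%p)\n",
	       (void *)p, (void *)(p + page),
	       (void *)(p + 2 * page), (void *)(p + 3 * page));
	return 0;
}
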
2613 2616
2614 int vm_munmap(unsigned long start, size_t len) 2617 int vm_munmap(unsigned long start, size_t len)
2615 { 2618 {
2616 int ret; 2619 int ret;
2617 struct mm_struct *mm = current->mm; 2620 struct mm_struct *mm = current->mm;
2618 2621
2619 down_write(&mm->mmap_sem); 2622 down_write(&mm->mmap_sem);
2620 ret = do_munmap(mm, start, len); 2623 ret = do_munmap(mm, start, len);
2621 up_write(&mm->mmap_sem); 2624 up_write(&mm->mmap_sem);
2622 return ret; 2625 return ret;
2623 } 2626 }
2624 EXPORT_SYMBOL(vm_munmap); 2627 EXPORT_SYMBOL(vm_munmap);
2625 2628
2626 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 2629 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2627 { 2630 {
2628 profile_munmap(addr); 2631 profile_munmap(addr);
2629 return vm_munmap(addr, len); 2632 return vm_munmap(addr, len);
2630 } 2633 }
2631 2634
2632 static inline void verify_mm_writelocked(struct mm_struct *mm) 2635 static inline void verify_mm_writelocked(struct mm_struct *mm)
2633 { 2636 {
2634 #ifdef CONFIG_DEBUG_VM 2637 #ifdef CONFIG_DEBUG_VM
2635 if (unlikely(down_read_trylock(&mm->mmap_sem))) { 2638 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2636 WARN_ON(1); 2639 WARN_ON(1);
2637 up_read(&mm->mmap_sem); 2640 up_read(&mm->mmap_sem);
2638 } 2641 }
2639 #endif 2642 #endif
2640 } 2643 }
2641 2644
2642 /* 2645 /*
2643 * this is really a simplified "do_mmap". it only handles 2646 * this is really a simplified "do_mmap". it only handles
2644 * anonymous maps. eventually we may be able to do some 2647 * anonymous maps. eventually we may be able to do some
2645 * brk-specific accounting here. 2648 * brk-specific accounting here.
2646 */ 2649 */
2647 static unsigned long do_brk(unsigned long addr, unsigned long len) 2650 static unsigned long do_brk(unsigned long addr, unsigned long len)
2648 { 2651 {
2649 struct mm_struct *mm = current->mm; 2652 struct mm_struct *mm = current->mm;
2650 struct vm_area_struct *vma, *prev; 2653 struct vm_area_struct *vma, *prev;
2651 unsigned long flags; 2654 unsigned long flags;
2652 struct rb_node **rb_link, *rb_parent; 2655 struct rb_node **rb_link, *rb_parent;
2653 pgoff_t pgoff = addr >> PAGE_SHIFT; 2656 pgoff_t pgoff = addr >> PAGE_SHIFT;
2654 int error; 2657 int error;
2655 2658
2656 len = PAGE_ALIGN(len); 2659 len = PAGE_ALIGN(len);
2657 if (!len) 2660 if (!len)
2658 return addr; 2661 return addr;
2659 2662
2660 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2663 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2661 2664
2662 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2665 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2663 if (error & ~PAGE_MASK) 2666 if (error & ~PAGE_MASK)
2664 return error; 2667 return error;
2665 2668
2666 error = mlock_future_check(mm, mm->def_flags, len); 2669 error = mlock_future_check(mm, mm->def_flags, len);
2667 if (error) 2670 if (error)
2668 return error; 2671 return error;
2669 2672
2670 /* 2673 /*
2671 * mm->mmap_sem is required to protect against another thread 2674 * mm->mmap_sem is required to protect against another thread
2672 * changing the mappings in case we sleep. 2675 * changing the mappings in case we sleep.
2673 */ 2676 */
2674 verify_mm_writelocked(mm); 2677 verify_mm_writelocked(mm);
2675 2678
2676 /* 2679 /*
2677 * Clear old maps. this also does some error checking for us 2680 * Clear old maps. this also does some error checking for us
2678 */ 2681 */
2679 munmap_back: 2682 munmap_back:
2680 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2683 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2681 if (do_munmap(mm, addr, len)) 2684 if (do_munmap(mm, addr, len))
2682 return -ENOMEM; 2685 return -ENOMEM;
2683 goto munmap_back; 2686 goto munmap_back;
2684 } 2687 }
2685 2688
2686 /* Check against address space limits *after* clearing old maps... */ 2689 /* Check against address space limits *after* clearing old maps... */
2687 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 2690 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
2688 return -ENOMEM; 2691 return -ENOMEM;
2689 2692
2690 if (mm->map_count > sysctl_max_map_count) 2693 if (mm->map_count > sysctl_max_map_count)
2691 return -ENOMEM; 2694 return -ENOMEM;
2692 2695
2693 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2696 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2694 return -ENOMEM; 2697 return -ENOMEM;
2695 2698
2696 /* Can we just expand an old private anonymous mapping? */ 2699 /* Can we just expand an old private anonymous mapping? */
2697 vma = vma_merge(mm, prev, addr, addr + len, flags, 2700 vma = vma_merge(mm, prev, addr, addr + len, flags,
2698 NULL, NULL, pgoff, NULL); 2701 NULL, NULL, pgoff, NULL);
2699 if (vma) 2702 if (vma)
2700 goto out; 2703 goto out;
2701 2704
2702 /* 2705 /*
2703 * create a vma struct for an anonymous mapping 2706 * create a vma struct for an anonymous mapping
2704 */ 2707 */
2705 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2708 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2706 if (!vma) { 2709 if (!vma) {
2707 vm_unacct_memory(len >> PAGE_SHIFT); 2710 vm_unacct_memory(len >> PAGE_SHIFT);
2708 return -ENOMEM; 2711 return -ENOMEM;
2709 } 2712 }
2710 2713
2711 INIT_LIST_HEAD(&vma->anon_vma_chain); 2714 INIT_LIST_HEAD(&vma->anon_vma_chain);
2712 vma->vm_mm = mm; 2715 vma->vm_mm = mm;
2713 vma->vm_start = addr; 2716 vma->vm_start = addr;
2714 vma->vm_end = addr + len; 2717 vma->vm_end = addr + len;
2715 vma->vm_pgoff = pgoff; 2718 vma->vm_pgoff = pgoff;
2716 vma->vm_flags = flags; 2719 vma->vm_flags = flags;
2717 vma->vm_page_prot = vm_get_page_prot(flags); 2720 vma->vm_page_prot = vm_get_page_prot(flags);
2718 vma_link(mm, vma, prev, rb_link, rb_parent); 2721 vma_link(mm, vma, prev, rb_link, rb_parent);
2719 out: 2722 out:
2720 perf_event_mmap(vma); 2723 perf_event_mmap(vma);
2721 mm->total_vm += len >> PAGE_SHIFT; 2724 mm->total_vm += len >> PAGE_SHIFT;
2722 if (flags & VM_LOCKED) 2725 if (flags & VM_LOCKED)
2723 mm->locked_vm += (len >> PAGE_SHIFT); 2726 mm->locked_vm += (len >> PAGE_SHIFT);
2724 vma->vm_flags |= VM_SOFTDIRTY; 2727 vma->vm_flags |= VM_SOFTDIRTY;
2725 return addr; 2728 return addr;
2726 } 2729 }
2727 2730
2728 unsigned long vm_brk(unsigned long addr, unsigned long len) 2731 unsigned long vm_brk(unsigned long addr, unsigned long len)
2729 { 2732 {
2730 struct mm_struct *mm = current->mm; 2733 struct mm_struct *mm = current->mm;
2731 unsigned long ret; 2734 unsigned long ret;
2732 bool populate; 2735 bool populate;
2733 2736
2734 down_write(&mm->mmap_sem); 2737 down_write(&mm->mmap_sem);
2735 ret = do_brk(addr, len); 2738 ret = do_brk(addr, len);
2736 populate = ((mm->def_flags & VM_LOCKED) != 0); 2739 populate = ((mm->def_flags & VM_LOCKED) != 0);
2737 up_write(&mm->mmap_sem); 2740 up_write(&mm->mmap_sem);
2738 if (populate) 2741 if (populate)
2739 mm_populate(addr, len); 2742 mm_populate(addr, len);
2740 return ret; 2743 return ret;
2741 } 2744 }
2742 EXPORT_SYMBOL(vm_brk); 2745 EXPORT_SYMBOL(vm_brk);
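
For completeness, a hypothetical userspace sketch (not part of this commit; the 64 KiB growth is an arbitrary assumption): extending the heap with brk()/sbrk() is expected to land in do_brk() above, the same helper that vm_brk() wraps for in-kernel callers.

/* Hypothetical demo: move the program break and print the old/new values. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	void *old = sbrk(0);			/* current program break */

	if (sbrk(64 * 1024) == (void *)-1)	/* grow the heap by 64 KiB */
		return 1;
	printf("break moved from %p to %p\n", old, sbrk(0));
	return 0;
}
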
2743 2746
2744 /* Release all mmaps. */ 2747 /* Release all mmaps. */
2745 void exit_mmap(struct mm_struct *mm) 2748 void exit_mmap(struct mm_struct *mm)
2746 { 2749 {
2747 struct mmu_gather tlb; 2750 struct mmu_gather tlb;
2748 struct vm_area_struct *vma; 2751 struct vm_area_struct *vma;
2749 unsigned long nr_accounted = 0; 2752 unsigned long nr_accounted = 0;
2750 2753
2751 	/* mm's last user has gone, and it's about to be pulled down */ 2754 	/* mm's last user has gone, and it's about to be pulled down */
2752 mmu_notifier_release(mm); 2755 mmu_notifier_release(mm);
2753 2756
2754 if (mm->locked_vm) { 2757 if (mm->locked_vm) {
2755 vma = mm->mmap; 2758 vma = mm->mmap;
2756 while (vma) { 2759 while (vma) {
2757 if (vma->vm_flags & VM_LOCKED) 2760 if (vma->vm_flags & VM_LOCKED)
2758 munlock_vma_pages_all(vma); 2761 munlock_vma_pages_all(vma);
2759 vma = vma->vm_next; 2762 vma = vma->vm_next;
2760 } 2763 }
2761 } 2764 }
2762 2765
2763 arch_exit_mmap(mm); 2766 arch_exit_mmap(mm);
2764 2767
2765 vma = mm->mmap; 2768 vma = mm->mmap;
2766 if (!vma) /* Can happen if dup_mmap() received an OOM */ 2769 if (!vma) /* Can happen if dup_mmap() received an OOM */
2767 return; 2770 return;
2768 2771
2769 lru_add_drain(); 2772 lru_add_drain();
2770 flush_cache_mm(mm); 2773 flush_cache_mm(mm);
2771 tlb_gather_mmu(&tlb, mm, 0, -1); 2774 tlb_gather_mmu(&tlb, mm, 0, -1);
2772 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2775 /* update_hiwater_rss(mm) here? but nobody should be looking */
2773 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2776 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2774 unmap_vmas(&tlb, vma, 0, -1); 2777 unmap_vmas(&tlb, vma, 0, -1);
2775 2778
2776 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); 2779 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2777 tlb_finish_mmu(&tlb, 0, -1); 2780 tlb_finish_mmu(&tlb, 0, -1);
2778 2781
2779 /* 2782 /*
2780 * Walk the list again, actually closing and freeing it, 2783 * Walk the list again, actually closing and freeing it,
2781 * with preemption enabled, without holding any MM locks. 2784 * with preemption enabled, without holding any MM locks.
2782 */ 2785 */
2783 while (vma) { 2786 while (vma) {
2784 if (vma->vm_flags & VM_ACCOUNT) 2787 if (vma->vm_flags & VM_ACCOUNT)
2785 nr_accounted += vma_pages(vma); 2788 nr_accounted += vma_pages(vma);
2786 vma = remove_vma(vma); 2789 vma = remove_vma(vma);
2787 } 2790 }
2788 vm_unacct_memory(nr_accounted); 2791 vm_unacct_memory(nr_accounted);
2789 2792
2790 WARN_ON(atomic_long_read(&mm->nr_ptes) > 2793 WARN_ON(atomic_long_read(&mm->nr_ptes) >
2791 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2794 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2792 } 2795 }
2793 2796
2794 /* Insert vm structure into process list sorted by address 2797 /* Insert vm structure into process list sorted by address
2795 * and into the inode's i_mmap tree. If vm_file is non-NULL 2798 * and into the inode's i_mmap tree. If vm_file is non-NULL
2796 * then i_mmap_rwsem is taken here. 2799 * then i_mmap_rwsem is taken here.
2797 */ 2800 */
2798 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 2801 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2799 { 2802 {
2800 struct vm_area_struct *prev; 2803 struct vm_area_struct *prev;
2801 struct rb_node **rb_link, *rb_parent; 2804 struct rb_node **rb_link, *rb_parent;
2802 2805
2803 /* 2806 /*
2804 * The vm_pgoff of a purely anonymous vma should be irrelevant 2807 * The vm_pgoff of a purely anonymous vma should be irrelevant
2805 * until its first write fault, when page's anon_vma and index 2808 * until its first write fault, when page's anon_vma and index
2806 * are set. But now set the vm_pgoff it will almost certainly 2809 * are set. But now set the vm_pgoff it will almost certainly
2807 * end up with (unless mremap moves it elsewhere before that 2810 * end up with (unless mremap moves it elsewhere before that
2808 * first wfault), so /proc/pid/maps tells a consistent story. 2811 * first wfault), so /proc/pid/maps tells a consistent story.
2809 * 2812 *
2810 * By setting it to reflect the virtual start address of the 2813 * By setting it to reflect the virtual start address of the
2811 * vma, merges and splits can happen in a seamless way, just 2814 * vma, merges and splits can happen in a seamless way, just
2812 * using the existing file pgoff checks and manipulations. 2815 * using the existing file pgoff checks and manipulations.
2813 * Similarly in do_mmap_pgoff and in do_brk. 2816 * Similarly in do_mmap_pgoff and in do_brk.
2814 */ 2817 */
2815 if (!vma->vm_file) { 2818 if (!vma->vm_file) {
2816 BUG_ON(vma->anon_vma); 2819 BUG_ON(vma->anon_vma);
2817 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2820 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2818 } 2821 }
2819 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 2822 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2820 &prev, &rb_link, &rb_parent)) 2823 &prev, &rb_link, &rb_parent))
2821 return -ENOMEM; 2824 return -ENOMEM;
2822 if ((vma->vm_flags & VM_ACCOUNT) && 2825 if ((vma->vm_flags & VM_ACCOUNT) &&
2823 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2826 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2824 return -ENOMEM; 2827 return -ENOMEM;
2825 2828
2826 vma_link(mm, vma, prev, rb_link, rb_parent); 2829 vma_link(mm, vma, prev, rb_link, rb_parent);
2827 return 0; 2830 return 0;
2828 } 2831 }
2829 2832
2830 /* 2833 /*
2831 * Copy the vma structure to a new location in the same mm, 2834 * Copy the vma structure to a new location in the same mm,
2832 * prior to moving page table entries, to effect an mremap move. 2835 * prior to moving page table entries, to effect an mremap move.
2833 */ 2836 */
2834 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2837 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2835 unsigned long addr, unsigned long len, pgoff_t pgoff, 2838 unsigned long addr, unsigned long len, pgoff_t pgoff,
2836 bool *need_rmap_locks) 2839 bool *need_rmap_locks)
2837 { 2840 {
2838 struct vm_area_struct *vma = *vmap; 2841 struct vm_area_struct *vma = *vmap;
2839 unsigned long vma_start = vma->vm_start; 2842 unsigned long vma_start = vma->vm_start;
2840 struct mm_struct *mm = vma->vm_mm; 2843 struct mm_struct *mm = vma->vm_mm;
2841 struct vm_area_struct *new_vma, *prev; 2844 struct vm_area_struct *new_vma, *prev;
2842 struct rb_node **rb_link, *rb_parent; 2845 struct rb_node **rb_link, *rb_parent;
2843 bool faulted_in_anon_vma = true; 2846 bool faulted_in_anon_vma = true;
2844 2847
2845 /* 2848 /*
2846 * If anonymous vma has not yet been faulted, update new pgoff 2849 * If anonymous vma has not yet been faulted, update new pgoff
2847 * to match new location, to increase its chance of merging. 2850 * to match new location, to increase its chance of merging.
2848 */ 2851 */
2849 if (unlikely(!vma->vm_file && !vma->anon_vma)) { 2852 if (unlikely(!vma->vm_file && !vma->anon_vma)) {
2850 pgoff = addr >> PAGE_SHIFT; 2853 pgoff = addr >> PAGE_SHIFT;
2851 faulted_in_anon_vma = false; 2854 faulted_in_anon_vma = false;
2852 } 2855 }
2853 2856
2854 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) 2857 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2855 return NULL; /* should never get here */ 2858 return NULL; /* should never get here */
2856 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2859 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2857 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2860 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2858 if (new_vma) { 2861 if (new_vma) {
2859 /* 2862 /*
2860 * Source vma may have been merged into new_vma 2863 * Source vma may have been merged into new_vma
2861 */ 2864 */
2862 if (unlikely(vma_start >= new_vma->vm_start && 2865 if (unlikely(vma_start >= new_vma->vm_start &&
2863 vma_start < new_vma->vm_end)) { 2866 vma_start < new_vma->vm_end)) {
2864 /* 2867 /*
2865 * The only way we can get a vma_merge with 2868 * The only way we can get a vma_merge with
2866 * self during an mremap is if the vma hasn't 2869 * self during an mremap is if the vma hasn't
2867 * been faulted in yet and we were allowed to 2870 * been faulted in yet and we were allowed to
2868 * reset the dst vma->vm_pgoff to the 2871 * reset the dst vma->vm_pgoff to the
2869 * destination address of the mremap to allow 2872 * destination address of the mremap to allow
2870 * the merge to happen. mremap must change the 2873 * the merge to happen. mremap must change the
2871 * vm_pgoff linearity between src and dst vmas 2874 * vm_pgoff linearity between src and dst vmas
2872 * (in turn preventing a vma_merge) to be 2875 * (in turn preventing a vma_merge) to be
2873 * safe. It is only safe to keep the vm_pgoff 2876 * safe. It is only safe to keep the vm_pgoff
2874 * linear if there are no pages mapped yet. 2877 * linear if there are no pages mapped yet.
2875 */ 2878 */
2876 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); 2879 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
2877 *vmap = vma = new_vma; 2880 *vmap = vma = new_vma;
2878 } 2881 }
2879 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2882 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2880 } else { 2883 } else {
2881 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2884 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2882 if (new_vma) { 2885 if (new_vma) {
2883 *new_vma = *vma; 2886 *new_vma = *vma;
2884 new_vma->vm_start = addr; 2887 new_vma->vm_start = addr;
2885 new_vma->vm_end = addr + len; 2888 new_vma->vm_end = addr + len;
2886 new_vma->vm_pgoff = pgoff; 2889 new_vma->vm_pgoff = pgoff;
2887 if (vma_dup_policy(vma, new_vma)) 2890 if (vma_dup_policy(vma, new_vma))
2888 goto out_free_vma; 2891 goto out_free_vma;
2889 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2892 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2890 if (anon_vma_clone(new_vma, vma)) 2893 if (anon_vma_clone(new_vma, vma))
2891 goto out_free_mempol; 2894 goto out_free_mempol;
2892 if (new_vma->vm_file) 2895 if (new_vma->vm_file)
2893 get_file(new_vma->vm_file); 2896 get_file(new_vma->vm_file);
2894 if (new_vma->vm_ops && new_vma->vm_ops->open) 2897 if (new_vma->vm_ops && new_vma->vm_ops->open)
2895 new_vma->vm_ops->open(new_vma); 2898 new_vma->vm_ops->open(new_vma);
2896 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2899 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2897 *need_rmap_locks = false; 2900 *need_rmap_locks = false;
2898 } 2901 }
2899 } 2902 }
2900 return new_vma; 2903 return new_vma;
2901 2904
2902 out_free_mempol: 2905 out_free_mempol:
2903 mpol_put(vma_policy(new_vma)); 2906 mpol_put(vma_policy(new_vma));
2904 out_free_vma: 2907 out_free_vma:
2905 kmem_cache_free(vm_area_cachep, new_vma); 2908 kmem_cache_free(vm_area_cachep, new_vma);
2906 return NULL; 2909 return NULL;
2907 } 2910 }
2908 2911
2909 /* 2912 /*
2910 * Return true if the calling process may expand its vm space by the passed 2913 * Return true if the calling process may expand its vm space by the passed
2911 * number of pages 2914 * number of pages
2912 */ 2915 */
2913 int may_expand_vm(struct mm_struct *mm, unsigned long npages) 2916 int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2914 { 2917 {
2915 unsigned long cur = mm->total_vm; /* pages */ 2918 unsigned long cur = mm->total_vm; /* pages */
2916 unsigned long lim; 2919 unsigned long lim;
2917 2920
2918 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; 2921 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2919 2922
2920 if (cur + npages > lim) 2923 if (cur + npages > lim)
2921 return 0; 2924 return 0;
2922 return 1; 2925 return 1;
2923 } 2926 }
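Aside (not part of the commit): the check above is simply an RLIMIT_AS comparison done in pages rather than bytes. A minimal userspace sketch of the same arithmetic; the helper name and the 1000/256 figures are invented for illustration:

        #include <stdio.h>
        #include <sys/resource.h>
        #include <unistd.h>

        /* Return 1 if cur_pages + npages still fits under RLIMIT_AS, else 0. */
        static int may_expand(unsigned long cur_pages, unsigned long npages)
        {
                struct rlimit rl;
                unsigned long page_size = (unsigned long)sysconf(_SC_PAGESIZE);

                if (getrlimit(RLIMIT_AS, &rl))
                        return 0;
                if (rl.rlim_cur == RLIM_INFINITY)
                        return 1;
                /* RLIMIT_AS is expressed in bytes; the kernel compares in pages. */
                return cur_pages + npages <= rl.rlim_cur / page_size;
        }

        int main(void)
        {
                printf("expand by 256 pages on top of 1000: %d\n",
                       may_expand(1000, 256));
                return 0;
        }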
2924 2927
2925 static int special_mapping_fault(struct vm_area_struct *vma, 2928 static int special_mapping_fault(struct vm_area_struct *vma,
2926 struct vm_fault *vmf); 2929 struct vm_fault *vmf);
2927 2930
2928 /* 2931 /*
2929 * Having a close hook prevents vma merging regardless of flags. 2932 * Having a close hook prevents vma merging regardless of flags.
2930 */ 2933 */
2931 static void special_mapping_close(struct vm_area_struct *vma) 2934 static void special_mapping_close(struct vm_area_struct *vma)
2932 { 2935 {
2933 } 2936 }
2934 2937
2935 static const char *special_mapping_name(struct vm_area_struct *vma) 2938 static const char *special_mapping_name(struct vm_area_struct *vma)
2936 { 2939 {
2937 return ((struct vm_special_mapping *)vma->vm_private_data)->name; 2940 return ((struct vm_special_mapping *)vma->vm_private_data)->name;
2938 } 2941 }
2939 2942
2940 static const struct vm_operations_struct special_mapping_vmops = { 2943 static const struct vm_operations_struct special_mapping_vmops = {
2941 .close = special_mapping_close, 2944 .close = special_mapping_close,
2942 .fault = special_mapping_fault, 2945 .fault = special_mapping_fault,
2943 .name = special_mapping_name, 2946 .name = special_mapping_name,
2944 }; 2947 };
2945 2948
2946 static const struct vm_operations_struct legacy_special_mapping_vmops = { 2949 static const struct vm_operations_struct legacy_special_mapping_vmops = {
2947 .close = special_mapping_close, 2950 .close = special_mapping_close,
2948 .fault = special_mapping_fault, 2951 .fault = special_mapping_fault,
2949 }; 2952 };
2950 2953
2951 static int special_mapping_fault(struct vm_area_struct *vma, 2954 static int special_mapping_fault(struct vm_area_struct *vma,
2952 struct vm_fault *vmf) 2955 struct vm_fault *vmf)
2953 { 2956 {
2954 pgoff_t pgoff; 2957 pgoff_t pgoff;
2955 struct page **pages; 2958 struct page **pages;
2956 2959
2957 /* 2960 /*
2958 * special mappings have no vm_file, and in that case, the mm 2961 * special mappings have no vm_file, and in that case, the mm
2959 * uses vm_pgoff internally. So we have to subtract it from here. 2962 * uses vm_pgoff internally. So we have to subtract it from here.
2960 * We are allowed to do this because we are the mm; do not copy 2963 * We are allowed to do this because we are the mm; do not copy
2961 * this code into drivers! 2964 * this code into drivers!
2962 */ 2965 */
2963 pgoff = vmf->pgoff - vma->vm_pgoff; 2966 pgoff = vmf->pgoff - vma->vm_pgoff;
2964 2967
2965 if (vma->vm_ops == &legacy_special_mapping_vmops) 2968 if (vma->vm_ops == &legacy_special_mapping_vmops)
2966 pages = vma->vm_private_data; 2969 pages = vma->vm_private_data;
2967 else 2970 else
2968 pages = ((struct vm_special_mapping *)vma->vm_private_data)-> 2971 pages = ((struct vm_special_mapping *)vma->vm_private_data)->
2969 pages; 2972 pages;
2970 2973
2971 for (; pgoff && *pages; ++pages) 2974 for (; pgoff && *pages; ++pages)
2972 pgoff--; 2975 pgoff--;
2973 2976
2974 if (*pages) { 2977 if (*pages) {
2975 struct page *page = *pages; 2978 struct page *page = *pages;
2976 get_page(page); 2979 get_page(page);
2977 vmf->page = page; 2980 vmf->page = page;
2978 return 0; 2981 return 0;
2979 } 2982 }
2980 2983
2981 return VM_FAULT_SIGBUS; 2984 return VM_FAULT_SIGBUS;
2982 } 2985 }
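Worked example of the walk above: with a NULL-terminated array { A, B, NULL } and an effective pgoff of 1, the loop stops on B, which is returned with an extra reference; an effective pgoff of 2 or more lands on the NULL terminator and the fault returns VM_FAULT_SIGBUS, which is what the comment before _install_special_mapping further down means by the region past the last page always producing SIGBUS.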
2983 2986
2984 static struct vm_area_struct *__install_special_mapping( 2987 static struct vm_area_struct *__install_special_mapping(
2985 struct mm_struct *mm, 2988 struct mm_struct *mm,
2986 unsigned long addr, unsigned long len, 2989 unsigned long addr, unsigned long len,
2987 unsigned long vm_flags, const struct vm_operations_struct *ops, 2990 unsigned long vm_flags, const struct vm_operations_struct *ops,
2988 void *priv) 2991 void *priv)
2989 { 2992 {
2990 int ret; 2993 int ret;
2991 struct vm_area_struct *vma; 2994 struct vm_area_struct *vma;
2992 2995
2993 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2996 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2994 if (unlikely(vma == NULL)) 2997 if (unlikely(vma == NULL))
2995 return ERR_PTR(-ENOMEM); 2998 return ERR_PTR(-ENOMEM);
2996 2999
2997 INIT_LIST_HEAD(&vma->anon_vma_chain); 3000 INIT_LIST_HEAD(&vma->anon_vma_chain);
2998 vma->vm_mm = mm; 3001 vma->vm_mm = mm;
2999 vma->vm_start = addr; 3002 vma->vm_start = addr;
3000 vma->vm_end = addr + len; 3003 vma->vm_end = addr + len;
3001 3004
3002 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; 3005 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
3003 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 3006 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
3004 3007
3005 vma->vm_ops = ops; 3008 vma->vm_ops = ops;
3006 vma->vm_private_data = priv; 3009 vma->vm_private_data = priv;
3007 3010
3008 ret = insert_vm_struct(mm, vma); 3011 ret = insert_vm_struct(mm, vma);
3009 if (ret) 3012 if (ret)
3010 goto out; 3013 goto out;
3011 3014
3012 mm->total_vm += len >> PAGE_SHIFT; 3015 mm->total_vm += len >> PAGE_SHIFT;
3013 3016
3014 perf_event_mmap(vma); 3017 perf_event_mmap(vma);
3015 3018
3016 return vma; 3019 return vma;
3017 3020
3018 out: 3021 out:
3019 kmem_cache_free(vm_area_cachep, vma); 3022 kmem_cache_free(vm_area_cachep, vma);
3020 return ERR_PTR(ret); 3023 return ERR_PTR(ret);
3021 } 3024 }
3022 3025
3023 /* 3026 /*
3024 * Called with mm->mmap_sem held for writing. 3027 * Called with mm->mmap_sem held for writing.
3025 * Insert a new vma covering the given region, with the given flags. 3028 * Insert a new vma covering the given region, with the given flags.
3026 * Its pages are supplied by the given array of struct page *. 3029 * Its pages are supplied by the given array of struct page *.
3027 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. 3030 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
3028 * The region past the last page supplied will always produce SIGBUS. 3031 * The region past the last page supplied will always produce SIGBUS.
3029 * The array pointer and the pages it points to are assumed to stay alive 3032 * The array pointer and the pages it points to are assumed to stay alive
3030 * for as long as this mapping might exist. 3033 * for as long as this mapping might exist.
3031 */ 3034 */
3032 struct vm_area_struct *_install_special_mapping( 3035 struct vm_area_struct *_install_special_mapping(
3033 struct mm_struct *mm, 3036 struct mm_struct *mm,
3034 unsigned long addr, unsigned long len, 3037 unsigned long addr, unsigned long len,
3035 unsigned long vm_flags, const struct vm_special_mapping *spec) 3038 unsigned long vm_flags, const struct vm_special_mapping *spec)
3036 { 3039 {
3037 return __install_special_mapping(mm, addr, len, vm_flags, 3040 return __install_special_mapping(mm, addr, len, vm_flags,
3038 &special_mapping_vmops, (void *)spec); 3041 &special_mapping_vmops, (void *)spec);
3039 } 3042 }
3040 3043
3041 int install_special_mapping(struct mm_struct *mm, 3044 int install_special_mapping(struct mm_struct *mm,
3042 unsigned long addr, unsigned long len, 3045 unsigned long addr, unsigned long len,
3043 unsigned long vm_flags, struct page **pages) 3046 unsigned long vm_flags, struct page **pages)
3044 { 3047 {
3045 struct vm_area_struct *vma = __install_special_mapping( 3048 struct vm_area_struct *vma = __install_special_mapping(
3046 mm, addr, len, vm_flags, &legacy_special_mapping_vmops, 3049 mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
3047 (void *)pages); 3050 (void *)pages);
3048 3051
3049 return PTR_ERR_OR_ZERO(vma); 3052 return PTR_ERR_OR_ZERO(vma);
3050 } 3053 }
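A hedged sketch of how a caller might use the legacy interface above to expose a single kernel page to a process (vdso-style). The function and variable names are invented for illustration and error unwinding is trimmed; the caller is assumed to hold mm->mmap_sem for writing, as the comment above _install_special_mapping requires:

        static struct page *example_pages[2];  /* NULL-terminated; faults past it SIGBUS */

        static int example_map_data_page(struct mm_struct *mm, unsigned long addr)
        {
                if (!example_pages[0]) {
                        example_pages[0] = alloc_page(GFP_KERNEL);
                        if (!example_pages[0])
                                return -ENOMEM;
                        example_pages[1] = NULL;
                }

                /* mm->mmap_sem must already be held for writing here. */
                return install_special_mapping(mm, addr, PAGE_SIZE,
                                               VM_READ | VM_MAYREAD | VM_DONTDUMP,
                                               example_pages);
        }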
3051 3054
3052 static DEFINE_MUTEX(mm_all_locks_mutex); 3055 static DEFINE_MUTEX(mm_all_locks_mutex);
3053 3056
3054 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 3057 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
3055 { 3058 {
3056 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 3059 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3057 /* 3060 /*
3058 * The LSB of head.next can't change from under us 3061 * The LSB of head.next can't change from under us
3059 * because we hold the mm_all_locks_mutex. 3062 * because we hold the mm_all_locks_mutex.
3060 */ 3063 */
3061 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); 3064 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
3062 /* 3065 /*
3063 * We can safely modify head.next after taking the 3066 * We can safely modify head.next after taking the
3064 * anon_vma->root->rwsem. If some other vma in this mm shares 3067 * anon_vma->root->rwsem. If some other vma in this mm shares
3065 * the same anon_vma we won't take it again. 3068 * the same anon_vma we won't take it again.
3066 * 3069 *
3067 * No need of atomic instructions here, head.next 3070 * No need of atomic instructions here, head.next
3068 * can't change from under us thanks to the 3071 * can't change from under us thanks to the
3069 * anon_vma->root->rwsem. 3072 * anon_vma->root->rwsem.
3070 */ 3073 */
3071 if (__test_and_set_bit(0, (unsigned long *) 3074 if (__test_and_set_bit(0, (unsigned long *)
3072 &anon_vma->root->rb_root.rb_node)) 3075 &anon_vma->root->rb_root.rb_node))
3073 BUG(); 3076 BUG();
3074 } 3077 }
3075 } 3078 }
3076 3079
3077 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 3080 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3078 { 3081 {
3079 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3082 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3080 /* 3083 /*
3081 * AS_MM_ALL_LOCKS can't change from under us because 3084 * AS_MM_ALL_LOCKS can't change from under us because
3082 * we hold the mm_all_locks_mutex. 3085 * we hold the mm_all_locks_mutex.
3083 * 3086 *
3084 * Operations on ->flags have to be atomic because 3087 * Operations on ->flags have to be atomic because
3085 * even if AS_MM_ALL_LOCKS is stable thanks to the 3088 * even if AS_MM_ALL_LOCKS is stable thanks to the
3086 * mm_all_locks_mutex, there may be other cpus 3089 * mm_all_locks_mutex, there may be other cpus
3087 * changing other bitflags in parallel to us. 3090 * changing other bitflags in parallel to us.
3088 */ 3091 */
3089 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 3092 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3090 BUG(); 3093 BUG();
3091 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); 3094 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
3092 } 3095 }
3093 } 3096 }
3094 3097
3095 /* 3098 /*
3096 * This operation locks against the VM for all pte/vma/mm related 3099 * This operation locks against the VM for all pte/vma/mm related
3097 * operations that could ever happen on a certain mm. This includes 3100 * operations that could ever happen on a certain mm. This includes
3098 * vmtruncate, try_to_unmap, and all page faults. 3101 * vmtruncate, try_to_unmap, and all page faults.
3099 * 3102 *
3100 * The caller must take the mmap_sem in write mode before calling 3103 * The caller must take the mmap_sem in write mode before calling
3101 * mm_take_all_locks(). The caller isn't allowed to release the 3104 * mm_take_all_locks(). The caller isn't allowed to release the
3102 * mmap_sem until mm_drop_all_locks() returns. 3105 * mmap_sem until mm_drop_all_locks() returns.
3103 * 3106 *
3104 * mmap_sem in write mode is required in order to block all operations 3107 * mmap_sem in write mode is required in order to block all operations
3105 * that could modify pagetables and free pages without need of 3108 * that could modify pagetables and free pages without need of
3106 * altering the vma layout (for example populate_range() with 3109 * altering the vma layout (for example populate_range() with
3107 * nonlinear vmas). It's also needed in write mode to avoid new 3110 * nonlinear vmas). It's also needed in write mode to avoid new
3108 * anon_vmas to be associated with existing vmas. 3111 * anon_vmas to be associated with existing vmas.
3109 * 3112 *
3110 * A single task can't take more than one mm_take_all_locks() in a row 3113 * A single task can't take more than one mm_take_all_locks() in a row
3111 * or it would deadlock. 3114 * or it would deadlock.
3112 * 3115 *
3113 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 3116 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
3114 * mapping->flags avoid taking the same lock twice, if more than one 3117 * mapping->flags avoid taking the same lock twice, if more than one
3115 * vma in this mm is backed by the same anon_vma or address_space. 3118 * vma in this mm is backed by the same anon_vma or address_space.
3116 * 3119 *
3117 * We can take all the locks in random order because the VM code 3120 * We can take all the locks in random order because the VM code
3118 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never 3121 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
3119 * takes more than one of them in a row. Secondly we're protected 3122 * takes more than one of them in a row. Secondly we're protected
3120 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3123 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3121 * 3124 *
3122 * mm_take_all_locks() and mm_drop_all_locks are expensive operations 3125 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
3123 * that may have to take thousands of locks. 3126 * that may have to take thousands of locks.
3124 * 3127 *
3125 * mm_take_all_locks() can fail if it's interrupted by signals. 3128 * mm_take_all_locks() can fail if it's interrupted by signals.
3126 */ 3129 */
3127 int mm_take_all_locks(struct mm_struct *mm) 3130 int mm_take_all_locks(struct mm_struct *mm)
3128 { 3131 {
3129 struct vm_area_struct *vma; 3132 struct vm_area_struct *vma;
3130 struct anon_vma_chain *avc; 3133 struct anon_vma_chain *avc;
3131 3134
3132 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3135 BUG_ON(down_read_trylock(&mm->mmap_sem));
3133 3136
3134 mutex_lock(&mm_all_locks_mutex); 3137 mutex_lock(&mm_all_locks_mutex);
3135 3138
3136 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3139 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3137 if (signal_pending(current)) 3140 if (signal_pending(current))
3138 goto out_unlock; 3141 goto out_unlock;
3139 if (vma->vm_file && vma->vm_file->f_mapping) 3142 if (vma->vm_file && vma->vm_file->f_mapping)
3140 vm_lock_mapping(mm, vma->vm_file->f_mapping); 3143 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3141 } 3144 }
3142 3145
3143 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3146 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3144 if (signal_pending(current)) 3147 if (signal_pending(current))
3145 goto out_unlock; 3148 goto out_unlock;
3146 if (vma->anon_vma) 3149 if (vma->anon_vma)
3147 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3150 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3148 vm_lock_anon_vma(mm, avc->anon_vma); 3151 vm_lock_anon_vma(mm, avc->anon_vma);
3149 } 3152 }
3150 3153
3151 return 0; 3154 return 0;
3152 3155
3153 out_unlock: 3156 out_unlock:
3154 mm_drop_all_locks(mm); 3157 mm_drop_all_locks(mm);
3155 return -EINTR; 3158 return -EINTR;
3156 } 3159 }
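The caller-side contract spelled out in the comment above can be summarised in a small sketch; the wrapper name is hypothetical, only the ordering (mmap_sem write, take all locks, drop all locks, then release mmap_sem) is taken from the comment:

        static int example_freeze_address_space(struct mm_struct *mm)
        {
                int ret;

                down_write(&mm->mmap_sem);      /* must be held before the call     */
                ret = mm_take_all_locks(mm);    /* 0 on success, -EINTR on a signal */
                if (ret)
                        goto out;

                /* ... the mm is now stable against faults, rmap and truncation ... */

                mm_drop_all_locks(mm);          /* required before dropping mmap_sem */
        out:
                up_write(&mm->mmap_sem);
                return ret;
        }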
3157 3160
3158 static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 3161 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3159 { 3162 {
3160 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 3163 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3161 /* 3164 /*
3162 * The LSB of head.next can't change to 0 from under 3165 * The LSB of head.next can't change to 0 from under
3163 * us because we hold the mm_all_locks_mutex. 3166 * us because we hold the mm_all_locks_mutex.
3164 * 3167 *
3165 * We must however clear the bitflag before unlocking 3168 * We must however clear the bitflag before unlocking
3166 * the vma so the users using the anon_vma->rb_root will 3169 * the vma so the users using the anon_vma->rb_root will
3167 * never see our bitflag. 3170 * never see our bitflag.
3168 * 3171 *
3169 * No need of atomic instructions here, head.next 3172 * No need of atomic instructions here, head.next
3170 * can't change from under us until we release the 3173 * can't change from under us until we release the
3171 * anon_vma->root->rwsem. 3174 * anon_vma->root->rwsem.
3172 */ 3175 */
3173 if (!__test_and_clear_bit(0, (unsigned long *) 3176 if (!__test_and_clear_bit(0, (unsigned long *)
3174 &anon_vma->root->rb_root.rb_node)) 3177 &anon_vma->root->rb_root.rb_node))
3175 BUG(); 3178 BUG();
3176 anon_vma_unlock_write(anon_vma); 3179 anon_vma_unlock_write(anon_vma);
3177 } 3180 }
3178 } 3181 }
3179 3182
3180 static void vm_unlock_mapping(struct address_space *mapping) 3183 static void vm_unlock_mapping(struct address_space *mapping)
3181 { 3184 {
3182 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3185 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3183 /* 3186 /*
3184 * AS_MM_ALL_LOCKS can't change to 0 from under us 3187 * AS_MM_ALL_LOCKS can't change to 0 from under us
3185 * because we hold the mm_all_locks_mutex. 3188 * because we hold the mm_all_locks_mutex.
3186 */ 3189 */
3187 i_mmap_unlock_write(mapping); 3190 i_mmap_unlock_write(mapping);
3188 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3191 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3189 &mapping->flags)) 3192 &mapping->flags))
3190 BUG(); 3193 BUG();
3191 } 3194 }
3192 } 3195 }
3193 3196
3194 /* 3197 /*
3195 * The mmap_sem cannot be released by the caller until 3198 * The mmap_sem cannot be released by the caller until
3196 * mm_drop_all_locks() returns. 3199 * mm_drop_all_locks() returns.
3197 */ 3200 */
3198 void mm_drop_all_locks(struct mm_struct *mm) 3201 void mm_drop_all_locks(struct mm_struct *mm)
3199 { 3202 {
3200 struct vm_area_struct *vma; 3203 struct vm_area_struct *vma;
3201 struct anon_vma_chain *avc; 3204 struct anon_vma_chain *avc;
3202 3205
3203 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3206 BUG_ON(down_read_trylock(&mm->mmap_sem));
3204 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 3207 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3205 3208
3206 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3209 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3207 if (vma->anon_vma) 3210 if (vma->anon_vma)
3208 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3211 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3209 vm_unlock_anon_vma(avc->anon_vma); 3212 vm_unlock_anon_vma(avc->anon_vma);
3210 if (vma->vm_file && vma->vm_file->f_mapping) 3213 if (vma->vm_file && vma->vm_file->f_mapping)
3211 vm_unlock_mapping(vma->vm_file->f_mapping); 3214 vm_unlock_mapping(vma->vm_file->f_mapping);
3212 } 3215 }
3213 3216
3214 mutex_unlock(&mm_all_locks_mutex); 3217 mutex_unlock(&mm_all_locks_mutex);
3215 } 3218 }
3216 3219
3217 /* 3220 /*
3218 * initialise the VMA slab 3221 * initialise the VMA slab
3219 */ 3222 */
3220 void __init mmap_init(void) 3223 void __init mmap_init(void)
3221 { 3224 {
3222 int ret; 3225 int ret;
3223 3226
3224 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 3227 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3225 VM_BUG_ON(ret); 3228 VM_BUG_ON(ret);
3226 } 3229 }
3227 3230
3228 /* 3231 /*
3229 * Initialise sysctl_user_reserve_kbytes. 3232 * Initialise sysctl_user_reserve_kbytes.
3230 * 3233 *
3231 * This is intended to prevent a user from starting a single memory hogging 3234 * This is intended to prevent a user from starting a single memory hogging
3232 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER 3235 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
3233 * mode. 3236 * mode.
3234 * 3237 *
3235 * The default value is min(3% of free memory, 128MB) 3238 * The default value is min(3% of free memory, 128MB)
3236 * 128MB is enough to recover with sshd/login, bash, and top/kill. 3239 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3237 */ 3240 */
3238 static int init_user_reserve(void) 3241 static int init_user_reserve(void)
3239 { 3242 {
3240 unsigned long free_kbytes; 3243 unsigned long free_kbytes;
3241 3244
3242 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3245 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3243 3246
3244 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3247 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3245 return 0; 3248 return 0;
3246 } 3249 }
3247 subsys_initcall(init_user_reserve); 3250 subsys_initcall(init_user_reserve);
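For the numbers in the comment above: free_kbytes / 32 is roughly 3% (3.125%) of free memory, and 1UL << 17 kB is 131072 kB, i.e. 128MB, so the cap only bites once free memory exceeds 4GB. A userspace illustration of the same clamp (the memory sizes are example inputs, not measurements):

        #include <stdio.h>

        static unsigned long user_reserve_kbytes(unsigned long free_kbytes)
        {
                unsigned long r = free_kbytes / 32;            /* ~3% of free memory */
                return r < (1UL << 17) ? r : (1UL << 17);      /* capped at 128MB    */
        }

        int main(void)
        {
                printf("2GB free -> %lu kB\n", user_reserve_kbytes(2UL << 20)); /*  65536 */
                printf("8GB free -> %lu kB\n", user_reserve_kbytes(8UL << 20)); /* 131072 */
                return 0;
        }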
3248 3251
3249 /* 3252 /*
3250 * Initialise sysctl_admin_reserve_kbytes. 3253 * Initialise sysctl_admin_reserve_kbytes.
3251 * 3254 *
3252 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin 3255 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3253 * to log in and kill a memory hogging process. 3256 * to log in and kill a memory hogging process.
3254 * 3257 *
3255 * Systems with more than 256MB will reserve 8MB, enough to recover 3258 * Systems with more than 256MB will reserve 8MB, enough to recover
3256 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will 3259 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3257 * only reserve 3% of free pages by default. 3260 * only reserve 3% of free pages by default.
3258 */ 3261 */
3259 static int init_admin_reserve(void) 3262 static int init_admin_reserve(void)
3260 { 3263 {
3261 unsigned long free_kbytes; 3264 unsigned long free_kbytes;
3262 3265
3263 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3266 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3264 3267
3265 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3268 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3266 return 0; 3269 return 0;
3267 } 3270 }
3268 subsys_initcall(init_admin_reserve); 3271 subsys_initcall(init_admin_reserve);
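Same clamp, smaller cap: 1UL << 13 kB is 8192 kB (8MB), and free_kbytes / 32 reaches that exactly at 256MB of free memory, which is where the "more than 256MB" figure in the comment above comes from. Below that point the reserve is ~3% of free memory; above it, it stays pinned at 8MB.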
3269 3272
3270 /* 3273 /*
3271 * Reinitialise user and admin reserves if memory is added or removed. 3274 * Reinitialise user and admin reserves if memory is added or removed.
3272 * 3275 *
3273 * The default user reserve max is 128MB, and the default max for the 3276 * The default user reserve max is 128MB, and the default max for the
3274 * admin reserve is 8MB. These are usually, but not always, enough to 3277 * admin reserve is 8MB. These are usually, but not always, enough to
3275 * enable recovery from a memory hogging process using login/sshd, a shell, 3278 * enable recovery from a memory hogging process using login/sshd, a shell,
3276 * and tools like top. It may make sense to increase or even disable the 3279 * and tools like top. It may make sense to increase or even disable the
3277 * reserve depending on the existence of swap or variations in the recovery 3280 * reserve depending on the existence of swap or variations in the recovery
3278 * tools. So, the admin may have changed them. 3281 * tools. So, the admin may have changed them.
3279 * 3282 *
3280 * If memory is added and the reserves have been eliminated or increased above 3283 * If memory is added and the reserves have been eliminated or increased above
3281 * the default max, then we'll trust the admin. 3284 * the default max, then we'll trust the admin.
3282 * 3285 *
3283 * If memory is removed and there isn't enough free memory, then we 3286 * If memory is removed and there isn't enough free memory, then we
3284 * need to reset the reserves. 3287 * need to reset the reserves.
3285 * 3288 *
3286 * Otherwise keep the reserve set by the admin. 3289 * Otherwise keep the reserve set by the admin.
3287 */ 3290 */
3288 static int reserve_mem_notifier(struct notifier_block *nb, 3291 static int reserve_mem_notifier(struct notifier_block *nb,
3289 unsigned long action, void *data) 3292 unsigned long action, void *data)
3290 { 3293 {
3291 unsigned long tmp, free_kbytes; 3294 unsigned long tmp, free_kbytes;
3292 3295
3293 switch (action) { 3296 switch (action) {
3294 case MEM_ONLINE: 3297 case MEM_ONLINE:
3295 /* Default max is 128MB. Leave alone if modified by operator. */ 3298 /* Default max is 128MB. Leave alone if modified by operator. */
3296 tmp = sysctl_user_reserve_kbytes; 3299 tmp = sysctl_user_reserve_kbytes;
3297 if (0 < tmp && tmp < (1UL << 17)) 3300 if (0 < tmp && tmp < (1UL << 17))
3298 init_user_reserve(); 3301 init_user_reserve();
3299 3302
3300 /* Default max is 8MB. Leave alone if modified by operator. */ 3303 /* Default max is 8MB. Leave alone if modified by operator. */
3301 tmp = sysctl_admin_reserve_kbytes; 3304 tmp = sysctl_admin_reserve_kbytes;
3302 if (0 < tmp && tmp < (1UL << 13)) 3305 if (0 < tmp && tmp < (1UL << 13))
3303 init_admin_reserve(); 3306 init_admin_reserve();
3304 3307
3305 break; 3308 break;
3306 case MEM_OFFLINE: 3309 case MEM_OFFLINE:
3307 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3310 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3308 3311
3309 if (sysctl_user_reserve_kbytes > free_kbytes) { 3312 if (sysctl_user_reserve_kbytes > free_kbytes) {
3310 init_user_reserve(); 3313 init_user_reserve();
3311 pr_info("vm.user_reserve_kbytes reset to %lu\n", 3314 pr_info("vm.user_reserve_kbytes reset to %lu\n",
3312 sysctl_user_reserve_kbytes); 3315 sysctl_user_reserve_kbytes);
3313 } 3316 }
3314 3317
3315 if (sysctl_admin_reserve_kbytes > free_kbytes) { 3318 if (sysctl_admin_reserve_kbytes > free_kbytes) {
3316 init_admin_reserve(); 3319 init_admin_reserve();
3317 pr_info("vm.admin_reserve_kbytes reset to %lu\n", 3320 pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3318 sysctl_admin_reserve_kbytes); 3321 sysctl_admin_reserve_kbytes);
3319 } 3322 }
3320 break; 3323 break;
3321 default: 3324 default:
3322 break; 3325 break;
3323 } 3326 }
3324 return NOTIFY_OK; 3327 return NOTIFY_OK;
3325 } 3328 }
3326 3329
3327 static struct notifier_block reserve_mem_nb = { 3330 static struct notifier_block reserve_mem_nb = {
3328 .notifier_call = reserve_mem_notifier, 3331 .notifier_call = reserve_mem_notifier,
3329 }; 3332 };
3330 3333
3331 static int __meminit init_reserve_notifier(void) 3334 static int __meminit init_reserve_notifier(void)
3332 { 3335 {
3333 if (register_hotmemory_notifier(&reserve_mem_nb)) 3336 if (register_hotmemory_notifier(&reserve_mem_nb))
3334 pr_err("Failed registering memory add/remove notifier for admin reserve\n"); 3337 pr_err("Failed registering memory add/remove notifier for admin reserve\n");
3335 3338
3336 return 0; 3339 return 0;
3337 } 3340 }
3338 subsys_initcall(init_reserve_notifier); 3341 subsys_initcall(init_reserve_notifier);
3339 3342