Commit 690eac53daff34169a4d74fc7bfbd388c4896abb
1 parent
4850d37d3a
Exists in
ti-lsk-linux-4.1.y
and in
10 other branches
mm: Don't count the stack guard page towards RLIMIT_STACK
Commit fee7e49d4514 ("mm: propagate error from stack expansion even for guard page") made sure that we return the error properly for stack growth conditions. It also theorized that counting the guard page towards the stack limit might break something, but also said "Let's see if anybody notices".

Somebody did notice. Apparently android-x86 sets the stack limit very close to the limit indeed, and including the guard page in the rlimit check causes the android 'zygote' process problems.

So this adds the (fairly trivial) code to make the stack rlimit check be against the actual real stack size, rather than the size of the vma that includes the guard page.

Reported-and-tested-by: Chih-Wei Huang <cwhuang@android-x86.org>
Cc: Jay Foad <jay.foad@gmail.com>
Cc: stable@kernel.org # to match back-porting of fee7e49d4514
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 5 additions and 2 deletions Inline Diff
mm/mmap.c
1 | /* | 1 | /* |
2 | * mm/mmap.c | 2 | * mm/mmap.c |
3 | * | 3 | * |
4 | * Written by obz. | 4 | * Written by obz. |
5 | * | 5 | * |
6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> | 6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
10 | 10 | ||
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/backing-dev.h> | 13 | #include <linux/backing-dev.h> |
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/vmacache.h> | 15 | #include <linux/vmacache.h> |
16 | #include <linux/shm.h> | 16 | #include <linux/shm.h> |
17 | #include <linux/mman.h> | 17 | #include <linux/mman.h> |
18 | #include <linux/pagemap.h> | 18 | #include <linux/pagemap.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
21 | #include <linux/capability.h> | 21 | #include <linux/capability.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/file.h> | 23 | #include <linux/file.h> |
24 | #include <linux/fs.h> | 24 | #include <linux/fs.h> |
25 | #include <linux/personality.h> | 25 | #include <linux/personality.h> |
26 | #include <linux/security.h> | 26 | #include <linux/security.h> |
27 | #include <linux/hugetlb.h> | 27 | #include <linux/hugetlb.h> |
28 | #include <linux/profile.h> | 28 | #include <linux/profile.h> |
29 | #include <linux/export.h> | 29 | #include <linux/export.h> |
30 | #include <linux/mount.h> | 30 | #include <linux/mount.h> |
31 | #include <linux/mempolicy.h> | 31 | #include <linux/mempolicy.h> |
32 | #include <linux/rmap.h> | 32 | #include <linux/rmap.h> |
33 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
34 | #include <linux/mmdebug.h> | 34 | #include <linux/mmdebug.h> |
35 | #include <linux/perf_event.h> | 35 | #include <linux/perf_event.h> |
36 | #include <linux/audit.h> | 36 | #include <linux/audit.h> |
37 | #include <linux/khugepaged.h> | 37 | #include <linux/khugepaged.h> |
38 | #include <linux/uprobes.h> | 38 | #include <linux/uprobes.h> |
39 | #include <linux/rbtree_augmented.h> | 39 | #include <linux/rbtree_augmented.h> |
40 | #include <linux/sched/sysctl.h> | 40 | #include <linux/sched/sysctl.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/memory.h> | 42 | #include <linux/memory.h> |
43 | #include <linux/printk.h> | 43 | #include <linux/printk.h> |
44 | 44 | ||
45 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
46 | #include <asm/cacheflush.h> | 46 | #include <asm/cacheflush.h> |
47 | #include <asm/tlb.h> | 47 | #include <asm/tlb.h> |
48 | #include <asm/mmu_context.h> | 48 | #include <asm/mmu_context.h> |
49 | 49 | ||
50 | #include "internal.h" | 50 | #include "internal.h" |
51 | 51 | ||
52 | #ifndef arch_mmap_check | 52 | #ifndef arch_mmap_check |
53 | #define arch_mmap_check(addr, len, flags) (0) | 53 | #define arch_mmap_check(addr, len, flags) (0) |
54 | #endif | 54 | #endif |
55 | 55 | ||
56 | #ifndef arch_rebalance_pgtables | 56 | #ifndef arch_rebalance_pgtables |
57 | #define arch_rebalance_pgtables(addr, len) (addr) | 57 | #define arch_rebalance_pgtables(addr, len) (addr) |
58 | #endif | 58 | #endif |
59 | 59 | ||
60 | static void unmap_region(struct mm_struct *mm, | 60 | static void unmap_region(struct mm_struct *mm, |
61 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 61 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
62 | unsigned long start, unsigned long end); | 62 | unsigned long start, unsigned long end); |
63 | 63 | ||
64 | /* description of effects of mapping type and prot in current implementation. | 64 | /* description of effects of mapping type and prot in current implementation. |
65 | * this is due to the limited x86 page protection hardware. The expected | 65 | * this is due to the limited x86 page protection hardware. The expected |
66 | * behavior is in parens: | 66 | * behavior is in parens: |
67 | * | 67 | * |
68 | * map_type prot | 68 | * map_type prot |
69 | * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC | 69 | * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC |
70 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 70 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
71 | * w: (no) no w: (no) no w: (yes) yes w: (no) no | 71 | * w: (no) no w: (no) no w: (yes) yes w: (no) no |
72 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 72 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
73 | * | 73 | * |
74 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 74 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
75 | * w: (no) no w: (no) no w: (copy) copy w: (no) no | 75 | * w: (no) no w: (no) no w: (copy) copy w: (no) no |
76 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 76 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
77 | * | 77 | * |
78 | */ | 78 | */ |
79 | pgprot_t protection_map[16] = { | 79 | pgprot_t protection_map[16] = { |
80 | __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, | 80 | __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, |
81 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 | 81 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 |
82 | }; | 82 | }; |
83 | 83 | ||
84 | pgprot_t vm_get_page_prot(unsigned long vm_flags) | 84 | pgprot_t vm_get_page_prot(unsigned long vm_flags) |
85 | { | 85 | { |
86 | return __pgprot(pgprot_val(protection_map[vm_flags & | 86 | return __pgprot(pgprot_val(protection_map[vm_flags & |
87 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | | 87 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | |
88 | pgprot_val(arch_vm_get_page_prot(vm_flags))); | 88 | pgprot_val(arch_vm_get_page_prot(vm_flags))); |
89 | } | 89 | } |
90 | EXPORT_SYMBOL(vm_get_page_prot); | 90 | EXPORT_SYMBOL(vm_get_page_prot); |
91 | 91 | ||
92 | static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) | 92 | static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) |
93 | { | 93 | { |
94 | return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); | 94 | return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); |
95 | } | 95 | } |
96 | 96 | ||
97 | /* Update vma->vm_page_prot to reflect vma->vm_flags. */ | 97 | /* Update vma->vm_page_prot to reflect vma->vm_flags. */ |
98 | void vma_set_page_prot(struct vm_area_struct *vma) | 98 | void vma_set_page_prot(struct vm_area_struct *vma) |
99 | { | 99 | { |
100 | unsigned long vm_flags = vma->vm_flags; | 100 | unsigned long vm_flags = vma->vm_flags; |
101 | 101 | ||
102 | vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); | 102 | vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); |
103 | if (vma_wants_writenotify(vma)) { | 103 | if (vma_wants_writenotify(vma)) { |
104 | vm_flags &= ~VM_SHARED; | 104 | vm_flags &= ~VM_SHARED; |
105 | vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, | 105 | vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, |
106 | vm_flags); | 106 | vm_flags); |
107 | } | 107 | } |
108 | } | 108 | } |
109 | 109 | ||
110 | 110 | ||
111 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 111 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
112 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 112 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
113 | unsigned long sysctl_overcommit_kbytes __read_mostly; | 113 | unsigned long sysctl_overcommit_kbytes __read_mostly; |
114 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 114 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
115 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 115 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
116 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | 116 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
117 | /* | 117 | /* |
118 | * Make sure vm_committed_as in one cacheline and not cacheline shared with | 118 | * Make sure vm_committed_as in one cacheline and not cacheline shared with |
119 | * other variables. It can be updated by several CPUs frequently. | 119 | * other variables. It can be updated by several CPUs frequently. |
120 | */ | 120 | */ |
121 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; | 121 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; |
122 | 122 | ||
123 | /* | 123 | /* |
124 | * The global memory commitment made in the system can be a metric | 124 | * The global memory commitment made in the system can be a metric |
125 | * that can be used to drive ballooning decisions when Linux is hosted | 125 | * that can be used to drive ballooning decisions when Linux is hosted |
126 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically | 126 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically |
127 | * balancing memory across competing virtual machines that are hosted. | 127 | * balancing memory across competing virtual machines that are hosted. |
128 | * Several metrics drive this policy engine including the guest reported | 128 | * Several metrics drive this policy engine including the guest reported |
129 | * memory commitment. | 129 | * memory commitment. |
130 | */ | 130 | */ |
131 | unsigned long vm_memory_committed(void) | 131 | unsigned long vm_memory_committed(void) |
132 | { | 132 | { |
133 | return percpu_counter_read_positive(&vm_committed_as); | 133 | return percpu_counter_read_positive(&vm_committed_as); |
134 | } | 134 | } |
135 | EXPORT_SYMBOL_GPL(vm_memory_committed); | 135 | EXPORT_SYMBOL_GPL(vm_memory_committed); |
136 | 136 | ||
137 | /* | 137 | /* |
138 | * Check that a process has enough memory to allocate a new virtual | 138 | * Check that a process has enough memory to allocate a new virtual |
139 | * mapping. 0 means there is enough memory for the allocation to | 139 | * mapping. 0 means there is enough memory for the allocation to |
140 | * succeed and -ENOMEM implies there is not. | 140 | * succeed and -ENOMEM implies there is not. |
141 | * | 141 | * |
142 | * We currently support three overcommit policies, which are set via the | 142 | * We currently support three overcommit policies, which are set via the |
143 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting | 143 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting |
144 | * | 144 | * |
145 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. | 145 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. |
146 | * Additional code 2002 Jul 20 by Robert Love. | 146 | * Additional code 2002 Jul 20 by Robert Love. |
147 | * | 147 | * |
148 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. | 148 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. |
149 | * | 149 | * |
150 | * Note this is a helper function intended to be used by LSMs which | 150 | * Note this is a helper function intended to be used by LSMs which |
151 | * wish to use this logic. | 151 | * wish to use this logic. |
152 | */ | 152 | */ |
153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
154 | { | 154 | { |
155 | unsigned long free, allowed, reserve; | 155 | unsigned long free, allowed, reserve; |
156 | 156 | ||
157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < | 157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < |
158 | -(s64)vm_committed_as_batch * num_online_cpus(), | 158 | -(s64)vm_committed_as_batch * num_online_cpus(), |
159 | "memory commitment underflow"); | 159 | "memory commitment underflow"); |
160 | 160 | ||
161 | vm_acct_memory(pages); | 161 | vm_acct_memory(pages); |
162 | 162 | ||
163 | /* | 163 | /* |
164 | * Sometimes we want to use more memory than we have | 164 | * Sometimes we want to use more memory than we have |
165 | */ | 165 | */ |
166 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) | 166 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) |
167 | return 0; | 167 | return 0; |
168 | 168 | ||
169 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 169 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
170 | free = global_page_state(NR_FREE_PAGES); | 170 | free = global_page_state(NR_FREE_PAGES); |
171 | free += global_page_state(NR_FILE_PAGES); | 171 | free += global_page_state(NR_FILE_PAGES); |
172 | 172 | ||
173 | /* | 173 | /* |
174 | * shmem pages shouldn't be counted as free in this | 174 | * shmem pages shouldn't be counted as free in this |
175 | * case, they can't be purged, only swapped out, and | 175 | * case, they can't be purged, only swapped out, and |
176 | * that won't affect the overall amount of available | 176 | * that won't affect the overall amount of available |
177 | * memory in the system. | 177 | * memory in the system. |
178 | */ | 178 | */ |
179 | free -= global_page_state(NR_SHMEM); | 179 | free -= global_page_state(NR_SHMEM); |
180 | 180 | ||
181 | free += get_nr_swap_pages(); | 181 | free += get_nr_swap_pages(); |
182 | 182 | ||
183 | /* | 183 | /* |
184 | * Any slabs which are created with the | 184 | * Any slabs which are created with the |
185 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents | 185 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents |
186 | * which are reclaimable, under pressure. The dentry | 186 | * which are reclaimable, under pressure. The dentry |
187 | * cache and most inode caches should fall into this | 187 | * cache and most inode caches should fall into this |
188 | */ | 188 | */ |
189 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 189 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
190 | 190 | ||
191 | /* | 191 | /* |
192 | * Leave reserved pages. The pages are not for anonymous pages. | 192 | * Leave reserved pages. The pages are not for anonymous pages. |
193 | */ | 193 | */ |
194 | if (free <= totalreserve_pages) | 194 | if (free <= totalreserve_pages) |
195 | goto error; | 195 | goto error; |
196 | else | 196 | else |
197 | free -= totalreserve_pages; | 197 | free -= totalreserve_pages; |
198 | 198 | ||
199 | /* | 199 | /* |
200 | * Reserve some for root | 200 | * Reserve some for root |
201 | */ | 201 | */ |
202 | if (!cap_sys_admin) | 202 | if (!cap_sys_admin) |
203 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 203 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
204 | 204 | ||
205 | if (free > pages) | 205 | if (free > pages) |
206 | return 0; | 206 | return 0; |
207 | 207 | ||
208 | goto error; | 208 | goto error; |
209 | } | 209 | } |
210 | 210 | ||
211 | allowed = vm_commit_limit(); | 211 | allowed = vm_commit_limit(); |
212 | /* | 212 | /* |
213 | * Reserve some for root | 213 | * Reserve some for root |
214 | */ | 214 | */ |
215 | if (!cap_sys_admin) | 215 | if (!cap_sys_admin) |
216 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 216 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
217 | 217 | ||
218 | /* | 218 | /* |
219 | * Don't let a single process grow so big a user can't recover | 219 | * Don't let a single process grow so big a user can't recover |
220 | */ | 220 | */ |
221 | if (mm) { | 221 | if (mm) { |
222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
223 | allowed -= min(mm->total_vm / 32, reserve); | 223 | allowed -= min(mm->total_vm / 32, reserve); |
224 | } | 224 | } |
225 | 225 | ||
226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
227 | return 0; | 227 | return 0; |
228 | error: | 228 | error: |
229 | vm_unacct_memory(pages); | 229 | vm_unacct_memory(pages); |
230 | 230 | ||
231 | return -ENOMEM; | 231 | return -ENOMEM; |
232 | } | 232 | } |
233 | 233 | ||
234 | /* | 234 | /* |
235 | * Requires inode->i_mapping->i_mmap_rwsem | 235 | * Requires inode->i_mapping->i_mmap_rwsem |
236 | */ | 236 | */ |
237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, | 237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, |
238 | struct file *file, struct address_space *mapping) | 238 | struct file *file, struct address_space *mapping) |
239 | { | 239 | { |
240 | if (vma->vm_flags & VM_DENYWRITE) | 240 | if (vma->vm_flags & VM_DENYWRITE) |
241 | atomic_inc(&file_inode(file)->i_writecount); | 241 | atomic_inc(&file_inode(file)->i_writecount); |
242 | if (vma->vm_flags & VM_SHARED) | 242 | if (vma->vm_flags & VM_SHARED) |
243 | mapping_unmap_writable(mapping); | 243 | mapping_unmap_writable(mapping); |
244 | 244 | ||
245 | flush_dcache_mmap_lock(mapping); | 245 | flush_dcache_mmap_lock(mapping); |
246 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 246 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
247 | list_del_init(&vma->shared.nonlinear); | 247 | list_del_init(&vma->shared.nonlinear); |
248 | else | 248 | else |
249 | vma_interval_tree_remove(vma, &mapping->i_mmap); | 249 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
250 | flush_dcache_mmap_unlock(mapping); | 250 | flush_dcache_mmap_unlock(mapping); |
251 | } | 251 | } |
252 | 252 | ||
253 | /* | 253 | /* |
254 | * Unlink a file-based vm structure from its interval tree, to hide | 254 | * Unlink a file-based vm structure from its interval tree, to hide |
255 | * vma from rmap and vmtruncate before freeing its page tables. | 255 | * vma from rmap and vmtruncate before freeing its page tables. |
256 | */ | 256 | */ |
257 | void unlink_file_vma(struct vm_area_struct *vma) | 257 | void unlink_file_vma(struct vm_area_struct *vma) |
258 | { | 258 | { |
259 | struct file *file = vma->vm_file; | 259 | struct file *file = vma->vm_file; |
260 | 260 | ||
261 | if (file) { | 261 | if (file) { |
262 | struct address_space *mapping = file->f_mapping; | 262 | struct address_space *mapping = file->f_mapping; |
263 | i_mmap_lock_write(mapping); | 263 | i_mmap_lock_write(mapping); |
264 | __remove_shared_vm_struct(vma, file, mapping); | 264 | __remove_shared_vm_struct(vma, file, mapping); |
265 | i_mmap_unlock_write(mapping); | 265 | i_mmap_unlock_write(mapping); |
266 | } | 266 | } |
267 | } | 267 | } |
268 | 268 | ||
269 | /* | 269 | /* |
270 | * Close a vm structure and free it, returning the next. | 270 | * Close a vm structure and free it, returning the next. |
271 | */ | 271 | */ |
272 | static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | 272 | static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) |
273 | { | 273 | { |
274 | struct vm_area_struct *next = vma->vm_next; | 274 | struct vm_area_struct *next = vma->vm_next; |
275 | 275 | ||
276 | might_sleep(); | 276 | might_sleep(); |
277 | if (vma->vm_ops && vma->vm_ops->close) | 277 | if (vma->vm_ops && vma->vm_ops->close) |
278 | vma->vm_ops->close(vma); | 278 | vma->vm_ops->close(vma); |
279 | if (vma->vm_file) | 279 | if (vma->vm_file) |
280 | fput(vma->vm_file); | 280 | fput(vma->vm_file); |
281 | mpol_put(vma_policy(vma)); | 281 | mpol_put(vma_policy(vma)); |
282 | kmem_cache_free(vm_area_cachep, vma); | 282 | kmem_cache_free(vm_area_cachep, vma); |
283 | return next; | 283 | return next; |
284 | } | 284 | } |
285 | 285 | ||
286 | static unsigned long do_brk(unsigned long addr, unsigned long len); | 286 | static unsigned long do_brk(unsigned long addr, unsigned long len); |
287 | 287 | ||
288 | SYSCALL_DEFINE1(brk, unsigned long, brk) | 288 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
289 | { | 289 | { |
290 | unsigned long retval; | 290 | unsigned long retval; |
291 | unsigned long newbrk, oldbrk; | 291 | unsigned long newbrk, oldbrk; |
292 | struct mm_struct *mm = current->mm; | 292 | struct mm_struct *mm = current->mm; |
293 | unsigned long min_brk; | 293 | unsigned long min_brk; |
294 | bool populate; | 294 | bool populate; |
295 | 295 | ||
296 | down_write(&mm->mmap_sem); | 296 | down_write(&mm->mmap_sem); |
297 | 297 | ||
298 | #ifdef CONFIG_COMPAT_BRK | 298 | #ifdef CONFIG_COMPAT_BRK |
299 | /* | 299 | /* |
300 | * CONFIG_COMPAT_BRK can still be overridden by setting | 300 | * CONFIG_COMPAT_BRK can still be overridden by setting |
301 | * randomize_va_space to 2, which will still cause mm->start_brk | 301 | * randomize_va_space to 2, which will still cause mm->start_brk |
302 | * to be arbitrarily shifted | 302 | * to be arbitrarily shifted |
303 | */ | 303 | */ |
304 | if (current->brk_randomized) | 304 | if (current->brk_randomized) |
305 | min_brk = mm->start_brk; | 305 | min_brk = mm->start_brk; |
306 | else | 306 | else |
307 | min_brk = mm->end_data; | 307 | min_brk = mm->end_data; |
308 | #else | 308 | #else |
309 | min_brk = mm->start_brk; | 309 | min_brk = mm->start_brk; |
310 | #endif | 310 | #endif |
311 | if (brk < min_brk) | 311 | if (brk < min_brk) |
312 | goto out; | 312 | goto out; |
313 | 313 | ||
314 | /* | 314 | /* |
315 | * Check against rlimit here. If this check is done later after the test | 315 | * Check against rlimit here. If this check is done later after the test |
316 | * of oldbrk with newbrk then it can escape the test and let the data | 316 | * of oldbrk with newbrk then it can escape the test and let the data |
317 | * segment grow beyond its set limit in the case where the limit is | 317 | * segment grow beyond its set limit in the case where the limit is |
318 | * not page aligned -Ram Gupta | 318 | * not page aligned -Ram Gupta |
319 | */ | 319 | */ |
320 | if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, | 320 | if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, |
321 | mm->end_data, mm->start_data)) | 321 | mm->end_data, mm->start_data)) |
322 | goto out; | 322 | goto out; |
323 | 323 | ||
324 | newbrk = PAGE_ALIGN(brk); | 324 | newbrk = PAGE_ALIGN(brk); |
325 | oldbrk = PAGE_ALIGN(mm->brk); | 325 | oldbrk = PAGE_ALIGN(mm->brk); |
326 | if (oldbrk == newbrk) | 326 | if (oldbrk == newbrk) |
327 | goto set_brk; | 327 | goto set_brk; |
328 | 328 | ||
329 | /* Always allow shrinking brk. */ | 329 | /* Always allow shrinking brk. */ |
330 | if (brk <= mm->brk) { | 330 | if (brk <= mm->brk) { |
331 | if (!do_munmap(mm, newbrk, oldbrk-newbrk)) | 331 | if (!do_munmap(mm, newbrk, oldbrk-newbrk)) |
332 | goto set_brk; | 332 | goto set_brk; |
333 | goto out; | 333 | goto out; |
334 | } | 334 | } |
335 | 335 | ||
336 | /* Check against existing mmap mappings. */ | 336 | /* Check against existing mmap mappings. */ |
337 | if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) | 337 | if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) |
338 | goto out; | 338 | goto out; |
339 | 339 | ||
340 | /* Ok, looks good - let it rip. */ | 340 | /* Ok, looks good - let it rip. */ |
341 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) | 341 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) |
342 | goto out; | 342 | goto out; |
343 | 343 | ||
344 | set_brk: | 344 | set_brk: |
345 | mm->brk = brk; | 345 | mm->brk = brk; |
346 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; | 346 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; |
347 | up_write(&mm->mmap_sem); | 347 | up_write(&mm->mmap_sem); |
348 | if (populate) | 348 | if (populate) |
349 | mm_populate(oldbrk, newbrk - oldbrk); | 349 | mm_populate(oldbrk, newbrk - oldbrk); |
350 | return brk; | 350 | return brk; |
351 | 351 | ||
352 | out: | 352 | out: |
353 | retval = mm->brk; | 353 | retval = mm->brk; |
354 | up_write(&mm->mmap_sem); | 354 | up_write(&mm->mmap_sem); |
355 | return retval; | 355 | return retval; |
356 | } | 356 | } |
357 | 357 | ||
358 | static long vma_compute_subtree_gap(struct vm_area_struct *vma) | 358 | static long vma_compute_subtree_gap(struct vm_area_struct *vma) |
359 | { | 359 | { |
360 | unsigned long max, subtree_gap; | 360 | unsigned long max, subtree_gap; |
361 | max = vma->vm_start; | 361 | max = vma->vm_start; |
362 | if (vma->vm_prev) | 362 | if (vma->vm_prev) |
363 | max -= vma->vm_prev->vm_end; | 363 | max -= vma->vm_prev->vm_end; |
364 | if (vma->vm_rb.rb_left) { | 364 | if (vma->vm_rb.rb_left) { |
365 | subtree_gap = rb_entry(vma->vm_rb.rb_left, | 365 | subtree_gap = rb_entry(vma->vm_rb.rb_left, |
366 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | 366 | struct vm_area_struct, vm_rb)->rb_subtree_gap; |
367 | if (subtree_gap > max) | 367 | if (subtree_gap > max) |
368 | max = subtree_gap; | 368 | max = subtree_gap; |
369 | } | 369 | } |
370 | if (vma->vm_rb.rb_right) { | 370 | if (vma->vm_rb.rb_right) { |
371 | subtree_gap = rb_entry(vma->vm_rb.rb_right, | 371 | subtree_gap = rb_entry(vma->vm_rb.rb_right, |
372 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | 372 | struct vm_area_struct, vm_rb)->rb_subtree_gap; |
373 | if (subtree_gap > max) | 373 | if (subtree_gap > max) |
374 | max = subtree_gap; | 374 | max = subtree_gap; |
375 | } | 375 | } |
376 | return max; | 376 | return max; |
377 | } | 377 | } |
378 | 378 | ||
379 | #ifdef CONFIG_DEBUG_VM_RB | 379 | #ifdef CONFIG_DEBUG_VM_RB |
380 | static int browse_rb(struct rb_root *root) | 380 | static int browse_rb(struct rb_root *root) |
381 | { | 381 | { |
382 | int i = 0, j, bug = 0; | 382 | int i = 0, j, bug = 0; |
383 | struct rb_node *nd, *pn = NULL; | 383 | struct rb_node *nd, *pn = NULL; |
384 | unsigned long prev = 0, pend = 0; | 384 | unsigned long prev = 0, pend = 0; |
385 | 385 | ||
386 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 386 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
387 | struct vm_area_struct *vma; | 387 | struct vm_area_struct *vma; |
388 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 388 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
389 | if (vma->vm_start < prev) { | 389 | if (vma->vm_start < prev) { |
390 | pr_emerg("vm_start %lx < prev %lx\n", | 390 | pr_emerg("vm_start %lx < prev %lx\n", |
391 | vma->vm_start, prev); | 391 | vma->vm_start, prev); |
392 | bug = 1; | 392 | bug = 1; |
393 | } | 393 | } |
394 | if (vma->vm_start < pend) { | 394 | if (vma->vm_start < pend) { |
395 | pr_emerg("vm_start %lx < pend %lx\n", | 395 | pr_emerg("vm_start %lx < pend %lx\n", |
396 | vma->vm_start, pend); | 396 | vma->vm_start, pend); |
397 | bug = 1; | 397 | bug = 1; |
398 | } | 398 | } |
399 | if (vma->vm_start > vma->vm_end) { | 399 | if (vma->vm_start > vma->vm_end) { |
400 | pr_emerg("vm_start %lx > vm_end %lx\n", | 400 | pr_emerg("vm_start %lx > vm_end %lx\n", |
401 | vma->vm_start, vma->vm_end); | 401 | vma->vm_start, vma->vm_end); |
402 | bug = 1; | 402 | bug = 1; |
403 | } | 403 | } |
404 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { | 404 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { |
405 | pr_emerg("free gap %lx, correct %lx\n", | 405 | pr_emerg("free gap %lx, correct %lx\n", |
406 | vma->rb_subtree_gap, | 406 | vma->rb_subtree_gap, |
407 | vma_compute_subtree_gap(vma)); | 407 | vma_compute_subtree_gap(vma)); |
408 | bug = 1; | 408 | bug = 1; |
409 | } | 409 | } |
410 | i++; | 410 | i++; |
411 | pn = nd; | 411 | pn = nd; |
412 | prev = vma->vm_start; | 412 | prev = vma->vm_start; |
413 | pend = vma->vm_end; | 413 | pend = vma->vm_end; |
414 | } | 414 | } |
415 | j = 0; | 415 | j = 0; |
416 | for (nd = pn; nd; nd = rb_prev(nd)) | 416 | for (nd = pn; nd; nd = rb_prev(nd)) |
417 | j++; | 417 | j++; |
418 | if (i != j) { | 418 | if (i != j) { |
419 | pr_emerg("backwards %d, forwards %d\n", j, i); | 419 | pr_emerg("backwards %d, forwards %d\n", j, i); |
420 | bug = 1; | 420 | bug = 1; |
421 | } | 421 | } |
422 | return bug ? -1 : i; | 422 | return bug ? -1 : i; |
423 | } | 423 | } |
424 | 424 | ||
425 | static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) | 425 | static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) |
426 | { | 426 | { |
427 | struct rb_node *nd; | 427 | struct rb_node *nd; |
428 | 428 | ||
429 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 429 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
430 | struct vm_area_struct *vma; | 430 | struct vm_area_struct *vma; |
431 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 431 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
432 | VM_BUG_ON_VMA(vma != ignore && | 432 | VM_BUG_ON_VMA(vma != ignore && |
433 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma), | 433 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma), |
434 | vma); | 434 | vma); |
435 | } | 435 | } |
436 | } | 436 | } |
437 | 437 | ||
438 | static void validate_mm(struct mm_struct *mm) | 438 | static void validate_mm(struct mm_struct *mm) |
439 | { | 439 | { |
440 | int bug = 0; | 440 | int bug = 0; |
441 | int i = 0; | 441 | int i = 0; |
442 | unsigned long highest_address = 0; | 442 | unsigned long highest_address = 0; |
443 | struct vm_area_struct *vma = mm->mmap; | 443 | struct vm_area_struct *vma = mm->mmap; |
444 | 444 | ||
445 | while (vma) { | 445 | while (vma) { |
446 | struct anon_vma_chain *avc; | 446 | struct anon_vma_chain *avc; |
447 | 447 | ||
448 | vma_lock_anon_vma(vma); | 448 | vma_lock_anon_vma(vma); |
449 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 449 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
450 | anon_vma_interval_tree_verify(avc); | 450 | anon_vma_interval_tree_verify(avc); |
451 | vma_unlock_anon_vma(vma); | 451 | vma_unlock_anon_vma(vma); |
452 | highest_address = vma->vm_end; | 452 | highest_address = vma->vm_end; |
453 | vma = vma->vm_next; | 453 | vma = vma->vm_next; |
454 | i++; | 454 | i++; |
455 | } | 455 | } |
456 | if (i != mm->map_count) { | 456 | if (i != mm->map_count) { |
457 | pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); | 457 | pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); |
458 | bug = 1; | 458 | bug = 1; |
459 | } | 459 | } |
460 | if (highest_address != mm->highest_vm_end) { | 460 | if (highest_address != mm->highest_vm_end) { |
461 | pr_emerg("mm->highest_vm_end %lx, found %lx\n", | 461 | pr_emerg("mm->highest_vm_end %lx, found %lx\n", |
462 | mm->highest_vm_end, highest_address); | 462 | mm->highest_vm_end, highest_address); |
463 | bug = 1; | 463 | bug = 1; |
464 | } | 464 | } |
465 | i = browse_rb(&mm->mm_rb); | 465 | i = browse_rb(&mm->mm_rb); |
466 | if (i != mm->map_count) { | 466 | if (i != mm->map_count) { |
467 | if (i != -1) | 467 | if (i != -1) |
468 | pr_emerg("map_count %d rb %d\n", mm->map_count, i); | 468 | pr_emerg("map_count %d rb %d\n", mm->map_count, i); |
469 | bug = 1; | 469 | bug = 1; |
470 | } | 470 | } |
471 | VM_BUG_ON_MM(bug, mm); | 471 | VM_BUG_ON_MM(bug, mm); |
472 | } | 472 | } |
473 | #else | 473 | #else |
/*
 * Stubs used in the #else branch: both validators expand to an empty
 * statement, so call sites need no conditional compilation of their own.
 */
#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
476 | #endif | 476 | #endif |
477 | 477 | ||
/*
 * Instantiate the static vma_gap_callbacks set for the vm_rb node embedded
 * in struct vm_area_struct. The augmented value is the unsigned long
 * rb_subtree_gap field, computed by vma_compute_subtree_gap().
 */
RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
		     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
480 | 480 | ||
481 | /* | 481 | /* |
482 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or | 482 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or |
483 | * vma->vm_prev->vm_end values changed, without modifying the vma's position | 483 | * vma->vm_prev->vm_end values changed, without modifying the vma's position |
484 | * in the rbtree. | 484 | * in the rbtree. |
485 | */ | 485 | */ |
486 | static void vma_gap_update(struct vm_area_struct *vma) | 486 | static void vma_gap_update(struct vm_area_struct *vma) |
487 | { | 487 | { |
488 | /* | 488 | /* |
489 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback | 489 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback |
490 | * function that does exacltly what we want. | 490 | * function that does exacltly what we want. |
491 | */ | 491 | */ |
492 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); | 492 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); |
493 | } | 493 | } |
494 | 494 | ||
495 | static inline void vma_rb_insert(struct vm_area_struct *vma, | 495 | static inline void vma_rb_insert(struct vm_area_struct *vma, |
496 | struct rb_root *root) | 496 | struct rb_root *root) |
497 | { | 497 | { |
498 | /* All rb_subtree_gap values must be consistent prior to insertion */ | 498 | /* All rb_subtree_gap values must be consistent prior to insertion */ |
499 | validate_mm_rb(root, NULL); | 499 | validate_mm_rb(root, NULL); |
500 | 500 | ||
501 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | 501 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
502 | } | 502 | } |
503 | 503 | ||
504 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) | 504 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) |
505 | { | 505 | { |
506 | /* | 506 | /* |
507 | * All rb_subtree_gap values must be consistent prior to erase, | 507 | * All rb_subtree_gap values must be consistent prior to erase, |
508 | * with the possible exception of the vma being erased. | 508 | * with the possible exception of the vma being erased. |
509 | */ | 509 | */ |
510 | validate_mm_rb(root, vma); | 510 | validate_mm_rb(root, vma); |
511 | 511 | ||
512 | /* | 512 | /* |
513 | * Note rb_erase_augmented is a fairly large inline function, | 513 | * Note rb_erase_augmented is a fairly large inline function, |
514 | * so make sure we instantiate it only once with our desired | 514 | * so make sure we instantiate it only once with our desired |
515 | * augmented rbtree callbacks. | 515 | * augmented rbtree callbacks. |
516 | */ | 516 | */ |
517 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | 517 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
518 | } | 518 | } |
519 | 519 | ||
520 | /* | 520 | /* |
521 | * vma has some anon_vma assigned, and is already inserted on that | 521 | * vma has some anon_vma assigned, and is already inserted on that |
522 | * anon_vma's interval trees. | 522 | * anon_vma's interval trees. |
523 | * | 523 | * |
524 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the | 524 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the |
525 | * vma must be removed from the anon_vma's interval trees using | 525 | * vma must be removed from the anon_vma's interval trees using |
526 | * anon_vma_interval_tree_pre_update_vma(). | 526 | * anon_vma_interval_tree_pre_update_vma(). |
527 | * | 527 | * |
528 | * After the update, the vma will be reinserted using | 528 | * After the update, the vma will be reinserted using |
529 | * anon_vma_interval_tree_post_update_vma(). | 529 | * anon_vma_interval_tree_post_update_vma(). |
530 | * | 530 | * |
531 | * The entire update must be protected by exclusive mmap_sem and by | 531 | * The entire update must be protected by exclusive mmap_sem and by |
532 | * the root anon_vma's mutex. | 532 | * the root anon_vma's mutex. |
533 | */ | 533 | */ |
534 | static inline void | 534 | static inline void |
535 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) | 535 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) |
536 | { | 536 | { |
537 | struct anon_vma_chain *avc; | 537 | struct anon_vma_chain *avc; |
538 | 538 | ||
539 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 539 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
540 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); | 540 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); |
541 | } | 541 | } |
542 | 542 | ||
543 | static inline void | 543 | static inline void |
544 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) | 544 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) |
545 | { | 545 | { |
546 | struct anon_vma_chain *avc; | 546 | struct anon_vma_chain *avc; |
547 | 547 | ||
548 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 548 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
549 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); | 549 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); |
550 | } | 550 | } |
551 | 551 | ||
552 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, | 552 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, |
553 | unsigned long end, struct vm_area_struct **pprev, | 553 | unsigned long end, struct vm_area_struct **pprev, |
554 | struct rb_node ***rb_link, struct rb_node **rb_parent) | 554 | struct rb_node ***rb_link, struct rb_node **rb_parent) |
555 | { | 555 | { |
556 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; | 556 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; |
557 | 557 | ||
558 | __rb_link = &mm->mm_rb.rb_node; | 558 | __rb_link = &mm->mm_rb.rb_node; |
559 | rb_prev = __rb_parent = NULL; | 559 | rb_prev = __rb_parent = NULL; |
560 | 560 | ||
561 | while (*__rb_link) { | 561 | while (*__rb_link) { |
562 | struct vm_area_struct *vma_tmp; | 562 | struct vm_area_struct *vma_tmp; |
563 | 563 | ||
564 | __rb_parent = *__rb_link; | 564 | __rb_parent = *__rb_link; |
565 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); | 565 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); |
566 | 566 | ||
567 | if (vma_tmp->vm_end > addr) { | 567 | if (vma_tmp->vm_end > addr) { |
568 | /* Fail if an existing vma overlaps the area */ | 568 | /* Fail if an existing vma overlaps the area */ |
569 | if (vma_tmp->vm_start < end) | 569 | if (vma_tmp->vm_start < end) |
570 | return -ENOMEM; | 570 | return -ENOMEM; |
571 | __rb_link = &__rb_parent->rb_left; | 571 | __rb_link = &__rb_parent->rb_left; |
572 | } else { | 572 | } else { |
573 | rb_prev = __rb_parent; | 573 | rb_prev = __rb_parent; |
574 | __rb_link = &__rb_parent->rb_right; | 574 | __rb_link = &__rb_parent->rb_right; |
575 | } | 575 | } |
576 | } | 576 | } |
577 | 577 | ||
578 | *pprev = NULL; | 578 | *pprev = NULL; |
579 | if (rb_prev) | 579 | if (rb_prev) |
580 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); | 580 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); |
581 | *rb_link = __rb_link; | 581 | *rb_link = __rb_link; |
582 | *rb_parent = __rb_parent; | 582 | *rb_parent = __rb_parent; |
583 | return 0; | 583 | return 0; |
584 | } | 584 | } |
585 | 585 | ||
586 | static unsigned long count_vma_pages_range(struct mm_struct *mm, | 586 | static unsigned long count_vma_pages_range(struct mm_struct *mm, |
587 | unsigned long addr, unsigned long end) | 587 | unsigned long addr, unsigned long end) |
588 | { | 588 | { |
589 | unsigned long nr_pages = 0; | 589 | unsigned long nr_pages = 0; |
590 | struct vm_area_struct *vma; | 590 | struct vm_area_struct *vma; |
591 | 591 | ||
592 | /* Find first overlaping mapping */ | 592 | /* Find first overlaping mapping */ |
593 | vma = find_vma_intersection(mm, addr, end); | 593 | vma = find_vma_intersection(mm, addr, end); |
594 | if (!vma) | 594 | if (!vma) |
595 | return 0; | 595 | return 0; |
596 | 596 | ||
597 | nr_pages = (min(end, vma->vm_end) - | 597 | nr_pages = (min(end, vma->vm_end) - |
598 | max(addr, vma->vm_start)) >> PAGE_SHIFT; | 598 | max(addr, vma->vm_start)) >> PAGE_SHIFT; |
599 | 599 | ||
600 | /* Iterate over the rest of the overlaps */ | 600 | /* Iterate over the rest of the overlaps */ |
601 | for (vma = vma->vm_next; vma; vma = vma->vm_next) { | 601 | for (vma = vma->vm_next; vma; vma = vma->vm_next) { |
602 | unsigned long overlap_len; | 602 | unsigned long overlap_len; |
603 | 603 | ||
604 | if (vma->vm_start > end) | 604 | if (vma->vm_start > end) |
605 | break; | 605 | break; |
606 | 606 | ||
607 | overlap_len = min(end, vma->vm_end) - vma->vm_start; | 607 | overlap_len = min(end, vma->vm_end) - vma->vm_start; |
608 | nr_pages += overlap_len >> PAGE_SHIFT; | 608 | nr_pages += overlap_len >> PAGE_SHIFT; |
609 | } | 609 | } |
610 | 610 | ||
611 | return nr_pages; | 611 | return nr_pages; |
612 | } | 612 | } |
613 | 613 | ||
614 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 614 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
615 | struct rb_node **rb_link, struct rb_node *rb_parent) | 615 | struct rb_node **rb_link, struct rb_node *rb_parent) |
616 | { | 616 | { |
617 | /* Update tracking information for the gap following the new vma. */ | 617 | /* Update tracking information for the gap following the new vma. */ |
618 | if (vma->vm_next) | 618 | if (vma->vm_next) |
619 | vma_gap_update(vma->vm_next); | 619 | vma_gap_update(vma->vm_next); |
620 | else | 620 | else |
621 | mm->highest_vm_end = vma->vm_end; | 621 | mm->highest_vm_end = vma->vm_end; |
622 | 622 | ||
623 | /* | 623 | /* |
624 | * vma->vm_prev wasn't known when we followed the rbtree to find the | 624 | * vma->vm_prev wasn't known when we followed the rbtree to find the |
625 | * correct insertion point for that vma. As a result, we could not | 625 | * correct insertion point for that vma. As a result, we could not |
626 | * update the vma vm_rb parents rb_subtree_gap values on the way down. | 626 | * update the vma vm_rb parents rb_subtree_gap values on the way down. |
627 | * So, we first insert the vma with a zero rb_subtree_gap value | 627 | * So, we first insert the vma with a zero rb_subtree_gap value |
628 | * (to be consistent with what we did on the way down), and then | 628 | * (to be consistent with what we did on the way down), and then |
629 | * immediately update the gap to the correct value. Finally we | 629 | * immediately update the gap to the correct value. Finally we |
630 | * rebalance the rbtree after all augmented values have been set. | 630 | * rebalance the rbtree after all augmented values have been set. |
631 | */ | 631 | */ |
632 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); | 632 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); |
633 | vma->rb_subtree_gap = 0; | 633 | vma->rb_subtree_gap = 0; |
634 | vma_gap_update(vma); | 634 | vma_gap_update(vma); |
635 | vma_rb_insert(vma, &mm->mm_rb); | 635 | vma_rb_insert(vma, &mm->mm_rb); |
636 | } | 636 | } |
637 | 637 | ||
638 | static void __vma_link_file(struct vm_area_struct *vma) | 638 | static void __vma_link_file(struct vm_area_struct *vma) |
639 | { | 639 | { |
640 | struct file *file; | 640 | struct file *file; |
641 | 641 | ||
642 | file = vma->vm_file; | 642 | file = vma->vm_file; |
643 | if (file) { | 643 | if (file) { |
644 | struct address_space *mapping = file->f_mapping; | 644 | struct address_space *mapping = file->f_mapping; |
645 | 645 | ||
646 | if (vma->vm_flags & VM_DENYWRITE) | 646 | if (vma->vm_flags & VM_DENYWRITE) |
647 | atomic_dec(&file_inode(file)->i_writecount); | 647 | atomic_dec(&file_inode(file)->i_writecount); |
648 | if (vma->vm_flags & VM_SHARED) | 648 | if (vma->vm_flags & VM_SHARED) |
649 | atomic_inc(&mapping->i_mmap_writable); | 649 | atomic_inc(&mapping->i_mmap_writable); |
650 | 650 | ||
651 | flush_dcache_mmap_lock(mapping); | 651 | flush_dcache_mmap_lock(mapping); |
652 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 652 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
653 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 653 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
654 | else | 654 | else |
655 | vma_interval_tree_insert(vma, &mapping->i_mmap); | 655 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
656 | flush_dcache_mmap_unlock(mapping); | 656 | flush_dcache_mmap_unlock(mapping); |
657 | } | 657 | } |
658 | } | 658 | } |
659 | 659 | ||
/*
 * Link @vma into both per-mm structures: first the vma list (after @prev),
 * then the rbtree (at @rb_link under @rb_parent). Does not touch
 * mm->map_count or any file mapping.
 */
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	   struct vm_area_struct *prev, struct rb_node **rb_link,
	   struct rb_node *rb_parent)
{
	/* List first: __vma_link_rb() reads vma->vm_next. */
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
}
668 | 668 | ||
669 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 669 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
670 | struct vm_area_struct *prev, struct rb_node **rb_link, | 670 | struct vm_area_struct *prev, struct rb_node **rb_link, |
671 | struct rb_node *rb_parent) | 671 | struct rb_node *rb_parent) |
672 | { | 672 | { |
673 | struct address_space *mapping = NULL; | 673 | struct address_space *mapping = NULL; |
674 | 674 | ||
675 | if (vma->vm_file) { | 675 | if (vma->vm_file) { |
676 | mapping = vma->vm_file->f_mapping; | 676 | mapping = vma->vm_file->f_mapping; |
677 | i_mmap_lock_write(mapping); | 677 | i_mmap_lock_write(mapping); |
678 | } | 678 | } |
679 | 679 | ||
680 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 680 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
681 | __vma_link_file(vma); | 681 | __vma_link_file(vma); |
682 | 682 | ||
683 | if (mapping) | 683 | if (mapping) |
684 | i_mmap_unlock_write(mapping); | 684 | i_mmap_unlock_write(mapping); |
685 | 685 | ||
686 | mm->map_count++; | 686 | mm->map_count++; |
687 | validate_mm(mm); | 687 | validate_mm(mm); |
688 | } | 688 | } |
689 | 689 | ||
690 | /* | 690 | /* |
691 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the | 691 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
692 | * mm's list and rbtree. It has already been inserted into the interval tree. | 692 | * mm's list and rbtree. It has already been inserted into the interval tree. |
693 | */ | 693 | */ |
694 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 694 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
695 | { | 695 | { |
696 | struct vm_area_struct *prev; | 696 | struct vm_area_struct *prev; |
697 | struct rb_node **rb_link, *rb_parent; | 697 | struct rb_node **rb_link, *rb_parent; |
698 | 698 | ||
699 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, | 699 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
700 | &prev, &rb_link, &rb_parent)) | 700 | &prev, &rb_link, &rb_parent)) |
701 | BUG(); | 701 | BUG(); |
702 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 702 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
703 | mm->map_count++; | 703 | mm->map_count++; |
704 | } | 704 | } |
705 | 705 | ||
706 | static inline void | 706 | static inline void |
707 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | 707 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, |
708 | struct vm_area_struct *prev) | 708 | struct vm_area_struct *prev) |
709 | { | 709 | { |
710 | struct vm_area_struct *next; | 710 | struct vm_area_struct *next; |
711 | 711 | ||
712 | vma_rb_erase(vma, &mm->mm_rb); | 712 | vma_rb_erase(vma, &mm->mm_rb); |
713 | prev->vm_next = next = vma->vm_next; | 713 | prev->vm_next = next = vma->vm_next; |
714 | if (next) | 714 | if (next) |
715 | next->vm_prev = prev; | 715 | next->vm_prev = prev; |
716 | 716 | ||
717 | /* Kill the cache */ | 717 | /* Kill the cache */ |
718 | vmacache_invalidate(mm); | 718 | vmacache_invalidate(mm); |
719 | } | 719 | } |
720 | 720 | ||
/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 *
 * @vma:    the vma whose range is being changed
 * @start:  new vm_start for @vma
 * @end:    new vm_end for @vma (may be clamped to next->vm_end when the
 *          following vma is swallowed)
 * @pgoff:  new vm_pgoff for @vma
 * @insert: optional vma to link into the mm as part of the update, or NULL
 *
 * Returns 0 on success, or the error returned by anon_vma_clone().
 */
int vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *next = vma->vm_next;
	struct vm_area_struct *importer = NULL;
	struct address_space *mapping = NULL;
	/* Stays NULL unless vma is file-backed and not VM_NONLINEAR. */
	struct rb_root *root = NULL;
	struct anon_vma *anon_vma = NULL;
	struct file *file = vma->vm_file;
	bool start_changed = false, end_changed = false;
	/* Pages by which next->vm_start moves: > 0 up, < 0 down, 0 unchanged. */
	long adjust_next = 0;
	/* 0: keep next; 1: remove next; 2: remove next and the one after it. */
	int remove_next = 0;

	if (next && !insert) {
		struct vm_area_struct *exporter = NULL;

		if (end >= next->vm_end) {
			/*
			 * vma expands, overlapping all the next, and
			 * perhaps the one after too (mprotect case 6).
			 *
			 * The label is re-entered from the bottom of the
			 * function when remove_next == 2, with next already
			 * advanced to the following vma.
			 */
again:			remove_next = 1 + (end > next->vm_end);
			end = next->vm_end;
			exporter = next;
			importer = vma;
		} else if (end > next->vm_start) {
			/*
			 * vma expands, overlapping part of the next:
			 * mprotect case 5 shifting the boundary up.
			 */
			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
			exporter = next;
			importer = vma;
		} else if (end < vma->vm_end) {
			/*
			 * vma shrinks, and !insert tells it's not
			 * split_vma inserting another: so it must be
			 * mprotect case 4 shifting the boundary down.
			 */
			adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
			exporter = vma;
			importer = next;
		}

		/*
		 * Easily overlooked: when mprotect shifts the boundary,
		 * make sure the expanding vma has anon_vma set if the
		 * shrinking vma had, to cover any anon pages imported.
		 */
		if (exporter && exporter->anon_vma && !importer->anon_vma) {
			int error;

			error = anon_vma_clone(importer, exporter);
			if (error)
				return error;
			importer->anon_vma = exporter->anon_vma;
		}
	}

	if (file) {
		mapping = file->f_mapping;
		if (!(vma->vm_flags & VM_NONLINEAR)) {
			root = &mapping->i_mmap;
			uprobe_munmap(vma, vma->vm_start, vma->vm_end);

			if (adjust_next)
				uprobe_munmap(next, next->vm_start,
							next->vm_end);
		}

		/* Held until the matching i_mmap_unlock_write() below. */
		i_mmap_lock_write(mapping);
		if (insert) {
			/*
			 * Put into interval tree now, so instantiated pages
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(insert);
		}
	}

	vma_adjust_trans_huge(vma, start, end, adjust_next);

	/* Use vma's anon_vma, or next's when only next has one and it moves. */
	anon_vma = vma->anon_vma;
	if (!anon_vma && adjust_next)
		anon_vma = next->anon_vma;
	if (anon_vma) {
		VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
			      anon_vma != next->anon_vma, next);
		anon_vma_lock_write(anon_vma);
		anon_vma_interval_tree_pre_update_vma(vma);
		if (adjust_next)
			anon_vma_interval_tree_pre_update_vma(next);
	}

	/* Take the vmas out of the file interval tree while fields change. */
	if (root) {
		flush_dcache_mmap_lock(mapping);
		vma_interval_tree_remove(vma, root);
		if (adjust_next)
			vma_interval_tree_remove(next, root);
	}

	/* The actual field updates; remember which bounds really moved. */
	if (start != vma->vm_start) {
		vma->vm_start = start;
		start_changed = true;
	}
	if (end != vma->vm_end) {
		vma->vm_end = end;
		end_changed = true;
	}
	vma->vm_pgoff = pgoff;
	if (adjust_next) {
		next->vm_start += adjust_next << PAGE_SHIFT;
		next->vm_pgoff += adjust_next;
	}

	/* Reinsert into the file interval tree with the new bounds. */
	if (root) {
		if (adjust_next)
			vma_interval_tree_insert(next, root);
		vma_interval_tree_insert(vma, root);
		flush_dcache_mmap_unlock(mapping);
	}

	if (remove_next) {
		/*
		 * vma_merge has merged next into vma, and needs
		 * us to remove next before dropping the locks.
		 */
		__vma_unlink(mm, next, vma);
		if (file)
			__remove_shared_vm_struct(next, file, mapping);
	} else if (insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		__insert_vm_struct(mm, insert);
	} else {
		/* Nothing linked or unlinked: only refresh the gap tracking. */
		if (start_changed)
			vma_gap_update(vma);
		if (end_changed) {
			if (!next)
				mm->highest_vm_end = end;
			else if (!adjust_next)
				vma_gap_update(next);
		}
	}

	if (anon_vma) {
		anon_vma_interval_tree_post_update_vma(vma);
		if (adjust_next)
			anon_vma_interval_tree_post_update_vma(next);
		anon_vma_unlock_write(anon_vma);
	}
	if (mapping)
		i_mmap_unlock_write(mapping);

	if (root) {
		uprobe_mmap(vma);

		if (adjust_next)
			uprobe_mmap(next);
	}

	/* Release the resources of the vma that was unlinked above. */
	if (remove_next) {
		if (file) {
			uprobe_munmap(next, next->vm_start, next->vm_end);
			fput(file);
		}
		if (next->anon_vma)
			anon_vma_merge(vma, next);
		mm->map_count--;
		mpol_put(vma_policy(next));
		kmem_cache_free(vm_area_cachep, next);
		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we must remove another next too. It would clutter
		 * up the code too much to do both in one go.
		 */
		next = vma->vm_next;
		if (remove_next == 2)
			goto again;
		else if (next)
			vma_gap_update(next);
		else
			mm->highest_vm_end = end;
	}
	if (insert && file)
		uprobe_mmap(insert);

	validate_mm(mm);

	return 0;
}
925 | 925 | ||
926 | /* | 926 | /* |
927 | * If the vma has a ->close operation then the driver probably needs to release | 927 | * If the vma has a ->close operation then the driver probably needs to release |
928 | * per-vma resources, so we don't attempt to merge those. | 928 | * per-vma resources, so we don't attempt to merge those. |
929 | */ | 929 | */ |
930 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 930 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
931 | struct file *file, unsigned long vm_flags) | 931 | struct file *file, unsigned long vm_flags) |
932 | { | 932 | { |
933 | /* | 933 | /* |
934 | * VM_SOFTDIRTY should not prevent from VMA merging, if we | 934 | * VM_SOFTDIRTY should not prevent from VMA merging, if we |
935 | * match the flags but dirty bit -- the caller should mark | 935 | * match the flags but dirty bit -- the caller should mark |
936 | * merged VMA as dirty. If dirty bit won't be excluded from | 936 | * merged VMA as dirty. If dirty bit won't be excluded from |
937 | * comparison, we increase pressue on the memory system forcing | 937 | * comparison, we increase pressue on the memory system forcing |
938 | * the kernel to generate new VMAs when old one could be | 938 | * the kernel to generate new VMAs when old one could be |
939 | * extended instead. | 939 | * extended instead. |
940 | */ | 940 | */ |
941 | if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) | 941 | if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) |
942 | return 0; | 942 | return 0; |
943 | if (vma->vm_file != file) | 943 | if (vma->vm_file != file) |
944 | return 0; | 944 | return 0; |
945 | if (vma->vm_ops && vma->vm_ops->close) | 945 | if (vma->vm_ops && vma->vm_ops->close) |
946 | return 0; | 946 | return 0; |
947 | return 1; | 947 | return 1; |
948 | } | 948 | } |
949 | 949 | ||
950 | static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, | 950 | static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, |
951 | struct anon_vma *anon_vma2, | 951 | struct anon_vma *anon_vma2, |
952 | struct vm_area_struct *vma) | 952 | struct vm_area_struct *vma) |
953 | { | 953 | { |
954 | /* | 954 | /* |
955 | * The list_is_singular() test is to avoid merging VMA cloned from | 955 | * The list_is_singular() test is to avoid merging VMA cloned from |
956 | * parents. This can improve scalability caused by anon_vma lock. | 956 | * parents. This can improve scalability caused by anon_vma lock. |
957 | */ | 957 | */ |
958 | if ((!anon_vma1 || !anon_vma2) && (!vma || | 958 | if ((!anon_vma1 || !anon_vma2) && (!vma || |
959 | list_is_singular(&vma->anon_vma_chain))) | 959 | list_is_singular(&vma->anon_vma_chain))) |
960 | return 1; | 960 | return 1; |
961 | return anon_vma1 == anon_vma2; | 961 | return anon_vma1 == anon_vma2; |
962 | } | 962 | } |
963 | 963 | ||
964 | /* | 964 | /* |
965 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | 965 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) |
966 | * in front of (at a lower virtual address and file offset than) the vma. | 966 | * in front of (at a lower virtual address and file offset than) the vma. |
967 | * | 967 | * |
968 | * We cannot merge two vmas if they have differently assigned (non-NULL) | 968 | * We cannot merge two vmas if they have differently assigned (non-NULL) |
969 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | 969 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. |
970 | * | 970 | * |
971 | * We don't check here for the merged mmap wrapping around the end of pagecache | 971 | * We don't check here for the merged mmap wrapping around the end of pagecache |
972 | * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which | 972 | * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which |
973 | * wrap, nor mmaps which cover the final page at index -1UL. | 973 | * wrap, nor mmaps which cover the final page at index -1UL. |
974 | */ | 974 | */ |
975 | static int | 975 | static int |
976 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, | 976 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, |
977 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 977 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
978 | { | 978 | { |
979 | if (is_mergeable_vma(vma, file, vm_flags) && | 979 | if (is_mergeable_vma(vma, file, vm_flags) && |
980 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { | 980 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
981 | if (vma->vm_pgoff == vm_pgoff) | 981 | if (vma->vm_pgoff == vm_pgoff) |
982 | return 1; | 982 | return 1; |
983 | } | 983 | } |
984 | return 0; | 984 | return 0; |
985 | } | 985 | } |
986 | 986 | ||
987 | /* | 987 | /* |
988 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | 988 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) |
989 | * beyond (at a higher virtual address and file offset than) the vma. | 989 | * beyond (at a higher virtual address and file offset than) the vma. |
990 | * | 990 | * |
991 | * We cannot merge two vmas if they have differently assigned (non-NULL) | 991 | * We cannot merge two vmas if they have differently assigned (non-NULL) |
992 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | 992 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. |
993 | */ | 993 | */ |
994 | static int | 994 | static int |
995 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | 995 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, |
996 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 996 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
997 | { | 997 | { |
998 | if (is_mergeable_vma(vma, file, vm_flags) && | 998 | if (is_mergeable_vma(vma, file, vm_flags) && |
999 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { | 999 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
1000 | pgoff_t vm_pglen; | 1000 | pgoff_t vm_pglen; |
1001 | vm_pglen = vma_pages(vma); | 1001 | vm_pglen = vma_pages(vma); |
1002 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) | 1002 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) |
1003 | return 1; | 1003 | return 1; |
1004 | } | 1004 | } |
1005 | return 0; | 1005 | return 0; |
1006 | } | 1006 | } |
1007 | 1007 | ||
/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.
 * Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where AAAA is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
 *     AAAA             AAAA                AAAA          AAAA
 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
 *    cannot merge    might become    might become    might become
 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
 *    mremap move:                                    PPPPNNNNNNNN 8
 *        AAAA
 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
 *    might become    case 1 below    case 2 below    case 3 below
 *
 * Odd one out?  Case 8, because it extends NNNN but needs flags of XXXX:
 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
 *
 * Returns the vma that was adjusted to cover the request (prev when the
 * request merged with its predecessor, otherwise the vma that followed
 * prev on entry), or NULL when vm_flags contains VM_SPECIAL, when no
 * merge is possible, or when vma_adjust() reports an error.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
			struct vm_area_struct *prev, unsigned long addr,
			unsigned long end, unsigned long vm_flags,
			struct anon_vma *anon_vma, struct file *file,
			pgoff_t pgoff, struct mempolicy *policy)
{
	/* Length of the request in pages. */
	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
	struct vm_area_struct *area, *next;
	int err;

	/*
	 * We later require that vma->vm_flags == vm_flags,
	 * so this tests vma->vm_flags & VM_SPECIAL, too.
	 */
	if (vm_flags & VM_SPECIAL)
		return NULL;

	/* With no prev, the candidate successor is the first vma of the mm. */
	if (prev)
		next = prev->vm_next;
	else
		next = mm->mmap;
	/* Remember the vma right after prev before next is possibly advanced. */
	area = next;
	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
		next = next->vm_next;

	/*
	 * Can it merge with the predecessor?
	 */
	if (prev && prev->vm_end == addr &&
			mpol_equal(vma_policy(prev), policy) &&
			can_vma_merge_after(prev, vm_flags,
						anon_vma, file, pgoff)) {
		/*
		 * OK, it can.  Can we now merge in the successor as well?
		 */
		if (next && end == next->vm_start &&
				mpol_equal(policy, vma_policy(next)) &&
				can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen) &&
				is_mergeable_anon_vma(prev->anon_vma,
						      next->anon_vma, NULL)) {
							/* cases 1, 6 */
			/* Stretch prev over the request and all of next. */
			err = vma_adjust(prev, prev->vm_start,
				next->vm_end, prev->vm_pgoff, NULL);
		} else					/* cases 2, 5, 7 */
			/* Stretch prev up to the end of the request only. */
			err = vma_adjust(prev, prev->vm_start,
				end, prev->vm_pgoff, NULL);
		if (err)
			return NULL;
		khugepaged_enter_vma_merge(prev, vm_flags);
		return prev;
	}

	/*
	 * Can this new request be merged in front of next?
	 */
	if (next && end == next->vm_start &&
			mpol_equal(policy, vma_policy(next)) &&
			can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen)) {
		if (prev && addr < prev->vm_end)	/* case 4 */
			/* The request overlaps the tail of prev: end prev at addr. */
			err = vma_adjust(prev, prev->vm_start,
				addr, prev->vm_pgoff, NULL);
		else					/* cases 3, 8 */
			/* Start area at addr, pulling its page offset back by pglen. */
			err = vma_adjust(area, addr, next->vm_end,
				next->vm_pgoff - pglen, NULL);
		if (err)
			return NULL;
		khugepaged_enter_vma_merge(area, vm_flags);
		return area;
	}

	return NULL;
}
1111 | 1111 | ||
1112 | /* | 1112 | /* |
1113 | * Rough compatbility check to quickly see if it's even worth looking | 1113 | * Rough compatbility check to quickly see if it's even worth looking |
1114 | * at sharing an anon_vma. | 1114 | * at sharing an anon_vma. |
1115 | * | 1115 | * |
1116 | * They need to have the same vm_file, and the flags can only differ | 1116 | * They need to have the same vm_file, and the flags can only differ |
1117 | * in things that mprotect may change. | 1117 | * in things that mprotect may change. |
1118 | * | 1118 | * |
1119 | * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that | 1119 | * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that |
1120 | * we can merge the two vma's. For example, we refuse to merge a vma if | 1120 | * we can merge the two vma's. For example, we refuse to merge a vma if |
1121 | * there is a vm_ops->close() function, because that indicates that the | 1121 | * there is a vm_ops->close() function, because that indicates that the |
1122 | * driver is doing some kind of reference counting. But that doesn't | 1122 | * driver is doing some kind of reference counting. But that doesn't |
1123 | * really matter for the anon_vma sharing case. | 1123 | * really matter for the anon_vma sharing case. |
1124 | */ | 1124 | */ |
1125 | static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) | 1125 | static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) |
1126 | { | 1126 | { |
1127 | return a->vm_end == b->vm_start && | 1127 | return a->vm_end == b->vm_start && |
1128 | mpol_equal(vma_policy(a), vma_policy(b)) && | 1128 | mpol_equal(vma_policy(a), vma_policy(b)) && |
1129 | a->vm_file == b->vm_file && | 1129 | a->vm_file == b->vm_file && |
1130 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && | 1130 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && |
1131 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); | 1131 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); |
1132 | } | 1132 | } |
1133 | 1133 | ||
1134 | /* | 1134 | /* |
1135 | * Do some basic sanity checking to see if we can re-use the anon_vma | 1135 | * Do some basic sanity checking to see if we can re-use the anon_vma |
1136 | * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be | 1136 | * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be |
1137 | * the same as 'old', the other will be the new one that is trying | 1137 | * the same as 'old', the other will be the new one that is trying |
1138 | * to share the anon_vma. | 1138 | * to share the anon_vma. |
1139 | * | 1139 | * |
1140 | * NOTE! This runs with mm_sem held for reading, so it is possible that | 1140 | * NOTE! This runs with mm_sem held for reading, so it is possible that |
1141 | * the anon_vma of 'old' is concurrently in the process of being set up | 1141 | * the anon_vma of 'old' is concurrently in the process of being set up |
1142 | * by another page fault trying to merge _that_. But that's ok: if it | 1142 | * by another page fault trying to merge _that_. But that's ok: if it |
1143 | * is being set up, that automatically means that it will be a singleton | 1143 | * is being set up, that automatically means that it will be a singleton |
1144 | * acceptable for merging, so we can do all of this optimistically. But | 1144 | * acceptable for merging, so we can do all of this optimistically. But |
1145 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. | 1145 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. |
1146 | * | 1146 | * |
1147 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only | 1147 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only |
1148 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid | 1148 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid |
1149 | * is to return an anon_vma that is "complex" due to having gone through | 1149 | * is to return an anon_vma that is "complex" due to having gone through |
1150 | * a fork). | 1150 | * a fork). |
1151 | * | 1151 | * |
1152 | * We also make sure that the two vma's are compatible (adjacent, | 1152 | * We also make sure that the two vma's are compatible (adjacent, |
1153 | * and with the same memory policies). That's all stable, even with just | 1153 | * and with the same memory policies). That's all stable, even with just |
1154 | * a read lock on the mm_sem. | 1154 | * a read lock on the mm_sem. |
1155 | */ | 1155 | */ |
1156 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) | 1156 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) |
1157 | { | 1157 | { |
1158 | if (anon_vma_compatible(a, b)) { | 1158 | if (anon_vma_compatible(a, b)) { |
1159 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); | 1159 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); |
1160 | 1160 | ||
1161 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) | 1161 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) |
1162 | return anon_vma; | 1162 | return anon_vma; |
1163 | } | 1163 | } |
1164 | return NULL; | 1164 | return NULL; |
1165 | } | 1165 | } |
1166 | 1166 | ||
1167 | /* | 1167 | /* |
1168 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check | 1168 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check |
1169 | * neighbouring vmas for a suitable anon_vma, before it goes off | 1169 | * neighbouring vmas for a suitable anon_vma, before it goes off |
1170 | * to allocate a new anon_vma. It checks because a repetitive | 1170 | * to allocate a new anon_vma. It checks because a repetitive |
1171 | * sequence of mprotects and faults may otherwise lead to distinct | 1171 | * sequence of mprotects and faults may otherwise lead to distinct |
1172 | * anon_vmas being allocated, preventing vma merge in subsequent | 1172 | * anon_vmas being allocated, preventing vma merge in subsequent |
1173 | * mprotect. | 1173 | * mprotect. |
1174 | */ | 1174 | */ |
1175 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | 1175 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) |
1176 | { | 1176 | { |
1177 | struct anon_vma *anon_vma; | 1177 | struct anon_vma *anon_vma; |
1178 | struct vm_area_struct *near; | 1178 | struct vm_area_struct *near; |
1179 | 1179 | ||
1180 | near = vma->vm_next; | 1180 | near = vma->vm_next; |
1181 | if (!near) | 1181 | if (!near) |
1182 | goto try_prev; | 1182 | goto try_prev; |
1183 | 1183 | ||
1184 | anon_vma = reusable_anon_vma(near, vma, near); | 1184 | anon_vma = reusable_anon_vma(near, vma, near); |
1185 | if (anon_vma) | 1185 | if (anon_vma) |
1186 | return anon_vma; | 1186 | return anon_vma; |
1187 | try_prev: | 1187 | try_prev: |
1188 | near = vma->vm_prev; | 1188 | near = vma->vm_prev; |
1189 | if (!near) | 1189 | if (!near) |
1190 | goto none; | 1190 | goto none; |
1191 | 1191 | ||
1192 | anon_vma = reusable_anon_vma(near, near, vma); | 1192 | anon_vma = reusable_anon_vma(near, near, vma); |
1193 | if (anon_vma) | 1193 | if (anon_vma) |
1194 | return anon_vma; | 1194 | return anon_vma; |
1195 | none: | 1195 | none: |
1196 | /* | 1196 | /* |
1197 | * There's no absolute need to look only at touching neighbours: | 1197 | * There's no absolute need to look only at touching neighbours: |
1198 | * we could search further afield for "compatible" anon_vmas. | 1198 | * we could search further afield for "compatible" anon_vmas. |
1199 | * But it would probably just be a waste of time searching, | 1199 | * But it would probably just be a waste of time searching, |
1200 | * or lead to too many vmas hanging off the same anon_vma. | 1200 | * or lead to too many vmas hanging off the same anon_vma. |
1201 | * We're trying to allow mprotect remerging later on, | 1201 | * We're trying to allow mprotect remerging later on, |
1202 | * not trying to minimize memory used for anon_vmas. | 1202 | * not trying to minimize memory used for anon_vmas. |
1203 | */ | 1203 | */ |
1204 | return NULL; | 1204 | return NULL; |
1205 | } | 1205 | } |
1206 | 1206 | ||
#ifdef CONFIG_PROC_FS
/*
 * Adjust the per-mm page counters by @pages (which may be negative).
 *
 * total_vm always changes.  For a file-backed range, shared_vm changes
 * too, and exec_vm as well when the flags carry VM_EXEC without
 * VM_WRITE.  For a range without a file, stack_vm changes when the
 * flags contain the grow direction bit used by VM_STACK_FLAGS.
 */
void vm_stat_account(struct mm_struct *mm, unsigned long flags,
						struct file *file, long pages)
{
	const unsigned long grow_mask = VM_GROWSUP|VM_GROWSDOWN;
	const unsigned long stack_bits = VM_STACK_FLAGS & grow_mask;

	mm->total_vm += pages;

	if (!file) {
		if (flags & stack_bits)
			mm->stack_vm += pages;
		return;
	}

	mm->shared_vm += pages;
	if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
		mm->exec_vm += pages;
}
#endif /* CONFIG_PROC_FS */
1224 | 1224 | ||
1225 | /* | 1225 | /* |
1226 | * If a hint addr is less than mmap_min_addr change hint to be as | 1226 | * If a hint addr is less than mmap_min_addr change hint to be as |
1227 | * low as possible but still greater than mmap_min_addr | 1227 | * low as possible but still greater than mmap_min_addr |
1228 | */ | 1228 | */ |
1229 | static inline unsigned long round_hint_to_min(unsigned long hint) | 1229 | static inline unsigned long round_hint_to_min(unsigned long hint) |
1230 | { | 1230 | { |
1231 | hint &= PAGE_MASK; | 1231 | hint &= PAGE_MASK; |
1232 | if (((void *)hint != NULL) && | 1232 | if (((void *)hint != NULL) && |
1233 | (hint < mmap_min_addr)) | 1233 | (hint < mmap_min_addr)) |
1234 | return PAGE_ALIGN(mmap_min_addr); | 1234 | return PAGE_ALIGN(mmap_min_addr); |
1235 | return hint; | 1235 | return hint; |
1236 | } | 1236 | } |
1237 | 1237 | ||
1238 | static inline int mlock_future_check(struct mm_struct *mm, | 1238 | static inline int mlock_future_check(struct mm_struct *mm, |
1239 | unsigned long flags, | 1239 | unsigned long flags, |
1240 | unsigned long len) | 1240 | unsigned long len) |
1241 | { | 1241 | { |
1242 | unsigned long locked, lock_limit; | 1242 | unsigned long locked, lock_limit; |
1243 | 1243 | ||
1244 | /* mlock MCL_FUTURE? */ | 1244 | /* mlock MCL_FUTURE? */ |
1245 | if (flags & VM_LOCKED) { | 1245 | if (flags & VM_LOCKED) { |
1246 | locked = len >> PAGE_SHIFT; | 1246 | locked = len >> PAGE_SHIFT; |
1247 | locked += mm->locked_vm; | 1247 | locked += mm->locked_vm; |
1248 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 1248 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
1249 | lock_limit >>= PAGE_SHIFT; | 1249 | lock_limit >>= PAGE_SHIFT; |
1250 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 1250 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
1251 | return -EAGAIN; | 1251 | return -EAGAIN; |
1252 | } | 1252 | } |
1253 | return 0; | 1253 | return 0; |
1254 | } | 1254 | } |
1255 | 1255 | ||
/*
 * Validate an mmap request, translate prot/flags into vm_flags and hand
 * the mapping over to mmap_region().
 *
 * Returns the address produced by mmap_region(), or an error value
 * (a negative errno stored in the unsigned long return) when one of the
 * checks fails.  *populate is reset to 0 on entry and set to the
 * page-aligned length when the mapping succeeded and is either
 * VM_LOCKED or was requested with MAP_POPULATE without MAP_NONBLOCK.
 *
 * The caller must hold down_write(&current->mm->mmap_sem).
 */

unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate)
{
	struct mm_struct *mm = current->mm;
	vm_flags_t vm_flags;

	*populate = 0;

	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we dont add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
			prot |= PROT_EXEC;

	if (!len)
		return -EINVAL;

	/* Without MAP_FIXED the address is only a hint and may be raised. */
	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* Careful about overflows.. */
	len = PAGE_ALIGN(len);
	/* A non-zero len that aligns to 0 wrapped around. */
	if (!len)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	/* Too many mappings? */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	/*
	 * A result that is not page aligned is passed back to the caller
	 * unchanged (presumably an error code from get_unmapped_area()).
	 */
	if (addr & ~PAGE_MASK)
		return addr;

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	if (flags & MAP_LOCKED)
		if (!can_do_mlock())
			return -EPERM;

	if (mlock_future_check(mm, vm_flags, len))
		return -EAGAIN;

	if (file) {
		struct inode *inode = file_inode(file);

		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure we don't allow writing to an append-only
			 * file..
			 */
			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure there are no mandatory locks on the file.
			 */
			if (locks_verify_locked(file))
				return -EAGAIN;

			vm_flags |= VM_SHARED | VM_MAYSHARE;
			/* A file not opened for writing loses VM_MAYWRITE and VM_SHARED. */
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

			/* fall through */
		case MAP_PRIVATE:
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;
			/* On a noexec mount, refuse VM_EXEC and drop VM_MAYEXEC. */
			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
				if (vm_flags & VM_EXEC)
					return -EPERM;
				vm_flags &= ~VM_MAYEXEC;
			}

			if (!file->f_op->mmap)
				return -ENODEV;
			/* File mappings may not be growable in either direction. */
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			break;

		default:
			return -EINVAL;
		}
	} else {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			/*
			 * Ignore pgoff.
			 */
			pgoff = 0;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:
			/*
			 * Set pgoff according to addr for anon_vma.
			 */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Set 'VM_NORESERVE' if we should not account for the
	 * memory use of this mapping.
	 */
	if (flags & MAP_NORESERVE) {
		/* We honor MAP_NORESERVE if allowed to overcommit */
		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			vm_flags |= VM_NORESERVE;

		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
		if (file && is_file_hugepages(file))
			vm_flags |= VM_NORESERVE;
	}

	addr = mmap_region(file, addr, len, vm_flags, pgoff);
	/*
	 * On success, report how much to populate when the mapping is locked
	 * or MAP_POPULATE was given without MAP_NONBLOCK.
	 */
	if (!IS_ERR_VALUE(addr) &&
	    ((vm_flags & VM_LOCKED) ||
	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
		*populate = len;
	return addr;
}
1407 | 1407 | ||
1408 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1408 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
1409 | unsigned long, prot, unsigned long, flags, | 1409 | unsigned long, prot, unsigned long, flags, |
1410 | unsigned long, fd, unsigned long, pgoff) | 1410 | unsigned long, fd, unsigned long, pgoff) |
1411 | { | 1411 | { |
1412 | struct file *file = NULL; | 1412 | struct file *file = NULL; |
1413 | unsigned long retval = -EBADF; | 1413 | unsigned long retval = -EBADF; |
1414 | 1414 | ||
1415 | if (!(flags & MAP_ANONYMOUS)) { | 1415 | if (!(flags & MAP_ANONYMOUS)) { |
1416 | audit_mmap_fd(fd, flags); | 1416 | audit_mmap_fd(fd, flags); |
1417 | file = fget(fd); | 1417 | file = fget(fd); |
1418 | if (!file) | 1418 | if (!file) |
1419 | goto out; | 1419 | goto out; |
1420 | if (is_file_hugepages(file)) | 1420 | if (is_file_hugepages(file)) |
1421 | len = ALIGN(len, huge_page_size(hstate_file(file))); | 1421 | len = ALIGN(len, huge_page_size(hstate_file(file))); |
1422 | retval = -EINVAL; | 1422 | retval = -EINVAL; |
1423 | if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) | 1423 | if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) |
1424 | goto out_fput; | 1424 | goto out_fput; |
1425 | } else if (flags & MAP_HUGETLB) { | 1425 | } else if (flags & MAP_HUGETLB) { |
1426 | struct user_struct *user = NULL; | 1426 | struct user_struct *user = NULL; |
1427 | struct hstate *hs; | 1427 | struct hstate *hs; |
1428 | 1428 | ||
1429 | hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); | 1429 | hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); |
1430 | if (!hs) | 1430 | if (!hs) |
1431 | return -EINVAL; | 1431 | return -EINVAL; |
1432 | 1432 | ||
1433 | len = ALIGN(len, huge_page_size(hs)); | 1433 | len = ALIGN(len, huge_page_size(hs)); |
1434 | /* | 1434 | /* |
1435 | * VM_NORESERVE is used because the reservations will be | 1435 | * VM_NORESERVE is used because the reservations will be |
1436 | * taken when vm_ops->mmap() is called | 1436 | * taken when vm_ops->mmap() is called |
1437 | * A dummy user value is used because we are not locking | 1437 | * A dummy user value is used because we are not locking |
1438 | * memory so no accounting is necessary | 1438 | * memory so no accounting is necessary |
1439 | */ | 1439 | */ |
1440 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, | 1440 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, |
1441 | VM_NORESERVE, | 1441 | VM_NORESERVE, |
1442 | &user, HUGETLB_ANONHUGE_INODE, | 1442 | &user, HUGETLB_ANONHUGE_INODE, |
1443 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); | 1443 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); |
1444 | if (IS_ERR(file)) | 1444 | if (IS_ERR(file)) |
1445 | return PTR_ERR(file); | 1445 | return PTR_ERR(file); |
1446 | } | 1446 | } |
1447 | 1447 | ||
1448 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1448 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1449 | 1449 | ||
1450 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); | 1450 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1451 | out_fput: | 1451 | out_fput: |
1452 | if (file) | 1452 | if (file) |
1453 | fput(file); | 1453 | fput(file); |
1454 | out: | 1454 | out: |
1455 | return retval; | 1455 | return retval; |
1456 | } | 1456 | } |
1457 | 1457 | ||
#ifdef __ARCH_WANT_SYS_OLD_MMAP
/* Argument block the legacy mmap entry point copies in from user space. */
struct mmap_arg_struct {
	unsigned long addr;
	unsigned long len;
	unsigned long prot;
	unsigned long flags;
	unsigned long fd;
	unsigned long offset;
};

/*
 * old_mmap() - legacy mmap entry point taking all arguments in one struct.
 * @arg: user pointer to a struct mmap_arg_struct
 *
 * Returns -EFAULT if the struct cannot be copied in, -EINVAL if the byte
 * offset is not page aligned, otherwise the result of sys_mmap_pgoff().
 */
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
	struct mmap_arg_struct args;
	unsigned long pgoff;

	if (copy_from_user(&args, arg, sizeof(args)) != 0)
		return -EFAULT;

	/* Reject byte offsets with bits set below the page boundary. */
	if ((args.offset & ~PAGE_MASK) != 0)
		return -EINVAL;

	/* Convert the byte offset into the page offset sys_mmap_pgoff() takes. */
	pgoff = args.offset >> PAGE_SHIFT;

	return sys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
			      args.fd, pgoff);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1481 | 1481 | ||
1482 | /* | 1482 | /* |
1483 | * Some shared mappigns will want the pages marked read-only | 1483 | * Some shared mappigns will want the pages marked read-only |
1484 | * to track write events. If so, we'll downgrade vm_page_prot | 1484 | * to track write events. If so, we'll downgrade vm_page_prot |
1485 | * to the private version (using protection_map[] without the | 1485 | * to the private version (using protection_map[] without the |
1486 | * VM_SHARED bit). | 1486 | * VM_SHARED bit). |
1487 | */ | 1487 | */ |
1488 | int vma_wants_writenotify(struct vm_area_struct *vma) | 1488 | int vma_wants_writenotify(struct vm_area_struct *vma) |
1489 | { | 1489 | { |
1490 | vm_flags_t vm_flags = vma->vm_flags; | 1490 | vm_flags_t vm_flags = vma->vm_flags; |
1491 | 1491 | ||
1492 | /* If it was private or non-writable, the write bit is already clear */ | 1492 | /* If it was private or non-writable, the write bit is already clear */ |
1493 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) | 1493 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) |
1494 | return 0; | 1494 | return 0; |
1495 | 1495 | ||
1496 | /* The backer wishes to know when pages are first written to? */ | 1496 | /* The backer wishes to know when pages are first written to? */ |
1497 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | 1497 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) |
1498 | return 1; | 1498 | return 1; |
1499 | 1499 | ||
1500 | /* The open routine did something to the protections that pgprot_modify | 1500 | /* The open routine did something to the protections that pgprot_modify |
1501 | * won't preserve? */ | 1501 | * won't preserve? */ |
1502 | if (pgprot_val(vma->vm_page_prot) != | 1502 | if (pgprot_val(vma->vm_page_prot) != |
1503 | pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) | 1503 | pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) |
1504 | return 0; | 1504 | return 0; |
1505 | 1505 | ||
1506 | /* Do we need to track softdirty? */ | 1506 | /* Do we need to track softdirty? */ |
1507 | if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) | 1507 | if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) |
1508 | return 1; | 1508 | return 1; |
1509 | 1509 | ||
1510 | /* Specialty mapping? */ | 1510 | /* Specialty mapping? */ |
1511 | if (vm_flags & VM_PFNMAP) | 1511 | if (vm_flags & VM_PFNMAP) |
1512 | return 0; | 1512 | return 0; |
1513 | 1513 | ||
1514 | /* Can the mapping track the dirty pages? */ | 1514 | /* Can the mapping track the dirty pages? */ |
1515 | return vma->vm_file && vma->vm_file->f_mapping && | 1515 | return vma->vm_file && vma->vm_file->f_mapping && |
1516 | mapping_cap_account_dirty(vma->vm_file->f_mapping); | 1516 | mapping_cap_account_dirty(vma->vm_file->f_mapping); |
1517 | } | 1517 | } |
1518 | 1518 | ||
1519 | /* | 1519 | /* |
1520 | * We account for memory if it's a private writeable mapping, | 1520 | * We account for memory if it's a private writeable mapping, |
1521 | * not hugepages and VM_NORESERVE wasn't set. | 1521 | * not hugepages and VM_NORESERVE wasn't set. |
1522 | */ | 1522 | */ |
1523 | static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) | 1523 | static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) |
1524 | { | 1524 | { |
1525 | /* | 1525 | /* |
1526 | * hugetlb has its own accounting separate from the core VM | 1526 | * hugetlb has its own accounting separate from the core VM |
1527 | * VM_HUGETLB may not be set yet so we cannot check for that flag. | 1527 | * VM_HUGETLB may not be set yet so we cannot check for that flag. |
1528 | */ | 1528 | */ |
1529 | if (file && is_file_hugepages(file)) | 1529 | if (file && is_file_hugepages(file)) |
1530 | return 0; | 1530 | return 0; |
1531 | 1531 | ||
1532 | return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; | 1532 | return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; |
1533 | } | 1533 | } |
1534 | 1534 | ||
1535 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1535 | unsigned long mmap_region(struct file *file, unsigned long addr, |
1536 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) | 1536 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) |
1537 | { | 1537 | { |
1538 | struct mm_struct *mm = current->mm; | 1538 | struct mm_struct *mm = current->mm; |
1539 | struct vm_area_struct *vma, *prev; | 1539 | struct vm_area_struct *vma, *prev; |
1540 | int error; | 1540 | int error; |
1541 | struct rb_node **rb_link, *rb_parent; | 1541 | struct rb_node **rb_link, *rb_parent; |
1542 | unsigned long charged = 0; | 1542 | unsigned long charged = 0; |
1543 | 1543 | ||
1544 | /* Check against address space limit. */ | 1544 | /* Check against address space limit. */ |
1545 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { | 1545 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { |
1546 | unsigned long nr_pages; | 1546 | unsigned long nr_pages; |
1547 | 1547 | ||
1548 | /* | 1548 | /* |
1549 | * MAP_FIXED may remove pages of mappings that intersects with | 1549 | * MAP_FIXED may remove pages of mappings that intersects with |
1550 | * requested mapping. Account for the pages it would unmap. | 1550 | * requested mapping. Account for the pages it would unmap. |
1551 | */ | 1551 | */ |
1552 | if (!(vm_flags & MAP_FIXED)) | 1552 | if (!(vm_flags & MAP_FIXED)) |
1553 | return -ENOMEM; | 1553 | return -ENOMEM; |
1554 | 1554 | ||
1555 | nr_pages = count_vma_pages_range(mm, addr, addr + len); | 1555 | nr_pages = count_vma_pages_range(mm, addr, addr + len); |
1556 | 1556 | ||
1557 | if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) | 1557 | if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) |
1558 | return -ENOMEM; | 1558 | return -ENOMEM; |
1559 | } | 1559 | } |
1560 | 1560 | ||
1561 | /* Clear old maps */ | 1561 | /* Clear old maps */ |
1562 | error = -ENOMEM; | 1562 | error = -ENOMEM; |
1563 | munmap_back: | 1563 | munmap_back: |
1564 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 1564 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
1565 | if (do_munmap(mm, addr, len)) | 1565 | if (do_munmap(mm, addr, len)) |
1566 | return -ENOMEM; | 1566 | return -ENOMEM; |
1567 | goto munmap_back; | 1567 | goto munmap_back; |
1568 | } | 1568 | } |
1569 | 1569 | ||
1570 | /* | 1570 | /* |
1571 | * Private writable mapping: check memory availability | 1571 | * Private writable mapping: check memory availability |
1572 | */ | 1572 | */ |
1573 | if (accountable_mapping(file, vm_flags)) { | 1573 | if (accountable_mapping(file, vm_flags)) { |
1574 | charged = len >> PAGE_SHIFT; | 1574 | charged = len >> PAGE_SHIFT; |
1575 | if (security_vm_enough_memory_mm(mm, charged)) | 1575 | if (security_vm_enough_memory_mm(mm, charged)) |
1576 | return -ENOMEM; | 1576 | return -ENOMEM; |
1577 | vm_flags |= VM_ACCOUNT; | 1577 | vm_flags |= VM_ACCOUNT; |
1578 | } | 1578 | } |
1579 | 1579 | ||
1580 | /* | 1580 | /* |
1581 | * Can we just expand an old mapping? | 1581 | * Can we just expand an old mapping? |
1582 | */ | 1582 | */ |
1583 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); | 1583 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); |
1584 | if (vma) | 1584 | if (vma) |
1585 | goto out; | 1585 | goto out; |
1586 | 1586 | ||
1587 | /* | 1587 | /* |
1588 | * Determine the object being mapped and call the appropriate | 1588 | * Determine the object being mapped and call the appropriate |
1589 | * specific mapper. the address has already been validated, but | 1589 | * specific mapper. the address has already been validated, but |
1590 | * not unmapped, but the maps are removed from the list. | 1590 | * not unmapped, but the maps are removed from the list. |
1591 | */ | 1591 | */ |
1592 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 1592 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
1593 | if (!vma) { | 1593 | if (!vma) { |
1594 | error = -ENOMEM; | 1594 | error = -ENOMEM; |
1595 | goto unacct_error; | 1595 | goto unacct_error; |
1596 | } | 1596 | } |
1597 | 1597 | ||
1598 | vma->vm_mm = mm; | 1598 | vma->vm_mm = mm; |
1599 | vma->vm_start = addr; | 1599 | vma->vm_start = addr; |
1600 | vma->vm_end = addr + len; | 1600 | vma->vm_end = addr + len; |
1601 | vma->vm_flags = vm_flags; | 1601 | vma->vm_flags = vm_flags; |
1602 | vma->vm_page_prot = vm_get_page_prot(vm_flags); | 1602 | vma->vm_page_prot = vm_get_page_prot(vm_flags); |
1603 | vma->vm_pgoff = pgoff; | 1603 | vma->vm_pgoff = pgoff; |
1604 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 1604 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
1605 | 1605 | ||
1606 | if (file) { | 1606 | if (file) { |
1607 | if (vm_flags & VM_DENYWRITE) { | 1607 | if (vm_flags & VM_DENYWRITE) { |
1608 | error = deny_write_access(file); | 1608 | error = deny_write_access(file); |
1609 | if (error) | 1609 | if (error) |
1610 | goto free_vma; | 1610 | goto free_vma; |
1611 | } | 1611 | } |
1612 | if (vm_flags & VM_SHARED) { | 1612 | if (vm_flags & VM_SHARED) { |
1613 | error = mapping_map_writable(file->f_mapping); | 1613 | error = mapping_map_writable(file->f_mapping); |
1614 | if (error) | 1614 | if (error) |
1615 | goto allow_write_and_free_vma; | 1615 | goto allow_write_and_free_vma; |
1616 | } | 1616 | } |
1617 | 1617 | ||
1618 | /* ->mmap() can change vma->vm_file, but must guarantee that | 1618 | /* ->mmap() can change vma->vm_file, but must guarantee that |
1619 | * vma_link() below can deny write-access if VM_DENYWRITE is set | 1619 | * vma_link() below can deny write-access if VM_DENYWRITE is set |
1620 | * and map writably if VM_SHARED is set. This usually means the | 1620 | * and map writably if VM_SHARED is set. This usually means the |
1621 | * new file must not have been exposed to user-space, yet. | 1621 | * new file must not have been exposed to user-space, yet. |
1622 | */ | 1622 | */ |
1623 | vma->vm_file = get_file(file); | 1623 | vma->vm_file = get_file(file); |
1624 | error = file->f_op->mmap(file, vma); | 1624 | error = file->f_op->mmap(file, vma); |
1625 | if (error) | 1625 | if (error) |
1626 | goto unmap_and_free_vma; | 1626 | goto unmap_and_free_vma; |
1627 | 1627 | ||
1628 | /* Can addr have changed?? | 1628 | /* Can addr have changed?? |
1629 | * | 1629 | * |
1630 | * Answer: Yes, several device drivers can do it in their | 1630 | * Answer: Yes, several device drivers can do it in their |
1631 | * f_op->mmap method. -DaveM | 1631 | * f_op->mmap method. -DaveM |
1632 | * Bug: If addr is changed, prev, rb_link, rb_parent should | 1632 | * Bug: If addr is changed, prev, rb_link, rb_parent should |
1633 | * be updated for vma_link() | 1633 | * be updated for vma_link() |
1634 | */ | 1634 | */ |
1635 | WARN_ON_ONCE(addr != vma->vm_start); | 1635 | WARN_ON_ONCE(addr != vma->vm_start); |
1636 | 1636 | ||
1637 | addr = vma->vm_start; | 1637 | addr = vma->vm_start; |
1638 | vm_flags = vma->vm_flags; | 1638 | vm_flags = vma->vm_flags; |
1639 | } else if (vm_flags & VM_SHARED) { | 1639 | } else if (vm_flags & VM_SHARED) { |
1640 | error = shmem_zero_setup(vma); | 1640 | error = shmem_zero_setup(vma); |
1641 | if (error) | 1641 | if (error) |
1642 | goto free_vma; | 1642 | goto free_vma; |
1643 | } | 1643 | } |
1644 | 1644 | ||
1645 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1645 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1646 | /* Once vma denies write, undo our temporary denial count */ | 1646 | /* Once vma denies write, undo our temporary denial count */ |
1647 | if (file) { | 1647 | if (file) { |
1648 | if (vm_flags & VM_SHARED) | 1648 | if (vm_flags & VM_SHARED) |
1649 | mapping_unmap_writable(file->f_mapping); | 1649 | mapping_unmap_writable(file->f_mapping); |
1650 | if (vm_flags & VM_DENYWRITE) | 1650 | if (vm_flags & VM_DENYWRITE) |
1651 | allow_write_access(file); | 1651 | allow_write_access(file); |
1652 | } | 1652 | } |
1653 | file = vma->vm_file; | 1653 | file = vma->vm_file; |
1654 | out: | 1654 | out: |
1655 | perf_event_mmap(vma); | 1655 | perf_event_mmap(vma); |
1656 | 1656 | ||
1657 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1657 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1658 | if (vm_flags & VM_LOCKED) { | 1658 | if (vm_flags & VM_LOCKED) { |
1659 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || | 1659 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || |
1660 | vma == get_gate_vma(current->mm))) | 1660 | vma == get_gate_vma(current->mm))) |
1661 | mm->locked_vm += (len >> PAGE_SHIFT); | 1661 | mm->locked_vm += (len >> PAGE_SHIFT); |
1662 | else | 1662 | else |
1663 | vma->vm_flags &= ~VM_LOCKED; | 1663 | vma->vm_flags &= ~VM_LOCKED; |
1664 | } | 1664 | } |
1665 | 1665 | ||
1666 | if (file) | 1666 | if (file) |
1667 | uprobe_mmap(vma); | 1667 | uprobe_mmap(vma); |
1668 | 1668 | ||
1669 | /* | 1669 | /* |
1670 | * New (or expanded) vma always get soft dirty status. | 1670 | * New (or expanded) vma always get soft dirty status. |
1671 | * Otherwise user-space soft-dirty page tracker won't | 1671 | * Otherwise user-space soft-dirty page tracker won't |
1672 | * be able to distinguish situation when vma area unmapped, | 1672 | * be able to distinguish situation when vma area unmapped, |
1673 | * then new mapped in-place (which must be aimed as | 1673 | * then new mapped in-place (which must be aimed as |
1674 | * a completely new data area). | 1674 | * a completely new data area). |
1675 | */ | 1675 | */ |
1676 | vma->vm_flags |= VM_SOFTDIRTY; | 1676 | vma->vm_flags |= VM_SOFTDIRTY; |
1677 | 1677 | ||
1678 | vma_set_page_prot(vma); | 1678 | vma_set_page_prot(vma); |
1679 | 1679 | ||
1680 | return addr; | 1680 | return addr; |
1681 | 1681 | ||
1682 | unmap_and_free_vma: | 1682 | unmap_and_free_vma: |
1683 | vma->vm_file = NULL; | 1683 | vma->vm_file = NULL; |
1684 | fput(file); | 1684 | fput(file); |
1685 | 1685 | ||
1686 | /* Undo any partial mapping done by a device driver. */ | 1686 | /* Undo any partial mapping done by a device driver. */ |
1687 | unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); | 1687 | unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); |
1688 | charged = 0; | 1688 | charged = 0; |
1689 | if (vm_flags & VM_SHARED) | 1689 | if (vm_flags & VM_SHARED) |
1690 | mapping_unmap_writable(file->f_mapping); | 1690 | mapping_unmap_writable(file->f_mapping); |
1691 | allow_write_and_free_vma: | 1691 | allow_write_and_free_vma: |
1692 | if (vm_flags & VM_DENYWRITE) | 1692 | if (vm_flags & VM_DENYWRITE) |
1693 | allow_write_access(file); | 1693 | allow_write_access(file); |
1694 | free_vma: | 1694 | free_vma: |
1695 | kmem_cache_free(vm_area_cachep, vma); | 1695 | kmem_cache_free(vm_area_cachep, vma); |
1696 | unacct_error: | 1696 | unacct_error: |
1697 | if (charged) | 1697 | if (charged) |
1698 | vm_unacct_memory(charged); | 1698 | vm_unacct_memory(charged); |
1699 | return error; | 1699 | return error; |
1700 | } | 1700 | } |
1701 | 1701 | ||
/*
 * unmapped_area() - find an unmapped gap in current->mm that satisfies @info.
 * @info: search request; this function reads length, align_mask,
 *        align_offset, low_limit and high_limit
 *
 * Walks the vma rbtree, visiting left subtrees before the current node and
 * right subtrees, and uses rb_subtree_gap to skip subtrees whose recorded
 * gap is smaller than the padded length.
 *
 * Returns the start address of the gap, adjusted to the requested alignment,
 * or -ENOMEM if no gap fits within the limits.
 */
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
	/*
	 * We implement the search by looking for an rbtree node that
	 * immediately follows a suitable gap. That is,
	 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
	 * - gap_end   = vma->vm_start        >= info->low_limit  + length;
	 * - gap_end - gap_start >= length
	 */

	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long length, low_limit, high_limit, gap_start, gap_end;

	/* Adjust search length to account for worst case alignment overhead */
	length = info->length + info->align_mask;
	/* The sum wrapped around: no gap can be that large. */
	if (length < info->length)
		return -ENOMEM;

	/* Adjust search limits by the desired length */
	if (info->high_limit < length)
		return -ENOMEM;
	/* From here on high_limit bounds gap_start, low_limit bounds gap_end. */
	high_limit = info->high_limit - length;

	if (info->low_limit > high_limit)
		return -ENOMEM;
	low_limit = info->low_limit + length;

	/* Check if rbtree root looks promising */
	if (RB_EMPTY_ROOT(&mm->mm_rb))
		goto check_highest;
	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
	if (vma->rb_subtree_gap < length)
		goto check_highest;

	while (true) {
		/* Visit left subtree if it looks promising */
		gap_end = vma->vm_start;
		if (gap_end >= low_limit && vma->vm_rb.rb_left) {
			struct vm_area_struct *left =
				rb_entry(vma->vm_rb.rb_left,
					 struct vm_area_struct, vm_rb);
			if (left->rb_subtree_gap >= length) {
				vma = left;
				continue;
			}
		}

		/* The gap before the first vma starts at address 0. */
		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
check_current:
		/* Check if current node has a suitable gap */
		if (gap_start > high_limit)
			return -ENOMEM;
		if (gap_end >= low_limit && gap_end - gap_start >= length)
			goto found;

		/* Visit right subtree if it looks promising */
		if (vma->vm_rb.rb_right) {
			struct vm_area_struct *right =
				rb_entry(vma->vm_rb.rb_right,
					 struct vm_area_struct, vm_rb);
			if (right->rb_subtree_gap >= length) {
				vma = right;
				continue;
			}
		}

		/* Go back up the rbtree to find next candidate node */
		while (true) {
			struct rb_node *prev = &vma->vm_rb;
			if (!rb_parent(prev))
				goto check_highest;
			vma = rb_entry(rb_parent(prev),
				       struct vm_area_struct, vm_rb);
			/*
			 * Coming up from a left child means this parent has
			 * not been checked yet; coming up from a right child
			 * means it has, so keep climbing.
			 */
			if (prev == vma->vm_rb.rb_left) {
				gap_start = vma->vm_prev->vm_end;
				gap_end = vma->vm_start;
				goto check_current;
			}
		}
	}

check_highest:
	/* Check highest gap, which does not precede any rbtree node */
	gap_start = mm->highest_vm_end;
	gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
	if (gap_start > high_limit)
		return -ENOMEM;

found:
	/* We found a suitable gap. Clip it with the original low_limit. */
	if (gap_start < info->low_limit)
		gap_start = info->low_limit;

	/* Adjust gap address to the desired alignment */
	gap_start += (info->align_offset - gap_start) & info->align_mask;

	/* The adjusted start must still fit below the limit and inside the gap. */
	VM_BUG_ON(gap_start + info->length > info->high_limit);
	VM_BUG_ON(gap_start + info->length > gap_end);
	return gap_start;
}
1803 | 1803 | ||
1804 | unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) | 1804 | unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) |
1805 | { | 1805 | { |
1806 | struct mm_struct *mm = current->mm; | 1806 | struct mm_struct *mm = current->mm; |
1807 | struct vm_area_struct *vma; | 1807 | struct vm_area_struct *vma; |
1808 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | 1808 | unsigned long length, low_limit, high_limit, gap_start, gap_end; |
1809 | 1809 | ||
1810 | /* Adjust search length to account for worst case alignment overhead */ | 1810 | /* Adjust search length to account for worst case alignment overhead */ |
1811 | length = info->length + info->align_mask; | 1811 | length = info->length + info->align_mask; |
1812 | if (length < info->length) | 1812 | if (length < info->length) |
1813 | return -ENOMEM; | 1813 | return -ENOMEM; |
1814 | 1814 | ||
1815 | /* | 1815 | /* |
1816 | * Adjust search limits by the desired length. | 1816 | * Adjust search limits by the desired length. |
1817 | * See implementation comment at top of unmapped_area(). | 1817 | * See implementation comment at top of unmapped_area(). |
1818 | */ | 1818 | */ |
1819 | gap_end = info->high_limit; | 1819 | gap_end = info->high_limit; |
1820 | if (gap_end < length) | 1820 | if (gap_end < length) |
1821 | return -ENOMEM; | 1821 | return -ENOMEM; |
1822 | high_limit = gap_end - length; | 1822 | high_limit = gap_end - length; |
1823 | 1823 | ||
1824 | if (info->low_limit > high_limit) | 1824 | if (info->low_limit > high_limit) |
1825 | return -ENOMEM; | 1825 | return -ENOMEM; |
1826 | low_limit = info->low_limit + length; | 1826 | low_limit = info->low_limit + length; |
1827 | 1827 | ||
1828 | /* Check highest gap, which does not precede any rbtree node */ | 1828 | /* Check highest gap, which does not precede any rbtree node */ |
1829 | gap_start = mm->highest_vm_end; | 1829 | gap_start = mm->highest_vm_end; |
1830 | if (gap_start <= high_limit) | 1830 | if (gap_start <= high_limit) |
1831 | goto found_highest; | 1831 | goto found_highest; |
1832 | 1832 | ||
1833 | /* Check if rbtree root looks promising */ | 1833 | /* Check if rbtree root looks promising */ |
1834 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | 1834 | if (RB_EMPTY_ROOT(&mm->mm_rb)) |
1835 | return -ENOMEM; | 1835 | return -ENOMEM; |
1836 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | 1836 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); |
1837 | if (vma->rb_subtree_gap < length) | 1837 | if (vma->rb_subtree_gap < length) |
1838 | return -ENOMEM; | 1838 | return -ENOMEM; |
1839 | 1839 | ||
1840 | while (true) { | 1840 | while (true) { |
1841 | /* Visit right subtree if it looks promising */ | 1841 | /* Visit right subtree if it looks promising */ |
1842 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | 1842 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; |
1843 | if (gap_start <= high_limit && vma->vm_rb.rb_right) { | 1843 | if (gap_start <= high_limit && vma->vm_rb.rb_right) { |
1844 | struct vm_area_struct *right = | 1844 | struct vm_area_struct *right = |
1845 | rb_entry(vma->vm_rb.rb_right, | 1845 | rb_entry(vma->vm_rb.rb_right, |
1846 | struct vm_area_struct, vm_rb); | 1846 | struct vm_area_struct, vm_rb); |
1847 | if (right->rb_subtree_gap >= length) { | 1847 | if (right->rb_subtree_gap >= length) { |
1848 | vma = right; | 1848 | vma = right; |
1849 | continue; | 1849 | continue; |
1850 | } | 1850 | } |
1851 | } | 1851 | } |
1852 | 1852 | ||
1853 | check_current: | 1853 | check_current: |
1854 | /* Check if current node has a suitable gap */ | 1854 | /* Check if current node has a suitable gap */ |
1855 | gap_end = vma->vm_start; | 1855 | gap_end = vma->vm_start; |
1856 | if (gap_end < low_limit) | 1856 | if (gap_end < low_limit) |
1857 | return -ENOMEM; | 1857 | return -ENOMEM; |
1858 | if (gap_start <= high_limit && gap_end - gap_start >= length) | 1858 | if (gap_start <= high_limit && gap_end - gap_start >= length) |
1859 | goto found; | 1859 | goto found; |
1860 | 1860 | ||
1861 | /* Visit left subtree if it looks promising */ | 1861 | /* Visit left subtree if it looks promising */ |
1862 | if (vma->vm_rb.rb_left) { | 1862 | if (vma->vm_rb.rb_left) { |
1863 | struct vm_area_struct *left = | 1863 | struct vm_area_struct *left = |
1864 | rb_entry(vma->vm_rb.rb_left, | 1864 | rb_entry(vma->vm_rb.rb_left, |
1865 | struct vm_area_struct, vm_rb); | 1865 | struct vm_area_struct, vm_rb); |
1866 | if (left->rb_subtree_gap >= length) { | 1866 | if (left->rb_subtree_gap >= length) { |
1867 | vma = left; | 1867 | vma = left; |
1868 | continue; | 1868 | continue; |
1869 | } | 1869 | } |
1870 | } | 1870 | } |
1871 | 1871 | ||
1872 | /* Go back up the rbtree to find next candidate node */ | 1872 | /* Go back up the rbtree to find next candidate node */ |
1873 | while (true) { | 1873 | while (true) { |
1874 | struct rb_node *prev = &vma->vm_rb; | 1874 | struct rb_node *prev = &vma->vm_rb; |
1875 | if (!rb_parent(prev)) | 1875 | if (!rb_parent(prev)) |
1876 | return -ENOMEM; | 1876 | return -ENOMEM; |
1877 | vma = rb_entry(rb_parent(prev), | 1877 | vma = rb_entry(rb_parent(prev), |
1878 | struct vm_area_struct, vm_rb); | 1878 | struct vm_area_struct, vm_rb); |
1879 | if (prev == vma->vm_rb.rb_right) { | 1879 | if (prev == vma->vm_rb.rb_right) { |
1880 | gap_start = vma->vm_prev ? | 1880 | gap_start = vma->vm_prev ? |
1881 | vma->vm_prev->vm_end : 0; | 1881 | vma->vm_prev->vm_end : 0; |
1882 | goto check_current; | 1882 | goto check_current; |
1883 | } | 1883 | } |
1884 | } | 1884 | } |
1885 | } | 1885 | } |
1886 | 1886 | ||
1887 | found: | 1887 | found: |
1888 | /* We found a suitable gap. Clip it with the original high_limit. */ | 1888 | /* We found a suitable gap. Clip it with the original high_limit. */ |
1889 | if (gap_end > info->high_limit) | 1889 | if (gap_end > info->high_limit) |
1890 | gap_end = info->high_limit; | 1890 | gap_end = info->high_limit; |
1891 | 1891 | ||
1892 | found_highest: | 1892 | found_highest: |
1893 | /* Compute highest gap address at the desired alignment */ | 1893 | /* Compute highest gap address at the desired alignment */ |
1894 | gap_end -= info->length; | 1894 | gap_end -= info->length; |
1895 | gap_end -= (gap_end - info->align_offset) & info->align_mask; | 1895 | gap_end -= (gap_end - info->align_offset) & info->align_mask; |
1896 | 1896 | ||
1897 | VM_BUG_ON(gap_end < info->low_limit); | 1897 | VM_BUG_ON(gap_end < info->low_limit); |
1898 | VM_BUG_ON(gap_end < gap_start); | 1898 | VM_BUG_ON(gap_end < gap_start); |
1899 | return gap_end; | 1899 | return gap_end; |
1900 | } | 1900 | } |
1901 | 1901 | ||
1902 | /* Get an address range which is currently unmapped. | 1902 | /* Get an address range which is currently unmapped. |
1903 | * For shmat() with addr=0. | 1903 | * For shmat() with addr=0. |
1904 | * | 1904 | * |
1905 | * Ugly calling convention alert: | 1905 | * Ugly calling convention alert: |
1906 | * Return value with the low bits set means error value, | 1906 | * Return value with the low bits set means error value, |
1907 | * ie | 1907 | * ie |
1908 | * if (ret & ~PAGE_MASK) | 1908 | * if (ret & ~PAGE_MASK) |
1909 | * error = ret; | 1909 | * error = ret; |
1910 | * | 1910 | * |
1911 | * This function "knows" that -ENOMEM has the bits set. | 1911 | * This function "knows" that -ENOMEM has the bits set. |
1912 | */ | 1912 | */ |
1913 | #ifndef HAVE_ARCH_UNMAPPED_AREA | 1913 | #ifndef HAVE_ARCH_UNMAPPED_AREA |
1914 | unsigned long | 1914 | unsigned long |
1915 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | 1915 | arch_get_unmapped_area(struct file *filp, unsigned long addr, |
1916 | unsigned long len, unsigned long pgoff, unsigned long flags) | 1916 | unsigned long len, unsigned long pgoff, unsigned long flags) |
1917 | { | 1917 | { |
1918 | struct mm_struct *mm = current->mm; | 1918 | struct mm_struct *mm = current->mm; |
1919 | struct vm_area_struct *vma; | 1919 | struct vm_area_struct *vma; |
1920 | struct vm_unmapped_area_info info; | 1920 | struct vm_unmapped_area_info info; |
1921 | 1921 | ||
1922 | if (len > TASK_SIZE - mmap_min_addr) | 1922 | if (len > TASK_SIZE - mmap_min_addr) |
1923 | return -ENOMEM; | 1923 | return -ENOMEM; |
1924 | 1924 | ||
1925 | if (flags & MAP_FIXED) | 1925 | if (flags & MAP_FIXED) |
1926 | return addr; | 1926 | return addr; |
1927 | 1927 | ||
1928 | if (addr) { | 1928 | if (addr) { |
1929 | addr = PAGE_ALIGN(addr); | 1929 | addr = PAGE_ALIGN(addr); |
1930 | vma = find_vma(mm, addr); | 1930 | vma = find_vma(mm, addr); |
1931 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && | 1931 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
1932 | (!vma || addr + len <= vma->vm_start)) | 1932 | (!vma || addr + len <= vma->vm_start)) |
1933 | return addr; | 1933 | return addr; |
1934 | } | 1934 | } |
1935 | 1935 | ||
1936 | info.flags = 0; | 1936 | info.flags = 0; |
1937 | info.length = len; | 1937 | info.length = len; |
1938 | info.low_limit = mm->mmap_base; | 1938 | info.low_limit = mm->mmap_base; |
1939 | info.high_limit = TASK_SIZE; | 1939 | info.high_limit = TASK_SIZE; |
1940 | info.align_mask = 0; | 1940 | info.align_mask = 0; |
1941 | return vm_unmapped_area(&info); | 1941 | return vm_unmapped_area(&info); |
1942 | } | 1942 | } |
1943 | #endif | 1943 | #endif |
1944 | 1944 | ||
1945 | /* | 1945 | /* |
1946 | * This mmap-allocator allocates new areas top-down from below the | 1946 | * This mmap-allocator allocates new areas top-down from below the |
1947 | * stack's low limit (the base): | 1947 | * stack's low limit (the base): |
1948 | */ | 1948 | */ |
1949 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN | 1949 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN |
1950 | unsigned long | 1950 | unsigned long |
1951 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | 1951 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, |
1952 | const unsigned long len, const unsigned long pgoff, | 1952 | const unsigned long len, const unsigned long pgoff, |
1953 | const unsigned long flags) | 1953 | const unsigned long flags) |
1954 | { | 1954 | { |
1955 | struct vm_area_struct *vma; | 1955 | struct vm_area_struct *vma; |
1956 | struct mm_struct *mm = current->mm; | 1956 | struct mm_struct *mm = current->mm; |
1957 | unsigned long addr = addr0; | 1957 | unsigned long addr = addr0; |
1958 | struct vm_unmapped_area_info info; | 1958 | struct vm_unmapped_area_info info; |
1959 | 1959 | ||
1960 | /* requested length too big for entire address space */ | 1960 | /* requested length too big for entire address space */ |
1961 | if (len > TASK_SIZE - mmap_min_addr) | 1961 | if (len > TASK_SIZE - mmap_min_addr) |
1962 | return -ENOMEM; | 1962 | return -ENOMEM; |
1963 | 1963 | ||
1964 | if (flags & MAP_FIXED) | 1964 | if (flags & MAP_FIXED) |
1965 | return addr; | 1965 | return addr; |
1966 | 1966 | ||
1967 | /* requesting a specific address */ | 1967 | /* requesting a specific address */ |
1968 | if (addr) { | 1968 | if (addr) { |
1969 | addr = PAGE_ALIGN(addr); | 1969 | addr = PAGE_ALIGN(addr); |
1970 | vma = find_vma(mm, addr); | 1970 | vma = find_vma(mm, addr); |
1971 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && | 1971 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
1972 | (!vma || addr + len <= vma->vm_start)) | 1972 | (!vma || addr + len <= vma->vm_start)) |
1973 | return addr; | 1973 | return addr; |
1974 | } | 1974 | } |
1975 | 1975 | ||
1976 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; | 1976 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
1977 | info.length = len; | 1977 | info.length = len; |
1978 | info.low_limit = max(PAGE_SIZE, mmap_min_addr); | 1978 | info.low_limit = max(PAGE_SIZE, mmap_min_addr); |
1979 | info.high_limit = mm->mmap_base; | 1979 | info.high_limit = mm->mmap_base; |
1980 | info.align_mask = 0; | 1980 | info.align_mask = 0; |
1981 | addr = vm_unmapped_area(&info); | 1981 | addr = vm_unmapped_area(&info); |
1982 | 1982 | ||
1983 | /* | 1983 | /* |
1984 | * A failed mmap() very likely causes application failure, | 1984 | * A failed mmap() very likely causes application failure, |
1985 | * so fall back to the bottom-up function here. This scenario | 1985 | * so fall back to the bottom-up function here. This scenario |
1986 | * can happen with large stack limits and large mmap() | 1986 | * can happen with large stack limits and large mmap() |
1987 | * allocations. | 1987 | * allocations. |
1988 | */ | 1988 | */ |
1989 | if (addr & ~PAGE_MASK) { | 1989 | if (addr & ~PAGE_MASK) { |
1990 | VM_BUG_ON(addr != -ENOMEM); | 1990 | VM_BUG_ON(addr != -ENOMEM); |
1991 | info.flags = 0; | 1991 | info.flags = 0; |
1992 | info.low_limit = TASK_UNMAPPED_BASE; | 1992 | info.low_limit = TASK_UNMAPPED_BASE; |
1993 | info.high_limit = TASK_SIZE; | 1993 | info.high_limit = TASK_SIZE; |
1994 | addr = vm_unmapped_area(&info); | 1994 | addr = vm_unmapped_area(&info); |
1995 | } | 1995 | } |
1996 | 1996 | ||
1997 | return addr; | 1997 | return addr; |
1998 | } | 1998 | } |
1999 | #endif | 1999 | #endif |
2000 | 2000 | ||
2001 | unsigned long | 2001 | unsigned long |
2002 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | 2002 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, |
2003 | unsigned long pgoff, unsigned long flags) | 2003 | unsigned long pgoff, unsigned long flags) |
2004 | { | 2004 | { |
2005 | unsigned long (*get_area)(struct file *, unsigned long, | 2005 | unsigned long (*get_area)(struct file *, unsigned long, |
2006 | unsigned long, unsigned long, unsigned long); | 2006 | unsigned long, unsigned long, unsigned long); |
2007 | 2007 | ||
2008 | unsigned long error = arch_mmap_check(addr, len, flags); | 2008 | unsigned long error = arch_mmap_check(addr, len, flags); |
2009 | if (error) | 2009 | if (error) |
2010 | return error; | 2010 | return error; |
2011 | 2011 | ||
2012 | /* Careful about overflows.. */ | 2012 | /* Careful about overflows.. */ |
2013 | if (len > TASK_SIZE) | 2013 | if (len > TASK_SIZE) |
2014 | return -ENOMEM; | 2014 | return -ENOMEM; |
2015 | 2015 | ||
2016 | get_area = current->mm->get_unmapped_area; | 2016 | get_area = current->mm->get_unmapped_area; |
2017 | if (file && file->f_op->get_unmapped_area) | 2017 | if (file && file->f_op->get_unmapped_area) |
2018 | get_area = file->f_op->get_unmapped_area; | 2018 | get_area = file->f_op->get_unmapped_area; |
2019 | addr = get_area(file, addr, len, pgoff, flags); | 2019 | addr = get_area(file, addr, len, pgoff, flags); |
2020 | if (IS_ERR_VALUE(addr)) | 2020 | if (IS_ERR_VALUE(addr)) |
2021 | return addr; | 2021 | return addr; |
2022 | 2022 | ||
2023 | if (addr > TASK_SIZE - len) | 2023 | if (addr > TASK_SIZE - len) |
2024 | return -ENOMEM; | 2024 | return -ENOMEM; |
2025 | if (addr & ~PAGE_MASK) | 2025 | if (addr & ~PAGE_MASK) |
2026 | return -EINVAL; | 2026 | return -EINVAL; |
2027 | 2027 | ||
2028 | addr = arch_rebalance_pgtables(addr, len); | 2028 | addr = arch_rebalance_pgtables(addr, len); |
2029 | error = security_mmap_addr(addr); | 2029 | error = security_mmap_addr(addr); |
2030 | return error ? error : addr; | 2030 | return error ? error : addr; |
2031 | } | 2031 | } |
2032 | 2032 | ||
2033 | EXPORT_SYMBOL(get_unmapped_area); | 2033 | EXPORT_SYMBOL(get_unmapped_area); |
2034 | 2034 | ||
2035 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 2035 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
2036 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 2036 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
2037 | { | 2037 | { |
2038 | struct rb_node *rb_node; | 2038 | struct rb_node *rb_node; |
2039 | struct vm_area_struct *vma; | 2039 | struct vm_area_struct *vma; |
2040 | 2040 | ||
2041 | /* Check the cache first. */ | 2041 | /* Check the cache first. */ |
2042 | vma = vmacache_find(mm, addr); | 2042 | vma = vmacache_find(mm, addr); |
2043 | if (likely(vma)) | 2043 | if (likely(vma)) |
2044 | return vma; | 2044 | return vma; |
2045 | 2045 | ||
2046 | rb_node = mm->mm_rb.rb_node; | 2046 | rb_node = mm->mm_rb.rb_node; |
2047 | vma = NULL; | 2047 | vma = NULL; |
2048 | 2048 | ||
2049 | while (rb_node) { | 2049 | while (rb_node) { |
2050 | struct vm_area_struct *tmp; | 2050 | struct vm_area_struct *tmp; |
2051 | 2051 | ||
2052 | tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); | 2052 | tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
2053 | 2053 | ||
2054 | if (tmp->vm_end > addr) { | 2054 | if (tmp->vm_end > addr) { |
2055 | vma = tmp; | 2055 | vma = tmp; |
2056 | if (tmp->vm_start <= addr) | 2056 | if (tmp->vm_start <= addr) |
2057 | break; | 2057 | break; |
2058 | rb_node = rb_node->rb_left; | 2058 | rb_node = rb_node->rb_left; |
2059 | } else | 2059 | } else |
2060 | rb_node = rb_node->rb_right; | 2060 | rb_node = rb_node->rb_right; |
2061 | } | 2061 | } |
2062 | 2062 | ||
2063 | if (vma) | 2063 | if (vma) |
2064 | vmacache_update(addr, vma); | 2064 | vmacache_update(addr, vma); |
2065 | return vma; | 2065 | return vma; |
2066 | } | 2066 | } |
2067 | 2067 | ||
2068 | EXPORT_SYMBOL(find_vma); | 2068 | EXPORT_SYMBOL(find_vma); |
2069 | 2069 | ||
2070 | /* | 2070 | /* |
2071 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. | 2071 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. |
2072 | */ | 2072 | */ |
2073 | struct vm_area_struct * | 2073 | struct vm_area_struct * |
2074 | find_vma_prev(struct mm_struct *mm, unsigned long addr, | 2074 | find_vma_prev(struct mm_struct *mm, unsigned long addr, |
2075 | struct vm_area_struct **pprev) | 2075 | struct vm_area_struct **pprev) |
2076 | { | 2076 | { |
2077 | struct vm_area_struct *vma; | 2077 | struct vm_area_struct *vma; |
2078 | 2078 | ||
2079 | vma = find_vma(mm, addr); | 2079 | vma = find_vma(mm, addr); |
2080 | if (vma) { | 2080 | if (vma) { |
2081 | *pprev = vma->vm_prev; | 2081 | *pprev = vma->vm_prev; |
2082 | } else { | 2082 | } else { |
2083 | struct rb_node *rb_node = mm->mm_rb.rb_node; | 2083 | struct rb_node *rb_node = mm->mm_rb.rb_node; |
2084 | *pprev = NULL; | 2084 | *pprev = NULL; |
2085 | while (rb_node) { | 2085 | while (rb_node) { |
2086 | *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); | 2086 | *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
2087 | rb_node = rb_node->rb_right; | 2087 | rb_node = rb_node->rb_right; |
2088 | } | 2088 | } |
2089 | } | 2089 | } |
2090 | return vma; | 2090 | return vma; |
2091 | } | 2091 | } |
2092 | 2092 | ||
2093 | /* | 2093 | /* |
2094 | * Verify that the stack growth is acceptable and | 2094 | * Verify that the stack growth is acceptable and |
2095 | * update accounting. This is shared with both the | 2095 | * update accounting. This is shared with both the |
2096 | * grow-up and grow-down cases. | 2096 | * grow-up and grow-down cases. |
2097 | */ | 2097 | */ |
2098 | static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) | 2098 | static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) |
2099 | { | 2099 | { |
2100 | struct mm_struct *mm = vma->vm_mm; | 2100 | struct mm_struct *mm = vma->vm_mm; |
2101 | struct rlimit *rlim = current->signal->rlim; | 2101 | struct rlimit *rlim = current->signal->rlim; |
2102 | unsigned long new_start; | 2102 | unsigned long new_start, actual_size; |
2103 | 2103 | ||
2104 | /* address space limit tests */ | 2104 | /* address space limit tests */ |
2105 | if (!may_expand_vm(mm, grow)) | 2105 | if (!may_expand_vm(mm, grow)) |
2106 | return -ENOMEM; | 2106 | return -ENOMEM; |
2107 | 2107 | ||
2108 | /* Stack limit test */ | 2108 | /* Stack limit test */ |
2109 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | 2109 | actual_size = size; |
2110 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) | ||
2111 | actual_size -= PAGE_SIZE; | ||
2112 | if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | ||
2110 | return -ENOMEM; | 2113 | return -ENOMEM; |
2111 | 2114 | ||
2112 | /* mlock limit tests */ | 2115 | /* mlock limit tests */ |
2113 | if (vma->vm_flags & VM_LOCKED) { | 2116 | if (vma->vm_flags & VM_LOCKED) { |
2114 | unsigned long locked; | 2117 | unsigned long locked; |
2115 | unsigned long limit; | 2118 | unsigned long limit; |
2116 | locked = mm->locked_vm + grow; | 2119 | locked = mm->locked_vm + grow; |
2117 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); | 2120 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
2118 | limit >>= PAGE_SHIFT; | 2121 | limit >>= PAGE_SHIFT; |
2119 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 2122 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
2120 | return -ENOMEM; | 2123 | return -ENOMEM; |
2121 | } | 2124 | } |
2122 | 2125 | ||
2123 | /* Check to ensure the stack will not grow into a hugetlb-only region */ | 2126 | /* Check to ensure the stack will not grow into a hugetlb-only region */ |
2124 | new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : | 2127 | new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : |
2125 | vma->vm_end - size; | 2128 | vma->vm_end - size; |
2126 | if (is_hugepage_only_range(vma->vm_mm, new_start, size)) | 2129 | if (is_hugepage_only_range(vma->vm_mm, new_start, size)) |
2127 | return -EFAULT; | 2130 | return -EFAULT; |
2128 | 2131 | ||
2129 | /* | 2132 | /* |
2130 | * Overcommit.. This must be the final test, as it will | 2133 | * Overcommit.. This must be the final test, as it will |
2131 | * update security statistics. | 2134 | * update security statistics. |
2132 | */ | 2135 | */ |
2133 | if (security_vm_enough_memory_mm(mm, grow)) | 2136 | if (security_vm_enough_memory_mm(mm, grow)) |
2134 | return -ENOMEM; | 2137 | return -ENOMEM; |
2135 | 2138 | ||
2136 | /* Ok, everything looks good - let it rip */ | 2139 | /* Ok, everything looks good - let it rip */ |
2137 | if (vma->vm_flags & VM_LOCKED) | 2140 | if (vma->vm_flags & VM_LOCKED) |
2138 | mm->locked_vm += grow; | 2141 | mm->locked_vm += grow; |
2139 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 2142 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
2140 | return 0; | 2143 | return 0; |
2141 | } | 2144 | } |
2142 | 2145 | ||
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
 * Grow @vma upwards so that it covers @address.
 *
 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
 * vma is the last one with address > vma->vm_end, so it has to be
 * extended.
 *
 * Returns 0 on success (including when another thread already grew the
 * VMA far enough), -EFAULT if the VMA is not VM_GROWSUP, -ENOMEM on
 * allocation failure, address wrap-around or page offset overflow, or
 * whatever acct_stack_growth() reports.
 */
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long new_end, size, grow;
	int error = 0;

	if (!(vma->vm_flags & VM_GROWSUP))
		return -EFAULT;

	/*
	 * The anon_vma must exist, otherwise taking the anon_vma lock
	 * below would be a no-op.
	 */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;
	vma_lock_anon_vma(vma);

	/*
	 * vma->vm_start/vm_end cannot change under us because the caller
	 * is required to hold the mmap_sem in read mode. The anon_vma lock
	 * serializes against concurrent expand_stacks.
	 * Also refuse a target that wraps around to address 0.
	 */
	new_end = PAGE_ALIGN(address + 4);
	if (new_end <= address) {
		vma_unlock_anon_vma(vma);
		return -ENOMEM;
	}
	address = new_end;

	/* Somebody else might have raced and expanded it already. */
	if (address <= vma->vm_end)
		goto out;

	size = address - vma->vm_start;
	grow = (address - vma->vm_end) >> PAGE_SHIFT;

	/* The page offset of the new end must not overflow. */
	if (vma->vm_pgoff + (size >> PAGE_SHIFT) < vma->vm_pgoff) {
		error = -ENOMEM;
		goto out;
	}

	error = acct_stack_growth(vma, size, grow);
	if (error)
		goto out;

	/*
	 * vma_gap_update() does not support concurrent updates, yet only a
	 * shared mmap_sem is held here, so concurrent vma expansions must
	 * be excluded some other way. vma_lock_anon_vma() is not enough:
	 * growable vmas in one mm are not guaranteed to share the same
	 * root anon vma. mm->page_table_lock is therefore reused to guard
	 * against concurrent vma expansions.
	 */
	spin_lock(&mm->page_table_lock);
	anon_vma_interval_tree_pre_update_vma(vma);
	vma->vm_end = address;
	anon_vma_interval_tree_post_update_vma(vma);
	if (vma->vm_next)
		vma_gap_update(vma->vm_next);
	else
		mm->highest_vm_end = address;
	spin_unlock(&mm->page_table_lock);

	perf_event_mmap(vma);

out:
	vma_unlock_anon_vma(vma);
	khugepaged_enter_vma_merge(vma, vma->vm_flags);
	validate_mm(mm);
	return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2219 | 2222 | ||
2220 | /* | 2223 | /* |
2221 | * vma is the first one with address < vma->vm_start. Have to extend vma. | 2224 | * vma is the first one with address < vma->vm_start. Have to extend vma. |
2222 | */ | 2225 | */ |
2223 | int expand_downwards(struct vm_area_struct *vma, | 2226 | int expand_downwards(struct vm_area_struct *vma, |
2224 | unsigned long address) | 2227 | unsigned long address) |
2225 | { | 2228 | { |
2226 | int error; | 2229 | int error; |
2227 | 2230 | ||
2228 | /* | 2231 | /* |
2229 | * We must make sure the anon_vma is allocated | 2232 | * We must make sure the anon_vma is allocated |
2230 | * so that the anon_vma locking is not a noop. | 2233 | * so that the anon_vma locking is not a noop. |
2231 | */ | 2234 | */ |
2232 | if (unlikely(anon_vma_prepare(vma))) | 2235 | if (unlikely(anon_vma_prepare(vma))) |
2233 | return -ENOMEM; | 2236 | return -ENOMEM; |
2234 | 2237 | ||
2235 | address &= PAGE_MASK; | 2238 | address &= PAGE_MASK; |
2236 | error = security_mmap_addr(address); | 2239 | error = security_mmap_addr(address); |
2237 | if (error) | 2240 | if (error) |
2238 | return error; | 2241 | return error; |
2239 | 2242 | ||
2240 | vma_lock_anon_vma(vma); | 2243 | vma_lock_anon_vma(vma); |
2241 | 2244 | ||
2242 | /* | 2245 | /* |
2243 | * vma->vm_start/vm_end cannot change under us because the caller | 2246 | * vma->vm_start/vm_end cannot change under us because the caller |
2244 | * is required to hold the mmap_sem in read mode. We need the | 2247 | * is required to hold the mmap_sem in read mode. We need the |
2245 | * anon_vma lock to serialize against concurrent expand_stacks. | 2248 | * anon_vma lock to serialize against concurrent expand_stacks. |
2246 | */ | 2249 | */ |
2247 | 2250 | ||
2248 | /* Somebody else might have raced and expanded it already */ | 2251 | /* Somebody else might have raced and expanded it already */ |
2249 | if (address < vma->vm_start) { | 2252 | if (address < vma->vm_start) { |
2250 | unsigned long size, grow; | 2253 | unsigned long size, grow; |
2251 | 2254 | ||
2252 | size = vma->vm_end - address; | 2255 | size = vma->vm_end - address; |
2253 | grow = (vma->vm_start - address) >> PAGE_SHIFT; | 2256 | grow = (vma->vm_start - address) >> PAGE_SHIFT; |
2254 | 2257 | ||
2255 | error = -ENOMEM; | 2258 | error = -ENOMEM; |
2256 | if (grow <= vma->vm_pgoff) { | 2259 | if (grow <= vma->vm_pgoff) { |
2257 | error = acct_stack_growth(vma, size, grow); | 2260 | error = acct_stack_growth(vma, size, grow); |
2258 | if (!error) { | 2261 | if (!error) { |
2259 | /* | 2262 | /* |
2260 | * vma_gap_update() doesn't support concurrent | 2263 | * vma_gap_update() doesn't support concurrent |
2261 | * updates, but we only hold a shared mmap_sem | 2264 | * updates, but we only hold a shared mmap_sem |
2262 | * lock here, so we need to protect against | 2265 | * lock here, so we need to protect against |
2263 | * concurrent vma expansions. | 2266 | * concurrent vma expansions. |
2264 | * vma_lock_anon_vma() doesn't help here, as | 2267 | * vma_lock_anon_vma() doesn't help here, as |
2265 | * we don't guarantee that all growable vmas | 2268 | * we don't guarantee that all growable vmas |
2266 | * in a mm share the same root anon vma. | 2269 | * in a mm share the same root anon vma. |
2267 | * So, we reuse mm->page_table_lock to guard | 2270 | * So, we reuse mm->page_table_lock to guard |
2268 | * against concurrent vma expansions. | 2271 | * against concurrent vma expansions. |
2269 | */ | 2272 | */ |
2270 | spin_lock(&vma->vm_mm->page_table_lock); | 2273 | spin_lock(&vma->vm_mm->page_table_lock); |
2271 | anon_vma_interval_tree_pre_update_vma(vma); | 2274 | anon_vma_interval_tree_pre_update_vma(vma); |
2272 | vma->vm_start = address; | 2275 | vma->vm_start = address; |
2273 | vma->vm_pgoff -= grow; | 2276 | vma->vm_pgoff -= grow; |
2274 | anon_vma_interval_tree_post_update_vma(vma); | 2277 | anon_vma_interval_tree_post_update_vma(vma); |
2275 | vma_gap_update(vma); | 2278 | vma_gap_update(vma); |
2276 | spin_unlock(&vma->vm_mm->page_table_lock); | 2279 | spin_unlock(&vma->vm_mm->page_table_lock); |
2277 | 2280 | ||
2278 | perf_event_mmap(vma); | 2281 | perf_event_mmap(vma); |
2279 | } | 2282 | } |
2280 | } | 2283 | } |
2281 | } | 2284 | } |
2282 | vma_unlock_anon_vma(vma); | 2285 | vma_unlock_anon_vma(vma); |
2283 | khugepaged_enter_vma_merge(vma, vma->vm_flags); | 2286 | khugepaged_enter_vma_merge(vma, vma->vm_flags); |
2284 | validate_mm(vma->vm_mm); | 2287 | validate_mm(vma->vm_mm); |
2285 | return error; | 2288 | return error; |
2286 | } | 2289 | } |
2287 | 2290 | ||
2288 | /* | 2291 | /* |
2289 | * Note how expand_stack() refuses to expand the stack all the way to | 2292 | * Note how expand_stack() refuses to expand the stack all the way to |
2290 | * abut the next virtual mapping, *unless* that mapping itself is also | 2293 | * abut the next virtual mapping, *unless* that mapping itself is also |
2291 | * a stack mapping. We want to leave room for a guard page, after all | 2294 | * a stack mapping. We want to leave room for a guard page, after all |
2292 | * (the guard page itself is not added here, that is done by the | 2295 | * (the guard page itself is not added here, that is done by the |
2293 | * actual page faulting logic) | 2296 | * actual page faulting logic) |
2294 | * | 2297 | * |
2295 | * This matches the behavior of the guard page logic (see mm/memory.c: | 2298 | * This matches the behavior of the guard page logic (see mm/memory.c: |
2296 | * check_stack_guard_page()), which only allows the guard page to be | 2299 | * check_stack_guard_page()), which only allows the guard page to be |
2297 | * removed under these circumstances. | 2300 | * removed under these circumstances. |
2298 | */ | 2301 | */ |
2299 | #ifdef CONFIG_STACK_GROWSUP | 2302 | #ifdef CONFIG_STACK_GROWSUP |
2300 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 2303 | int expand_stack(struct vm_area_struct *vma, unsigned long address) |
2301 | { | 2304 | { |
2302 | struct vm_area_struct *next; | 2305 | struct vm_area_struct *next; |
2303 | 2306 | ||
2304 | address &= PAGE_MASK; | 2307 | address &= PAGE_MASK; |
2305 | next = vma->vm_next; | 2308 | next = vma->vm_next; |
2306 | if (next && next->vm_start == address + PAGE_SIZE) { | 2309 | if (next && next->vm_start == address + PAGE_SIZE) { |
2307 | if (!(next->vm_flags & VM_GROWSUP)) | 2310 | if (!(next->vm_flags & VM_GROWSUP)) |
2308 | return -ENOMEM; | 2311 | return -ENOMEM; |
2309 | } | 2312 | } |
2310 | return expand_upwards(vma, address); | 2313 | return expand_upwards(vma, address); |
2311 | } | 2314 | } |
2312 | 2315 | ||
2313 | struct vm_area_struct * | 2316 | struct vm_area_struct * |
2314 | find_extend_vma(struct mm_struct *mm, unsigned long addr) | 2317 | find_extend_vma(struct mm_struct *mm, unsigned long addr) |
2315 | { | 2318 | { |
2316 | struct vm_area_struct *vma, *prev; | 2319 | struct vm_area_struct *vma, *prev; |
2317 | 2320 | ||
2318 | addr &= PAGE_MASK; | 2321 | addr &= PAGE_MASK; |
2319 | vma = find_vma_prev(mm, addr, &prev); | 2322 | vma = find_vma_prev(mm, addr, &prev); |
2320 | if (vma && (vma->vm_start <= addr)) | 2323 | if (vma && (vma->vm_start <= addr)) |
2321 | return vma; | 2324 | return vma; |
2322 | if (!prev || expand_stack(prev, addr)) | 2325 | if (!prev || expand_stack(prev, addr)) |
2323 | return NULL; | 2326 | return NULL; |
2324 | if (prev->vm_flags & VM_LOCKED) | 2327 | if (prev->vm_flags & VM_LOCKED) |
2325 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); | 2328 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); |
2326 | return prev; | 2329 | return prev; |
2327 | } | 2330 | } |
2328 | #else | 2331 | #else |
2329 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 2332 | int expand_stack(struct vm_area_struct *vma, unsigned long address) |
2330 | { | 2333 | { |
2331 | struct vm_area_struct *prev; | 2334 | struct vm_area_struct *prev; |
2332 | 2335 | ||
2333 | address &= PAGE_MASK; | 2336 | address &= PAGE_MASK; |
2334 | prev = vma->vm_prev; | 2337 | prev = vma->vm_prev; |
2335 | if (prev && prev->vm_end == address) { | 2338 | if (prev && prev->vm_end == address) { |
2336 | if (!(prev->vm_flags & VM_GROWSDOWN)) | 2339 | if (!(prev->vm_flags & VM_GROWSDOWN)) |
2337 | return -ENOMEM; | 2340 | return -ENOMEM; |
2338 | } | 2341 | } |
2339 | return expand_downwards(vma, address); | 2342 | return expand_downwards(vma, address); |
2340 | } | 2343 | } |
2341 | 2344 | ||
2342 | struct vm_area_struct * | 2345 | struct vm_area_struct * |
2343 | find_extend_vma(struct mm_struct *mm, unsigned long addr) | 2346 | find_extend_vma(struct mm_struct *mm, unsigned long addr) |
2344 | { | 2347 | { |
2345 | struct vm_area_struct *vma; | 2348 | struct vm_area_struct *vma; |
2346 | unsigned long start; | 2349 | unsigned long start; |
2347 | 2350 | ||
2348 | addr &= PAGE_MASK; | 2351 | addr &= PAGE_MASK; |
2349 | vma = find_vma(mm, addr); | 2352 | vma = find_vma(mm, addr); |
2350 | if (!vma) | 2353 | if (!vma) |
2351 | return NULL; | 2354 | return NULL; |
2352 | if (vma->vm_start <= addr) | 2355 | if (vma->vm_start <= addr) |
2353 | return vma; | 2356 | return vma; |
2354 | if (!(vma->vm_flags & VM_GROWSDOWN)) | 2357 | if (!(vma->vm_flags & VM_GROWSDOWN)) |
2355 | return NULL; | 2358 | return NULL; |
2356 | start = vma->vm_start; | 2359 | start = vma->vm_start; |
2357 | if (expand_stack(vma, addr)) | 2360 | if (expand_stack(vma, addr)) |
2358 | return NULL; | 2361 | return NULL; |
2359 | if (vma->vm_flags & VM_LOCKED) | 2362 | if (vma->vm_flags & VM_LOCKED) |
2360 | __mlock_vma_pages_range(vma, addr, start, NULL); | 2363 | __mlock_vma_pages_range(vma, addr, start, NULL); |
2361 | return vma; | 2364 | return vma; |
2362 | } | 2365 | } |
2363 | #endif | 2366 | #endif |
2364 | 2367 | ||
2365 | EXPORT_SYMBOL_GPL(find_extend_vma); | 2368 | EXPORT_SYMBOL_GPL(find_extend_vma); |
2366 | 2369 | ||
2367 | /* | 2370 | /* |
2368 | * Ok - we have the memory areas we should free on the vma list, | 2371 | * Ok - we have the memory areas we should free on the vma list, |
2369 | * so release them, and do the vma updates. | 2372 | * so release them, and do the vma updates. |
2370 | * | 2373 | * |
2371 | * Called with the mm semaphore held. | 2374 | * Called with the mm semaphore held. |
2372 | */ | 2375 | */ |
2373 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 2376 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
2374 | { | 2377 | { |
2375 | unsigned long nr_accounted = 0; | 2378 | unsigned long nr_accounted = 0; |
2376 | 2379 | ||
2377 | /* Update high watermark before we lower total_vm */ | 2380 | /* Update high watermark before we lower total_vm */ |
2378 | update_hiwater_vm(mm); | 2381 | update_hiwater_vm(mm); |
2379 | do { | 2382 | do { |
2380 | long nrpages = vma_pages(vma); | 2383 | long nrpages = vma_pages(vma); |
2381 | 2384 | ||
2382 | if (vma->vm_flags & VM_ACCOUNT) | 2385 | if (vma->vm_flags & VM_ACCOUNT) |
2383 | nr_accounted += nrpages; | 2386 | nr_accounted += nrpages; |
2384 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 2387 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
2385 | vma = remove_vma(vma); | 2388 | vma = remove_vma(vma); |
2386 | } while (vma); | 2389 | } while (vma); |
2387 | vm_unacct_memory(nr_accounted); | 2390 | vm_unacct_memory(nr_accounted); |
2388 | validate_mm(mm); | 2391 | validate_mm(mm); |
2389 | } | 2392 | } |
2390 | 2393 | ||
2391 | /* | 2394 | /* |
2392 | * Get rid of page table information in the indicated region. | 2395 | * Get rid of page table information in the indicated region. |
2393 | * | 2396 | * |
2394 | * Called with the mm semaphore held. | 2397 | * Called with the mm semaphore held. |
2395 | */ | 2398 | */ |
2396 | static void unmap_region(struct mm_struct *mm, | 2399 | static void unmap_region(struct mm_struct *mm, |
2397 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 2400 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
2398 | unsigned long start, unsigned long end) | 2401 | unsigned long start, unsigned long end) |
2399 | { | 2402 | { |
2400 | struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; | 2403 | struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; |
2401 | struct mmu_gather tlb; | 2404 | struct mmu_gather tlb; |
2402 | 2405 | ||
2403 | lru_add_drain(); | 2406 | lru_add_drain(); |
2404 | tlb_gather_mmu(&tlb, mm, start, end); | 2407 | tlb_gather_mmu(&tlb, mm, start, end); |
2405 | update_hiwater_rss(mm); | 2408 | update_hiwater_rss(mm); |
2406 | unmap_vmas(&tlb, vma, start, end); | 2409 | unmap_vmas(&tlb, vma, start, end); |
2407 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, | 2410 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, |
2408 | next ? next->vm_start : USER_PGTABLES_CEILING); | 2411 | next ? next->vm_start : USER_PGTABLES_CEILING); |
2409 | tlb_finish_mmu(&tlb, start, end); | 2412 | tlb_finish_mmu(&tlb, start, end); |
2410 | } | 2413 | } |
2411 | 2414 | ||
/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go..
 *
 * Starting at @vma, every vma whose vm_start lies below @end is erased from
 * mm->mm_rb and subtracted from mm->map_count.  The run is then cut out of
 * the linked list hanging off @prev (or off mm->mmap when @prev is NULL)
 * and left as a NULL-terminated chain headed by the original @vma.
 */
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, unsigned long end)
{
	struct vm_area_struct **insertion_point;
	struct vm_area_struct *tail_vma = NULL;

	/* The link slot that currently points at the first vma to detach */
	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
	/* The detached chain has no predecessor */
	vma->vm_prev = NULL;
	do {
		vma_rb_erase(vma, &mm->mm_rb);
		mm->map_count--;
		tail_vma = vma;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
	/* vma is now the first vma that stays (or NULL): link it after prev */
	*insertion_point = vma;
	if (vma) {
		vma->vm_prev = prev;
		vma_gap_update(vma);
	} else
		/* Nothing left above the hole: prev, if any, is now the top */
		mm->highest_vm_end = prev ? prev->vm_end : 0;
	/* Terminate the detached chain at the last vma removed */
	tail_vma->vm_next = NULL;

	/* Kill the cache */
	vmacache_invalidate(mm);
}
2442 | 2445 | ||
/*
 * __split_vma() bypasses sysctl_max_map_count checking.  We use this on the
 * munmap path where it doesn't make sense to fail.
 *
 * Splits @vma at @addr by allocating a second vm_area_struct that starts
 * as a copy of @vma.  With @new_below non-zero the copy is given the part
 * ending at @addr; otherwise it is given the part starting at @addr (with
 * vm_pgoff advanced accordingly).  The original is then resized through
 * vma_adjust().
 *
 * Returns 0 on success, -EINVAL when @vma is a hugetlb vma and @addr is not
 * aligned to its huge page size, -ENOMEM when the new vma cannot be
 * allocated, or the error reported by vma_dup_policy(), anon_vma_clone()
 * or vma_adjust().
 */
static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
	      unsigned long addr, int new_below)
{
	struct vm_area_struct *new;
	int err = -ENOMEM;

	/* A hugetlb vma may only be split on a huge page boundary */
	if (is_vm_hugetlb_page(vma) && (addr &
					~(huge_page_mask(hstate_vma(vma)))))
		return -EINVAL;

	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (!new)
		goto out_err;

	/* most fields are the same, copy all, and then fixup */
	*new = *vma;

	/* The struct copy above duplicated the list head; start with an empty one */
	INIT_LIST_HEAD(&new->anon_vma_chain);

	if (new_below)
		new->vm_end = addr;
	else {
		new->vm_start = addr;
		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
	}

	err = vma_dup_policy(vma, new);
	if (err)
		goto out_free_vma;

	err = anon_vma_clone(new, vma);
	if (err)
		goto out_free_mpol;

	/* The new vma holds its own reference on the backing file */
	if (new->vm_file)
		get_file(new->vm_file);

	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);

	if (new_below)
		err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
			((addr - new->vm_start) >> PAGE_SHIFT), new);
	else
		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);

	/* Success. */
	if (!err)
		return 0;

	/* Clean everything up if vma_adjust failed. */
	if (new->vm_ops && new->vm_ops->close)
		new->vm_ops->close(new);
	if (new->vm_file)
		fput(new->vm_file);
	unlink_anon_vmas(new);
 out_free_mpol:
	mpol_put(vma_policy(new));
 out_free_vma:
	kmem_cache_free(vm_area_cachep, new);
 out_err:
	return err;
}
2510 | 2513 | ||
2511 | /* | 2514 | /* |
2512 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 2515 | * Split a vma into two pieces at address 'addr', a new vma is allocated |
2513 | * either for the first part or the tail. | 2516 | * either for the first part or the tail. |
2514 | */ | 2517 | */ |
2515 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | 2518 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, |
2516 | unsigned long addr, int new_below) | 2519 | unsigned long addr, int new_below) |
2517 | { | 2520 | { |
2518 | if (mm->map_count >= sysctl_max_map_count) | 2521 | if (mm->map_count >= sysctl_max_map_count) |
2519 | return -ENOMEM; | 2522 | return -ENOMEM; |
2520 | 2523 | ||
2521 | return __split_vma(mm, vma, addr, new_below); | 2524 | return __split_vma(mm, vma, addr, new_below); |
2522 | } | 2525 | } |
2523 | 2526 | ||
/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
 *
 * Unmaps [@start, @start + @len) from @mm, with @len rounded up to whole
 * pages.
 *
 * Returns 0 on success, including when no vma overlaps the range.
 * Returns -EINVAL when @start is not page aligned, @start is above
 * TASK_SIZE, @len exceeds TASK_SIZE - @start, or the rounded length is 0.
 * Returns -ENOMEM when the range lies strictly inside one vma and
 * mm->map_count has already reached sysctl_max_map_count.  Errors from
 * __split_vma() are passed through.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
	unsigned long end;
	struct vm_area_struct *vma, *prev, *last;

	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
		return -EINVAL;

	len = PAGE_ALIGN(len);
	if (len == 0)
		return -EINVAL;

	/* Find the first overlapping VMA */
	vma = find_vma(mm, start);
	if (!vma)
		return 0;
	prev = vma->vm_prev;
	/* we have  start < vma->vm_end  */

	/* if it doesn't overlap, we have nothing.. */
	end = start + len;
	if (vma->vm_start >= end)
		return 0;

	/*
	 * If we need to split any vma, do it now to save pain later.
	 *
	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
	 * unmapped vm_area_struct will remain in use: so lower split_vma
	 * places tmp vma above, and higher split_vma places tmp vma below.
	 */
	if (start > vma->vm_start) {
		int error;

		/*
		 * Make sure that map_count on return from munmap() will
		 * not exceed its limit; but let map_count go just above
		 * its limit temporarily, to help free resources as expected.
		 */
		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
			return -ENOMEM;

		error = __split_vma(mm, vma, start, 0);
		if (error)
			return error;
		/* The part of vma below start stays mapped and becomes prev */
		prev = vma;
	}

	/* Does it split the last one? */
	last = find_vma(mm, end);
	if (last && end > last->vm_start) {
		int error = __split_vma(mm, last, end, 1);
		if (error)
			return error;
	}
	/* Re-read the first vma to remove: the splits may have changed it */
	vma = prev ? prev->vm_next : mm->mmap;

	/*
	 * unlock any mlock()ed ranges before detaching vmas
	 */
	if (mm->locked_vm) {
		struct vm_area_struct *tmp = vma;
		while (tmp && tmp->vm_start < end) {
			if (tmp->vm_flags & VM_LOCKED) {
				mm->locked_vm -= vma_pages(tmp);
				munlock_vma_pages_all(tmp);
			}
			tmp = tmp->vm_next;
		}
	}

	/*
	 * Remove the vma's, and unmap the actual pages
	 */
	detach_vmas_to_be_unmapped(mm, vma, prev, end);
	unmap_region(mm, vma, prev, start, end);

	arch_unmap(mm, vma, start, end);

	/* Fix up all other VM information */
	remove_vma_list(mm, vma);

	return 0;
}
2613 | 2616 | ||
2614 | int vm_munmap(unsigned long start, size_t len) | 2617 | int vm_munmap(unsigned long start, size_t len) |
2615 | { | 2618 | { |
2616 | int ret; | 2619 | int ret; |
2617 | struct mm_struct *mm = current->mm; | 2620 | struct mm_struct *mm = current->mm; |
2618 | 2621 | ||
2619 | down_write(&mm->mmap_sem); | 2622 | down_write(&mm->mmap_sem); |
2620 | ret = do_munmap(mm, start, len); | 2623 | ret = do_munmap(mm, start, len); |
2621 | up_write(&mm->mmap_sem); | 2624 | up_write(&mm->mmap_sem); |
2622 | return ret; | 2625 | return ret; |
2623 | } | 2626 | } |
2624 | EXPORT_SYMBOL(vm_munmap); | 2627 | EXPORT_SYMBOL(vm_munmap); |
2625 | 2628 | ||
2626 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | 2629 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) |
2627 | { | 2630 | { |
2628 | profile_munmap(addr); | 2631 | profile_munmap(addr); |
2629 | return vm_munmap(addr, len); | 2632 | return vm_munmap(addr, len); |
2630 | } | 2633 | } |
2631 | 2634 | ||
/*
 * Debug check that @mm's mmap_sem is not available to readers.
 *
 * Only active under CONFIG_DEBUG_VM: if a read trylock on mmap_sem
 * succeeds, warn and drop the lock again.  Compiles to nothing otherwise.
 */
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_VM
	/* Expected case: the trylock fails and there is nothing to undo */
	if (likely(!down_read_trylock(&mm->mmap_sem)))
		return;

	WARN_ON(1);
	up_read(&mm->mmap_sem);
#endif
}
2641 | 2644 | ||
/*
 *  this is really a simplified "do_mmap".  it only handles
 *  anonymous maps.  eventually we may be able to do some
 *  brk-specific accounting here.
 *
 *  Maps @len bytes (rounded up to whole pages) of anonymous memory at
 *  @addr in the current task's mm, first unmapping anything already
 *  mapped in that range.
 *
 *  Returns @addr on success, including when the rounded length is zero.
 *  On failure returns the error from get_unmapped_area() or
 *  mlock_future_check(), or -ENOMEM.
 *
 *  The caller is expected to hold mm->mmap_sem for writing (see the
 *  verify_mm_writelocked() call below).
 */
static unsigned long do_brk(unsigned long addr, unsigned long len)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	unsigned long flags;
	struct rb_node **rb_link, *rb_parent;
	pgoff_t pgoff = addr >> PAGE_SHIFT;
	int error;

	len = PAGE_ALIGN(len);
	if (!len)
		return addr;

	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

	/* Any bits outside PAGE_MASK in the result are treated as an error */
	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
	if (error & ~PAGE_MASK)
		return error;

	error = mlock_future_check(mm, mm->def_flags, len);
	if (error)
		return error;

	/*
	 * mm->mmap_sem is required to protect against another thread
	 * changing the mappings in case we sleep.
	 */
	verify_mm_writelocked(mm);

	/*
	 * Clear old maps.  this also does some error checking for us
	 */
 munmap_back:
	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		/* Look the links up again now that the range has been unmapped */
		goto munmap_back;
	}

	/* Check against address space limits *after* clearing old maps... */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
		return -ENOMEM;

	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
		return -ENOMEM;

	/* Can we just expand an old private anonymous mapping? */
	vma = vma_merge(mm, prev, addr, addr + len, flags,
					NULL, NULL, pgoff, NULL);
	if (vma)
		goto out;

	/*
	 * create a vma struct for an anonymous mapping
	 */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		/* NOTE(review): presumably undoes the charge taken by security_vm_enough_memory_mm() above */
		vm_unacct_memory(len >> PAGE_SHIFT);
		return -ENOMEM;
	}

	INIT_LIST_HEAD(&vma->anon_vma_chain);
	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_pgoff = pgoff;
	vma->vm_flags = flags;
	vma->vm_page_prot = vm_get_page_prot(flags);
	vma_link(mm, vma, prev, rb_link, rb_parent);
out:
	/* Reached for both a merged and a freshly linked vma */
	perf_event_mmap(vma);
	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED)
		mm->locked_vm += (len >> PAGE_SHIFT);
	vma->vm_flags |= VM_SOFTDIRTY;
	return addr;
}
2727 | 2730 | ||
2728 | unsigned long vm_brk(unsigned long addr, unsigned long len) | 2731 | unsigned long vm_brk(unsigned long addr, unsigned long len) |
2729 | { | 2732 | { |
2730 | struct mm_struct *mm = current->mm; | 2733 | struct mm_struct *mm = current->mm; |
2731 | unsigned long ret; | 2734 | unsigned long ret; |
2732 | bool populate; | 2735 | bool populate; |
2733 | 2736 | ||
2734 | down_write(&mm->mmap_sem); | 2737 | down_write(&mm->mmap_sem); |
2735 | ret = do_brk(addr, len); | 2738 | ret = do_brk(addr, len); |
2736 | populate = ((mm->def_flags & VM_LOCKED) != 0); | 2739 | populate = ((mm->def_flags & VM_LOCKED) != 0); |
2737 | up_write(&mm->mmap_sem); | 2740 | up_write(&mm->mmap_sem); |
2738 | if (populate) | 2741 | if (populate) |
2739 | mm_populate(addr, len); | 2742 | mm_populate(addr, len); |
2740 | return ret; | 2743 | return ret; |
2741 | } | 2744 | } |
2742 | EXPORT_SYMBOL(vm_brk); | 2745 | EXPORT_SYMBOL(vm_brk); |
2743 | 2746 | ||
/*
 * Release all mmaps.
 *
 * Tears down every vma of @mm: munlocks VM_LOCKED vmas, unmaps all pages,
 * frees the page tables, then removes each vma and returns the pages of
 * VM_ACCOUNT vmas through vm_unacct_memory().
 */
void exit_mmap(struct mm_struct *mm)
{
	struct mmu_gather tlb;
	struct vm_area_struct *vma;
	unsigned long nr_accounted = 0;

	/* mm's last user has gone, and its about to be pulled down */
	mmu_notifier_release(mm);

	/* Munlock every VM_LOCKED vma before anything is unmapped */
	if (mm->locked_vm) {
		vma = mm->mmap;
		while (vma) {
			if (vma->vm_flags & VM_LOCKED)
				munlock_vma_pages_all(vma);
			vma = vma->vm_next;
		}
	}

	arch_exit_mmap(mm);

	vma = mm->mmap;
	if (!vma)	/* Can happen if dup_mmap() received an OOM */
		return;

	lru_add_drain();
	flush_cache_mm(mm);
	tlb_gather_mmu(&tlb, mm, 0, -1);
	/* update_hiwater_rss(mm) here? but nobody should be looking */
	/* Use -1 here to ensure all VMAs in the mm are unmapped */
	unmap_vmas(&tlb, vma, 0, -1);

	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
	tlb_finish_mmu(&tlb, 0, -1);

	/*
	 * Walk the list again, actually closing and freeing it,
	 * with preemption enabled, without holding any MM locks.
	 */
	while (vma) {
		if (vma->vm_flags & VM_ACCOUNT)
			nr_accounted += vma_pages(vma);
		vma = remove_vma(vma);
	}
	vm_unacct_memory(nr_accounted);

	/* Warn if more page tables remain accounted than this bound allows */
	WARN_ON(atomic_long_read(&mm->nr_ptes) >
			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
2793 | 2796 | ||
2794 | /* Insert vm structure into process list sorted by address | 2797 | /* Insert vm structure into process list sorted by address |
2795 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2798 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2796 | * then i_mmap_rwsem is taken here. | 2799 | * then i_mmap_rwsem is taken here. |
2797 | */ | 2800 | */ |
2798 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 2801 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2799 | { | 2802 | { |
2800 | struct vm_area_struct *prev; | 2803 | struct vm_area_struct *prev; |
2801 | struct rb_node **rb_link, *rb_parent; | 2804 | struct rb_node **rb_link, *rb_parent; |
2802 | 2805 | ||
2803 | /* | 2806 | /* |
2804 | * The vm_pgoff of a purely anonymous vma should be irrelevant | 2807 | * The vm_pgoff of a purely anonymous vma should be irrelevant |
2805 | * until its first write fault, when page's anon_vma and index | 2808 | * until its first write fault, when page's anon_vma and index |
2806 | * are set. But now set the vm_pgoff it will almost certainly | 2809 | * are set. But now set the vm_pgoff it will almost certainly |
2807 | * end up with (unless mremap moves it elsewhere before that | 2810 | * end up with (unless mremap moves it elsewhere before that |
2808 | * first wfault), so /proc/pid/maps tells a consistent story. | 2811 | * first wfault), so /proc/pid/maps tells a consistent story. |
2809 | * | 2812 | * |
2810 | * By setting it to reflect the virtual start address of the | 2813 | * By setting it to reflect the virtual start address of the |
2811 | * vma, merges and splits can happen in a seamless way, just | 2814 | * vma, merges and splits can happen in a seamless way, just |
2812 | * using the existing file pgoff checks and manipulations. | 2815 | * using the existing file pgoff checks and manipulations. |
2813 | * Similarly in do_mmap_pgoff and in do_brk. | 2816 | * Similarly in do_mmap_pgoff and in do_brk. |
2814 | */ | 2817 | */ |
2815 | if (!vma->vm_file) { | 2818 | if (!vma->vm_file) { |
2816 | BUG_ON(vma->anon_vma); | 2819 | BUG_ON(vma->anon_vma); |
2817 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | 2820 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; |
2818 | } | 2821 | } |
2819 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, | 2822 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
2820 | &prev, &rb_link, &rb_parent)) | 2823 | &prev, &rb_link, &rb_parent)) |
2821 | return -ENOMEM; | 2824 | return -ENOMEM; |
2822 | if ((vma->vm_flags & VM_ACCOUNT) && | 2825 | if ((vma->vm_flags & VM_ACCOUNT) && |
2823 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2826 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
2824 | return -ENOMEM; | 2827 | return -ENOMEM; |
2825 | 2828 | ||
2826 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2829 | vma_link(mm, vma, prev, rb_link, rb_parent); |
2827 | return 0; | 2830 | return 0; |
2828 | } | 2831 | } |
2829 | 2832 | ||
2830 | /* | 2833 | /* |
2831 | * Copy the vma structure to a new location in the same mm, | 2834 | * Copy the vma structure to a new location in the same mm, |
2832 | * prior to moving page table entries, to effect an mremap move. | 2835 | * prior to moving page table entries, to effect an mremap move. |
2833 | */ | 2836 | */ |
2834 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | 2837 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, |
2835 | unsigned long addr, unsigned long len, pgoff_t pgoff, | 2838 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
2836 | bool *need_rmap_locks) | 2839 | bool *need_rmap_locks) |
2837 | { | 2840 | { |
2838 | struct vm_area_struct *vma = *vmap; | 2841 | struct vm_area_struct *vma = *vmap; |
2839 | unsigned long vma_start = vma->vm_start; | 2842 | unsigned long vma_start = vma->vm_start; |
2840 | struct mm_struct *mm = vma->vm_mm; | 2843 | struct mm_struct *mm = vma->vm_mm; |
2841 | struct vm_area_struct *new_vma, *prev; | 2844 | struct vm_area_struct *new_vma, *prev; |
2842 | struct rb_node **rb_link, *rb_parent; | 2845 | struct rb_node **rb_link, *rb_parent; |
2843 | bool faulted_in_anon_vma = true; | 2846 | bool faulted_in_anon_vma = true; |
2844 | 2847 | ||
2845 | /* | 2848 | /* |
2846 | * If anonymous vma has not yet been faulted, update new pgoff | 2849 | * If anonymous vma has not yet been faulted, update new pgoff |
2847 | * to match new location, to increase its chance of merging. | 2850 | * to match new location, to increase its chance of merging. |
2848 | */ | 2851 | */ |
2849 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { | 2852 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { |
2850 | pgoff = addr >> PAGE_SHIFT; | 2853 | pgoff = addr >> PAGE_SHIFT; |
2851 | faulted_in_anon_vma = false; | 2854 | faulted_in_anon_vma = false; |
2852 | } | 2855 | } |
2853 | 2856 | ||
2854 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) | 2857 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
2855 | return NULL; /* should never get here */ | 2858 | return NULL; /* should never get here */ |
2856 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2859 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2857 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 2860 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); |
2858 | if (new_vma) { | 2861 | if (new_vma) { |
2859 | /* | 2862 | /* |
2860 | * Source vma may have been merged into new_vma | 2863 | * Source vma may have been merged into new_vma |
2861 | */ | 2864 | */ |
2862 | if (unlikely(vma_start >= new_vma->vm_start && | 2865 | if (unlikely(vma_start >= new_vma->vm_start && |
2863 | vma_start < new_vma->vm_end)) { | 2866 | vma_start < new_vma->vm_end)) { |
2864 | /* | 2867 | /* |
2865 | * The only way we can get a vma_merge with | 2868 | * The only way we can get a vma_merge with |
2866 | * self during an mremap is if the vma hasn't | 2869 | * self during an mremap is if the vma hasn't |
2867 | * been faulted in yet and we were allowed to | 2870 | * been faulted in yet and we were allowed to |
2868 | * reset the dst vma->vm_pgoff to the | 2871 | * reset the dst vma->vm_pgoff to the |
2869 | * destination address of the mremap to allow | 2872 | * destination address of the mremap to allow |
2870 | * the merge to happen. mremap must change the | 2873 | * the merge to happen. mremap must change the |
2871 | * vm_pgoff linearity between src and dst vmas | 2874 | * vm_pgoff linearity between src and dst vmas |
2872 | * (in turn preventing a vma_merge) to be | 2875 | * (in turn preventing a vma_merge) to be |
2873 | * safe. It is only safe to keep the vm_pgoff | 2876 | * safe. It is only safe to keep the vm_pgoff |
2874 | * linear if there are no pages mapped yet. | 2877 | * linear if there are no pages mapped yet. |
2875 | */ | 2878 | */ |
2876 | VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); | 2879 | VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); |
2877 | *vmap = vma = new_vma; | 2880 | *vmap = vma = new_vma; |
2878 | } | 2881 | } |
2879 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); | 2882 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
2880 | } else { | 2883 | } else { |
2881 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2884 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2882 | if (new_vma) { | 2885 | if (new_vma) { |
2883 | *new_vma = *vma; | 2886 | *new_vma = *vma; |
2884 | new_vma->vm_start = addr; | 2887 | new_vma->vm_start = addr; |
2885 | new_vma->vm_end = addr + len; | 2888 | new_vma->vm_end = addr + len; |
2886 | new_vma->vm_pgoff = pgoff; | 2889 | new_vma->vm_pgoff = pgoff; |
2887 | if (vma_dup_policy(vma, new_vma)) | 2890 | if (vma_dup_policy(vma, new_vma)) |
2888 | goto out_free_vma; | 2891 | goto out_free_vma; |
2889 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2892 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2890 | if (anon_vma_clone(new_vma, vma)) | 2893 | if (anon_vma_clone(new_vma, vma)) |
2891 | goto out_free_mempol; | 2894 | goto out_free_mempol; |
2892 | if (new_vma->vm_file) | 2895 | if (new_vma->vm_file) |
2893 | get_file(new_vma->vm_file); | 2896 | get_file(new_vma->vm_file); |
2894 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2897 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2895 | new_vma->vm_ops->open(new_vma); | 2898 | new_vma->vm_ops->open(new_vma); |
2896 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2899 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
2897 | *need_rmap_locks = false; | 2900 | *need_rmap_locks = false; |
2898 | } | 2901 | } |
2899 | } | 2902 | } |
2900 | return new_vma; | 2903 | return new_vma; |
2901 | 2904 | ||
2902 | out_free_mempol: | 2905 | out_free_mempol: |
2903 | mpol_put(vma_policy(new_vma)); | 2906 | mpol_put(vma_policy(new_vma)); |
2904 | out_free_vma: | 2907 | out_free_vma: |
2905 | kmem_cache_free(vm_area_cachep, new_vma); | 2908 | kmem_cache_free(vm_area_cachep, new_vma); |
2906 | return NULL; | 2909 | return NULL; |
2907 | } | 2910 | } |
2908 | 2911 | ||
2909 | /* | 2912 | /* |
2910 | * Return true if the calling process may expand its vm space by the passed | 2913 | * Return true if the calling process may expand its vm space by the passed |
2911 | * number of pages | 2914 | * number of pages |
2912 | */ | 2915 | */ |
2913 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) | 2916 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) |
2914 | { | 2917 | { |
2915 | unsigned long cur = mm->total_vm; /* pages */ | 2918 | unsigned long cur = mm->total_vm; /* pages */ |
2916 | unsigned long lim; | 2919 | unsigned long lim; |
2917 | 2920 | ||
2918 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; | 2921 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; |
2919 | 2922 | ||
2920 | if (cur + npages > lim) | 2923 | if (cur + npages > lim) |
2921 | return 0; | 2924 | return 0; |
2922 | return 1; | 2925 | return 1; |
2923 | } | 2926 | } |
2924 | 2927 | ||
/*
 * Declared ahead of its definition because the special-mapping
 * vm_operations tables below take its address.
 */
static int
special_mapping_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
2927 | 2930 | ||
/*
 * Intentionally a no-op.  The hook exists only because a vma that has a
 * ->close operation is never merged, whatever its flags are.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
	/* nothing to release */
}
2934 | 2937 | ||
2935 | static const char *special_mapping_name(struct vm_area_struct *vma) | 2938 | static const char *special_mapping_name(struct vm_area_struct *vma) |
2936 | { | 2939 | { |
2937 | return ((struct vm_special_mapping *)vma->vm_private_data)->name; | 2940 | return ((struct vm_special_mapping *)vma->vm_private_data)->name; |
2938 | } | 2941 | } |
2939 | 2942 | ||
2940 | static const struct vm_operations_struct special_mapping_vmops = { | 2943 | static const struct vm_operations_struct special_mapping_vmops = { |
2941 | .close = special_mapping_close, | 2944 | .close = special_mapping_close, |
2942 | .fault = special_mapping_fault, | 2945 | .fault = special_mapping_fault, |
2943 | .name = special_mapping_name, | 2946 | .name = special_mapping_name, |
2944 | }; | 2947 | }; |
2945 | 2948 | ||
2946 | static const struct vm_operations_struct legacy_special_mapping_vmops = { | 2949 | static const struct vm_operations_struct legacy_special_mapping_vmops = { |
2947 | .close = special_mapping_close, | 2950 | .close = special_mapping_close, |
2948 | .fault = special_mapping_fault, | 2951 | .fault = special_mapping_fault, |
2949 | }; | 2952 | }; |
2950 | 2953 | ||
2951 | static int special_mapping_fault(struct vm_area_struct *vma, | 2954 | static int special_mapping_fault(struct vm_area_struct *vma, |
2952 | struct vm_fault *vmf) | 2955 | struct vm_fault *vmf) |
2953 | { | 2956 | { |
2954 | pgoff_t pgoff; | 2957 | pgoff_t pgoff; |
2955 | struct page **pages; | 2958 | struct page **pages; |
2956 | 2959 | ||
2957 | /* | 2960 | /* |
2958 | * special mappings have no vm_file, and in that case, the mm | 2961 | * special mappings have no vm_file, and in that case, the mm |
2959 | * uses vm_pgoff internally. So we have to subtract it from here. | 2962 | * uses vm_pgoff internally. So we have to subtract it from here. |
2960 | * We are allowed to do this because we are the mm; do not copy | 2963 | * We are allowed to do this because we are the mm; do not copy |
2961 | * this code into drivers! | 2964 | * this code into drivers! |
2962 | */ | 2965 | */ |
2963 | pgoff = vmf->pgoff - vma->vm_pgoff; | 2966 | pgoff = vmf->pgoff - vma->vm_pgoff; |
2964 | 2967 | ||
2965 | if (vma->vm_ops == &legacy_special_mapping_vmops) | 2968 | if (vma->vm_ops == &legacy_special_mapping_vmops) |
2966 | pages = vma->vm_private_data; | 2969 | pages = vma->vm_private_data; |
2967 | else | 2970 | else |
2968 | pages = ((struct vm_special_mapping *)vma->vm_private_data)-> | 2971 | pages = ((struct vm_special_mapping *)vma->vm_private_data)-> |
2969 | pages; | 2972 | pages; |
2970 | 2973 | ||
2971 | for (; pgoff && *pages; ++pages) | 2974 | for (; pgoff && *pages; ++pages) |
2972 | pgoff--; | 2975 | pgoff--; |
2973 | 2976 | ||
2974 | if (*pages) { | 2977 | if (*pages) { |
2975 | struct page *page = *pages; | 2978 | struct page *page = *pages; |
2976 | get_page(page); | 2979 | get_page(page); |
2977 | vmf->page = page; | 2980 | vmf->page = page; |
2978 | return 0; | 2981 | return 0; |
2979 | } | 2982 | } |
2980 | 2983 | ||
2981 | return VM_FAULT_SIGBUS; | 2984 | return VM_FAULT_SIGBUS; |
2982 | } | 2985 | } |
2983 | 2986 | ||
2984 | static struct vm_area_struct *__install_special_mapping( | 2987 | static struct vm_area_struct *__install_special_mapping( |
2985 | struct mm_struct *mm, | 2988 | struct mm_struct *mm, |
2986 | unsigned long addr, unsigned long len, | 2989 | unsigned long addr, unsigned long len, |
2987 | unsigned long vm_flags, const struct vm_operations_struct *ops, | 2990 | unsigned long vm_flags, const struct vm_operations_struct *ops, |
2988 | void *priv) | 2991 | void *priv) |
2989 | { | 2992 | { |
2990 | int ret; | 2993 | int ret; |
2991 | struct vm_area_struct *vma; | 2994 | struct vm_area_struct *vma; |
2992 | 2995 | ||
2993 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2996 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
2994 | if (unlikely(vma == NULL)) | 2997 | if (unlikely(vma == NULL)) |
2995 | return ERR_PTR(-ENOMEM); | 2998 | return ERR_PTR(-ENOMEM); |
2996 | 2999 | ||
2997 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 3000 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
2998 | vma->vm_mm = mm; | 3001 | vma->vm_mm = mm; |
2999 | vma->vm_start = addr; | 3002 | vma->vm_start = addr; |
3000 | vma->vm_end = addr + len; | 3003 | vma->vm_end = addr + len; |
3001 | 3004 | ||
3002 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; | 3005 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; |
3003 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 3006 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
3004 | 3007 | ||
3005 | vma->vm_ops = ops; | 3008 | vma->vm_ops = ops; |
3006 | vma->vm_private_data = priv; | 3009 | vma->vm_private_data = priv; |
3007 | 3010 | ||
3008 | ret = insert_vm_struct(mm, vma); | 3011 | ret = insert_vm_struct(mm, vma); |
3009 | if (ret) | 3012 | if (ret) |
3010 | goto out; | 3013 | goto out; |
3011 | 3014 | ||
3012 | mm->total_vm += len >> PAGE_SHIFT; | 3015 | mm->total_vm += len >> PAGE_SHIFT; |
3013 | 3016 | ||
3014 | perf_event_mmap(vma); | 3017 | perf_event_mmap(vma); |
3015 | 3018 | ||
3016 | return vma; | 3019 | return vma; |
3017 | 3020 | ||
3018 | out: | 3021 | out: |
3019 | kmem_cache_free(vm_area_cachep, vma); | 3022 | kmem_cache_free(vm_area_cachep, vma); |
3020 | return ERR_PTR(ret); | 3023 | return ERR_PTR(ret); |
3021 | } | 3024 | } |
3022 | 3025 | ||
3023 | /* | 3026 | /* |
3024 | * Called with mm->mmap_sem held for writing. | 3027 | * Called with mm->mmap_sem held for writing. |
3025 | * Insert a new vma covering the given region, with the given flags. | 3028 | * Insert a new vma covering the given region, with the given flags. |
3026 | * Its pages are supplied by the given array of struct page *. | 3029 | * Its pages are supplied by the given array of struct page *. |
3027 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. | 3030 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. |
3028 | * The region past the last page supplied will always produce SIGBUS. | 3031 | * The region past the last page supplied will always produce SIGBUS. |
3029 | * The array pointer and the pages it points to are assumed to stay alive | 3032 | * The array pointer and the pages it points to are assumed to stay alive |
3030 | * for as long as this mapping might exist. | 3033 | * for as long as this mapping might exist. |
3031 | */ | 3034 | */ |
3032 | struct vm_area_struct *_install_special_mapping( | 3035 | struct vm_area_struct *_install_special_mapping( |
3033 | struct mm_struct *mm, | 3036 | struct mm_struct *mm, |
3034 | unsigned long addr, unsigned long len, | 3037 | unsigned long addr, unsigned long len, |
3035 | unsigned long vm_flags, const struct vm_special_mapping *spec) | 3038 | unsigned long vm_flags, const struct vm_special_mapping *spec) |
3036 | { | 3039 | { |
3037 | return __install_special_mapping(mm, addr, len, vm_flags, | 3040 | return __install_special_mapping(mm, addr, len, vm_flags, |
3038 | &special_mapping_vmops, (void *)spec); | 3041 | &special_mapping_vmops, (void *)spec); |
3039 | } | 3042 | } |
3040 | 3043 | ||
3041 | int install_special_mapping(struct mm_struct *mm, | 3044 | int install_special_mapping(struct mm_struct *mm, |
3042 | unsigned long addr, unsigned long len, | 3045 | unsigned long addr, unsigned long len, |
3043 | unsigned long vm_flags, struct page **pages) | 3046 | unsigned long vm_flags, struct page **pages) |
3044 | { | 3047 | { |
3045 | struct vm_area_struct *vma = __install_special_mapping( | 3048 | struct vm_area_struct *vma = __install_special_mapping( |
3046 | mm, addr, len, vm_flags, &legacy_special_mapping_vmops, | 3049 | mm, addr, len, vm_flags, &legacy_special_mapping_vmops, |
3047 | (void *)pages); | 3050 | (void *)pages); |
3048 | 3051 | ||
3049 | return PTR_ERR_OR_ZERO(vma); | 3052 | return PTR_ERR_OR_ZERO(vma); |
3050 | } | 3053 | } |
3051 | 3054 | ||
/*
 * Taken by mm_take_all_locks() and released by mm_drop_all_locks(), so it
 * is held for as long as a task holds all the locks of an mm.
 */
static DEFINE_MUTEX(mm_all_locks_mutex);
3053 | 3056 | ||
3054 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | 3057 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) |
3055 | { | 3058 | { |
3056 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { | 3059 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
3057 | /* | 3060 | /* |
3058 | * The LSB of head.next can't change from under us | 3061 | * The LSB of head.next can't change from under us |
3059 | * because we hold the mm_all_locks_mutex. | 3062 | * because we hold the mm_all_locks_mutex. |
3060 | */ | 3063 | */ |
3061 | down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); | 3064 | down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); |
3062 | /* | 3065 | /* |
3063 | * We can safely modify head.next after taking the | 3066 | * We can safely modify head.next after taking the |
3064 | * anon_vma->root->rwsem. If some other vma in this mm shares | 3067 | * anon_vma->root->rwsem. If some other vma in this mm shares |
3065 | * the same anon_vma we won't take it again. | 3068 | * the same anon_vma we won't take it again. |
3066 | * | 3069 | * |
3067 | * No need of atomic instructions here, head.next | 3070 | * No need of atomic instructions here, head.next |
3068 | * can't change from under us thanks to the | 3071 | * can't change from under us thanks to the |
3069 | * anon_vma->root->rwsem. | 3072 | * anon_vma->root->rwsem. |
3070 | */ | 3073 | */ |
3071 | if (__test_and_set_bit(0, (unsigned long *) | 3074 | if (__test_and_set_bit(0, (unsigned long *) |
3072 | &anon_vma->root->rb_root.rb_node)) | 3075 | &anon_vma->root->rb_root.rb_node)) |
3073 | BUG(); | 3076 | BUG(); |
3074 | } | 3077 | } |
3075 | } | 3078 | } |
3076 | 3079 | ||
3077 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | 3080 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) |
3078 | { | 3081 | { |
3079 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | 3082 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { |
3080 | /* | 3083 | /* |
3081 | * AS_MM_ALL_LOCKS can't change from under us because | 3084 | * AS_MM_ALL_LOCKS can't change from under us because |
3082 | * we hold the mm_all_locks_mutex. | 3085 | * we hold the mm_all_locks_mutex. |
3083 | * | 3086 | * |
3084 | * Operations on ->flags have to be atomic because | 3087 | * Operations on ->flags have to be atomic because |
3085 | * even if AS_MM_ALL_LOCKS is stable thanks to the | 3088 | * even if AS_MM_ALL_LOCKS is stable thanks to the |
3086 | * mm_all_locks_mutex, there may be other cpus | 3089 | * mm_all_locks_mutex, there may be other cpus |
3087 | * changing other bitflags in parallel to us. | 3090 | * changing other bitflags in parallel to us. |
3088 | */ | 3091 | */ |
3089 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | 3092 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
3090 | BUG(); | 3093 | BUG(); |
3091 | down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); | 3094 | down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); |
3092 | } | 3095 | } |
3093 | } | 3096 | } |
3094 | 3097 | ||
3095 | /* | 3098 | /* |
3096 | * This operation locks against the VM for all pte/vma/mm related | 3099 | * This operation locks against the VM for all pte/vma/mm related |
3097 | * operations that could ever happen on a certain mm. This includes | 3100 | * operations that could ever happen on a certain mm. This includes |
3098 | * vmtruncate, try_to_unmap, and all page faults. | 3101 | * vmtruncate, try_to_unmap, and all page faults. |
3099 | * | 3102 | * |
3100 | * The caller must take the mmap_sem in write mode before calling | 3103 | * The caller must take the mmap_sem in write mode before calling |
3101 | * mm_take_all_locks(). The caller isn't allowed to release the | 3104 | * mm_take_all_locks(). The caller isn't allowed to release the |
3102 | * mmap_sem until mm_drop_all_locks() returns. | 3105 | * mmap_sem until mm_drop_all_locks() returns. |
3103 | * | 3106 | * |
3104 | * mmap_sem in write mode is required in order to block all operations | 3107 | * mmap_sem in write mode is required in order to block all operations |
3105 | * that could modify pagetables and free pages without need of | 3108 | * that could modify pagetables and free pages without need of |
3106 | * altering the vma layout (for example populate_range() with | 3109 | * altering the vma layout (for example populate_range() with |
3107 | * nonlinear vmas). It's also needed in write mode to avoid new | 3110 | * nonlinear vmas). It's also needed in write mode to avoid new |
3108 | * anon_vmas to be associated with existing vmas. | 3111 | * anon_vmas to be associated with existing vmas. |
3109 | * | 3112 | * |
3110 | * A single task can't take more than one mm_take_all_locks() in a row | 3113 | * A single task can't take more than one mm_take_all_locks() in a row |
3111 | * or it would deadlock. | 3114 | * or it would deadlock. |
3112 | * | 3115 | * |
3113 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in | 3116 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in |
3114 | * mapping->flags avoid to take the same lock twice, if more than one | 3117 | * mapping->flags avoid to take the same lock twice, if more than one |
3115 | * vma in this mm is backed by the same anon_vma or address_space. | 3118 | * vma in this mm is backed by the same anon_vma or address_space. |
3116 | * | 3119 | * |
3117 | * We can take all the locks in random order because the VM code | 3120 | * We can take all the locks in random order because the VM code |
3118 | * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never | 3121 | * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never |
3119 | * takes more than one of them in a row. Secondly we're protected | 3122 | * takes more than one of them in a row. Secondly we're protected |
3120 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 3123 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. |
3121 | * | 3124 | * |
3122 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations | 3125 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations |
3123 | * that may have to take thousand of locks. | 3126 | * that may have to take thousand of locks. |
3124 | * | 3127 | * |
3125 | * mm_take_all_locks() can fail if it's interrupted by signals. | 3128 | * mm_take_all_locks() can fail if it's interrupted by signals. |
3126 | */ | 3129 | */ |
3127 | int mm_take_all_locks(struct mm_struct *mm) | 3130 | int mm_take_all_locks(struct mm_struct *mm) |
3128 | { | 3131 | { |
3129 | struct vm_area_struct *vma; | 3132 | struct vm_area_struct *vma; |
3130 | struct anon_vma_chain *avc; | 3133 | struct anon_vma_chain *avc; |
3131 | 3134 | ||
3132 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 3135 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
3133 | 3136 | ||
3134 | mutex_lock(&mm_all_locks_mutex); | 3137 | mutex_lock(&mm_all_locks_mutex); |
3135 | 3138 | ||
3136 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 3139 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
3137 | if (signal_pending(current)) | 3140 | if (signal_pending(current)) |
3138 | goto out_unlock; | 3141 | goto out_unlock; |
3139 | if (vma->vm_file && vma->vm_file->f_mapping) | 3142 | if (vma->vm_file && vma->vm_file->f_mapping) |
3140 | vm_lock_mapping(mm, vma->vm_file->f_mapping); | 3143 | vm_lock_mapping(mm, vma->vm_file->f_mapping); |
3141 | } | 3144 | } |
3142 | 3145 | ||
3143 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 3146 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
3144 | if (signal_pending(current)) | 3147 | if (signal_pending(current)) |
3145 | goto out_unlock; | 3148 | goto out_unlock; |
3146 | if (vma->anon_vma) | 3149 | if (vma->anon_vma) |
3147 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 3150 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
3148 | vm_lock_anon_vma(mm, avc->anon_vma); | 3151 | vm_lock_anon_vma(mm, avc->anon_vma); |
3149 | } | 3152 | } |
3150 | 3153 | ||
3151 | return 0; | 3154 | return 0; |
3152 | 3155 | ||
3153 | out_unlock: | 3156 | out_unlock: |
3154 | mm_drop_all_locks(mm); | 3157 | mm_drop_all_locks(mm); |
3155 | return -EINTR; | 3158 | return -EINTR; |
3156 | } | 3159 | } |
3157 | 3160 | ||
3158 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 3161 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
3159 | { | 3162 | { |
3160 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { | 3163 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
3161 | /* | 3164 | /* |
3162 | * The LSB of head.next can't change to 0 from under | 3165 | * The LSB of head.next can't change to 0 from under |
3163 | * us because we hold the mm_all_locks_mutex. | 3166 | * us because we hold the mm_all_locks_mutex. |
3164 | * | 3167 | * |
3165 | * We must however clear the bitflag before unlocking | 3168 | * We must however clear the bitflag before unlocking |
3166 | * the vma so the users using the anon_vma->rb_root will | 3169 | * the vma so the users using the anon_vma->rb_root will |
3167 | * never see our bitflag. | 3170 | * never see our bitflag. |
3168 | * | 3171 | * |
3169 | * No need of atomic instructions here, head.next | 3172 | * No need of atomic instructions here, head.next |
3170 | * can't change from under us until we release the | 3173 | * can't change from under us until we release the |
3171 | * anon_vma->root->rwsem. | 3174 | * anon_vma->root->rwsem. |
3172 | */ | 3175 | */ |
3173 | if (!__test_and_clear_bit(0, (unsigned long *) | 3176 | if (!__test_and_clear_bit(0, (unsigned long *) |
3174 | &anon_vma->root->rb_root.rb_node)) | 3177 | &anon_vma->root->rb_root.rb_node)) |
3175 | BUG(); | 3178 | BUG(); |
3176 | anon_vma_unlock_write(anon_vma); | 3179 | anon_vma_unlock_write(anon_vma); |
3177 | } | 3180 | } |
3178 | } | 3181 | } |
3179 | 3182 | ||
3180 | static void vm_unlock_mapping(struct address_space *mapping) | 3183 | static void vm_unlock_mapping(struct address_space *mapping) |
3181 | { | 3184 | { |
3182 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | 3185 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { |
3183 | /* | 3186 | /* |
3184 | * AS_MM_ALL_LOCKS can't change to 0 from under us | 3187 | * AS_MM_ALL_LOCKS can't change to 0 from under us |
3185 | * because we hold the mm_all_locks_mutex. | 3188 | * because we hold the mm_all_locks_mutex. |
3186 | */ | 3189 | */ |
3187 | i_mmap_unlock_write(mapping); | 3190 | i_mmap_unlock_write(mapping); |
3188 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | 3191 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, |
3189 | &mapping->flags)) | 3192 | &mapping->flags)) |
3190 | BUG(); | 3193 | BUG(); |
3191 | } | 3194 | } |
3192 | } | 3195 | } |
3193 | 3196 | ||
3194 | /* | 3197 | /* |
3195 | * The mmap_sem cannot be released by the caller until | 3198 | * The mmap_sem cannot be released by the caller until |
3196 | * mm_drop_all_locks() returns. | 3199 | * mm_drop_all_locks() returns. |
3197 | */ | 3200 | */ |
3198 | void mm_drop_all_locks(struct mm_struct *mm) | 3201 | void mm_drop_all_locks(struct mm_struct *mm) |
3199 | { | 3202 | { |
3200 | struct vm_area_struct *vma; | 3203 | struct vm_area_struct *vma; |
3201 | struct anon_vma_chain *avc; | 3204 | struct anon_vma_chain *avc; |
3202 | 3205 | ||
3203 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 3206 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
3204 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 3207 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
3205 | 3208 | ||
3206 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 3209 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
3207 | if (vma->anon_vma) | 3210 | if (vma->anon_vma) |
3208 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 3211 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
3209 | vm_unlock_anon_vma(avc->anon_vma); | 3212 | vm_unlock_anon_vma(avc->anon_vma); |
3210 | if (vma->vm_file && vma->vm_file->f_mapping) | 3213 | if (vma->vm_file && vma->vm_file->f_mapping) |
3211 | vm_unlock_mapping(vma->vm_file->f_mapping); | 3214 | vm_unlock_mapping(vma->vm_file->f_mapping); |
3212 | } | 3215 | } |
3213 | 3216 | ||
3214 | mutex_unlock(&mm_all_locks_mutex); | 3217 | mutex_unlock(&mm_all_locks_mutex); |
3215 | } | 3218 | } |
3216 | 3219 | ||
3217 | /* | 3220 | /* |
3218 | * initialise the VMA slab | 3221 | * initialise the VMA slab |
3219 | */ | 3222 | */ |
3220 | void __init mmap_init(void) | 3223 | void __init mmap_init(void) |
3221 | { | 3224 | { |
3222 | int ret; | 3225 | int ret; |
3223 | 3226 | ||
3224 | ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); | 3227 | ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); |
3225 | VM_BUG_ON(ret); | 3228 | VM_BUG_ON(ret); |
3226 | } | 3229 | } |
3227 | 3230 | ||
3228 | /* | 3231 | /* |
3229 | * Initialise sysctl_user_reserve_kbytes. | 3232 | * Initialise sysctl_user_reserve_kbytes. |
3230 | * | 3233 | * |
3231 | * This is intended to prevent a user from starting a single memory hogging | 3234 | * This is intended to prevent a user from starting a single memory hogging |
3232 | * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER | 3235 | * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER |
3233 | * mode. | 3236 | * mode. |
3234 | * | 3237 | * |
3235 | * The default value is min(3% of free memory, 128MB) | 3238 | * The default value is min(3% of free memory, 128MB) |
3236 | * 128MB is enough to recover with sshd/login, bash, and top/kill. | 3239 | * 128MB is enough to recover with sshd/login, bash, and top/kill. |
3237 | */ | 3240 | */ |
3238 | static int init_user_reserve(void) | 3241 | static int init_user_reserve(void) |
3239 | { | 3242 | { |
3240 | unsigned long free_kbytes; | 3243 | unsigned long free_kbytes; |
3241 | 3244 | ||
3242 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3245 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3243 | 3246 | ||
3244 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | 3247 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); |
3245 | return 0; | 3248 | return 0; |
3246 | } | 3249 | } |
3247 | subsys_initcall(init_user_reserve); | 3250 | subsys_initcall(init_user_reserve); |
3248 | 3251 | ||
3249 | /* | 3252 | /* |
3250 | * Initialise sysctl_admin_reserve_kbytes. | 3253 | * Initialise sysctl_admin_reserve_kbytes. |
3251 | * | 3254 | * |
3252 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin | 3255 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin |
3253 | * to log in and kill a memory hogging process. | 3256 | * to log in and kill a memory hogging process. |
3254 | * | 3257 | * |
3255 | * Systems with more than 256MB will reserve 8MB, enough to recover | 3258 | * Systems with more than 256MB will reserve 8MB, enough to recover |
3256 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will | 3259 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will |
3257 | * only reserve 3% of free pages by default. | 3260 | * only reserve 3% of free pages by default. |
3258 | */ | 3261 | */ |
3259 | static int init_admin_reserve(void) | 3262 | static int init_admin_reserve(void) |
3260 | { | 3263 | { |
3261 | unsigned long free_kbytes; | 3264 | unsigned long free_kbytes; |
3262 | 3265 | ||
3263 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3266 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3264 | 3267 | ||
3265 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | 3268 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); |
3266 | return 0; | 3269 | return 0; |
3267 | } | 3270 | } |
3268 | subsys_initcall(init_admin_reserve); | 3271 | subsys_initcall(init_admin_reserve); |
3269 | 3272 | ||
3270 | /* | 3273 | /* |
3271 | * Reinititalise user and admin reserves if memory is added or removed. | 3274 | * Reinititalise user and admin reserves if memory is added or removed. |
3272 | * | 3275 | * |
3273 | * The default user reserve max is 128MB, and the default max for the | 3276 | * The default user reserve max is 128MB, and the default max for the |
3274 | * admin reserve is 8MB. These are usually, but not always, enough to | 3277 | * admin reserve is 8MB. These are usually, but not always, enough to |
3275 | * enable recovery from a memory hogging process using login/sshd, a shell, | 3278 | * enable recovery from a memory hogging process using login/sshd, a shell, |
3276 | * and tools like top. It may make sense to increase or even disable the | 3279 | * and tools like top. It may make sense to increase or even disable the |
3277 | * reserve depending on the existence of swap or variations in the recovery | 3280 | * reserve depending on the existence of swap or variations in the recovery |
3278 | * tools. So, the admin may have changed them. | 3281 | * tools. So, the admin may have changed them. |
3279 | * | 3282 | * |
3280 | * If memory is added and the reserves have been eliminated or increased above | 3283 | * If memory is added and the reserves have been eliminated or increased above |
3281 | * the default max, then we'll trust the admin. | 3284 | * the default max, then we'll trust the admin. |
3282 | * | 3285 | * |
3283 | * If memory is removed and there isn't enough free memory, then we | 3286 | * If memory is removed and there isn't enough free memory, then we |
3284 | * need to reset the reserves. | 3287 | * need to reset the reserves. |
3285 | * | 3288 | * |
3286 | * Otherwise keep the reserve set by the admin. | 3289 | * Otherwise keep the reserve set by the admin. |
3287 | */ | 3290 | */ |
3288 | static int reserve_mem_notifier(struct notifier_block *nb, | 3291 | static int reserve_mem_notifier(struct notifier_block *nb, |
3289 | unsigned long action, void *data) | 3292 | unsigned long action, void *data) |
3290 | { | 3293 | { |
3291 | unsigned long tmp, free_kbytes; | 3294 | unsigned long tmp, free_kbytes; |
3292 | 3295 | ||
3293 | switch (action) { | 3296 | switch (action) { |
3294 | case MEM_ONLINE: | 3297 | case MEM_ONLINE: |
3295 | /* Default max is 128MB. Leave alone if modified by operator. */ | 3298 | /* Default max is 128MB. Leave alone if modified by operator. */ |
3296 | tmp = sysctl_user_reserve_kbytes; | 3299 | tmp = sysctl_user_reserve_kbytes; |
3297 | if (0 < tmp && tmp < (1UL << 17)) | 3300 | if (0 < tmp && tmp < (1UL << 17)) |
3298 | init_user_reserve(); | 3301 | init_user_reserve(); |
3299 | 3302 | ||
3300 | /* Default max is 8MB. Leave alone if modified by operator. */ | 3303 | /* Default max is 8MB. Leave alone if modified by operator. */ |
3301 | tmp = sysctl_admin_reserve_kbytes; | 3304 | tmp = sysctl_admin_reserve_kbytes; |
3302 | if (0 < tmp && tmp < (1UL << 13)) | 3305 | if (0 < tmp && tmp < (1UL << 13)) |
3303 | init_admin_reserve(); | 3306 | init_admin_reserve(); |
3304 | 3307 | ||
3305 | break; | 3308 | break; |
3306 | case MEM_OFFLINE: | 3309 | case MEM_OFFLINE: |
3307 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3310 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3308 | 3311 | ||
3309 | if (sysctl_user_reserve_kbytes > free_kbytes) { | 3312 | if (sysctl_user_reserve_kbytes > free_kbytes) { |
3310 | init_user_reserve(); | 3313 | init_user_reserve(); |
3311 | pr_info("vm.user_reserve_kbytes reset to %lu\n", | 3314 | pr_info("vm.user_reserve_kbytes reset to %lu\n", |
3312 | sysctl_user_reserve_kbytes); | 3315 | sysctl_user_reserve_kbytes); |
3313 | } | 3316 | } |
3314 | 3317 | ||
3315 | if (sysctl_admin_reserve_kbytes > free_kbytes) { | 3318 | if (sysctl_admin_reserve_kbytes > free_kbytes) { |
3316 | init_admin_reserve(); | 3319 | init_admin_reserve(); |
3317 | pr_info("vm.admin_reserve_kbytes reset to %lu\n", | 3320 | pr_info("vm.admin_reserve_kbytes reset to %lu\n", |
3318 | sysctl_admin_reserve_kbytes); | 3321 | sysctl_admin_reserve_kbytes); |
3319 | } | 3322 | } |
3320 | break; | 3323 | break; |
3321 | default: | 3324 | default: |
3322 | break; | 3325 | break; |
3323 | } | 3326 | } |
3324 | return NOTIFY_OK; | 3327 | return NOTIFY_OK; |
3325 | } | 3328 | } |
3326 | 3329 | ||
3327 | static struct notifier_block reserve_mem_nb = { | 3330 | static struct notifier_block reserve_mem_nb = { |
3328 | .notifier_call = reserve_mem_notifier, | 3331 | .notifier_call = reserve_mem_notifier, |
3329 | }; | 3332 | }; |
3330 | 3333 | ||
3331 | static int __meminit init_reserve_notifier(void) | 3334 | static int __meminit init_reserve_notifier(void) |
3332 | { | 3335 | { |
3333 | if (register_hotmemory_notifier(&reserve_mem_nb)) | 3336 | if (register_hotmemory_notifier(&reserve_mem_nb)) |
3334 | pr_err("Failed registering memory add/remove notifier for admin reserve\n"); | 3337 | pr_err("Failed registering memory add/remove notifier for admin reserve\n"); |
3335 | 3338 | ||
3336 | return 0; | 3339 | return 0; |
3337 | } | 3340 | } |
3338 | subsys_initcall(init_reserve_notifier); | 3341 | subsys_initcall(init_reserve_notifier); |
3339 | 3342 |