Commit af73e4d9506d3b797509f3c030e7dcd554f7d9c4

Authored by Naoya Horiguchi
Committed by Linus Torvalds
1 parent 1ab4ce7623

hugetlbfs: fix mmap failure in unaligned size request

The current kernel returns -EINVAL unless a given mmap length is
"almost" hugepage aligned.  This is because in sys_mmap_pgoff() the
given length is passed to vm_mmap_pgoff() as-is, without being aligned
to a hugepage boundary.
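
For illustration only (not part of the patch), a minimal user-space
reproducer, assuming an x86_64 system with 2MB default huge pages and a
few huge pages reserved via /proc/sys/vm/nr_hugepages.  Before this fix
the unaligned length below makes mmap() fail with EINVAL; with the fix
the kernel rounds the length up to the hugepage boundary and the
mapping succeeds.

#define _GNU_SOURCE		/* MAP_HUGETLB/MAP_ANONYMOUS on older glibc */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* deliberately NOT a multiple of the 2MB huge page size */
	size_t len = 2 * 1024 * 1024 + 4096;
	void *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		printf("mmap: %s\n", strerror(errno));
		return 1;
	}
	printf("mmap succeeded at %p\n", p);
	munmap(p, len);
	return 0;
}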

This is a regression introduced in commit 40716e29243d ("hugetlbfs: fix
alignment of huge page requests"), where the alignment code was pushed
into hugetlb_file_setup() but the variable len on the caller side was
left unchanged.

To fix this, this patch partially reverts that commit and adds the
alignment code on the caller side.  It also introduces hstate_sizelog()
in order to get the proper hstate for the specified hugepage size.
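
The mm/mmap.c and include/linux/hugetlb.h hunks that implement this are
not part of the excerpt below, so the following is only a sketch,
reconstructed from this description and from the get_hstate_idx()
change visible in the diff; it is not a verbatim quote of the patch.

/* Sketch: map the MAP_HUGE_* page-size log (0 means "use the default
 * huge page size") to the corresponding hstate, mirroring what
 * get_hstate_idx() used to open-code. */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
	if (!page_size_log)
		return &default_hstate;
	return size_to_hstate(1 << page_size_log);
}

/* Sketch of the caller-side alignment in sys_mmap_pgoff(): round len up
 * to the hugepage boundary before it is passed to vm_mmap_pgoff(). */
if (flags & MAP_HUGETLB) {
	/* the 6 bits above MAP_HUGE_SHIFT encode the page-size log */
	struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & 0x3f);

	if (!hs)
		return -EINVAL;
	len = ALIGN(len, huge_page_size(hs));
	/* ... hugetlb_file_setup(..., len, ...) as before ... */
}

The new comment added above hugetlb_file_setup() in the diff below
spells out the resulting contract: size must already be hugepage
aligned by the caller, otherwise hugetlb_reserve_pages() reserves one
hugepage less than intended.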

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=56881

[akpm@linux-foundation.org: fix warning when CONFIG_HUGETLB_PAGE=n]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: <iceman_dvd@yahoo.com>
Cc: Steven Truelove <steven.truelove@utoronto.ca>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 34 additions and 22 deletions

fs/hugetlbfs/inode.c
1 /* 1 /*
2 * hugetlbpage-backed filesystem. Based on ramfs. 2 * hugetlbpage-backed filesystem. Based on ramfs.
3 * 3 *
4 * Nadia Yvette Chambers, 2002 4 * Nadia Yvette Chambers, 2002
5 * 5 *
6 * Copyright (C) 2002 Linus Torvalds. 6 * Copyright (C) 2002 Linus Torvalds.
7 */ 7 */
8 8
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/thread_info.h> 10 #include <linux/thread_info.h>
11 #include <asm/current.h> 11 #include <asm/current.h>
12 #include <linux/sched.h> /* remove ASAP */ 12 #include <linux/sched.h> /* remove ASAP */
13 #include <linux/fs.h> 13 #include <linux/fs.h>
14 #include <linux/mount.h> 14 #include <linux/mount.h>
15 #include <linux/file.h> 15 #include <linux/file.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/writeback.h> 17 #include <linux/writeback.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/highmem.h> 19 #include <linux/highmem.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/capability.h> 22 #include <linux/capability.h>
23 #include <linux/ctype.h> 23 #include <linux/ctype.h>
24 #include <linux/backing-dev.h> 24 #include <linux/backing-dev.h>
25 #include <linux/hugetlb.h> 25 #include <linux/hugetlb.h>
26 #include <linux/pagevec.h> 26 #include <linux/pagevec.h>
27 #include <linux/parser.h> 27 #include <linux/parser.h>
28 #include <linux/mman.h> 28 #include <linux/mman.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/dnotify.h> 30 #include <linux/dnotify.h>
31 #include <linux/statfs.h> 31 #include <linux/statfs.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 #include <linux/magic.h> 33 #include <linux/magic.h>
34 #include <linux/migrate.h> 34 #include <linux/migrate.h>
35 35
36 #include <asm/uaccess.h> 36 #include <asm/uaccess.h>
37 37
38 static const struct super_operations hugetlbfs_ops; 38 static const struct super_operations hugetlbfs_ops;
39 static const struct address_space_operations hugetlbfs_aops; 39 static const struct address_space_operations hugetlbfs_aops;
40 const struct file_operations hugetlbfs_file_operations; 40 const struct file_operations hugetlbfs_file_operations;
41 static const struct inode_operations hugetlbfs_dir_inode_operations; 41 static const struct inode_operations hugetlbfs_dir_inode_operations;
42 static const struct inode_operations hugetlbfs_inode_operations; 42 static const struct inode_operations hugetlbfs_inode_operations;
43 43
44 struct hugetlbfs_config { 44 struct hugetlbfs_config {
45 kuid_t uid; 45 kuid_t uid;
46 kgid_t gid; 46 kgid_t gid;
47 umode_t mode; 47 umode_t mode;
48 long nr_blocks; 48 long nr_blocks;
49 long nr_inodes; 49 long nr_inodes;
50 struct hstate *hstate; 50 struct hstate *hstate;
51 }; 51 };
52 52
53 struct hugetlbfs_inode_info { 53 struct hugetlbfs_inode_info {
54 struct shared_policy policy; 54 struct shared_policy policy;
55 struct inode vfs_inode; 55 struct inode vfs_inode;
56 }; 56 };
57 57
58 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) 58 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
59 { 59 {
60 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); 60 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
61 } 61 }
62 62
63 static struct backing_dev_info hugetlbfs_backing_dev_info = { 63 static struct backing_dev_info hugetlbfs_backing_dev_info = {
64 .name = "hugetlbfs", 64 .name = "hugetlbfs",
65 .ra_pages = 0, /* No readahead */ 65 .ra_pages = 0, /* No readahead */
66 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 66 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
67 }; 67 };
68 68
69 int sysctl_hugetlb_shm_group; 69 int sysctl_hugetlb_shm_group;
70 70
71 enum { 71 enum {
72 Opt_size, Opt_nr_inodes, 72 Opt_size, Opt_nr_inodes,
73 Opt_mode, Opt_uid, Opt_gid, 73 Opt_mode, Opt_uid, Opt_gid,
74 Opt_pagesize, 74 Opt_pagesize,
75 Opt_err, 75 Opt_err,
76 }; 76 };
77 77
78 static const match_table_t tokens = { 78 static const match_table_t tokens = {
79 {Opt_size, "size=%s"}, 79 {Opt_size, "size=%s"},
80 {Opt_nr_inodes, "nr_inodes=%s"}, 80 {Opt_nr_inodes, "nr_inodes=%s"},
81 {Opt_mode, "mode=%o"}, 81 {Opt_mode, "mode=%o"},
82 {Opt_uid, "uid=%u"}, 82 {Opt_uid, "uid=%u"},
83 {Opt_gid, "gid=%u"}, 83 {Opt_gid, "gid=%u"},
84 {Opt_pagesize, "pagesize=%s"}, 84 {Opt_pagesize, "pagesize=%s"},
85 {Opt_err, NULL}, 85 {Opt_err, NULL},
86 }; 86 };
87 87
88 static void huge_pagevec_release(struct pagevec *pvec) 88 static void huge_pagevec_release(struct pagevec *pvec)
89 { 89 {
90 int i; 90 int i;
91 91
92 for (i = 0; i < pagevec_count(pvec); ++i) 92 for (i = 0; i < pagevec_count(pvec); ++i)
93 put_page(pvec->pages[i]); 93 put_page(pvec->pages[i]);
94 94
95 pagevec_reinit(pvec); 95 pagevec_reinit(pvec);
96 } 96 }
97 97
98 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 98 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
99 { 99 {
100 struct inode *inode = file_inode(file); 100 struct inode *inode = file_inode(file);
101 loff_t len, vma_len; 101 loff_t len, vma_len;
102 int ret; 102 int ret;
103 struct hstate *h = hstate_file(file); 103 struct hstate *h = hstate_file(file);
104 104
105 /* 105 /*
106 * vma address alignment (but not the pgoff alignment) has 106 * vma address alignment (but not the pgoff alignment) has
107 * already been checked by prepare_hugepage_range. If you add 107 * already been checked by prepare_hugepage_range. If you add
108 * any error returns here, do so after setting VM_HUGETLB, so 108 * any error returns here, do so after setting VM_HUGETLB, so
109 * is_vm_hugetlb_page tests below unmap_region go the right 109 * is_vm_hugetlb_page tests below unmap_region go the right
110 * way when do_mmap_pgoff unwinds (may be important on powerpc 110 * way when do_mmap_pgoff unwinds (may be important on powerpc
111 * and ia64). 111 * and ia64).
112 */ 112 */
113 vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; 113 vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
114 vma->vm_ops = &hugetlb_vm_ops; 114 vma->vm_ops = &hugetlb_vm_ops;
115 115
116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
117 return -EINVAL; 117 return -EINVAL;
118 118
119 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 119 vma_len = (loff_t)(vma->vm_end - vma->vm_start);
120 120
121 mutex_lock(&inode->i_mutex); 121 mutex_lock(&inode->i_mutex);
122 file_accessed(file); 122 file_accessed(file);
123 123
124 ret = -ENOMEM; 124 ret = -ENOMEM;
125 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 125 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
126 126
127 if (hugetlb_reserve_pages(inode, 127 if (hugetlb_reserve_pages(inode,
128 vma->vm_pgoff >> huge_page_order(h), 128 vma->vm_pgoff >> huge_page_order(h),
129 len >> huge_page_shift(h), vma, 129 len >> huge_page_shift(h), vma,
130 vma->vm_flags)) 130 vma->vm_flags))
131 goto out; 131 goto out;
132 132
133 ret = 0; 133 ret = 0;
134 hugetlb_prefault_arch_hook(vma->vm_mm); 134 hugetlb_prefault_arch_hook(vma->vm_mm);
135 if (vma->vm_flags & VM_WRITE && inode->i_size < len) 135 if (vma->vm_flags & VM_WRITE && inode->i_size < len)
136 inode->i_size = len; 136 inode->i_size = len;
137 out: 137 out:
138 mutex_unlock(&inode->i_mutex); 138 mutex_unlock(&inode->i_mutex);
139 139
140 return ret; 140 return ret;
141 } 141 }
142 142
143 /* 143 /*
144 * Called under down_write(mmap_sem). 144 * Called under down_write(mmap_sem).
145 */ 145 */
146 146
147 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 147 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
148 static unsigned long 148 static unsigned long
149 hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 149 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
150 unsigned long len, unsigned long pgoff, unsigned long flags) 150 unsigned long len, unsigned long pgoff, unsigned long flags)
151 { 151 {
152 struct mm_struct *mm = current->mm; 152 struct mm_struct *mm = current->mm;
153 struct vm_area_struct *vma; 153 struct vm_area_struct *vma;
154 struct hstate *h = hstate_file(file); 154 struct hstate *h = hstate_file(file);
155 struct vm_unmapped_area_info info; 155 struct vm_unmapped_area_info info;
156 156
157 if (len & ~huge_page_mask(h)) 157 if (len & ~huge_page_mask(h))
158 return -EINVAL; 158 return -EINVAL;
159 if (len > TASK_SIZE) 159 if (len > TASK_SIZE)
160 return -ENOMEM; 160 return -ENOMEM;
161 161
162 if (flags & MAP_FIXED) { 162 if (flags & MAP_FIXED) {
163 if (prepare_hugepage_range(file, addr, len)) 163 if (prepare_hugepage_range(file, addr, len))
164 return -EINVAL; 164 return -EINVAL;
165 return addr; 165 return addr;
166 } 166 }
167 167
168 if (addr) { 168 if (addr) {
169 addr = ALIGN(addr, huge_page_size(h)); 169 addr = ALIGN(addr, huge_page_size(h));
170 vma = find_vma(mm, addr); 170 vma = find_vma(mm, addr);
171 if (TASK_SIZE - len >= addr && 171 if (TASK_SIZE - len >= addr &&
172 (!vma || addr + len <= vma->vm_start)) 172 (!vma || addr + len <= vma->vm_start))
173 return addr; 173 return addr;
174 } 174 }
175 175
176 info.flags = 0; 176 info.flags = 0;
177 info.length = len; 177 info.length = len;
178 info.low_limit = TASK_UNMAPPED_BASE; 178 info.low_limit = TASK_UNMAPPED_BASE;
179 info.high_limit = TASK_SIZE; 179 info.high_limit = TASK_SIZE;
180 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 180 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
181 info.align_offset = 0; 181 info.align_offset = 0;
182 return vm_unmapped_area(&info); 182 return vm_unmapped_area(&info);
183 } 183 }
184 #endif 184 #endif
185 185
186 static int 186 static int
187 hugetlbfs_read_actor(struct page *page, unsigned long offset, 187 hugetlbfs_read_actor(struct page *page, unsigned long offset,
188 char __user *buf, unsigned long count, 188 char __user *buf, unsigned long count,
189 unsigned long size) 189 unsigned long size)
190 { 190 {
191 char *kaddr; 191 char *kaddr;
192 unsigned long left, copied = 0; 192 unsigned long left, copied = 0;
193 int i, chunksize; 193 int i, chunksize;
194 194
195 if (size > count) 195 if (size > count)
196 size = count; 196 size = count;
197 197
198 /* Find which 4k chunk and offset with in that chunk */ 198 /* Find which 4k chunk and offset with in that chunk */
199 i = offset >> PAGE_CACHE_SHIFT; 199 i = offset >> PAGE_CACHE_SHIFT;
200 offset = offset & ~PAGE_CACHE_MASK; 200 offset = offset & ~PAGE_CACHE_MASK;
201 201
202 while (size) { 202 while (size) {
203 chunksize = PAGE_CACHE_SIZE; 203 chunksize = PAGE_CACHE_SIZE;
204 if (offset) 204 if (offset)
205 chunksize -= offset; 205 chunksize -= offset;
206 if (chunksize > size) 206 if (chunksize > size)
207 chunksize = size; 207 chunksize = size;
208 kaddr = kmap(&page[i]); 208 kaddr = kmap(&page[i]);
209 left = __copy_to_user(buf, kaddr + offset, chunksize); 209 left = __copy_to_user(buf, kaddr + offset, chunksize);
210 kunmap(&page[i]); 210 kunmap(&page[i]);
211 if (left) { 211 if (left) {
212 copied += (chunksize - left); 212 copied += (chunksize - left);
213 break; 213 break;
214 } 214 }
215 offset = 0; 215 offset = 0;
216 size -= chunksize; 216 size -= chunksize;
217 buf += chunksize; 217 buf += chunksize;
218 copied += chunksize; 218 copied += chunksize;
219 i++; 219 i++;
220 } 220 }
221 return copied ? copied : -EFAULT; 221 return copied ? copied : -EFAULT;
222 } 222 }
223 223
224 /* 224 /*
225 * Support for read() - Find the page attached to f_mapping and copy out the 225 * Support for read() - Find the page attached to f_mapping and copy out the
226 * data. Its *very* similar to do_generic_mapping_read(), we can't use that 226 * data. Its *very* similar to do_generic_mapping_read(), we can't use that
227 * since it has PAGE_CACHE_SIZE assumptions. 227 * since it has PAGE_CACHE_SIZE assumptions.
228 */ 228 */
229 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, 229 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
230 size_t len, loff_t *ppos) 230 size_t len, loff_t *ppos)
231 { 231 {
232 struct hstate *h = hstate_file(filp); 232 struct hstate *h = hstate_file(filp);
233 struct address_space *mapping = filp->f_mapping; 233 struct address_space *mapping = filp->f_mapping;
234 struct inode *inode = mapping->host; 234 struct inode *inode = mapping->host;
235 unsigned long index = *ppos >> huge_page_shift(h); 235 unsigned long index = *ppos >> huge_page_shift(h);
236 unsigned long offset = *ppos & ~huge_page_mask(h); 236 unsigned long offset = *ppos & ~huge_page_mask(h);
237 unsigned long end_index; 237 unsigned long end_index;
238 loff_t isize; 238 loff_t isize;
239 ssize_t retval = 0; 239 ssize_t retval = 0;
240 240
241 /* validate length */ 241 /* validate length */
242 if (len == 0) 242 if (len == 0)
243 goto out; 243 goto out;
244 244
245 for (;;) { 245 for (;;) {
246 struct page *page; 246 struct page *page;
247 unsigned long nr, ret; 247 unsigned long nr, ret;
248 int ra; 248 int ra;
249 249
250 /* nr is the maximum number of bytes to copy from this page */ 250 /* nr is the maximum number of bytes to copy from this page */
251 nr = huge_page_size(h); 251 nr = huge_page_size(h);
252 isize = i_size_read(inode); 252 isize = i_size_read(inode);
253 if (!isize) 253 if (!isize)
254 goto out; 254 goto out;
255 end_index = (isize - 1) >> huge_page_shift(h); 255 end_index = (isize - 1) >> huge_page_shift(h);
256 if (index >= end_index) { 256 if (index >= end_index) {
257 if (index > end_index) 257 if (index > end_index)
258 goto out; 258 goto out;
259 nr = ((isize - 1) & ~huge_page_mask(h)) + 1; 259 nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
260 if (nr <= offset) 260 if (nr <= offset)
261 goto out; 261 goto out;
262 } 262 }
263 nr = nr - offset; 263 nr = nr - offset;
264 264
265 /* Find the page */ 265 /* Find the page */
266 page = find_lock_page(mapping, index); 266 page = find_lock_page(mapping, index);
267 if (unlikely(page == NULL)) { 267 if (unlikely(page == NULL)) {
268 /* 268 /*
269 * We have a HOLE, zero out the user-buffer for the 269 * We have a HOLE, zero out the user-buffer for the
270 * length of the hole or request. 270 * length of the hole or request.
271 */ 271 */
272 ret = len < nr ? len : nr; 272 ret = len < nr ? len : nr;
273 if (clear_user(buf, ret)) 273 if (clear_user(buf, ret))
274 ra = -EFAULT; 274 ra = -EFAULT;
275 else 275 else
276 ra = 0; 276 ra = 0;
277 } else { 277 } else {
278 unlock_page(page); 278 unlock_page(page);
279 279
280 /* 280 /*
281 * We have the page, copy it to user space buffer. 281 * We have the page, copy it to user space buffer.
282 */ 282 */
283 ra = hugetlbfs_read_actor(page, offset, buf, len, nr); 283 ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
284 ret = ra; 284 ret = ra;
285 page_cache_release(page); 285 page_cache_release(page);
286 } 286 }
287 if (ra < 0) { 287 if (ra < 0) {
288 if (retval == 0) 288 if (retval == 0)
289 retval = ra; 289 retval = ra;
290 goto out; 290 goto out;
291 } 291 }
292 292
293 offset += ret; 293 offset += ret;
294 retval += ret; 294 retval += ret;
295 len -= ret; 295 len -= ret;
296 index += offset >> huge_page_shift(h); 296 index += offset >> huge_page_shift(h);
297 offset &= ~huge_page_mask(h); 297 offset &= ~huge_page_mask(h);
298 298
299 /* short read or no more work */ 299 /* short read or no more work */
300 if ((ret != nr) || (len == 0)) 300 if ((ret != nr) || (len == 0))
301 break; 301 break;
302 } 302 }
303 out: 303 out:
304 *ppos = ((loff_t)index << huge_page_shift(h)) + offset; 304 *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
305 return retval; 305 return retval;
306 } 306 }
307 307
308 static int hugetlbfs_write_begin(struct file *file, 308 static int hugetlbfs_write_begin(struct file *file,
309 struct address_space *mapping, 309 struct address_space *mapping,
310 loff_t pos, unsigned len, unsigned flags, 310 loff_t pos, unsigned len, unsigned flags,
311 struct page **pagep, void **fsdata) 311 struct page **pagep, void **fsdata)
312 { 312 {
313 return -EINVAL; 313 return -EINVAL;
314 } 314 }
315 315
316 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, 316 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
317 loff_t pos, unsigned len, unsigned copied, 317 loff_t pos, unsigned len, unsigned copied,
318 struct page *page, void *fsdata) 318 struct page *page, void *fsdata)
319 { 319 {
320 BUG(); 320 BUG();
321 return -EINVAL; 321 return -EINVAL;
322 } 322 }
323 323
324 static void truncate_huge_page(struct page *page) 324 static void truncate_huge_page(struct page *page)
325 { 325 {
326 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 326 cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
327 ClearPageUptodate(page); 327 ClearPageUptodate(page);
328 delete_from_page_cache(page); 328 delete_from_page_cache(page);
329 } 329 }
330 330
331 static void truncate_hugepages(struct inode *inode, loff_t lstart) 331 static void truncate_hugepages(struct inode *inode, loff_t lstart)
332 { 332 {
333 struct hstate *h = hstate_inode(inode); 333 struct hstate *h = hstate_inode(inode);
334 struct address_space *mapping = &inode->i_data; 334 struct address_space *mapping = &inode->i_data;
335 const pgoff_t start = lstart >> huge_page_shift(h); 335 const pgoff_t start = lstart >> huge_page_shift(h);
336 struct pagevec pvec; 336 struct pagevec pvec;
337 pgoff_t next; 337 pgoff_t next;
338 int i, freed = 0; 338 int i, freed = 0;
339 339
340 pagevec_init(&pvec, 0); 340 pagevec_init(&pvec, 0);
341 next = start; 341 next = start;
342 while (1) { 342 while (1) {
343 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 343 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
344 if (next == start) 344 if (next == start)
345 break; 345 break;
346 next = start; 346 next = start;
347 continue; 347 continue;
348 } 348 }
349 349
350 for (i = 0; i < pagevec_count(&pvec); ++i) { 350 for (i = 0; i < pagevec_count(&pvec); ++i) {
351 struct page *page = pvec.pages[i]; 351 struct page *page = pvec.pages[i];
352 352
353 lock_page(page); 353 lock_page(page);
354 if (page->index > next) 354 if (page->index > next)
355 next = page->index; 355 next = page->index;
356 ++next; 356 ++next;
357 truncate_huge_page(page); 357 truncate_huge_page(page);
358 unlock_page(page); 358 unlock_page(page);
359 freed++; 359 freed++;
360 } 360 }
361 huge_pagevec_release(&pvec); 361 huge_pagevec_release(&pvec);
362 } 362 }
363 BUG_ON(!lstart && mapping->nrpages); 363 BUG_ON(!lstart && mapping->nrpages);
364 hugetlb_unreserve_pages(inode, start, freed); 364 hugetlb_unreserve_pages(inode, start, freed);
365 } 365 }
366 366
367 static void hugetlbfs_evict_inode(struct inode *inode) 367 static void hugetlbfs_evict_inode(struct inode *inode)
368 { 368 {
369 truncate_hugepages(inode, 0); 369 truncate_hugepages(inode, 0);
370 clear_inode(inode); 370 clear_inode(inode);
371 } 371 }
372 372
373 static inline void 373 static inline void
374 hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) 374 hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
375 { 375 {
376 struct vm_area_struct *vma; 376 struct vm_area_struct *vma;
377 377
378 vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { 378 vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
379 unsigned long v_offset; 379 unsigned long v_offset;
380 380
381 /* 381 /*
382 * Can the expression below overflow on 32-bit arches? 382 * Can the expression below overflow on 32-bit arches?
383 * No, because the interval tree returns us only those vmas 383 * No, because the interval tree returns us only those vmas
384 * which overlap the truncated area starting at pgoff, 384 * which overlap the truncated area starting at pgoff,
385 * and no vma on a 32-bit arch can span beyond the 4GB. 385 * and no vma on a 32-bit arch can span beyond the 4GB.
386 */ 386 */
387 if (vma->vm_pgoff < pgoff) 387 if (vma->vm_pgoff < pgoff)
388 v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; 388 v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
389 else 389 else
390 v_offset = 0; 390 v_offset = 0;
391 391
392 unmap_hugepage_range(vma, vma->vm_start + v_offset, 392 unmap_hugepage_range(vma, vma->vm_start + v_offset,
393 vma->vm_end, NULL); 393 vma->vm_end, NULL);
394 } 394 }
395 } 395 }
396 396
397 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) 397 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
398 { 398 {
399 pgoff_t pgoff; 399 pgoff_t pgoff;
400 struct address_space *mapping = inode->i_mapping; 400 struct address_space *mapping = inode->i_mapping;
401 struct hstate *h = hstate_inode(inode); 401 struct hstate *h = hstate_inode(inode);
402 402
403 BUG_ON(offset & ~huge_page_mask(h)); 403 BUG_ON(offset & ~huge_page_mask(h));
404 pgoff = offset >> PAGE_SHIFT; 404 pgoff = offset >> PAGE_SHIFT;
405 405
406 i_size_write(inode, offset); 406 i_size_write(inode, offset);
407 mutex_lock(&mapping->i_mmap_mutex); 407 mutex_lock(&mapping->i_mmap_mutex);
408 if (!RB_EMPTY_ROOT(&mapping->i_mmap)) 408 if (!RB_EMPTY_ROOT(&mapping->i_mmap))
409 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 409 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
410 mutex_unlock(&mapping->i_mmap_mutex); 410 mutex_unlock(&mapping->i_mmap_mutex);
411 truncate_hugepages(inode, offset); 411 truncate_hugepages(inode, offset);
412 return 0; 412 return 0;
413 } 413 }
414 414
415 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) 415 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
416 { 416 {
417 struct inode *inode = dentry->d_inode; 417 struct inode *inode = dentry->d_inode;
418 struct hstate *h = hstate_inode(inode); 418 struct hstate *h = hstate_inode(inode);
419 int error; 419 int error;
420 unsigned int ia_valid = attr->ia_valid; 420 unsigned int ia_valid = attr->ia_valid;
421 421
422 BUG_ON(!inode); 422 BUG_ON(!inode);
423 423
424 error = inode_change_ok(inode, attr); 424 error = inode_change_ok(inode, attr);
425 if (error) 425 if (error)
426 return error; 426 return error;
427 427
428 if (ia_valid & ATTR_SIZE) { 428 if (ia_valid & ATTR_SIZE) {
429 error = -EINVAL; 429 error = -EINVAL;
430 if (attr->ia_size & ~huge_page_mask(h)) 430 if (attr->ia_size & ~huge_page_mask(h))
431 return -EINVAL; 431 return -EINVAL;
432 error = hugetlb_vmtruncate(inode, attr->ia_size); 432 error = hugetlb_vmtruncate(inode, attr->ia_size);
433 if (error) 433 if (error)
434 return error; 434 return error;
435 } 435 }
436 436
437 setattr_copy(inode, attr); 437 setattr_copy(inode, attr);
438 mark_inode_dirty(inode); 438 mark_inode_dirty(inode);
439 return 0; 439 return 0;
440 } 440 }
441 441
442 static struct inode *hugetlbfs_get_root(struct super_block *sb, 442 static struct inode *hugetlbfs_get_root(struct super_block *sb,
443 struct hugetlbfs_config *config) 443 struct hugetlbfs_config *config)
444 { 444 {
445 struct inode *inode; 445 struct inode *inode;
446 446
447 inode = new_inode(sb); 447 inode = new_inode(sb);
448 if (inode) { 448 if (inode) {
449 struct hugetlbfs_inode_info *info; 449 struct hugetlbfs_inode_info *info;
450 inode->i_ino = get_next_ino(); 450 inode->i_ino = get_next_ino();
451 inode->i_mode = S_IFDIR | config->mode; 451 inode->i_mode = S_IFDIR | config->mode;
452 inode->i_uid = config->uid; 452 inode->i_uid = config->uid;
453 inode->i_gid = config->gid; 453 inode->i_gid = config->gid;
454 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 454 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
455 info = HUGETLBFS_I(inode); 455 info = HUGETLBFS_I(inode);
456 mpol_shared_policy_init(&info->policy, NULL); 456 mpol_shared_policy_init(&info->policy, NULL);
457 inode->i_op = &hugetlbfs_dir_inode_operations; 457 inode->i_op = &hugetlbfs_dir_inode_operations;
458 inode->i_fop = &simple_dir_operations; 458 inode->i_fop = &simple_dir_operations;
459 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 459 /* directory inodes start off with i_nlink == 2 (for "." entry) */
460 inc_nlink(inode); 460 inc_nlink(inode);
461 lockdep_annotate_inode_mutex_key(inode); 461 lockdep_annotate_inode_mutex_key(inode);
462 } 462 }
463 return inode; 463 return inode;
464 } 464 }
465 465
466 static struct inode *hugetlbfs_get_inode(struct super_block *sb, 466 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
467 struct inode *dir, 467 struct inode *dir,
468 umode_t mode, dev_t dev) 468 umode_t mode, dev_t dev)
469 { 469 {
470 struct inode *inode; 470 struct inode *inode;
471 471
472 inode = new_inode(sb); 472 inode = new_inode(sb);
473 if (inode) { 473 if (inode) {
474 struct hugetlbfs_inode_info *info; 474 struct hugetlbfs_inode_info *info;
475 inode->i_ino = get_next_ino(); 475 inode->i_ino = get_next_ino();
476 inode_init_owner(inode, dir, mode); 476 inode_init_owner(inode, dir, mode);
477 inode->i_mapping->a_ops = &hugetlbfs_aops; 477 inode->i_mapping->a_ops = &hugetlbfs_aops;
478 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 478 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
479 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 479 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
480 INIT_LIST_HEAD(&inode->i_mapping->private_list); 480 INIT_LIST_HEAD(&inode->i_mapping->private_list);
481 info = HUGETLBFS_I(inode); 481 info = HUGETLBFS_I(inode);
482 /* 482 /*
483 * The policy is initialized here even if we are creating a 483 * The policy is initialized here even if we are creating a
484 * private inode because initialization simply creates an 484 * private inode because initialization simply creates an
485 * an empty rb tree and calls spin_lock_init(), later when we 485 * an empty rb tree and calls spin_lock_init(), later when we
486 * call mpol_free_shared_policy() it will just return because 486 * call mpol_free_shared_policy() it will just return because
487 * the rb tree will still be empty. 487 * the rb tree will still be empty.
488 */ 488 */
489 mpol_shared_policy_init(&info->policy, NULL); 489 mpol_shared_policy_init(&info->policy, NULL);
490 switch (mode & S_IFMT) { 490 switch (mode & S_IFMT) {
491 default: 491 default:
492 init_special_inode(inode, mode, dev); 492 init_special_inode(inode, mode, dev);
493 break; 493 break;
494 case S_IFREG: 494 case S_IFREG:
495 inode->i_op = &hugetlbfs_inode_operations; 495 inode->i_op = &hugetlbfs_inode_operations;
496 inode->i_fop = &hugetlbfs_file_operations; 496 inode->i_fop = &hugetlbfs_file_operations;
497 break; 497 break;
498 case S_IFDIR: 498 case S_IFDIR:
499 inode->i_op = &hugetlbfs_dir_inode_operations; 499 inode->i_op = &hugetlbfs_dir_inode_operations;
500 inode->i_fop = &simple_dir_operations; 500 inode->i_fop = &simple_dir_operations;
501 501
502 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 502 /* directory inodes start off with i_nlink == 2 (for "." entry) */
503 inc_nlink(inode); 503 inc_nlink(inode);
504 break; 504 break;
505 case S_IFLNK: 505 case S_IFLNK:
506 inode->i_op = &page_symlink_inode_operations; 506 inode->i_op = &page_symlink_inode_operations;
507 break; 507 break;
508 } 508 }
509 lockdep_annotate_inode_mutex_key(inode); 509 lockdep_annotate_inode_mutex_key(inode);
510 } 510 }
511 return inode; 511 return inode;
512 } 512 }
513 513
514 /* 514 /*
515 * File creation. Allocate an inode, and we're done.. 515 * File creation. Allocate an inode, and we're done..
516 */ 516 */
517 static int hugetlbfs_mknod(struct inode *dir, 517 static int hugetlbfs_mknod(struct inode *dir,
518 struct dentry *dentry, umode_t mode, dev_t dev) 518 struct dentry *dentry, umode_t mode, dev_t dev)
519 { 519 {
520 struct inode *inode; 520 struct inode *inode;
521 int error = -ENOSPC; 521 int error = -ENOSPC;
522 522
523 inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); 523 inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
524 if (inode) { 524 if (inode) {
525 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 525 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
526 d_instantiate(dentry, inode); 526 d_instantiate(dentry, inode);
527 dget(dentry); /* Extra count - pin the dentry in core */ 527 dget(dentry); /* Extra count - pin the dentry in core */
528 error = 0; 528 error = 0;
529 } 529 }
530 return error; 530 return error;
531 } 531 }
532 532
533 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 533 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
534 { 534 {
535 int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); 535 int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
536 if (!retval) 536 if (!retval)
537 inc_nlink(dir); 537 inc_nlink(dir);
538 return retval; 538 return retval;
539 } 539 }
540 540
541 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) 541 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
542 { 542 {
543 return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); 543 return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
544 } 544 }
545 545
546 static int hugetlbfs_symlink(struct inode *dir, 546 static int hugetlbfs_symlink(struct inode *dir,
547 struct dentry *dentry, const char *symname) 547 struct dentry *dentry, const char *symname)
548 { 548 {
549 struct inode *inode; 549 struct inode *inode;
550 int error = -ENOSPC; 550 int error = -ENOSPC;
551 551
552 inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); 552 inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
553 if (inode) { 553 if (inode) {
554 int l = strlen(symname)+1; 554 int l = strlen(symname)+1;
555 error = page_symlink(inode, symname, l); 555 error = page_symlink(inode, symname, l);
556 if (!error) { 556 if (!error) {
557 d_instantiate(dentry, inode); 557 d_instantiate(dentry, inode);
558 dget(dentry); 558 dget(dentry);
559 } else 559 } else
560 iput(inode); 560 iput(inode);
561 } 561 }
562 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 562 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
563 563
564 return error; 564 return error;
565 } 565 }
566 566
567 /* 567 /*
568 * mark the head page dirty 568 * mark the head page dirty
569 */ 569 */
570 static int hugetlbfs_set_page_dirty(struct page *page) 570 static int hugetlbfs_set_page_dirty(struct page *page)
571 { 571 {
572 struct page *head = compound_head(page); 572 struct page *head = compound_head(page);
573 573
574 SetPageDirty(head); 574 SetPageDirty(head);
575 return 0; 575 return 0;
576 } 576 }
577 577
578 static int hugetlbfs_migrate_page(struct address_space *mapping, 578 static int hugetlbfs_migrate_page(struct address_space *mapping,
579 struct page *newpage, struct page *page, 579 struct page *newpage, struct page *page,
580 enum migrate_mode mode) 580 enum migrate_mode mode)
581 { 581 {
582 int rc; 582 int rc;
583 583
584 rc = migrate_huge_page_move_mapping(mapping, newpage, page); 584 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
585 if (rc != MIGRATEPAGE_SUCCESS) 585 if (rc != MIGRATEPAGE_SUCCESS)
586 return rc; 586 return rc;
587 migrate_page_copy(newpage, page); 587 migrate_page_copy(newpage, page);
588 588
589 return MIGRATEPAGE_SUCCESS; 589 return MIGRATEPAGE_SUCCESS;
590 } 590 }
591 591
592 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 592 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
593 { 593 {
594 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 594 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
595 struct hstate *h = hstate_inode(dentry->d_inode); 595 struct hstate *h = hstate_inode(dentry->d_inode);
596 596
597 buf->f_type = HUGETLBFS_MAGIC; 597 buf->f_type = HUGETLBFS_MAGIC;
598 buf->f_bsize = huge_page_size(h); 598 buf->f_bsize = huge_page_size(h);
599 if (sbinfo) { 599 if (sbinfo) {
600 spin_lock(&sbinfo->stat_lock); 600 spin_lock(&sbinfo->stat_lock);
601 /* If no limits set, just report 0 for max/free/used 601 /* If no limits set, just report 0 for max/free/used
602 * blocks, like simple_statfs() */ 602 * blocks, like simple_statfs() */
603 if (sbinfo->spool) { 603 if (sbinfo->spool) {
604 long free_pages; 604 long free_pages;
605 605
606 spin_lock(&sbinfo->spool->lock); 606 spin_lock(&sbinfo->spool->lock);
607 buf->f_blocks = sbinfo->spool->max_hpages; 607 buf->f_blocks = sbinfo->spool->max_hpages;
608 free_pages = sbinfo->spool->max_hpages 608 free_pages = sbinfo->spool->max_hpages
609 - sbinfo->spool->used_hpages; 609 - sbinfo->spool->used_hpages;
610 buf->f_bavail = buf->f_bfree = free_pages; 610 buf->f_bavail = buf->f_bfree = free_pages;
611 spin_unlock(&sbinfo->spool->lock); 611 spin_unlock(&sbinfo->spool->lock);
612 buf->f_files = sbinfo->max_inodes; 612 buf->f_files = sbinfo->max_inodes;
613 buf->f_ffree = sbinfo->free_inodes; 613 buf->f_ffree = sbinfo->free_inodes;
614 } 614 }
615 spin_unlock(&sbinfo->stat_lock); 615 spin_unlock(&sbinfo->stat_lock);
616 } 616 }
617 buf->f_namelen = NAME_MAX; 617 buf->f_namelen = NAME_MAX;
618 return 0; 618 return 0;
619 } 619 }
620 620
621 static void hugetlbfs_put_super(struct super_block *sb) 621 static void hugetlbfs_put_super(struct super_block *sb)
622 { 622 {
623 struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); 623 struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
624 624
625 if (sbi) { 625 if (sbi) {
626 sb->s_fs_info = NULL; 626 sb->s_fs_info = NULL;
627 627
628 if (sbi->spool) 628 if (sbi->spool)
629 hugepage_put_subpool(sbi->spool); 629 hugepage_put_subpool(sbi->spool);
630 630
631 kfree(sbi); 631 kfree(sbi);
632 } 632 }
633 } 633 }
634 634
635 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) 635 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
636 { 636 {
637 if (sbinfo->free_inodes >= 0) { 637 if (sbinfo->free_inodes >= 0) {
638 spin_lock(&sbinfo->stat_lock); 638 spin_lock(&sbinfo->stat_lock);
639 if (unlikely(!sbinfo->free_inodes)) { 639 if (unlikely(!sbinfo->free_inodes)) {
640 spin_unlock(&sbinfo->stat_lock); 640 spin_unlock(&sbinfo->stat_lock);
641 return 0; 641 return 0;
642 } 642 }
643 sbinfo->free_inodes--; 643 sbinfo->free_inodes--;
644 spin_unlock(&sbinfo->stat_lock); 644 spin_unlock(&sbinfo->stat_lock);
645 } 645 }
646 646
647 return 1; 647 return 1;
648 } 648 }
649 649
650 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) 650 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
651 { 651 {
652 if (sbinfo->free_inodes >= 0) { 652 if (sbinfo->free_inodes >= 0) {
653 spin_lock(&sbinfo->stat_lock); 653 spin_lock(&sbinfo->stat_lock);
654 sbinfo->free_inodes++; 654 sbinfo->free_inodes++;
655 spin_unlock(&sbinfo->stat_lock); 655 spin_unlock(&sbinfo->stat_lock);
656 } 656 }
657 } 657 }
658 658
659 659
660 static struct kmem_cache *hugetlbfs_inode_cachep; 660 static struct kmem_cache *hugetlbfs_inode_cachep;
661 661
662 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) 662 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
663 { 663 {
664 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); 664 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
665 struct hugetlbfs_inode_info *p; 665 struct hugetlbfs_inode_info *p;
666 666
667 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) 667 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
668 return NULL; 668 return NULL;
669 p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); 669 p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
670 if (unlikely(!p)) { 670 if (unlikely(!p)) {
671 hugetlbfs_inc_free_inodes(sbinfo); 671 hugetlbfs_inc_free_inodes(sbinfo);
672 return NULL; 672 return NULL;
673 } 673 }
674 return &p->vfs_inode; 674 return &p->vfs_inode;
675 } 675 }
676 676
677 static void hugetlbfs_i_callback(struct rcu_head *head) 677 static void hugetlbfs_i_callback(struct rcu_head *head)
678 { 678 {
679 struct inode *inode = container_of(head, struct inode, i_rcu); 679 struct inode *inode = container_of(head, struct inode, i_rcu);
680 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 680 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
681 } 681 }
682 682
683 static void hugetlbfs_destroy_inode(struct inode *inode) 683 static void hugetlbfs_destroy_inode(struct inode *inode)
684 { 684 {
685 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 685 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
686 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 686 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
687 call_rcu(&inode->i_rcu, hugetlbfs_i_callback); 687 call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
688 } 688 }
689 689
690 static const struct address_space_operations hugetlbfs_aops = { 690 static const struct address_space_operations hugetlbfs_aops = {
691 .write_begin = hugetlbfs_write_begin, 691 .write_begin = hugetlbfs_write_begin,
692 .write_end = hugetlbfs_write_end, 692 .write_end = hugetlbfs_write_end,
693 .set_page_dirty = hugetlbfs_set_page_dirty, 693 .set_page_dirty = hugetlbfs_set_page_dirty,
694 .migratepage = hugetlbfs_migrate_page, 694 .migratepage = hugetlbfs_migrate_page,
695 }; 695 };
696 696
697 697
698 static void init_once(void *foo) 698 static void init_once(void *foo)
699 { 699 {
700 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; 700 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
701 701
702 inode_init_once(&ei->vfs_inode); 702 inode_init_once(&ei->vfs_inode);
703 } 703 }
704 704
705 const struct file_operations hugetlbfs_file_operations = { 705 const struct file_operations hugetlbfs_file_operations = {
706 .read = hugetlbfs_read, 706 .read = hugetlbfs_read,
707 .mmap = hugetlbfs_file_mmap, 707 .mmap = hugetlbfs_file_mmap,
708 .fsync = noop_fsync, 708 .fsync = noop_fsync,
709 .get_unmapped_area = hugetlb_get_unmapped_area, 709 .get_unmapped_area = hugetlb_get_unmapped_area,
710 .llseek = default_llseek, 710 .llseek = default_llseek,
711 }; 711 };
712 712
713 static const struct inode_operations hugetlbfs_dir_inode_operations = { 713 static const struct inode_operations hugetlbfs_dir_inode_operations = {
714 .create = hugetlbfs_create, 714 .create = hugetlbfs_create,
715 .lookup = simple_lookup, 715 .lookup = simple_lookup,
716 .link = simple_link, 716 .link = simple_link,
717 .unlink = simple_unlink, 717 .unlink = simple_unlink,
718 .symlink = hugetlbfs_symlink, 718 .symlink = hugetlbfs_symlink,
719 .mkdir = hugetlbfs_mkdir, 719 .mkdir = hugetlbfs_mkdir,
720 .rmdir = simple_rmdir, 720 .rmdir = simple_rmdir,
721 .mknod = hugetlbfs_mknod, 721 .mknod = hugetlbfs_mknod,
722 .rename = simple_rename, 722 .rename = simple_rename,
723 .setattr = hugetlbfs_setattr, 723 .setattr = hugetlbfs_setattr,
724 }; 724 };
725 725
726 static const struct inode_operations hugetlbfs_inode_operations = { 726 static const struct inode_operations hugetlbfs_inode_operations = {
727 .setattr = hugetlbfs_setattr, 727 .setattr = hugetlbfs_setattr,
728 }; 728 };
729 729
730 static const struct super_operations hugetlbfs_ops = { 730 static const struct super_operations hugetlbfs_ops = {
731 .alloc_inode = hugetlbfs_alloc_inode, 731 .alloc_inode = hugetlbfs_alloc_inode,
732 .destroy_inode = hugetlbfs_destroy_inode, 732 .destroy_inode = hugetlbfs_destroy_inode,
733 .evict_inode = hugetlbfs_evict_inode, 733 .evict_inode = hugetlbfs_evict_inode,
734 .statfs = hugetlbfs_statfs, 734 .statfs = hugetlbfs_statfs,
735 .put_super = hugetlbfs_put_super, 735 .put_super = hugetlbfs_put_super,
736 .show_options = generic_show_options, 736 .show_options = generic_show_options,
737 }; 737 };
738 738
739 static int 739 static int
740 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 740 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
741 { 741 {
742 char *p, *rest; 742 char *p, *rest;
743 substring_t args[MAX_OPT_ARGS]; 743 substring_t args[MAX_OPT_ARGS];
744 int option; 744 int option;
745 unsigned long long size = 0; 745 unsigned long long size = 0;
746 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; 746 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
747 747
748 if (!options) 748 if (!options)
749 return 0; 749 return 0;
750 750
751 while ((p = strsep(&options, ",")) != NULL) { 751 while ((p = strsep(&options, ",")) != NULL) {
752 int token; 752 int token;
753 if (!*p) 753 if (!*p)
754 continue; 754 continue;
755 755
756 token = match_token(p, tokens, args); 756 token = match_token(p, tokens, args);
757 switch (token) { 757 switch (token) {
758 case Opt_uid: 758 case Opt_uid:
759 if (match_int(&args[0], &option)) 759 if (match_int(&args[0], &option))
760 goto bad_val; 760 goto bad_val;
761 pconfig->uid = make_kuid(current_user_ns(), option); 761 pconfig->uid = make_kuid(current_user_ns(), option);
762 if (!uid_valid(pconfig->uid)) 762 if (!uid_valid(pconfig->uid))
763 goto bad_val; 763 goto bad_val;
764 break; 764 break;
765 765
766 case Opt_gid: 766 case Opt_gid:
767 if (match_int(&args[0], &option)) 767 if (match_int(&args[0], &option))
768 goto bad_val; 768 goto bad_val;
769 pconfig->gid = make_kgid(current_user_ns(), option); 769 pconfig->gid = make_kgid(current_user_ns(), option);
770 if (!gid_valid(pconfig->gid)) 770 if (!gid_valid(pconfig->gid))
771 goto bad_val; 771 goto bad_val;
772 break; 772 break;
773 773
774 case Opt_mode: 774 case Opt_mode:
775 if (match_octal(&args[0], &option)) 775 if (match_octal(&args[0], &option))
776 goto bad_val; 776 goto bad_val;
777 pconfig->mode = option & 01777U; 777 pconfig->mode = option & 01777U;
778 break; 778 break;
779 779
780 case Opt_size: { 780 case Opt_size: {
781 /* memparse() will accept a K/M/G without a digit */ 781 /* memparse() will accept a K/M/G without a digit */
782 if (!isdigit(*args[0].from)) 782 if (!isdigit(*args[0].from))
783 goto bad_val; 783 goto bad_val;
784 size = memparse(args[0].from, &rest); 784 size = memparse(args[0].from, &rest);
785 setsize = SIZE_STD; 785 setsize = SIZE_STD;
786 if (*rest == '%') 786 if (*rest == '%')
787 setsize = SIZE_PERCENT; 787 setsize = SIZE_PERCENT;
788 break; 788 break;
789 } 789 }
790 790
791 case Opt_nr_inodes: 791 case Opt_nr_inodes:
792 /* memparse() will accept a K/M/G without a digit */ 792 /* memparse() will accept a K/M/G without a digit */
793 if (!isdigit(*args[0].from)) 793 if (!isdigit(*args[0].from))
794 goto bad_val; 794 goto bad_val;
795 pconfig->nr_inodes = memparse(args[0].from, &rest); 795 pconfig->nr_inodes = memparse(args[0].from, &rest);
796 break; 796 break;
797 797
798 case Opt_pagesize: { 798 case Opt_pagesize: {
799 unsigned long ps; 799 unsigned long ps;
800 ps = memparse(args[0].from, &rest); 800 ps = memparse(args[0].from, &rest);
801 pconfig->hstate = size_to_hstate(ps); 801 pconfig->hstate = size_to_hstate(ps);
802 if (!pconfig->hstate) { 802 if (!pconfig->hstate) {
803 printk(KERN_ERR 803 printk(KERN_ERR
804 "hugetlbfs: Unsupported page size %lu MB\n", 804 "hugetlbfs: Unsupported page size %lu MB\n",
805 ps >> 20); 805 ps >> 20);
806 return -EINVAL; 806 return -EINVAL;
807 } 807 }
808 break; 808 break;
809 } 809 }
810 810
811 default: 811 default:
812 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", 812 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
813 p); 813 p);
814 return -EINVAL; 814 return -EINVAL;
815 break; 815 break;
816 } 816 }
817 } 817 }
818 818
819 /* Do size after hstate is set up */ 819 /* Do size after hstate is set up */
820 if (setsize > NO_SIZE) { 820 if (setsize > NO_SIZE) {
821 struct hstate *h = pconfig->hstate; 821 struct hstate *h = pconfig->hstate;
822 if (setsize == SIZE_PERCENT) { 822 if (setsize == SIZE_PERCENT) {
823 size <<= huge_page_shift(h); 823 size <<= huge_page_shift(h);
824 size *= h->max_huge_pages; 824 size *= h->max_huge_pages;
825 do_div(size, 100); 825 do_div(size, 100);
826 } 826 }
827 pconfig->nr_blocks = (size >> huge_page_shift(h)); 827 pconfig->nr_blocks = (size >> huge_page_shift(h));
828 } 828 }
829 829
830 return 0; 830 return 0;
831 831
832 bad_val: 832 bad_val:
833 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", 833 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
834 args[0].from, p); 834 args[0].from, p);
835 return -EINVAL; 835 return -EINVAL;
836 } 836 }
837 837
838 static int 838 static int
839 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) 839 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
840 { 840 {
841 int ret; 841 int ret;
842 struct hugetlbfs_config config; 842 struct hugetlbfs_config config;
843 struct hugetlbfs_sb_info *sbinfo; 843 struct hugetlbfs_sb_info *sbinfo;
844 844
845 save_mount_options(sb, data); 845 save_mount_options(sb, data);
846 846
847 config.nr_blocks = -1; /* No limit on size by default */ 847 config.nr_blocks = -1; /* No limit on size by default */
848 config.nr_inodes = -1; /* No limit on number of inodes by default */ 848 config.nr_inodes = -1; /* No limit on number of inodes by default */
849 config.uid = current_fsuid(); 849 config.uid = current_fsuid();
850 config.gid = current_fsgid(); 850 config.gid = current_fsgid();
851 config.mode = 0755; 851 config.mode = 0755;
852 config.hstate = &default_hstate; 852 config.hstate = &default_hstate;
853 ret = hugetlbfs_parse_options(data, &config); 853 ret = hugetlbfs_parse_options(data, &config);
854 if (ret) 854 if (ret)
855 return ret; 855 return ret;
856 856
857 sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); 857 sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
858 if (!sbinfo) 858 if (!sbinfo)
859 return -ENOMEM; 859 return -ENOMEM;
860 sb->s_fs_info = sbinfo; 860 sb->s_fs_info = sbinfo;
861 sbinfo->hstate = config.hstate; 861 sbinfo->hstate = config.hstate;
862 spin_lock_init(&sbinfo->stat_lock); 862 spin_lock_init(&sbinfo->stat_lock);
863 sbinfo->max_inodes = config.nr_inodes; 863 sbinfo->max_inodes = config.nr_inodes;
864 sbinfo->free_inodes = config.nr_inodes; 864 sbinfo->free_inodes = config.nr_inodes;
865 sbinfo->spool = NULL; 865 sbinfo->spool = NULL;
866 if (config.nr_blocks != -1) { 866 if (config.nr_blocks != -1) {
867 sbinfo->spool = hugepage_new_subpool(config.nr_blocks); 867 sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
868 if (!sbinfo->spool) 868 if (!sbinfo->spool)
869 goto out_free; 869 goto out_free;
870 } 870 }
871 sb->s_maxbytes = MAX_LFS_FILESIZE; 871 sb->s_maxbytes = MAX_LFS_FILESIZE;
872 sb->s_blocksize = huge_page_size(config.hstate); 872 sb->s_blocksize = huge_page_size(config.hstate);
873 sb->s_blocksize_bits = huge_page_shift(config.hstate); 873 sb->s_blocksize_bits = huge_page_shift(config.hstate);
874 sb->s_magic = HUGETLBFS_MAGIC; 874 sb->s_magic = HUGETLBFS_MAGIC;
875 sb->s_op = &hugetlbfs_ops; 875 sb->s_op = &hugetlbfs_ops;
876 sb->s_time_gran = 1; 876 sb->s_time_gran = 1;
877 sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); 877 sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
878 if (!sb->s_root) 878 if (!sb->s_root)
879 goto out_free; 879 goto out_free;
880 return 0; 880 return 0;
881 out_free: 881 out_free:
882 if (sbinfo->spool) 882 if (sbinfo->spool)
883 kfree(sbinfo->spool); 883 kfree(sbinfo->spool);
884 kfree(sbinfo); 884 kfree(sbinfo);
885 return -ENOMEM; 885 return -ENOMEM;
886 } 886 }
887 887
888 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, 888 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
889 int flags, const char *dev_name, void *data) 889 int flags, const char *dev_name, void *data)
890 { 890 {
891 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); 891 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
892 } 892 }
893 893
894 static struct file_system_type hugetlbfs_fs_type = { 894 static struct file_system_type hugetlbfs_fs_type = {
895 .name = "hugetlbfs", 895 .name = "hugetlbfs",
896 .mount = hugetlbfs_mount, 896 .mount = hugetlbfs_mount,
897 .kill_sb = kill_litter_super, 897 .kill_sb = kill_litter_super,
898 }; 898 };
899 MODULE_ALIAS_FS("hugetlbfs"); 899 MODULE_ALIAS_FS("hugetlbfs");
900 900
901 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; 901 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
902 902
903 static int can_do_hugetlb_shm(void) 903 static int can_do_hugetlb_shm(void)
904 { 904 {
905 kgid_t shm_group; 905 kgid_t shm_group;
906 shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); 906 shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
907 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 907 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
908 } 908 }
909 909
910 static int get_hstate_idx(int page_size_log) 910 static int get_hstate_idx(int page_size_log)
911 { 911 {
912 struct hstate *h; 912 struct hstate *h = hstate_sizelog(page_size_log);
913 913
914 if (!page_size_log)
915 return default_hstate_idx;
916 h = size_to_hstate(1 << page_size_log);
917 if (!h) 914 if (!h)
918 return -1; 915 return -1;
919 return h - hstates; 916 return h - hstates;
920 } 917 }
921 918
922 static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) 919 static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen)
923 { 920 {
924 return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", 921 return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
925 dentry->d_name.name); 922 dentry->d_name.name);
926 } 923 }
927 924
928 static struct dentry_operations anon_ops = { 925 static struct dentry_operations anon_ops = {
929 .d_dname = hugetlb_dname 926 .d_dname = hugetlb_dname
930 }; 927 };
931 928
932 struct file *hugetlb_file_setup(const char *name, unsigned long addr, 929 /*
933 size_t size, vm_flags_t acctflag, 930 * Note that size should be aligned to proper hugepage size in caller side,
934 struct user_struct **user, 931 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
932 */
933 struct file *hugetlb_file_setup(const char *name, size_t size,
934 vm_flags_t acctflag, struct user_struct **user,
935 int creat_flags, int page_size_log) 935 int creat_flags, int page_size_log)
936 { 936 {
937 struct file *file = ERR_PTR(-ENOMEM); 937 struct file *file = ERR_PTR(-ENOMEM);
938 struct inode *inode; 938 struct inode *inode;
939 struct path path; 939 struct path path;
940 struct super_block *sb; 940 struct super_block *sb;
941 struct qstr quick_string; 941 struct qstr quick_string;
942 struct hstate *hstate;
943 unsigned long num_pages;
944 int hstate_idx; 942 int hstate_idx;
945 943
946 hstate_idx = get_hstate_idx(page_size_log); 944 hstate_idx = get_hstate_idx(page_size_log);
947 if (hstate_idx < 0) 945 if (hstate_idx < 0)
948 return ERR_PTR(-ENODEV); 946 return ERR_PTR(-ENODEV);
949 947
950 *user = NULL; 948 *user = NULL;
951 if (!hugetlbfs_vfsmount[hstate_idx]) 949 if (!hugetlbfs_vfsmount[hstate_idx])
952 return ERR_PTR(-ENOENT); 950 return ERR_PTR(-ENOENT);
953 951
954 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 952 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
955 *user = current_user(); 953 *user = current_user();
956 if (user_shm_lock(size, *user)) { 954 if (user_shm_lock(size, *user)) {
957 task_lock(current); 955 task_lock(current);
958 printk_once(KERN_WARNING 956 printk_once(KERN_WARNING
959 "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", 957 "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
960 current->comm, current->pid); 958 current->comm, current->pid);
961 task_unlock(current); 959 task_unlock(current);
962 } else { 960 } else {
963 *user = NULL; 961 *user = NULL;
964 return ERR_PTR(-EPERM); 962 return ERR_PTR(-EPERM);
965 } 963 }
966 } 964 }
967 965
968 sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; 966 sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
969 quick_string.name = name; 967 quick_string.name = name;
970 quick_string.len = strlen(quick_string.name); 968 quick_string.len = strlen(quick_string.name);
971 quick_string.hash = 0; 969 quick_string.hash = 0;
972 path.dentry = d_alloc_pseudo(sb, &quick_string); 970 path.dentry = d_alloc_pseudo(sb, &quick_string);
973 if (!path.dentry) 971 if (!path.dentry)
974 goto out_shm_unlock; 972 goto out_shm_unlock;
975 973
976 d_set_d_op(path.dentry, &anon_ops); 974 d_set_d_op(path.dentry, &anon_ops);
977 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); 975 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
978 file = ERR_PTR(-ENOSPC); 976 file = ERR_PTR(-ENOSPC);
979 inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); 977 inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
980 if (!inode) 978 if (!inode)
981 goto out_dentry; 979 goto out_dentry;
982 980
983 hstate = hstate_inode(inode);
984 size += addr & ~huge_page_mask(hstate);
985 num_pages = ALIGN(size, huge_page_size(hstate)) >>
986 huge_page_shift(hstate);
987 file = ERR_PTR(-ENOMEM); 981 file = ERR_PTR(-ENOMEM);
988 if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) 982 if (hugetlb_reserve_pages(inode, 0,
983 size >> huge_page_shift(hstate_inode(inode)), NULL,
984 acctflag))
989 goto out_inode; 985 goto out_inode;
990 986
991 d_instantiate(path.dentry, inode); 987 d_instantiate(path.dentry, inode);
992 inode->i_size = size; 988 inode->i_size = size;
993 clear_nlink(inode); 989 clear_nlink(inode);
994 990
995 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 991 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
996 &hugetlbfs_file_operations); 992 &hugetlbfs_file_operations);
997 if (IS_ERR(file)) 993 if (IS_ERR(file))
998 goto out_dentry; /* inode is already attached */ 994 goto out_dentry; /* inode is already attached */
999 995
1000 return file; 996 return file;
1001 997
1002 out_inode: 998 out_inode:
1003 iput(inode); 999 iput(inode);
1004 out_dentry: 1000 out_dentry:
1005 path_put(&path); 1001 path_put(&path);
1006 out_shm_unlock: 1002 out_shm_unlock:
1007 if (*user) { 1003 if (*user) {
1008 user_shm_unlock(size, *user); 1004 user_shm_unlock(size, *user);
1009 *user = NULL; 1005 *user = NULL;
1010 } 1006 }
1011 return file; 1007 return file;
1012 } 1008 }
1013 1009
1014 static int __init init_hugetlbfs_fs(void) 1010 static int __init init_hugetlbfs_fs(void)
1015 { 1011 {
1016 struct hstate *h; 1012 struct hstate *h;
1017 int error; 1013 int error;
1018 int i; 1014 int i;
1019 1015
1020 error = bdi_init(&hugetlbfs_backing_dev_info); 1016 error = bdi_init(&hugetlbfs_backing_dev_info);
1021 if (error) 1017 if (error)
1022 return error; 1018 return error;
1023 1019
1024 error = -ENOMEM; 1020 error = -ENOMEM;
1025 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1021 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1026 sizeof(struct hugetlbfs_inode_info), 1022 sizeof(struct hugetlbfs_inode_info),
1027 0, 0, init_once); 1023 0, 0, init_once);
1028 if (hugetlbfs_inode_cachep == NULL) 1024 if (hugetlbfs_inode_cachep == NULL)
1029 goto out2; 1025 goto out2;
1030 1026
1031 error = register_filesystem(&hugetlbfs_fs_type); 1027 error = register_filesystem(&hugetlbfs_fs_type);
1032 if (error) 1028 if (error)
1033 goto out; 1029 goto out;
1034 1030
1035 i = 0; 1031 i = 0;
1036 for_each_hstate(h) { 1032 for_each_hstate(h) {
1037 char buf[50]; 1033 char buf[50];
1038 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); 1034 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1039 1035
1040 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); 1036 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1041 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, 1037 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1042 buf); 1038 buf);
1043 1039
1044 if (IS_ERR(hugetlbfs_vfsmount[i])) { 1040 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1045 pr_err("hugetlb: Cannot mount internal hugetlbfs for " 1041 pr_err("hugetlb: Cannot mount internal hugetlbfs for "
1046 "page size %uK", ps_kb); 1042 "page size %uK", ps_kb);
1047 error = PTR_ERR(hugetlbfs_vfsmount[i]); 1043 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1048 hugetlbfs_vfsmount[i] = NULL; 1044 hugetlbfs_vfsmount[i] = NULL;
1049 } 1045 }
1050 i++; 1046 i++;
1051 } 1047 }
1052 /* Non default hstates are optional */ 1048 /* Non default hstates are optional */
1053 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) 1049 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1054 return 0; 1050 return 0;
1055 1051
1056 out: 1052 out:
1057 kmem_cache_destroy(hugetlbfs_inode_cachep); 1053 kmem_cache_destroy(hugetlbfs_inode_cachep);
1058 out2: 1054 out2:
1059 bdi_destroy(&hugetlbfs_backing_dev_info); 1055 bdi_destroy(&hugetlbfs_backing_dev_info);
1060 return error; 1056 return error;
1061 } 1057 }
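init_hugetlbfs_fs() creates one internal hugetlbfs mount per registered hstate and passes the page size through the pagesize= option it formats above; only the default hstate's mount is required for the function to succeed. For comparison, an explicit userspace mount of the same filesystem is sketched below; the /mnt/huge path and the 2 MB page size are illustrative assumptions, and the call needs root.

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Same option format the kernel builds above: "pagesize=%uK". */
		if (mount("none", "/mnt/huge", "hugetlbfs", 0, "pagesize=2048K")) {
			perror("mount hugetlbfs");
			return 1;
		}
		return 0;
	}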
1062 1058
1063 static void __exit exit_hugetlbfs_fs(void) 1059 static void __exit exit_hugetlbfs_fs(void)
1064 { 1060 {
1065 struct hstate *h; 1061 struct hstate *h;
1066 int i; 1062 int i;
1067 1063
1068 1064
1069 /* 1065 /*
1070 * Make sure all delayed rcu free inodes are flushed before we 1066 * Make sure all delayed rcu free inodes are flushed before we
1071 * destroy cache. 1067 * destroy cache.
1072 */ 1068 */
1073 rcu_barrier(); 1069 rcu_barrier();
1074 kmem_cache_destroy(hugetlbfs_inode_cachep); 1070 kmem_cache_destroy(hugetlbfs_inode_cachep);
1075 i = 0; 1071 i = 0;
1076 for_each_hstate(h) 1072 for_each_hstate(h)
1077 kern_unmount(hugetlbfs_vfsmount[i++]); 1073 kern_unmount(hugetlbfs_vfsmount[i++]);
1078 unregister_filesystem(&hugetlbfs_fs_type); 1074 unregister_filesystem(&hugetlbfs_fs_type);
1079 bdi_destroy(&hugetlbfs_backing_dev_info); 1075 bdi_destroy(&hugetlbfs_backing_dev_info);
1080 } 1076 }
1081 1077
include/linux/hugetlb.h
1 #ifndef _LINUX_HUGETLB_H 1 #ifndef _LINUX_HUGETLB_H
2 #define _LINUX_HUGETLB_H 2 #define _LINUX_HUGETLB_H
3 3
4 #include <linux/mm_types.h> 4 #include <linux/mm_types.h>
5 #include <linux/fs.h> 5 #include <linux/fs.h>
6 #include <linux/hugetlb_inline.h> 6 #include <linux/hugetlb_inline.h>
7 #include <linux/cgroup.h> 7 #include <linux/cgroup.h>
8 8
9 struct ctl_table; 9 struct ctl_table;
10 struct user_struct; 10 struct user_struct;
11 struct mmu_gather; 11 struct mmu_gather;
12 12
13 #ifdef CONFIG_HUGETLB_PAGE 13 #ifdef CONFIG_HUGETLB_PAGE
14 14
15 #include <linux/mempolicy.h> 15 #include <linux/mempolicy.h>
16 #include <linux/shm.h> 16 #include <linux/shm.h>
17 #include <asm/tlbflush.h> 17 #include <asm/tlbflush.h>
18 18
19 struct hugepage_subpool { 19 struct hugepage_subpool {
20 spinlock_t lock; 20 spinlock_t lock;
21 long count; 21 long count;
22 long max_hpages, used_hpages; 22 long max_hpages, used_hpages;
23 }; 23 };
24 24
25 extern spinlock_t hugetlb_lock; 25 extern spinlock_t hugetlb_lock;
26 extern int hugetlb_max_hstate __read_mostly; 26 extern int hugetlb_max_hstate __read_mostly;
27 #define for_each_hstate(h) \ 27 #define for_each_hstate(h) \
28 for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) 28 for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)
29 29
30 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); 30 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
31 void hugepage_put_subpool(struct hugepage_subpool *spool); 31 void hugepage_put_subpool(struct hugepage_subpool *spool);
32 32
33 int PageHuge(struct page *page); 33 int PageHuge(struct page *page);
34 34
35 void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 35 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
36 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 36 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
37 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 37 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
38 int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 38 int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
39 39
40 #ifdef CONFIG_NUMA 40 #ifdef CONFIG_NUMA
41 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, 41 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
42 void __user *, size_t *, loff_t *); 42 void __user *, size_t *, loff_t *);
43 #endif 43 #endif
44 44
45 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); 45 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
46 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, 46 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
47 struct page **, struct vm_area_struct **, 47 struct page **, struct vm_area_struct **,
48 unsigned long *, unsigned long *, long, unsigned int); 48 unsigned long *, unsigned long *, long, unsigned int);
49 void unmap_hugepage_range(struct vm_area_struct *, 49 void unmap_hugepage_range(struct vm_area_struct *,
50 unsigned long, unsigned long, struct page *); 50 unsigned long, unsigned long, struct page *);
51 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 51 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
52 struct vm_area_struct *vma, 52 struct vm_area_struct *vma,
53 unsigned long start, unsigned long end, 53 unsigned long start, unsigned long end,
54 struct page *ref_page); 54 struct page *ref_page);
55 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 55 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
56 unsigned long start, unsigned long end, 56 unsigned long start, unsigned long end,
57 struct page *ref_page); 57 struct page *ref_page);
58 int hugetlb_prefault(struct address_space *, struct vm_area_struct *); 58 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
59 void hugetlb_report_meminfo(struct seq_file *); 59 void hugetlb_report_meminfo(struct seq_file *);
60 int hugetlb_report_node_meminfo(int, char *); 60 int hugetlb_report_node_meminfo(int, char *);
61 void hugetlb_show_meminfo(void); 61 void hugetlb_show_meminfo(void);
62 unsigned long hugetlb_total_pages(void); 62 unsigned long hugetlb_total_pages(void);
63 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 63 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
64 unsigned long address, unsigned int flags); 64 unsigned long address, unsigned int flags);
65 int hugetlb_reserve_pages(struct inode *inode, long from, long to, 65 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
66 struct vm_area_struct *vma, 66 struct vm_area_struct *vma,
67 vm_flags_t vm_flags); 67 vm_flags_t vm_flags);
68 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 68 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
69 int dequeue_hwpoisoned_huge_page(struct page *page); 69 int dequeue_hwpoisoned_huge_page(struct page *page);
70 void copy_huge_page(struct page *dst, struct page *src); 70 void copy_huge_page(struct page *dst, struct page *src);
71 71
72 extern unsigned long hugepages_treat_as_movable; 72 extern unsigned long hugepages_treat_as_movable;
73 extern const unsigned long hugetlb_zero, hugetlb_infinity; 73 extern const unsigned long hugetlb_zero, hugetlb_infinity;
74 extern int sysctl_hugetlb_shm_group; 74 extern int sysctl_hugetlb_shm_group;
75 extern struct list_head huge_boot_pages; 75 extern struct list_head huge_boot_pages;
76 76
77 /* arch callbacks */ 77 /* arch callbacks */
78 78
79 pte_t *huge_pte_alloc(struct mm_struct *mm, 79 pte_t *huge_pte_alloc(struct mm_struct *mm,
80 unsigned long addr, unsigned long sz); 80 unsigned long addr, unsigned long sz);
81 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); 81 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
82 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); 82 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
83 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, 83 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
84 int write); 84 int write);
85 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 85 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
86 pmd_t *pmd, int write); 86 pmd_t *pmd, int write);
87 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, 87 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
88 pud_t *pud, int write); 88 pud_t *pud, int write);
89 int pmd_huge(pmd_t pmd); 89 int pmd_huge(pmd_t pmd);
90 int pud_huge(pud_t pmd); 90 int pud_huge(pud_t pmd);
91 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 91 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
92 unsigned long address, unsigned long end, pgprot_t newprot); 92 unsigned long address, unsigned long end, pgprot_t newprot);
93 93
94 #else /* !CONFIG_HUGETLB_PAGE */ 94 #else /* !CONFIG_HUGETLB_PAGE */
95 95
96 static inline int PageHuge(struct page *page) 96 static inline int PageHuge(struct page *page)
97 { 97 {
98 return 0; 98 return 0;
99 } 99 }
100 100
101 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 101 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
102 { 102 {
103 } 103 }
104 104
105 static inline unsigned long hugetlb_total_pages(void) 105 static inline unsigned long hugetlb_total_pages(void)
106 { 106 {
107 return 0; 107 return 0;
108 } 108 }
109 109
110 #define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; }) 110 #define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; })
111 #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) 111 #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
112 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) 112 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
113 #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) 113 #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
114 static inline void hugetlb_report_meminfo(struct seq_file *m) 114 static inline void hugetlb_report_meminfo(struct seq_file *m)
115 { 115 {
116 } 116 }
117 #define hugetlb_report_node_meminfo(n, buf) 0 117 #define hugetlb_report_node_meminfo(n, buf) 0
118 static inline void hugetlb_show_meminfo(void) 118 static inline void hugetlb_show_meminfo(void)
119 { 119 {
120 } 120 }
121 #define follow_huge_pmd(mm, addr, pmd, write) NULL 121 #define follow_huge_pmd(mm, addr, pmd, write) NULL
122 #define follow_huge_pud(mm, addr, pud, write) NULL 122 #define follow_huge_pud(mm, addr, pud, write) NULL
123 #define prepare_hugepage_range(file, addr, len) (-EINVAL) 123 #define prepare_hugepage_range(file, addr, len) (-EINVAL)
124 #define pmd_huge(x) 0 124 #define pmd_huge(x) 0
125 #define pud_huge(x) 0 125 #define pud_huge(x) 0
126 #define is_hugepage_only_range(mm, addr, len) 0 126 #define is_hugepage_only_range(mm, addr, len) 0
127 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) 127 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
128 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) 128 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
129 #define huge_pte_offset(mm, address) 0 129 #define huge_pte_offset(mm, address) 0
130 static inline int dequeue_hwpoisoned_huge_page(struct page *page) 130 static inline int dequeue_hwpoisoned_huge_page(struct page *page)
131 { 131 {
132 return 0; 132 return 0;
133 } 133 }
134 134
135 static inline void copy_huge_page(struct page *dst, struct page *src) 135 static inline void copy_huge_page(struct page *dst, struct page *src)
136 { 136 {
137 } 137 }
138 138
139 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 139 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
140 unsigned long address, unsigned long end, pgprot_t newprot) 140 unsigned long address, unsigned long end, pgprot_t newprot)
141 { 141 {
142 return 0; 142 return 0;
143 } 143 }
144 144
145 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, 145 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
146 struct vm_area_struct *vma, unsigned long start, 146 struct vm_area_struct *vma, unsigned long start,
147 unsigned long end, struct page *ref_page) 147 unsigned long end, struct page *ref_page)
148 { 148 {
149 BUG(); 149 BUG();
150 } 150 }
151 151
152 static inline void __unmap_hugepage_range(struct mmu_gather *tlb, 152 static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
153 struct vm_area_struct *vma, unsigned long start, 153 struct vm_area_struct *vma, unsigned long start,
154 unsigned long end, struct page *ref_page) 154 unsigned long end, struct page *ref_page)
155 { 155 {
156 BUG(); 156 BUG();
157 } 157 }
158 158
159 #endif /* !CONFIG_HUGETLB_PAGE */ 159 #endif /* !CONFIG_HUGETLB_PAGE */
160 160
161 #define HUGETLB_ANON_FILE "anon_hugepage" 161 #define HUGETLB_ANON_FILE "anon_hugepage"
162 162
163 enum { 163 enum {
164 /* 164 /*
165 * The file will be used as an shm file so shmfs accounting rules 165 * The file will be used as an shm file so shmfs accounting rules
166 * apply 166 * apply
167 */ 167 */
168 HUGETLB_SHMFS_INODE = 1, 168 HUGETLB_SHMFS_INODE = 1,
169 /* 169 /*
170 * The file is being created on the internal vfs mount and shmfs 170 * The file is being created on the internal vfs mount and shmfs
171 * accounting rules do not apply 171 * accounting rules do not apply
172 */ 172 */
173 HUGETLB_ANONHUGE_INODE = 2, 173 HUGETLB_ANONHUGE_INODE = 2,
174 }; 174 };
175 175
176 #ifdef CONFIG_HUGETLBFS 176 #ifdef CONFIG_HUGETLBFS
177 struct hugetlbfs_sb_info { 177 struct hugetlbfs_sb_info {
178 long max_inodes; /* inodes allowed */ 178 long max_inodes; /* inodes allowed */
179 long free_inodes; /* inodes free */ 179 long free_inodes; /* inodes free */
180 spinlock_t stat_lock; 180 spinlock_t stat_lock;
181 struct hstate *hstate; 181 struct hstate *hstate;
182 struct hugepage_subpool *spool; 182 struct hugepage_subpool *spool;
183 }; 183 };
184 184
185 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) 185 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
186 { 186 {
187 return sb->s_fs_info; 187 return sb->s_fs_info;
188 } 188 }
189 189
190 extern const struct file_operations hugetlbfs_file_operations; 190 extern const struct file_operations hugetlbfs_file_operations;
191 extern const struct vm_operations_struct hugetlb_vm_ops; 191 extern const struct vm_operations_struct hugetlb_vm_ops;
192 struct file *hugetlb_file_setup(const char *name, unsigned long addr, 192 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
193 size_t size, vm_flags_t acct,
194 struct user_struct **user, int creat_flags, 193 struct user_struct **user, int creat_flags,
195 int page_size_log); 194 int page_size_log);
196 195
197 static inline int is_file_hugepages(struct file *file) 196 static inline int is_file_hugepages(struct file *file)
198 { 197 {
199 if (file->f_op == &hugetlbfs_file_operations) 198 if (file->f_op == &hugetlbfs_file_operations)
200 return 1; 199 return 1;
201 if (is_file_shm_hugepages(file)) 200 if (is_file_shm_hugepages(file))
202 return 1; 201 return 1;
203 202
204 return 0; 203 return 0;
205 } 204 }
206 205
207 206
208 #else /* !CONFIG_HUGETLBFS */ 207 #else /* !CONFIG_HUGETLBFS */
209 208
210 #define is_file_hugepages(file) 0 209 #define is_file_hugepages(file) 0
211 static inline struct file * 210 static inline struct file *
212 hugetlb_file_setup(const char *name, unsigned long addr, size_t size, 211 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
213 vm_flags_t acctflag, struct user_struct **user, int creat_flags, 212 struct user_struct **user, int creat_flags,
214 int page_size_log) 213 int page_size_log)
215 { 214 {
216 return ERR_PTR(-ENOSYS); 215 return ERR_PTR(-ENOSYS);
217 } 216 }
218 217
219 #endif /* !CONFIG_HUGETLBFS */ 218 #endif /* !CONFIG_HUGETLBFS */
220 219
221 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 220 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
222 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 221 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
223 unsigned long len, unsigned long pgoff, 222 unsigned long len, unsigned long pgoff,
224 unsigned long flags); 223 unsigned long flags);
225 #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ 224 #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
226 225
227 #ifdef CONFIG_HUGETLB_PAGE 226 #ifdef CONFIG_HUGETLB_PAGE
228 227
229 #define HSTATE_NAME_LEN 32 228 #define HSTATE_NAME_LEN 32
230 /* Defines one hugetlb page size */ 229 /* Defines one hugetlb page size */
231 struct hstate { 230 struct hstate {
232 int next_nid_to_alloc; 231 int next_nid_to_alloc;
233 int next_nid_to_free; 232 int next_nid_to_free;
234 unsigned int order; 233 unsigned int order;
235 unsigned long mask; 234 unsigned long mask;
236 unsigned long max_huge_pages; 235 unsigned long max_huge_pages;
237 unsigned long nr_huge_pages; 236 unsigned long nr_huge_pages;
238 unsigned long free_huge_pages; 237 unsigned long free_huge_pages;
239 unsigned long resv_huge_pages; 238 unsigned long resv_huge_pages;
240 unsigned long surplus_huge_pages; 239 unsigned long surplus_huge_pages;
241 unsigned long nr_overcommit_huge_pages; 240 unsigned long nr_overcommit_huge_pages;
242 struct list_head hugepage_activelist; 241 struct list_head hugepage_activelist;
243 struct list_head hugepage_freelists[MAX_NUMNODES]; 242 struct list_head hugepage_freelists[MAX_NUMNODES];
244 unsigned int nr_huge_pages_node[MAX_NUMNODES]; 243 unsigned int nr_huge_pages_node[MAX_NUMNODES];
245 unsigned int free_huge_pages_node[MAX_NUMNODES]; 244 unsigned int free_huge_pages_node[MAX_NUMNODES];
246 unsigned int surplus_huge_pages_node[MAX_NUMNODES]; 245 unsigned int surplus_huge_pages_node[MAX_NUMNODES];
247 #ifdef CONFIG_CGROUP_HUGETLB 246 #ifdef CONFIG_CGROUP_HUGETLB
248 /* cgroup control files */ 247 /* cgroup control files */
249 struct cftype cgroup_files[5]; 248 struct cftype cgroup_files[5];
250 #endif 249 #endif
251 char name[HSTATE_NAME_LEN]; 250 char name[HSTATE_NAME_LEN];
252 }; 251 };
253 252
254 struct huge_bootmem_page { 253 struct huge_bootmem_page {
255 struct list_head list; 254 struct list_head list;
256 struct hstate *hstate; 255 struct hstate *hstate;
257 #ifdef CONFIG_HIGHMEM 256 #ifdef CONFIG_HIGHMEM
258 phys_addr_t phys; 257 phys_addr_t phys;
259 #endif 258 #endif
260 }; 259 };
261 260
262 struct page *alloc_huge_page_node(struct hstate *h, int nid); 261 struct page *alloc_huge_page_node(struct hstate *h, int nid);
263 262
264 /* arch callback */ 263 /* arch callback */
265 int __init alloc_bootmem_huge_page(struct hstate *h); 264 int __init alloc_bootmem_huge_page(struct hstate *h);
266 265
267 void __init hugetlb_add_hstate(unsigned order); 266 void __init hugetlb_add_hstate(unsigned order);
268 struct hstate *size_to_hstate(unsigned long size); 267 struct hstate *size_to_hstate(unsigned long size);
269 268
270 #ifndef HUGE_MAX_HSTATE 269 #ifndef HUGE_MAX_HSTATE
271 #define HUGE_MAX_HSTATE 1 270 #define HUGE_MAX_HSTATE 1
272 #endif 271 #endif
273 272
274 extern struct hstate hstates[HUGE_MAX_HSTATE]; 273 extern struct hstate hstates[HUGE_MAX_HSTATE];
275 extern unsigned int default_hstate_idx; 274 extern unsigned int default_hstate_idx;
276 275
277 #define default_hstate (hstates[default_hstate_idx]) 276 #define default_hstate (hstates[default_hstate_idx])
278 277
279 static inline struct hstate *hstate_inode(struct inode *i) 278 static inline struct hstate *hstate_inode(struct inode *i)
280 { 279 {
281 struct hugetlbfs_sb_info *hsb; 280 struct hugetlbfs_sb_info *hsb;
282 hsb = HUGETLBFS_SB(i->i_sb); 281 hsb = HUGETLBFS_SB(i->i_sb);
283 return hsb->hstate; 282 return hsb->hstate;
284 } 283 }
285 284
286 static inline struct hstate *hstate_file(struct file *f) 285 static inline struct hstate *hstate_file(struct file *f)
287 { 286 {
288 return hstate_inode(file_inode(f)); 287 return hstate_inode(file_inode(f));
289 } 288 }
290 289
290 static inline struct hstate *hstate_sizelog(int page_size_log)
291 {
292 if (!page_size_log)
293 return &default_hstate;
294 return size_to_hstate(1 << page_size_log);
295 }
296
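hstate_sizelog() is the helper this patch introduces so callers can translate the page-size-log bits carried in mmap()/shmget() flags into an hstate: 0 keeps the default hugepage size, any other value is looked up with size_to_hstate() and yields NULL when no such hstate is registered. A sketch of the intended use (the -EINVAL return is how a caller would typically reject an unsupported size; the variable names are illustrative):

	/* page_size_log comes from the flag bits,
	 * e.g. (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK; 21 selects 2 MB pages.
	 */
	struct hstate *hs = hstate_sizelog(page_size_log);
	if (!hs)
		return -EINVAL;		/* no hstate registered for that page size */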
291 static inline struct hstate *hstate_vma(struct vm_area_struct *vma) 297 static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
292 { 298 {
293 return hstate_file(vma->vm_file); 299 return hstate_file(vma->vm_file);
294 } 300 }
295 301
296 static inline unsigned long huge_page_size(struct hstate *h) 302 static inline unsigned long huge_page_size(struct hstate *h)
297 { 303 {
298 return (unsigned long)PAGE_SIZE << h->order; 304 return (unsigned long)PAGE_SIZE << h->order;
299 } 305 }
300 306
301 extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); 307 extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);
302 308
303 extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); 309 extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);
304 310
305 static inline unsigned long huge_page_mask(struct hstate *h) 311 static inline unsigned long huge_page_mask(struct hstate *h)
306 { 312 {
307 return h->mask; 313 return h->mask;
308 } 314 }
309 315
310 static inline unsigned int huge_page_order(struct hstate *h) 316 static inline unsigned int huge_page_order(struct hstate *h)
311 { 317 {
312 return h->order; 318 return h->order;
313 } 319 }
314 320
315 static inline unsigned huge_page_shift(struct hstate *h) 321 static inline unsigned huge_page_shift(struct hstate *h)
316 { 322 {
317 return h->order + PAGE_SHIFT; 323 return h->order + PAGE_SHIFT;
318 } 324 }
319 325
320 static inline unsigned int pages_per_huge_page(struct hstate *h) 326 static inline unsigned int pages_per_huge_page(struct hstate *h)
321 { 327 {
322 return 1 << h->order; 328 return 1 << h->order;
323 } 329 }
324 330
325 static inline unsigned int blocks_per_huge_page(struct hstate *h) 331 static inline unsigned int blocks_per_huge_page(struct hstate *h)
326 { 332 {
327 return huge_page_size(h) / 512; 333 return huge_page_size(h) / 512;
328 } 334 }
329 335
330 #include <asm/hugetlb.h> 336 #include <asm/hugetlb.h>
331 337
332 #ifndef arch_make_huge_pte 338 #ifndef arch_make_huge_pte
333 static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, 339 static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
334 struct page *page, int writable) 340 struct page *page, int writable)
335 { 341 {
336 return entry; 342 return entry;
337 } 343 }
338 #endif 344 #endif
339 345
340 static inline struct hstate *page_hstate(struct page *page) 346 static inline struct hstate *page_hstate(struct page *page)
341 { 347 {
342 return size_to_hstate(PAGE_SIZE << compound_order(page)); 348 return size_to_hstate(PAGE_SIZE << compound_order(page));
343 } 349 }
344 350
345 static inline unsigned hstate_index_to_shift(unsigned index) 351 static inline unsigned hstate_index_to_shift(unsigned index)
346 { 352 {
347 return hstates[index].order + PAGE_SHIFT; 353 return hstates[index].order + PAGE_SHIFT;
348 } 354 }
349 355
350 static inline int hstate_index(struct hstate *h) 356 static inline int hstate_index(struct hstate *h)
351 { 357 {
352 return h - hstates; 358 return h - hstates;
353 } 359 }
354 360
355 #else 361 #else /* CONFIG_HUGETLB_PAGE */
356 struct hstate {}; 362 struct hstate {};
357 #define alloc_huge_page_node(h, nid) NULL 363 #define alloc_huge_page_node(h, nid) NULL
358 #define alloc_bootmem_huge_page(h) NULL 364 #define alloc_bootmem_huge_page(h) NULL
359 #define hstate_file(f) NULL 365 #define hstate_file(f) NULL
366 #define hstate_sizelog(s) NULL
360 #define hstate_vma(v) NULL 367 #define hstate_vma(v) NULL
361 #define hstate_inode(i) NULL 368 #define hstate_inode(i) NULL
362 #define huge_page_size(h) PAGE_SIZE 369 #define huge_page_size(h) PAGE_SIZE
363 #define huge_page_mask(h) PAGE_MASK 370 #define huge_page_mask(h) PAGE_MASK
364 #define vma_kernel_pagesize(v) PAGE_SIZE 371 #define vma_kernel_pagesize(v) PAGE_SIZE
365 #define vma_mmu_pagesize(v) PAGE_SIZE 372 #define vma_mmu_pagesize(v) PAGE_SIZE
366 #define huge_page_order(h) 0 373 #define huge_page_order(h) 0
367 #define huge_page_shift(h) PAGE_SHIFT 374 #define huge_page_shift(h) PAGE_SHIFT
368 static inline unsigned int pages_per_huge_page(struct hstate *h) 375 static inline unsigned int pages_per_huge_page(struct hstate *h)
369 { 376 {
370 return 1; 377 return 1;
371 } 378 }
372 #define hstate_index_to_shift(index) 0 379 #define hstate_index_to_shift(index) 0
373 #define hstate_index(h) 0 380 #define hstate_index(h) 0
374 #endif 381 #endif /* CONFIG_HUGETLB_PAGE */
375 382
376 #endif /* _LINUX_HUGETLB_H */ 383 #endif /* _LINUX_HUGETLB_H */
ipc/shm.c
1 /* 1 /*
2 * linux/ipc/shm.c 2 * linux/ipc/shm.c
3 * Copyright (C) 1992, 1993 Krishna Balasubramanian 3 * Copyright (C) 1992, 1993 Krishna Balasubramanian
4 * Many improvements/fixes by Bruno Haible. 4 * Many improvements/fixes by Bruno Haible.
5 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. 5 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
6 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. 6 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
7 * 7 *
8 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> 8 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
9 * BIGMEM support, Andrea Arcangeli <andrea@suse.de> 9 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
10 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> 10 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
11 * HIGHMEM support, Ingo Molnar <mingo@redhat.com> 11 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
12 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> 12 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
13 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> 13 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
14 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> 14 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
15 * 15 *
16 * support for audit of ipc object properties and permission changes 16 * support for audit of ipc object properties and permission changes
17 * Dustin Kirkland <dustin.kirkland@us.ibm.com> 17 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
18 * 18 *
19 * namespaces support 19 * namespaces support
20 * OpenVZ, SWsoft Inc. 20 * OpenVZ, SWsoft Inc.
21 * Pavel Emelianov <xemul@openvz.org> 21 * Pavel Emelianov <xemul@openvz.org>
22 */ 22 */
23 23
24 #include <linux/slab.h> 24 #include <linux/slab.h>
25 #include <linux/mm.h> 25 #include <linux/mm.h>
26 #include <linux/hugetlb.h> 26 #include <linux/hugetlb.h>
27 #include <linux/shm.h> 27 #include <linux/shm.h>
28 #include <linux/init.h> 28 #include <linux/init.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/mman.h> 30 #include <linux/mman.h>
31 #include <linux/shmem_fs.h> 31 #include <linux/shmem_fs.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 #include <linux/syscalls.h> 33 #include <linux/syscalls.h>
34 #include <linux/audit.h> 34 #include <linux/audit.h>
35 #include <linux/capability.h> 35 #include <linux/capability.h>
36 #include <linux/ptrace.h> 36 #include <linux/ptrace.h>
37 #include <linux/seq_file.h> 37 #include <linux/seq_file.h>
38 #include <linux/rwsem.h> 38 #include <linux/rwsem.h>
39 #include <linux/nsproxy.h> 39 #include <linux/nsproxy.h>
40 #include <linux/mount.h> 40 #include <linux/mount.h>
41 #include <linux/ipc_namespace.h> 41 #include <linux/ipc_namespace.h>
42 42
43 #include <asm/uaccess.h> 43 #include <asm/uaccess.h>
44 44
45 #include "util.h" 45 #include "util.h"
46 46
47 struct shm_file_data { 47 struct shm_file_data {
48 int id; 48 int id;
49 struct ipc_namespace *ns; 49 struct ipc_namespace *ns;
50 struct file *file; 50 struct file *file;
51 const struct vm_operations_struct *vm_ops; 51 const struct vm_operations_struct *vm_ops;
52 }; 52 };
53 53
54 #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) 54 #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
55 55
56 static const struct file_operations shm_file_operations; 56 static const struct file_operations shm_file_operations;
57 static const struct vm_operations_struct shm_vm_ops; 57 static const struct vm_operations_struct shm_vm_ops;
58 58
59 #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) 59 #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS])
60 60
61 #define shm_unlock(shp) \ 61 #define shm_unlock(shp) \
62 ipc_unlock(&(shp)->shm_perm) 62 ipc_unlock(&(shp)->shm_perm)
63 63
64 static int newseg(struct ipc_namespace *, struct ipc_params *); 64 static int newseg(struct ipc_namespace *, struct ipc_params *);
65 static void shm_open(struct vm_area_struct *vma); 65 static void shm_open(struct vm_area_struct *vma);
66 static void shm_close(struct vm_area_struct *vma); 66 static void shm_close(struct vm_area_struct *vma);
67 static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); 67 static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
68 #ifdef CONFIG_PROC_FS 68 #ifdef CONFIG_PROC_FS
69 static int sysvipc_shm_proc_show(struct seq_file *s, void *it); 69 static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
70 #endif 70 #endif
71 71
72 void shm_init_ns(struct ipc_namespace *ns) 72 void shm_init_ns(struct ipc_namespace *ns)
73 { 73 {
74 ns->shm_ctlmax = SHMMAX; 74 ns->shm_ctlmax = SHMMAX;
75 ns->shm_ctlall = SHMALL; 75 ns->shm_ctlall = SHMALL;
76 ns->shm_ctlmni = SHMMNI; 76 ns->shm_ctlmni = SHMMNI;
77 ns->shm_rmid_forced = 0; 77 ns->shm_rmid_forced = 0;
78 ns->shm_tot = 0; 78 ns->shm_tot = 0;
79 ipc_init_ids(&shm_ids(ns)); 79 ipc_init_ids(&shm_ids(ns));
80 } 80 }
81 81
82 /* 82 /*
83 * Called with shm_ids.rw_mutex (writer) and the shp structure locked. 83 * Called with shm_ids.rw_mutex (writer) and the shp structure locked.
84 * Only shm_ids.rw_mutex remains locked on exit. 84 * Only shm_ids.rw_mutex remains locked on exit.
85 */ 85 */
86 static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) 86 static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
87 { 87 {
88 struct shmid_kernel *shp; 88 struct shmid_kernel *shp;
89 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 89 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
90 90
91 if (shp->shm_nattch){ 91 if (shp->shm_nattch){
92 shp->shm_perm.mode |= SHM_DEST; 92 shp->shm_perm.mode |= SHM_DEST;
93 /* Do not find it any more */ 93 /* Do not find it any more */
94 shp->shm_perm.key = IPC_PRIVATE; 94 shp->shm_perm.key = IPC_PRIVATE;
95 shm_unlock(shp); 95 shm_unlock(shp);
96 } else 96 } else
97 shm_destroy(ns, shp); 97 shm_destroy(ns, shp);
98 } 98 }
99 99
100 #ifdef CONFIG_IPC_NS 100 #ifdef CONFIG_IPC_NS
101 void shm_exit_ns(struct ipc_namespace *ns) 101 void shm_exit_ns(struct ipc_namespace *ns)
102 { 102 {
103 free_ipcs(ns, &shm_ids(ns), do_shm_rmid); 103 free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
104 idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); 104 idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
105 } 105 }
106 #endif 106 #endif
107 107
108 static int __init ipc_ns_init(void) 108 static int __init ipc_ns_init(void)
109 { 109 {
110 shm_init_ns(&init_ipc_ns); 110 shm_init_ns(&init_ipc_ns);
111 return 0; 111 return 0;
112 } 112 }
113 113
114 pure_initcall(ipc_ns_init); 114 pure_initcall(ipc_ns_init);
115 115
116 void __init shm_init (void) 116 void __init shm_init (void)
117 { 117 {
118 ipc_init_proc_interface("sysvipc/shm", 118 ipc_init_proc_interface("sysvipc/shm",
119 #if BITS_PER_LONG <= 32 119 #if BITS_PER_LONG <= 32
120 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", 120 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n",
121 #else 121 #else
122 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", 122 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n",
123 #endif 123 #endif
124 IPC_SHM_IDS, sysvipc_shm_proc_show); 124 IPC_SHM_IDS, sysvipc_shm_proc_show);
125 } 125 }
126 126
127 /* 127 /*
128 * shm_lock_(check_) routines are called in the paths where the rw_mutex 128 * shm_lock_(check_) routines are called in the paths where the rw_mutex
129 * is not necessarily held. 129 * is not necessarily held.
130 */ 130 */
131 static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) 131 static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
132 { 132 {
133 struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); 133 struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
134 134
135 if (IS_ERR(ipcp)) 135 if (IS_ERR(ipcp))
136 return (struct shmid_kernel *)ipcp; 136 return (struct shmid_kernel *)ipcp;
137 137
138 return container_of(ipcp, struct shmid_kernel, shm_perm); 138 return container_of(ipcp, struct shmid_kernel, shm_perm);
139 } 139 }
140 140
141 static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) 141 static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
142 { 142 {
143 rcu_read_lock(); 143 rcu_read_lock();
144 spin_lock(&ipcp->shm_perm.lock); 144 spin_lock(&ipcp->shm_perm.lock);
145 } 145 }
146 146
147 static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, 147 static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
148 int id) 148 int id)
149 { 149 {
150 struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); 150 struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id);
151 151
152 if (IS_ERR(ipcp)) 152 if (IS_ERR(ipcp))
153 return (struct shmid_kernel *)ipcp; 153 return (struct shmid_kernel *)ipcp;
154 154
155 return container_of(ipcp, struct shmid_kernel, shm_perm); 155 return container_of(ipcp, struct shmid_kernel, shm_perm);
156 } 156 }
157 157
158 static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) 158 static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
159 { 159 {
160 ipc_rmid(&shm_ids(ns), &s->shm_perm); 160 ipc_rmid(&shm_ids(ns), &s->shm_perm);
161 } 161 }
162 162
163 163
164 /* This is called by fork, once for every shm attach. */ 164 /* This is called by fork, once for every shm attach. */
165 static void shm_open(struct vm_area_struct *vma) 165 static void shm_open(struct vm_area_struct *vma)
166 { 166 {
167 struct file *file = vma->vm_file; 167 struct file *file = vma->vm_file;
168 struct shm_file_data *sfd = shm_file_data(file); 168 struct shm_file_data *sfd = shm_file_data(file);
169 struct shmid_kernel *shp; 169 struct shmid_kernel *shp;
170 170
171 shp = shm_lock(sfd->ns, sfd->id); 171 shp = shm_lock(sfd->ns, sfd->id);
172 BUG_ON(IS_ERR(shp)); 172 BUG_ON(IS_ERR(shp));
173 shp->shm_atim = get_seconds(); 173 shp->shm_atim = get_seconds();
174 shp->shm_lprid = task_tgid_vnr(current); 174 shp->shm_lprid = task_tgid_vnr(current);
175 shp->shm_nattch++; 175 shp->shm_nattch++;
176 shm_unlock(shp); 176 shm_unlock(shp);
177 } 177 }
178 178
179 /* 179 /*
180 * shm_destroy - free the struct shmid_kernel 180 * shm_destroy - free the struct shmid_kernel
181 * 181 *
182 * @ns: namespace 182 * @ns: namespace
183 * @shp: struct to free 183 * @shp: struct to free
184 * 184 *
185 * It has to be called with shp and shm_ids.rw_mutex (writer) locked, 185 * It has to be called with shp and shm_ids.rw_mutex (writer) locked,
186 * but returns with shp unlocked and freed. 186 * but returns with shp unlocked and freed.
187 */ 187 */
188 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) 188 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
189 { 189 {
190 ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; 190 ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
191 shm_rmid(ns, shp); 191 shm_rmid(ns, shp);
192 shm_unlock(shp); 192 shm_unlock(shp);
193 if (!is_file_hugepages(shp->shm_file)) 193 if (!is_file_hugepages(shp->shm_file))
194 shmem_lock(shp->shm_file, 0, shp->mlock_user); 194 shmem_lock(shp->shm_file, 0, shp->mlock_user);
195 else if (shp->mlock_user) 195 else if (shp->mlock_user)
196 user_shm_unlock(file_inode(shp->shm_file)->i_size, 196 user_shm_unlock(file_inode(shp->shm_file)->i_size,
197 shp->mlock_user); 197 shp->mlock_user);
198 fput (shp->shm_file); 198 fput (shp->shm_file);
199 security_shm_free(shp); 199 security_shm_free(shp);
200 ipc_rcu_putref(shp); 200 ipc_rcu_putref(shp);
201 } 201 }
202 202
203 /* 203 /*
204 * shm_may_destroy - identifies whether shm segment should be destroyed now 204 * shm_may_destroy - identifies whether shm segment should be destroyed now
205 * 205 *
206 * Returns true if and only if there are no active users of the segment and 206 * Returns true if and only if there are no active users of the segment and
207 * one of the following is true: 207 * one of the following is true:
208 * 208 *
209 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp 209 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
210 * 210 *
211 * 2) sysctl kernel.shm_rmid_forced is set to 1. 211 * 2) sysctl kernel.shm_rmid_forced is set to 1.
212 */ 212 */
213 static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) 213 static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
214 { 214 {
215 return (shp->shm_nattch == 0) && 215 return (shp->shm_nattch == 0) &&
216 (ns->shm_rmid_forced || 216 (ns->shm_rmid_forced ||
217 (shp->shm_perm.mode & SHM_DEST)); 217 (shp->shm_perm.mode & SHM_DEST));
218 } 218 }
219 219
220 /* 220 /*
221 * remove the attach descriptor vma. 221 * remove the attach descriptor vma.
222 * free memory for segment if it is marked destroyed. 222 * free memory for segment if it is marked destroyed.
223 * The descriptor has already been removed from the current->mm->mmap list 223 * The descriptor has already been removed from the current->mm->mmap list
224 * and will later be kfree()d. 224 * and will later be kfree()d.
225 */ 225 */
226 static void shm_close(struct vm_area_struct *vma) 226 static void shm_close(struct vm_area_struct *vma)
227 { 227 {
228 struct file * file = vma->vm_file; 228 struct file * file = vma->vm_file;
229 struct shm_file_data *sfd = shm_file_data(file); 229 struct shm_file_data *sfd = shm_file_data(file);
230 struct shmid_kernel *shp; 230 struct shmid_kernel *shp;
231 struct ipc_namespace *ns = sfd->ns; 231 struct ipc_namespace *ns = sfd->ns;
232 232
233 down_write(&shm_ids(ns).rw_mutex); 233 down_write(&shm_ids(ns).rw_mutex);
234 /* remove from the list of attaches of the shm segment */ 234 /* remove from the list of attaches of the shm segment */
235 shp = shm_lock(ns, sfd->id); 235 shp = shm_lock(ns, sfd->id);
236 BUG_ON(IS_ERR(shp)); 236 BUG_ON(IS_ERR(shp));
237 shp->shm_lprid = task_tgid_vnr(current); 237 shp->shm_lprid = task_tgid_vnr(current);
238 shp->shm_dtim = get_seconds(); 238 shp->shm_dtim = get_seconds();
239 shp->shm_nattch--; 239 shp->shm_nattch--;
240 if (shm_may_destroy(ns, shp)) 240 if (shm_may_destroy(ns, shp))
241 shm_destroy(ns, shp); 241 shm_destroy(ns, shp);
242 else 242 else
243 shm_unlock(shp); 243 shm_unlock(shp);
244 up_write(&shm_ids(ns).rw_mutex); 244 up_write(&shm_ids(ns).rw_mutex);
245 } 245 }
246 246
247 /* Called with ns->shm_ids(ns).rw_mutex locked */ 247 /* Called with ns->shm_ids(ns).rw_mutex locked */
248 static int shm_try_destroy_current(int id, void *p, void *data) 248 static int shm_try_destroy_current(int id, void *p, void *data)
249 { 249 {
250 struct ipc_namespace *ns = data; 250 struct ipc_namespace *ns = data;
251 struct kern_ipc_perm *ipcp = p; 251 struct kern_ipc_perm *ipcp = p;
252 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); 252 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
253 253
254 if (shp->shm_creator != current) 254 if (shp->shm_creator != current)
255 return 0; 255 return 0;
256 256
257 /* 257 /*
258 * Mark it as orphaned to destroy the segment when 258 * Mark it as orphaned to destroy the segment when
259 * kernel.shm_rmid_forced is changed. 259 * kernel.shm_rmid_forced is changed.
260 * It is noop if the following shm_may_destroy() returns true. 260 * It is noop if the following shm_may_destroy() returns true.
261 */ 261 */
262 shp->shm_creator = NULL; 262 shp->shm_creator = NULL;
263 263
264 /* 264 /*
265 * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID 265 * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID
266 * is not set, it shouldn't be deleted here. 266 * is not set, it shouldn't be deleted here.
267 */ 267 */
268 if (!ns->shm_rmid_forced) 268 if (!ns->shm_rmid_forced)
269 return 0; 269 return 0;
270 270
271 if (shm_may_destroy(ns, shp)) { 271 if (shm_may_destroy(ns, shp)) {
272 shm_lock_by_ptr(shp); 272 shm_lock_by_ptr(shp);
273 shm_destroy(ns, shp); 273 shm_destroy(ns, shp);
274 } 274 }
275 return 0; 275 return 0;
276 } 276 }
277 277
278 /* Called with ns->shm_ids(ns).rw_mutex locked */ 278 /* Called with ns->shm_ids(ns).rw_mutex locked */
279 static int shm_try_destroy_orphaned(int id, void *p, void *data) 279 static int shm_try_destroy_orphaned(int id, void *p, void *data)
280 { 280 {
281 struct ipc_namespace *ns = data; 281 struct ipc_namespace *ns = data;
282 struct kern_ipc_perm *ipcp = p; 282 struct kern_ipc_perm *ipcp = p;
283 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); 283 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
284 284
285 /* 285 /*
286 * We want to destroy segments without users and with already 286 * We want to destroy segments without users and with already
287 * exit'ed originating process. 287 * exit'ed originating process.
288 * 288 *
289 * As shp->* are changed under rw_mutex, it's safe to skip shp locking. 289 * As shp->* are changed under rw_mutex, it's safe to skip shp locking.
290 */ 290 */
291 if (shp->shm_creator != NULL) 291 if (shp->shm_creator != NULL)
292 return 0; 292 return 0;
293 293
294 if (shm_may_destroy(ns, shp)) { 294 if (shm_may_destroy(ns, shp)) {
295 shm_lock_by_ptr(shp); 295 shm_lock_by_ptr(shp);
296 shm_destroy(ns, shp); 296 shm_destroy(ns, shp);
297 } 297 }
298 return 0; 298 return 0;
299 } 299 }
300 300
301 void shm_destroy_orphaned(struct ipc_namespace *ns) 301 void shm_destroy_orphaned(struct ipc_namespace *ns)
302 { 302 {
303 down_write(&shm_ids(ns).rw_mutex); 303 down_write(&shm_ids(ns).rw_mutex);
304 if (shm_ids(ns).in_use) 304 if (shm_ids(ns).in_use)
305 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); 305 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
306 up_write(&shm_ids(ns).rw_mutex); 306 up_write(&shm_ids(ns).rw_mutex);
307 } 307 }
308 308
309 309
310 void exit_shm(struct task_struct *task) 310 void exit_shm(struct task_struct *task)
311 { 311 {
312 struct ipc_namespace *ns = task->nsproxy->ipc_ns; 312 struct ipc_namespace *ns = task->nsproxy->ipc_ns;
313 313
314 if (shm_ids(ns).in_use == 0) 314 if (shm_ids(ns).in_use == 0)
315 return; 315 return;
316 316
317 /* Destroy all already created segments, but not mapped yet */ 317 /* Destroy all already created segments, but not mapped yet */
318 down_write(&shm_ids(ns).rw_mutex); 318 down_write(&shm_ids(ns).rw_mutex);
319 if (shm_ids(ns).in_use) 319 if (shm_ids(ns).in_use)
320 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); 320 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
321 up_write(&shm_ids(ns).rw_mutex); 321 up_write(&shm_ids(ns).rw_mutex);
322 } 322 }
323 323
324 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 324 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
325 { 325 {
326 struct file *file = vma->vm_file; 326 struct file *file = vma->vm_file;
327 struct shm_file_data *sfd = shm_file_data(file); 327 struct shm_file_data *sfd = shm_file_data(file);
328 328
329 return sfd->vm_ops->fault(vma, vmf); 329 return sfd->vm_ops->fault(vma, vmf);
330 } 330 }
331 331
332 #ifdef CONFIG_NUMA 332 #ifdef CONFIG_NUMA
333 static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 333 static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
334 { 334 {
335 struct file *file = vma->vm_file; 335 struct file *file = vma->vm_file;
336 struct shm_file_data *sfd = shm_file_data(file); 336 struct shm_file_data *sfd = shm_file_data(file);
337 int err = 0; 337 int err = 0;
338 if (sfd->vm_ops->set_policy) 338 if (sfd->vm_ops->set_policy)
339 err = sfd->vm_ops->set_policy(vma, new); 339 err = sfd->vm_ops->set_policy(vma, new);
340 return err; 340 return err;
341 } 341 }
342 342
343 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, 343 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
344 unsigned long addr) 344 unsigned long addr)
345 { 345 {
346 struct file *file = vma->vm_file; 346 struct file *file = vma->vm_file;
347 struct shm_file_data *sfd = shm_file_data(file); 347 struct shm_file_data *sfd = shm_file_data(file);
348 struct mempolicy *pol = NULL; 348 struct mempolicy *pol = NULL;
349 349
350 if (sfd->vm_ops->get_policy) 350 if (sfd->vm_ops->get_policy)
351 pol = sfd->vm_ops->get_policy(vma, addr); 351 pol = sfd->vm_ops->get_policy(vma, addr);
352 else if (vma->vm_policy) 352 else if (vma->vm_policy)
353 pol = vma->vm_policy; 353 pol = vma->vm_policy;
354 354
355 return pol; 355 return pol;
356 } 356 }
357 #endif 357 #endif
358 358
359 static int shm_mmap(struct file * file, struct vm_area_struct * vma) 359 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
360 { 360 {
361 struct shm_file_data *sfd = shm_file_data(file); 361 struct shm_file_data *sfd = shm_file_data(file);
362 int ret; 362 int ret;
363 363
364 ret = sfd->file->f_op->mmap(sfd->file, vma); 364 ret = sfd->file->f_op->mmap(sfd->file, vma);
365 if (ret != 0) 365 if (ret != 0)
366 return ret; 366 return ret;
367 sfd->vm_ops = vma->vm_ops; 367 sfd->vm_ops = vma->vm_ops;
368 #ifdef CONFIG_MMU 368 #ifdef CONFIG_MMU
369 BUG_ON(!sfd->vm_ops->fault); 369 BUG_ON(!sfd->vm_ops->fault);
370 #endif 370 #endif
371 vma->vm_ops = &shm_vm_ops; 371 vma->vm_ops = &shm_vm_ops;
372 shm_open(vma); 372 shm_open(vma);
373 373
374 return ret; 374 return ret;
375 } 375 }
376 376
377 static int shm_release(struct inode *ino, struct file *file) 377 static int shm_release(struct inode *ino, struct file *file)
378 { 378 {
379 struct shm_file_data *sfd = shm_file_data(file); 379 struct shm_file_data *sfd = shm_file_data(file);
380 380
381 put_ipc_ns(sfd->ns); 381 put_ipc_ns(sfd->ns);
382 shm_file_data(file) = NULL; 382 shm_file_data(file) = NULL;
383 kfree(sfd); 383 kfree(sfd);
384 return 0; 384 return 0;
385 } 385 }
386 386
387 static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) 387 static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
388 { 388 {
389 struct shm_file_data *sfd = shm_file_data(file); 389 struct shm_file_data *sfd = shm_file_data(file);
390 390
391 if (!sfd->file->f_op->fsync) 391 if (!sfd->file->f_op->fsync)
392 return -EINVAL; 392 return -EINVAL;
393 return sfd->file->f_op->fsync(sfd->file, start, end, datasync); 393 return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
394 } 394 }
395 395
396 static long shm_fallocate(struct file *file, int mode, loff_t offset, 396 static long shm_fallocate(struct file *file, int mode, loff_t offset,
397 loff_t len) 397 loff_t len)
398 { 398 {
399 struct shm_file_data *sfd = shm_file_data(file); 399 struct shm_file_data *sfd = shm_file_data(file);
400 400
401 if (!sfd->file->f_op->fallocate) 401 if (!sfd->file->f_op->fallocate)
402 return -EOPNOTSUPP; 402 return -EOPNOTSUPP;
403 return sfd->file->f_op->fallocate(file, mode, offset, len); 403 return sfd->file->f_op->fallocate(file, mode, offset, len);
404 } 404 }
405 405
406 static unsigned long shm_get_unmapped_area(struct file *file, 406 static unsigned long shm_get_unmapped_area(struct file *file,
407 unsigned long addr, unsigned long len, unsigned long pgoff, 407 unsigned long addr, unsigned long len, unsigned long pgoff,
408 unsigned long flags) 408 unsigned long flags)
409 { 409 {
410 struct shm_file_data *sfd = shm_file_data(file); 410 struct shm_file_data *sfd = shm_file_data(file);
411 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, 411 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
412 pgoff, flags); 412 pgoff, flags);
413 } 413 }
414 414
415 static const struct file_operations shm_file_operations = { 415 static const struct file_operations shm_file_operations = {
416 .mmap = shm_mmap, 416 .mmap = shm_mmap,
417 .fsync = shm_fsync, 417 .fsync = shm_fsync,
418 .release = shm_release, 418 .release = shm_release,
419 #ifndef CONFIG_MMU 419 #ifndef CONFIG_MMU
420 .get_unmapped_area = shm_get_unmapped_area, 420 .get_unmapped_area = shm_get_unmapped_area,
421 #endif 421 #endif
422 .llseek = noop_llseek, 422 .llseek = noop_llseek,
423 .fallocate = shm_fallocate, 423 .fallocate = shm_fallocate,
424 }; 424 };
425 425
426 static const struct file_operations shm_file_operations_huge = { 426 static const struct file_operations shm_file_operations_huge = {
427 .mmap = shm_mmap, 427 .mmap = shm_mmap,
428 .fsync = shm_fsync, 428 .fsync = shm_fsync,
429 .release = shm_release, 429 .release = shm_release,
430 .get_unmapped_area = shm_get_unmapped_area, 430 .get_unmapped_area = shm_get_unmapped_area,
431 .llseek = noop_llseek, 431 .llseek = noop_llseek,
432 .fallocate = shm_fallocate, 432 .fallocate = shm_fallocate,
433 }; 433 };
434 434
435 int is_file_shm_hugepages(struct file *file) 435 int is_file_shm_hugepages(struct file *file)
436 { 436 {
437 return file->f_op == &shm_file_operations_huge; 437 return file->f_op == &shm_file_operations_huge;
438 } 438 }
439 439
440 static const struct vm_operations_struct shm_vm_ops = { 440 static const struct vm_operations_struct shm_vm_ops = {
441 .open = shm_open, /* callback for a new vm-area open */ 441 .open = shm_open, /* callback for a new vm-area open */
442 .close = shm_close, /* callback for when the vm-area is released */ 442 .close = shm_close, /* callback for when the vm-area is released */
443 .fault = shm_fault, 443 .fault = shm_fault,
444 #if defined(CONFIG_NUMA) 444 #if defined(CONFIG_NUMA)
445 .set_policy = shm_set_policy, 445 .set_policy = shm_set_policy,
446 .get_policy = shm_get_policy, 446 .get_policy = shm_get_policy,
447 #endif 447 #endif
448 }; 448 };
449 449
450 /** 450 /**
451 * newseg - Create a new shared memory segment 451 * newseg - Create a new shared memory segment
452 * @ns: namespace 452 * @ns: namespace
453 * @params: ptr to the structure that contains key, size and shmflg 453 * @params: ptr to the structure that contains key, size and shmflg
454 * 454 *
455 * Called with shm_ids.rw_mutex held as a writer. 455 * Called with shm_ids.rw_mutex held as a writer.
456 */ 456 */
457 457
458 static int newseg(struct ipc_namespace *ns, struct ipc_params *params) 458 static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
459 { 459 {
460 key_t key = params->key; 460 key_t key = params->key;
461 int shmflg = params->flg; 461 int shmflg = params->flg;
462 size_t size = params->u.size; 462 size_t size = params->u.size;
463 int error; 463 int error;
464 struct shmid_kernel *shp; 464 struct shmid_kernel *shp;
465 size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 465 size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
466 struct file * file; 466 struct file * file;
467 char name[13]; 467 char name[13];
468 int id; 468 int id;
469 vm_flags_t acctflag = 0; 469 vm_flags_t acctflag = 0;
470 470
471 if (size < SHMMIN || size > ns->shm_ctlmax) 471 if (size < SHMMIN || size > ns->shm_ctlmax)
472 return -EINVAL; 472 return -EINVAL;
473 473
474 if (ns->shm_tot + numpages > ns->shm_ctlall) 474 if (ns->shm_tot + numpages > ns->shm_ctlall)
475 return -ENOSPC; 475 return -ENOSPC;
476 476
477 shp = ipc_rcu_alloc(sizeof(*shp)); 477 shp = ipc_rcu_alloc(sizeof(*shp));
478 if (!shp) 478 if (!shp)
479 return -ENOMEM; 479 return -ENOMEM;
480 480
481 shp->shm_perm.key = key; 481 shp->shm_perm.key = key;
482 shp->shm_perm.mode = (shmflg & S_IRWXUGO); 482 shp->shm_perm.mode = (shmflg & S_IRWXUGO);
483 shp->mlock_user = NULL; 483 shp->mlock_user = NULL;
484 484
485 shp->shm_perm.security = NULL; 485 shp->shm_perm.security = NULL;
486 error = security_shm_alloc(shp); 486 error = security_shm_alloc(shp);
487 if (error) { 487 if (error) {
488 ipc_rcu_putref(shp); 488 ipc_rcu_putref(shp);
489 return error; 489 return error;
490 } 490 }
491 491
492 sprintf (name, "SYSV%08x", key); 492 sprintf (name, "SYSV%08x", key);
493 if (shmflg & SHM_HUGETLB) { 493 if (shmflg & SHM_HUGETLB) {
494 struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT)
495 & SHM_HUGE_MASK);
496 size_t hugesize = ALIGN(size, huge_page_size(hs));
497
494 /* hugetlb_file_setup applies strict accounting */ 498 /* hugetlb_file_setup applies strict accounting */
495 if (shmflg & SHM_NORESERVE) 499 if (shmflg & SHM_NORESERVE)
496 acctflag = VM_NORESERVE; 500 acctflag = VM_NORESERVE;
497 file = hugetlb_file_setup(name, 0, size, acctflag, 501 file = hugetlb_file_setup(name, hugesize, acctflag,
498 &shp->mlock_user, HUGETLB_SHMFS_INODE, 502 &shp->mlock_user, HUGETLB_SHMFS_INODE,
499 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); 503 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
500 } else { 504 } else {
501 /* 505 /*
502 * Do not allow no accounting for OVERCOMMIT_NEVER, even 506 * Do not allow no accounting for OVERCOMMIT_NEVER, even
503 * if it's asked for. 507 * if it's asked for.
504 */ 508 */
505 if ((shmflg & SHM_NORESERVE) && 509 if ((shmflg & SHM_NORESERVE) &&
506 sysctl_overcommit_memory != OVERCOMMIT_NEVER) 510 sysctl_overcommit_memory != OVERCOMMIT_NEVER)
507 acctflag = VM_NORESERVE; 511 acctflag = VM_NORESERVE;
508 file = shmem_file_setup(name, size, acctflag); 512 file = shmem_file_setup(name, size, acctflag);
509 } 513 }
510 error = PTR_ERR(file); 514 error = PTR_ERR(file);
511 if (IS_ERR(file)) 515 if (IS_ERR(file))
512 goto no_file; 516 goto no_file;
513 517
514 id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); 518 id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
515 if (id < 0) { 519 if (id < 0) {
516 error = id; 520 error = id;
517 goto no_id; 521 goto no_id;
518 } 522 }
519 523
520 shp->shm_cprid = task_tgid_vnr(current); 524 shp->shm_cprid = task_tgid_vnr(current);
521 shp->shm_lprid = 0; 525 shp->shm_lprid = 0;
522 shp->shm_atim = shp->shm_dtim = 0; 526 shp->shm_atim = shp->shm_dtim = 0;
523 shp->shm_ctim = get_seconds(); 527 shp->shm_ctim = get_seconds();
524 shp->shm_segsz = size; 528 shp->shm_segsz = size;
525 shp->shm_nattch = 0; 529 shp->shm_nattch = 0;
526 shp->shm_file = file; 530 shp->shm_file = file;
527 shp->shm_creator = current; 531 shp->shm_creator = current;
528 /* 532 /*
529 * shmid gets reported as "inode#" in /proc/pid/maps. 533 * shmid gets reported as "inode#" in /proc/pid/maps.
530 * proc-ps tools use this. Changing this will break them. 534 * proc-ps tools use this. Changing this will break them.
531 */ 535 */
532 file_inode(file)->i_ino = shp->shm_perm.id; 536 file_inode(file)->i_ino = shp->shm_perm.id;
533 537
534 ns->shm_tot += numpages; 538 ns->shm_tot += numpages;
535 error = shp->shm_perm.id; 539 error = shp->shm_perm.id;
536 shm_unlock(shp); 540 shm_unlock(shp);
537 return error; 541 return error;
538 542
539 no_id: 543 no_id:
540 if (is_file_hugepages(file) && shp->mlock_user) 544 if (is_file_hugepages(file) && shp->mlock_user)
541 user_shm_unlock(size, shp->mlock_user); 545 user_shm_unlock(size, shp->mlock_user);
542 fput(file); 546 fput(file);
543 no_file: 547 no_file:
544 security_shm_free(shp); 548 security_shm_free(shp);
545 ipc_rcu_putref(shp); 549 ipc_rcu_putref(shp);
546 return error; 550 return error;
547 } 551 }
548 552
549 /* 553 /*
550 * Called with shm_ids.rw_mutex and ipcp locked. 554 * Called with shm_ids.rw_mutex and ipcp locked.
551 */ 555 */
552 static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) 556 static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
553 { 557 {
554 struct shmid_kernel *shp; 558 struct shmid_kernel *shp;
555 559
556 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 560 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
557 return security_shm_associate(shp, shmflg); 561 return security_shm_associate(shp, shmflg);
558 } 562 }
559 563
560 /* 564 /*
561 * Called with shm_ids.rw_mutex and ipcp locked. 565 * Called with shm_ids.rw_mutex and ipcp locked.
562 */ 566 */
563 static inline int shm_more_checks(struct kern_ipc_perm *ipcp, 567 static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
564 struct ipc_params *params) 568 struct ipc_params *params)
565 { 569 {
566 struct shmid_kernel *shp; 570 struct shmid_kernel *shp;
567 571
568 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 572 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
569 if (shp->shm_segsz < params->u.size) 573 if (shp->shm_segsz < params->u.size)
570 return -EINVAL; 574 return -EINVAL;
571 575
572 return 0; 576 return 0;
573 } 577 }
574 578
575 SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) 579 SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
576 { 580 {
577 struct ipc_namespace *ns; 581 struct ipc_namespace *ns;
578 struct ipc_ops shm_ops; 582 struct ipc_ops shm_ops;
579 struct ipc_params shm_params; 583 struct ipc_params shm_params;
580 584
581 ns = current->nsproxy->ipc_ns; 585 ns = current->nsproxy->ipc_ns;
582 586
583 shm_ops.getnew = newseg; 587 shm_ops.getnew = newseg;
584 shm_ops.associate = shm_security; 588 shm_ops.associate = shm_security;
585 shm_ops.more_checks = shm_more_checks; 589 shm_ops.more_checks = shm_more_checks;
586 590
587 shm_params.key = key; 591 shm_params.key = key;
588 shm_params.flg = shmflg; 592 shm_params.flg = shmflg;
589 shm_params.u.size = size; 593 shm_params.u.size = size;
590 594
591 return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); 595 return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
592 } 596 }
593 597
594 static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) 598 static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
595 { 599 {
596 switch(version) { 600 switch(version) {
597 case IPC_64: 601 case IPC_64:
598 return copy_to_user(buf, in, sizeof(*in)); 602 return copy_to_user(buf, in, sizeof(*in));
599 case IPC_OLD: 603 case IPC_OLD:
600 { 604 {
601 struct shmid_ds out; 605 struct shmid_ds out;
602 606
603 memset(&out, 0, sizeof(out)); 607 memset(&out, 0, sizeof(out));
604 ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); 608 ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
605 out.shm_segsz = in->shm_segsz; 609 out.shm_segsz = in->shm_segsz;
606 out.shm_atime = in->shm_atime; 610 out.shm_atime = in->shm_atime;
607 out.shm_dtime = in->shm_dtime; 611 out.shm_dtime = in->shm_dtime;
608 out.shm_ctime = in->shm_ctime; 612 out.shm_ctime = in->shm_ctime;
609 out.shm_cpid = in->shm_cpid; 613 out.shm_cpid = in->shm_cpid;
610 out.shm_lpid = in->shm_lpid; 614 out.shm_lpid = in->shm_lpid;
611 out.shm_nattch = in->shm_nattch; 615 out.shm_nattch = in->shm_nattch;
612 616
613 return copy_to_user(buf, &out, sizeof(out)); 617 return copy_to_user(buf, &out, sizeof(out));
614 } 618 }
615 default: 619 default:
616 return -EINVAL; 620 return -EINVAL;
617 } 621 }
618 } 622 }
619 623
620 static inline unsigned long 624 static inline unsigned long
621 copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) 625 copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
622 { 626 {
623 switch(version) { 627 switch(version) {
624 case IPC_64: 628 case IPC_64:
625 if (copy_from_user(out, buf, sizeof(*out))) 629 if (copy_from_user(out, buf, sizeof(*out)))
626 return -EFAULT; 630 return -EFAULT;
627 return 0; 631 return 0;
628 case IPC_OLD: 632 case IPC_OLD:
629 { 633 {
630 struct shmid_ds tbuf_old; 634 struct shmid_ds tbuf_old;
631 635
632 if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) 636 if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
633 return -EFAULT; 637 return -EFAULT;
634 638
635 out->shm_perm.uid = tbuf_old.shm_perm.uid; 639 out->shm_perm.uid = tbuf_old.shm_perm.uid;
636 out->shm_perm.gid = tbuf_old.shm_perm.gid; 640 out->shm_perm.gid = tbuf_old.shm_perm.gid;
637 out->shm_perm.mode = tbuf_old.shm_perm.mode; 641 out->shm_perm.mode = tbuf_old.shm_perm.mode;
638 642
639 return 0; 643 return 0;
640 } 644 }
641 default: 645 default:
642 return -EINVAL; 646 return -EINVAL;
643 } 647 }
644 } 648 }
645 649
646 static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) 650 static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
647 { 651 {
648 switch(version) { 652 switch(version) {
649 case IPC_64: 653 case IPC_64:
650 return copy_to_user(buf, in, sizeof(*in)); 654 return copy_to_user(buf, in, sizeof(*in));
651 case IPC_OLD: 655 case IPC_OLD:
652 { 656 {
653 struct shminfo out; 657 struct shminfo out;
654 658
655 if(in->shmmax > INT_MAX) 659 if(in->shmmax > INT_MAX)
656 out.shmmax = INT_MAX; 660 out.shmmax = INT_MAX;
657 else 661 else
658 out.shmmax = (int)in->shmmax; 662 out.shmmax = (int)in->shmmax;
659 663
660 out.shmmin = in->shmmin; 664 out.shmmin = in->shmmin;
661 out.shmmni = in->shmmni; 665 out.shmmni = in->shmmni;
662 out.shmseg = in->shmseg; 666 out.shmseg = in->shmseg;
663 out.shmall = in->shmall; 667 out.shmall = in->shmall;
664 668
665 return copy_to_user(buf, &out, sizeof(out)); 669 return copy_to_user(buf, &out, sizeof(out));
666 } 670 }
667 default: 671 default:
668 return -EINVAL; 672 return -EINVAL;
669 } 673 }
670 } 674 }
671 675
672 /* 676 /*
673 * Calculate and add used RSS and swap pages of a shm. 677 * Calculate and add used RSS and swap pages of a shm.
674 * Called with shm_ids.rw_mutex held as a reader 678 * Called with shm_ids.rw_mutex held as a reader
675 */ 679 */
676 static void shm_add_rss_swap(struct shmid_kernel *shp, 680 static void shm_add_rss_swap(struct shmid_kernel *shp,
677 unsigned long *rss_add, unsigned long *swp_add) 681 unsigned long *rss_add, unsigned long *swp_add)
678 { 682 {
679 struct inode *inode; 683 struct inode *inode;
680 684
681 inode = file_inode(shp->shm_file); 685 inode = file_inode(shp->shm_file);
682 686
683 if (is_file_hugepages(shp->shm_file)) { 687 if (is_file_hugepages(shp->shm_file)) {
684 struct address_space *mapping = inode->i_mapping; 688 struct address_space *mapping = inode->i_mapping;
685 struct hstate *h = hstate_file(shp->shm_file); 689 struct hstate *h = hstate_file(shp->shm_file);
686 *rss_add += pages_per_huge_page(h) * mapping->nrpages; 690 *rss_add += pages_per_huge_page(h) * mapping->nrpages;
687 } else { 691 } else {
688 #ifdef CONFIG_SHMEM 692 #ifdef CONFIG_SHMEM
689 struct shmem_inode_info *info = SHMEM_I(inode); 693 struct shmem_inode_info *info = SHMEM_I(inode);
690 spin_lock(&info->lock); 694 spin_lock(&info->lock);
691 *rss_add += inode->i_mapping->nrpages; 695 *rss_add += inode->i_mapping->nrpages;
692 *swp_add += info->swapped; 696 *swp_add += info->swapped;
693 spin_unlock(&info->lock); 697 spin_unlock(&info->lock);
694 #else 698 #else
695 *rss_add += inode->i_mapping->nrpages; 699 *rss_add += inode->i_mapping->nrpages;
696 #endif 700 #endif
697 } 701 }
698 } 702 }
699 703
700 /* 704 /*
701 * Called with shm_ids.rw_mutex held as a reader 705 * Called with shm_ids.rw_mutex held as a reader
702 */ 706 */
703 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, 707 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
704 unsigned long *swp) 708 unsigned long *swp)
705 { 709 {
706 int next_id; 710 int next_id;
707 int total, in_use; 711 int total, in_use;
708 712
709 *rss = 0; 713 *rss = 0;
710 *swp = 0; 714 *swp = 0;
711 715
712 in_use = shm_ids(ns).in_use; 716 in_use = shm_ids(ns).in_use;
713 717
714 for (total = 0, next_id = 0; total < in_use; next_id++) { 718 for (total = 0, next_id = 0; total < in_use; next_id++) {
715 struct kern_ipc_perm *ipc; 719 struct kern_ipc_perm *ipc;
716 struct shmid_kernel *shp; 720 struct shmid_kernel *shp;
717 721
718 ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); 722 ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
719 if (ipc == NULL) 723 if (ipc == NULL)
720 continue; 724 continue;
721 shp = container_of(ipc, struct shmid_kernel, shm_perm); 725 shp = container_of(ipc, struct shmid_kernel, shm_perm);
722 726
723 shm_add_rss_swap(shp, rss, swp); 727 shm_add_rss_swap(shp, rss, swp);
724 728
725 total++; 729 total++;
726 } 730 }
727 } 731 }
728 732
729 /* 733 /*
730 * This function handles some shmctl commands which require the rw_mutex 734 * This function handles some shmctl commands which require the rw_mutex
731 * to be held in write mode. 735 * to be held in write mode.
732 * NOTE: no locks must be held, the rw_mutex is taken inside this function. 736 * NOTE: no locks must be held, the rw_mutex is taken inside this function.
733 */ 737 */
734 static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, 738 static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
735 struct shmid_ds __user *buf, int version) 739 struct shmid_ds __user *buf, int version)
736 { 740 {
737 struct kern_ipc_perm *ipcp; 741 struct kern_ipc_perm *ipcp;
738 struct shmid64_ds shmid64; 742 struct shmid64_ds shmid64;
739 struct shmid_kernel *shp; 743 struct shmid_kernel *shp;
740 int err; 744 int err;
741 745
742 if (cmd == IPC_SET) { 746 if (cmd == IPC_SET) {
743 if (copy_shmid_from_user(&shmid64, buf, version)) 747 if (copy_shmid_from_user(&shmid64, buf, version))
744 return -EFAULT; 748 return -EFAULT;
745 } 749 }
746 750
747 ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, 751 ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd,
748 &shmid64.shm_perm, 0); 752 &shmid64.shm_perm, 0);
749 if (IS_ERR(ipcp)) 753 if (IS_ERR(ipcp))
750 return PTR_ERR(ipcp); 754 return PTR_ERR(ipcp);
751 755
752 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 756 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
753 757
754 err = security_shm_shmctl(shp, cmd); 758 err = security_shm_shmctl(shp, cmd);
755 if (err) 759 if (err)
756 goto out_unlock; 760 goto out_unlock;
757 switch (cmd) { 761 switch (cmd) {
758 case IPC_RMID: 762 case IPC_RMID:
759 do_shm_rmid(ns, ipcp); 763 do_shm_rmid(ns, ipcp);
760 goto out_up; 764 goto out_up;
761 case IPC_SET: 765 case IPC_SET:
762 err = ipc_update_perm(&shmid64.shm_perm, ipcp); 766 err = ipc_update_perm(&shmid64.shm_perm, ipcp);
763 if (err) 767 if (err)
764 goto out_unlock; 768 goto out_unlock;
765 shp->shm_ctim = get_seconds(); 769 shp->shm_ctim = get_seconds();
766 break; 770 break;
767 default: 771 default:
768 err = -EINVAL; 772 err = -EINVAL;
769 } 773 }
770 out_unlock: 774 out_unlock:
771 shm_unlock(shp); 775 shm_unlock(shp);
772 out_up: 776 out_up:
773 up_write(&shm_ids(ns).rw_mutex); 777 up_write(&shm_ids(ns).rw_mutex);
774 return err; 778 return err;
775 } 779 }
776 780
777 SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) 781 SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
778 { 782 {
779 struct shmid_kernel *shp; 783 struct shmid_kernel *shp;
780 int err, version; 784 int err, version;
781 struct ipc_namespace *ns; 785 struct ipc_namespace *ns;
782 786
783 if (cmd < 0 || shmid < 0) { 787 if (cmd < 0 || shmid < 0) {
784 err = -EINVAL; 788 err = -EINVAL;
785 goto out; 789 goto out;
786 } 790 }
787 791
788 version = ipc_parse_version(&cmd); 792 version = ipc_parse_version(&cmd);
789 ns = current->nsproxy->ipc_ns; 793 ns = current->nsproxy->ipc_ns;
790 794
791 switch (cmd) { /* replace with proc interface ? */ 795 switch (cmd) { /* replace with proc interface ? */
792 case IPC_INFO: 796 case IPC_INFO:
793 { 797 {
794 struct shminfo64 shminfo; 798 struct shminfo64 shminfo;
795 799
796 err = security_shm_shmctl(NULL, cmd); 800 err = security_shm_shmctl(NULL, cmd);
797 if (err) 801 if (err)
798 return err; 802 return err;
799 803
800 memset(&shminfo, 0, sizeof(shminfo)); 804 memset(&shminfo, 0, sizeof(shminfo));
801 shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; 805 shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
802 shminfo.shmmax = ns->shm_ctlmax; 806 shminfo.shmmax = ns->shm_ctlmax;
803 shminfo.shmall = ns->shm_ctlall; 807 shminfo.shmall = ns->shm_ctlall;
804 808
805 shminfo.shmmin = SHMMIN; 809 shminfo.shmmin = SHMMIN;
806 if(copy_shminfo_to_user (buf, &shminfo, version)) 810 if(copy_shminfo_to_user (buf, &shminfo, version))
807 return -EFAULT; 811 return -EFAULT;
808 812
809 down_read(&shm_ids(ns).rw_mutex); 813 down_read(&shm_ids(ns).rw_mutex);
810 err = ipc_get_maxid(&shm_ids(ns)); 814 err = ipc_get_maxid(&shm_ids(ns));
811 up_read(&shm_ids(ns).rw_mutex); 815 up_read(&shm_ids(ns).rw_mutex);
812 816
813 if(err<0) 817 if(err<0)
814 err = 0; 818 err = 0;
815 goto out; 819 goto out;
816 } 820 }
817 case SHM_INFO: 821 case SHM_INFO:
818 { 822 {
819 struct shm_info shm_info; 823 struct shm_info shm_info;
820 824
821 err = security_shm_shmctl(NULL, cmd); 825 err = security_shm_shmctl(NULL, cmd);
822 if (err) 826 if (err)
823 return err; 827 return err;
824 828
825 memset(&shm_info, 0, sizeof(shm_info)); 829 memset(&shm_info, 0, sizeof(shm_info));
826 down_read(&shm_ids(ns).rw_mutex); 830 down_read(&shm_ids(ns).rw_mutex);
827 shm_info.used_ids = shm_ids(ns).in_use; 831 shm_info.used_ids = shm_ids(ns).in_use;
828 shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); 832 shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp);
829 shm_info.shm_tot = ns->shm_tot; 833 shm_info.shm_tot = ns->shm_tot;
830 shm_info.swap_attempts = 0; 834 shm_info.swap_attempts = 0;
831 shm_info.swap_successes = 0; 835 shm_info.swap_successes = 0;
832 err = ipc_get_maxid(&shm_ids(ns)); 836 err = ipc_get_maxid(&shm_ids(ns));
833 up_read(&shm_ids(ns).rw_mutex); 837 up_read(&shm_ids(ns).rw_mutex);
834 if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { 838 if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
835 err = -EFAULT; 839 err = -EFAULT;
836 goto out; 840 goto out;
837 } 841 }
838 842
839 err = err < 0 ? 0 : err; 843 err = err < 0 ? 0 : err;
840 goto out; 844 goto out;
841 } 845 }
842 case SHM_STAT: 846 case SHM_STAT:
843 case IPC_STAT: 847 case IPC_STAT:
844 { 848 {
845 struct shmid64_ds tbuf; 849 struct shmid64_ds tbuf;
846 int result; 850 int result;
847 851
848 if (cmd == SHM_STAT) { 852 if (cmd == SHM_STAT) {
849 shp = shm_lock(ns, shmid); 853 shp = shm_lock(ns, shmid);
850 if (IS_ERR(shp)) { 854 if (IS_ERR(shp)) {
851 err = PTR_ERR(shp); 855 err = PTR_ERR(shp);
852 goto out; 856 goto out;
853 } 857 }
854 result = shp->shm_perm.id; 858 result = shp->shm_perm.id;
855 } else { 859 } else {
856 shp = shm_lock_check(ns, shmid); 860 shp = shm_lock_check(ns, shmid);
857 if (IS_ERR(shp)) { 861 if (IS_ERR(shp)) {
858 err = PTR_ERR(shp); 862 err = PTR_ERR(shp);
859 goto out; 863 goto out;
860 } 864 }
861 result = 0; 865 result = 0;
862 } 866 }
863 err = -EACCES; 867 err = -EACCES;
864 if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) 868 if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
865 goto out_unlock; 869 goto out_unlock;
866 err = security_shm_shmctl(shp, cmd); 870 err = security_shm_shmctl(shp, cmd);
867 if (err) 871 if (err)
868 goto out_unlock; 872 goto out_unlock;
869 memset(&tbuf, 0, sizeof(tbuf)); 873 memset(&tbuf, 0, sizeof(tbuf));
870 kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); 874 kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
871 tbuf.shm_segsz = shp->shm_segsz; 875 tbuf.shm_segsz = shp->shm_segsz;
872 tbuf.shm_atime = shp->shm_atim; 876 tbuf.shm_atime = shp->shm_atim;
873 tbuf.shm_dtime = shp->shm_dtim; 877 tbuf.shm_dtime = shp->shm_dtim;
874 tbuf.shm_ctime = shp->shm_ctim; 878 tbuf.shm_ctime = shp->shm_ctim;
875 tbuf.shm_cpid = shp->shm_cprid; 879 tbuf.shm_cpid = shp->shm_cprid;
876 tbuf.shm_lpid = shp->shm_lprid; 880 tbuf.shm_lpid = shp->shm_lprid;
877 tbuf.shm_nattch = shp->shm_nattch; 881 tbuf.shm_nattch = shp->shm_nattch;
878 shm_unlock(shp); 882 shm_unlock(shp);
879 if(copy_shmid_to_user (buf, &tbuf, version)) 883 if(copy_shmid_to_user (buf, &tbuf, version))
880 err = -EFAULT; 884 err = -EFAULT;
881 else 885 else
882 err = result; 886 err = result;
883 goto out; 887 goto out;
884 } 888 }
885 case SHM_LOCK: 889 case SHM_LOCK:
886 case SHM_UNLOCK: 890 case SHM_UNLOCK:
887 { 891 {
888 struct file *shm_file; 892 struct file *shm_file;
889 893
890 shp = shm_lock_check(ns, shmid); 894 shp = shm_lock_check(ns, shmid);
891 if (IS_ERR(shp)) { 895 if (IS_ERR(shp)) {
892 err = PTR_ERR(shp); 896 err = PTR_ERR(shp);
893 goto out; 897 goto out;
894 } 898 }
895 899
896 audit_ipc_obj(&(shp->shm_perm)); 900 audit_ipc_obj(&(shp->shm_perm));
897 901
898 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { 902 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
899 kuid_t euid = current_euid(); 903 kuid_t euid = current_euid();
900 err = -EPERM; 904 err = -EPERM;
901 if (!uid_eq(euid, shp->shm_perm.uid) && 905 if (!uid_eq(euid, shp->shm_perm.uid) &&
902 !uid_eq(euid, shp->shm_perm.cuid)) 906 !uid_eq(euid, shp->shm_perm.cuid))
903 goto out_unlock; 907 goto out_unlock;
904 if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) 908 if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK))
905 goto out_unlock; 909 goto out_unlock;
906 } 910 }
907 911
908 err = security_shm_shmctl(shp, cmd); 912 err = security_shm_shmctl(shp, cmd);
909 if (err) 913 if (err)
910 goto out_unlock; 914 goto out_unlock;
911 915
912 shm_file = shp->shm_file; 916 shm_file = shp->shm_file;
913 if (is_file_hugepages(shm_file)) 917 if (is_file_hugepages(shm_file))
914 goto out_unlock; 918 goto out_unlock;
915 919
916 if (cmd == SHM_LOCK) { 920 if (cmd == SHM_LOCK) {
917 struct user_struct *user = current_user(); 921 struct user_struct *user = current_user();
918 err = shmem_lock(shm_file, 1, user); 922 err = shmem_lock(shm_file, 1, user);
919 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { 923 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
920 shp->shm_perm.mode |= SHM_LOCKED; 924 shp->shm_perm.mode |= SHM_LOCKED;
921 shp->mlock_user = user; 925 shp->mlock_user = user;
922 } 926 }
923 goto out_unlock; 927 goto out_unlock;
924 } 928 }
925 929
926 /* SHM_UNLOCK */ 930 /* SHM_UNLOCK */
927 if (!(shp->shm_perm.mode & SHM_LOCKED)) 931 if (!(shp->shm_perm.mode & SHM_LOCKED))
928 goto out_unlock; 932 goto out_unlock;
929 shmem_lock(shm_file, 0, shp->mlock_user); 933 shmem_lock(shm_file, 0, shp->mlock_user);
930 shp->shm_perm.mode &= ~SHM_LOCKED; 934 shp->shm_perm.mode &= ~SHM_LOCKED;
931 shp->mlock_user = NULL; 935 shp->mlock_user = NULL;
932 get_file(shm_file); 936 get_file(shm_file);
933 shm_unlock(shp); 937 shm_unlock(shp);
934 shmem_unlock_mapping(shm_file->f_mapping); 938 shmem_unlock_mapping(shm_file->f_mapping);
935 fput(shm_file); 939 fput(shm_file);
936 goto out; 940 goto out;
937 } 941 }
938 case IPC_RMID: 942 case IPC_RMID:
939 case IPC_SET: 943 case IPC_SET:
940 err = shmctl_down(ns, shmid, cmd, buf, version); 944 err = shmctl_down(ns, shmid, cmd, buf, version);
941 return err; 945 return err;
942 default: 946 default:
943 return -EINVAL; 947 return -EINVAL;
944 } 948 }
945 949
946 out_unlock: 950 out_unlock:
947 shm_unlock(shp); 951 shm_unlock(shp);
948 out: 952 out:
949 return err; 953 return err;
950 } 954 }
951 955
952 /* 956 /*
953 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists. 957 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
954 * 958 *
955 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The 959 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
956 * "raddr" thing points to kernel space, and there has to be a wrapper around 960 * "raddr" thing points to kernel space, and there has to be a wrapper around
957 * this. 961 * this.
958 */ 962 */
959 long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, 963 long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
960 unsigned long shmlba) 964 unsigned long shmlba)
961 { 965 {
962 struct shmid_kernel *shp; 966 struct shmid_kernel *shp;
963 unsigned long addr; 967 unsigned long addr;
964 unsigned long size; 968 unsigned long size;
965 struct file * file; 969 struct file * file;
966 int err; 970 int err;
967 unsigned long flags; 971 unsigned long flags;
968 unsigned long prot; 972 unsigned long prot;
969 int acc_mode; 973 int acc_mode;
970 struct ipc_namespace *ns; 974 struct ipc_namespace *ns;
971 struct shm_file_data *sfd; 975 struct shm_file_data *sfd;
972 struct path path; 976 struct path path;
973 fmode_t f_mode; 977 fmode_t f_mode;
974 unsigned long populate = 0; 978 unsigned long populate = 0;
975 979
976 err = -EINVAL; 980 err = -EINVAL;
977 if (shmid < 0) 981 if (shmid < 0)
978 goto out; 982 goto out;
979 else if ((addr = (ulong)shmaddr)) { 983 else if ((addr = (ulong)shmaddr)) {
980 if (addr & (shmlba - 1)) { 984 if (addr & (shmlba - 1)) {
981 if (shmflg & SHM_RND) 985 if (shmflg & SHM_RND)
982 addr &= ~(shmlba - 1); /* round down */ 986 addr &= ~(shmlba - 1); /* round down */
983 else 987 else
984 #ifndef __ARCH_FORCE_SHMLBA 988 #ifndef __ARCH_FORCE_SHMLBA
985 if (addr & ~PAGE_MASK) 989 if (addr & ~PAGE_MASK)
986 #endif 990 #endif
987 goto out; 991 goto out;
988 } 992 }
989 flags = MAP_SHARED | MAP_FIXED; 993 flags = MAP_SHARED | MAP_FIXED;
990 } else { 994 } else {
991 if ((shmflg & SHM_REMAP)) 995 if ((shmflg & SHM_REMAP))
992 goto out; 996 goto out;
993 997
994 flags = MAP_SHARED; 998 flags = MAP_SHARED;
995 } 999 }
996 1000
997 if (shmflg & SHM_RDONLY) { 1001 if (shmflg & SHM_RDONLY) {
998 prot = PROT_READ; 1002 prot = PROT_READ;
999 acc_mode = S_IRUGO; 1003 acc_mode = S_IRUGO;
1000 f_mode = FMODE_READ; 1004 f_mode = FMODE_READ;
1001 } else { 1005 } else {
1002 prot = PROT_READ | PROT_WRITE; 1006 prot = PROT_READ | PROT_WRITE;
1003 acc_mode = S_IRUGO | S_IWUGO; 1007 acc_mode = S_IRUGO | S_IWUGO;
1004 f_mode = FMODE_READ | FMODE_WRITE; 1008 f_mode = FMODE_READ | FMODE_WRITE;
1005 } 1009 }
1006 if (shmflg & SHM_EXEC) { 1010 if (shmflg & SHM_EXEC) {
1007 prot |= PROT_EXEC; 1011 prot |= PROT_EXEC;
1008 acc_mode |= S_IXUGO; 1012 acc_mode |= S_IXUGO;
1009 } 1013 }
1010 1014
1011 /* 1015 /*
1012 * We cannot rely on the fs check since SYSV IPC does have an 1016 * We cannot rely on the fs check since SYSV IPC does have an
1013 * additional creator id... 1017 * additional creator id...
1014 */ 1018 */
1015 ns = current->nsproxy->ipc_ns; 1019 ns = current->nsproxy->ipc_ns;
1016 shp = shm_lock_check(ns, shmid); 1020 shp = shm_lock_check(ns, shmid);
1017 if (IS_ERR(shp)) { 1021 if (IS_ERR(shp)) {
1018 err = PTR_ERR(shp); 1022 err = PTR_ERR(shp);
1019 goto out; 1023 goto out;
1020 } 1024 }
1021 1025
1022 err = -EACCES; 1026 err = -EACCES;
1023 if (ipcperms(ns, &shp->shm_perm, acc_mode)) 1027 if (ipcperms(ns, &shp->shm_perm, acc_mode))
1024 goto out_unlock; 1028 goto out_unlock;
1025 1029
1026 err = security_shm_shmat(shp, shmaddr, shmflg); 1030 err = security_shm_shmat(shp, shmaddr, shmflg);
1027 if (err) 1031 if (err)
1028 goto out_unlock; 1032 goto out_unlock;
1029 1033
1030 path = shp->shm_file->f_path; 1034 path = shp->shm_file->f_path;
1031 path_get(&path); 1035 path_get(&path);
1032 shp->shm_nattch++; 1036 shp->shm_nattch++;
1033 size = i_size_read(path.dentry->d_inode); 1037 size = i_size_read(path.dentry->d_inode);
1034 shm_unlock(shp); 1038 shm_unlock(shp);
1035 1039
1036 err = -ENOMEM; 1040 err = -ENOMEM;
1037 sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); 1041 sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
1038 if (!sfd) 1042 if (!sfd)
1039 goto out_put_dentry; 1043 goto out_put_dentry;
1040 1044
1041 file = alloc_file(&path, f_mode, 1045 file = alloc_file(&path, f_mode,
1042 is_file_hugepages(shp->shm_file) ? 1046 is_file_hugepages(shp->shm_file) ?
1043 &shm_file_operations_huge : 1047 &shm_file_operations_huge :
1044 &shm_file_operations); 1048 &shm_file_operations);
1045 err = PTR_ERR(file); 1049 err = PTR_ERR(file);
1046 if (IS_ERR(file)) 1050 if (IS_ERR(file))
1047 goto out_free; 1051 goto out_free;
1048 1052
1049 file->private_data = sfd; 1053 file->private_data = sfd;
1050 file->f_mapping = shp->shm_file->f_mapping; 1054 file->f_mapping = shp->shm_file->f_mapping;
1051 sfd->id = shp->shm_perm.id; 1055 sfd->id = shp->shm_perm.id;
1052 sfd->ns = get_ipc_ns(ns); 1056 sfd->ns = get_ipc_ns(ns);
1053 sfd->file = shp->shm_file; 1057 sfd->file = shp->shm_file;
1054 sfd->vm_ops = NULL; 1058 sfd->vm_ops = NULL;
1055 1059
1056 err = security_mmap_file(file, prot, flags); 1060 err = security_mmap_file(file, prot, flags);
1057 if (err) 1061 if (err)
1058 goto out_fput; 1062 goto out_fput;
1059 1063
1060 down_write(&current->mm->mmap_sem); 1064 down_write(&current->mm->mmap_sem);
1061 if (addr && !(shmflg & SHM_REMAP)) { 1065 if (addr && !(shmflg & SHM_REMAP)) {
1062 err = -EINVAL; 1066 err = -EINVAL;
1063 if (find_vma_intersection(current->mm, addr, addr + size)) 1067 if (find_vma_intersection(current->mm, addr, addr + size))
1064 goto invalid; 1068 goto invalid;
1065 /* 1069 /*
1066 * If shm segment goes below stack, make sure there is some 1070 * If shm segment goes below stack, make sure there is some
1067 * space left for the stack to grow (at least 4 pages). 1071 * space left for the stack to grow (at least 4 pages).
1068 */ 1072 */
1069 if (addr < current->mm->start_stack && 1073 if (addr < current->mm->start_stack &&
1070 addr > current->mm->start_stack - size - PAGE_SIZE * 5) 1074 addr > current->mm->start_stack - size - PAGE_SIZE * 5)
1071 goto invalid; 1075 goto invalid;
1072 } 1076 }
1073 1077
1074 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); 1078 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
1075 *raddr = addr; 1079 *raddr = addr;
1076 err = 0; 1080 err = 0;
1077 if (IS_ERR_VALUE(addr)) 1081 if (IS_ERR_VALUE(addr))
1078 err = (long)addr; 1082 err = (long)addr;
1079 invalid: 1083 invalid:
1080 up_write(&current->mm->mmap_sem); 1084 up_write(&current->mm->mmap_sem);
1081 if (populate) 1085 if (populate)
1082 mm_populate(addr, populate); 1086 mm_populate(addr, populate);
1083 1087
1084 out_fput: 1088 out_fput:
1085 fput(file); 1089 fput(file);
1086 1090
1087 out_nattch: 1091 out_nattch:
1088 down_write(&shm_ids(ns).rw_mutex); 1092 down_write(&shm_ids(ns).rw_mutex);
1089 shp = shm_lock(ns, shmid); 1093 shp = shm_lock(ns, shmid);
1090 BUG_ON(IS_ERR(shp)); 1094 BUG_ON(IS_ERR(shp));
1091 shp->shm_nattch--; 1095 shp->shm_nattch--;
1092 if (shm_may_destroy(ns, shp)) 1096 if (shm_may_destroy(ns, shp))
1093 shm_destroy(ns, shp); 1097 shm_destroy(ns, shp);
1094 else 1098 else
1095 shm_unlock(shp); 1099 shm_unlock(shp);
1096 up_write(&shm_ids(ns).rw_mutex); 1100 up_write(&shm_ids(ns).rw_mutex);
1097 1101
1098 out: 1102 out:
1099 return err; 1103 return err;
1100 1104
1101 out_unlock: 1105 out_unlock:
1102 shm_unlock(shp); 1106 shm_unlock(shp);
1103 goto out; 1107 goto out;
1104 1108
1105 out_free: 1109 out_free:
1106 kfree(sfd); 1110 kfree(sfd);
1107 out_put_dentry: 1111 out_put_dentry:
1108 path_put(&path); 1112 path_put(&path);
1109 goto out_nattch; 1113 goto out_nattch;
1110 } 1114 }
1111 1115
1112 SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) 1116 SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
1113 { 1117 {
1114 unsigned long ret; 1118 unsigned long ret;
1115 long err; 1119 long err;
1116 1120
1117 err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); 1121 err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
1118 if (err) 1122 if (err)
1119 return err; 1123 return err;
1120 force_successful_syscall_return(); 1124 force_successful_syscall_return();
1121 return (long)ret; 1125 return (long)ret;
1122 } 1126 }
1123 1127
1124 /* 1128 /*
1125 * detach and kill segment if marked destroyed. 1129 * detach and kill segment if marked destroyed.
1126 * The work is done in shm_close. 1130 * The work is done in shm_close.
1127 */ 1131 */
1128 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) 1132 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1129 { 1133 {
1130 struct mm_struct *mm = current->mm; 1134 struct mm_struct *mm = current->mm;
1131 struct vm_area_struct *vma; 1135 struct vm_area_struct *vma;
1132 unsigned long addr = (unsigned long)shmaddr; 1136 unsigned long addr = (unsigned long)shmaddr;
1133 int retval = -EINVAL; 1137 int retval = -EINVAL;
1134 #ifdef CONFIG_MMU 1138 #ifdef CONFIG_MMU
1135 loff_t size = 0; 1139 loff_t size = 0;
1136 struct vm_area_struct *next; 1140 struct vm_area_struct *next;
1137 #endif 1141 #endif
1138 1142
1139 if (addr & ~PAGE_MASK) 1143 if (addr & ~PAGE_MASK)
1140 return retval; 1144 return retval;
1141 1145
1142 down_write(&mm->mmap_sem); 1146 down_write(&mm->mmap_sem);
1143 1147
1144 /* 1148 /*
1145 * This function tries to be smart and unmap shm segments that 1149 * This function tries to be smart and unmap shm segments that
1146 * were modified by partial mlock or munmap calls: 1150 * were modified by partial mlock or munmap calls:
1147 * - It first determines the size of the shm segment that should be 1151 * - It first determines the size of the shm segment that should be
1148 * unmapped: It searches for a vma that is backed by shm and that 1152 * unmapped: It searches for a vma that is backed by shm and that
1149 * started at address shmaddr. It records it's size and then unmaps 1153 * started at address shmaddr. It records it's size and then unmaps
1150 * it. 1154 * it.
1151 * - Then it unmaps all shm vmas that started at shmaddr and that 1155 * - Then it unmaps all shm vmas that started at shmaddr and that
1152 * are within the initially determined size. 1156 * are within the initially determined size.
1153 * Errors from do_munmap are ignored: the function only fails if 1157 * Errors from do_munmap are ignored: the function only fails if
1154 * it's called with invalid parameters or if it's called to unmap 1158 * it's called with invalid parameters or if it's called to unmap
1155 * a part of a vma. Both calls in this function are for full vmas, 1159 * a part of a vma. Both calls in this function are for full vmas,
1156 * the parameters are directly copied from the vma itself and always 1160 * the parameters are directly copied from the vma itself and always
1157 * valid - therefore do_munmap cannot fail. (famous last words?) 1161 * valid - therefore do_munmap cannot fail. (famous last words?)
1158 */ 1162 */
1159 /* 1163 /*
1160 * If it had been mremap()'d, the starting address would not 1164 * If it had been mremap()'d, the starting address would not
1161 * match the usual checks anyway. So assume all vma's are 1165 * match the usual checks anyway. So assume all vma's are
1162 * above the starting address given. 1166 * above the starting address given.
1163 */ 1167 */
1164 vma = find_vma(mm, addr); 1168 vma = find_vma(mm, addr);
1165 1169
1166 #ifdef CONFIG_MMU 1170 #ifdef CONFIG_MMU
1167 while (vma) { 1171 while (vma) {
1168 next = vma->vm_next; 1172 next = vma->vm_next;
1169 1173
1170 /* 1174 /*
1171 * Check if the starting address would match, i.e. it's 1175 * Check if the starting address would match, i.e. it's
1172 * a fragment created by mprotect() and/or munmap(), or 1176 * a fragment created by mprotect() and/or munmap(), or
1173 * otherwise it starts at this address with no hassles. 1177 * otherwise it starts at this address with no hassles.
1174 */ 1178 */
1175 if ((vma->vm_ops == &shm_vm_ops) && 1179 if ((vma->vm_ops == &shm_vm_ops) &&
1176 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { 1180 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
1177 1181
1178 1182
1179 size = file_inode(vma->vm_file)->i_size; 1183 size = file_inode(vma->vm_file)->i_size;
1180 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1184 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1181 /* 1185 /*
1182 * We discovered the size of the shm segment, so 1186 * We discovered the size of the shm segment, so
1183 * break out of here and fall through to the next 1187 * break out of here and fall through to the next
1184 * loop that uses the size information to stop 1188 * loop that uses the size information to stop
1185 * searching for matching vma's. 1189 * searching for matching vma's.
1186 */ 1190 */
1187 retval = 0; 1191 retval = 0;
1188 vma = next; 1192 vma = next;
1189 break; 1193 break;
1190 } 1194 }
1191 vma = next; 1195 vma = next;
1192 } 1196 }
1193 1197
1194 /* 1198 /*
1195 * We need look no further than the maximum address a fragment 1199 * We need look no further than the maximum address a fragment
1196 * could possibly have landed at. Also cast things to loff_t to 1200 * could possibly have landed at. Also cast things to loff_t to
1197 * prevent overflows and make comparisons vs. equal-width types. 1201 * prevent overflows and make comparisons vs. equal-width types.
1198 */ 1202 */
1199 size = PAGE_ALIGN(size); 1203 size = PAGE_ALIGN(size);
1200 while (vma && (loff_t)(vma->vm_end - addr) <= size) { 1204 while (vma && (loff_t)(vma->vm_end - addr) <= size) {
1201 next = vma->vm_next; 1205 next = vma->vm_next;
1202 1206
1203 /* finding a matching vma now does not alter retval */ 1207 /* finding a matching vma now does not alter retval */
1204 if ((vma->vm_ops == &shm_vm_ops) && 1208 if ((vma->vm_ops == &shm_vm_ops) &&
1205 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) 1209 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)
1206 1210
1207 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1211 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1208 vma = next; 1212 vma = next;
1209 } 1213 }
1210 1214
1211 #else /* CONFIG_MMU */ 1215 #else /* CONFIG_MMU */
1212 /* under NOMMU conditions, the exact address to be destroyed must be 1216 /* under NOMMU conditions, the exact address to be destroyed must be
1213 * given */ 1217 * given */
1214 retval = -EINVAL; 1218 retval = -EINVAL;
1215 if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { 1219 if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1216 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1220 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1217 retval = 0; 1221 retval = 0;
1218 } 1222 }
1219 1223
1220 #endif 1224 #endif
1221 1225
1222 up_write(&mm->mmap_sem); 1226 up_write(&mm->mmap_sem);
1223 return retval; 1227 return retval;
1224 } 1228 }
1225 1229
1226 #ifdef CONFIG_PROC_FS 1230 #ifdef CONFIG_PROC_FS
1227 static int sysvipc_shm_proc_show(struct seq_file *s, void *it) 1231 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
1228 { 1232 {
1229 struct user_namespace *user_ns = seq_user_ns(s); 1233 struct user_namespace *user_ns = seq_user_ns(s);
1230 struct shmid_kernel *shp = it; 1234 struct shmid_kernel *shp = it;
1231 unsigned long rss = 0, swp = 0; 1235 unsigned long rss = 0, swp = 0;
1232 1236
1233 shm_add_rss_swap(shp, &rss, &swp); 1237 shm_add_rss_swap(shp, &rss, &swp);
1234 1238
1235 #if BITS_PER_LONG <= 32 1239 #if BITS_PER_LONG <= 32
1236 #define SIZE_SPEC "%10lu" 1240 #define SIZE_SPEC "%10lu"
1237 #else 1241 #else
1238 #define SIZE_SPEC "%21lu" 1242 #define SIZE_SPEC "%21lu"
1239 #endif 1243 #endif
1240 1244
1241 return seq_printf(s, 1245 return seq_printf(s,
1242 "%10d %10d %4o " SIZE_SPEC " %5u %5u " 1246 "%10d %10d %4o " SIZE_SPEC " %5u %5u "
1243 "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " 1247 "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
1244 SIZE_SPEC " " SIZE_SPEC "\n", 1248 SIZE_SPEC " " SIZE_SPEC "\n",
1245 shp->shm_perm.key, 1249 shp->shm_perm.key,
1246 shp->shm_perm.id, 1250 shp->shm_perm.id,
1247 shp->shm_perm.mode, 1251 shp->shm_perm.mode,
1248 shp->shm_segsz, 1252 shp->shm_segsz,
1249 shp->shm_cprid, 1253 shp->shm_cprid,
1250 shp->shm_lprid, 1254 shp->shm_lprid,
1251 shp->shm_nattch, 1255 shp->shm_nattch,
1252 from_kuid_munged(user_ns, shp->shm_perm.uid), 1256 from_kuid_munged(user_ns, shp->shm_perm.uid),
1253 from_kgid_munged(user_ns, shp->shm_perm.gid), 1257 from_kgid_munged(user_ns, shp->shm_perm.gid),
1254 from_kuid_munged(user_ns, shp->shm_perm.cuid), 1258 from_kuid_munged(user_ns, shp->shm_perm.cuid),
1255 from_kgid_munged(user_ns, shp->shm_perm.cgid), 1259 from_kgid_munged(user_ns, shp->shm_perm.cgid),
1256 shp->shm_atim, 1260 shp->shm_atim,
1257 shp->shm_dtim, 1261 shp->shm_dtim,
1258 shp->shm_ctim, 1262 shp->shm_ctim,
1259 rss * PAGE_SIZE, 1263 rss * PAGE_SIZE,
1260 swp * PAGE_SIZE); 1264 swp * PAGE_SIZE);
1261 } 1265 }
1262 #endif 1266 #endif
1263 1267
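[Editorial aside, not part of the commit] The ipc/shm.c hunk above moves the hugepage round-up into the caller, so newseg() now computes hugesize = ALIGN(size, huge_page_size(hs)) itself. The fragment below is a minimal userspace sketch of that arithmetic only: it assumes a 2MB hugepage hstate and reimplements the kernel's ALIGN() macro for power-of-two alignments, so everything here is illustrative rather than kernel API.

#include <stdio.h>
#include <stddef.h>

/* Same round-up the kernel's ALIGN() performs for power-of-two alignments. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	size_t hugepage = 2UL * 1024 * 1024;	/* assumed 2MB hugepage size */
	size_t requests[] = {
		1UL * 1024 * 1024,		/* 1MB  -> 2MB */
		2UL * 1024 * 1024,		/* 2MB  -> 2MB */
		5UL * 1024 * 1024 + 123,	/* ~5MB -> 6MB */
	};

	for (size_t i = 0; i < sizeof(requests) / sizeof(requests[0]); i++)
		printf("size %zu -> hugesize %zu\n",
		       requests[i], ALIGN(requests[i], hugepage));
	return 0;
}

With the round-up done on the caller side, an "almost aligned" request no longer has to fail with -EINVAL; the mm/mmap.c listing that follows carries the corresponding caller-side change for the mmap() path described in the changelog.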
mm/mmap.c
1 /* 1 /*
2 * mm/mmap.c 2 * mm/mmap.c
3 * 3 *
4 * Written by obz. 4 * Written by obz.
5 * 5 *
6 * Address space accounting code <alan@lxorguk.ukuu.org.uk> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9 #include <linux/kernel.h> 9 #include <linux/kernel.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/backing-dev.h> 11 #include <linux/backing-dev.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/shm.h> 13 #include <linux/shm.h>
14 #include <linux/mman.h> 14 #include <linux/mman.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/swap.h> 16 #include <linux/swap.h>
17 #include <linux/syscalls.h> 17 #include <linux/syscalls.h>
18 #include <linux/capability.h> 18 #include <linux/capability.h>
19 #include <linux/init.h> 19 #include <linux/init.h>
20 #include <linux/file.h> 20 #include <linux/file.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/personality.h> 22 #include <linux/personality.h>
23 #include <linux/security.h> 23 #include <linux/security.h>
24 #include <linux/hugetlb.h> 24 #include <linux/hugetlb.h>
25 #include <linux/profile.h> 25 #include <linux/profile.h>
26 #include <linux/export.h> 26 #include <linux/export.h>
27 #include <linux/mount.h> 27 #include <linux/mount.h>
28 #include <linux/mempolicy.h> 28 #include <linux/mempolicy.h>
29 #include <linux/rmap.h> 29 #include <linux/rmap.h>
30 #include <linux/mmu_notifier.h> 30 #include <linux/mmu_notifier.h>
31 #include <linux/perf_event.h> 31 #include <linux/perf_event.h>
32 #include <linux/audit.h> 32 #include <linux/audit.h>
33 #include <linux/khugepaged.h> 33 #include <linux/khugepaged.h>
34 #include <linux/uprobes.h> 34 #include <linux/uprobes.h>
35 #include <linux/rbtree_augmented.h> 35 #include <linux/rbtree_augmented.h>
36 #include <linux/sched/sysctl.h> 36 #include <linux/sched/sysctl.h>
37 #include <linux/notifier.h> 37 #include <linux/notifier.h>
38 #include <linux/memory.h> 38 #include <linux/memory.h>
39 39
40 #include <asm/uaccess.h> 40 #include <asm/uaccess.h>
41 #include <asm/cacheflush.h> 41 #include <asm/cacheflush.h>
42 #include <asm/tlb.h> 42 #include <asm/tlb.h>
43 #include <asm/mmu_context.h> 43 #include <asm/mmu_context.h>
44 44
45 #include "internal.h" 45 #include "internal.h"
46 46
47 #ifndef arch_mmap_check 47 #ifndef arch_mmap_check
48 #define arch_mmap_check(addr, len, flags) (0) 48 #define arch_mmap_check(addr, len, flags) (0)
49 #endif 49 #endif
50 50
51 #ifndef arch_rebalance_pgtables 51 #ifndef arch_rebalance_pgtables
52 #define arch_rebalance_pgtables(addr, len) (addr) 52 #define arch_rebalance_pgtables(addr, len) (addr)
53 #endif 53 #endif
54 54
55 static void unmap_region(struct mm_struct *mm, 55 static void unmap_region(struct mm_struct *mm,
56 struct vm_area_struct *vma, struct vm_area_struct *prev, 56 struct vm_area_struct *vma, struct vm_area_struct *prev,
57 unsigned long start, unsigned long end); 57 unsigned long start, unsigned long end);
58 58
59 /* description of effects of mapping type and prot in current implementation. 59 /* description of effects of mapping type and prot in current implementation.
60 * this is due to the limited x86 page protection hardware. The expected 60 * this is due to the limited x86 page protection hardware. The expected
61 * behavior is in parens: 61 * behavior is in parens:
62 * 62 *
63 * map_type prot 63 * map_type prot
64 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC 64 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
65 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 65 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
66 * w: (no) no w: (no) no w: (yes) yes w: (no) no 66 * w: (no) no w: (no) no w: (yes) yes w: (no) no
67 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 67 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
68 * 68 *
69 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 69 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
70 * w: (no) no w: (no) no w: (copy) copy w: (no) no 70 * w: (no) no w: (no) no w: (copy) copy w: (no) no
71 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 71 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
72 * 72 *
73 */ 73 */
74 pgprot_t protection_map[16] = { 74 pgprot_t protection_map[16] = {
75 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, 75 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
76 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 76 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
77 }; 77 };
78 78
79 pgprot_t vm_get_page_prot(unsigned long vm_flags) 79 pgprot_t vm_get_page_prot(unsigned long vm_flags)
80 { 80 {
81 return __pgprot(pgprot_val(protection_map[vm_flags & 81 return __pgprot(pgprot_val(protection_map[vm_flags &
82 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | 82 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
83 pgprot_val(arch_vm_get_page_prot(vm_flags))); 83 pgprot_val(arch_vm_get_page_prot(vm_flags)));
84 } 84 }
85 EXPORT_SYMBOL(vm_get_page_prot); 85 EXPORT_SYMBOL(vm_get_page_prot);
86 86
87 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 90 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 91 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
92 /* 92 /*
93 * Make sure vm_committed_as is in one cacheline and not cacheline shared with 93 * Make sure vm_committed_as is in one cacheline and not cacheline shared with
94 * other variables. It can be updated by several CPUs frequently. 94 * other variables. It can be updated by several CPUs frequently.
95 */ 95 */
96 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 96 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
97 97
98 /* 98 /*
99 * The global memory commitment made in the system can be a metric 99 * The global memory commitment made in the system can be a metric
100 * that can be used to drive ballooning decisions when Linux is hosted 100 * that can be used to drive ballooning decisions when Linux is hosted
101 * as a guest. On Hyper-V, the host implements a policy engine for dynamically 101 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
102 * balancing memory across competing virtual machines that are hosted. 102 * balancing memory across competing virtual machines that are hosted.
103 * Several metrics drive this policy engine including the guest reported 103 * Several metrics drive this policy engine including the guest reported
104 * memory commitment. 104 * memory commitment.
105 */ 105 */
106 unsigned long vm_memory_committed(void) 106 unsigned long vm_memory_committed(void)
107 { 107 {
108 return percpu_counter_read_positive(&vm_committed_as); 108 return percpu_counter_read_positive(&vm_committed_as);
109 } 109 }
110 EXPORT_SYMBOL_GPL(vm_memory_committed); 110 EXPORT_SYMBOL_GPL(vm_memory_committed);
111 111
112 /* 112 /*
113 * Check that a process has enough memory to allocate a new virtual 113 * Check that a process has enough memory to allocate a new virtual
114 * mapping. 0 means there is enough memory for the allocation to 114 * mapping. 0 means there is enough memory for the allocation to
115 * succeed and -ENOMEM implies there is not. 115 * succeed and -ENOMEM implies there is not.
116 * 116 *
117 * We currently support three overcommit policies, which are set via the 117 * We currently support three overcommit policies, which are set via the
118 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 118 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
119 * 119 *
120 * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 120 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
121 * Additional code 2002 Jul 20 by Robert Love. 121 * Additional code 2002 Jul 20 by Robert Love.
122 * 122 *
123 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 123 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
124 * 124 *
125 * Note this is a helper function intended to be used by LSMs which 125 * Note this is a helper function intended to be used by LSMs which
126 * wish to use this logic. 126 * wish to use this logic.
127 */ 127 */
128 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 128 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
129 { 129 {
130 unsigned long free, allowed, reserve; 130 unsigned long free, allowed, reserve;
131 131
132 vm_acct_memory(pages); 132 vm_acct_memory(pages);
133 133
134 /* 134 /*
135 * Sometimes we want to use more memory than we have 135 * Sometimes we want to use more memory than we have
136 */ 136 */
137 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 137 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
138 return 0; 138 return 0;
139 139
140 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 140 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
141 free = global_page_state(NR_FREE_PAGES); 141 free = global_page_state(NR_FREE_PAGES);
142 free += global_page_state(NR_FILE_PAGES); 142 free += global_page_state(NR_FILE_PAGES);
143 143
144 /* 144 /*
145 * shmem pages shouldn't be counted as free in this 145 * shmem pages shouldn't be counted as free in this
146 * case, they can't be purged, only swapped out, and 146 * case, they can't be purged, only swapped out, and
147 * that won't affect the overall amount of available 147 * that won't affect the overall amount of available
148 * memory in the system. 148 * memory in the system.
149 */ 149 */
150 free -= global_page_state(NR_SHMEM); 150 free -= global_page_state(NR_SHMEM);
151 151
152 free += get_nr_swap_pages(); 152 free += get_nr_swap_pages();
153 153
154 /* 154 /*
155 * Any slabs which are created with the 155 * Any slabs which are created with the
156 * SLAB_RECLAIM_ACCOUNT flag claim to have contents 156 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
157 * which are reclaimable, under pressure. The dentry 157 * which are reclaimable, under pressure. The dentry
158 * cache and most inode caches should fall into this 158 * cache and most inode caches should fall into this
159 */ 159 */
160 free += global_page_state(NR_SLAB_RECLAIMABLE); 160 free += global_page_state(NR_SLAB_RECLAIMABLE);
161 161
162 /* 162 /*
163 * Leave reserved pages. The pages are not for anonymous pages. 163 * Leave reserved pages. The pages are not for anonymous pages.
164 */ 164 */
165 if (free <= totalreserve_pages) 165 if (free <= totalreserve_pages)
166 goto error; 166 goto error;
167 else 167 else
168 free -= totalreserve_pages; 168 free -= totalreserve_pages;
169 169
170 /* 170 /*
171 * Reserve some for root 171 * Reserve some for root
172 */ 172 */
173 if (!cap_sys_admin) 173 if (!cap_sys_admin)
174 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 174 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
175 175
176 if (free > pages) 176 if (free > pages)
177 return 0; 177 return 0;
178 178
179 goto error; 179 goto error;
180 } 180 }
181 181
182 allowed = (totalram_pages - hugetlb_total_pages()) 182 allowed = (totalram_pages - hugetlb_total_pages())
183 * sysctl_overcommit_ratio / 100; 183 * sysctl_overcommit_ratio / 100;
184 /* 184 /*
185 * Reserve some for root 185 * Reserve some for root
186 */ 186 */
187 if (!cap_sys_admin) 187 if (!cap_sys_admin)
188 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 188 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
189 allowed += total_swap_pages; 189 allowed += total_swap_pages;
190 190
191 /* 191 /*
192 * Don't let a single process grow so big a user can't recover 192 * Don't let a single process grow so big a user can't recover
193 */ 193 */
194 if (mm) { 194 if (mm) {
195 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 195 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
196 allowed -= min(mm->total_vm / 32, reserve); 196 allowed -= min(mm->total_vm / 32, reserve);
197 } 197 }
198 198
199 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 199 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
200 return 0; 200 return 0;
201 error: 201 error:
202 vm_unacct_memory(pages); 202 vm_unacct_memory(pages);
203 203
204 return -ENOMEM; 204 return -ENOMEM;
205 } 205 }
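/*
 * [Editorial note, not part of the commit diff] A worked example of the
 * strict-overcommit cap computed above, assuming 4GB of RAM, no hugepage
 * pool, the default overcommit_ratio of 50 and 1GB of swap:
 * allowed = 4GB * 50/100 + 1GB = 3GB of committable memory, minus the
 * admin reserve (8MB by default) for tasks without CAP_SYS_ADMIN and,
 * per process, min(total_vm/32, 128MB) so a runaway task still leaves
 * room for recovery.
 */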
206 206
207 /* 207 /*
208 * Requires inode->i_mapping->i_mmap_mutex 208 * Requires inode->i_mapping->i_mmap_mutex
209 */ 209 */
210 static void __remove_shared_vm_struct(struct vm_area_struct *vma, 210 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
211 struct file *file, struct address_space *mapping) 211 struct file *file, struct address_space *mapping)
212 { 212 {
213 if (vma->vm_flags & VM_DENYWRITE) 213 if (vma->vm_flags & VM_DENYWRITE)
214 atomic_inc(&file_inode(file)->i_writecount); 214 atomic_inc(&file_inode(file)->i_writecount);
215 if (vma->vm_flags & VM_SHARED) 215 if (vma->vm_flags & VM_SHARED)
216 mapping->i_mmap_writable--; 216 mapping->i_mmap_writable--;
217 217
218 flush_dcache_mmap_lock(mapping); 218 flush_dcache_mmap_lock(mapping);
219 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 219 if (unlikely(vma->vm_flags & VM_NONLINEAR))
220 list_del_init(&vma->shared.nonlinear); 220 list_del_init(&vma->shared.nonlinear);
221 else 221 else
222 vma_interval_tree_remove(vma, &mapping->i_mmap); 222 vma_interval_tree_remove(vma, &mapping->i_mmap);
223 flush_dcache_mmap_unlock(mapping); 223 flush_dcache_mmap_unlock(mapping);
224 } 224 }
225 225
226 /* 226 /*
227 * Unlink a file-based vm structure from its interval tree, to hide 227 * Unlink a file-based vm structure from its interval tree, to hide
228 * vma from rmap and vmtruncate before freeing its page tables. 228 * vma from rmap and vmtruncate before freeing its page tables.
229 */ 229 */
230 void unlink_file_vma(struct vm_area_struct *vma) 230 void unlink_file_vma(struct vm_area_struct *vma)
231 { 231 {
232 struct file *file = vma->vm_file; 232 struct file *file = vma->vm_file;
233 233
234 if (file) { 234 if (file) {
235 struct address_space *mapping = file->f_mapping; 235 struct address_space *mapping = file->f_mapping;
236 mutex_lock(&mapping->i_mmap_mutex); 236 mutex_lock(&mapping->i_mmap_mutex);
237 __remove_shared_vm_struct(vma, file, mapping); 237 __remove_shared_vm_struct(vma, file, mapping);
238 mutex_unlock(&mapping->i_mmap_mutex); 238 mutex_unlock(&mapping->i_mmap_mutex);
239 } 239 }
240 } 240 }
241 241
242 /* 242 /*
243 * Close a vm structure and free it, returning the next. 243 * Close a vm structure and free it, returning the next.
244 */ 244 */
245 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 245 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
246 { 246 {
247 struct vm_area_struct *next = vma->vm_next; 247 struct vm_area_struct *next = vma->vm_next;
248 248
249 might_sleep(); 249 might_sleep();
250 if (vma->vm_ops && vma->vm_ops->close) 250 if (vma->vm_ops && vma->vm_ops->close)
251 vma->vm_ops->close(vma); 251 vma->vm_ops->close(vma);
252 if (vma->vm_file) 252 if (vma->vm_file)
253 fput(vma->vm_file); 253 fput(vma->vm_file);
254 mpol_put(vma_policy(vma)); 254 mpol_put(vma_policy(vma));
255 kmem_cache_free(vm_area_cachep, vma); 255 kmem_cache_free(vm_area_cachep, vma);
256 return next; 256 return next;
257 } 257 }
258 258
259 static unsigned long do_brk(unsigned long addr, unsigned long len); 259 static unsigned long do_brk(unsigned long addr, unsigned long len);
260 260
261 SYSCALL_DEFINE1(brk, unsigned long, brk) 261 SYSCALL_DEFINE1(brk, unsigned long, brk)
262 { 262 {
263 unsigned long rlim, retval; 263 unsigned long rlim, retval;
264 unsigned long newbrk, oldbrk; 264 unsigned long newbrk, oldbrk;
265 struct mm_struct *mm = current->mm; 265 struct mm_struct *mm = current->mm;
266 unsigned long min_brk; 266 unsigned long min_brk;
267 bool populate; 267 bool populate;
268 268
269 down_write(&mm->mmap_sem); 269 down_write(&mm->mmap_sem);
270 270
271 #ifdef CONFIG_COMPAT_BRK 271 #ifdef CONFIG_COMPAT_BRK
272 /* 272 /*
273 * CONFIG_COMPAT_BRK can still be overridden by setting 273 * CONFIG_COMPAT_BRK can still be overridden by setting
274 * randomize_va_space to 2, which will still cause mm->start_brk 274 * randomize_va_space to 2, which will still cause mm->start_brk
275 * to be arbitrarily shifted 275 * to be arbitrarily shifted
276 */ 276 */
277 if (current->brk_randomized) 277 if (current->brk_randomized)
278 min_brk = mm->start_brk; 278 min_brk = mm->start_brk;
279 else 279 else
280 min_brk = mm->end_data; 280 min_brk = mm->end_data;
281 #else 281 #else
282 min_brk = mm->start_brk; 282 min_brk = mm->start_brk;
283 #endif 283 #endif
284 if (brk < min_brk) 284 if (brk < min_brk)
285 goto out; 285 goto out;
286 286
287 /* 287 /*
288 * Check against rlimit here. If this check is done later after the test 288 * Check against rlimit here. If this check is done later after the test
289 * of oldbrk with newbrk then it can escape the test and let the data 289 * of oldbrk with newbrk then it can escape the test and let the data
290 * segment grow beyond its set limit in the case where the limit is 290 * segment grow beyond its set limit in the case where the limit is
291 * not page aligned -Ram Gupta 291 * not page aligned -Ram Gupta
292 */ 292 */
293 rlim = rlimit(RLIMIT_DATA); 293 rlim = rlimit(RLIMIT_DATA);
294 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 294 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
295 (mm->end_data - mm->start_data) > rlim) 295 (mm->end_data - mm->start_data) > rlim)
296 goto out; 296 goto out;
297 297
298 newbrk = PAGE_ALIGN(brk); 298 newbrk = PAGE_ALIGN(brk);
299 oldbrk = PAGE_ALIGN(mm->brk); 299 oldbrk = PAGE_ALIGN(mm->brk);
300 if (oldbrk == newbrk) 300 if (oldbrk == newbrk)
301 goto set_brk; 301 goto set_brk;
302 302
303 /* Always allow shrinking brk. */ 303 /* Always allow shrinking brk. */
304 if (brk <= mm->brk) { 304 if (brk <= mm->brk) {
305 if (!do_munmap(mm, newbrk, oldbrk-newbrk)) 305 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
306 goto set_brk; 306 goto set_brk;
307 goto out; 307 goto out;
308 } 308 }
309 309
310 /* Check against existing mmap mappings. */ 310 /* Check against existing mmap mappings. */
311 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) 311 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
312 goto out; 312 goto out;
313 313
314 /* Ok, looks good - let it rip. */ 314 /* Ok, looks good - let it rip. */
315 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 315 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
316 goto out; 316 goto out;
317 317
318 set_brk: 318 set_brk:
319 mm->brk = brk; 319 mm->brk = brk;
320 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; 320 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
321 up_write(&mm->mmap_sem); 321 up_write(&mm->mmap_sem);
322 if (populate) 322 if (populate)
323 mm_populate(oldbrk, newbrk - oldbrk); 323 mm_populate(oldbrk, newbrk - oldbrk);
324 return brk; 324 return brk;
325 325
326 out: 326 out:
327 retval = mm->brk; 327 retval = mm->brk;
328 up_write(&mm->mmap_sem); 328 up_write(&mm->mmap_sem);
329 return retval; 329 return retval;
330 } 330 }
331 331
332 static long vma_compute_subtree_gap(struct vm_area_struct *vma) 332 static long vma_compute_subtree_gap(struct vm_area_struct *vma)
333 { 333 {
334 unsigned long max, subtree_gap; 334 unsigned long max, subtree_gap;
335 max = vma->vm_start; 335 max = vma->vm_start;
336 if (vma->vm_prev) 336 if (vma->vm_prev)
337 max -= vma->vm_prev->vm_end; 337 max -= vma->vm_prev->vm_end;
338 if (vma->vm_rb.rb_left) { 338 if (vma->vm_rb.rb_left) {
339 subtree_gap = rb_entry(vma->vm_rb.rb_left, 339 subtree_gap = rb_entry(vma->vm_rb.rb_left,
340 struct vm_area_struct, vm_rb)->rb_subtree_gap; 340 struct vm_area_struct, vm_rb)->rb_subtree_gap;
341 if (subtree_gap > max) 341 if (subtree_gap > max)
342 max = subtree_gap; 342 max = subtree_gap;
343 } 343 }
344 if (vma->vm_rb.rb_right) { 344 if (vma->vm_rb.rb_right) {
345 subtree_gap = rb_entry(vma->vm_rb.rb_right, 345 subtree_gap = rb_entry(vma->vm_rb.rb_right,
346 struct vm_area_struct, vm_rb)->rb_subtree_gap; 346 struct vm_area_struct, vm_rb)->rb_subtree_gap;
347 if (subtree_gap > max) 347 if (subtree_gap > max)
348 max = subtree_gap; 348 max = subtree_gap;
349 } 349 }
350 return max; 350 return max;
351 } 351 }
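vma_compute_subtree_gap() combines the free gap just in front of this vma with the largest gaps cached in its left and right children, which is what lets gap searches skip whole subtrees. A stand-alone sketch of the same quantity computed over a plain sorted array of [start, end) ranges (hypothetical types, illustrative only):

#include <stdio.h>

struct range { unsigned long start, end; };

/* Largest free gap in front of any range -- the value rb_subtree_gap caches per subtree. */
static unsigned long max_gap(const struct range *r, int n)
{
	unsigned long prev_end = 0, max = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (r[i].start - prev_end > max)
			max = r[i].start - prev_end;
		prev_end = r[i].end;
	}
	return max;
}

int main(void)
{
	struct range vmas[] = { { 0x1000, 0x3000 }, { 0x8000, 0x9000 }, { 0xb000, 0xc000 } };

	printf("largest gap: %#lx\n", max_gap(vmas, 3));	/* 0x5000 */
	return 0;
}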
352 352
353 #ifdef CONFIG_DEBUG_VM_RB 353 #ifdef CONFIG_DEBUG_VM_RB
354 static int browse_rb(struct rb_root *root) 354 static int browse_rb(struct rb_root *root)
355 { 355 {
356 int i = 0, j, bug = 0; 356 int i = 0, j, bug = 0;
357 struct rb_node *nd, *pn = NULL; 357 struct rb_node *nd, *pn = NULL;
358 unsigned long prev = 0, pend = 0; 358 unsigned long prev = 0, pend = 0;
359 359
360 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 360 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
361 struct vm_area_struct *vma; 361 struct vm_area_struct *vma;
362 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 362 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
363 if (vma->vm_start < prev) { 363 if (vma->vm_start < prev) {
364 printk("vm_start %lx prev %lx\n", vma->vm_start, prev); 364 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
365 bug = 1; 365 bug = 1;
366 } 366 }
367 if (vma->vm_start < pend) { 367 if (vma->vm_start < pend) {
368 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 368 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
369 bug = 1; 369 bug = 1;
370 } 370 }
371 if (vma->vm_start > vma->vm_end) { 371 if (vma->vm_start > vma->vm_end) {
372 printk("vm_end %lx < vm_start %lx\n", 372 printk("vm_end %lx < vm_start %lx\n",
373 vma->vm_end, vma->vm_start); 373 vma->vm_end, vma->vm_start);
374 bug = 1; 374 bug = 1;
375 } 375 }
376 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 376 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
377 printk("free gap %lx, correct %lx\n", 377 printk("free gap %lx, correct %lx\n",
378 vma->rb_subtree_gap, 378 vma->rb_subtree_gap,
379 vma_compute_subtree_gap(vma)); 379 vma_compute_subtree_gap(vma));
380 bug = 1; 380 bug = 1;
381 } 381 }
382 i++; 382 i++;
383 pn = nd; 383 pn = nd;
384 prev = vma->vm_start; 384 prev = vma->vm_start;
385 pend = vma->vm_end; 385 pend = vma->vm_end;
386 } 386 }
387 j = 0; 387 j = 0;
388 for (nd = pn; nd; nd = rb_prev(nd)) 388 for (nd = pn; nd; nd = rb_prev(nd))
389 j++; 389 j++;
390 if (i != j) { 390 if (i != j) {
391 printk("backwards %d, forwards %d\n", j, i); 391 printk("backwards %d, forwards %d\n", j, i);
392 bug = 1; 392 bug = 1;
393 } 393 }
394 return bug ? -1 : i; 394 return bug ? -1 : i;
395 } 395 }
396 396
397 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) 397 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
398 { 398 {
399 struct rb_node *nd; 399 struct rb_node *nd;
400 400
401 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 401 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
402 struct vm_area_struct *vma; 402 struct vm_area_struct *vma;
403 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 403 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
404 BUG_ON(vma != ignore && 404 BUG_ON(vma != ignore &&
405 vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); 405 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
406 } 406 }
407 } 407 }
408 408
409 void validate_mm(struct mm_struct *mm) 409 void validate_mm(struct mm_struct *mm)
410 { 410 {
411 int bug = 0; 411 int bug = 0;
412 int i = 0; 412 int i = 0;
413 unsigned long highest_address = 0; 413 unsigned long highest_address = 0;
414 struct vm_area_struct *vma = mm->mmap; 414 struct vm_area_struct *vma = mm->mmap;
415 while (vma) { 415 while (vma) {
416 struct anon_vma_chain *avc; 416 struct anon_vma_chain *avc;
417 vma_lock_anon_vma(vma); 417 vma_lock_anon_vma(vma);
418 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 418 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
419 anon_vma_interval_tree_verify(avc); 419 anon_vma_interval_tree_verify(avc);
420 vma_unlock_anon_vma(vma); 420 vma_unlock_anon_vma(vma);
421 highest_address = vma->vm_end; 421 highest_address = vma->vm_end;
422 vma = vma->vm_next; 422 vma = vma->vm_next;
423 i++; 423 i++;
424 } 424 }
425 if (i != mm->map_count) { 425 if (i != mm->map_count) {
426 printk("map_count %d vm_next %d\n", mm->map_count, i); 426 printk("map_count %d vm_next %d\n", mm->map_count, i);
427 bug = 1; 427 bug = 1;
428 } 428 }
429 if (highest_address != mm->highest_vm_end) { 429 if (highest_address != mm->highest_vm_end) {
430 printk("mm->highest_vm_end %lx, found %lx\n", 430 printk("mm->highest_vm_end %lx, found %lx\n",
431 mm->highest_vm_end, highest_address); 431 mm->highest_vm_end, highest_address);
432 bug = 1; 432 bug = 1;
433 } 433 }
434 i = browse_rb(&mm->mm_rb); 434 i = browse_rb(&mm->mm_rb);
435 if (i != mm->map_count) { 435 if (i != mm->map_count) {
436 printk("map_count %d rb %d\n", mm->map_count, i); 436 printk("map_count %d rb %d\n", mm->map_count, i);
437 bug = 1; 437 bug = 1;
438 } 438 }
439 BUG_ON(bug); 439 BUG_ON(bug);
440 } 440 }
441 #else 441 #else
442 #define validate_mm_rb(root, ignore) do { } while (0) 442 #define validate_mm_rb(root, ignore) do { } while (0)
443 #define validate_mm(mm) do { } while (0) 443 #define validate_mm(mm) do { } while (0)
444 #endif 444 #endif
445 445
446 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, 446 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
447 unsigned long, rb_subtree_gap, vma_compute_subtree_gap) 447 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
448 448
449 /* 449 /*
450 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or 450 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
451 * vma->vm_prev->vm_end values changed, without modifying the vma's position 451 * vma->vm_prev->vm_end values changed, without modifying the vma's position
452 * in the rbtree. 452 * in the rbtree.
453 */ 453 */
454 static void vma_gap_update(struct vm_area_struct *vma) 454 static void vma_gap_update(struct vm_area_struct *vma)
455 { 455 {
456 /* 456 /*
457 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback 457 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
458 * function that does exactly what we want. 458 * function that does exactly what we want.
459 */ 459 */
460 vma_gap_callbacks_propagate(&vma->vm_rb, NULL); 460 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
461 } 461 }
462 462
463 static inline void vma_rb_insert(struct vm_area_struct *vma, 463 static inline void vma_rb_insert(struct vm_area_struct *vma,
464 struct rb_root *root) 464 struct rb_root *root)
465 { 465 {
466 /* All rb_subtree_gap values must be consistent prior to insertion */ 466 /* All rb_subtree_gap values must be consistent prior to insertion */
467 validate_mm_rb(root, NULL); 467 validate_mm_rb(root, NULL);
468 468
469 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 469 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
470 } 470 }
471 471
472 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) 472 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
473 { 473 {
474 /* 474 /*
475 * All rb_subtree_gap values must be consistent prior to erase, 475 * All rb_subtree_gap values must be consistent prior to erase,
476 * with the possible exception of the vma being erased. 476 * with the possible exception of the vma being erased.
477 */ 477 */
478 validate_mm_rb(root, vma); 478 validate_mm_rb(root, vma);
479 479
480 /* 480 /*
481 * Note rb_erase_augmented is a fairly large inline function, 481 * Note rb_erase_augmented is a fairly large inline function,
482 * so make sure we instantiate it only once with our desired 482 * so make sure we instantiate it only once with our desired
483 * augmented rbtree callbacks. 483 * augmented rbtree callbacks.
484 */ 484 */
485 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 485 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
486 } 486 }
487 487
488 /* 488 /*
489 * vma has some anon_vma assigned, and is already inserted on that 489 * vma has some anon_vma assigned, and is already inserted on that
490 * anon_vma's interval trees. 490 * anon_vma's interval trees.
491 * 491 *
492 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 492 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
493 * vma must be removed from the anon_vma's interval trees using 493 * vma must be removed from the anon_vma's interval trees using
494 * anon_vma_interval_tree_pre_update_vma(). 494 * anon_vma_interval_tree_pre_update_vma().
495 * 495 *
496 * After the update, the vma will be reinserted using 496 * After the update, the vma will be reinserted using
497 * anon_vma_interval_tree_post_update_vma(). 497 * anon_vma_interval_tree_post_update_vma().
498 * 498 *
499 * The entire update must be protected by exclusive mmap_sem and by 499 * The entire update must be protected by exclusive mmap_sem and by
500 * the root anon_vma's mutex. 500 * the root anon_vma's mutex.
501 */ 501 */
502 static inline void 502 static inline void
503 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 503 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
504 { 504 {
505 struct anon_vma_chain *avc; 505 struct anon_vma_chain *avc;
506 506
507 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 507 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
508 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 508 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
509 } 509 }
510 510
511 static inline void 511 static inline void
512 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 512 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
513 { 513 {
514 struct anon_vma_chain *avc; 514 struct anon_vma_chain *avc;
515 515
516 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 516 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
517 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 517 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
518 } 518 }
519 519
520 static int find_vma_links(struct mm_struct *mm, unsigned long addr, 520 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
521 unsigned long end, struct vm_area_struct **pprev, 521 unsigned long end, struct vm_area_struct **pprev,
522 struct rb_node ***rb_link, struct rb_node **rb_parent) 522 struct rb_node ***rb_link, struct rb_node **rb_parent)
523 { 523 {
524 struct rb_node **__rb_link, *__rb_parent, *rb_prev; 524 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
525 525
526 __rb_link = &mm->mm_rb.rb_node; 526 __rb_link = &mm->mm_rb.rb_node;
527 rb_prev = __rb_parent = NULL; 527 rb_prev = __rb_parent = NULL;
528 528
529 while (*__rb_link) { 529 while (*__rb_link) {
530 struct vm_area_struct *vma_tmp; 530 struct vm_area_struct *vma_tmp;
531 531
532 __rb_parent = *__rb_link; 532 __rb_parent = *__rb_link;
533 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 533 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
534 534
535 if (vma_tmp->vm_end > addr) { 535 if (vma_tmp->vm_end > addr) {
536 /* Fail if an existing vma overlaps the area */ 536 /* Fail if an existing vma overlaps the area */
537 if (vma_tmp->vm_start < end) 537 if (vma_tmp->vm_start < end)
538 return -ENOMEM; 538 return -ENOMEM;
539 __rb_link = &__rb_parent->rb_left; 539 __rb_link = &__rb_parent->rb_left;
540 } else { 540 } else {
541 rb_prev = __rb_parent; 541 rb_prev = __rb_parent;
542 __rb_link = &__rb_parent->rb_right; 542 __rb_link = &__rb_parent->rb_right;
543 } 543 }
544 } 544 }
545 545
546 *pprev = NULL; 546 *pprev = NULL;
547 if (rb_prev) 547 if (rb_prev)
548 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 548 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
549 *rb_link = __rb_link; 549 *rb_link = __rb_link;
550 *rb_parent = __rb_parent; 550 *rb_parent = __rb_parent;
551 return 0; 551 return 0;
552 } 552 }
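find_vma_links() is a binary-search descent: it goes left while the current vma ends above addr, right otherwise, refuses the insertion if a visited vma intersects [addr, end), and remembers where the new node would hang. The same idea over a sorted array (illustrative sketch, not kernel code):

#include <errno.h>
#include <stdio.h>

struct range { unsigned long start, end; };

/* Return the insertion index for [addr, end), or -ENOMEM if it overlaps an entry. */
static int find_slot(const struct range *r, int n, unsigned long addr, unsigned long end)
{
	int i = 0;

	/* Skip entries that end at or before addr (the "go right" case). */
	while (i < n && r[i].end <= addr)
		i++;
	/* The first entry ending above addr must not start before end, or we overlap. */
	if (i < n && r[i].start < end)
		return -ENOMEM;
	return i;	/* slot where the new range would be linked */
}

int main(void)
{
	struct range vmas[] = { { 0x1000, 0x3000 }, { 0x8000, 0x9000 } };

	printf("%d\n", find_slot(vmas, 2, 0x4000, 0x6000));	/* 1: fits in the gap */
	printf("%d\n", find_slot(vmas, 2, 0x2000, 0x5000));	/* -12: overlaps */
	return 0;
}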
553 553
554 static unsigned long count_vma_pages_range(struct mm_struct *mm, 554 static unsigned long count_vma_pages_range(struct mm_struct *mm,
555 unsigned long addr, unsigned long end) 555 unsigned long addr, unsigned long end)
556 { 556 {
557 unsigned long nr_pages = 0; 557 unsigned long nr_pages = 0;
558 struct vm_area_struct *vma; 558 struct vm_area_struct *vma;
559 559
560 /* Find first overlapping mapping */ 560 /* Find first overlapping mapping */
561 vma = find_vma_intersection(mm, addr, end); 561 vma = find_vma_intersection(mm, addr, end);
562 if (!vma) 562 if (!vma)
563 return 0; 563 return 0;
564 564
565 nr_pages = (min(end, vma->vm_end) - 565 nr_pages = (min(end, vma->vm_end) -
566 max(addr, vma->vm_start)) >> PAGE_SHIFT; 566 max(addr, vma->vm_start)) >> PAGE_SHIFT;
567 567
568 /* Iterate over the rest of the overlaps */ 568 /* Iterate over the rest of the overlaps */
569 for (vma = vma->vm_next; vma; vma = vma->vm_next) { 569 for (vma = vma->vm_next; vma; vma = vma->vm_next) {
570 unsigned long overlap_len; 570 unsigned long overlap_len;
571 571
572 if (vma->vm_start > end) 572 if (vma->vm_start > end)
573 break; 573 break;
574 574
575 overlap_len = min(end, vma->vm_end) - vma->vm_start; 575 overlap_len = min(end, vma->vm_end) - vma->vm_start;
576 nr_pages += overlap_len >> PAGE_SHIFT; 576 nr_pages += overlap_len >> PAGE_SHIFT;
577 } 577 }
578 578
579 return nr_pages; 579 return nr_pages;
580 } 580 }
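For each overlapping vma the function counts only the pages inside the requested window, i.e. (min(end, vm_end) - max(addr, vm_start)) >> PAGE_SHIFT. A tiny worked example with 4 KiB pages (illustrative values only):

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12	/* assume 4 KiB pages */

int main(void)
{
	/* Requested window and one existing vma that partially overlaps it. */
	unsigned long addr = 0x1c000, end = 0x30000;
	unsigned long vm_start = 0x10000, vm_end = 0x20000;
	unsigned long lo = addr > vm_start ? addr : vm_start;
	unsigned long hi = end < vm_end ? end : vm_end;

	printf("%lu overlapping pages\n", (hi - lo) >> EXAMPLE_PAGE_SHIFT);	/* 4 */
	return 0;
}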
581 581
582 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 582 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
583 struct rb_node **rb_link, struct rb_node *rb_parent) 583 struct rb_node **rb_link, struct rb_node *rb_parent)
584 { 584 {
585 /* Update tracking information for the gap following the new vma. */ 585 /* Update tracking information for the gap following the new vma. */
586 if (vma->vm_next) 586 if (vma->vm_next)
587 vma_gap_update(vma->vm_next); 587 vma_gap_update(vma->vm_next);
588 else 588 else
589 mm->highest_vm_end = vma->vm_end; 589 mm->highest_vm_end = vma->vm_end;
590 590
591 /* 591 /*
592 * vma->vm_prev wasn't known when we followed the rbtree to find the 592 * vma->vm_prev wasn't known when we followed the rbtree to find the
593 * correct insertion point for that vma. As a result, we could not 593 * correct insertion point for that vma. As a result, we could not
594 * update the vma vm_rb parents rb_subtree_gap values on the way down. 594 * update the vma vm_rb parents rb_subtree_gap values on the way down.
595 * So, we first insert the vma with a zero rb_subtree_gap value 595 * So, we first insert the vma with a zero rb_subtree_gap value
596 * (to be consistent with what we did on the way down), and then 596 * (to be consistent with what we did on the way down), and then
597 * immediately update the gap to the correct value. Finally we 597 * immediately update the gap to the correct value. Finally we
598 * rebalance the rbtree after all augmented values have been set. 598 * rebalance the rbtree after all augmented values have been set.
599 */ 599 */
600 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 600 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
601 vma->rb_subtree_gap = 0; 601 vma->rb_subtree_gap = 0;
602 vma_gap_update(vma); 602 vma_gap_update(vma);
603 vma_rb_insert(vma, &mm->mm_rb); 603 vma_rb_insert(vma, &mm->mm_rb);
604 } 604 }
605 605
606 static void __vma_link_file(struct vm_area_struct *vma) 606 static void __vma_link_file(struct vm_area_struct *vma)
607 { 607 {
608 struct file *file; 608 struct file *file;
609 609
610 file = vma->vm_file; 610 file = vma->vm_file;
611 if (file) { 611 if (file) {
612 struct address_space *mapping = file->f_mapping; 612 struct address_space *mapping = file->f_mapping;
613 613
614 if (vma->vm_flags & VM_DENYWRITE) 614 if (vma->vm_flags & VM_DENYWRITE)
615 atomic_dec(&file_inode(file)->i_writecount); 615 atomic_dec(&file_inode(file)->i_writecount);
616 if (vma->vm_flags & VM_SHARED) 616 if (vma->vm_flags & VM_SHARED)
617 mapping->i_mmap_writable++; 617 mapping->i_mmap_writable++;
618 618
619 flush_dcache_mmap_lock(mapping); 619 flush_dcache_mmap_lock(mapping);
620 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 620 if (unlikely(vma->vm_flags & VM_NONLINEAR))
621 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 621 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
622 else 622 else
623 vma_interval_tree_insert(vma, &mapping->i_mmap); 623 vma_interval_tree_insert(vma, &mapping->i_mmap);
624 flush_dcache_mmap_unlock(mapping); 624 flush_dcache_mmap_unlock(mapping);
625 } 625 }
626 } 626 }
627 627
628 static void 628 static void
629 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 629 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
630 struct vm_area_struct *prev, struct rb_node **rb_link, 630 struct vm_area_struct *prev, struct rb_node **rb_link,
631 struct rb_node *rb_parent) 631 struct rb_node *rb_parent)
632 { 632 {
633 __vma_link_list(mm, vma, prev, rb_parent); 633 __vma_link_list(mm, vma, prev, rb_parent);
634 __vma_link_rb(mm, vma, rb_link, rb_parent); 634 __vma_link_rb(mm, vma, rb_link, rb_parent);
635 } 635 }
636 636
637 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 637 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
638 struct vm_area_struct *prev, struct rb_node **rb_link, 638 struct vm_area_struct *prev, struct rb_node **rb_link,
639 struct rb_node *rb_parent) 639 struct rb_node *rb_parent)
640 { 640 {
641 struct address_space *mapping = NULL; 641 struct address_space *mapping = NULL;
642 642
643 if (vma->vm_file) 643 if (vma->vm_file)
644 mapping = vma->vm_file->f_mapping; 644 mapping = vma->vm_file->f_mapping;
645 645
646 if (mapping) 646 if (mapping)
647 mutex_lock(&mapping->i_mmap_mutex); 647 mutex_lock(&mapping->i_mmap_mutex);
648 648
649 __vma_link(mm, vma, prev, rb_link, rb_parent); 649 __vma_link(mm, vma, prev, rb_link, rb_parent);
650 __vma_link_file(vma); 650 __vma_link_file(vma);
651 651
652 if (mapping) 652 if (mapping)
653 mutex_unlock(&mapping->i_mmap_mutex); 653 mutex_unlock(&mapping->i_mmap_mutex);
654 654
655 mm->map_count++; 655 mm->map_count++;
656 validate_mm(mm); 656 validate_mm(mm);
657 } 657 }
658 658
659 /* 659 /*
660 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 660 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
661 * mm's list and rbtree. It has already been inserted into the interval tree. 661 * mm's list and rbtree. It has already been inserted into the interval tree.
662 */ 662 */
663 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 663 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
664 { 664 {
665 struct vm_area_struct *prev; 665 struct vm_area_struct *prev;
666 struct rb_node **rb_link, *rb_parent; 666 struct rb_node **rb_link, *rb_parent;
667 667
668 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 668 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
669 &prev, &rb_link, &rb_parent)) 669 &prev, &rb_link, &rb_parent))
670 BUG(); 670 BUG();
671 __vma_link(mm, vma, prev, rb_link, rb_parent); 671 __vma_link(mm, vma, prev, rb_link, rb_parent);
672 mm->map_count++; 672 mm->map_count++;
673 } 673 }
674 674
675 static inline void 675 static inline void
676 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 676 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
677 struct vm_area_struct *prev) 677 struct vm_area_struct *prev)
678 { 678 {
679 struct vm_area_struct *next; 679 struct vm_area_struct *next;
680 680
681 vma_rb_erase(vma, &mm->mm_rb); 681 vma_rb_erase(vma, &mm->mm_rb);
682 prev->vm_next = next = vma->vm_next; 682 prev->vm_next = next = vma->vm_next;
683 if (next) 683 if (next)
684 next->vm_prev = prev; 684 next->vm_prev = prev;
685 if (mm->mmap_cache == vma) 685 if (mm->mmap_cache == vma)
686 mm->mmap_cache = prev; 686 mm->mmap_cache = prev;
687 } 687 }
688 688
689 /* 689 /*
690 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that 690 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
691 * is already present in an i_mmap tree without adjusting the tree. 691 * is already present in an i_mmap tree without adjusting the tree.
692 * The following helper function should be used when such adjustments 692 * The following helper function should be used when such adjustments
693 * are necessary. The "insert" vma (if any) is to be inserted 693 * are necessary. The "insert" vma (if any) is to be inserted
694 * before we drop the necessary locks. 694 * before we drop the necessary locks.
695 */ 695 */
696 int vma_adjust(struct vm_area_struct *vma, unsigned long start, 696 int vma_adjust(struct vm_area_struct *vma, unsigned long start,
697 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 697 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
698 { 698 {
699 struct mm_struct *mm = vma->vm_mm; 699 struct mm_struct *mm = vma->vm_mm;
700 struct vm_area_struct *next = vma->vm_next; 700 struct vm_area_struct *next = vma->vm_next;
701 struct vm_area_struct *importer = NULL; 701 struct vm_area_struct *importer = NULL;
702 struct address_space *mapping = NULL; 702 struct address_space *mapping = NULL;
703 struct rb_root *root = NULL; 703 struct rb_root *root = NULL;
704 struct anon_vma *anon_vma = NULL; 704 struct anon_vma *anon_vma = NULL;
705 struct file *file = vma->vm_file; 705 struct file *file = vma->vm_file;
706 bool start_changed = false, end_changed = false; 706 bool start_changed = false, end_changed = false;
707 long adjust_next = 0; 707 long adjust_next = 0;
708 int remove_next = 0; 708 int remove_next = 0;
709 709
710 if (next && !insert) { 710 if (next && !insert) {
711 struct vm_area_struct *exporter = NULL; 711 struct vm_area_struct *exporter = NULL;
712 712
713 if (end >= next->vm_end) { 713 if (end >= next->vm_end) {
714 /* 714 /*
715 * vma expands, overlapping all the next, and 715 * vma expands, overlapping all the next, and
716 * perhaps the one after too (mprotect case 6). 716 * perhaps the one after too (mprotect case 6).
717 */ 717 */
718 again: remove_next = 1 + (end > next->vm_end); 718 again: remove_next = 1 + (end > next->vm_end);
719 end = next->vm_end; 719 end = next->vm_end;
720 exporter = next; 720 exporter = next;
721 importer = vma; 721 importer = vma;
722 } else if (end > next->vm_start) { 722 } else if (end > next->vm_start) {
723 /* 723 /*
724 * vma expands, overlapping part of the next: 724 * vma expands, overlapping part of the next:
725 * mprotect case 5 shifting the boundary up. 725 * mprotect case 5 shifting the boundary up.
726 */ 726 */
727 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 727 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
728 exporter = next; 728 exporter = next;
729 importer = vma; 729 importer = vma;
730 } else if (end < vma->vm_end) { 730 } else if (end < vma->vm_end) {
731 /* 731 /*
732 * vma shrinks, and !insert tells it's not 732 * vma shrinks, and !insert tells it's not
733 * split_vma inserting another: so it must be 733 * split_vma inserting another: so it must be
734 * mprotect case 4 shifting the boundary down. 734 * mprotect case 4 shifting the boundary down.
735 */ 735 */
736 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 736 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
737 exporter = vma; 737 exporter = vma;
738 importer = next; 738 importer = next;
739 } 739 }
740 740
741 /* 741 /*
742 * Easily overlooked: when mprotect shifts the boundary, 742 * Easily overlooked: when mprotect shifts the boundary,
743 * make sure the expanding vma has anon_vma set if the 743 * make sure the expanding vma has anon_vma set if the
744 * shrinking vma had, to cover any anon pages imported. 744 * shrinking vma had, to cover any anon pages imported.
745 */ 745 */
746 if (exporter && exporter->anon_vma && !importer->anon_vma) { 746 if (exporter && exporter->anon_vma && !importer->anon_vma) {
747 if (anon_vma_clone(importer, exporter)) 747 if (anon_vma_clone(importer, exporter))
748 return -ENOMEM; 748 return -ENOMEM;
749 importer->anon_vma = exporter->anon_vma; 749 importer->anon_vma = exporter->anon_vma;
750 } 750 }
751 } 751 }
752 752
753 if (file) { 753 if (file) {
754 mapping = file->f_mapping; 754 mapping = file->f_mapping;
755 if (!(vma->vm_flags & VM_NONLINEAR)) { 755 if (!(vma->vm_flags & VM_NONLINEAR)) {
756 root = &mapping->i_mmap; 756 root = &mapping->i_mmap;
757 uprobe_munmap(vma, vma->vm_start, vma->vm_end); 757 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
758 758
759 if (adjust_next) 759 if (adjust_next)
760 uprobe_munmap(next, next->vm_start, 760 uprobe_munmap(next, next->vm_start,
761 next->vm_end); 761 next->vm_end);
762 } 762 }
763 763
764 mutex_lock(&mapping->i_mmap_mutex); 764 mutex_lock(&mapping->i_mmap_mutex);
765 if (insert) { 765 if (insert) {
766 /* 766 /*
767 * Put into interval tree now, so instantiated pages 767 * Put into interval tree now, so instantiated pages
768 * are visible to arm/parisc __flush_dcache_page 768 * are visible to arm/parisc __flush_dcache_page
769 * throughout; but we cannot insert into address 769 * throughout; but we cannot insert into address
770 * space until vma start or end is updated. 770 * space until vma start or end is updated.
771 */ 771 */
772 __vma_link_file(insert); 772 __vma_link_file(insert);
773 } 773 }
774 } 774 }
775 775
776 vma_adjust_trans_huge(vma, start, end, adjust_next); 776 vma_adjust_trans_huge(vma, start, end, adjust_next);
777 777
778 anon_vma = vma->anon_vma; 778 anon_vma = vma->anon_vma;
779 if (!anon_vma && adjust_next) 779 if (!anon_vma && adjust_next)
780 anon_vma = next->anon_vma; 780 anon_vma = next->anon_vma;
781 if (anon_vma) { 781 if (anon_vma) {
782 VM_BUG_ON(adjust_next && next->anon_vma && 782 VM_BUG_ON(adjust_next && next->anon_vma &&
783 anon_vma != next->anon_vma); 783 anon_vma != next->anon_vma);
784 anon_vma_lock_write(anon_vma); 784 anon_vma_lock_write(anon_vma);
785 anon_vma_interval_tree_pre_update_vma(vma); 785 anon_vma_interval_tree_pre_update_vma(vma);
786 if (adjust_next) 786 if (adjust_next)
787 anon_vma_interval_tree_pre_update_vma(next); 787 anon_vma_interval_tree_pre_update_vma(next);
788 } 788 }
789 789
790 if (root) { 790 if (root) {
791 flush_dcache_mmap_lock(mapping); 791 flush_dcache_mmap_lock(mapping);
792 vma_interval_tree_remove(vma, root); 792 vma_interval_tree_remove(vma, root);
793 if (adjust_next) 793 if (adjust_next)
794 vma_interval_tree_remove(next, root); 794 vma_interval_tree_remove(next, root);
795 } 795 }
796 796
797 if (start != vma->vm_start) { 797 if (start != vma->vm_start) {
798 vma->vm_start = start; 798 vma->vm_start = start;
799 start_changed = true; 799 start_changed = true;
800 } 800 }
801 if (end != vma->vm_end) { 801 if (end != vma->vm_end) {
802 vma->vm_end = end; 802 vma->vm_end = end;
803 end_changed = true; 803 end_changed = true;
804 } 804 }
805 vma->vm_pgoff = pgoff; 805 vma->vm_pgoff = pgoff;
806 if (adjust_next) { 806 if (adjust_next) {
807 next->vm_start += adjust_next << PAGE_SHIFT; 807 next->vm_start += adjust_next << PAGE_SHIFT;
808 next->vm_pgoff += adjust_next; 808 next->vm_pgoff += adjust_next;
809 } 809 }
810 810
811 if (root) { 811 if (root) {
812 if (adjust_next) 812 if (adjust_next)
813 vma_interval_tree_insert(next, root); 813 vma_interval_tree_insert(next, root);
814 vma_interval_tree_insert(vma, root); 814 vma_interval_tree_insert(vma, root);
815 flush_dcache_mmap_unlock(mapping); 815 flush_dcache_mmap_unlock(mapping);
816 } 816 }
817 817
818 if (remove_next) { 818 if (remove_next) {
819 /* 819 /*
820 * vma_merge has merged next into vma, and needs 820 * vma_merge has merged next into vma, and needs
821 * us to remove next before dropping the locks. 821 * us to remove next before dropping the locks.
822 */ 822 */
823 __vma_unlink(mm, next, vma); 823 __vma_unlink(mm, next, vma);
824 if (file) 824 if (file)
825 __remove_shared_vm_struct(next, file, mapping); 825 __remove_shared_vm_struct(next, file, mapping);
826 } else if (insert) { 826 } else if (insert) {
827 /* 827 /*
828 * split_vma has split insert from vma, and needs 828 * split_vma has split insert from vma, and needs
829 * us to insert it before dropping the locks 829 * us to insert it before dropping the locks
830 * (it may either follow vma or precede it). 830 * (it may either follow vma or precede it).
831 */ 831 */
832 __insert_vm_struct(mm, insert); 832 __insert_vm_struct(mm, insert);
833 } else { 833 } else {
834 if (start_changed) 834 if (start_changed)
835 vma_gap_update(vma); 835 vma_gap_update(vma);
836 if (end_changed) { 836 if (end_changed) {
837 if (!next) 837 if (!next)
838 mm->highest_vm_end = end; 838 mm->highest_vm_end = end;
839 else if (!adjust_next) 839 else if (!adjust_next)
840 vma_gap_update(next); 840 vma_gap_update(next);
841 } 841 }
842 } 842 }
843 843
844 if (anon_vma) { 844 if (anon_vma) {
845 anon_vma_interval_tree_post_update_vma(vma); 845 anon_vma_interval_tree_post_update_vma(vma);
846 if (adjust_next) 846 if (adjust_next)
847 anon_vma_interval_tree_post_update_vma(next); 847 anon_vma_interval_tree_post_update_vma(next);
848 anon_vma_unlock_write(anon_vma); 848 anon_vma_unlock_write(anon_vma);
849 } 849 }
850 if (mapping) 850 if (mapping)
851 mutex_unlock(&mapping->i_mmap_mutex); 851 mutex_unlock(&mapping->i_mmap_mutex);
852 852
853 if (root) { 853 if (root) {
854 uprobe_mmap(vma); 854 uprobe_mmap(vma);
855 855
856 if (adjust_next) 856 if (adjust_next)
857 uprobe_mmap(next); 857 uprobe_mmap(next);
858 } 858 }
859 859
860 if (remove_next) { 860 if (remove_next) {
861 if (file) { 861 if (file) {
862 uprobe_munmap(next, next->vm_start, next->vm_end); 862 uprobe_munmap(next, next->vm_start, next->vm_end);
863 fput(file); 863 fput(file);
864 } 864 }
865 if (next->anon_vma) 865 if (next->anon_vma)
866 anon_vma_merge(vma, next); 866 anon_vma_merge(vma, next);
867 mm->map_count--; 867 mm->map_count--;
868 vma_set_policy(vma, vma_policy(next)); 868 vma_set_policy(vma, vma_policy(next));
869 kmem_cache_free(vm_area_cachep, next); 869 kmem_cache_free(vm_area_cachep, next);
870 /* 870 /*
871 * In mprotect's case 6 (see comments on vma_merge), 871 * In mprotect's case 6 (see comments on vma_merge),
872 * we must remove another next too. It would clutter 872 * we must remove another next too. It would clutter
873 * up the code too much to do both in one go. 873 * up the code too much to do both in one go.
874 */ 874 */
875 next = vma->vm_next; 875 next = vma->vm_next;
876 if (remove_next == 2) 876 if (remove_next == 2)
877 goto again; 877 goto again;
878 else if (next) 878 else if (next)
879 vma_gap_update(next); 879 vma_gap_update(next);
880 else 880 else
881 mm->highest_vm_end = end; 881 mm->highest_vm_end = end;
882 } 882 }
883 if (insert && file) 883 if (insert && file)
884 uprobe_mmap(insert); 884 uprobe_mmap(insert);
885 885
886 validate_mm(mm); 886 validate_mm(mm);
887 887
888 return 0; 888 return 0;
889 } 889 }
890 890
891 /* 891 /*
892 * If the vma has a ->close operation then the driver probably needs to release 892 * If the vma has a ->close operation then the driver probably needs to release
893 * per-vma resources, so we don't attempt to merge those. 893 * per-vma resources, so we don't attempt to merge those.
894 */ 894 */
895 static inline int is_mergeable_vma(struct vm_area_struct *vma, 895 static inline int is_mergeable_vma(struct vm_area_struct *vma,
896 struct file *file, unsigned long vm_flags) 896 struct file *file, unsigned long vm_flags)
897 { 897 {
898 if (vma->vm_flags ^ vm_flags) 898 if (vma->vm_flags ^ vm_flags)
899 return 0; 899 return 0;
900 if (vma->vm_file != file) 900 if (vma->vm_file != file)
901 return 0; 901 return 0;
902 if (vma->vm_ops && vma->vm_ops->close) 902 if (vma->vm_ops && vma->vm_ops->close)
903 return 0; 903 return 0;
904 return 1; 904 return 1;
905 } 905 }
906 906
907 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 907 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
908 struct anon_vma *anon_vma2, 908 struct anon_vma *anon_vma2,
909 struct vm_area_struct *vma) 909 struct vm_area_struct *vma)
910 { 910 {
911 /* 911 /*
912 * The list_is_singular() test is to avoid merging VMA cloned from 912 * The list_is_singular() test is to avoid merging VMA cloned from
913 * parents. This can improve scalability by reducing anon_vma lock contention. 913 * parents. This can improve scalability by reducing anon_vma lock contention.
914 */ 914 */
915 if ((!anon_vma1 || !anon_vma2) && (!vma || 915 if ((!anon_vma1 || !anon_vma2) && (!vma ||
916 list_is_singular(&vma->anon_vma_chain))) 916 list_is_singular(&vma->anon_vma_chain)))
917 return 1; 917 return 1;
918 return anon_vma1 == anon_vma2; 918 return anon_vma1 == anon_vma2;
919 } 919 }
920 920
921 /* 921 /*
922 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 922 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
923 * in front of (at a lower virtual address and file offset than) the vma. 923 * in front of (at a lower virtual address and file offset than) the vma.
924 * 924 *
925 * We cannot merge two vmas if they have differently assigned (non-NULL) 925 * We cannot merge two vmas if they have differently assigned (non-NULL)
926 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 926 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
927 * 927 *
928 * We don't check here for the merged mmap wrapping around the end of pagecache 928 * We don't check here for the merged mmap wrapping around the end of pagecache
929 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which 929 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
930 * wrap, nor mmaps which cover the final page at index -1UL. 930 * wrap, nor mmaps which cover the final page at index -1UL.
931 */ 931 */
932 static int 932 static int
933 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 933 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
934 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 934 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
935 { 935 {
936 if (is_mergeable_vma(vma, file, vm_flags) && 936 if (is_mergeable_vma(vma, file, vm_flags) &&
937 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 937 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
938 if (vma->vm_pgoff == vm_pgoff) 938 if (vma->vm_pgoff == vm_pgoff)
939 return 1; 939 return 1;
940 } 940 }
941 return 0; 941 return 0;
942 } 942 }
943 943
944 /* 944 /*
945 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 945 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
946 * beyond (at a higher virtual address and file offset than) the vma. 946 * beyond (at a higher virtual address and file offset than) the vma.
947 * 947 *
948 * We cannot merge two vmas if they have differently assigned (non-NULL) 948 * We cannot merge two vmas if they have differently assigned (non-NULL)
949 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 949 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
950 */ 950 */
951 static int 951 static int
952 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 952 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
953 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 953 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
954 { 954 {
955 if (is_mergeable_vma(vma, file, vm_flags) && 955 if (is_mergeable_vma(vma, file, vm_flags) &&
956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
957 pgoff_t vm_pglen; 957 pgoff_t vm_pglen;
958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
959 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 959 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
960 return 1; 960 return 1;
961 } 961 }
962 return 0; 962 return 0;
963 } 963 }
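Beyond identical flags and file, merging is gated by file-offset continuity: the earlier vma's vm_pgoff plus its length in pages must land exactly on the later mapping's vm_pgoff. The arithmetic, with made-up values and 4 KiB pages:

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12	/* assume 4 KiB pages */

int main(void)
{
	/* Existing vma: 8 pages of a file starting at page offset 10. */
	unsigned long vm_start = 0x700000, vm_end = 0x708000, vm_pgoff = 10;
	unsigned long vm_pglen = (vm_end - vm_start) >> EXAMPLE_PAGE_SHIFT;
	/* New request that starts where the vma ends, at file page offset 18. */
	unsigned long new_pgoff = 18;

	/* Same test as can_vma_merge_after(): virtual and file offsets both continue. */
	printf("mergeable: %s\n", vm_pgoff + vm_pglen == new_pgoff ? "yes" : "no");
	return 0;
}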
964 964
965 /* 965 /*
966 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out 966 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
967 * whether that can be merged with its predecessor or its successor. 967 * whether that can be merged with its predecessor or its successor.
968 * Or both (it neatly fills a hole). 968 * Or both (it neatly fills a hole).
969 * 969 *
970 * In most cases - when called for mmap, brk or mremap - [addr,end) is 970 * In most cases - when called for mmap, brk or mremap - [addr,end) is
971 * certain not to be mapped by the time vma_merge is called; but when 971 * certain not to be mapped by the time vma_merge is called; but when
972 * called for mprotect, it is certain to be already mapped (either at 972 * called for mprotect, it is certain to be already mapped (either at
973 * an offset within prev, or at the start of next), and the flags of 973 * an offset within prev, or at the start of next), and the flags of
974 * this area are about to be changed to vm_flags - and the no-change 974 * this area are about to be changed to vm_flags - and the no-change
975 * case has already been eliminated. 975 * case has already been eliminated.
976 * 976 *
977 * The following mprotect cases have to be considered, where AAAA is 977 * The following mprotect cases have to be considered, where AAAA is
978 * the area passed down from mprotect_fixup, never extending beyond one 978 * the area passed down from mprotect_fixup, never extending beyond one
979 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: 979 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
980 * 980 *
981 * AAAA AAAA AAAA AAAA 981 * AAAA AAAA AAAA AAAA
982 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX 982 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
983 * cannot merge might become might become might become 983 * cannot merge might become might become might become
984 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or 984 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
985 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or 985 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
986 * mremap move: PPPPNNNNNNNN 8 986 * mremap move: PPPPNNNNNNNN 8
987 * AAAA 987 * AAAA
988 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 988 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
989 * might become case 1 below case 2 below case 3 below 989 * might become case 1 below case 2 below case 3 below
990 * 990 *
991 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: 991 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
992 * mprotect_fixup updates vm_flags & vm_page_prot on successful return. 992 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
993 */ 993 */
994 struct vm_area_struct *vma_merge(struct mm_struct *mm, 994 struct vm_area_struct *vma_merge(struct mm_struct *mm,
995 struct vm_area_struct *prev, unsigned long addr, 995 struct vm_area_struct *prev, unsigned long addr,
996 unsigned long end, unsigned long vm_flags, 996 unsigned long end, unsigned long vm_flags,
997 struct anon_vma *anon_vma, struct file *file, 997 struct anon_vma *anon_vma, struct file *file,
998 pgoff_t pgoff, struct mempolicy *policy) 998 pgoff_t pgoff, struct mempolicy *policy)
999 { 999 {
1000 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1000 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1001 struct vm_area_struct *area, *next; 1001 struct vm_area_struct *area, *next;
1002 int err; 1002 int err;
1003 1003
1004 /* 1004 /*
1005 * We later require that vma->vm_flags == vm_flags, 1005 * We later require that vma->vm_flags == vm_flags,
1006 * so this tests vma->vm_flags & VM_SPECIAL, too. 1006 * so this tests vma->vm_flags & VM_SPECIAL, too.
1007 */ 1007 */
1008 if (vm_flags & VM_SPECIAL) 1008 if (vm_flags & VM_SPECIAL)
1009 return NULL; 1009 return NULL;
1010 1010
1011 if (prev) 1011 if (prev)
1012 next = prev->vm_next; 1012 next = prev->vm_next;
1013 else 1013 else
1014 next = mm->mmap; 1014 next = mm->mmap;
1015 area = next; 1015 area = next;
1016 if (next && next->vm_end == end) /* cases 6, 7, 8 */ 1016 if (next && next->vm_end == end) /* cases 6, 7, 8 */
1017 next = next->vm_next; 1017 next = next->vm_next;
1018 1018
1019 /* 1019 /*
1020 * Can it merge with the predecessor? 1020 * Can it merge with the predecessor?
1021 */ 1021 */
1022 if (prev && prev->vm_end == addr && 1022 if (prev && prev->vm_end == addr &&
1023 mpol_equal(vma_policy(prev), policy) && 1023 mpol_equal(vma_policy(prev), policy) &&
1024 can_vma_merge_after(prev, vm_flags, 1024 can_vma_merge_after(prev, vm_flags,
1025 anon_vma, file, pgoff)) { 1025 anon_vma, file, pgoff)) {
1026 /* 1026 /*
1027 * OK, it can. Can we now merge in the successor as well? 1027 * OK, it can. Can we now merge in the successor as well?
1028 */ 1028 */
1029 if (next && end == next->vm_start && 1029 if (next && end == next->vm_start &&
1030 mpol_equal(policy, vma_policy(next)) && 1030 mpol_equal(policy, vma_policy(next)) &&
1031 can_vma_merge_before(next, vm_flags, 1031 can_vma_merge_before(next, vm_flags,
1032 anon_vma, file, pgoff+pglen) && 1032 anon_vma, file, pgoff+pglen) &&
1033 is_mergeable_anon_vma(prev->anon_vma, 1033 is_mergeable_anon_vma(prev->anon_vma,
1034 next->anon_vma, NULL)) { 1034 next->anon_vma, NULL)) {
1035 /* cases 1, 6 */ 1035 /* cases 1, 6 */
1036 err = vma_adjust(prev, prev->vm_start, 1036 err = vma_adjust(prev, prev->vm_start,
1037 next->vm_end, prev->vm_pgoff, NULL); 1037 next->vm_end, prev->vm_pgoff, NULL);
1038 } else /* cases 2, 5, 7 */ 1038 } else /* cases 2, 5, 7 */
1039 err = vma_adjust(prev, prev->vm_start, 1039 err = vma_adjust(prev, prev->vm_start,
1040 end, prev->vm_pgoff, NULL); 1040 end, prev->vm_pgoff, NULL);
1041 if (err) 1041 if (err)
1042 return NULL; 1042 return NULL;
1043 khugepaged_enter_vma_merge(prev); 1043 khugepaged_enter_vma_merge(prev);
1044 return prev; 1044 return prev;
1045 } 1045 }
1046 1046
1047 /* 1047 /*
1048 * Can this new request be merged in front of next? 1048 * Can this new request be merged in front of next?
1049 */ 1049 */
1050 if (next && end == next->vm_start && 1050 if (next && end == next->vm_start &&
1051 mpol_equal(policy, vma_policy(next)) && 1051 mpol_equal(policy, vma_policy(next)) &&
1052 can_vma_merge_before(next, vm_flags, 1052 can_vma_merge_before(next, vm_flags,
1053 anon_vma, file, pgoff+pglen)) { 1053 anon_vma, file, pgoff+pglen)) {
1054 if (prev && addr < prev->vm_end) /* case 4 */ 1054 if (prev && addr < prev->vm_end) /* case 4 */
1055 err = vma_adjust(prev, prev->vm_start, 1055 err = vma_adjust(prev, prev->vm_start,
1056 addr, prev->vm_pgoff, NULL); 1056 addr, prev->vm_pgoff, NULL);
1057 else /* cases 3, 8 */ 1057 else /* cases 3, 8 */
1058 err = vma_adjust(area, addr, next->vm_end, 1058 err = vma_adjust(area, addr, next->vm_end,
1059 next->vm_pgoff - pglen, NULL); 1059 next->vm_pgoff - pglen, NULL);
1060 if (err) 1060 if (err)
1061 return NULL; 1061 return NULL;
1062 khugepaged_enter_vma_merge(area); 1062 khugepaged_enter_vma_merge(area);
1063 return area; 1063 return area;
1064 } 1064 }
1065 1065
1066 return NULL; 1066 return NULL;
1067 } 1067 }
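The merge logic is observable from user space: two adjacent anonymous mappings created with identical protections and flags normally end up as a single entry in /proc/self/maps. A hedged demonstration (it reserves a window first so MAP_FIXED cannot clobber unrelated mappings; whether the kernel merges is an implementation detail, not a guarantee):

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;
	char *a;

	/* Reserve a contiguous 2 MiB window to carve up safely. */
	a = mmap(NULL, 2 * len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (a == MAP_FAILED)
		return 1;

	/* Replace the window with two adjacent rw mappings, created separately. */
	mmap(a, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	mmap(a + len, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	/* vma_merge() usually folds them: expect one rw-p line covering [a, a+2MiB). */
	printf("look for %p-%p below:\n", (void *)a, (void *)(a + 2 * len));
	system("cat /proc/self/maps");
	return 0;
}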
1068 1068
1069 /* 1069 /*
1070 * Rough compatibility check to quickly see if it's even worth looking 1070 * Rough compatibility check to quickly see if it's even worth looking
1071 * at sharing an anon_vma. 1071 * at sharing an anon_vma.
1072 * 1072 *
1073 * They need to have the same vm_file, and the flags can only differ 1073 * They need to have the same vm_file, and the flags can only differ
1074 * in things that mprotect may change. 1074 * in things that mprotect may change.
1075 * 1075 *
1076 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1076 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1077 * we can merge the two vma's. For example, we refuse to merge a vma if 1077 * we can merge the two vma's. For example, we refuse to merge a vma if
1078 * there is a vm_ops->close() function, because that indicates that the 1078 * there is a vm_ops->close() function, because that indicates that the
1079 * driver is doing some kind of reference counting. But that doesn't 1079 * driver is doing some kind of reference counting. But that doesn't
1080 * really matter for the anon_vma sharing case. 1080 * really matter for the anon_vma sharing case.
1081 */ 1081 */
1082 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1082 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1083 { 1083 {
1084 return a->vm_end == b->vm_start && 1084 return a->vm_end == b->vm_start &&
1085 mpol_equal(vma_policy(a), vma_policy(b)) && 1085 mpol_equal(vma_policy(a), vma_policy(b)) &&
1086 a->vm_file == b->vm_file && 1086 a->vm_file == b->vm_file &&
1087 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && 1087 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
1088 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1088 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1089 } 1089 }
1090 1090
1091 /* 1091 /*
1092 * Do some basic sanity checking to see if we can re-use the anon_vma 1092 * Do some basic sanity checking to see if we can re-use the anon_vma
1093 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1093 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1094 * the same as 'old', the other will be the new one that is trying 1094 * the same as 'old', the other will be the new one that is trying
1095 * to share the anon_vma. 1095 * to share the anon_vma.
1096 * 1096 *
1097 * NOTE! This runs with mm_sem held for reading, so it is possible that 1097 * NOTE! This runs with mm_sem held for reading, so it is possible that
1098 * the anon_vma of 'old' is concurrently in the process of being set up 1098 * the anon_vma of 'old' is concurrently in the process of being set up
1099 * by another page fault trying to merge _that_. But that's ok: if it 1099 * by another page fault trying to merge _that_. But that's ok: if it
1100 * is being set up, that automatically means that it will be a singleton 1100 * is being set up, that automatically means that it will be a singleton
1101 * acceptable for merging, so we can do all of this optimistically. But 1101 * acceptable for merging, so we can do all of this optimistically. But
1102 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1102 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
1103 * 1103 *
1104 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1104 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1105 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1105 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1106 * is to return an anon_vma that is "complex" due to having gone through 1106 * is to return an anon_vma that is "complex" due to having gone through
1107 * a fork). 1107 * a fork).
1108 * 1108 *
1109 * We also make sure that the two vma's are compatible (adjacent, 1109 * We also make sure that the two vma's are compatible (adjacent,
1110 * and with the same memory policies). That's all stable, even with just 1110 * and with the same memory policies). That's all stable, even with just
1111 * a read lock on the mm_sem. 1111 * a read lock on the mm_sem.
1112 */ 1112 */
1113 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1113 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1114 { 1114 {
1115 if (anon_vma_compatible(a, b)) { 1115 if (anon_vma_compatible(a, b)) {
1116 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1116 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
1117 1117
1118 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1118 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1119 return anon_vma; 1119 return anon_vma;
1120 } 1120 }
1121 return NULL; 1121 return NULL;
1122 } 1122 }
1123 1123
1124 /* 1124 /*
1125 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1125 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1126 * neighbouring vmas for a suitable anon_vma, before it goes off 1126 * neighbouring vmas for a suitable anon_vma, before it goes off
1127 * to allocate a new anon_vma. It checks because a repetitive 1127 * to allocate a new anon_vma. It checks because a repetitive
1128 * sequence of mprotects and faults may otherwise lead to distinct 1128 * sequence of mprotects and faults may otherwise lead to distinct
1129 * anon_vmas being allocated, preventing vma merge in subsequent 1129 * anon_vmas being allocated, preventing vma merge in subsequent
1130 * mprotect. 1130 * mprotect.
1131 */ 1131 */
1132 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1132 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1133 { 1133 {
1134 struct anon_vma *anon_vma; 1134 struct anon_vma *anon_vma;
1135 struct vm_area_struct *near; 1135 struct vm_area_struct *near;
1136 1136
1137 near = vma->vm_next; 1137 near = vma->vm_next;
1138 if (!near) 1138 if (!near)
1139 goto try_prev; 1139 goto try_prev;
1140 1140
1141 anon_vma = reusable_anon_vma(near, vma, near); 1141 anon_vma = reusable_anon_vma(near, vma, near);
1142 if (anon_vma) 1142 if (anon_vma)
1143 return anon_vma; 1143 return anon_vma;
1144 try_prev: 1144 try_prev:
1145 near = vma->vm_prev; 1145 near = vma->vm_prev;
1146 if (!near) 1146 if (!near)
1147 goto none; 1147 goto none;
1148 1148
1149 anon_vma = reusable_anon_vma(near, near, vma); 1149 anon_vma = reusable_anon_vma(near, near, vma);
1150 if (anon_vma) 1150 if (anon_vma)
1151 return anon_vma; 1151 return anon_vma;
1152 none: 1152 none:
1153 /* 1153 /*
1154 * There's no absolute need to look only at touching neighbours: 1154 * There's no absolute need to look only at touching neighbours:
1155 * we could search further afield for "compatible" anon_vmas. 1155 * we could search further afield for "compatible" anon_vmas.
1156 * But it would probably just be a waste of time searching, 1156 * But it would probably just be a waste of time searching,
1157 * or lead to too many vmas hanging off the same anon_vma. 1157 * or lead to too many vmas hanging off the same anon_vma.
1158 * We're trying to allow mprotect remerging later on, 1158 * We're trying to allow mprotect remerging later on,
1159 * not trying to minimize memory used for anon_vmas. 1159 * not trying to minimize memory used for anon_vmas.
1160 */ 1160 */
1161 return NULL; 1161 return NULL;
1162 } 1162 }
1163 1163
1164 #ifdef CONFIG_PROC_FS 1164 #ifdef CONFIG_PROC_FS
1165 void vm_stat_account(struct mm_struct *mm, unsigned long flags, 1165 void vm_stat_account(struct mm_struct *mm, unsigned long flags,
1166 struct file *file, long pages) 1166 struct file *file, long pages)
1167 { 1167 {
1168 const unsigned long stack_flags 1168 const unsigned long stack_flags
1169 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 1169 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
1170 1170
1171 mm->total_vm += pages; 1171 mm->total_vm += pages;
1172 1172
1173 if (file) { 1173 if (file) {
1174 mm->shared_vm += pages; 1174 mm->shared_vm += pages;
1175 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 1175 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
1176 mm->exec_vm += pages; 1176 mm->exec_vm += pages;
1177 } else if (flags & stack_flags) 1177 } else if (flags & stack_flags)
1178 mm->stack_vm += pages; 1178 mm->stack_vm += pages;
1179 } 1179 }
1180 #endif /* CONFIG_PROC_FS */ 1180 #endif /* CONFIG_PROC_FS */
1181 1181
1182 /* 1182 /*
1183 * If a hint addr is less than mmap_min_addr change hint to be as 1183 * If a hint addr is less than mmap_min_addr change hint to be as
1184 * low as possible but still greater than mmap_min_addr 1184 * low as possible but still greater than mmap_min_addr
1185 */ 1185 */
1186 static inline unsigned long round_hint_to_min(unsigned long hint) 1186 static inline unsigned long round_hint_to_min(unsigned long hint)
1187 { 1187 {
1188 hint &= PAGE_MASK; 1188 hint &= PAGE_MASK;
1189 if (((void *)hint != NULL) && 1189 if (((void *)hint != NULL) &&
1190 (hint < mmap_min_addr)) 1190 (hint < mmap_min_addr))
1191 return PAGE_ALIGN(mmap_min_addr); 1191 return PAGE_ALIGN(mmap_min_addr);
1192 return hint; 1192 return hint;
1193 } 1193 }
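Concretely: with a vm.mmap_min_addr setting of 65536, a non-NULL hint of, say, 0x1000 is bumped to PAGE_ALIGN(65536) = 0x10000, while a NULL hint (or any hint at or above the threshold) passes through untouched so get_unmapped_area() can choose freely. The exact threshold depends on the sysctl and LSM configuration.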
1194 1194
1195 /* 1195 /*
1196 * The caller must hold down_write(&current->mm->mmap_sem). 1196 * The caller must hold down_write(&current->mm->mmap_sem).
1197 */ 1197 */
1198 1198
1199 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1199 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1200 unsigned long len, unsigned long prot, 1200 unsigned long len, unsigned long prot,
1201 unsigned long flags, unsigned long pgoff, 1201 unsigned long flags, unsigned long pgoff,
1202 unsigned long *populate) 1202 unsigned long *populate)
1203 { 1203 {
1204 struct mm_struct * mm = current->mm; 1204 struct mm_struct * mm = current->mm;
1205 struct inode *inode; 1205 struct inode *inode;
1206 vm_flags_t vm_flags; 1206 vm_flags_t vm_flags;
1207 1207
1208 *populate = 0; 1208 *populate = 0;
1209 1209
1210 /* 1210 /*
1211 * Does the application expect PROT_READ to imply PROT_EXEC? 1211 * Does the application expect PROT_READ to imply PROT_EXEC?
1212 * 1212 *
1213 * (the exception is when the underlying filesystem is noexec 1213 * (the exception is when the underlying filesystem is noexec
1214 * mounted, in which case we don't add PROT_EXEC.) 1214 * mounted, in which case we don't add PROT_EXEC.)
1215 */ 1215 */
1216 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) 1216 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1217 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) 1217 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1218 prot |= PROT_EXEC; 1218 prot |= PROT_EXEC;
1219 1219
1220 if (!len) 1220 if (!len)
1221 return -EINVAL; 1221 return -EINVAL;
1222 1222
1223 if (!(flags & MAP_FIXED)) 1223 if (!(flags & MAP_FIXED))
1224 addr = round_hint_to_min(addr); 1224 addr = round_hint_to_min(addr);
1225 1225
1226 /* Careful about overflows.. */ 1226 /* Careful about overflows.. */
1227 len = PAGE_ALIGN(len); 1227 len = PAGE_ALIGN(len);
1228 if (!len) 1228 if (!len)
1229 return -ENOMEM; 1229 return -ENOMEM;
1230 1230
1231 /* offset overflow? */ 1231 /* offset overflow? */
1232 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1232 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1233 return -EOVERFLOW; 1233 return -EOVERFLOW;
1234 1234
1235 /* Too many mappings? */ 1235 /* Too many mappings? */
1236 if (mm->map_count > sysctl_max_map_count) 1236 if (mm->map_count > sysctl_max_map_count)
1237 return -ENOMEM; 1237 return -ENOMEM;
1238 1238
1239 /* Obtain the address to map to. We verify (or select) it and ensure 1239 /* Obtain the address to map to. We verify (or select) it and ensure
1240 * that it represents a valid section of the address space. 1240 * that it represents a valid section of the address space.
1241 */ 1241 */
1242 addr = get_unmapped_area(file, addr, len, pgoff, flags); 1242 addr = get_unmapped_area(file, addr, len, pgoff, flags);
1243 if (addr & ~PAGE_MASK) 1243 if (addr & ~PAGE_MASK)
1244 return addr; 1244 return addr;
1245 1245
1246 /* Do simple checking here so the lower-level routines won't have 1246 /* Do simple checking here so the lower-level routines won't have
1247 * to. We assume access permissions have been handled by the open 1247 * to. We assume access permissions have been handled by the open
1248 * of the memory object, so we don't do any here. 1248 * of the memory object, so we don't do any here.
1249 */ 1249 */
1250 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1250 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1251 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1251 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1252 1252
1253 if (flags & MAP_LOCKED) 1253 if (flags & MAP_LOCKED)
1254 if (!can_do_mlock()) 1254 if (!can_do_mlock())
1255 return -EPERM; 1255 return -EPERM;
1256 1256
1257 /* mlock MCL_FUTURE? */ 1257 /* mlock MCL_FUTURE? */
1258 if (vm_flags & VM_LOCKED) { 1258 if (vm_flags & VM_LOCKED) {
1259 unsigned long locked, lock_limit; 1259 unsigned long locked, lock_limit;
1260 locked = len >> PAGE_SHIFT; 1260 locked = len >> PAGE_SHIFT;
1261 locked += mm->locked_vm; 1261 locked += mm->locked_vm;
1262 lock_limit = rlimit(RLIMIT_MEMLOCK); 1262 lock_limit = rlimit(RLIMIT_MEMLOCK);
1263 lock_limit >>= PAGE_SHIFT; 1263 lock_limit >>= PAGE_SHIFT;
1264 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1264 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1265 return -EAGAIN; 1265 return -EAGAIN;
1266 } 1266 }
1267 1267
1268 inode = file ? file_inode(file) : NULL; 1268 inode = file ? file_inode(file) : NULL;
1269 1269
1270 if (file) { 1270 if (file) {
1271 switch (flags & MAP_TYPE) { 1271 switch (flags & MAP_TYPE) {
1272 case MAP_SHARED: 1272 case MAP_SHARED:
1273 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1273 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
1274 return -EACCES; 1274 return -EACCES;
1275 1275
1276 /* 1276 /*
1277 * Make sure we don't allow writing to an append-only 1277 * Make sure we don't allow writing to an append-only
1278 * file.. 1278 * file..
1279 */ 1279 */
1280 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) 1280 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1281 return -EACCES; 1281 return -EACCES;
1282 1282
1283 /* 1283 /*
1284 * Make sure there are no mandatory locks on the file. 1284 * Make sure there are no mandatory locks on the file.
1285 */ 1285 */
1286 if (locks_verify_locked(inode)) 1286 if (locks_verify_locked(inode))
1287 return -EAGAIN; 1287 return -EAGAIN;
1288 1288
1289 vm_flags |= VM_SHARED | VM_MAYSHARE; 1289 vm_flags |= VM_SHARED | VM_MAYSHARE;
1290 if (!(file->f_mode & FMODE_WRITE)) 1290 if (!(file->f_mode & FMODE_WRITE))
1291 vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 1291 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1292 1292
1293 /* fall through */ 1293 /* fall through */
1294 case MAP_PRIVATE: 1294 case MAP_PRIVATE:
1295 if (!(file->f_mode & FMODE_READ)) 1295 if (!(file->f_mode & FMODE_READ))
1296 return -EACCES; 1296 return -EACCES;
1297 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 1297 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1298 if (vm_flags & VM_EXEC) 1298 if (vm_flags & VM_EXEC)
1299 return -EPERM; 1299 return -EPERM;
1300 vm_flags &= ~VM_MAYEXEC; 1300 vm_flags &= ~VM_MAYEXEC;
1301 } 1301 }
1302 1302
1303 if (!file->f_op || !file->f_op->mmap) 1303 if (!file->f_op || !file->f_op->mmap)
1304 return -ENODEV; 1304 return -ENODEV;
1305 break; 1305 break;
1306 1306
1307 default: 1307 default:
1308 return -EINVAL; 1308 return -EINVAL;
1309 } 1309 }
1310 } else { 1310 } else {
1311 switch (flags & MAP_TYPE) { 1311 switch (flags & MAP_TYPE) {
1312 case MAP_SHARED: 1312 case MAP_SHARED:
1313 /* 1313 /*
1314 * Ignore pgoff. 1314 * Ignore pgoff.
1315 */ 1315 */
1316 pgoff = 0; 1316 pgoff = 0;
1317 vm_flags |= VM_SHARED | VM_MAYSHARE; 1317 vm_flags |= VM_SHARED | VM_MAYSHARE;
1318 break; 1318 break;
1319 case MAP_PRIVATE: 1319 case MAP_PRIVATE:
1320 /* 1320 /*
1321 * Set pgoff according to addr for anon_vma. 1321 * Set pgoff according to addr for anon_vma.
1322 */ 1322 */
1323 pgoff = addr >> PAGE_SHIFT; 1323 pgoff = addr >> PAGE_SHIFT;
1324 break; 1324 break;
1325 default: 1325 default:
1326 return -EINVAL; 1326 return -EINVAL;
1327 } 1327 }
1328 } 1328 }
1329 1329
1330 /* 1330 /*
1331 * Set 'VM_NORESERVE' if we should not account for the 1331 * Set 'VM_NORESERVE' if we should not account for the
1332 * memory use of this mapping. 1332 * memory use of this mapping.
1333 */ 1333 */
1334 if (flags & MAP_NORESERVE) { 1334 if (flags & MAP_NORESERVE) {
1335 /* We honor MAP_NORESERVE if allowed to overcommit */ 1335 /* We honor MAP_NORESERVE if allowed to overcommit */
1336 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) 1336 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1337 vm_flags |= VM_NORESERVE; 1337 vm_flags |= VM_NORESERVE;
1338 1338
1339 /* hugetlb applies strict overcommit unless MAP_NORESERVE */ 1339 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1340 if (file && is_file_hugepages(file)) 1340 if (file && is_file_hugepages(file))
1341 vm_flags |= VM_NORESERVE; 1341 vm_flags |= VM_NORESERVE;
1342 } 1342 }
1343 1343
1344 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1344 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1345 if (!IS_ERR_VALUE(addr) && 1345 if (!IS_ERR_VALUE(addr) &&
1346 ((vm_flags & VM_LOCKED) || 1346 ((vm_flags & VM_LOCKED) ||
1347 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) 1347 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1348 *populate = len; 1348 *populate = len;
1349 return addr; 1349 return addr;
1350 } 1350 }
1351 1351
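The VM_LOCKED branch above is the same arithmetic userspace hits when mlock limits bite: locked pages (existing plus requested) are compared against RLIMIT_MEMLOCK in pages, with CAP_IPC_LOCK as the escape hatch. The sketch below is not kernel code; would_exceed_memlock() is a made-up helper and the page size is taken from sysconf() rather than PAGE_SHIFT.

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

/* would_exceed_memlock() is a hypothetical helper mirroring the check in
 * do_mmap_pgoff(): existing locked pages plus the new request are compared
 * with RLIMIT_MEMLOCK expressed in pages. */
static int would_exceed_memlock(size_t request_len, size_t already_locked)
{
        struct rlimit rl;
        long psize = sysconf(_SC_PAGESIZE);
        size_t locked_pages;

        if (getrlimit(RLIMIT_MEMLOCK, &rl))
                return -1;
        locked_pages = (request_len + psize - 1) / psize +
                       already_locked / psize;
        return locked_pages > rl.rlim_cur / (unsigned long)psize;
}

int main(void)
{
        printf("64 MB MAP_LOCKED request over the limit? %d\n",
               would_exceed_memlock(64UL << 20, 0));
        return 0;
}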
1352 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1352 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1353 unsigned long, prot, unsigned long, flags, 1353 unsigned long, prot, unsigned long, flags,
1354 unsigned long, fd, unsigned long, pgoff) 1354 unsigned long, fd, unsigned long, pgoff)
1355 { 1355 {
1356 struct file *file = NULL; 1356 struct file *file = NULL;
1357 unsigned long retval = -EBADF; 1357 unsigned long retval = -EBADF;
1358 1358
1359 if (!(flags & MAP_ANONYMOUS)) { 1359 if (!(flags & MAP_ANONYMOUS)) {
1360 audit_mmap_fd(fd, flags); 1360 audit_mmap_fd(fd, flags);
1361 if (unlikely(flags & MAP_HUGETLB)) 1361 if (unlikely(flags & MAP_HUGETLB))
1362 return -EINVAL; 1362 return -EINVAL;
1363 file = fget(fd); 1363 file = fget(fd);
1364 if (!file) 1364 if (!file)
1365 goto out; 1365 goto out;
1366 if (is_file_hugepages(file))
1367 len = ALIGN(len, huge_page_size(hstate_file(file)));
1366 } else if (flags & MAP_HUGETLB) { 1368 } else if (flags & MAP_HUGETLB) {
1367 struct user_struct *user = NULL; 1369 struct user_struct *user = NULL;
1370
1371 len = ALIGN(len, huge_page_size(hstate_sizelog(
1372 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)));
1368 /* 1373 /*
1369 * VM_NORESERVE is used because the reservations will be 1374 * VM_NORESERVE is used because the reservations will be
1370 * taken when vm_ops->mmap() is called 1375 * taken when vm_ops->mmap() is called
1371 * A dummy user value is used because we are not locking 1376 * A dummy user value is used because we are not locking
1372 * memory so no accounting is necessary 1377 * memory so no accounting is necessary
1373 */ 1378 */
1374 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1379 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1375 VM_NORESERVE, 1380 VM_NORESERVE,
1376 &user, HUGETLB_ANONHUGE_INODE, 1381 &user, HUGETLB_ANONHUGE_INODE,
1377 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1382 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1378 if (IS_ERR(file)) 1383 if (IS_ERR(file))
1379 return PTR_ERR(file); 1384 return PTR_ERR(file);
1380 } 1385 }
1381 1386
1382 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1387 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1383 1388
1384 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1389 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1385 if (file) 1390 if (file)
1386 fput(file); 1391 fput(file);
1387 out: 1392 out:
1388 return retval; 1393 return retval;
1389 } 1394 }
1390 1395
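The two hunks above are the heart of this commit for sys_mmap_pgoff(): len is rounded up to the huge page size of the backing file (or of the size requested via MAP_HUGE_SHIFT) before it reaches vm_mmap_pgoff(). As an illustration only, the userspace sketch below requests an anonymous MAP_HUGETLB mapping whose length is deliberately not hugepage aligned; per the commit message this used to fail with -EINVAL and now succeeds. A 2 MB default huge page size and a preconfigured huge page pool are assumptions, not guarantees of the diff.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        /* Deliberately NOT a multiple of the assumed 2 MB huge page size. */
        size_t len = (2UL << 20) + 4096;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
                /* Before this fix the kernel returned EINVAL here. */
                printf("mmap: %s\n", strerror(errno));
                return 1;
        }
        printf("mapped %zu bytes at %p\n", len, p);

        /* The kernel rounded the mapping up to whole huge pages, so unmap
         * with the rounded length (2 MB huge page size assumed). */
        munmap(p, (len + (2UL << 20) - 1) & ~((2UL << 20) - 1));
        return 0;
}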
1391 #ifdef __ARCH_WANT_SYS_OLD_MMAP 1396 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1392 struct mmap_arg_struct { 1397 struct mmap_arg_struct {
1393 unsigned long addr; 1398 unsigned long addr;
1394 unsigned long len; 1399 unsigned long len;
1395 unsigned long prot; 1400 unsigned long prot;
1396 unsigned long flags; 1401 unsigned long flags;
1397 unsigned long fd; 1402 unsigned long fd;
1398 unsigned long offset; 1403 unsigned long offset;
1399 }; 1404 };
1400 1405
1401 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 1406 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1402 { 1407 {
1403 struct mmap_arg_struct a; 1408 struct mmap_arg_struct a;
1404 1409
1405 if (copy_from_user(&a, arg, sizeof(a))) 1410 if (copy_from_user(&a, arg, sizeof(a)))
1406 return -EFAULT; 1411 return -EFAULT;
1407 if (a.offset & ~PAGE_MASK) 1412 if (a.offset & ~PAGE_MASK)
1408 return -EINVAL; 1413 return -EINVAL;
1409 1414
1410 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1415 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1411 a.offset >> PAGE_SHIFT); 1416 a.offset >> PAGE_SHIFT);
1412 } 1417 }
1413 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1418 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1414 1419
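For reference, the legacy old_mmap() entry point above takes a byte offset, rejects anything that is not page aligned, and converts it to the page offset that sys_mmap_pgoff() expects. A tiny standalone sketch of that conversion, with invented values and a hard-coded 4 KB page size:

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;
        unsigned long page_mask  = ~((1UL << page_shift) - 1);
        unsigned long offset     = 3UL << page_shift;   /* 12 KB, aligned */

        if (offset & ~page_mask)
                printf("old_mmap would return -EINVAL\n");
        else
                printf("pgoff passed on = %lu\n", offset >> page_shift);
        return 0;
}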
1415 /* 1420 /*
1416 * Some shared mappings will want the pages marked read-only 1421 * Some shared mappings will want the pages marked read-only
1417 * to track write events. If so, we'll downgrade vm_page_prot 1422 * to track write events. If so, we'll downgrade vm_page_prot
1418 * to the private version (using protection_map[] without the 1423 * to the private version (using protection_map[] without the
1419 * VM_SHARED bit). 1424 * VM_SHARED bit).
1420 */ 1425 */
1421 int vma_wants_writenotify(struct vm_area_struct *vma) 1426 int vma_wants_writenotify(struct vm_area_struct *vma)
1422 { 1427 {
1423 vm_flags_t vm_flags = vma->vm_flags; 1428 vm_flags_t vm_flags = vma->vm_flags;
1424 1429
1425 /* If it was private or non-writable, the write bit is already clear */ 1430 /* If it was private or non-writable, the write bit is already clear */
1426 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1431 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1427 return 0; 1432 return 0;
1428 1433
1429 /* The backer wishes to know when pages are first written to? */ 1434 /* The backer wishes to know when pages are first written to? */
1430 if (vma->vm_ops && vma->vm_ops->page_mkwrite) 1435 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1431 return 1; 1436 return 1;
1432 1437
1433 /* The open routine did something to the protections already? */ 1438 /* The open routine did something to the protections already? */
1434 if (pgprot_val(vma->vm_page_prot) != 1439 if (pgprot_val(vma->vm_page_prot) !=
1435 pgprot_val(vm_get_page_prot(vm_flags))) 1440 pgprot_val(vm_get_page_prot(vm_flags)))
1436 return 0; 1441 return 0;
1437 1442
1438 /* Specialty mapping? */ 1443 /* Specialty mapping? */
1439 if (vm_flags & VM_PFNMAP) 1444 if (vm_flags & VM_PFNMAP)
1440 return 0; 1445 return 0;
1441 1446
1442 /* Can the mapping track the dirty pages? */ 1447 /* Can the mapping track the dirty pages? */
1443 return vma->vm_file && vma->vm_file->f_mapping && 1448 return vma->vm_file && vma->vm_file->f_mapping &&
1444 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1449 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1445 } 1450 }
1446 1451
1447 /* 1452 /*
1448 * We account for memory if it's a private writeable mapping, 1453 * We account for memory if it's a private writeable mapping,
1449 * not hugepages and VM_NORESERVE wasn't set. 1454 * not hugepages and VM_NORESERVE wasn't set.
1450 */ 1455 */
1451 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) 1456 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1452 { 1457 {
1453 /* 1458 /*
1454 * hugetlb has its own accounting separate from the core VM 1459 * hugetlb has its own accounting separate from the core VM
1455 * VM_HUGETLB may not be set yet so we cannot check for that flag. 1460 * VM_HUGETLB may not be set yet so we cannot check for that flag.
1456 */ 1461 */
1457 if (file && is_file_hugepages(file)) 1462 if (file && is_file_hugepages(file))
1458 return 0; 1463 return 0;
1459 1464
1460 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 1465 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1461 } 1466 }
1462 1467
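The predicate in accountable_mapping() is a single mask test: only a private, writable mapping without VM_NORESERVE gets charged. The standalone sketch below replays that test with local stand-in flag bits (EX_VM_*), which are illustrative values and not the kernel's definitions.

#include <stdio.h>

/* Local stand-in flag bits; the real values live in the kernel headers. */
#define EX_VM_WRITE     0x1UL
#define EX_VM_SHARED    0x2UL
#define EX_VM_NORESERVE 0x4UL

static int ex_accountable(unsigned long vm_flags)
{
        return (vm_flags & (EX_VM_NORESERVE | EX_VM_SHARED | EX_VM_WRITE))
                == EX_VM_WRITE;
}

int main(void)
{
        printf("private+write           -> %d\n", ex_accountable(EX_VM_WRITE));
        printf("shared+write            -> %d\n", ex_accountable(EX_VM_WRITE | EX_VM_SHARED));
        printf("private+write+noreserve -> %d\n", ex_accountable(EX_VM_WRITE | EX_VM_NORESERVE));
        printf("private read-only       -> %d\n", ex_accountable(0));
        return 0;
}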
1463 unsigned long mmap_region(struct file *file, unsigned long addr, 1468 unsigned long mmap_region(struct file *file, unsigned long addr,
1464 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) 1469 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1465 { 1470 {
1466 struct mm_struct *mm = current->mm; 1471 struct mm_struct *mm = current->mm;
1467 struct vm_area_struct *vma, *prev; 1472 struct vm_area_struct *vma, *prev;
1468 int correct_wcount = 0; 1473 int correct_wcount = 0;
1469 int error; 1474 int error;
1470 struct rb_node **rb_link, *rb_parent; 1475 struct rb_node **rb_link, *rb_parent;
1471 unsigned long charged = 0; 1476 unsigned long charged = 0;
1472 struct inode *inode = file ? file_inode(file) : NULL; 1477 struct inode *inode = file ? file_inode(file) : NULL;
1473 1478
1474 /* Check against address space limit. */ 1479 /* Check against address space limit. */
1475 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1480 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
1476 unsigned long nr_pages; 1481 unsigned long nr_pages;
1477 1482
1478 /* 1483 /*
1479 * MAP_FIXED may remove pages of mappings that intersect with 1484 * MAP_FIXED may remove pages of mappings that intersect with
1480 * requested mapping. Account for the pages it would unmap. 1485 * requested mapping. Account for the pages it would unmap.
1481 */ 1486 */
1482 if (!(vm_flags & MAP_FIXED)) 1487 if (!(vm_flags & MAP_FIXED))
1483 return -ENOMEM; 1488 return -ENOMEM;
1484 1489
1485 nr_pages = count_vma_pages_range(mm, addr, addr + len); 1490 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1486 1491
1487 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) 1492 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
1488 return -ENOMEM; 1493 return -ENOMEM;
1489 } 1494 }
1490 1495
1491 /* Clear old maps */ 1496 /* Clear old maps */
1492 error = -ENOMEM; 1497 error = -ENOMEM;
1493 munmap_back: 1498 munmap_back:
1494 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1499 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1495 if (do_munmap(mm, addr, len)) 1500 if (do_munmap(mm, addr, len))
1496 return -ENOMEM; 1501 return -ENOMEM;
1497 goto munmap_back; 1502 goto munmap_back;
1498 } 1503 }
1499 1504
1500 /* 1505 /*
1501 * Private writable mapping: check memory availability 1506 * Private writable mapping: check memory availability
1502 */ 1507 */
1503 if (accountable_mapping(file, vm_flags)) { 1508 if (accountable_mapping(file, vm_flags)) {
1504 charged = len >> PAGE_SHIFT; 1509 charged = len >> PAGE_SHIFT;
1505 if (security_vm_enough_memory_mm(mm, charged)) 1510 if (security_vm_enough_memory_mm(mm, charged))
1506 return -ENOMEM; 1511 return -ENOMEM;
1507 vm_flags |= VM_ACCOUNT; 1512 vm_flags |= VM_ACCOUNT;
1508 } 1513 }
1509 1514
1510 /* 1515 /*
1511 * Can we just expand an old mapping? 1516 * Can we just expand an old mapping?
1512 */ 1517 */
1513 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1518 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1514 if (vma) 1519 if (vma)
1515 goto out; 1520 goto out;
1516 1521
1517 /* 1522 /*
1518 * Determine the object being mapped and call the appropriate 1523 * Determine the object being mapped and call the appropriate
1519 * specific mapper. The address has already been validated, but 1524 * specific mapper. The address has already been validated, but
1520 * not unmapped; the old mappings have been removed from the list. 1525 * not unmapped; the old mappings have been removed from the list.
1521 */ 1526 */
1522 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 1527 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1523 if (!vma) { 1528 if (!vma) {
1524 error = -ENOMEM; 1529 error = -ENOMEM;
1525 goto unacct_error; 1530 goto unacct_error;
1526 } 1531 }
1527 1532
1528 vma->vm_mm = mm; 1533 vma->vm_mm = mm;
1529 vma->vm_start = addr; 1534 vma->vm_start = addr;
1530 vma->vm_end = addr + len; 1535 vma->vm_end = addr + len;
1531 vma->vm_flags = vm_flags; 1536 vma->vm_flags = vm_flags;
1532 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1537 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1533 vma->vm_pgoff = pgoff; 1538 vma->vm_pgoff = pgoff;
1534 INIT_LIST_HEAD(&vma->anon_vma_chain); 1539 INIT_LIST_HEAD(&vma->anon_vma_chain);
1535 1540
1536 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ 1541 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1537 1542
1538 if (file) { 1543 if (file) {
1539 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1544 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1540 goto free_vma; 1545 goto free_vma;
1541 if (vm_flags & VM_DENYWRITE) { 1546 if (vm_flags & VM_DENYWRITE) {
1542 error = deny_write_access(file); 1547 error = deny_write_access(file);
1543 if (error) 1548 if (error)
1544 goto free_vma; 1549 goto free_vma;
1545 correct_wcount = 1; 1550 correct_wcount = 1;
1546 } 1551 }
1547 vma->vm_file = get_file(file); 1552 vma->vm_file = get_file(file);
1548 error = file->f_op->mmap(file, vma); 1553 error = file->f_op->mmap(file, vma);
1549 if (error) 1554 if (error)
1550 goto unmap_and_free_vma; 1555 goto unmap_and_free_vma;
1551 1556
1552 /* Can addr have changed?? 1557 /* Can addr have changed??
1553 * 1558 *
1554 * Answer: Yes, several device drivers can do it in their 1559 * Answer: Yes, several device drivers can do it in their
1555 * f_op->mmap method. -DaveM 1560 * f_op->mmap method. -DaveM
1556 * Bug: If addr is changed, prev, rb_link, rb_parent should 1561 * Bug: If addr is changed, prev, rb_link, rb_parent should
1557 * be updated for vma_link() 1562 * be updated for vma_link()
1558 */ 1563 */
1559 WARN_ON_ONCE(addr != vma->vm_start); 1564 WARN_ON_ONCE(addr != vma->vm_start);
1560 1565
1561 addr = vma->vm_start; 1566 addr = vma->vm_start;
1562 pgoff = vma->vm_pgoff; 1567 pgoff = vma->vm_pgoff;
1563 vm_flags = vma->vm_flags; 1568 vm_flags = vma->vm_flags;
1564 } else if (vm_flags & VM_SHARED) { 1569 } else if (vm_flags & VM_SHARED) {
1565 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) 1570 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1566 goto free_vma; 1571 goto free_vma;
1567 error = shmem_zero_setup(vma); 1572 error = shmem_zero_setup(vma);
1568 if (error) 1573 if (error)
1569 goto free_vma; 1574 goto free_vma;
1570 } 1575 }
1571 1576
1572 if (vma_wants_writenotify(vma)) { 1577 if (vma_wants_writenotify(vma)) {
1573 pgprot_t pprot = vma->vm_page_prot; 1578 pgprot_t pprot = vma->vm_page_prot;
1574 1579
1575 /* Can vma->vm_page_prot have changed?? 1580 /* Can vma->vm_page_prot have changed??
1576 * 1581 *
1577 * Answer: Yes, drivers may have changed it in their 1582 * Answer: Yes, drivers may have changed it in their
1578 * f_op->mmap method. 1583 * f_op->mmap method.
1579 * 1584 *
1580 * Ensures that vmas marked as uncached stay that way. 1585 * Ensures that vmas marked as uncached stay that way.
1581 */ 1586 */
1582 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1587 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1583 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) 1588 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1584 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1589 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1585 } 1590 }
1586 1591
1587 vma_link(mm, vma, prev, rb_link, rb_parent); 1592 vma_link(mm, vma, prev, rb_link, rb_parent);
1588 file = vma->vm_file; 1593 file = vma->vm_file;
1589 1594
1590 /* Once vma denies write, undo our temporary denial count */ 1595 /* Once vma denies write, undo our temporary denial count */
1591 if (correct_wcount) 1596 if (correct_wcount)
1592 atomic_inc(&inode->i_writecount); 1597 atomic_inc(&inode->i_writecount);
1593 out: 1598 out:
1594 perf_event_mmap(vma); 1599 perf_event_mmap(vma);
1595 1600
1596 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1601 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1597 if (vm_flags & VM_LOCKED) { 1602 if (vm_flags & VM_LOCKED) {
1598 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || 1603 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1599 vma == get_gate_vma(current->mm))) 1604 vma == get_gate_vma(current->mm)))
1600 mm->locked_vm += (len >> PAGE_SHIFT); 1605 mm->locked_vm += (len >> PAGE_SHIFT);
1601 else 1606 else
1602 vma->vm_flags &= ~VM_LOCKED; 1607 vma->vm_flags &= ~VM_LOCKED;
1603 } 1608 }
1604 1609
1605 if (file) 1610 if (file)
1606 uprobe_mmap(vma); 1611 uprobe_mmap(vma);
1607 1612
1608 return addr; 1613 return addr;
1609 1614
1610 unmap_and_free_vma: 1615 unmap_and_free_vma:
1611 if (correct_wcount) 1616 if (correct_wcount)
1612 atomic_inc(&inode->i_writecount); 1617 atomic_inc(&inode->i_writecount);
1613 vma->vm_file = NULL; 1618 vma->vm_file = NULL;
1614 fput(file); 1619 fput(file);
1615 1620
1616 /* Undo any partial mapping done by a device driver. */ 1621 /* Undo any partial mapping done by a device driver. */
1617 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); 1622 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1618 charged = 0; 1623 charged = 0;
1619 free_vma: 1624 free_vma:
1620 kmem_cache_free(vm_area_cachep, vma); 1625 kmem_cache_free(vm_area_cachep, vma);
1621 unacct_error: 1626 unacct_error:
1622 if (charged) 1627 if (charged)
1623 vm_unacct_memory(charged); 1628 vm_unacct_memory(charged);
1624 return error; 1629 return error;
1625 } 1630 }
1626 1631
1627 unsigned long unmapped_area(struct vm_unmapped_area_info *info) 1632 unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1628 { 1633 {
1629 /* 1634 /*
1630 * We implement the search by looking for an rbtree node that 1635 * We implement the search by looking for an rbtree node that
1631 * immediately follows a suitable gap. That is, 1636 * immediately follows a suitable gap. That is,
1632 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; 1637 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1633 * - gap_end = vma->vm_start >= info->low_limit + length; 1638 * - gap_end = vma->vm_start >= info->low_limit + length;
1634 * - gap_end - gap_start >= length 1639 * - gap_end - gap_start >= length
1635 */ 1640 */
1636 1641
1637 struct mm_struct *mm = current->mm; 1642 struct mm_struct *mm = current->mm;
1638 struct vm_area_struct *vma; 1643 struct vm_area_struct *vma;
1639 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1644 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1640 1645
1641 /* Adjust search length to account for worst case alignment overhead */ 1646 /* Adjust search length to account for worst case alignment overhead */
1642 length = info->length + info->align_mask; 1647 length = info->length + info->align_mask;
1643 if (length < info->length) 1648 if (length < info->length)
1644 return -ENOMEM; 1649 return -ENOMEM;
1645 1650
1646 /* Adjust search limits by the desired length */ 1651 /* Adjust search limits by the desired length */
1647 if (info->high_limit < length) 1652 if (info->high_limit < length)
1648 return -ENOMEM; 1653 return -ENOMEM;
1649 high_limit = info->high_limit - length; 1654 high_limit = info->high_limit - length;
1650 1655
1651 if (info->low_limit > high_limit) 1656 if (info->low_limit > high_limit)
1652 return -ENOMEM; 1657 return -ENOMEM;
1653 low_limit = info->low_limit + length; 1658 low_limit = info->low_limit + length;
1654 1659
1655 /* Check if rbtree root looks promising */ 1660 /* Check if rbtree root looks promising */
1656 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1661 if (RB_EMPTY_ROOT(&mm->mm_rb))
1657 goto check_highest; 1662 goto check_highest;
1658 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1663 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1659 if (vma->rb_subtree_gap < length) 1664 if (vma->rb_subtree_gap < length)
1660 goto check_highest; 1665 goto check_highest;
1661 1666
1662 while (true) { 1667 while (true) {
1663 /* Visit left subtree if it looks promising */ 1668 /* Visit left subtree if it looks promising */
1664 gap_end = vma->vm_start; 1669 gap_end = vma->vm_start;
1665 if (gap_end >= low_limit && vma->vm_rb.rb_left) { 1670 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1666 struct vm_area_struct *left = 1671 struct vm_area_struct *left =
1667 rb_entry(vma->vm_rb.rb_left, 1672 rb_entry(vma->vm_rb.rb_left,
1668 struct vm_area_struct, vm_rb); 1673 struct vm_area_struct, vm_rb);
1669 if (left->rb_subtree_gap >= length) { 1674 if (left->rb_subtree_gap >= length) {
1670 vma = left; 1675 vma = left;
1671 continue; 1676 continue;
1672 } 1677 }
1673 } 1678 }
1674 1679
1675 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1680 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1676 check_current: 1681 check_current:
1677 /* Check if current node has a suitable gap */ 1682 /* Check if current node has a suitable gap */
1678 if (gap_start > high_limit) 1683 if (gap_start > high_limit)
1679 return -ENOMEM; 1684 return -ENOMEM;
1680 if (gap_end >= low_limit && gap_end - gap_start >= length) 1685 if (gap_end >= low_limit && gap_end - gap_start >= length)
1681 goto found; 1686 goto found;
1682 1687
1683 /* Visit right subtree if it looks promising */ 1688 /* Visit right subtree if it looks promising */
1684 if (vma->vm_rb.rb_right) { 1689 if (vma->vm_rb.rb_right) {
1685 struct vm_area_struct *right = 1690 struct vm_area_struct *right =
1686 rb_entry(vma->vm_rb.rb_right, 1691 rb_entry(vma->vm_rb.rb_right,
1687 struct vm_area_struct, vm_rb); 1692 struct vm_area_struct, vm_rb);
1688 if (right->rb_subtree_gap >= length) { 1693 if (right->rb_subtree_gap >= length) {
1689 vma = right; 1694 vma = right;
1690 continue; 1695 continue;
1691 } 1696 }
1692 } 1697 }
1693 1698
1694 /* Go back up the rbtree to find next candidate node */ 1699 /* Go back up the rbtree to find next candidate node */
1695 while (true) { 1700 while (true) {
1696 struct rb_node *prev = &vma->vm_rb; 1701 struct rb_node *prev = &vma->vm_rb;
1697 if (!rb_parent(prev)) 1702 if (!rb_parent(prev))
1698 goto check_highest; 1703 goto check_highest;
1699 vma = rb_entry(rb_parent(prev), 1704 vma = rb_entry(rb_parent(prev),
1700 struct vm_area_struct, vm_rb); 1705 struct vm_area_struct, vm_rb);
1701 if (prev == vma->vm_rb.rb_left) { 1706 if (prev == vma->vm_rb.rb_left) {
1702 gap_start = vma->vm_prev->vm_end; 1707 gap_start = vma->vm_prev->vm_end;
1703 gap_end = vma->vm_start; 1708 gap_end = vma->vm_start;
1704 goto check_current; 1709 goto check_current;
1705 } 1710 }
1706 } 1711 }
1707 } 1712 }
1708 1713
1709 check_highest: 1714 check_highest:
1710 /* Check highest gap, which does not precede any rbtree node */ 1715 /* Check highest gap, which does not precede any rbtree node */
1711 gap_start = mm->highest_vm_end; 1716 gap_start = mm->highest_vm_end;
1712 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ 1717 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1713 if (gap_start > high_limit) 1718 if (gap_start > high_limit)
1714 return -ENOMEM; 1719 return -ENOMEM;
1715 1720
1716 found: 1721 found:
1717 /* We found a suitable gap. Clip it with the original low_limit. */ 1722 /* We found a suitable gap. Clip it with the original low_limit. */
1718 if (gap_start < info->low_limit) 1723 if (gap_start < info->low_limit)
1719 gap_start = info->low_limit; 1724 gap_start = info->low_limit;
1720 1725
1721 /* Adjust gap address to the desired alignment */ 1726 /* Adjust gap address to the desired alignment */
1722 gap_start += (info->align_offset - gap_start) & info->align_mask; 1727 gap_start += (info->align_offset - gap_start) & info->align_mask;
1723 1728
1724 VM_BUG_ON(gap_start + info->length > info->high_limit); 1729 VM_BUG_ON(gap_start + info->length > info->high_limit);
1725 VM_BUG_ON(gap_start + info->length > gap_end); 1730 VM_BUG_ON(gap_start + info->length > gap_end);
1726 return gap_start; 1731 return gap_start;
1727 } 1732 }
1728 1733
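The final step of unmapped_area() rounds gap_start up to the next address that is congruent to align_offset modulo (align_mask + 1), using the expression gap_start += (align_offset - gap_start) & align_mask. The sketch below runs that expression on made-up numbers (a 2 MB alignment mask) purely to show the arithmetic.

#include <stdio.h>

int main(void)
{
        unsigned long gap_start    = 0x7f0000003000UL;  /* invented gap start */
        unsigned long align_mask   = 0x1fffffUL;        /* 2 MB alignment */
        unsigned long align_offset = 0;

        gap_start += (align_offset - gap_start) & align_mask;
        printf("aligned gap_start = %#lx\n", gap_start); /* 0x7f0000200000 */
        return 0;
}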
1729 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) 1734 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1730 { 1735 {
1731 struct mm_struct *mm = current->mm; 1736 struct mm_struct *mm = current->mm;
1732 struct vm_area_struct *vma; 1737 struct vm_area_struct *vma;
1733 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1738 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1734 1739
1735 /* Adjust search length to account for worst case alignment overhead */ 1740 /* Adjust search length to account for worst case alignment overhead */
1736 length = info->length + info->align_mask; 1741 length = info->length + info->align_mask;
1737 if (length < info->length) 1742 if (length < info->length)
1738 return -ENOMEM; 1743 return -ENOMEM;
1739 1744
1740 /* 1745 /*
1741 * Adjust search limits by the desired length. 1746 * Adjust search limits by the desired length.
1742 * See implementation comment at top of unmapped_area(). 1747 * See implementation comment at top of unmapped_area().
1743 */ 1748 */
1744 gap_end = info->high_limit; 1749 gap_end = info->high_limit;
1745 if (gap_end < length) 1750 if (gap_end < length)
1746 return -ENOMEM; 1751 return -ENOMEM;
1747 high_limit = gap_end - length; 1752 high_limit = gap_end - length;
1748 1753
1749 if (info->low_limit > high_limit) 1754 if (info->low_limit > high_limit)
1750 return -ENOMEM; 1755 return -ENOMEM;
1751 low_limit = info->low_limit + length; 1756 low_limit = info->low_limit + length;
1752 1757
1753 /* Check highest gap, which does not precede any rbtree node */ 1758 /* Check highest gap, which does not precede any rbtree node */
1754 gap_start = mm->highest_vm_end; 1759 gap_start = mm->highest_vm_end;
1755 if (gap_start <= high_limit) 1760 if (gap_start <= high_limit)
1756 goto found_highest; 1761 goto found_highest;
1757 1762
1758 /* Check if rbtree root looks promising */ 1763 /* Check if rbtree root looks promising */
1759 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1764 if (RB_EMPTY_ROOT(&mm->mm_rb))
1760 return -ENOMEM; 1765 return -ENOMEM;
1761 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1766 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1762 if (vma->rb_subtree_gap < length) 1767 if (vma->rb_subtree_gap < length)
1763 return -ENOMEM; 1768 return -ENOMEM;
1764 1769
1765 while (true) { 1770 while (true) {
1766 /* Visit right subtree if it looks promising */ 1771 /* Visit right subtree if it looks promising */
1767 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1772 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1768 if (gap_start <= high_limit && vma->vm_rb.rb_right) { 1773 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1769 struct vm_area_struct *right = 1774 struct vm_area_struct *right =
1770 rb_entry(vma->vm_rb.rb_right, 1775 rb_entry(vma->vm_rb.rb_right,
1771 struct vm_area_struct, vm_rb); 1776 struct vm_area_struct, vm_rb);
1772 if (right->rb_subtree_gap >= length) { 1777 if (right->rb_subtree_gap >= length) {
1773 vma = right; 1778 vma = right;
1774 continue; 1779 continue;
1775 } 1780 }
1776 } 1781 }
1777 1782
1778 check_current: 1783 check_current:
1779 /* Check if current node has a suitable gap */ 1784 /* Check if current node has a suitable gap */
1780 gap_end = vma->vm_start; 1785 gap_end = vma->vm_start;
1781 if (gap_end < low_limit) 1786 if (gap_end < low_limit)
1782 return -ENOMEM; 1787 return -ENOMEM;
1783 if (gap_start <= high_limit && gap_end - gap_start >= length) 1788 if (gap_start <= high_limit && gap_end - gap_start >= length)
1784 goto found; 1789 goto found;
1785 1790
1786 /* Visit left subtree if it looks promising */ 1791 /* Visit left subtree if it looks promising */
1787 if (vma->vm_rb.rb_left) { 1792 if (vma->vm_rb.rb_left) {
1788 struct vm_area_struct *left = 1793 struct vm_area_struct *left =
1789 rb_entry(vma->vm_rb.rb_left, 1794 rb_entry(vma->vm_rb.rb_left,
1790 struct vm_area_struct, vm_rb); 1795 struct vm_area_struct, vm_rb);
1791 if (left->rb_subtree_gap >= length) { 1796 if (left->rb_subtree_gap >= length) {
1792 vma = left; 1797 vma = left;
1793 continue; 1798 continue;
1794 } 1799 }
1795 } 1800 }
1796 1801
1797 /* Go back up the rbtree to find next candidate node */ 1802 /* Go back up the rbtree to find next candidate node */
1798 while (true) { 1803 while (true) {
1799 struct rb_node *prev = &vma->vm_rb; 1804 struct rb_node *prev = &vma->vm_rb;
1800 if (!rb_parent(prev)) 1805 if (!rb_parent(prev))
1801 return -ENOMEM; 1806 return -ENOMEM;
1802 vma = rb_entry(rb_parent(prev), 1807 vma = rb_entry(rb_parent(prev),
1803 struct vm_area_struct, vm_rb); 1808 struct vm_area_struct, vm_rb);
1804 if (prev == vma->vm_rb.rb_right) { 1809 if (prev == vma->vm_rb.rb_right) {
1805 gap_start = vma->vm_prev ? 1810 gap_start = vma->vm_prev ?
1806 vma->vm_prev->vm_end : 0; 1811 vma->vm_prev->vm_end : 0;
1807 goto check_current; 1812 goto check_current;
1808 } 1813 }
1809 } 1814 }
1810 } 1815 }
1811 1816
1812 found: 1817 found:
1813 /* We found a suitable gap. Clip it with the original high_limit. */ 1818 /* We found a suitable gap. Clip it with the original high_limit. */
1814 if (gap_end > info->high_limit) 1819 if (gap_end > info->high_limit)
1815 gap_end = info->high_limit; 1820 gap_end = info->high_limit;
1816 1821
1817 found_highest: 1822 found_highest:
1818 /* Compute highest gap address at the desired alignment */ 1823 /* Compute highest gap address at the desired alignment */
1819 gap_end -= info->length; 1824 gap_end -= info->length;
1820 gap_end -= (gap_end - info->align_offset) & info->align_mask; 1825 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1821 1826
1822 VM_BUG_ON(gap_end < info->low_limit); 1827 VM_BUG_ON(gap_end < info->low_limit);
1823 VM_BUG_ON(gap_end < gap_start); 1828 VM_BUG_ON(gap_end < gap_start);
1824 return gap_end; 1829 return gap_end;
1825 } 1830 }
1826 1831
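unmapped_area_topdown() uses the mirror image of the round-up shown after unmapped_area(): after reserving room for the request, gap_end -= (gap_end - align_offset) & align_mask rounds the candidate address down to the alignment boundary. Again the numbers below are invented and only demonstrate the arithmetic.

#include <stdio.h>

int main(void)
{
        unsigned long gap_end      = 0x7ffff7a01000UL;  /* invented gap end */
        unsigned long length       = 0x200000UL;        /* 2 MB request */
        unsigned long align_mask   = 0x1fffffUL;        /* 2 MB alignment */
        unsigned long align_offset = 0;

        gap_end -= length;
        gap_end -= (gap_end - align_offset) & align_mask;
        printf("topdown aligned start = %#lx\n", gap_end); /* 0x7ffff7800000 */
        return 0;
}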
1827 /* Get an address range which is currently unmapped. 1832 /* Get an address range which is currently unmapped.
1828 * For shmat() with addr=0. 1833 * For shmat() with addr=0.
1829 * 1834 *
1830 * Ugly calling convention alert: 1835 * Ugly calling convention alert:
1831 * Return value with the low bits set means error value, 1836 * Return value with the low bits set means error value,
1832 * ie 1837 * ie
1833 * if (ret & ~PAGE_MASK) 1838 * if (ret & ~PAGE_MASK)
1834 * error = ret; 1839 * error = ret;
1835 * 1840 *
1836 * This function "knows" that -ENOMEM has the bits set. 1841 * This function "knows" that -ENOMEM has the bits set.
1837 */ 1842 */
1838 #ifndef HAVE_ARCH_UNMAPPED_AREA 1843 #ifndef HAVE_ARCH_UNMAPPED_AREA
1839 unsigned long 1844 unsigned long
1840 arch_get_unmapped_area(struct file *filp, unsigned long addr, 1845 arch_get_unmapped_area(struct file *filp, unsigned long addr,
1841 unsigned long len, unsigned long pgoff, unsigned long flags) 1846 unsigned long len, unsigned long pgoff, unsigned long flags)
1842 { 1847 {
1843 struct mm_struct *mm = current->mm; 1848 struct mm_struct *mm = current->mm;
1844 struct vm_area_struct *vma; 1849 struct vm_area_struct *vma;
1845 struct vm_unmapped_area_info info; 1850 struct vm_unmapped_area_info info;
1846 1851
1847 if (len > TASK_SIZE) 1852 if (len > TASK_SIZE)
1848 return -ENOMEM; 1853 return -ENOMEM;
1849 1854
1850 if (flags & MAP_FIXED) 1855 if (flags & MAP_FIXED)
1851 return addr; 1856 return addr;
1852 1857
1853 if (addr) { 1858 if (addr) {
1854 addr = PAGE_ALIGN(addr); 1859 addr = PAGE_ALIGN(addr);
1855 vma = find_vma(mm, addr); 1860 vma = find_vma(mm, addr);
1856 if (TASK_SIZE - len >= addr && 1861 if (TASK_SIZE - len >= addr &&
1857 (!vma || addr + len <= vma->vm_start)) 1862 (!vma || addr + len <= vma->vm_start))
1858 return addr; 1863 return addr;
1859 } 1864 }
1860 1865
1861 info.flags = 0; 1866 info.flags = 0;
1862 info.length = len; 1867 info.length = len;
1863 info.low_limit = TASK_UNMAPPED_BASE; 1868 info.low_limit = TASK_UNMAPPED_BASE;
1864 info.high_limit = TASK_SIZE; 1869 info.high_limit = TASK_SIZE;
1865 info.align_mask = 0; 1870 info.align_mask = 0;
1866 return vm_unmapped_area(&info); 1871 return vm_unmapped_area(&info);
1867 } 1872 }
1868 #endif 1873 #endif
1869 1874
1870 void arch_unmap_area(struct mm_struct *mm, unsigned long addr) 1875 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1871 { 1876 {
1872 /* 1877 /*
1873 * Is this a new hole at the lowest possible address? 1878 * Is this a new hole at the lowest possible address?
1874 */ 1879 */
1875 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) 1880 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1876 mm->free_area_cache = addr; 1881 mm->free_area_cache = addr;
1877 } 1882 }
1878 1883
1879 /* 1884 /*
1880 * This mmap-allocator allocates new areas top-down from below the 1885 * This mmap-allocator allocates new areas top-down from below the
1881 * stack's low limit (the base): 1886 * stack's low limit (the base):
1882 */ 1887 */
1883 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 1888 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1884 unsigned long 1889 unsigned long
1885 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 1890 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1886 const unsigned long len, const unsigned long pgoff, 1891 const unsigned long len, const unsigned long pgoff,
1887 const unsigned long flags) 1892 const unsigned long flags)
1888 { 1893 {
1889 struct vm_area_struct *vma; 1894 struct vm_area_struct *vma;
1890 struct mm_struct *mm = current->mm; 1895 struct mm_struct *mm = current->mm;
1891 unsigned long addr = addr0; 1896 unsigned long addr = addr0;
1892 struct vm_unmapped_area_info info; 1897 struct vm_unmapped_area_info info;
1893 1898
1894 /* requested length too big for entire address space */ 1899 /* requested length too big for entire address space */
1895 if (len > TASK_SIZE) 1900 if (len > TASK_SIZE)
1896 return -ENOMEM; 1901 return -ENOMEM;
1897 1902
1898 if (flags & MAP_FIXED) 1903 if (flags & MAP_FIXED)
1899 return addr; 1904 return addr;
1900 1905
1901 /* requesting a specific address */ 1906 /* requesting a specific address */
1902 if (addr) { 1907 if (addr) {
1903 addr = PAGE_ALIGN(addr); 1908 addr = PAGE_ALIGN(addr);
1904 vma = find_vma(mm, addr); 1909 vma = find_vma(mm, addr);
1905 if (TASK_SIZE - len >= addr && 1910 if (TASK_SIZE - len >= addr &&
1906 (!vma || addr + len <= vma->vm_start)) 1911 (!vma || addr + len <= vma->vm_start))
1907 return addr; 1912 return addr;
1908 } 1913 }
1909 1914
1910 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 1915 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1911 info.length = len; 1916 info.length = len;
1912 info.low_limit = PAGE_SIZE; 1917 info.low_limit = PAGE_SIZE;
1913 info.high_limit = mm->mmap_base; 1918 info.high_limit = mm->mmap_base;
1914 info.align_mask = 0; 1919 info.align_mask = 0;
1915 addr = vm_unmapped_area(&info); 1920 addr = vm_unmapped_area(&info);
1916 1921
1917 /* 1922 /*
1918 * A failed mmap() very likely causes application failure, 1923 * A failed mmap() very likely causes application failure,
1919 * so fall back to the bottom-up function here. This scenario 1924 * so fall back to the bottom-up function here. This scenario
1920 * can happen with large stack limits and large mmap() 1925 * can happen with large stack limits and large mmap()
1921 * allocations. 1926 * allocations.
1922 */ 1927 */
1923 if (addr & ~PAGE_MASK) { 1928 if (addr & ~PAGE_MASK) {
1924 VM_BUG_ON(addr != -ENOMEM); 1929 VM_BUG_ON(addr != -ENOMEM);
1925 info.flags = 0; 1930 info.flags = 0;
1926 info.low_limit = TASK_UNMAPPED_BASE; 1931 info.low_limit = TASK_UNMAPPED_BASE;
1927 info.high_limit = TASK_SIZE; 1932 info.high_limit = TASK_SIZE;
1928 addr = vm_unmapped_area(&info); 1933 addr = vm_unmapped_area(&info);
1929 } 1934 }
1930 1935
1931 return addr; 1936 return addr;
1932 } 1937 }
1933 #endif 1938 #endif
1934 1939
1935 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) 1940 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1936 { 1941 {
1937 /* 1942 /*
1938 * Is this a new hole at the highest possible address? 1943 * Is this a new hole at the highest possible address?
1939 */ 1944 */
1940 if (addr > mm->free_area_cache) 1945 if (addr > mm->free_area_cache)
1941 mm->free_area_cache = addr; 1946 mm->free_area_cache = addr;
1942 1947
1943 /* don't allow allocations above current base */ 1948 /* don't allow allocations above current base */

1944 if (mm->free_area_cache > mm->mmap_base) 1949 if (mm->free_area_cache > mm->mmap_base)
1945 mm->free_area_cache = mm->mmap_base; 1950 mm->free_area_cache = mm->mmap_base;
1946 } 1951 }
1947 1952
1948 unsigned long 1953 unsigned long
1949 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1954 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1950 unsigned long pgoff, unsigned long flags) 1955 unsigned long pgoff, unsigned long flags)
1951 { 1956 {
1952 unsigned long (*get_area)(struct file *, unsigned long, 1957 unsigned long (*get_area)(struct file *, unsigned long,
1953 unsigned long, unsigned long, unsigned long); 1958 unsigned long, unsigned long, unsigned long);
1954 1959
1955 unsigned long error = arch_mmap_check(addr, len, flags); 1960 unsigned long error = arch_mmap_check(addr, len, flags);
1956 if (error) 1961 if (error)
1957 return error; 1962 return error;
1958 1963
1959 /* Careful about overflows.. */ 1964 /* Careful about overflows.. */
1960 if (len > TASK_SIZE) 1965 if (len > TASK_SIZE)
1961 return -ENOMEM; 1966 return -ENOMEM;
1962 1967
1963 get_area = current->mm->get_unmapped_area; 1968 get_area = current->mm->get_unmapped_area;
1964 if (file && file->f_op && file->f_op->get_unmapped_area) 1969 if (file && file->f_op && file->f_op->get_unmapped_area)
1965 get_area = file->f_op->get_unmapped_area; 1970 get_area = file->f_op->get_unmapped_area;
1966 addr = get_area(file, addr, len, pgoff, flags); 1971 addr = get_area(file, addr, len, pgoff, flags);
1967 if (IS_ERR_VALUE(addr)) 1972 if (IS_ERR_VALUE(addr))
1968 return addr; 1973 return addr;
1969 1974
1970 if (addr > TASK_SIZE - len) 1975 if (addr > TASK_SIZE - len)
1971 return -ENOMEM; 1976 return -ENOMEM;
1972 if (addr & ~PAGE_MASK) 1977 if (addr & ~PAGE_MASK)
1973 return -EINVAL; 1978 return -EINVAL;
1974 1979
1975 addr = arch_rebalance_pgtables(addr, len); 1980 addr = arch_rebalance_pgtables(addr, len);
1976 error = security_mmap_addr(addr); 1981 error = security_mmap_addr(addr);
1977 return error ? error : addr; 1982 return error ? error : addr;
1978 } 1983 }
1979 1984
1980 EXPORT_SYMBOL(get_unmapped_area); 1985 EXPORT_SYMBOL(get_unmapped_area);
1981 1986
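get_unmapped_area() follows the "low bits set means error" convention spelled out in the comment above arch_get_unmapped_area(): a negative errno stored in an unsigned long always has bits inside ~PAGE_MASK set, so one mask test tells an error apart from a page-aligned address. A small sketch of that encoding, assuming 4 KB pages and using -12 as the usual value of -ENOMEM:

#include <stdio.h>

#define EX_PAGE_MASK (~(4096UL - 1))

int main(void)
{
        unsigned long ok  = 0x7f0000000000UL;   /* page-aligned address */
        unsigned long err = (unsigned long)-12; /* -ENOMEM */

        printf("ok  looks like an error? %d\n", (ok  & ~EX_PAGE_MASK) != 0); /* 0 */
        printf("err looks like an error? %d\n", (err & ~EX_PAGE_MASK) != 0); /* 1 */
        return 0;
}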
1982 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1987 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1983 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 1988 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1984 { 1989 {
1985 struct vm_area_struct *vma = NULL; 1990 struct vm_area_struct *vma = NULL;
1986 1991
1987 /* Check the cache first. */ 1992 /* Check the cache first. */
1988 /* (Cache hit rate is typically around 35%.) */ 1993 /* (Cache hit rate is typically around 35%.) */
1989 vma = ACCESS_ONCE(mm->mmap_cache); 1994 vma = ACCESS_ONCE(mm->mmap_cache);
1990 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1995 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1991 struct rb_node *rb_node; 1996 struct rb_node *rb_node;
1992 1997
1993 rb_node = mm->mm_rb.rb_node; 1998 rb_node = mm->mm_rb.rb_node;
1994 vma = NULL; 1999 vma = NULL;
1995 2000
1996 while (rb_node) { 2001 while (rb_node) {
1997 struct vm_area_struct *vma_tmp; 2002 struct vm_area_struct *vma_tmp;
1998 2003
1999 vma_tmp = rb_entry(rb_node, 2004 vma_tmp = rb_entry(rb_node,
2000 struct vm_area_struct, vm_rb); 2005 struct vm_area_struct, vm_rb);
2001 2006
2002 if (vma_tmp->vm_end > addr) { 2007 if (vma_tmp->vm_end > addr) {
2003 vma = vma_tmp; 2008 vma = vma_tmp;
2004 if (vma_tmp->vm_start <= addr) 2009 if (vma_tmp->vm_start <= addr)
2005 break; 2010 break;
2006 rb_node = rb_node->rb_left; 2011 rb_node = rb_node->rb_left;
2007 } else 2012 } else
2008 rb_node = rb_node->rb_right; 2013 rb_node = rb_node->rb_right;
2009 } 2014 }
2010 if (vma) 2015 if (vma)
2011 mm->mmap_cache = vma; 2016 mm->mmap_cache = vma;
2012 } 2017 }
2013 return vma; 2018 return vma;
2014 } 2019 }
2015 2020
2016 EXPORT_SYMBOL(find_vma); 2021 EXPORT_SYMBOL(find_vma);
2017 2022
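Note the semantic of find_vma(): it returns the first VMA whose vm_end lies above addr, which need not actually contain addr. The sketch below replays that contract over a plain sorted array with invented ranges; ex_find_vma() is a stand-in, while the kernel of course walks the rbtree with the mmap_cache shortcut shown above.

#include <stdio.h>

struct ex_vma { unsigned long start, end; };    /* [start, end) */

/* Linear stand-in for find_vma(): first area with end > addr, or NULL. */
static const struct ex_vma *ex_find_vma(const struct ex_vma *v, int n,
                                        unsigned long addr)
{
        for (int i = 0; i < n; i++)
                if (v[i].end > addr)
                        return &v[i];
        return NULL;
}

int main(void)
{
        const struct ex_vma map[] = {
                { 0x1000, 0x3000 },
                { 0x8000, 0x9000 },
        };
        int n = (int)(sizeof(map) / sizeof(map[0]));
        const struct ex_vma *hit = ex_find_vma(map, n, 0x5000);

        /* 0x5000 is unmapped, yet the lookup reports the next area above it. */
        if (hit)
                printf("returned [%#lx, %#lx)\n", hit->start, hit->end);
        return 0;
}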
2018 /* 2023 /*
2019 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 2024 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
2020 */ 2025 */
2021 struct vm_area_struct * 2026 struct vm_area_struct *
2022 find_vma_prev(struct mm_struct *mm, unsigned long addr, 2027 find_vma_prev(struct mm_struct *mm, unsigned long addr,
2023 struct vm_area_struct **pprev) 2028 struct vm_area_struct **pprev)
2024 { 2029 {
2025 struct vm_area_struct *vma; 2030 struct vm_area_struct *vma;
2026 2031
2027 vma = find_vma(mm, addr); 2032 vma = find_vma(mm, addr);
2028 if (vma) { 2033 if (vma) {
2029 *pprev = vma->vm_prev; 2034 *pprev = vma->vm_prev;
2030 } else { 2035 } else {
2031 struct rb_node *rb_node = mm->mm_rb.rb_node; 2036 struct rb_node *rb_node = mm->mm_rb.rb_node;
2032 *pprev = NULL; 2037 *pprev = NULL;
2033 while (rb_node) { 2038 while (rb_node) {
2034 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2039 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2035 rb_node = rb_node->rb_right; 2040 rb_node = rb_node->rb_right;
2036 } 2041 }
2037 } 2042 }
2038 return vma; 2043 return vma;
2039 } 2044 }
2040 2045
2041 /* 2046 /*
2042 * Verify that the stack growth is acceptable and 2047 * Verify that the stack growth is acceptable and
2043 * update accounting. This is shared with both the 2048 * update accounting. This is shared with both the
2044 * grow-up and grow-down cases. 2049 * grow-up and grow-down cases.
2045 */ 2050 */
2046 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) 2051 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
2047 { 2052 {
2048 struct mm_struct *mm = vma->vm_mm; 2053 struct mm_struct *mm = vma->vm_mm;
2049 struct rlimit *rlim = current->signal->rlim; 2054 struct rlimit *rlim = current->signal->rlim;
2050 unsigned long new_start; 2055 unsigned long new_start;
2051 2056
2052 /* address space limit tests */ 2057 /* address space limit tests */
2053 if (!may_expand_vm(mm, grow)) 2058 if (!may_expand_vm(mm, grow))
2054 return -ENOMEM; 2059 return -ENOMEM;
2055 2060
2056 /* Stack limit test */ 2061 /* Stack limit test */
2057 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2062 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2058 return -ENOMEM; 2063 return -ENOMEM;
2059 2064
2060 /* mlock limit tests */ 2065 /* mlock limit tests */
2061 if (vma->vm_flags & VM_LOCKED) { 2066 if (vma->vm_flags & VM_LOCKED) {
2062 unsigned long locked; 2067 unsigned long locked;
2063 unsigned long limit; 2068 unsigned long limit;
2064 locked = mm->locked_vm + grow; 2069 locked = mm->locked_vm + grow;
2065 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2070 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2066 limit >>= PAGE_SHIFT; 2071 limit >>= PAGE_SHIFT;
2067 if (locked > limit && !capable(CAP_IPC_LOCK)) 2072 if (locked > limit && !capable(CAP_IPC_LOCK))
2068 return -ENOMEM; 2073 return -ENOMEM;
2069 } 2074 }
2070 2075
2071 /* Check to ensure the stack will not grow into a hugetlb-only region */ 2076 /* Check to ensure the stack will not grow into a hugetlb-only region */
2072 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : 2077 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2073 vma->vm_end - size; 2078 vma->vm_end - size;
2074 if (is_hugepage_only_range(vma->vm_mm, new_start, size)) 2079 if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2075 return -EFAULT; 2080 return -EFAULT;
2076 2081
2077 /* 2082 /*
2078 * Overcommit.. This must be the final test, as it will 2083 * Overcommit.. This must be the final test, as it will
2079 * update security statistics. 2084 * update security statistics.
2080 */ 2085 */
2081 if (security_vm_enough_memory_mm(mm, grow)) 2086 if (security_vm_enough_memory_mm(mm, grow))
2082 return -ENOMEM; 2087 return -ENOMEM;
2083 2088
2084 /* Ok, everything looks good - let it rip */ 2089 /* Ok, everything looks good - let it rip */
2085 if (vma->vm_flags & VM_LOCKED) 2090 if (vma->vm_flags & VM_LOCKED)
2086 mm->locked_vm += grow; 2091 mm->locked_vm += grow;
2087 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 2092 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
2088 return 0; 2093 return 0;
2089 } 2094 }
2090 2095
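The first limit tests in acct_stack_growth() have direct userspace counterparts via getrlimit(). The sketch below echoes only the RLIMIT_STACK comparison; ex_stack_growth_ok() is a made-up name, and the address-space, mlock, and overcommit tests of the real function are deliberately omitted.

#include <stdio.h>
#include <sys/resource.h>

/* ex_stack_growth_ok(): would a stack of new_stack_size bytes still fit
 * within the soft RLIMIT_STACK? (Other checks are left out.) */
static int ex_stack_growth_ok(unsigned long new_stack_size)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_STACK, &rl))
                return 0;
        return new_stack_size <= rl.rlim_cur;
}

int main(void)
{
        printf("grow to 1 MB ok? %d\n", ex_stack_growth_ok(1UL << 20));
        printf("grow to 1 GB ok? %d\n", ex_stack_growth_ok(1UL << 30));
        return 0;
}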
2091 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) 2096 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2092 /* 2097 /*
2093 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 2098 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2094 * vma is the last one with address > vma->vm_end. Have to extend vma. 2099 * vma is the last one with address > vma->vm_end. Have to extend vma.
2095 */ 2100 */
2096 int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2101 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2097 { 2102 {
2098 int error; 2103 int error;
2099 2104
2100 if (!(vma->vm_flags & VM_GROWSUP)) 2105 if (!(vma->vm_flags & VM_GROWSUP))
2101 return -EFAULT; 2106 return -EFAULT;
2102 2107
2103 /* 2108 /*
2104 * We must make sure the anon_vma is allocated 2109 * We must make sure the anon_vma is allocated
2105 * so that the anon_vma locking is not a noop. 2110 * so that the anon_vma locking is not a noop.
2106 */ 2111 */
2107 if (unlikely(anon_vma_prepare(vma))) 2112 if (unlikely(anon_vma_prepare(vma)))
2108 return -ENOMEM; 2113 return -ENOMEM;
2109 vma_lock_anon_vma(vma); 2114 vma_lock_anon_vma(vma);
2110 2115
2111 /* 2116 /*
2112 * vma->vm_start/vm_end cannot change under us because the caller 2117 * vma->vm_start/vm_end cannot change under us because the caller
2113 * is required to hold the mmap_sem in read mode. We need the 2118 * is required to hold the mmap_sem in read mode. We need the
2114 * anon_vma lock to serialize against concurrent expand_stacks. 2119 * anon_vma lock to serialize against concurrent expand_stacks.
2115 * Also guard against wrapping around to address 0. 2120 * Also guard against wrapping around to address 0.
2116 */ 2121 */
2117 if (address < PAGE_ALIGN(address+4)) 2122 if (address < PAGE_ALIGN(address+4))
2118 address = PAGE_ALIGN(address+4); 2123 address = PAGE_ALIGN(address+4);
2119 else { 2124 else {
2120 vma_unlock_anon_vma(vma); 2125 vma_unlock_anon_vma(vma);
2121 return -ENOMEM; 2126 return -ENOMEM;
2122 } 2127 }
2123 error = 0; 2128 error = 0;
2124 2129
2125 /* Somebody else might have raced and expanded it already */ 2130 /* Somebody else might have raced and expanded it already */
2126 if (address > vma->vm_end) { 2131 if (address > vma->vm_end) {
2127 unsigned long size, grow; 2132 unsigned long size, grow;
2128 2133
2129 size = address - vma->vm_start; 2134 size = address - vma->vm_start;
2130 grow = (address - vma->vm_end) >> PAGE_SHIFT; 2135 grow = (address - vma->vm_end) >> PAGE_SHIFT;
2131 2136
2132 error = -ENOMEM; 2137 error = -ENOMEM;
2133 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2138 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2134 error = acct_stack_growth(vma, size, grow); 2139 error = acct_stack_growth(vma, size, grow);
2135 if (!error) { 2140 if (!error) {
2136 /* 2141 /*
2137 * vma_gap_update() doesn't support concurrent 2142 * vma_gap_update() doesn't support concurrent
2138 * updates, but we only hold a shared mmap_sem 2143 * updates, but we only hold a shared mmap_sem
2139 * lock here, so we need to protect against 2144 * lock here, so we need to protect against
2140 * concurrent vma expansions. 2145 * concurrent vma expansions.
2141 * vma_lock_anon_vma() doesn't help here, as 2146 * vma_lock_anon_vma() doesn't help here, as
2142 * we don't guarantee that all growable vmas 2147 * we don't guarantee that all growable vmas
2143 * in a mm share the same root anon vma. 2148 * in a mm share the same root anon vma.
2144 * So, we reuse mm->page_table_lock to guard 2149 * So, we reuse mm->page_table_lock to guard
2145 * against concurrent vma expansions. 2150 * against concurrent vma expansions.
2146 */ 2151 */
2147 spin_lock(&vma->vm_mm->page_table_lock); 2152 spin_lock(&vma->vm_mm->page_table_lock);
2148 anon_vma_interval_tree_pre_update_vma(vma); 2153 anon_vma_interval_tree_pre_update_vma(vma);
2149 vma->vm_end = address; 2154 vma->vm_end = address;
2150 anon_vma_interval_tree_post_update_vma(vma); 2155 anon_vma_interval_tree_post_update_vma(vma);
2151 if (vma->vm_next) 2156 if (vma->vm_next)
2152 vma_gap_update(vma->vm_next); 2157 vma_gap_update(vma->vm_next);
2153 else 2158 else
2154 vma->vm_mm->highest_vm_end = address; 2159 vma->vm_mm->highest_vm_end = address;
2155 spin_unlock(&vma->vm_mm->page_table_lock); 2160 spin_unlock(&vma->vm_mm->page_table_lock);
2156 2161
2157 perf_event_mmap(vma); 2162 perf_event_mmap(vma);
2158 } 2163 }
2159 } 2164 }
2160 } 2165 }
2161 vma_unlock_anon_vma(vma); 2166 vma_unlock_anon_vma(vma);
2162 khugepaged_enter_vma_merge(vma); 2167 khugepaged_enter_vma_merge(vma);
2163 validate_mm(vma->vm_mm); 2168 validate_mm(vma->vm_mm);
2164 return error; 2169 return error;
2165 } 2170 }
2166 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 2171 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2167 2172
2168 /* 2173 /*
2169 * vma is the first one with address < vma->vm_start. Have to extend vma. 2174 * vma is the first one with address < vma->vm_start. Have to extend vma.
2170 */ 2175 */
2171 int expand_downwards(struct vm_area_struct *vma, 2176 int expand_downwards(struct vm_area_struct *vma,
2172 unsigned long address) 2177 unsigned long address)
2173 { 2178 {
2174 int error; 2179 int error;
2175 2180
2176 /* 2181 /*
2177 * We must make sure the anon_vma is allocated 2182 * We must make sure the anon_vma is allocated
2178 * so that the anon_vma locking is not a noop. 2183 * so that the anon_vma locking is not a noop.
2179 */ 2184 */
2180 if (unlikely(anon_vma_prepare(vma))) 2185 if (unlikely(anon_vma_prepare(vma)))
2181 return -ENOMEM; 2186 return -ENOMEM;
2182 2187
2183 address &= PAGE_MASK; 2188 address &= PAGE_MASK;
2184 error = security_mmap_addr(address); 2189 error = security_mmap_addr(address);
2185 if (error) 2190 if (error)
2186 return error; 2191 return error;
2187 2192
2188 vma_lock_anon_vma(vma); 2193 vma_lock_anon_vma(vma);
2189 2194
2190 /* 2195 /*
2191 * vma->vm_start/vm_end cannot change under us because the caller 2196 * vma->vm_start/vm_end cannot change under us because the caller
2192 * is required to hold the mmap_sem in read mode. We need the 2197 * is required to hold the mmap_sem in read mode. We need the
2193 * anon_vma lock to serialize against concurrent expand_stacks. 2198 * anon_vma lock to serialize against concurrent expand_stacks.
2194 */ 2199 */
2195 2200
2196 /* Somebody else might have raced and expanded it already */ 2201 /* Somebody else might have raced and expanded it already */
2197 if (address < vma->vm_start) { 2202 if (address < vma->vm_start) {
2198 unsigned long size, grow; 2203 unsigned long size, grow;
2199 2204
2200 size = vma->vm_end - address; 2205 size = vma->vm_end - address;
2201 grow = (vma->vm_start - address) >> PAGE_SHIFT; 2206 grow = (vma->vm_start - address) >> PAGE_SHIFT;
2202 2207
2203 error = -ENOMEM; 2208 error = -ENOMEM;
2204 if (grow <= vma->vm_pgoff) { 2209 if (grow <= vma->vm_pgoff) {
2205 error = acct_stack_growth(vma, size, grow); 2210 error = acct_stack_growth(vma, size, grow);
2206 if (!error) { 2211 if (!error) {
2207 /* 2212 /*
2208 * vma_gap_update() doesn't support concurrent 2213 * vma_gap_update() doesn't support concurrent
2209 * updates, but we only hold a shared mmap_sem 2214 * updates, but we only hold a shared mmap_sem
2210 * lock here, so we need to protect against 2215 * lock here, so we need to protect against
2211 * concurrent vma expansions. 2216 * concurrent vma expansions.
2212 * vma_lock_anon_vma() doesn't help here, as 2217 * vma_lock_anon_vma() doesn't help here, as
2213 * we don't guarantee that all growable vmas 2218 * we don't guarantee that all growable vmas
2214 * in a mm share the same root anon vma. 2219 * in a mm share the same root anon vma.
2215 * So, we reuse mm->page_table_lock to guard 2220 * So, we reuse mm->page_table_lock to guard
2216 * against concurrent vma expansions. 2221 * against concurrent vma expansions.
2217 */ 2222 */
2218 spin_lock(&vma->vm_mm->page_table_lock); 2223 spin_lock(&vma->vm_mm->page_table_lock);
2219 anon_vma_interval_tree_pre_update_vma(vma); 2224 anon_vma_interval_tree_pre_update_vma(vma);
2220 vma->vm_start = address; 2225 vma->vm_start = address;
2221 vma->vm_pgoff -= grow; 2226 vma->vm_pgoff -= grow;
2222 anon_vma_interval_tree_post_update_vma(vma); 2227 anon_vma_interval_tree_post_update_vma(vma);
2223 vma_gap_update(vma); 2228 vma_gap_update(vma);
2224 spin_unlock(&vma->vm_mm->page_table_lock); 2229 spin_unlock(&vma->vm_mm->page_table_lock);
2225 2230
2226 perf_event_mmap(vma); 2231 perf_event_mmap(vma);
2227 } 2232 }
2228 } 2233 }
2229 } 2234 }
2230 vma_unlock_anon_vma(vma); 2235 vma_unlock_anon_vma(vma);
2231 khugepaged_enter_vma_merge(vma); 2236 khugepaged_enter_vma_merge(vma);
2232 validate_mm(vma->vm_mm); 2237 validate_mm(vma->vm_mm);
2233 return error; 2238 return error;
2234 } 2239 }
2235 2240
2236 /* 2241 /*
2237 * Note how expand_stack() refuses to expand the stack all the way to 2242 * Note how expand_stack() refuses to expand the stack all the way to
2238 * abut the next virtual mapping, *unless* that mapping itself is also 2243 * abut the next virtual mapping, *unless* that mapping itself is also
2239 * a stack mapping. We want to leave room for a guard page, after all 2244 * a stack mapping. We want to leave room for a guard page, after all
2240 * (the guard page itself is not added here, that is done by the 2245 * (the guard page itself is not added here, that is done by the
2241 * actual page faulting logic) 2246 * actual page faulting logic)
2242 * 2247 *
2243 * This matches the behavior of the guard page logic (see mm/memory.c: 2248 * This matches the behavior of the guard page logic (see mm/memory.c:
2244 * check_stack_guard_page()), which only allows the guard page to be 2249 * check_stack_guard_page()), which only allows the guard page to be
2245 * removed under these circumstances. 2250 * removed under these circumstances.
2246 */ 2251 */
2247 #ifdef CONFIG_STACK_GROWSUP 2252 #ifdef CONFIG_STACK_GROWSUP
2248 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2253 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2249 { 2254 {
2250 struct vm_area_struct *next; 2255 struct vm_area_struct *next;
2251 2256
2252 address &= PAGE_MASK; 2257 address &= PAGE_MASK;
2253 next = vma->vm_next; 2258 next = vma->vm_next;
2254 if (next && next->vm_start == address + PAGE_SIZE) { 2259 if (next && next->vm_start == address + PAGE_SIZE) {
2255 if (!(next->vm_flags & VM_GROWSUP)) 2260 if (!(next->vm_flags & VM_GROWSUP))
2256 return -ENOMEM; 2261 return -ENOMEM;
2257 } 2262 }
2258 return expand_upwards(vma, address); 2263 return expand_upwards(vma, address);
2259 } 2264 }
2260 2265
2261 struct vm_area_struct * 2266 struct vm_area_struct *
2262 find_extend_vma(struct mm_struct *mm, unsigned long addr) 2267 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2263 { 2268 {
2264 struct vm_area_struct *vma, *prev; 2269 struct vm_area_struct *vma, *prev;
2265 2270
2266 addr &= PAGE_MASK; 2271 addr &= PAGE_MASK;
2267 vma = find_vma_prev(mm, addr, &prev); 2272 vma = find_vma_prev(mm, addr, &prev);
2268 if (vma && (vma->vm_start <= addr)) 2273 if (vma && (vma->vm_start <= addr))
2269 return vma; 2274 return vma;
2270 if (!prev || expand_stack(prev, addr)) 2275 if (!prev || expand_stack(prev, addr))
2271 return NULL; 2276 return NULL;
2272 if (prev->vm_flags & VM_LOCKED) 2277 if (prev->vm_flags & VM_LOCKED)
2273 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2278 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2274 return prev; 2279 return prev;
2275 } 2280 }
2276 #else 2281 #else
2277 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2282 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2278 { 2283 {
2279 struct vm_area_struct *prev; 2284 struct vm_area_struct *prev;
2280 2285
2281 address &= PAGE_MASK; 2286 address &= PAGE_MASK;
2282 prev = vma->vm_prev; 2287 prev = vma->vm_prev;
2283 if (prev && prev->vm_end == address) { 2288 if (prev && prev->vm_end == address) {
2284 if (!(prev->vm_flags & VM_GROWSDOWN)) 2289 if (!(prev->vm_flags & VM_GROWSDOWN))
2285 return -ENOMEM; 2290 return -ENOMEM;
2286 } 2291 }
2287 return expand_downwards(vma, address); 2292 return expand_downwards(vma, address);
2288 } 2293 }
2289 2294
2290 struct vm_area_struct * 2295 struct vm_area_struct *
2291 find_extend_vma(struct mm_struct * mm, unsigned long addr) 2296 find_extend_vma(struct mm_struct * mm, unsigned long addr)
2292 { 2297 {
2293 struct vm_area_struct * vma; 2298 struct vm_area_struct * vma;
2294 unsigned long start; 2299 unsigned long start;
2295 2300
2296 addr &= PAGE_MASK; 2301 addr &= PAGE_MASK;
2297 vma = find_vma(mm,addr); 2302 vma = find_vma(mm,addr);
2298 if (!vma) 2303 if (!vma)
2299 return NULL; 2304 return NULL;
2300 if (vma->vm_start <= addr) 2305 if (vma->vm_start <= addr)
2301 return vma; 2306 return vma;
2302 if (!(vma->vm_flags & VM_GROWSDOWN)) 2307 if (!(vma->vm_flags & VM_GROWSDOWN))
2303 return NULL; 2308 return NULL;
2304 start = vma->vm_start; 2309 start = vma->vm_start;
2305 if (expand_stack(vma, addr)) 2310 if (expand_stack(vma, addr))
2306 return NULL; 2311 return NULL;
2307 if (vma->vm_flags & VM_LOCKED) 2312 if (vma->vm_flags & VM_LOCKED)
2308 __mlock_vma_pages_range(vma, addr, start, NULL); 2313 __mlock_vma_pages_range(vma, addr, start, NULL);
2309 return vma; 2314 return vma;
2310 } 2315 }
2311 #endif 2316 #endif
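The two find_extend_vma() flavours above are what the page-fault path uses when a faulting address falls just outside an existing stack vma: the vma is looked up, and if it (or its neighbour) is a growable stack, expand_stack() extends it via expand_upwards()/expand_downwards(), subject to the RLIMIT_STACK check in acct_stack_growth(). A minimal user-space sketch of the common VM_GROWSDOWN case (illustrative only; the recursion depth is an arbitrary choice kept well under the default 8MB stack limit):

/* stack_grow.c -- fault in new stack pages so the kernel extends the
 * stack vma downwards.  Build with: gcc -O0 -o stack_grow stack_grow.c */
#include <stdio.h>
#include <string.h>

static long recurse(int depth)
{
        char pad[4096];                 /* roughly one new stack page per frame */

        memset(pad, depth & 0xff, sizeof(pad));
        if (depth == 0)
                return pad[0];
        return pad[0] + recurse(depth - 1);
}

int main(void)
{
        /* ~1024 frames, ~4MB of stack: each first touch of a lower page
         * faults and goes through expand_stack()/expand_downwards(). */
        printf("sum = %ld\n", recurse(1024));
        return 0;
}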
2312 2317
2313 /* 2318 /*
2314 * Ok - we have the memory areas we should free on the vma list, 2319 * Ok - we have the memory areas we should free on the vma list,
2315 * so release them, and do the vma updates. 2320 * so release them, and do the vma updates.
2316 * 2321 *
2317 * Called with the mm semaphore held. 2322 * Called with the mm semaphore held.
2318 */ 2323 */
2319 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 2324 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2320 { 2325 {
2321 unsigned long nr_accounted = 0; 2326 unsigned long nr_accounted = 0;
2322 2327
2323 /* Update high watermark before we lower total_vm */ 2328 /* Update high watermark before we lower total_vm */
2324 update_hiwater_vm(mm); 2329 update_hiwater_vm(mm);
2325 do { 2330 do {
2326 long nrpages = vma_pages(vma); 2331 long nrpages = vma_pages(vma);
2327 2332
2328 if (vma->vm_flags & VM_ACCOUNT) 2333 if (vma->vm_flags & VM_ACCOUNT)
2329 nr_accounted += nrpages; 2334 nr_accounted += nrpages;
2330 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 2335 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
2331 vma = remove_vma(vma); 2336 vma = remove_vma(vma);
2332 } while (vma); 2337 } while (vma);
2333 vm_unacct_memory(nr_accounted); 2338 vm_unacct_memory(nr_accounted);
2334 validate_mm(mm); 2339 validate_mm(mm);
2335 } 2340 }
2336 2341
2337 /* 2342 /*
2338 * Get rid of page table information in the indicated region. 2343 * Get rid of page table information in the indicated region.
2339 * 2344 *
2340 * Called with the mm semaphore held. 2345 * Called with the mm semaphore held.
2341 */ 2346 */
2342 static void unmap_region(struct mm_struct *mm, 2347 static void unmap_region(struct mm_struct *mm,
2343 struct vm_area_struct *vma, struct vm_area_struct *prev, 2348 struct vm_area_struct *vma, struct vm_area_struct *prev,
2344 unsigned long start, unsigned long end) 2349 unsigned long start, unsigned long end)
2345 { 2350 {
2346 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 2351 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
2347 struct mmu_gather tlb; 2352 struct mmu_gather tlb;
2348 2353
2349 lru_add_drain(); 2354 lru_add_drain();
2350 tlb_gather_mmu(&tlb, mm, 0); 2355 tlb_gather_mmu(&tlb, mm, 0);
2351 update_hiwater_rss(mm); 2356 update_hiwater_rss(mm);
2352 unmap_vmas(&tlb, vma, start, end); 2357 unmap_vmas(&tlb, vma, start, end);
2353 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 2358 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2354 next ? next->vm_start : USER_PGTABLES_CEILING); 2359 next ? next->vm_start : USER_PGTABLES_CEILING);
2355 tlb_finish_mmu(&tlb, start, end); 2360 tlb_finish_mmu(&tlb, start, end);
2356 } 2361 }
2357 2362
2358 /* 2363 /*
2359 * Create a list of vma's touched by the unmap, removing them from the mm's 2364 * Create a list of vma's touched by the unmap, removing them from the mm's
2360 * vma list as we go.. 2365 * vma list as we go..
2361 */ 2366 */
2362 static void 2367 static void
2363 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, 2368 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2364 struct vm_area_struct *prev, unsigned long end) 2369 struct vm_area_struct *prev, unsigned long end)
2365 { 2370 {
2366 struct vm_area_struct **insertion_point; 2371 struct vm_area_struct **insertion_point;
2367 struct vm_area_struct *tail_vma = NULL; 2372 struct vm_area_struct *tail_vma = NULL;
2368 unsigned long addr; 2373 unsigned long addr;
2369 2374
2370 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2375 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2371 vma->vm_prev = NULL; 2376 vma->vm_prev = NULL;
2372 do { 2377 do {
2373 vma_rb_erase(vma, &mm->mm_rb); 2378 vma_rb_erase(vma, &mm->mm_rb);
2374 mm->map_count--; 2379 mm->map_count--;
2375 tail_vma = vma; 2380 tail_vma = vma;
2376 vma = vma->vm_next; 2381 vma = vma->vm_next;
2377 } while (vma && vma->vm_start < end); 2382 } while (vma && vma->vm_start < end);
2378 *insertion_point = vma; 2383 *insertion_point = vma;
2379 if (vma) { 2384 if (vma) {
2380 vma->vm_prev = prev; 2385 vma->vm_prev = prev;
2381 vma_gap_update(vma); 2386 vma_gap_update(vma);
2382 } else 2387 } else
2383 mm->highest_vm_end = prev ? prev->vm_end : 0; 2388 mm->highest_vm_end = prev ? prev->vm_end : 0;
2384 tail_vma->vm_next = NULL; 2389 tail_vma->vm_next = NULL;
2385 if (mm->unmap_area == arch_unmap_area) 2390 if (mm->unmap_area == arch_unmap_area)
2386 addr = prev ? prev->vm_end : mm->mmap_base; 2391 addr = prev ? prev->vm_end : mm->mmap_base;
2387 else 2392 else
2388 addr = vma ? vma->vm_start : mm->mmap_base; 2393 addr = vma ? vma->vm_start : mm->mmap_base;
2389 mm->unmap_area(mm, addr); 2394 mm->unmap_area(mm, addr);
2390 mm->mmap_cache = NULL; /* Kill the cache. */ 2395 mm->mmap_cache = NULL; /* Kill the cache. */
2391 } 2396 }
2392 2397
2393 /* 2398 /*
2394 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2399 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
2395 * munmap path where it doesn't make sense to fail. 2400 * munmap path where it doesn't make sense to fail.
2396 */ 2401 */
2397 static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2402 static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2398 unsigned long addr, int new_below) 2403 unsigned long addr, int new_below)
2399 { 2404 {
2400 struct mempolicy *pol; 2405 struct mempolicy *pol;
2401 struct vm_area_struct *new; 2406 struct vm_area_struct *new;
2402 int err = -ENOMEM; 2407 int err = -ENOMEM;
2403 2408
2404 if (is_vm_hugetlb_page(vma) && (addr & 2409 if (is_vm_hugetlb_page(vma) && (addr &
2405 ~(huge_page_mask(hstate_vma(vma))))) 2410 ~(huge_page_mask(hstate_vma(vma)))))
2406 return -EINVAL; 2411 return -EINVAL;
2407 2412
2408 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2413 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2409 if (!new) 2414 if (!new)
2410 goto out_err; 2415 goto out_err;
2411 2416
2412 /* most fields are the same, copy all, and then fixup */ 2417 /* most fields are the same, copy all, and then fixup */
2413 *new = *vma; 2418 *new = *vma;
2414 2419
2415 INIT_LIST_HEAD(&new->anon_vma_chain); 2420 INIT_LIST_HEAD(&new->anon_vma_chain);
2416 2421
2417 if (new_below) 2422 if (new_below)
2418 new->vm_end = addr; 2423 new->vm_end = addr;
2419 else { 2424 else {
2420 new->vm_start = addr; 2425 new->vm_start = addr;
2421 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2426 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2422 } 2427 }
2423 2428
2424 pol = mpol_dup(vma_policy(vma)); 2429 pol = mpol_dup(vma_policy(vma));
2425 if (IS_ERR(pol)) { 2430 if (IS_ERR(pol)) {
2426 err = PTR_ERR(pol); 2431 err = PTR_ERR(pol);
2427 goto out_free_vma; 2432 goto out_free_vma;
2428 } 2433 }
2429 vma_set_policy(new, pol); 2434 vma_set_policy(new, pol);
2430 2435
2431 if (anon_vma_clone(new, vma)) 2436 if (anon_vma_clone(new, vma))
2432 goto out_free_mpol; 2437 goto out_free_mpol;
2433 2438
2434 if (new->vm_file) 2439 if (new->vm_file)
2435 get_file(new->vm_file); 2440 get_file(new->vm_file);
2436 2441
2437 if (new->vm_ops && new->vm_ops->open) 2442 if (new->vm_ops && new->vm_ops->open)
2438 new->vm_ops->open(new); 2443 new->vm_ops->open(new);
2439 2444
2440 if (new_below) 2445 if (new_below)
2441 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 2446 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2442 ((addr - new->vm_start) >> PAGE_SHIFT), new); 2447 ((addr - new->vm_start) >> PAGE_SHIFT), new);
2443 else 2448 else
2444 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 2449 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2445 2450
2446 /* Success. */ 2451 /* Success. */
2447 if (!err) 2452 if (!err)
2448 return 0; 2453 return 0;
2449 2454
2450 /* Clean everything up if vma_adjust failed. */ 2455 /* Clean everything up if vma_adjust failed. */
2451 if (new->vm_ops && new->vm_ops->close) 2456 if (new->vm_ops && new->vm_ops->close)
2452 new->vm_ops->close(new); 2457 new->vm_ops->close(new);
2453 if (new->vm_file) 2458 if (new->vm_file)
2454 fput(new->vm_file); 2459 fput(new->vm_file);
2455 unlink_anon_vmas(new); 2460 unlink_anon_vmas(new);
2456 out_free_mpol: 2461 out_free_mpol:
2457 mpol_put(pol); 2462 mpol_put(pol);
2458 out_free_vma: 2463 out_free_vma:
2459 kmem_cache_free(vm_area_cachep, new); 2464 kmem_cache_free(vm_area_cachep, new);
2460 out_err: 2465 out_err:
2461 return err; 2466 return err;
2462 } 2467 }
2463 2468
2464 /* 2469 /*
2465 * Split a vma into two pieces at address 'addr', a new vma is allocated 2470 * Split a vma into two pieces at address 'addr', a new vma is allocated
2466 * either for the first part or the tail. 2471 * either for the first part or the tail.
2467 */ 2472 */
2468 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 2473 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2469 unsigned long addr, int new_below) 2474 unsigned long addr, int new_below)
2470 { 2475 {
2471 if (mm->map_count >= sysctl_max_map_count) 2476 if (mm->map_count >= sysctl_max_map_count)
2472 return -ENOMEM; 2477 return -ENOMEM;
2473 2478
2474 return __split_vma(mm, vma, addr, new_below); 2479 return __split_vma(mm, vma, addr, new_below);
2475 } 2480 }
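split_vma() is what turns one vma into two when only part of a mapping changes attributes. A hedged user-space illustration (not taken from this commit): mprotect() on a single page in the middle of an anonymous mapping forces two splits, which shows up as three adjacent entries in /proc/self/maps.

/* split_demo.c -- make the kernel split a vma with a partial mprotect(). */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Change protection of the second page only: the original vma is
         * split around [p + psz, p + 2*psz), leaving three vmas behind. */
        if (mprotect(p + psz, psz, PROT_READ)) {
                perror("mprotect");
                return 1;
        }
        printf("mapping starts at %p\n", (void *)p);
        system("cat /proc/self/maps");  /* look for three adjacent entries */
        return 0;
}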
2476 2481
2477 /* Munmap is split into 2 main parts -- this part which finds 2482 /* Munmap is split into 2 main parts -- this part which finds
2478 * what needs doing, and the areas themselves, which do the 2483 * what needs doing, and the areas themselves, which do the
2479 * work. This now handles partial unmappings. 2484 * work. This now handles partial unmappings.
2480 * Jeremy Fitzhardinge <jeremy@goop.org> 2485 * Jeremy Fitzhardinge <jeremy@goop.org>
2481 */ 2486 */
2482 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 2487 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2483 { 2488 {
2484 unsigned long end; 2489 unsigned long end;
2485 struct vm_area_struct *vma, *prev, *last; 2490 struct vm_area_struct *vma, *prev, *last;
2486 2491
2487 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2492 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2488 return -EINVAL; 2493 return -EINVAL;
2489 2494
2490 if ((len = PAGE_ALIGN(len)) == 0) 2495 if ((len = PAGE_ALIGN(len)) == 0)
2491 return -EINVAL; 2496 return -EINVAL;
2492 2497
2493 /* Find the first overlapping VMA */ 2498 /* Find the first overlapping VMA */
2494 vma = find_vma(mm, start); 2499 vma = find_vma(mm, start);
2495 if (!vma) 2500 if (!vma)
2496 return 0; 2501 return 0;
2497 prev = vma->vm_prev; 2502 prev = vma->vm_prev;
2498 /* we have start < vma->vm_end */ 2503 /* we have start < vma->vm_end */
2499 2504
2500 /* if it doesn't overlap, we have nothing.. */ 2505 /* if it doesn't overlap, we have nothing.. */
2501 end = start + len; 2506 end = start + len;
2502 if (vma->vm_start >= end) 2507 if (vma->vm_start >= end)
2503 return 0; 2508 return 0;
2504 2509
2505 /* 2510 /*
2506 * If we need to split any vma, do it now to save pain later. 2511 * If we need to split any vma, do it now to save pain later.
2507 * 2512 *
2508 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 2513 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2509 * unmapped vm_area_struct will remain in use: so lower split_vma 2514 * unmapped vm_area_struct will remain in use: so lower split_vma
2510 * places tmp vma above, and higher split_vma places tmp vma below. 2515 * places tmp vma above, and higher split_vma places tmp vma below.
2511 */ 2516 */
2512 if (start > vma->vm_start) { 2517 if (start > vma->vm_start) {
2513 int error; 2518 int error;
2514 2519
2515 /* 2520 /*
2516 * Make sure that map_count on return from munmap() will 2521 * Make sure that map_count on return from munmap() will
2517 * not exceed its limit; but let map_count go just above 2522 * not exceed its limit; but let map_count go just above
2518 * its limit temporarily, to help free resources as expected. 2523 * its limit temporarily, to help free resources as expected.
2519 */ 2524 */
2520 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 2525 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2521 return -ENOMEM; 2526 return -ENOMEM;
2522 2527
2523 error = __split_vma(mm, vma, start, 0); 2528 error = __split_vma(mm, vma, start, 0);
2524 if (error) 2529 if (error)
2525 return error; 2530 return error;
2526 prev = vma; 2531 prev = vma;
2527 } 2532 }
2528 2533
2529 /* Does it split the last one? */ 2534 /* Does it split the last one? */
2530 last = find_vma(mm, end); 2535 last = find_vma(mm, end);
2531 if (last && end > last->vm_start) { 2536 if (last && end > last->vm_start) {
2532 int error = __split_vma(mm, last, end, 1); 2537 int error = __split_vma(mm, last, end, 1);
2533 if (error) 2538 if (error)
2534 return error; 2539 return error;
2535 } 2540 }
2536 vma = prev? prev->vm_next: mm->mmap; 2541 vma = prev? prev->vm_next: mm->mmap;
2537 2542
2538 /* 2543 /*
2539 * unlock any mlock()ed ranges before detaching vmas 2544 * unlock any mlock()ed ranges before detaching vmas
2540 */ 2545 */
2541 if (mm->locked_vm) { 2546 if (mm->locked_vm) {
2542 struct vm_area_struct *tmp = vma; 2547 struct vm_area_struct *tmp = vma;
2543 while (tmp && tmp->vm_start < end) { 2548 while (tmp && tmp->vm_start < end) {
2544 if (tmp->vm_flags & VM_LOCKED) { 2549 if (tmp->vm_flags & VM_LOCKED) {
2545 mm->locked_vm -= vma_pages(tmp); 2550 mm->locked_vm -= vma_pages(tmp);
2546 munlock_vma_pages_all(tmp); 2551 munlock_vma_pages_all(tmp);
2547 } 2552 }
2548 tmp = tmp->vm_next; 2553 tmp = tmp->vm_next;
2549 } 2554 }
2550 } 2555 }
2551 2556
2552 /* 2557 /*
2553 * Remove the vma's, and unmap the actual pages 2558 * Remove the vma's, and unmap the actual pages
2554 */ 2559 */
2555 detach_vmas_to_be_unmapped(mm, vma, prev, end); 2560 detach_vmas_to_be_unmapped(mm, vma, prev, end);
2556 unmap_region(mm, vma, prev, start, end); 2561 unmap_region(mm, vma, prev, start, end);
2557 2562
2558 /* Fix up all other VM information */ 2563 /* Fix up all other VM information */
2559 remove_vma_list(mm, vma); 2564 remove_vma_list(mm, vma);
2560 2565
2561 return 0; 2566 return 0;
2562 } 2567 }
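A user-space counterpart to the partial-unmap handling above (a sketch, not part of the patch): punching a one-page hole out of a larger anonymous mapping makes start > vma->vm_start and end < vma->vm_end, so do_munmap() splits at both edges and only the middle vma is detached and unmapped.

/* hole_punch.c -- partial munmap() exercising the split path in do_munmap(). */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Unmap only the middle page. */
        if (munmap(p + psz, psz)) {
                perror("munmap");
                return 1;
        }
        p[0] = 1;               /* first page is still mapped */
        p[2 * psz] = 1;         /* last page is still mapped */
        puts("middle page unmapped, outer pages intact");
        return 0;
}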
2563 2568
2564 int vm_munmap(unsigned long start, size_t len) 2569 int vm_munmap(unsigned long start, size_t len)
2565 { 2570 {
2566 int ret; 2571 int ret;
2567 struct mm_struct *mm = current->mm; 2572 struct mm_struct *mm = current->mm;
2568 2573
2569 down_write(&mm->mmap_sem); 2574 down_write(&mm->mmap_sem);
2570 ret = do_munmap(mm, start, len); 2575 ret = do_munmap(mm, start, len);
2571 up_write(&mm->mmap_sem); 2576 up_write(&mm->mmap_sem);
2572 return ret; 2577 return ret;
2573 } 2578 }
2574 EXPORT_SYMBOL(vm_munmap); 2579 EXPORT_SYMBOL(vm_munmap);
2575 2580
2576 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 2581 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2577 { 2582 {
2578 profile_munmap(addr); 2583 profile_munmap(addr);
2579 return vm_munmap(addr, len); 2584 return vm_munmap(addr, len);
2580 } 2585 }
2581 2586
2582 static inline void verify_mm_writelocked(struct mm_struct *mm) 2587 static inline void verify_mm_writelocked(struct mm_struct *mm)
2583 { 2588 {
2584 #ifdef CONFIG_DEBUG_VM 2589 #ifdef CONFIG_DEBUG_VM
2585 if (unlikely(down_read_trylock(&mm->mmap_sem))) { 2590 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2586 WARN_ON(1); 2591 WARN_ON(1);
2587 up_read(&mm->mmap_sem); 2592 up_read(&mm->mmap_sem);
2588 } 2593 }
2589 #endif 2594 #endif
2590 } 2595 }
2591 2596
2592 /* 2597 /*
2593 * this is really a simplified "do_mmap". it only handles 2598 * this is really a simplified "do_mmap". it only handles
2594 * anonymous maps. eventually we may be able to do some 2599 * anonymous maps. eventually we may be able to do some
2595 * brk-specific accounting here. 2600 * brk-specific accounting here.
2596 */ 2601 */
2597 static unsigned long do_brk(unsigned long addr, unsigned long len) 2602 static unsigned long do_brk(unsigned long addr, unsigned long len)
2598 { 2603 {
2599 struct mm_struct * mm = current->mm; 2604 struct mm_struct * mm = current->mm;
2600 struct vm_area_struct * vma, * prev; 2605 struct vm_area_struct * vma, * prev;
2601 unsigned long flags; 2606 unsigned long flags;
2602 struct rb_node ** rb_link, * rb_parent; 2607 struct rb_node ** rb_link, * rb_parent;
2603 pgoff_t pgoff = addr >> PAGE_SHIFT; 2608 pgoff_t pgoff = addr >> PAGE_SHIFT;
2604 int error; 2609 int error;
2605 2610
2606 len = PAGE_ALIGN(len); 2611 len = PAGE_ALIGN(len);
2607 if (!len) 2612 if (!len)
2608 return addr; 2613 return addr;
2609 2614
2610 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2615 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2611 2616
2612 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2617 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2613 if (error & ~PAGE_MASK) 2618 if (error & ~PAGE_MASK)
2614 return error; 2619 return error;
2615 2620
2616 /* 2621 /*
2617 * mlock MCL_FUTURE? 2622 * mlock MCL_FUTURE?
2618 */ 2623 */
2619 if (mm->def_flags & VM_LOCKED) { 2624 if (mm->def_flags & VM_LOCKED) {
2620 unsigned long locked, lock_limit; 2625 unsigned long locked, lock_limit;
2621 locked = len >> PAGE_SHIFT; 2626 locked = len >> PAGE_SHIFT;
2622 locked += mm->locked_vm; 2627 locked += mm->locked_vm;
2623 lock_limit = rlimit(RLIMIT_MEMLOCK); 2628 lock_limit = rlimit(RLIMIT_MEMLOCK);
2624 lock_limit >>= PAGE_SHIFT; 2629 lock_limit >>= PAGE_SHIFT;
2625 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2630 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2626 return -EAGAIN; 2631 return -EAGAIN;
2627 } 2632 }
2628 2633
2629 /* 2634 /*
2630 * mm->mmap_sem is required to protect against another thread 2635 * mm->mmap_sem is required to protect against another thread
2631 * changing the mappings in case we sleep. 2636 * changing the mappings in case we sleep.
2632 */ 2637 */
2633 verify_mm_writelocked(mm); 2638 verify_mm_writelocked(mm);
2634 2639
2635 /* 2640 /*
2636 * Clear old maps. this also does some error checking for us 2641 * Clear old maps. this also does some error checking for us
2637 */ 2642 */
2638 munmap_back: 2643 munmap_back:
2639 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2644 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2640 if (do_munmap(mm, addr, len)) 2645 if (do_munmap(mm, addr, len))
2641 return -ENOMEM; 2646 return -ENOMEM;
2642 goto munmap_back; 2647 goto munmap_back;
2643 } 2648 }
2644 2649
2645 /* Check against address space limits *after* clearing old maps... */ 2650 /* Check against address space limits *after* clearing old maps... */
2646 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 2651 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
2647 return -ENOMEM; 2652 return -ENOMEM;
2648 2653
2649 if (mm->map_count > sysctl_max_map_count) 2654 if (mm->map_count > sysctl_max_map_count)
2650 return -ENOMEM; 2655 return -ENOMEM;
2651 2656
2652 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2657 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2653 return -ENOMEM; 2658 return -ENOMEM;
2654 2659
2655 /* Can we just expand an old private anonymous mapping? */ 2660 /* Can we just expand an old private anonymous mapping? */
2656 vma = vma_merge(mm, prev, addr, addr + len, flags, 2661 vma = vma_merge(mm, prev, addr, addr + len, flags,
2657 NULL, NULL, pgoff, NULL); 2662 NULL, NULL, pgoff, NULL);
2658 if (vma) 2663 if (vma)
2659 goto out; 2664 goto out;
2660 2665
2661 /* 2666 /*
2662 * create a vma struct for an anonymous mapping 2667 * create a vma struct for an anonymous mapping
2663 */ 2668 */
2664 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2669 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2665 if (!vma) { 2670 if (!vma) {
2666 vm_unacct_memory(len >> PAGE_SHIFT); 2671 vm_unacct_memory(len >> PAGE_SHIFT);
2667 return -ENOMEM; 2672 return -ENOMEM;
2668 } 2673 }
2669 2674
2670 INIT_LIST_HEAD(&vma->anon_vma_chain); 2675 INIT_LIST_HEAD(&vma->anon_vma_chain);
2671 vma->vm_mm = mm; 2676 vma->vm_mm = mm;
2672 vma->vm_start = addr; 2677 vma->vm_start = addr;
2673 vma->vm_end = addr + len; 2678 vma->vm_end = addr + len;
2674 vma->vm_pgoff = pgoff; 2679 vma->vm_pgoff = pgoff;
2675 vma->vm_flags = flags; 2680 vma->vm_flags = flags;
2676 vma->vm_page_prot = vm_get_page_prot(flags); 2681 vma->vm_page_prot = vm_get_page_prot(flags);
2677 vma_link(mm, vma, prev, rb_link, rb_parent); 2682 vma_link(mm, vma, prev, rb_link, rb_parent);
2678 out: 2683 out:
2679 perf_event_mmap(vma); 2684 perf_event_mmap(vma);
2680 mm->total_vm += len >> PAGE_SHIFT; 2685 mm->total_vm += len >> PAGE_SHIFT;
2681 if (flags & VM_LOCKED) 2686 if (flags & VM_LOCKED)
2682 mm->locked_vm += (len >> PAGE_SHIFT); 2687 mm->locked_vm += (len >> PAGE_SHIFT);
2683 return addr; 2688 return addr;
2684 } 2689 }
2685 2690
2686 unsigned long vm_brk(unsigned long addr, unsigned long len) 2691 unsigned long vm_brk(unsigned long addr, unsigned long len)
2687 { 2692 {
2688 struct mm_struct *mm = current->mm; 2693 struct mm_struct *mm = current->mm;
2689 unsigned long ret; 2694 unsigned long ret;
2690 bool populate; 2695 bool populate;
2691 2696
2692 down_write(&mm->mmap_sem); 2697 down_write(&mm->mmap_sem);
2693 ret = do_brk(addr, len); 2698 ret = do_brk(addr, len);
2694 populate = ((mm->def_flags & VM_LOCKED) != 0); 2699 populate = ((mm->def_flags & VM_LOCKED) != 0);
2695 up_write(&mm->mmap_sem); 2700 up_write(&mm->mmap_sem);
2696 if (populate) 2701 if (populate)
2697 mm_populate(addr, len); 2702 mm_populate(addr, len);
2698 return ret; 2703 return ret;
2699 } 2704 }
2700 EXPORT_SYMBOL(vm_brk); 2705 EXPORT_SYMBOL(vm_brk);
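do_brk() is the anonymous-only fast path behind the brk() system call and, via vm_brk(), the binfmt loaders' bss-style segments. From user space the same mechanism is reachable through sbrk(); a small hedged example:

/* brk_demo.c -- grow the heap with sbrk(), which ends up in the brk()
 * syscall and ultimately in this do_brk() path. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        void *old = sbrk(0);                    /* current program break */

        if (sbrk(1 << 20) == (void *)-1) {      /* ask for 1MB more */
                perror("sbrk");
                return 1;
        }
        memset(old, 0, 1 << 20);                /* touch it; pages fault in lazily */
        printf("break moved from %p to %p\n", old, sbrk(0));
        return 0;
}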
2701 2706
2702 /* Release all mmaps. */ 2707 /* Release all mmaps. */
2703 void exit_mmap(struct mm_struct *mm) 2708 void exit_mmap(struct mm_struct *mm)
2704 { 2709 {
2705 struct mmu_gather tlb; 2710 struct mmu_gather tlb;
2706 struct vm_area_struct *vma; 2711 struct vm_area_struct *vma;
2707 unsigned long nr_accounted = 0; 2712 unsigned long nr_accounted = 0;
2708 2713
2709 /* mm's last user has gone, and it's about to be pulled down */ 2714 /* mm's last user has gone, and it's about to be pulled down */
2710 mmu_notifier_release(mm); 2715 mmu_notifier_release(mm);
2711 2716
2712 if (mm->locked_vm) { 2717 if (mm->locked_vm) {
2713 vma = mm->mmap; 2718 vma = mm->mmap;
2714 while (vma) { 2719 while (vma) {
2715 if (vma->vm_flags & VM_LOCKED) 2720 if (vma->vm_flags & VM_LOCKED)
2716 munlock_vma_pages_all(vma); 2721 munlock_vma_pages_all(vma);
2717 vma = vma->vm_next; 2722 vma = vma->vm_next;
2718 } 2723 }
2719 } 2724 }
2720 2725
2721 arch_exit_mmap(mm); 2726 arch_exit_mmap(mm);
2722 2727
2723 vma = mm->mmap; 2728 vma = mm->mmap;
2724 if (!vma) /* Can happen if dup_mmap() received an OOM */ 2729 if (!vma) /* Can happen if dup_mmap() received an OOM */
2725 return; 2730 return;
2726 2731
2727 lru_add_drain(); 2732 lru_add_drain();
2728 flush_cache_mm(mm); 2733 flush_cache_mm(mm);
2729 tlb_gather_mmu(&tlb, mm, 1); 2734 tlb_gather_mmu(&tlb, mm, 1);
2730 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2735 /* update_hiwater_rss(mm) here? but nobody should be looking */
2731 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2736 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2732 unmap_vmas(&tlb, vma, 0, -1); 2737 unmap_vmas(&tlb, vma, 0, -1);
2733 2738
2734 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); 2739 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2735 tlb_finish_mmu(&tlb, 0, -1); 2740 tlb_finish_mmu(&tlb, 0, -1);
2736 2741
2737 /* 2742 /*
2738 * Walk the list again, actually closing and freeing it, 2743 * Walk the list again, actually closing and freeing it,
2739 * with preemption enabled, without holding any MM locks. 2744 * with preemption enabled, without holding any MM locks.
2740 */ 2745 */
2741 while (vma) { 2746 while (vma) {
2742 if (vma->vm_flags & VM_ACCOUNT) 2747 if (vma->vm_flags & VM_ACCOUNT)
2743 nr_accounted += vma_pages(vma); 2748 nr_accounted += vma_pages(vma);
2744 vma = remove_vma(vma); 2749 vma = remove_vma(vma);
2745 } 2750 }
2746 vm_unacct_memory(nr_accounted); 2751 vm_unacct_memory(nr_accounted);
2747 2752
2748 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2753 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2749 } 2754 }
2750 2755
2751 /* Insert vm structure into process list sorted by address 2756 /* Insert vm structure into process list sorted by address
2752 * and into the inode's i_mmap tree. If vm_file is non-NULL 2757 * and into the inode's i_mmap tree. If vm_file is non-NULL
2753 * then i_mmap_mutex is taken here. 2758 * then i_mmap_mutex is taken here.
2754 */ 2759 */
2755 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 2760 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2756 { 2761 {
2757 struct vm_area_struct *prev; 2762 struct vm_area_struct *prev;
2758 struct rb_node **rb_link, *rb_parent; 2763 struct rb_node **rb_link, *rb_parent;
2759 2764
2760 /* 2765 /*
2761 * The vm_pgoff of a purely anonymous vma should be irrelevant 2766 * The vm_pgoff of a purely anonymous vma should be irrelevant
2762 * until its first write fault, when page's anon_vma and index 2767 * until its first write fault, when page's anon_vma and index
2763 * are set. But now set the vm_pgoff it will almost certainly 2768 * are set. But now set the vm_pgoff it will almost certainly
2764 * end up with (unless mremap moves it elsewhere before that 2769 * end up with (unless mremap moves it elsewhere before that
2765 * first wfault), so /proc/pid/maps tells a consistent story. 2770 * first wfault), so /proc/pid/maps tells a consistent story.
2766 * 2771 *
2767 * By setting it to reflect the virtual start address of the 2772 * By setting it to reflect the virtual start address of the
2768 * vma, merges and splits can happen in a seamless way, just 2773 * vma, merges and splits can happen in a seamless way, just
2769 * using the existing file pgoff checks and manipulations. 2774 * using the existing file pgoff checks and manipulations.
2770 * Similarly in do_mmap_pgoff and in do_brk. 2775 * Similarly in do_mmap_pgoff and in do_brk.
2771 */ 2776 */
2772 if (!vma->vm_file) { 2777 if (!vma->vm_file) {
2773 BUG_ON(vma->anon_vma); 2778 BUG_ON(vma->anon_vma);
2774 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2779 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2775 } 2780 }
2776 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 2781 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2777 &prev, &rb_link, &rb_parent)) 2782 &prev, &rb_link, &rb_parent))
2778 return -ENOMEM; 2783 return -ENOMEM;
2779 if ((vma->vm_flags & VM_ACCOUNT) && 2784 if ((vma->vm_flags & VM_ACCOUNT) &&
2780 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2785 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2781 return -ENOMEM; 2786 return -ENOMEM;
2782 2787
2783 vma_link(mm, vma, prev, rb_link, rb_parent); 2788 vma_link(mm, vma, prev, rb_link, rb_parent);
2784 return 0; 2789 return 0;
2785 } 2790 }
2786 2791
2787 /* 2792 /*
2788 * Copy the vma structure to a new location in the same mm, 2793 * Copy the vma structure to a new location in the same mm,
2789 * prior to moving page table entries, to effect an mremap move. 2794 * prior to moving page table entries, to effect an mremap move.
2790 */ 2795 */
2791 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2796 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2792 unsigned long addr, unsigned long len, pgoff_t pgoff, 2797 unsigned long addr, unsigned long len, pgoff_t pgoff,
2793 bool *need_rmap_locks) 2798 bool *need_rmap_locks)
2794 { 2799 {
2795 struct vm_area_struct *vma = *vmap; 2800 struct vm_area_struct *vma = *vmap;
2796 unsigned long vma_start = vma->vm_start; 2801 unsigned long vma_start = vma->vm_start;
2797 struct mm_struct *mm = vma->vm_mm; 2802 struct mm_struct *mm = vma->vm_mm;
2798 struct vm_area_struct *new_vma, *prev; 2803 struct vm_area_struct *new_vma, *prev;
2799 struct rb_node **rb_link, *rb_parent; 2804 struct rb_node **rb_link, *rb_parent;
2800 struct mempolicy *pol; 2805 struct mempolicy *pol;
2801 bool faulted_in_anon_vma = true; 2806 bool faulted_in_anon_vma = true;
2802 2807
2803 /* 2808 /*
2804 * If anonymous vma has not yet been faulted, update new pgoff 2809 * If anonymous vma has not yet been faulted, update new pgoff
2805 * to match new location, to increase its chance of merging. 2810 * to match new location, to increase its chance of merging.
2806 */ 2811 */
2807 if (unlikely(!vma->vm_file && !vma->anon_vma)) { 2812 if (unlikely(!vma->vm_file && !vma->anon_vma)) {
2808 pgoff = addr >> PAGE_SHIFT; 2813 pgoff = addr >> PAGE_SHIFT;
2809 faulted_in_anon_vma = false; 2814 faulted_in_anon_vma = false;
2810 } 2815 }
2811 2816
2812 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) 2817 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2813 return NULL; /* should never get here */ 2818 return NULL; /* should never get here */
2814 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2819 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2815 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2820 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2816 if (new_vma) { 2821 if (new_vma) {
2817 /* 2822 /*
2818 * Source vma may have been merged into new_vma 2823 * Source vma may have been merged into new_vma
2819 */ 2824 */
2820 if (unlikely(vma_start >= new_vma->vm_start && 2825 if (unlikely(vma_start >= new_vma->vm_start &&
2821 vma_start < new_vma->vm_end)) { 2826 vma_start < new_vma->vm_end)) {
2822 /* 2827 /*
2823 * The only way we can get a vma_merge with 2828 * The only way we can get a vma_merge with
2824 * self during an mremap is if the vma hasn't 2829 * self during an mremap is if the vma hasn't
2825 * been faulted in yet and we were allowed to 2830 * been faulted in yet and we were allowed to
2826 * reset the dst vma->vm_pgoff to the 2831 * reset the dst vma->vm_pgoff to the
2827 * destination address of the mremap to allow 2832 * destination address of the mremap to allow
2828 * the merge to happen. mremap must change the 2833 * the merge to happen. mremap must change the
2829 * vm_pgoff linearity between src and dst vmas 2834 * vm_pgoff linearity between src and dst vmas
2830 * (in turn preventing a vma_merge) to be 2835 * (in turn preventing a vma_merge) to be
2831 * safe. It is only safe to keep the vm_pgoff 2836 * safe. It is only safe to keep the vm_pgoff
2832 * linear if there are no pages mapped yet. 2837 * linear if there are no pages mapped yet.
2833 */ 2838 */
2834 VM_BUG_ON(faulted_in_anon_vma); 2839 VM_BUG_ON(faulted_in_anon_vma);
2835 *vmap = vma = new_vma; 2840 *vmap = vma = new_vma;
2836 } 2841 }
2837 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2842 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2838 } else { 2843 } else {
2839 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2844 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2840 if (new_vma) { 2845 if (new_vma) {
2841 *new_vma = *vma; 2846 *new_vma = *vma;
2842 new_vma->vm_start = addr; 2847 new_vma->vm_start = addr;
2843 new_vma->vm_end = addr + len; 2848 new_vma->vm_end = addr + len;
2844 new_vma->vm_pgoff = pgoff; 2849 new_vma->vm_pgoff = pgoff;
2845 pol = mpol_dup(vma_policy(vma)); 2850 pol = mpol_dup(vma_policy(vma));
2846 if (IS_ERR(pol)) 2851 if (IS_ERR(pol))
2847 goto out_free_vma; 2852 goto out_free_vma;
2848 vma_set_policy(new_vma, pol); 2853 vma_set_policy(new_vma, pol);
2849 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2854 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2850 if (anon_vma_clone(new_vma, vma)) 2855 if (anon_vma_clone(new_vma, vma))
2851 goto out_free_mempol; 2856 goto out_free_mempol;
2852 if (new_vma->vm_file) 2857 if (new_vma->vm_file)
2853 get_file(new_vma->vm_file); 2858 get_file(new_vma->vm_file);
2854 if (new_vma->vm_ops && new_vma->vm_ops->open) 2859 if (new_vma->vm_ops && new_vma->vm_ops->open)
2855 new_vma->vm_ops->open(new_vma); 2860 new_vma->vm_ops->open(new_vma);
2856 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2861 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2857 *need_rmap_locks = false; 2862 *need_rmap_locks = false;
2858 } 2863 }
2859 } 2864 }
2860 return new_vma; 2865 return new_vma;
2861 2866
2862 out_free_mempol: 2867 out_free_mempol:
2863 mpol_put(pol); 2868 mpol_put(pol);
2864 out_free_vma: 2869 out_free_vma:
2865 kmem_cache_free(vm_area_cachep, new_vma); 2870 kmem_cache_free(vm_area_cachep, new_vma);
2866 return NULL; 2871 return NULL;
2867 } 2872 }
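copy_vma() is reached from mremap()'s move_vma() path when a mapping has to change address. A hedged way to trigger it from user space (MREMAP_FIXED always forces a move rather than an in-place resize):

/* mremap_move.c -- force an mremap() move so the kernel copies the vma. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *a = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        char *target = mmap(NULL, psz, PROT_NONE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (a == MAP_FAILED || target == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        a[0] = 42;
        /* MREMAP_FIXED replaces whatever is at 'target' and always moves,
         * so the kernel goes through copy_vma() plus the page-table move. */
        char *moved = mremap(a, psz, psz, MREMAP_MAYMOVE | MREMAP_FIXED, target);
        if (moved == MAP_FAILED) {
                perror("mremap");
                return 1;
        }
        printf("moved %p -> %p, byte is %d\n", (void *)a, (void *)moved, moved[0]);
        return 0;
}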
2868 2873
2869 /* 2874 /*
2870 * Return true if the calling process may expand its vm space by the passed 2875 * Return true if the calling process may expand its vm space by the passed
2871 * number of pages 2876 * number of pages
2872 */ 2877 */
2873 int may_expand_vm(struct mm_struct *mm, unsigned long npages) 2878 int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2874 { 2879 {
2875 unsigned long cur = mm->total_vm; /* pages */ 2880 unsigned long cur = mm->total_vm; /* pages */
2876 unsigned long lim; 2881 unsigned long lim;
2877 2882
2878 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; 2883 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2879 2884
2880 if (cur + npages > lim) 2885 if (cur + npages > lim)
2881 return 0; 2886 return 0;
2882 return 1; 2887 return 1;
2883 } 2888 }
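may_expand_vm() is the RLIMIT_AS gate: the request is refused once total_vm plus the new pages would exceed the address-space limit, converted to pages. A quick hedged check from user space is to shrink RLIMIT_AS and watch a large mmap() come back with ENOMEM:

/* as_limit.c -- make may_expand_vm() refuse a mapping via RLIMIT_AS. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl = { 64 << 20, 64 << 20 };      /* 64MB of address space */

        if (setrlimit(RLIMIT_AS, &rl)) {
                perror("setrlimit");
                return 1;
        }
        void *p = mmap(NULL, 256 << 20, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                printf("mmap failed as expected: %s\n", strerror(errno));
        else
                puts("mmap unexpectedly succeeded");
        return 0;
}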
2884 2889
2885 2890
2886 static int special_mapping_fault(struct vm_area_struct *vma, 2891 static int special_mapping_fault(struct vm_area_struct *vma,
2887 struct vm_fault *vmf) 2892 struct vm_fault *vmf)
2888 { 2893 {
2889 pgoff_t pgoff; 2894 pgoff_t pgoff;
2890 struct page **pages; 2895 struct page **pages;
2891 2896
2892 /* 2897 /*
2893 * special mappings have no vm_file, and in that case, the mm 2898 * special mappings have no vm_file, and in that case, the mm
2894 * uses vm_pgoff internally. So we have to subtract it from here. 2899 * uses vm_pgoff internally. So we have to subtract it from here.
2895 * We are allowed to do this because we are the mm; do not copy 2900 * We are allowed to do this because we are the mm; do not copy
2896 * this code into drivers! 2901 * this code into drivers!
2897 */ 2902 */
2898 pgoff = vmf->pgoff - vma->vm_pgoff; 2903 pgoff = vmf->pgoff - vma->vm_pgoff;
2899 2904
2900 for (pages = vma->vm_private_data; pgoff && *pages; ++pages) 2905 for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
2901 pgoff--; 2906 pgoff--;
2902 2907
2903 if (*pages) { 2908 if (*pages) {
2904 struct page *page = *pages; 2909 struct page *page = *pages;
2905 get_page(page); 2910 get_page(page);
2906 vmf->page = page; 2911 vmf->page = page;
2907 return 0; 2912 return 0;
2908 } 2913 }
2909 2914
2910 return VM_FAULT_SIGBUS; 2915 return VM_FAULT_SIGBUS;
2911 } 2916 }
2912 2917
2913 /* 2918 /*
2914 * Having a close hook prevents vma merging regardless of flags. 2919 * Having a close hook prevents vma merging regardless of flags.
2915 */ 2920 */
2916 static void special_mapping_close(struct vm_area_struct *vma) 2921 static void special_mapping_close(struct vm_area_struct *vma)
2917 { 2922 {
2918 } 2923 }
2919 2924
2920 static const struct vm_operations_struct special_mapping_vmops = { 2925 static const struct vm_operations_struct special_mapping_vmops = {
2921 .close = special_mapping_close, 2926 .close = special_mapping_close,
2922 .fault = special_mapping_fault, 2927 .fault = special_mapping_fault,
2923 }; 2928 };
2924 2929
2925 /* 2930 /*
2926 * Called with mm->mmap_sem held for writing. 2931 * Called with mm->mmap_sem held for writing.
2927 * Insert a new vma covering the given region, with the given flags. 2932 * Insert a new vma covering the given region, with the given flags.
2928 * Its pages are supplied by the given array of struct page *. 2933 * Its pages are supplied by the given array of struct page *.
2929 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. 2934 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
2930 * The region past the last page supplied will always produce SIGBUS. 2935 * The region past the last page supplied will always produce SIGBUS.
2931 * The array pointer and the pages it points to are assumed to stay alive 2936 * The array pointer and the pages it points to are assumed to stay alive
2932 * for as long as this mapping might exist. 2937 * for as long as this mapping might exist.
2933 */ 2938 */
2934 int install_special_mapping(struct mm_struct *mm, 2939 int install_special_mapping(struct mm_struct *mm,
2935 unsigned long addr, unsigned long len, 2940 unsigned long addr, unsigned long len,
2936 unsigned long vm_flags, struct page **pages) 2941 unsigned long vm_flags, struct page **pages)
2937 { 2942 {
2938 int ret; 2943 int ret;
2939 struct vm_area_struct *vma; 2944 struct vm_area_struct *vma;
2940 2945
2941 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2946 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2942 if (unlikely(vma == NULL)) 2947 if (unlikely(vma == NULL))
2943 return -ENOMEM; 2948 return -ENOMEM;
2944 2949
2945 INIT_LIST_HEAD(&vma->anon_vma_chain); 2950 INIT_LIST_HEAD(&vma->anon_vma_chain);
2946 vma->vm_mm = mm; 2951 vma->vm_mm = mm;
2947 vma->vm_start = addr; 2952 vma->vm_start = addr;
2948 vma->vm_end = addr + len; 2953 vma->vm_end = addr + len;
2949 2954
2950 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; 2955 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
2951 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2956 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2952 2957
2953 vma->vm_ops = &special_mapping_vmops; 2958 vma->vm_ops = &special_mapping_vmops;
2954 vma->vm_private_data = pages; 2959 vma->vm_private_data = pages;
2955 2960
2956 ret = insert_vm_struct(mm, vma); 2961 ret = insert_vm_struct(mm, vma);
2957 if (ret) 2962 if (ret)
2958 goto out; 2963 goto out;
2959 2964
2960 mm->total_vm += len >> PAGE_SHIFT; 2965 mm->total_vm += len >> PAGE_SHIFT;
2961 2966
2962 perf_event_mmap(vma); 2967 perf_event_mmap(vma);
2963 2968
2964 return 0; 2969 return 0;
2965 2970
2966 out: 2971 out:
2967 kmem_cache_free(vm_area_cachep, vma); 2972 kmem_cache_free(vm_area_cachep, vma);
2968 return ret; 2973 return ret;
2969 } 2974 }
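install_special_mapping() is how architecture code drops a vDSO- or trampoline-style page into a process image: it builds a vma by hand, wires up special_mapping_vmops, and relies on the caller holding mmap_sem for writing. The sketch below shows only the calling convention; the names my_vdso_pages and my_arch_setup_additional_pages are invented for the example, and the flag set mirrors a typical read/execute vDSO.

/* Hypothetical caller of install_special_mapping() (names invented). */
static struct page *my_vdso_pages[3];   /* two pages + NULL terminator,
                                           filled in by arch init code */

static int my_arch_setup_additional_pages(unsigned long addr)
{
        struct mm_struct *mm = current->mm;
        int ret;

        down_write(&mm->mmap_sem);
        ret = install_special_mapping(mm, addr, 2 * PAGE_SIZE,
                                      VM_READ | VM_EXEC |
                                      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
                                      my_vdso_pages);
        up_write(&mm->mmap_sem);
        return ret;
}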
2970 2975
2971 static DEFINE_MUTEX(mm_all_locks_mutex); 2976 static DEFINE_MUTEX(mm_all_locks_mutex);
2972 2977
2973 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2978 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2974 { 2979 {
2975 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 2980 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2976 /* 2981 /*
2977 * The LSB of head.next can't change from under us 2982 * The LSB of head.next can't change from under us
2978 * because we hold the mm_all_locks_mutex. 2983 * because we hold the mm_all_locks_mutex.
2979 */ 2984 */
2980 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); 2985 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2981 /* 2986 /*
2982 * We can safely modify head.next after taking the 2987 * We can safely modify head.next after taking the
2983 * anon_vma->root->rwsem. If some other vma in this mm shares 2988 * anon_vma->root->rwsem. If some other vma in this mm shares
2984 * the same anon_vma we won't take it again. 2989 * the same anon_vma we won't take it again.
2985 * 2990 *
2986 * No need of atomic instructions here, head.next 2991 * No need of atomic instructions here, head.next
2987 * can't change from under us thanks to the 2992 * can't change from under us thanks to the
2988 * anon_vma->root->rwsem. 2993 * anon_vma->root->rwsem.
2989 */ 2994 */
2990 if (__test_and_set_bit(0, (unsigned long *) 2995 if (__test_and_set_bit(0, (unsigned long *)
2991 &anon_vma->root->rb_root.rb_node)) 2996 &anon_vma->root->rb_root.rb_node))
2992 BUG(); 2997 BUG();
2993 } 2998 }
2994 } 2999 }
2995 3000
2996 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 3001 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2997 { 3002 {
2998 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3003 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2999 /* 3004 /*
3000 * AS_MM_ALL_LOCKS can't change from under us because 3005 * AS_MM_ALL_LOCKS can't change from under us because
3001 * we hold the mm_all_locks_mutex. 3006 * we hold the mm_all_locks_mutex.
3002 * 3007 *
3003 * Operations on ->flags have to be atomic because 3008 * Operations on ->flags have to be atomic because
3004 * even if AS_MM_ALL_LOCKS is stable thanks to the 3009 * even if AS_MM_ALL_LOCKS is stable thanks to the
3005 * mm_all_locks_mutex, there may be other cpus 3010 * mm_all_locks_mutex, there may be other cpus
3006 * changing other bitflags in parallel to us. 3011 * changing other bitflags in parallel to us.
3007 */ 3012 */
3008 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 3013 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3009 BUG(); 3014 BUG();
3010 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); 3015 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
3011 } 3016 }
3012 } 3017 }
3013 3018
3014 /* 3019 /*
3015 * This operation locks against the VM for all pte/vma/mm related 3020 * This operation locks against the VM for all pte/vma/mm related
3016 * operations that could ever happen on a certain mm. This includes 3021 * operations that could ever happen on a certain mm. This includes
3017 * vmtruncate, try_to_unmap, and all page faults. 3022 * vmtruncate, try_to_unmap, and all page faults.
3018 * 3023 *
3019 * The caller must take the mmap_sem in write mode before calling 3024 * The caller must take the mmap_sem in write mode before calling
3020 * mm_take_all_locks(). The caller isn't allowed to release the 3025 * mm_take_all_locks(). The caller isn't allowed to release the
3021 * mmap_sem until mm_drop_all_locks() returns. 3026 * mmap_sem until mm_drop_all_locks() returns.
3022 * 3027 *
3023 * mmap_sem in write mode is required in order to block all operations 3028 * mmap_sem in write mode is required in order to block all operations
3024 * that could modify pagetables and free pages without need of 3029 * that could modify pagetables and free pages without need of
3025 * altering the vma layout (for example populate_range() with 3030 * altering the vma layout (for example populate_range() with
3026 * nonlinear vmas). It's also needed in write mode to prevent new 3031 * nonlinear vmas). It's also needed in write mode to prevent new
3027 * anon_vmas from being associated with existing vmas. 3032 * anon_vmas from being associated with existing vmas.
3028 * 3033 *
3029 * A single task can't take more than one mm_take_all_locks() in a row 3034 * A single task can't take more than one mm_take_all_locks() in a row
3030 * or it would deadlock. 3035 * or it would deadlock.
3031 * 3036 *
3032 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 3037 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
3033 * mapping->flags avoid taking the same lock twice if more than one 3038 * mapping->flags avoid taking the same lock twice if more than one
3034 * vma in this mm is backed by the same anon_vma or address_space. 3039 * vma in this mm is backed by the same anon_vma or address_space.
3035 * 3040 *
3036 * We can take all the locks in random order because the VM code 3041 * We can take all the locks in random order because the VM code
3037 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never 3042 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
3038 * takes more than one of them in a row. Secondly we're protected 3043 * takes more than one of them in a row. Secondly we're protected
3039 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3044 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3040 * 3045 *
3041 * mm_take_all_locks() and mm_drop_all_locks are expensive operations 3046 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
3042 * that may have to take thousands of locks. 3047 * that may have to take thousands of locks.
3043 * 3048 *
3044 * mm_take_all_locks() can fail if it's interrupted by signals. 3049 * mm_take_all_locks() can fail if it's interrupted by signals.
3045 */ 3050 */
3046 int mm_take_all_locks(struct mm_struct *mm) 3051 int mm_take_all_locks(struct mm_struct *mm)
3047 { 3052 {
3048 struct vm_area_struct *vma; 3053 struct vm_area_struct *vma;
3049 struct anon_vma_chain *avc; 3054 struct anon_vma_chain *avc;
3050 3055
3051 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3056 BUG_ON(down_read_trylock(&mm->mmap_sem));
3052 3057
3053 mutex_lock(&mm_all_locks_mutex); 3058 mutex_lock(&mm_all_locks_mutex);
3054 3059
3055 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3060 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3056 if (signal_pending(current)) 3061 if (signal_pending(current))
3057 goto out_unlock; 3062 goto out_unlock;
3058 if (vma->vm_file && vma->vm_file->f_mapping) 3063 if (vma->vm_file && vma->vm_file->f_mapping)
3059 vm_lock_mapping(mm, vma->vm_file->f_mapping); 3064 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3060 } 3065 }
3061 3066
3062 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3067 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3063 if (signal_pending(current)) 3068 if (signal_pending(current))
3064 goto out_unlock; 3069 goto out_unlock;
3065 if (vma->anon_vma) 3070 if (vma->anon_vma)
3066 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3071 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3067 vm_lock_anon_vma(mm, avc->anon_vma); 3072 vm_lock_anon_vma(mm, avc->anon_vma);
3068 } 3073 }
3069 3074
3070 return 0; 3075 return 0;
3071 3076
3072 out_unlock: 3077 out_unlock:
3073 mm_drop_all_locks(mm); 3078 mm_drop_all_locks(mm);
3074 return -EINTR; 3079 return -EINTR;
3075 } 3080 }
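The comment block above spells out the contract; a condensed sketch of the expected calling sequence (not a real in-tree caller, which at this point is essentially mmu_notifier registration):

/* Sketch of the documented mm_take_all_locks() calling convention. */
static int freeze_mm_for_a_while(struct mm_struct *mm)
{
        int ret;

        down_write(&mm->mmap_sem);
        ret = mm_take_all_locks(mm);    /* may return -EINTR on signals */
        if (ret)
                goto out;

        /* ... every pte/vma/mm operation on this mm is now excluded ... */

        mm_drop_all_locks(mm);
out:
        up_write(&mm->mmap_sem);        /* only after mm_drop_all_locks() */
        return ret;
}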
3076 3081
3077 static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 3082 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3078 { 3083 {
3079 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 3084 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3080 /* 3085 /*
3081 * The LSB of head.next can't change to 0 from under 3086 * The LSB of head.next can't change to 0 from under
3082 * us because we hold the mm_all_locks_mutex. 3087 * us because we hold the mm_all_locks_mutex.
3083 * 3088 *
3084 * We must however clear the bitflag before unlocking 3089 * We must however clear the bitflag before unlocking
3085 * the vma so the users using the anon_vma->rb_root will 3090 * the vma so the users using the anon_vma->rb_root will
3086 * never see our bitflag. 3091 * never see our bitflag.
3087 * 3092 *
3088 * No need of atomic instructions here, head.next 3093 * No need of atomic instructions here, head.next
3089 * can't change from under us until we release the 3094 * can't change from under us until we release the
3090 * anon_vma->root->rwsem. 3095 * anon_vma->root->rwsem.
3091 */ 3096 */
3092 if (!__test_and_clear_bit(0, (unsigned long *) 3097 if (!__test_and_clear_bit(0, (unsigned long *)
3093 &anon_vma->root->rb_root.rb_node)) 3098 &anon_vma->root->rb_root.rb_node))
3094 BUG(); 3099 BUG();
3095 anon_vma_unlock_write(anon_vma); 3100 anon_vma_unlock_write(anon_vma);
3096 } 3101 }
3097 } 3102 }
3098 3103
3099 static void vm_unlock_mapping(struct address_space *mapping) 3104 static void vm_unlock_mapping(struct address_space *mapping)
3100 { 3105 {
3101 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3106 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3102 /* 3107 /*
3103 * AS_MM_ALL_LOCKS can't change to 0 from under us 3108 * AS_MM_ALL_LOCKS can't change to 0 from under us
3104 * because we hold the mm_all_locks_mutex. 3109 * because we hold the mm_all_locks_mutex.
3105 */ 3110 */
3106 mutex_unlock(&mapping->i_mmap_mutex); 3111 mutex_unlock(&mapping->i_mmap_mutex);
3107 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3112 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3108 &mapping->flags)) 3113 &mapping->flags))
3109 BUG(); 3114 BUG();
3110 } 3115 }
3111 } 3116 }
3112 3117
3113 /* 3118 /*
3114 * The mmap_sem cannot be released by the caller until 3119 * The mmap_sem cannot be released by the caller until
3115 * mm_drop_all_locks() returns. 3120 * mm_drop_all_locks() returns.
3116 */ 3121 */
3117 void mm_drop_all_locks(struct mm_struct *mm) 3122 void mm_drop_all_locks(struct mm_struct *mm)
3118 { 3123 {
3119 struct vm_area_struct *vma; 3124 struct vm_area_struct *vma;
3120 struct anon_vma_chain *avc; 3125 struct anon_vma_chain *avc;
3121 3126
3122 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3127 BUG_ON(down_read_trylock(&mm->mmap_sem));
3123 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 3128 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3124 3129
3125 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3130 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3126 if (vma->anon_vma) 3131 if (vma->anon_vma)
3127 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3132 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3128 vm_unlock_anon_vma(avc->anon_vma); 3133 vm_unlock_anon_vma(avc->anon_vma);
3129 if (vma->vm_file && vma->vm_file->f_mapping) 3134 if (vma->vm_file && vma->vm_file->f_mapping)
3130 vm_unlock_mapping(vma->vm_file->f_mapping); 3135 vm_unlock_mapping(vma->vm_file->f_mapping);
3131 } 3136 }
3132 3137
3133 mutex_unlock(&mm_all_locks_mutex); 3138 mutex_unlock(&mm_all_locks_mutex);
3134 } 3139 }
3135 3140
3136 /* 3141 /*
3137 * initialise the vm_committed_as percpu counter 3142 * initialise the vm_committed_as percpu counter
3138 */ 3143 */
3139 void __init mmap_init(void) 3144 void __init mmap_init(void)
3140 { 3145 {
3141 int ret; 3146 int ret;
3142 3147
3143 ret = percpu_counter_init(&vm_committed_as, 0); 3148 ret = percpu_counter_init(&vm_committed_as, 0);
3144 VM_BUG_ON(ret); 3149 VM_BUG_ON(ret);
3145 } 3150 }
3146 3151
3147 /* 3152 /*
3148 * Initialise sysctl_user_reserve_kbytes. 3153 * Initialise sysctl_user_reserve_kbytes.
3149 * 3154 *
3150 * This is intended to prevent a user from starting a single memory hogging 3155 * This is intended to prevent a user from starting a single memory hogging
3151 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER 3156 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
3152 * mode. 3157 * mode.
3153 * 3158 *
3154 * The default value is min(3% of free memory, 128MB) 3159 * The default value is min(3% of free memory, 128MB)
3155 * 128MB is enough to recover with sshd/login, bash, and top/kill. 3160 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3156 */ 3161 */
3157 static int init_user_reserve(void) 3162 static int init_user_reserve(void)
3158 { 3163 {
3159 unsigned long free_kbytes; 3164 unsigned long free_kbytes;
3160 3165
3161 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3166 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3162 3167
3163 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3168 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3164 return 0; 3169 return 0;
3165 } 3170 }
3166 module_init(init_user_reserve) 3171 module_init(init_user_reserve)
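Spelling the default out: free_kbytes / 32 is about 3% (3.125%) of free memory and 1UL << 17 kbytes is 131072kB = 128MB, so sysctl_user_reserve_kbytes starts at min(~3% of free, 128MB); init_admin_reserve() below does the same with 1UL << 13 = 8192kB = 8MB. Both values are runtime-tunable sysctls; a small check program (the /proc paths are assumed from the variable names):

/* reserve_peek.c -- print the reserves these initialisers set up. */
#include <stdio.h>

static void show(const char *path)
{
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/proc/sys/vm/user_reserve_kbytes");       /* min(3% free, 128MB) */
        show("/proc/sys/vm/admin_reserve_kbytes");      /* min(3% free, 8MB) */
        return 0;
}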
3167 3172
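A quick worked example of the sizing above (illustrative numbers only): free_kbytes / 32 is roughly 3% of free memory, and 1UL << 17 kB is 131072 kB = 128 MB, so the cap takes over once more than about 4 GB is free:

	/*
	 * free memory   free_kbytes    free_kbytes / 32   resulting reserve
	 *    1 GB         1048576        32768 kB            32 MB
	 *    4 GB         4194304       131072 kB           128 MB
	 *   16 GB        16777216       524288 kB           128 MB (capped)
	 */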
3168 /* 3173 /*
3169 * Initialise sysctl_admin_reserve_kbytes. 3174 * Initialise sysctl_admin_reserve_kbytes.
3170 * 3175 *
3171 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin 3176 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3172 * to log in and kill a memory hogging process. 3177 * to log in and kill a memory hogging process.
3173 * 3178 *
3174 * Systems with more than 256MB will reserve 8MB, enough to recover 3179 * Systems with more than 256MB will reserve 8MB, enough to recover
3175 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will 3180 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3176 * only reserve 3% of free pages by default. 3181 * only reserve 3% of free pages by default.
3177 */ 3182 */
3178 static int init_admin_reserve(void) 3183 static int init_admin_reserve(void)
3179 { 3184 {
3180 unsigned long free_kbytes; 3185 unsigned long free_kbytes;
3181 3186
3182 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3187 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3183 3188
3184 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3189 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3185 return 0; 3190 return 0;
3186 } 3191 }
3187 module_init(init_admin_reserve) 3192 module_init(init_admin_reserve)
3188 3193
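The same arithmetic applies to the admin reserve (again illustrative numbers): 1UL << 13 kB is 8192 kB = 8 MB, so the 256 MB figure in the comment is exactly where free_kbytes / 32 reaches the cap:

	/*
	 * free memory   free_kbytes   free_kbytes / 32   resulting reserve
	 *   128 MB         131072        4096 kB             4 MB
	 *   256 MB         262144        8192 kB             8 MB
	 *     8 GB        8388608      262144 kB             8 MB (capped)
	 */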
3189 /* 3194 /*
3190 * Reinitialise user and admin reserves if memory is added or removed. 3195 * Reinitialise user and admin reserves if memory is added or removed.
3191 * 3196 *
3192 * The default user reserve max is 128MB, and the default max for the 3197 * The default user reserve max is 128MB, and the default max for the
3193 * admin reserve is 8MB. These are usually, but not always, enough to 3198 * admin reserve is 8MB. These are usually, but not always, enough to
3194 * enable recovery from a memory hogging process using login/sshd, a shell, 3199 * enable recovery from a memory hogging process using login/sshd, a shell,
3195 * and tools like top. It may make sense to increase or even disable the 3200 * and tools like top. It may make sense to increase or even disable the
3196 * reserve depending on the existence of swap or variations in the recovery 3201 * reserve depending on the existence of swap or variations in the recovery
3197 * tools. So, the admin may have changed them. 3202 * tools. So, the admin may have changed them.
3198 * 3203 *
3199 * If memory is added and the reserves have been eliminated or increased above 3204 * If memory is added and the reserves have been eliminated or increased above
3200 * the default max, then we'll trust the admin. 3205 * the default max, then we'll trust the admin.
3201 * 3206 *
3202 * If memory is removed and there isn't enough free memory, then we 3207 * If memory is removed and there isn't enough free memory, then we
3203 * need to reset the reserves. 3208 * need to reset the reserves.
3204 * 3209 *
3205 * Otherwise keep the reserve set by the admin. 3210 * Otherwise keep the reserve set by the admin.
3206 */ 3211 */
3207 static int reserve_mem_notifier(struct notifier_block *nb, 3212 static int reserve_mem_notifier(struct notifier_block *nb,
3208 unsigned long action, void *data) 3213 unsigned long action, void *data)
3209 { 3214 {
3210 unsigned long tmp, free_kbytes; 3215 unsigned long tmp, free_kbytes;
3211 3216
3212 switch (action) { 3217 switch (action) {
3213 case MEM_ONLINE: 3218 case MEM_ONLINE:
3214 /* Default max is 128MB. Leave alone if modified by operator. */ 3219 /* Default max is 128MB. Leave alone if modified by operator. */
3215 tmp = sysctl_user_reserve_kbytes; 3220 tmp = sysctl_user_reserve_kbytes;
3216 if (0 < tmp && tmp < (1UL << 17)) 3221 if (0 < tmp && tmp < (1UL << 17))
3217 init_user_reserve(); 3222 init_user_reserve();
3218 3223
3219 /* Default max is 8MB. Leave alone if modified by operator. */ 3224 /* Default max is 8MB. Leave alone if modified by operator. */
3220 tmp = sysctl_admin_reserve_kbytes; 3225 tmp = sysctl_admin_reserve_kbytes;
3221 if (0 < tmp && tmp < (1UL << 13)) 3226 if (0 < tmp && tmp < (1UL << 13))
3222 init_admin_reserve(); 3227 init_admin_reserve();
3223 3228
3224 break; 3229 break;
3225 case MEM_OFFLINE: 3230 case MEM_OFFLINE:
3226 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3231 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3227 3232
3228 if (sysctl_user_reserve_kbytes > free_kbytes) { 3233 if (sysctl_user_reserve_kbytes > free_kbytes) {
3229 init_user_reserve(); 3234 init_user_reserve();
3230 pr_info("vm.user_reserve_kbytes reset to %lu\n", 3235 pr_info("vm.user_reserve_kbytes reset to %lu\n",
3231 sysctl_user_reserve_kbytes); 3236 sysctl_user_reserve_kbytes);
3232 } 3237 }
3233 3238
3234 if (sysctl_admin_reserve_kbytes > free_kbytes) { 3239 if (sysctl_admin_reserve_kbytes > free_kbytes) {
3235 init_admin_reserve(); 3240 init_admin_reserve();
3236 pr_info("vm.admin_reserve_kbytes reset to %lu\n", 3241 pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3237 sysctl_admin_reserve_kbytes); 3242 sysctl_admin_reserve_kbytes);
3238 } 3243 }
3239 break; 3244 break;
3240 default: 3245 default:
3241 break; 3246 break;
3242 } 3247 }
3243 return NOTIFY_OK; 3248 return NOTIFY_OK;
3244 } 3249 }
3245 3250
3246 static struct notifier_block reserve_mem_nb = { 3251 static struct notifier_block reserve_mem_nb = {
3247 .notifier_call = reserve_mem_notifier, 3252 .notifier_call = reserve_mem_notifier,
3248 }; 3253 };
3249 3254
3250 static int __meminit init_reserve_notifier(void) 3255 static int __meminit init_reserve_notifier(void)
3251 { 3256 {
3252 if (register_hotmemory_notifier(&reserve_mem_nb)) 3257 if (register_hotmemory_notifier(&reserve_mem_nb))
3253 printk(KERN_WARNING "Failed registering memory add/remove notifier for admin reserve\n"); 3258 printk(KERN_WARNING "Failed registering memory add/remove notifier for admin reserve\n");
3254 3259
3255 return 0; 3260 return 0;
3256 } 3261 }
3257 module_init(init_reserve_notifier) 3262 module_init(init_reserve_notifier)
3258 3263
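To make the hotplug policy concrete, here is how the notifier treats a few example values of vm.user_reserve_kbytes (example numbers only; the default cap is 1UL << 17 kB = 128 MB, and the same pattern applies to the admin reserve with its 8 MB cap):

	/*
	 * value before hotplug     MEM_ONLINE                  MEM_OFFLINE
	 * 0      (disabled)        left alone                  left alone (0 is never > free_kbytes)
	 * 65536  (64 MB, < cap)    recomputed                  recomputed only if 65536 > free_kbytes
	 * 262144 (256 MB, > cap)   left alone (admin raised)   recomputed if 262144 > free_kbytes
	 */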