Commit af73e4d9506d3b797509f3c030e7dcd554f7d9c4
Committed by: Linus Torvalds
1 parent: 1ab4ce7623
Exists in: smarc-l5.0.0_1.0.0-ga and 5 other branches
hugetlbfs: fix mmap failure in unaligned size request
The current kernel returns -EINVAL unless a given mmap length is "almost" hugepage aligned. This is because in sys_mmap_pgoff() the given length is passed to vm_mmap_pgoff() as-is, without being aligned to the hugepage boundary.

This is a regression introduced by commit 40716e29243d ("hugetlbfs: fix alignment of huge page requests"), where the alignment code was pushed into hugetlb_file_setup() but the variable len on the caller side was left unchanged.

To fix this, this patch partially reverts that commit and adds the alignment code back on the caller side. It also introduces hstate_sizelog() in order to get the proper hstate for the specified hugepage size.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=56881

[akpm@linux-foundation.org: fix warning when CONFIG_HUGETLB_PAGE=n]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: <iceman_dvd@yahoo.com>
Cc: Steven Truelove <steven.truelove@utoronto.ca>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
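For context, the user-visible symptom is easy to reproduce from userspace: an mmap(MAP_HUGETLB) request whose length is not a multiple of the hugepage size. The program below is an illustrative sketch, not part of the patch; it assumes a 2 MB default hugepage size and that hugepages have been reserved (e.g. via /proc/sys/vm/nr_hugepages). On kernels with the regression the call fails with EINVAL; with this fix the length is rounded up on the caller side of the mmap path and the mapping succeeds.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* Deliberately not a multiple of the (assumed) 2 MB hugepage size. */
	size_t len = 2 * 1024 * 1024 + 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		/* EINVAL here is the regression this commit addresses. */
		printf("mmap(MAP_HUGETLB, len=%zu) failed: %s\n",
		       len, strerror(errno));
		return 1;
	}
	printf("mmap(MAP_HUGETLB, len=%zu) succeeded at %p\n", len, p);
	munmap(p, len);
	return 0;
}

On the kernel side, the rounding is done once in the mmap caller, presumably along the lines of len = ALIGN(len, huge_page_size(hs)) with hs looked up through the new hstate_sizelog() helper; that is why the comment added above hugetlb_file_setup() in the diff below requires callers to pass an already-aligned size.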
Showing 4 changed files with 34 additions and 22 deletions
fs/hugetlbfs/inode.c
1 | /* | 1 | /* |
2 | * hugetlbpage-backed filesystem. Based on ramfs. | 2 | * hugetlbpage-backed filesystem. Based on ramfs. |
3 | * | 3 | * |
4 | * Nadia Yvette Chambers, 2002 | 4 | * Nadia Yvette Chambers, 2002 |
5 | * | 5 | * |
6 | * Copyright (C) 2002 Linus Torvalds. | 6 | * Copyright (C) 2002 Linus Torvalds. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/thread_info.h> | 10 | #include <linux/thread_info.h> |
11 | #include <asm/current.h> | 11 | #include <asm/current.h> |
12 | #include <linux/sched.h> /* remove ASAP */ | 12 | #include <linux/sched.h> /* remove ASAP */ |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/mount.h> | 14 | #include <linux/mount.h> |
15 | #include <linux/file.h> | 15 | #include <linux/file.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/writeback.h> | 17 | #include <linux/writeback.h> |
18 | #include <linux/pagemap.h> | 18 | #include <linux/pagemap.h> |
19 | #include <linux/highmem.h> | 19 | #include <linux/highmem.h> |
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/string.h> | 21 | #include <linux/string.h> |
22 | #include <linux/capability.h> | 22 | #include <linux/capability.h> |
23 | #include <linux/ctype.h> | 23 | #include <linux/ctype.h> |
24 | #include <linux/backing-dev.h> | 24 | #include <linux/backing-dev.h> |
25 | #include <linux/hugetlb.h> | 25 | #include <linux/hugetlb.h> |
26 | #include <linux/pagevec.h> | 26 | #include <linux/pagevec.h> |
27 | #include <linux/parser.h> | 27 | #include <linux/parser.h> |
28 | #include <linux/mman.h> | 28 | #include <linux/mman.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/dnotify.h> | 30 | #include <linux/dnotify.h> |
31 | #include <linux/statfs.h> | 31 | #include <linux/statfs.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/magic.h> | 33 | #include <linux/magic.h> |
34 | #include <linux/migrate.h> | 34 | #include <linux/migrate.h> |
35 | 35 | ||
36 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
37 | 37 | ||
38 | static const struct super_operations hugetlbfs_ops; | 38 | static const struct super_operations hugetlbfs_ops; |
39 | static const struct address_space_operations hugetlbfs_aops; | 39 | static const struct address_space_operations hugetlbfs_aops; |
40 | const struct file_operations hugetlbfs_file_operations; | 40 | const struct file_operations hugetlbfs_file_operations; |
41 | static const struct inode_operations hugetlbfs_dir_inode_operations; | 41 | static const struct inode_operations hugetlbfs_dir_inode_operations; |
42 | static const struct inode_operations hugetlbfs_inode_operations; | 42 | static const struct inode_operations hugetlbfs_inode_operations; |
43 | 43 | ||
44 | struct hugetlbfs_config { | 44 | struct hugetlbfs_config { |
45 | kuid_t uid; | 45 | kuid_t uid; |
46 | kgid_t gid; | 46 | kgid_t gid; |
47 | umode_t mode; | 47 | umode_t mode; |
48 | long nr_blocks; | 48 | long nr_blocks; |
49 | long nr_inodes; | 49 | long nr_inodes; |
50 | struct hstate *hstate; | 50 | struct hstate *hstate; |
51 | }; | 51 | }; |
52 | 52 | ||
53 | struct hugetlbfs_inode_info { | 53 | struct hugetlbfs_inode_info { |
54 | struct shared_policy policy; | 54 | struct shared_policy policy; |
55 | struct inode vfs_inode; | 55 | struct inode vfs_inode; |
56 | }; | 56 | }; |
57 | 57 | ||
58 | static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) | 58 | static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) |
59 | { | 59 | { |
60 | return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); | 60 | return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); |
61 | } | 61 | } |
62 | 62 | ||
63 | static struct backing_dev_info hugetlbfs_backing_dev_info = { | 63 | static struct backing_dev_info hugetlbfs_backing_dev_info = { |
64 | .name = "hugetlbfs", | 64 | .name = "hugetlbfs", |
65 | .ra_pages = 0, /* No readahead */ | 65 | .ra_pages = 0, /* No readahead */ |
66 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 66 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
67 | }; | 67 | }; |
68 | 68 | ||
69 | int sysctl_hugetlb_shm_group; | 69 | int sysctl_hugetlb_shm_group; |
70 | 70 | ||
71 | enum { | 71 | enum { |
72 | Opt_size, Opt_nr_inodes, | 72 | Opt_size, Opt_nr_inodes, |
73 | Opt_mode, Opt_uid, Opt_gid, | 73 | Opt_mode, Opt_uid, Opt_gid, |
74 | Opt_pagesize, | 74 | Opt_pagesize, |
75 | Opt_err, | 75 | Opt_err, |
76 | }; | 76 | }; |
77 | 77 | ||
78 | static const match_table_t tokens = { | 78 | static const match_table_t tokens = { |
79 | {Opt_size, "size=%s"}, | 79 | {Opt_size, "size=%s"}, |
80 | {Opt_nr_inodes, "nr_inodes=%s"}, | 80 | {Opt_nr_inodes, "nr_inodes=%s"}, |
81 | {Opt_mode, "mode=%o"}, | 81 | {Opt_mode, "mode=%o"}, |
82 | {Opt_uid, "uid=%u"}, | 82 | {Opt_uid, "uid=%u"}, |
83 | {Opt_gid, "gid=%u"}, | 83 | {Opt_gid, "gid=%u"}, |
84 | {Opt_pagesize, "pagesize=%s"}, | 84 | {Opt_pagesize, "pagesize=%s"}, |
85 | {Opt_err, NULL}, | 85 | {Opt_err, NULL}, |
86 | }; | 86 | }; |
87 | 87 | ||
88 | static void huge_pagevec_release(struct pagevec *pvec) | 88 | static void huge_pagevec_release(struct pagevec *pvec) |
89 | { | 89 | { |
90 | int i; | 90 | int i; |
91 | 91 | ||
92 | for (i = 0; i < pagevec_count(pvec); ++i) | 92 | for (i = 0; i < pagevec_count(pvec); ++i) |
93 | put_page(pvec->pages[i]); | 93 | put_page(pvec->pages[i]); |
94 | 94 | ||
95 | pagevec_reinit(pvec); | 95 | pagevec_reinit(pvec); |
96 | } | 96 | } |
97 | 97 | ||
98 | static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | 98 | static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) |
99 | { | 99 | { |
100 | struct inode *inode = file_inode(file); | 100 | struct inode *inode = file_inode(file); |
101 | loff_t len, vma_len; | 101 | loff_t len, vma_len; |
102 | int ret; | 102 | int ret; |
103 | struct hstate *h = hstate_file(file); | 103 | struct hstate *h = hstate_file(file); |
104 | 104 | ||
105 | /* | 105 | /* |
106 | * vma address alignment (but not the pgoff alignment) has | 106 | * vma address alignment (but not the pgoff alignment) has |
107 | * already been checked by prepare_hugepage_range. If you add | 107 | * already been checked by prepare_hugepage_range. If you add |
108 | * any error returns here, do so after setting VM_HUGETLB, so | 108 | * any error returns here, do so after setting VM_HUGETLB, so |
109 | * is_vm_hugetlb_page tests below unmap_region go the right | 109 | * is_vm_hugetlb_page tests below unmap_region go the right |
110 | * way when do_mmap_pgoff unwinds (may be important on powerpc | 110 | * way when do_mmap_pgoff unwinds (may be important on powerpc |
111 | * and ia64). | 111 | * and ia64). |
112 | */ | 112 | */ |
113 | vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; | 113 | vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; |
114 | vma->vm_ops = &hugetlb_vm_ops; | 114 | vma->vm_ops = &hugetlb_vm_ops; |
115 | 115 | ||
116 | if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) | 116 | if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) |
117 | return -EINVAL; | 117 | return -EINVAL; |
118 | 118 | ||
119 | vma_len = (loff_t)(vma->vm_end - vma->vm_start); | 119 | vma_len = (loff_t)(vma->vm_end - vma->vm_start); |
120 | 120 | ||
121 | mutex_lock(&inode->i_mutex); | 121 | mutex_lock(&inode->i_mutex); |
122 | file_accessed(file); | 122 | file_accessed(file); |
123 | 123 | ||
124 | ret = -ENOMEM; | 124 | ret = -ENOMEM; |
125 | len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 125 | len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
126 | 126 | ||
127 | if (hugetlb_reserve_pages(inode, | 127 | if (hugetlb_reserve_pages(inode, |
128 | vma->vm_pgoff >> huge_page_order(h), | 128 | vma->vm_pgoff >> huge_page_order(h), |
129 | len >> huge_page_shift(h), vma, | 129 | len >> huge_page_shift(h), vma, |
130 | vma->vm_flags)) | 130 | vma->vm_flags)) |
131 | goto out; | 131 | goto out; |
132 | 132 | ||
133 | ret = 0; | 133 | ret = 0; |
134 | hugetlb_prefault_arch_hook(vma->vm_mm); | 134 | hugetlb_prefault_arch_hook(vma->vm_mm); |
135 | if (vma->vm_flags & VM_WRITE && inode->i_size < len) | 135 | if (vma->vm_flags & VM_WRITE && inode->i_size < len) |
136 | inode->i_size = len; | 136 | inode->i_size = len; |
137 | out: | 137 | out: |
138 | mutex_unlock(&inode->i_mutex); | 138 | mutex_unlock(&inode->i_mutex); |
139 | 139 | ||
140 | return ret; | 140 | return ret; |
141 | } | 141 | } |
142 | 142 | ||
143 | /* | 143 | /* |
144 | * Called under down_write(mmap_sem). | 144 | * Called under down_write(mmap_sem). |
145 | */ | 145 | */ |
146 | 146 | ||
147 | #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | 147 | #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA |
148 | static unsigned long | 148 | static unsigned long |
149 | hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | 149 | hugetlb_get_unmapped_area(struct file *file, unsigned long addr, |
150 | unsigned long len, unsigned long pgoff, unsigned long flags) | 150 | unsigned long len, unsigned long pgoff, unsigned long flags) |
151 | { | 151 | { |
152 | struct mm_struct *mm = current->mm; | 152 | struct mm_struct *mm = current->mm; |
153 | struct vm_area_struct *vma; | 153 | struct vm_area_struct *vma; |
154 | struct hstate *h = hstate_file(file); | 154 | struct hstate *h = hstate_file(file); |
155 | struct vm_unmapped_area_info info; | 155 | struct vm_unmapped_area_info info; |
156 | 156 | ||
157 | if (len & ~huge_page_mask(h)) | 157 | if (len & ~huge_page_mask(h)) |
158 | return -EINVAL; | 158 | return -EINVAL; |
159 | if (len > TASK_SIZE) | 159 | if (len > TASK_SIZE) |
160 | return -ENOMEM; | 160 | return -ENOMEM; |
161 | 161 | ||
162 | if (flags & MAP_FIXED) { | 162 | if (flags & MAP_FIXED) { |
163 | if (prepare_hugepage_range(file, addr, len)) | 163 | if (prepare_hugepage_range(file, addr, len)) |
164 | return -EINVAL; | 164 | return -EINVAL; |
165 | return addr; | 165 | return addr; |
166 | } | 166 | } |
167 | 167 | ||
168 | if (addr) { | 168 | if (addr) { |
169 | addr = ALIGN(addr, huge_page_size(h)); | 169 | addr = ALIGN(addr, huge_page_size(h)); |
170 | vma = find_vma(mm, addr); | 170 | vma = find_vma(mm, addr); |
171 | if (TASK_SIZE - len >= addr && | 171 | if (TASK_SIZE - len >= addr && |
172 | (!vma || addr + len <= vma->vm_start)) | 172 | (!vma || addr + len <= vma->vm_start)) |
173 | return addr; | 173 | return addr; |
174 | } | 174 | } |
175 | 175 | ||
176 | info.flags = 0; | 176 | info.flags = 0; |
177 | info.length = len; | 177 | info.length = len; |
178 | info.low_limit = TASK_UNMAPPED_BASE; | 178 | info.low_limit = TASK_UNMAPPED_BASE; |
179 | info.high_limit = TASK_SIZE; | 179 | info.high_limit = TASK_SIZE; |
180 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); | 180 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
181 | info.align_offset = 0; | 181 | info.align_offset = 0; |
182 | return vm_unmapped_area(&info); | 182 | return vm_unmapped_area(&info); |
183 | } | 183 | } |
184 | #endif | 184 | #endif |
185 | 185 | ||
186 | static int | 186 | static int |
187 | hugetlbfs_read_actor(struct page *page, unsigned long offset, | 187 | hugetlbfs_read_actor(struct page *page, unsigned long offset, |
188 | char __user *buf, unsigned long count, | 188 | char __user *buf, unsigned long count, |
189 | unsigned long size) | 189 | unsigned long size) |
190 | { | 190 | { |
191 | char *kaddr; | 191 | char *kaddr; |
192 | unsigned long left, copied = 0; | 192 | unsigned long left, copied = 0; |
193 | int i, chunksize; | 193 | int i, chunksize; |
194 | 194 | ||
195 | if (size > count) | 195 | if (size > count) |
196 | size = count; | 196 | size = count; |
197 | 197 | ||
198 | /* Find which 4k chunk and offset with in that chunk */ | 198 | /* Find which 4k chunk and offset with in that chunk */ |
199 | i = offset >> PAGE_CACHE_SHIFT; | 199 | i = offset >> PAGE_CACHE_SHIFT; |
200 | offset = offset & ~PAGE_CACHE_MASK; | 200 | offset = offset & ~PAGE_CACHE_MASK; |
201 | 201 | ||
202 | while (size) { | 202 | while (size) { |
203 | chunksize = PAGE_CACHE_SIZE; | 203 | chunksize = PAGE_CACHE_SIZE; |
204 | if (offset) | 204 | if (offset) |
205 | chunksize -= offset; | 205 | chunksize -= offset; |
206 | if (chunksize > size) | 206 | if (chunksize > size) |
207 | chunksize = size; | 207 | chunksize = size; |
208 | kaddr = kmap(&page[i]); | 208 | kaddr = kmap(&page[i]); |
209 | left = __copy_to_user(buf, kaddr + offset, chunksize); | 209 | left = __copy_to_user(buf, kaddr + offset, chunksize); |
210 | kunmap(&page[i]); | 210 | kunmap(&page[i]); |
211 | if (left) { | 211 | if (left) { |
212 | copied += (chunksize - left); | 212 | copied += (chunksize - left); |
213 | break; | 213 | break; |
214 | } | 214 | } |
215 | offset = 0; | 215 | offset = 0; |
216 | size -= chunksize; | 216 | size -= chunksize; |
217 | buf += chunksize; | 217 | buf += chunksize; |
218 | copied += chunksize; | 218 | copied += chunksize; |
219 | i++; | 219 | i++; |
220 | } | 220 | } |
221 | return copied ? copied : -EFAULT; | 221 | return copied ? copied : -EFAULT; |
222 | } | 222 | } |
223 | 223 | ||
224 | /* | 224 | /* |
225 | * Support for read() - Find the page attached to f_mapping and copy out the | 225 | * Support for read() - Find the page attached to f_mapping and copy out the |
226 | * data. Its *very* similar to do_generic_mapping_read(), we can't use that | 226 | * data. Its *very* similar to do_generic_mapping_read(), we can't use that |
227 | * since it has PAGE_CACHE_SIZE assumptions. | 227 | * since it has PAGE_CACHE_SIZE assumptions. |
228 | */ | 228 | */ |
229 | static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | 229 | static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, |
230 | size_t len, loff_t *ppos) | 230 | size_t len, loff_t *ppos) |
231 | { | 231 | { |
232 | struct hstate *h = hstate_file(filp); | 232 | struct hstate *h = hstate_file(filp); |
233 | struct address_space *mapping = filp->f_mapping; | 233 | struct address_space *mapping = filp->f_mapping; |
234 | struct inode *inode = mapping->host; | 234 | struct inode *inode = mapping->host; |
235 | unsigned long index = *ppos >> huge_page_shift(h); | 235 | unsigned long index = *ppos >> huge_page_shift(h); |
236 | unsigned long offset = *ppos & ~huge_page_mask(h); | 236 | unsigned long offset = *ppos & ~huge_page_mask(h); |
237 | unsigned long end_index; | 237 | unsigned long end_index; |
238 | loff_t isize; | 238 | loff_t isize; |
239 | ssize_t retval = 0; | 239 | ssize_t retval = 0; |
240 | 240 | ||
241 | /* validate length */ | 241 | /* validate length */ |
242 | if (len == 0) | 242 | if (len == 0) |
243 | goto out; | 243 | goto out; |
244 | 244 | ||
245 | for (;;) { | 245 | for (;;) { |
246 | struct page *page; | 246 | struct page *page; |
247 | unsigned long nr, ret; | 247 | unsigned long nr, ret; |
248 | int ra; | 248 | int ra; |
249 | 249 | ||
250 | /* nr is the maximum number of bytes to copy from this page */ | 250 | /* nr is the maximum number of bytes to copy from this page */ |
251 | nr = huge_page_size(h); | 251 | nr = huge_page_size(h); |
252 | isize = i_size_read(inode); | 252 | isize = i_size_read(inode); |
253 | if (!isize) | 253 | if (!isize) |
254 | goto out; | 254 | goto out; |
255 | end_index = (isize - 1) >> huge_page_shift(h); | 255 | end_index = (isize - 1) >> huge_page_shift(h); |
256 | if (index >= end_index) { | 256 | if (index >= end_index) { |
257 | if (index > end_index) | 257 | if (index > end_index) |
258 | goto out; | 258 | goto out; |
259 | nr = ((isize - 1) & ~huge_page_mask(h)) + 1; | 259 | nr = ((isize - 1) & ~huge_page_mask(h)) + 1; |
260 | if (nr <= offset) | 260 | if (nr <= offset) |
261 | goto out; | 261 | goto out; |
262 | } | 262 | } |
263 | nr = nr - offset; | 263 | nr = nr - offset; |
264 | 264 | ||
265 | /* Find the page */ | 265 | /* Find the page */ |
266 | page = find_lock_page(mapping, index); | 266 | page = find_lock_page(mapping, index); |
267 | if (unlikely(page == NULL)) { | 267 | if (unlikely(page == NULL)) { |
268 | /* | 268 | /* |
269 | * We have a HOLE, zero out the user-buffer for the | 269 | * We have a HOLE, zero out the user-buffer for the |
270 | * length of the hole or request. | 270 | * length of the hole or request. |
271 | */ | 271 | */ |
272 | ret = len < nr ? len : nr; | 272 | ret = len < nr ? len : nr; |
273 | if (clear_user(buf, ret)) | 273 | if (clear_user(buf, ret)) |
274 | ra = -EFAULT; | 274 | ra = -EFAULT; |
275 | else | 275 | else |
276 | ra = 0; | 276 | ra = 0; |
277 | } else { | 277 | } else { |
278 | unlock_page(page); | 278 | unlock_page(page); |
279 | 279 | ||
280 | /* | 280 | /* |
281 | * We have the page, copy it to user space buffer. | 281 | * We have the page, copy it to user space buffer. |
282 | */ | 282 | */ |
283 | ra = hugetlbfs_read_actor(page, offset, buf, len, nr); | 283 | ra = hugetlbfs_read_actor(page, offset, buf, len, nr); |
284 | ret = ra; | 284 | ret = ra; |
285 | page_cache_release(page); | 285 | page_cache_release(page); |
286 | } | 286 | } |
287 | if (ra < 0) { | 287 | if (ra < 0) { |
288 | if (retval == 0) | 288 | if (retval == 0) |
289 | retval = ra; | 289 | retval = ra; |
290 | goto out; | 290 | goto out; |
291 | } | 291 | } |
292 | 292 | ||
293 | offset += ret; | 293 | offset += ret; |
294 | retval += ret; | 294 | retval += ret; |
295 | len -= ret; | 295 | len -= ret; |
296 | index += offset >> huge_page_shift(h); | 296 | index += offset >> huge_page_shift(h); |
297 | offset &= ~huge_page_mask(h); | 297 | offset &= ~huge_page_mask(h); |
298 | 298 | ||
299 | /* short read or no more work */ | 299 | /* short read or no more work */ |
300 | if ((ret != nr) || (len == 0)) | 300 | if ((ret != nr) || (len == 0)) |
301 | break; | 301 | break; |
302 | } | 302 | } |
303 | out: | 303 | out: |
304 | *ppos = ((loff_t)index << huge_page_shift(h)) + offset; | 304 | *ppos = ((loff_t)index << huge_page_shift(h)) + offset; |
305 | return retval; | 305 | return retval; |
306 | } | 306 | } |
307 | 307 | ||
308 | static int hugetlbfs_write_begin(struct file *file, | 308 | static int hugetlbfs_write_begin(struct file *file, |
309 | struct address_space *mapping, | 309 | struct address_space *mapping, |
310 | loff_t pos, unsigned len, unsigned flags, | 310 | loff_t pos, unsigned len, unsigned flags, |
311 | struct page **pagep, void **fsdata) | 311 | struct page **pagep, void **fsdata) |
312 | { | 312 | { |
313 | return -EINVAL; | 313 | return -EINVAL; |
314 | } | 314 | } |
315 | 315 | ||
316 | static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, | 316 | static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, |
317 | loff_t pos, unsigned len, unsigned copied, | 317 | loff_t pos, unsigned len, unsigned copied, |
318 | struct page *page, void *fsdata) | 318 | struct page *page, void *fsdata) |
319 | { | 319 | { |
320 | BUG(); | 320 | BUG(); |
321 | return -EINVAL; | 321 | return -EINVAL; |
322 | } | 322 | } |
323 | 323 | ||
324 | static void truncate_huge_page(struct page *page) | 324 | static void truncate_huge_page(struct page *page) |
325 | { | 325 | { |
326 | cancel_dirty_page(page, /* No IO accounting for huge pages? */0); | 326 | cancel_dirty_page(page, /* No IO accounting for huge pages? */0); |
327 | ClearPageUptodate(page); | 327 | ClearPageUptodate(page); |
328 | delete_from_page_cache(page); | 328 | delete_from_page_cache(page); |
329 | } | 329 | } |
330 | 330 | ||
331 | static void truncate_hugepages(struct inode *inode, loff_t lstart) | 331 | static void truncate_hugepages(struct inode *inode, loff_t lstart) |
332 | { | 332 | { |
333 | struct hstate *h = hstate_inode(inode); | 333 | struct hstate *h = hstate_inode(inode); |
334 | struct address_space *mapping = &inode->i_data; | 334 | struct address_space *mapping = &inode->i_data; |
335 | const pgoff_t start = lstart >> huge_page_shift(h); | 335 | const pgoff_t start = lstart >> huge_page_shift(h); |
336 | struct pagevec pvec; | 336 | struct pagevec pvec; |
337 | pgoff_t next; | 337 | pgoff_t next; |
338 | int i, freed = 0; | 338 | int i, freed = 0; |
339 | 339 | ||
340 | pagevec_init(&pvec, 0); | 340 | pagevec_init(&pvec, 0); |
341 | next = start; | 341 | next = start; |
342 | while (1) { | 342 | while (1) { |
343 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 343 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
344 | if (next == start) | 344 | if (next == start) |
345 | break; | 345 | break; |
346 | next = start; | 346 | next = start; |
347 | continue; | 347 | continue; |
348 | } | 348 | } |
349 | 349 | ||
350 | for (i = 0; i < pagevec_count(&pvec); ++i) { | 350 | for (i = 0; i < pagevec_count(&pvec); ++i) { |
351 | struct page *page = pvec.pages[i]; | 351 | struct page *page = pvec.pages[i]; |
352 | 352 | ||
353 | lock_page(page); | 353 | lock_page(page); |
354 | if (page->index > next) | 354 | if (page->index > next) |
355 | next = page->index; | 355 | next = page->index; |
356 | ++next; | 356 | ++next; |
357 | truncate_huge_page(page); | 357 | truncate_huge_page(page); |
358 | unlock_page(page); | 358 | unlock_page(page); |
359 | freed++; | 359 | freed++; |
360 | } | 360 | } |
361 | huge_pagevec_release(&pvec); | 361 | huge_pagevec_release(&pvec); |
362 | } | 362 | } |
363 | BUG_ON(!lstart && mapping->nrpages); | 363 | BUG_ON(!lstart && mapping->nrpages); |
364 | hugetlb_unreserve_pages(inode, start, freed); | 364 | hugetlb_unreserve_pages(inode, start, freed); |
365 | } | 365 | } |
366 | 366 | ||
367 | static void hugetlbfs_evict_inode(struct inode *inode) | 367 | static void hugetlbfs_evict_inode(struct inode *inode) |
368 | { | 368 | { |
369 | truncate_hugepages(inode, 0); | 369 | truncate_hugepages(inode, 0); |
370 | clear_inode(inode); | 370 | clear_inode(inode); |
371 | } | 371 | } |
372 | 372 | ||
373 | static inline void | 373 | static inline void |
374 | hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) | 374 | hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) |
375 | { | 375 | { |
376 | struct vm_area_struct *vma; | 376 | struct vm_area_struct *vma; |
377 | 377 | ||
378 | vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { | 378 | vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { |
379 | unsigned long v_offset; | 379 | unsigned long v_offset; |
380 | 380 | ||
381 | /* | 381 | /* |
382 | * Can the expression below overflow on 32-bit arches? | 382 | * Can the expression below overflow on 32-bit arches? |
383 | * No, because the interval tree returns us only those vmas | 383 | * No, because the interval tree returns us only those vmas |
384 | * which overlap the truncated area starting at pgoff, | 384 | * which overlap the truncated area starting at pgoff, |
385 | * and no vma on a 32-bit arch can span beyond the 4GB. | 385 | * and no vma on a 32-bit arch can span beyond the 4GB. |
386 | */ | 386 | */ |
387 | if (vma->vm_pgoff < pgoff) | 387 | if (vma->vm_pgoff < pgoff) |
388 | v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; | 388 | v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; |
389 | else | 389 | else |
390 | v_offset = 0; | 390 | v_offset = 0; |
391 | 391 | ||
392 | unmap_hugepage_range(vma, vma->vm_start + v_offset, | 392 | unmap_hugepage_range(vma, vma->vm_start + v_offset, |
393 | vma->vm_end, NULL); | 393 | vma->vm_end, NULL); |
394 | } | 394 | } |
395 | } | 395 | } |
396 | 396 | ||
397 | static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) | 397 | static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) |
398 | { | 398 | { |
399 | pgoff_t pgoff; | 399 | pgoff_t pgoff; |
400 | struct address_space *mapping = inode->i_mapping; | 400 | struct address_space *mapping = inode->i_mapping; |
401 | struct hstate *h = hstate_inode(inode); | 401 | struct hstate *h = hstate_inode(inode); |
402 | 402 | ||
403 | BUG_ON(offset & ~huge_page_mask(h)); | 403 | BUG_ON(offset & ~huge_page_mask(h)); |
404 | pgoff = offset >> PAGE_SHIFT; | 404 | pgoff = offset >> PAGE_SHIFT; |
405 | 405 | ||
406 | i_size_write(inode, offset); | 406 | i_size_write(inode, offset); |
407 | mutex_lock(&mapping->i_mmap_mutex); | 407 | mutex_lock(&mapping->i_mmap_mutex); |
408 | if (!RB_EMPTY_ROOT(&mapping->i_mmap)) | 408 | if (!RB_EMPTY_ROOT(&mapping->i_mmap)) |
409 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); | 409 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); |
410 | mutex_unlock(&mapping->i_mmap_mutex); | 410 | mutex_unlock(&mapping->i_mmap_mutex); |
411 | truncate_hugepages(inode, offset); | 411 | truncate_hugepages(inode, offset); |
412 | return 0; | 412 | return 0; |
413 | } | 413 | } |
414 | 414 | ||
415 | static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) | 415 | static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) |
416 | { | 416 | { |
417 | struct inode *inode = dentry->d_inode; | 417 | struct inode *inode = dentry->d_inode; |
418 | struct hstate *h = hstate_inode(inode); | 418 | struct hstate *h = hstate_inode(inode); |
419 | int error; | 419 | int error; |
420 | unsigned int ia_valid = attr->ia_valid; | 420 | unsigned int ia_valid = attr->ia_valid; |
421 | 421 | ||
422 | BUG_ON(!inode); | 422 | BUG_ON(!inode); |
423 | 423 | ||
424 | error = inode_change_ok(inode, attr); | 424 | error = inode_change_ok(inode, attr); |
425 | if (error) | 425 | if (error) |
426 | return error; | 426 | return error; |
427 | 427 | ||
428 | if (ia_valid & ATTR_SIZE) { | 428 | if (ia_valid & ATTR_SIZE) { |
429 | error = -EINVAL; | 429 | error = -EINVAL; |
430 | if (attr->ia_size & ~huge_page_mask(h)) | 430 | if (attr->ia_size & ~huge_page_mask(h)) |
431 | return -EINVAL; | 431 | return -EINVAL; |
432 | error = hugetlb_vmtruncate(inode, attr->ia_size); | 432 | error = hugetlb_vmtruncate(inode, attr->ia_size); |
433 | if (error) | 433 | if (error) |
434 | return error; | 434 | return error; |
435 | } | 435 | } |
436 | 436 | ||
437 | setattr_copy(inode, attr); | 437 | setattr_copy(inode, attr); |
438 | mark_inode_dirty(inode); | 438 | mark_inode_dirty(inode); |
439 | return 0; | 439 | return 0; |
440 | } | 440 | } |
441 | 441 | ||
442 | static struct inode *hugetlbfs_get_root(struct super_block *sb, | 442 | static struct inode *hugetlbfs_get_root(struct super_block *sb, |
443 | struct hugetlbfs_config *config) | 443 | struct hugetlbfs_config *config) |
444 | { | 444 | { |
445 | struct inode *inode; | 445 | struct inode *inode; |
446 | 446 | ||
447 | inode = new_inode(sb); | 447 | inode = new_inode(sb); |
448 | if (inode) { | 448 | if (inode) { |
449 | struct hugetlbfs_inode_info *info; | 449 | struct hugetlbfs_inode_info *info; |
450 | inode->i_ino = get_next_ino(); | 450 | inode->i_ino = get_next_ino(); |
451 | inode->i_mode = S_IFDIR | config->mode; | 451 | inode->i_mode = S_IFDIR | config->mode; |
452 | inode->i_uid = config->uid; | 452 | inode->i_uid = config->uid; |
453 | inode->i_gid = config->gid; | 453 | inode->i_gid = config->gid; |
454 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 454 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
455 | info = HUGETLBFS_I(inode); | 455 | info = HUGETLBFS_I(inode); |
456 | mpol_shared_policy_init(&info->policy, NULL); | 456 | mpol_shared_policy_init(&info->policy, NULL); |
457 | inode->i_op = &hugetlbfs_dir_inode_operations; | 457 | inode->i_op = &hugetlbfs_dir_inode_operations; |
458 | inode->i_fop = &simple_dir_operations; | 458 | inode->i_fop = &simple_dir_operations; |
459 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ | 459 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ |
460 | inc_nlink(inode); | 460 | inc_nlink(inode); |
461 | lockdep_annotate_inode_mutex_key(inode); | 461 | lockdep_annotate_inode_mutex_key(inode); |
462 | } | 462 | } |
463 | return inode; | 463 | return inode; |
464 | } | 464 | } |
465 | 465 | ||
466 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, | 466 | static struct inode *hugetlbfs_get_inode(struct super_block *sb, |
467 | struct inode *dir, | 467 | struct inode *dir, |
468 | umode_t mode, dev_t dev) | 468 | umode_t mode, dev_t dev) |
469 | { | 469 | { |
470 | struct inode *inode; | 470 | struct inode *inode; |
471 | 471 | ||
472 | inode = new_inode(sb); | 472 | inode = new_inode(sb); |
473 | if (inode) { | 473 | if (inode) { |
474 | struct hugetlbfs_inode_info *info; | 474 | struct hugetlbfs_inode_info *info; |
475 | inode->i_ino = get_next_ino(); | 475 | inode->i_ino = get_next_ino(); |
476 | inode_init_owner(inode, dir, mode); | 476 | inode_init_owner(inode, dir, mode); |
477 | inode->i_mapping->a_ops = &hugetlbfs_aops; | 477 | inode->i_mapping->a_ops = &hugetlbfs_aops; |
478 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; | 478 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; |
479 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 479 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
480 | INIT_LIST_HEAD(&inode->i_mapping->private_list); | 480 | INIT_LIST_HEAD(&inode->i_mapping->private_list); |
481 | info = HUGETLBFS_I(inode); | 481 | info = HUGETLBFS_I(inode); |
482 | /* | 482 | /* |
483 | * The policy is initialized here even if we are creating a | 483 | * The policy is initialized here even if we are creating a |
484 | * private inode because initialization simply creates an | 484 | * private inode because initialization simply creates an |
485 | * an empty rb tree and calls spin_lock_init(), later when we | 485 | * an empty rb tree and calls spin_lock_init(), later when we |
486 | * call mpol_free_shared_policy() it will just return because | 486 | * call mpol_free_shared_policy() it will just return because |
487 | * the rb tree will still be empty. | 487 | * the rb tree will still be empty. |
488 | */ | 488 | */ |
489 | mpol_shared_policy_init(&info->policy, NULL); | 489 | mpol_shared_policy_init(&info->policy, NULL); |
490 | switch (mode & S_IFMT) { | 490 | switch (mode & S_IFMT) { |
491 | default: | 491 | default: |
492 | init_special_inode(inode, mode, dev); | 492 | init_special_inode(inode, mode, dev); |
493 | break; | 493 | break; |
494 | case S_IFREG: | 494 | case S_IFREG: |
495 | inode->i_op = &hugetlbfs_inode_operations; | 495 | inode->i_op = &hugetlbfs_inode_operations; |
496 | inode->i_fop = &hugetlbfs_file_operations; | 496 | inode->i_fop = &hugetlbfs_file_operations; |
497 | break; | 497 | break; |
498 | case S_IFDIR: | 498 | case S_IFDIR: |
499 | inode->i_op = &hugetlbfs_dir_inode_operations; | 499 | inode->i_op = &hugetlbfs_dir_inode_operations; |
500 | inode->i_fop = &simple_dir_operations; | 500 | inode->i_fop = &simple_dir_operations; |
501 | 501 | ||
502 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ | 502 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ |
503 | inc_nlink(inode); | 503 | inc_nlink(inode); |
504 | break; | 504 | break; |
505 | case S_IFLNK: | 505 | case S_IFLNK: |
506 | inode->i_op = &page_symlink_inode_operations; | 506 | inode->i_op = &page_symlink_inode_operations; |
507 | break; | 507 | break; |
508 | } | 508 | } |
509 | lockdep_annotate_inode_mutex_key(inode); | 509 | lockdep_annotate_inode_mutex_key(inode); |
510 | } | 510 | } |
511 | return inode; | 511 | return inode; |
512 | } | 512 | } |
513 | 513 | ||
514 | /* | 514 | /* |
515 | * File creation. Allocate an inode, and we're done.. | 515 | * File creation. Allocate an inode, and we're done.. |
516 | */ | 516 | */ |
517 | static int hugetlbfs_mknod(struct inode *dir, | 517 | static int hugetlbfs_mknod(struct inode *dir, |
518 | struct dentry *dentry, umode_t mode, dev_t dev) | 518 | struct dentry *dentry, umode_t mode, dev_t dev) |
519 | { | 519 | { |
520 | struct inode *inode; | 520 | struct inode *inode; |
521 | int error = -ENOSPC; | 521 | int error = -ENOSPC; |
522 | 522 | ||
523 | inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); | 523 | inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); |
524 | if (inode) { | 524 | if (inode) { |
525 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 525 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
526 | d_instantiate(dentry, inode); | 526 | d_instantiate(dentry, inode); |
527 | dget(dentry); /* Extra count - pin the dentry in core */ | 527 | dget(dentry); /* Extra count - pin the dentry in core */ |
528 | error = 0; | 528 | error = 0; |
529 | } | 529 | } |
530 | return error; | 530 | return error; |
531 | } | 531 | } |
532 | 532 | ||
533 | static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | 533 | static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
534 | { | 534 | { |
535 | int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); | 535 | int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); |
536 | if (!retval) | 536 | if (!retval) |
537 | inc_nlink(dir); | 537 | inc_nlink(dir); |
538 | return retval; | 538 | return retval; |
539 | } | 539 | } |
540 | 540 | ||
541 | static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) | 541 | static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) |
542 | { | 542 | { |
543 | return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); | 543 | return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); |
544 | } | 544 | } |
545 | 545 | ||
546 | static int hugetlbfs_symlink(struct inode *dir, | 546 | static int hugetlbfs_symlink(struct inode *dir, |
547 | struct dentry *dentry, const char *symname) | 547 | struct dentry *dentry, const char *symname) |
548 | { | 548 | { |
549 | struct inode *inode; | 549 | struct inode *inode; |
550 | int error = -ENOSPC; | 550 | int error = -ENOSPC; |
551 | 551 | ||
552 | inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); | 552 | inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); |
553 | if (inode) { | 553 | if (inode) { |
554 | int l = strlen(symname)+1; | 554 | int l = strlen(symname)+1; |
555 | error = page_symlink(inode, symname, l); | 555 | error = page_symlink(inode, symname, l); |
556 | if (!error) { | 556 | if (!error) { |
557 | d_instantiate(dentry, inode); | 557 | d_instantiate(dentry, inode); |
558 | dget(dentry); | 558 | dget(dentry); |
559 | } else | 559 | } else |
560 | iput(inode); | 560 | iput(inode); |
561 | } | 561 | } |
562 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 562 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
563 | 563 | ||
564 | return error; | 564 | return error; |
565 | } | 565 | } |
566 | 566 | ||
567 | /* | 567 | /* |
568 | * mark the head page dirty | 568 | * mark the head page dirty |
569 | */ | 569 | */ |
570 | static int hugetlbfs_set_page_dirty(struct page *page) | 570 | static int hugetlbfs_set_page_dirty(struct page *page) |
571 | { | 571 | { |
572 | struct page *head = compound_head(page); | 572 | struct page *head = compound_head(page); |
573 | 573 | ||
574 | SetPageDirty(head); | 574 | SetPageDirty(head); |
575 | return 0; | 575 | return 0; |
576 | } | 576 | } |
577 | 577 | ||
578 | static int hugetlbfs_migrate_page(struct address_space *mapping, | 578 | static int hugetlbfs_migrate_page(struct address_space *mapping, |
579 | struct page *newpage, struct page *page, | 579 | struct page *newpage, struct page *page, |
580 | enum migrate_mode mode) | 580 | enum migrate_mode mode) |
581 | { | 581 | { |
582 | int rc; | 582 | int rc; |
583 | 583 | ||
584 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); | 584 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); |
585 | if (rc != MIGRATEPAGE_SUCCESS) | 585 | if (rc != MIGRATEPAGE_SUCCESS) |
586 | return rc; | 586 | return rc; |
587 | migrate_page_copy(newpage, page); | 587 | migrate_page_copy(newpage, page); |
588 | 588 | ||
589 | return MIGRATEPAGE_SUCCESS; | 589 | return MIGRATEPAGE_SUCCESS; |
590 | } | 590 | } |
591 | 591 | ||
592 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 592 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
593 | { | 593 | { |
594 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); | 594 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); |
595 | struct hstate *h = hstate_inode(dentry->d_inode); | 595 | struct hstate *h = hstate_inode(dentry->d_inode); |
596 | 596 | ||
597 | buf->f_type = HUGETLBFS_MAGIC; | 597 | buf->f_type = HUGETLBFS_MAGIC; |
598 | buf->f_bsize = huge_page_size(h); | 598 | buf->f_bsize = huge_page_size(h); |
599 | if (sbinfo) { | 599 | if (sbinfo) { |
600 | spin_lock(&sbinfo->stat_lock); | 600 | spin_lock(&sbinfo->stat_lock); |
601 | /* If no limits set, just report 0 for max/free/used | 601 | /* If no limits set, just report 0 for max/free/used |
602 | * blocks, like simple_statfs() */ | 602 | * blocks, like simple_statfs() */ |
603 | if (sbinfo->spool) { | 603 | if (sbinfo->spool) { |
604 | long free_pages; | 604 | long free_pages; |
605 | 605 | ||
606 | spin_lock(&sbinfo->spool->lock); | 606 | spin_lock(&sbinfo->spool->lock); |
607 | buf->f_blocks = sbinfo->spool->max_hpages; | 607 | buf->f_blocks = sbinfo->spool->max_hpages; |
608 | free_pages = sbinfo->spool->max_hpages | 608 | free_pages = sbinfo->spool->max_hpages |
609 | - sbinfo->spool->used_hpages; | 609 | - sbinfo->spool->used_hpages; |
610 | buf->f_bavail = buf->f_bfree = free_pages; | 610 | buf->f_bavail = buf->f_bfree = free_pages; |
611 | spin_unlock(&sbinfo->spool->lock); | 611 | spin_unlock(&sbinfo->spool->lock); |
612 | buf->f_files = sbinfo->max_inodes; | 612 | buf->f_files = sbinfo->max_inodes; |
613 | buf->f_ffree = sbinfo->free_inodes; | 613 | buf->f_ffree = sbinfo->free_inodes; |
614 | } | 614 | } |
615 | spin_unlock(&sbinfo->stat_lock); | 615 | spin_unlock(&sbinfo->stat_lock); |
616 | } | 616 | } |
617 | buf->f_namelen = NAME_MAX; | 617 | buf->f_namelen = NAME_MAX; |
618 | return 0; | 618 | return 0; |
619 | } | 619 | } |
620 | 620 | ||
621 | static void hugetlbfs_put_super(struct super_block *sb) | 621 | static void hugetlbfs_put_super(struct super_block *sb) |
622 | { | 622 | { |
623 | struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); | 623 | struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); |
624 | 624 | ||
625 | if (sbi) { | 625 | if (sbi) { |
626 | sb->s_fs_info = NULL; | 626 | sb->s_fs_info = NULL; |
627 | 627 | ||
628 | if (sbi->spool) | 628 | if (sbi->spool) |
629 | hugepage_put_subpool(sbi->spool); | 629 | hugepage_put_subpool(sbi->spool); |
630 | 630 | ||
631 | kfree(sbi); | 631 | kfree(sbi); |
632 | } | 632 | } |
633 | } | 633 | } |
634 | 634 | ||
635 | static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) | 635 | static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) |
636 | { | 636 | { |
637 | if (sbinfo->free_inodes >= 0) { | 637 | if (sbinfo->free_inodes >= 0) { |
638 | spin_lock(&sbinfo->stat_lock); | 638 | spin_lock(&sbinfo->stat_lock); |
639 | if (unlikely(!sbinfo->free_inodes)) { | 639 | if (unlikely(!sbinfo->free_inodes)) { |
640 | spin_unlock(&sbinfo->stat_lock); | 640 | spin_unlock(&sbinfo->stat_lock); |
641 | return 0; | 641 | return 0; |
642 | } | 642 | } |
643 | sbinfo->free_inodes--; | 643 | sbinfo->free_inodes--; |
644 | spin_unlock(&sbinfo->stat_lock); | 644 | spin_unlock(&sbinfo->stat_lock); |
645 | } | 645 | } |
646 | 646 | ||
647 | return 1; | 647 | return 1; |
648 | } | 648 | } |
649 | 649 | ||
650 | static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) | 650 | static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) |
651 | { | 651 | { |
652 | if (sbinfo->free_inodes >= 0) { | 652 | if (sbinfo->free_inodes >= 0) { |
653 | spin_lock(&sbinfo->stat_lock); | 653 | spin_lock(&sbinfo->stat_lock); |
654 | sbinfo->free_inodes++; | 654 | sbinfo->free_inodes++; |
655 | spin_unlock(&sbinfo->stat_lock); | 655 | spin_unlock(&sbinfo->stat_lock); |
656 | } | 656 | } |
657 | } | 657 | } |
658 | 658 | ||
659 | 659 | ||
660 | static struct kmem_cache *hugetlbfs_inode_cachep; | 660 | static struct kmem_cache *hugetlbfs_inode_cachep; |
661 | 661 | ||
662 | static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) | 662 | static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) |
663 | { | 663 | { |
664 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); | 664 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); |
665 | struct hugetlbfs_inode_info *p; | 665 | struct hugetlbfs_inode_info *p; |
666 | 666 | ||
667 | if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) | 667 | if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) |
668 | return NULL; | 668 | return NULL; |
669 | p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); | 669 | p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); |
670 | if (unlikely(!p)) { | 670 | if (unlikely(!p)) { |
671 | hugetlbfs_inc_free_inodes(sbinfo); | 671 | hugetlbfs_inc_free_inodes(sbinfo); |
672 | return NULL; | 672 | return NULL; |
673 | } | 673 | } |
674 | return &p->vfs_inode; | 674 | return &p->vfs_inode; |
675 | } | 675 | } |
676 | 676 | ||
677 | static void hugetlbfs_i_callback(struct rcu_head *head) | 677 | static void hugetlbfs_i_callback(struct rcu_head *head) |
678 | { | 678 | { |
679 | struct inode *inode = container_of(head, struct inode, i_rcu); | 679 | struct inode *inode = container_of(head, struct inode, i_rcu); |
680 | kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); | 680 | kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); |
681 | } | 681 | } |
682 | 682 | ||
683 | static void hugetlbfs_destroy_inode(struct inode *inode) | 683 | static void hugetlbfs_destroy_inode(struct inode *inode) |
684 | { | 684 | { |
685 | hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); | 685 | hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); |
686 | mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); | 686 | mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); |
687 | call_rcu(&inode->i_rcu, hugetlbfs_i_callback); | 687 | call_rcu(&inode->i_rcu, hugetlbfs_i_callback); |
688 | } | 688 | } |
689 | 689 | ||
690 | static const struct address_space_operations hugetlbfs_aops = { | 690 | static const struct address_space_operations hugetlbfs_aops = { |
691 | .write_begin = hugetlbfs_write_begin, | 691 | .write_begin = hugetlbfs_write_begin, |
692 | .write_end = hugetlbfs_write_end, | 692 | .write_end = hugetlbfs_write_end, |
693 | .set_page_dirty = hugetlbfs_set_page_dirty, | 693 | .set_page_dirty = hugetlbfs_set_page_dirty, |
694 | .migratepage = hugetlbfs_migrate_page, | 694 | .migratepage = hugetlbfs_migrate_page, |
695 | }; | 695 | }; |
696 | 696 | ||
697 | 697 | ||
698 | static void init_once(void *foo) | 698 | static void init_once(void *foo) |
699 | { | 699 | { |
700 | struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; | 700 | struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; |
701 | 701 | ||
702 | inode_init_once(&ei->vfs_inode); | 702 | inode_init_once(&ei->vfs_inode); |
703 | } | 703 | } |
704 | 704 | ||
705 | const struct file_operations hugetlbfs_file_operations = { | 705 | const struct file_operations hugetlbfs_file_operations = { |
706 | .read = hugetlbfs_read, | 706 | .read = hugetlbfs_read, |
707 | .mmap = hugetlbfs_file_mmap, | 707 | .mmap = hugetlbfs_file_mmap, |
708 | .fsync = noop_fsync, | 708 | .fsync = noop_fsync, |
709 | .get_unmapped_area = hugetlb_get_unmapped_area, | 709 | .get_unmapped_area = hugetlb_get_unmapped_area, |
710 | .llseek = default_llseek, | 710 | .llseek = default_llseek, |
711 | }; | 711 | }; |
712 | 712 | ||
713 | static const struct inode_operations hugetlbfs_dir_inode_operations = { | 713 | static const struct inode_operations hugetlbfs_dir_inode_operations = { |
714 | .create = hugetlbfs_create, | 714 | .create = hugetlbfs_create, |
715 | .lookup = simple_lookup, | 715 | .lookup = simple_lookup, |
716 | .link = simple_link, | 716 | .link = simple_link, |
717 | .unlink = simple_unlink, | 717 | .unlink = simple_unlink, |
718 | .symlink = hugetlbfs_symlink, | 718 | .symlink = hugetlbfs_symlink, |
719 | .mkdir = hugetlbfs_mkdir, | 719 | .mkdir = hugetlbfs_mkdir, |
720 | .rmdir = simple_rmdir, | 720 | .rmdir = simple_rmdir, |
721 | .mknod = hugetlbfs_mknod, | 721 | .mknod = hugetlbfs_mknod, |
722 | .rename = simple_rename, | 722 | .rename = simple_rename, |
723 | .setattr = hugetlbfs_setattr, | 723 | .setattr = hugetlbfs_setattr, |
724 | }; | 724 | }; |
725 | 725 | ||
726 | static const struct inode_operations hugetlbfs_inode_operations = { | 726 | static const struct inode_operations hugetlbfs_inode_operations = { |
727 | .setattr = hugetlbfs_setattr, | 727 | .setattr = hugetlbfs_setattr, |
728 | }; | 728 | }; |
729 | 729 | ||
730 | static const struct super_operations hugetlbfs_ops = { | 730 | static const struct super_operations hugetlbfs_ops = { |
731 | .alloc_inode = hugetlbfs_alloc_inode, | 731 | .alloc_inode = hugetlbfs_alloc_inode, |
732 | .destroy_inode = hugetlbfs_destroy_inode, | 732 | .destroy_inode = hugetlbfs_destroy_inode, |
733 | .evict_inode = hugetlbfs_evict_inode, | 733 | .evict_inode = hugetlbfs_evict_inode, |
734 | .statfs = hugetlbfs_statfs, | 734 | .statfs = hugetlbfs_statfs, |
735 | .put_super = hugetlbfs_put_super, | 735 | .put_super = hugetlbfs_put_super, |
736 | .show_options = generic_show_options, | 736 | .show_options = generic_show_options, |
737 | }; | 737 | }; |
738 | 738 | ||
739 | static int | 739 | static int |
740 | hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) | 740 | hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) |
741 | { | 741 | { |
742 | char *p, *rest; | 742 | char *p, *rest; |
743 | substring_t args[MAX_OPT_ARGS]; | 743 | substring_t args[MAX_OPT_ARGS]; |
744 | int option; | 744 | int option; |
745 | unsigned long long size = 0; | 745 | unsigned long long size = 0; |
746 | enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; | 746 | enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; |
747 | 747 | ||
748 | if (!options) | 748 | if (!options) |
749 | return 0; | 749 | return 0; |
750 | 750 | ||
751 | while ((p = strsep(&options, ",")) != NULL) { | 751 | while ((p = strsep(&options, ",")) != NULL) { |
752 | int token; | 752 | int token; |
753 | if (!*p) | 753 | if (!*p) |
754 | continue; | 754 | continue; |
755 | 755 | ||
756 | token = match_token(p, tokens, args); | 756 | token = match_token(p, tokens, args); |
757 | switch (token) { | 757 | switch (token) { |
758 | case Opt_uid: | 758 | case Opt_uid: |
759 | if (match_int(&args[0], &option)) | 759 | if (match_int(&args[0], &option)) |
760 | goto bad_val; | 760 | goto bad_val; |
761 | pconfig->uid = make_kuid(current_user_ns(), option); | 761 | pconfig->uid = make_kuid(current_user_ns(), option); |
762 | if (!uid_valid(pconfig->uid)) | 762 | if (!uid_valid(pconfig->uid)) |
763 | goto bad_val; | 763 | goto bad_val; |
764 | break; | 764 | break; |
765 | 765 | ||
766 | case Opt_gid: | 766 | case Opt_gid: |
767 | if (match_int(&args[0], &option)) | 767 | if (match_int(&args[0], &option)) |
768 | goto bad_val; | 768 | goto bad_val; |
769 | pconfig->gid = make_kgid(current_user_ns(), option); | 769 | pconfig->gid = make_kgid(current_user_ns(), option); |
770 | if (!gid_valid(pconfig->gid)) | 770 | if (!gid_valid(pconfig->gid)) |
771 | goto bad_val; | 771 | goto bad_val; |
772 | break; | 772 | break; |
773 | 773 | ||
774 | case Opt_mode: | 774 | case Opt_mode: |
775 | if (match_octal(&args[0], &option)) | 775 | if (match_octal(&args[0], &option)) |
776 | goto bad_val; | 776 | goto bad_val; |
777 | pconfig->mode = option & 01777U; | 777 | pconfig->mode = option & 01777U; |
778 | break; | 778 | break; |
779 | 779 | ||
780 | case Opt_size: { | 780 | case Opt_size: { |
781 | /* memparse() will accept a K/M/G without a digit */ | 781 | /* memparse() will accept a K/M/G without a digit */ |
782 | if (!isdigit(*args[0].from)) | 782 | if (!isdigit(*args[0].from)) |
783 | goto bad_val; | 783 | goto bad_val; |
784 | size = memparse(args[0].from, &rest); | 784 | size = memparse(args[0].from, &rest); |
785 | setsize = SIZE_STD; | 785 | setsize = SIZE_STD; |
786 | if (*rest == '%') | 786 | if (*rest == '%') |
787 | setsize = SIZE_PERCENT; | 787 | setsize = SIZE_PERCENT; |
788 | break; | 788 | break; |
789 | } | 789 | } |
790 | 790 | ||
791 | case Opt_nr_inodes: | 791 | case Opt_nr_inodes: |
792 | /* memparse() will accept a K/M/G without a digit */ | 792 | /* memparse() will accept a K/M/G without a digit */ |
793 | if (!isdigit(*args[0].from)) | 793 | if (!isdigit(*args[0].from)) |
794 | goto bad_val; | 794 | goto bad_val; |
795 | pconfig->nr_inodes = memparse(args[0].from, &rest); | 795 | pconfig->nr_inodes = memparse(args[0].from, &rest); |
796 | break; | 796 | break; |
797 | 797 | ||
798 | case Opt_pagesize: { | 798 | case Opt_pagesize: { |
799 | unsigned long ps; | 799 | unsigned long ps; |
800 | ps = memparse(args[0].from, &rest); | 800 | ps = memparse(args[0].from, &rest); |
801 | pconfig->hstate = size_to_hstate(ps); | 801 | pconfig->hstate = size_to_hstate(ps); |
802 | if (!pconfig->hstate) { | 802 | if (!pconfig->hstate) { |
803 | printk(KERN_ERR | 803 | printk(KERN_ERR |
804 | "hugetlbfs: Unsupported page size %lu MB\n", | 804 | "hugetlbfs: Unsupported page size %lu MB\n", |
805 | ps >> 20); | 805 | ps >> 20); |
806 | return -EINVAL; | 806 | return -EINVAL; |
807 | } | 807 | } |
808 | break; | 808 | break; |
809 | } | 809 | } |
810 | 810 | ||
811 | default: | 811 | default: |
812 | printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", | 812 | printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", |
813 | p); | 813 | p); |
814 | return -EINVAL; | 814 | return -EINVAL; |
815 | break; | 815 | break; |
816 | } | 816 | } |
817 | } | 817 | } |
818 | 818 | ||
819 | /* Do size after hstate is set up */ | 819 | /* Do size after hstate is set up */ |
820 | if (setsize > NO_SIZE) { | 820 | if (setsize > NO_SIZE) { |
821 | struct hstate *h = pconfig->hstate; | 821 | struct hstate *h = pconfig->hstate; |
822 | if (setsize == SIZE_PERCENT) { | 822 | if (setsize == SIZE_PERCENT) { |
823 | size <<= huge_page_shift(h); | 823 | size <<= huge_page_shift(h); |
824 | size *= h->max_huge_pages; | 824 | size *= h->max_huge_pages; |
825 | do_div(size, 100); | 825 | do_div(size, 100); |
826 | } | 826 | } |
827 | pconfig->nr_blocks = (size >> huge_page_shift(h)); | 827 | pconfig->nr_blocks = (size >> huge_page_shift(h)); |
828 | } | 828 | } |
829 | 829 | ||
830 | return 0; | 830 | return 0; |
831 | 831 | ||
832 | bad_val: | 832 | bad_val: |
833 | printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", | 833 | printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", |
834 | args[0].from, p); | 834 | args[0].from, p); |
835 | return -EINVAL; | 835 | return -EINVAL; |
836 | } | 836 | } |
837 | 837 | ||
838 | static int | 838 | static int |
839 | hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) | 839 | hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) |
840 | { | 840 | { |
841 | int ret; | 841 | int ret; |
842 | struct hugetlbfs_config config; | 842 | struct hugetlbfs_config config; |
843 | struct hugetlbfs_sb_info *sbinfo; | 843 | struct hugetlbfs_sb_info *sbinfo; |
844 | 844 | ||
845 | save_mount_options(sb, data); | 845 | save_mount_options(sb, data); |
846 | 846 | ||
847 | config.nr_blocks = -1; /* No limit on size by default */ | 847 | config.nr_blocks = -1; /* No limit on size by default */ |
848 | config.nr_inodes = -1; /* No limit on number of inodes by default */ | 848 | config.nr_inodes = -1; /* No limit on number of inodes by default */ |
849 | config.uid = current_fsuid(); | 849 | config.uid = current_fsuid(); |
850 | config.gid = current_fsgid(); | 850 | config.gid = current_fsgid(); |
851 | config.mode = 0755; | 851 | config.mode = 0755; |
852 | config.hstate = &default_hstate; | 852 | config.hstate = &default_hstate; |
853 | ret = hugetlbfs_parse_options(data, &config); | 853 | ret = hugetlbfs_parse_options(data, &config); |
854 | if (ret) | 854 | if (ret) |
855 | return ret; | 855 | return ret; |
856 | 856 | ||
857 | sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); | 857 | sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); |
858 | if (!sbinfo) | 858 | if (!sbinfo) |
859 | return -ENOMEM; | 859 | return -ENOMEM; |
860 | sb->s_fs_info = sbinfo; | 860 | sb->s_fs_info = sbinfo; |
861 | sbinfo->hstate = config.hstate; | 861 | sbinfo->hstate = config.hstate; |
862 | spin_lock_init(&sbinfo->stat_lock); | 862 | spin_lock_init(&sbinfo->stat_lock); |
863 | sbinfo->max_inodes = config.nr_inodes; | 863 | sbinfo->max_inodes = config.nr_inodes; |
864 | sbinfo->free_inodes = config.nr_inodes; | 864 | sbinfo->free_inodes = config.nr_inodes; |
865 | sbinfo->spool = NULL; | 865 | sbinfo->spool = NULL; |
866 | if (config.nr_blocks != -1) { | 866 | if (config.nr_blocks != -1) { |
867 | sbinfo->spool = hugepage_new_subpool(config.nr_blocks); | 867 | sbinfo->spool = hugepage_new_subpool(config.nr_blocks); |
868 | if (!sbinfo->spool) | 868 | if (!sbinfo->spool) |
869 | goto out_free; | 869 | goto out_free; |
870 | } | 870 | } |
871 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 871 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
872 | sb->s_blocksize = huge_page_size(config.hstate); | 872 | sb->s_blocksize = huge_page_size(config.hstate); |
873 | sb->s_blocksize_bits = huge_page_shift(config.hstate); | 873 | sb->s_blocksize_bits = huge_page_shift(config.hstate); |
874 | sb->s_magic = HUGETLBFS_MAGIC; | 874 | sb->s_magic = HUGETLBFS_MAGIC; |
875 | sb->s_op = &hugetlbfs_ops; | 875 | sb->s_op = &hugetlbfs_ops; |
876 | sb->s_time_gran = 1; | 876 | sb->s_time_gran = 1; |
877 | sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); | 877 | sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); |
878 | if (!sb->s_root) | 878 | if (!sb->s_root) |
879 | goto out_free; | 879 | goto out_free; |
880 | return 0; | 880 | return 0; |
881 | out_free: | 881 | out_free: |
882 | if (sbinfo->spool) | 882 | if (sbinfo->spool) |
883 | kfree(sbinfo->spool); | 883 | kfree(sbinfo->spool); |
884 | kfree(sbinfo); | 884 | kfree(sbinfo); |
885 | return -ENOMEM; | 885 | return -ENOMEM; |
886 | } | 886 | } |
887 | 887 | ||
888 | static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, | 888 | static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, |
889 | int flags, const char *dev_name, void *data) | 889 | int flags, const char *dev_name, void *data) |
890 | { | 890 | { |
891 | return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); | 891 | return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); |
892 | } | 892 | } |
893 | 893 | ||
894 | static struct file_system_type hugetlbfs_fs_type = { | 894 | static struct file_system_type hugetlbfs_fs_type = { |
895 | .name = "hugetlbfs", | 895 | .name = "hugetlbfs", |
896 | .mount = hugetlbfs_mount, | 896 | .mount = hugetlbfs_mount, |
897 | .kill_sb = kill_litter_super, | 897 | .kill_sb = kill_litter_super, |
898 | }; | 898 | }; |
899 | MODULE_ALIAS_FS("hugetlbfs"); | 899 | MODULE_ALIAS_FS("hugetlbfs"); |
900 | 900 | ||
901 | static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; | 901 | static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; |
902 | 902 | ||
903 | static int can_do_hugetlb_shm(void) | 903 | static int can_do_hugetlb_shm(void) |
904 | { | 904 | { |
905 | kgid_t shm_group; | 905 | kgid_t shm_group; |
906 | shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); | 906 | shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); |
907 | return capable(CAP_IPC_LOCK) || in_group_p(shm_group); | 907 | return capable(CAP_IPC_LOCK) || in_group_p(shm_group); |
908 | } | 908 | } |
909 | 909 | ||
910 | static int get_hstate_idx(int page_size_log) | 910 | static int get_hstate_idx(int page_size_log) |
911 | { | 911 | { |
912 | struct hstate *h; | 912 | struct hstate *h = hstate_sizelog(page_size_log); |
913 | 913 | ||
914 | if (!page_size_log) | ||
915 | return default_hstate_idx; | ||
916 | h = size_to_hstate(1 << page_size_log); | ||
917 | if (!h) | 914 | if (!h) |
918 | return -1; | 915 | return -1; |
919 | return h - hstates; | 916 | return h - hstates; |
920 | } | 917 | } |
921 | 918 | ||
922 | static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) | 919 | static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) |
923 | { | 920 | { |
924 | return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", | 921 | return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", |
925 | dentry->d_name.name); | 922 | dentry->d_name.name); |
926 | } | 923 | } |
927 | 924 | ||
928 | static struct dentry_operations anon_ops = { | 925 | static struct dentry_operations anon_ops = { |
929 | .d_dname = hugetlb_dname | 926 | .d_dname = hugetlb_dname |
930 | }; | 927 | }; |
931 | 928 | ||
932 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 929 | /* |
933 | size_t size, vm_flags_t acctflag, | 930 | * Note that size should be aligned to proper hugepage size in caller side, |
934 | struct user_struct **user, | 931 | * otherwise hugetlb_reserve_pages reserves one fewer hugepage than intended. |
932 | */ | ||
933 | struct file *hugetlb_file_setup(const char *name, size_t size, | ||
934 | vm_flags_t acctflag, struct user_struct **user, | ||
935 | int creat_flags, int page_size_log) | 935 | int creat_flags, int page_size_log) |
936 | { | 936 | { |
937 | struct file *file = ERR_PTR(-ENOMEM); | 937 | struct file *file = ERR_PTR(-ENOMEM); |
938 | struct inode *inode; | 938 | struct inode *inode; |
939 | struct path path; | 939 | struct path path; |
940 | struct super_block *sb; | 940 | struct super_block *sb; |
941 | struct qstr quick_string; | 941 | struct qstr quick_string; |
942 | struct hstate *hstate; | ||
943 | unsigned long num_pages; | ||
944 | int hstate_idx; | 942 | int hstate_idx; |
945 | 943 | ||
946 | hstate_idx = get_hstate_idx(page_size_log); | 944 | hstate_idx = get_hstate_idx(page_size_log); |
947 | if (hstate_idx < 0) | 945 | if (hstate_idx < 0) |
948 | return ERR_PTR(-ENODEV); | 946 | return ERR_PTR(-ENODEV); |
949 | 947 | ||
950 | *user = NULL; | 948 | *user = NULL; |
951 | if (!hugetlbfs_vfsmount[hstate_idx]) | 949 | if (!hugetlbfs_vfsmount[hstate_idx]) |
952 | return ERR_PTR(-ENOENT); | 950 | return ERR_PTR(-ENOENT); |
953 | 951 | ||
954 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { | 952 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { |
955 | *user = current_user(); | 953 | *user = current_user(); |
956 | if (user_shm_lock(size, *user)) { | 954 | if (user_shm_lock(size, *user)) { |
957 | task_lock(current); | 955 | task_lock(current); |
958 | printk_once(KERN_WARNING | 956 | printk_once(KERN_WARNING |
959 | "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", | 957 | "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", |
960 | current->comm, current->pid); | 958 | current->comm, current->pid); |
961 | task_unlock(current); | 959 | task_unlock(current); |
962 | } else { | 960 | } else { |
963 | *user = NULL; | 961 | *user = NULL; |
964 | return ERR_PTR(-EPERM); | 962 | return ERR_PTR(-EPERM); |
965 | } | 963 | } |
966 | } | 964 | } |
967 | 965 | ||
968 | sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; | 966 | sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; |
969 | quick_string.name = name; | 967 | quick_string.name = name; |
970 | quick_string.len = strlen(quick_string.name); | 968 | quick_string.len = strlen(quick_string.name); |
971 | quick_string.hash = 0; | 969 | quick_string.hash = 0; |
972 | path.dentry = d_alloc_pseudo(sb, &quick_string); | 970 | path.dentry = d_alloc_pseudo(sb, &quick_string); |
973 | if (!path.dentry) | 971 | if (!path.dentry) |
974 | goto out_shm_unlock; | 972 | goto out_shm_unlock; |
975 | 973 | ||
976 | d_set_d_op(path.dentry, &anon_ops); | 974 | d_set_d_op(path.dentry, &anon_ops); |
977 | path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); | 975 | path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); |
978 | file = ERR_PTR(-ENOSPC); | 976 | file = ERR_PTR(-ENOSPC); |
979 | inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); | 977 | inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); |
980 | if (!inode) | 978 | if (!inode) |
981 | goto out_dentry; | 979 | goto out_dentry; |
982 | 980 | ||
983 | hstate = hstate_inode(inode); | ||
984 | size += addr & ~huge_page_mask(hstate); | ||
985 | num_pages = ALIGN(size, huge_page_size(hstate)) >> | ||
986 | huge_page_shift(hstate); | ||
987 | file = ERR_PTR(-ENOMEM); | 981 | file = ERR_PTR(-ENOMEM); |
988 | if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) | 982 | if (hugetlb_reserve_pages(inode, 0, |
983 | size >> huge_page_shift(hstate_inode(inode)), NULL, | ||
984 | acctflag)) | ||
989 | goto out_inode; | 985 | goto out_inode; |
990 | 986 | ||
991 | d_instantiate(path.dentry, inode); | 987 | d_instantiate(path.dentry, inode); |
992 | inode->i_size = size; | 988 | inode->i_size = size; |
993 | clear_nlink(inode); | 989 | clear_nlink(inode); |
994 | 990 | ||
995 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | 991 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, |
996 | &hugetlbfs_file_operations); | 992 | &hugetlbfs_file_operations); |
997 | if (IS_ERR(file)) | 993 | if (IS_ERR(file)) |
998 | goto out_dentry; /* inode is already attached */ | 994 | goto out_dentry; /* inode is already attached */ |
999 | 995 | ||
1000 | return file; | 996 | return file; |
1001 | 997 | ||
1002 | out_inode: | 998 | out_inode: |
1003 | iput(inode); | 999 | iput(inode); |
1004 | out_dentry: | 1000 | out_dentry: |
1005 | path_put(&path); | 1001 | path_put(&path); |
1006 | out_shm_unlock: | 1002 | out_shm_unlock: |
1007 | if (*user) { | 1003 | if (*user) { |
1008 | user_shm_unlock(size, *user); | 1004 | user_shm_unlock(size, *user); |
1009 | *user = NULL; | 1005 | *user = NULL; |
1010 | } | 1006 | } |
1011 | return file; | 1007 | return file; |
1012 | } | 1008 | } |
1013 | 1009 | ||
1014 | static int __init init_hugetlbfs_fs(void) | 1010 | static int __init init_hugetlbfs_fs(void) |
1015 | { | 1011 | { |
1016 | struct hstate *h; | 1012 | struct hstate *h; |
1017 | int error; | 1013 | int error; |
1018 | int i; | 1014 | int i; |
1019 | 1015 | ||
1020 | error = bdi_init(&hugetlbfs_backing_dev_info); | 1016 | error = bdi_init(&hugetlbfs_backing_dev_info); |
1021 | if (error) | 1017 | if (error) |
1022 | return error; | 1018 | return error; |
1023 | 1019 | ||
1024 | error = -ENOMEM; | 1020 | error = -ENOMEM; |
1025 | hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", | 1021 | hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", |
1026 | sizeof(struct hugetlbfs_inode_info), | 1022 | sizeof(struct hugetlbfs_inode_info), |
1027 | 0, 0, init_once); | 1023 | 0, 0, init_once); |
1028 | if (hugetlbfs_inode_cachep == NULL) | 1024 | if (hugetlbfs_inode_cachep == NULL) |
1029 | goto out2; | 1025 | goto out2; |
1030 | 1026 | ||
1031 | error = register_filesystem(&hugetlbfs_fs_type); | 1027 | error = register_filesystem(&hugetlbfs_fs_type); |
1032 | if (error) | 1028 | if (error) |
1033 | goto out; | 1029 | goto out; |
1034 | 1030 | ||
1035 | i = 0; | 1031 | i = 0; |
1036 | for_each_hstate(h) { | 1032 | for_each_hstate(h) { |
1037 | char buf[50]; | 1033 | char buf[50]; |
1038 | unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); | 1034 | unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); |
1039 | 1035 | ||
1040 | snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); | 1036 | snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); |
1041 | hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, | 1037 | hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, |
1042 | buf); | 1038 | buf); |
1043 | 1039 | ||
1044 | if (IS_ERR(hugetlbfs_vfsmount[i])) { | 1040 | if (IS_ERR(hugetlbfs_vfsmount[i])) { |
1045 | pr_err("hugetlb: Cannot mount internal hugetlbfs for " | 1041 | pr_err("hugetlb: Cannot mount internal hugetlbfs for " |
1046 | "page size %uK", ps_kb); | 1042 | "page size %uK", ps_kb); |
1047 | error = PTR_ERR(hugetlbfs_vfsmount[i]); | 1043 | error = PTR_ERR(hugetlbfs_vfsmount[i]); |
1048 | hugetlbfs_vfsmount[i] = NULL; | 1044 | hugetlbfs_vfsmount[i] = NULL; |
1049 | } | 1045 | } |
1050 | i++; | 1046 | i++; |
1051 | } | 1047 | } |
1052 | /* Non default hstates are optional */ | 1048 | /* Non default hstates are optional */ |
1053 | if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) | 1049 | if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) |
1054 | return 0; | 1050 | return 0; |
1055 | 1051 | ||
1056 | out: | 1052 | out: |
1057 | kmem_cache_destroy(hugetlbfs_inode_cachep); | 1053 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
1058 | out2: | 1054 | out2: |
1059 | bdi_destroy(&hugetlbfs_backing_dev_info); | 1055 | bdi_destroy(&hugetlbfs_backing_dev_info); |
1060 | return error; | 1056 | return error; |
1061 | } | 1057 | } |
1062 | 1058 | ||
1063 | static void __exit exit_hugetlbfs_fs(void) | 1059 | static void __exit exit_hugetlbfs_fs(void) |
1064 | { | 1060 | { |
1065 | struct hstate *h; | 1061 | struct hstate *h; |
1066 | int i; | 1062 | int i; |
1067 | 1063 | ||
1068 | 1064 | ||
1069 | /* | 1065 | /* |
1070 | * Make sure all delayed rcu free inodes are flushed before we | 1066 | * Make sure all delayed rcu free inodes are flushed before we |
1071 | * destroy cache. | 1067 | * destroy cache. |
1072 | */ | 1068 | */ |
1073 | rcu_barrier(); | 1069 | rcu_barrier(); |
1074 | kmem_cache_destroy(hugetlbfs_inode_cachep); | 1070 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
1075 | i = 0; | 1071 | i = 0; |
1076 | for_each_hstate(h) | 1072 | for_each_hstate(h) |
1077 | kern_unmount(hugetlbfs_vfsmount[i++]); | 1073 | kern_unmount(hugetlbfs_vfsmount[i++]); |
1078 | unregister_filesystem(&hugetlbfs_fs_type); | 1074 | unregister_filesystem(&hugetlbfs_fs_type); |
1079 | bdi_destroy(&hugetlbfs_backing_dev_info); | 1075 | bdi_destroy(&hugetlbfs_backing_dev_info); |
1080 | } | 1076 | } |
1081 | 1077 |
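The comment added above hugetlb_file_setup() says the size must already be aligned to the huge page size on the caller side, because the reservation is now computed directly as size >> huge_page_shift(hstate_inode(inode)). A minimal standalone C sketch of that arithmetic follows; it is illustrative only (not kernel code), and the 2 MB page size and the helper name hugepage_align() are assumptions made for the example.

/*
 * Illustrative userspace sketch (not kernel code): why an unaligned length
 * must be rounded up to the huge page boundary before the reservation is
 * computed as size >> huge_page_shift. Page size is assumed to be 2 MB.
 */
#include <stdio.h>
#include <stddef.h>

#define HPAGE_SHIFT 21UL                     /* assumed 2 MB huge pages */
#define HPAGE_SIZE  (1UL << HPAGE_SHIFT)
#define HPAGE_MASK  (~(HPAGE_SIZE - 1))

/* Round len up to the next huge page boundary, as the caller must now do. */
static size_t hugepage_align(size_t len)
{
	return (len + HPAGE_SIZE - 1) & HPAGE_MASK;
}

int main(void)
{
	size_t len = 3 * HPAGE_SIZE + 4096;  /* unaligned request */

	/* Truncating shift: a 3-pages-plus request reserves only 3 pages. */
	printf("unaligned: %zu pages\n", len >> HPAGE_SHIFT);

	/* With caller-side alignment: 4 pages are reserved, covering the request. */
	printf("aligned:   %zu pages\n", hugepage_align(len) >> HPAGE_SHIFT);
	return 0;
}

Run as-is it prints 3 and then 4, which is precisely the off-by-one reservation the new comment warns about.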
include/linux/hugetlb.h
1 | #ifndef _LINUX_HUGETLB_H | 1 | #ifndef _LINUX_HUGETLB_H |
2 | #define _LINUX_HUGETLB_H | 2 | #define _LINUX_HUGETLB_H |
3 | 3 | ||
4 | #include <linux/mm_types.h> | 4 | #include <linux/mm_types.h> |
5 | #include <linux/fs.h> | 5 | #include <linux/fs.h> |
6 | #include <linux/hugetlb_inline.h> | 6 | #include <linux/hugetlb_inline.h> |
7 | #include <linux/cgroup.h> | 7 | #include <linux/cgroup.h> |
8 | 8 | ||
9 | struct ctl_table; | 9 | struct ctl_table; |
10 | struct user_struct; | 10 | struct user_struct; |
11 | struct mmu_gather; | 11 | struct mmu_gather; |
12 | 12 | ||
13 | #ifdef CONFIG_HUGETLB_PAGE | 13 | #ifdef CONFIG_HUGETLB_PAGE |
14 | 14 | ||
15 | #include <linux/mempolicy.h> | 15 | #include <linux/mempolicy.h> |
16 | #include <linux/shm.h> | 16 | #include <linux/shm.h> |
17 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
18 | 18 | ||
19 | struct hugepage_subpool { | 19 | struct hugepage_subpool { |
20 | spinlock_t lock; | 20 | spinlock_t lock; |
21 | long count; | 21 | long count; |
22 | long max_hpages, used_hpages; | 22 | long max_hpages, used_hpages; |
23 | }; | 23 | }; |
24 | 24 | ||
25 | extern spinlock_t hugetlb_lock; | 25 | extern spinlock_t hugetlb_lock; |
26 | extern int hugetlb_max_hstate __read_mostly; | 26 | extern int hugetlb_max_hstate __read_mostly; |
27 | #define for_each_hstate(h) \ | 27 | #define for_each_hstate(h) \ |
28 | for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) | 28 | for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) |
29 | 29 | ||
30 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); | 30 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); |
31 | void hugepage_put_subpool(struct hugepage_subpool *spool); | 31 | void hugepage_put_subpool(struct hugepage_subpool *spool); |
32 | 32 | ||
33 | int PageHuge(struct page *page); | 33 | int PageHuge(struct page *page); |
34 | 34 | ||
35 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma); | 35 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma); |
36 | int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); | 36 | int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); |
37 | int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); | 37 | int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); |
38 | int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); | 38 | int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); |
39 | 39 | ||
40 | #ifdef CONFIG_NUMA | 40 | #ifdef CONFIG_NUMA |
41 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, | 41 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, |
42 | void __user *, size_t *, loff_t *); | 42 | void __user *, size_t *, loff_t *); |
43 | #endif | 43 | #endif |
44 | 44 | ||
45 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); | 45 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); |
46 | long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, | 46 | long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, |
47 | struct page **, struct vm_area_struct **, | 47 | struct page **, struct vm_area_struct **, |
48 | unsigned long *, unsigned long *, long, unsigned int); | 48 | unsigned long *, unsigned long *, long, unsigned int); |
49 | void unmap_hugepage_range(struct vm_area_struct *, | 49 | void unmap_hugepage_range(struct vm_area_struct *, |
50 | unsigned long, unsigned long, struct page *); | 50 | unsigned long, unsigned long, struct page *); |
51 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | 51 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, |
52 | struct vm_area_struct *vma, | 52 | struct vm_area_struct *vma, |
53 | unsigned long start, unsigned long end, | 53 | unsigned long start, unsigned long end, |
54 | struct page *ref_page); | 54 | struct page *ref_page); |
55 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | 55 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, |
56 | unsigned long start, unsigned long end, | 56 | unsigned long start, unsigned long end, |
57 | struct page *ref_page); | 57 | struct page *ref_page); |
58 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); | 58 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); |
59 | void hugetlb_report_meminfo(struct seq_file *); | 59 | void hugetlb_report_meminfo(struct seq_file *); |
60 | int hugetlb_report_node_meminfo(int, char *); | 60 | int hugetlb_report_node_meminfo(int, char *); |
61 | void hugetlb_show_meminfo(void); | 61 | void hugetlb_show_meminfo(void); |
62 | unsigned long hugetlb_total_pages(void); | 62 | unsigned long hugetlb_total_pages(void); |
63 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 63 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
64 | unsigned long address, unsigned int flags); | 64 | unsigned long address, unsigned int flags); |
65 | int hugetlb_reserve_pages(struct inode *inode, long from, long to, | 65 | int hugetlb_reserve_pages(struct inode *inode, long from, long to, |
66 | struct vm_area_struct *vma, | 66 | struct vm_area_struct *vma, |
67 | vm_flags_t vm_flags); | 67 | vm_flags_t vm_flags); |
68 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); | 68 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); |
69 | int dequeue_hwpoisoned_huge_page(struct page *page); | 69 | int dequeue_hwpoisoned_huge_page(struct page *page); |
70 | void copy_huge_page(struct page *dst, struct page *src); | 70 | void copy_huge_page(struct page *dst, struct page *src); |
71 | 71 | ||
72 | extern unsigned long hugepages_treat_as_movable; | 72 | extern unsigned long hugepages_treat_as_movable; |
73 | extern const unsigned long hugetlb_zero, hugetlb_infinity; | 73 | extern const unsigned long hugetlb_zero, hugetlb_infinity; |
74 | extern int sysctl_hugetlb_shm_group; | 74 | extern int sysctl_hugetlb_shm_group; |
75 | extern struct list_head huge_boot_pages; | 75 | extern struct list_head huge_boot_pages; |
76 | 76 | ||
77 | /* arch callbacks */ | 77 | /* arch callbacks */ |
78 | 78 | ||
79 | pte_t *huge_pte_alloc(struct mm_struct *mm, | 79 | pte_t *huge_pte_alloc(struct mm_struct *mm, |
80 | unsigned long addr, unsigned long sz); | 80 | unsigned long addr, unsigned long sz); |
81 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); | 81 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); |
82 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); | 82 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); |
83 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | 83 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, |
84 | int write); | 84 | int write); |
85 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 85 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
86 | pmd_t *pmd, int write); | 86 | pmd_t *pmd, int write); |
87 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, | 87 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, |
88 | pud_t *pud, int write); | 88 | pud_t *pud, int write); |
89 | int pmd_huge(pmd_t pmd); | 89 | int pmd_huge(pmd_t pmd); |
90 | int pud_huge(pud_t pmd); | 90 | int pud_huge(pud_t pmd); |
91 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | 91 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
92 | unsigned long address, unsigned long end, pgprot_t newprot); | 92 | unsigned long address, unsigned long end, pgprot_t newprot); |
93 | 93 | ||
94 | #else /* !CONFIG_HUGETLB_PAGE */ | 94 | #else /* !CONFIG_HUGETLB_PAGE */ |
95 | 95 | ||
96 | static inline int PageHuge(struct page *page) | 96 | static inline int PageHuge(struct page *page) |
97 | { | 97 | { |
98 | return 0; | 98 | return 0; |
99 | } | 99 | } |
100 | 100 | ||
101 | static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | 101 | static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
102 | { | 102 | { |
103 | } | 103 | } |
104 | 104 | ||
105 | static inline unsigned long hugetlb_total_pages(void) | 105 | static inline unsigned long hugetlb_total_pages(void) |
106 | { | 106 | { |
107 | return 0; | 107 | return 0; |
108 | } | 108 | } |
109 | 109 | ||
110 | #define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; }) | 110 | #define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; }) |
111 | #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) | 111 | #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) |
112 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) | 112 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) |
113 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) | 113 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) |
114 | static inline void hugetlb_report_meminfo(struct seq_file *m) | 114 | static inline void hugetlb_report_meminfo(struct seq_file *m) |
115 | { | 115 | { |
116 | } | 116 | } |
117 | #define hugetlb_report_node_meminfo(n, buf) 0 | 117 | #define hugetlb_report_node_meminfo(n, buf) 0 |
118 | static inline void hugetlb_show_meminfo(void) | 118 | static inline void hugetlb_show_meminfo(void) |
119 | { | 119 | { |
120 | } | 120 | } |
121 | #define follow_huge_pmd(mm, addr, pmd, write) NULL | 121 | #define follow_huge_pmd(mm, addr, pmd, write) NULL |
122 | #define follow_huge_pud(mm, addr, pud, write) NULL | 122 | #define follow_huge_pud(mm, addr, pud, write) NULL |
123 | #define prepare_hugepage_range(file, addr, len) (-EINVAL) | 123 | #define prepare_hugepage_range(file, addr, len) (-EINVAL) |
124 | #define pmd_huge(x) 0 | 124 | #define pmd_huge(x) 0 |
125 | #define pud_huge(x) 0 | 125 | #define pud_huge(x) 0 |
126 | #define is_hugepage_only_range(mm, addr, len) 0 | 126 | #define is_hugepage_only_range(mm, addr, len) 0 |
127 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) | 127 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) |
128 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) | 128 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) |
129 | #define huge_pte_offset(mm, address) 0 | 129 | #define huge_pte_offset(mm, address) 0 |
130 | static inline int dequeue_hwpoisoned_huge_page(struct page *page) | 130 | static inline int dequeue_hwpoisoned_huge_page(struct page *page) |
131 | { | 131 | { |
132 | return 0; | 132 | return 0; |
133 | } | 133 | } |
134 | 134 | ||
135 | static inline void copy_huge_page(struct page *dst, struct page *src) | 135 | static inline void copy_huge_page(struct page *dst, struct page *src) |
136 | { | 136 | { |
137 | } | 137 | } |
138 | 138 | ||
139 | static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | 139 | static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
140 | unsigned long address, unsigned long end, pgprot_t newprot) | 140 | unsigned long address, unsigned long end, pgprot_t newprot) |
141 | { | 141 | { |
142 | return 0; | 142 | return 0; |
143 | } | 143 | } |
144 | 144 | ||
145 | static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, | 145 | static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, |
146 | struct vm_area_struct *vma, unsigned long start, | 146 | struct vm_area_struct *vma, unsigned long start, |
147 | unsigned long end, struct page *ref_page) | 147 | unsigned long end, struct page *ref_page) |
148 | { | 148 | { |
149 | BUG(); | 149 | BUG(); |
150 | } | 150 | } |
151 | 151 | ||
152 | static inline void __unmap_hugepage_range(struct mmu_gather *tlb, | 152 | static inline void __unmap_hugepage_range(struct mmu_gather *tlb, |
153 | struct vm_area_struct *vma, unsigned long start, | 153 | struct vm_area_struct *vma, unsigned long start, |
154 | unsigned long end, struct page *ref_page) | 154 | unsigned long end, struct page *ref_page) |
155 | { | 155 | { |
156 | BUG(); | 156 | BUG(); |
157 | } | 157 | } |
158 | 158 | ||
159 | #endif /* !CONFIG_HUGETLB_PAGE */ | 159 | #endif /* !CONFIG_HUGETLB_PAGE */ |
160 | 160 | ||
161 | #define HUGETLB_ANON_FILE "anon_hugepage" | 161 | #define HUGETLB_ANON_FILE "anon_hugepage" |
162 | 162 | ||
163 | enum { | 163 | enum { |
164 | /* | 164 | /* |
165 | * The file will be used as an shm file so shmfs accounting rules | 165 | * The file will be used as an shm file so shmfs accounting rules |
166 | * apply | 166 | * apply |
167 | */ | 167 | */ |
168 | HUGETLB_SHMFS_INODE = 1, | 168 | HUGETLB_SHMFS_INODE = 1, |
169 | /* | 169 | /* |
170 | * The file is being created on the internal vfs mount and shmfs | 170 | * The file is being created on the internal vfs mount and shmfs |
171 | * accounting rules do not apply | 171 | * accounting rules do not apply |
172 | */ | 172 | */ |
173 | HUGETLB_ANONHUGE_INODE = 2, | 173 | HUGETLB_ANONHUGE_INODE = 2, |
174 | }; | 174 | }; |
175 | 175 | ||
176 | #ifdef CONFIG_HUGETLBFS | 176 | #ifdef CONFIG_HUGETLBFS |
177 | struct hugetlbfs_sb_info { | 177 | struct hugetlbfs_sb_info { |
178 | long max_inodes; /* inodes allowed */ | 178 | long max_inodes; /* inodes allowed */ |
179 | long free_inodes; /* inodes free */ | 179 | long free_inodes; /* inodes free */ |
180 | spinlock_t stat_lock; | 180 | spinlock_t stat_lock; |
181 | struct hstate *hstate; | 181 | struct hstate *hstate; |
182 | struct hugepage_subpool *spool; | 182 | struct hugepage_subpool *spool; |
183 | }; | 183 | }; |
184 | 184 | ||
185 | static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) | 185 | static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) |
186 | { | 186 | { |
187 | return sb->s_fs_info; | 187 | return sb->s_fs_info; |
188 | } | 188 | } |
189 | 189 | ||
190 | extern const struct file_operations hugetlbfs_file_operations; | 190 | extern const struct file_operations hugetlbfs_file_operations; |
191 | extern const struct vm_operations_struct hugetlb_vm_ops; | 191 | extern const struct vm_operations_struct hugetlb_vm_ops; |
192 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 192 | struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, |
193 | size_t size, vm_flags_t acct, | ||
194 | struct user_struct **user, int creat_flags, | 193 | struct user_struct **user, int creat_flags, |
195 | int page_size_log); | 194 | int page_size_log); |
196 | 195 | ||
197 | static inline int is_file_hugepages(struct file *file) | 196 | static inline int is_file_hugepages(struct file *file) |
198 | { | 197 | { |
199 | if (file->f_op == &hugetlbfs_file_operations) | 198 | if (file->f_op == &hugetlbfs_file_operations) |
200 | return 1; | 199 | return 1; |
201 | if (is_file_shm_hugepages(file)) | 200 | if (is_file_shm_hugepages(file)) |
202 | return 1; | 201 | return 1; |
203 | 202 | ||
204 | return 0; | 203 | return 0; |
205 | } | 204 | } |
206 | 205 | ||
207 | 206 | ||
208 | #else /* !CONFIG_HUGETLBFS */ | 207 | #else /* !CONFIG_HUGETLBFS */ |
209 | 208 | ||
210 | #define is_file_hugepages(file) 0 | 209 | #define is_file_hugepages(file) 0 |
211 | static inline struct file * | 210 | static inline struct file * |
212 | hugetlb_file_setup(const char *name, unsigned long addr, size_t size, | 211 | hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, |
213 | vm_flags_t acctflag, struct user_struct **user, int creat_flags, | 212 | struct user_struct **user, int creat_flags, |
214 | int page_size_log) | 213 | int page_size_log) |
215 | { | 214 | { |
216 | return ERR_PTR(-ENOSYS); | 215 | return ERR_PTR(-ENOSYS); |
217 | } | 216 | } |
218 | 217 | ||
219 | #endif /* !CONFIG_HUGETLBFS */ | 218 | #endif /* !CONFIG_HUGETLBFS */ |
220 | 219 | ||
221 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | 220 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA |
222 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | 221 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, |
223 | unsigned long len, unsigned long pgoff, | 222 | unsigned long len, unsigned long pgoff, |
224 | unsigned long flags); | 223 | unsigned long flags); |
225 | #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ | 224 | #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ |
226 | 225 | ||
227 | #ifdef CONFIG_HUGETLB_PAGE | 226 | #ifdef CONFIG_HUGETLB_PAGE |
228 | 227 | ||
229 | #define HSTATE_NAME_LEN 32 | 228 | #define HSTATE_NAME_LEN 32 |
230 | /* Defines one hugetlb page size */ | 229 | /* Defines one hugetlb page size */ |
231 | struct hstate { | 230 | struct hstate { |
232 | int next_nid_to_alloc; | 231 | int next_nid_to_alloc; |
233 | int next_nid_to_free; | 232 | int next_nid_to_free; |
234 | unsigned int order; | 233 | unsigned int order; |
235 | unsigned long mask; | 234 | unsigned long mask; |
236 | unsigned long max_huge_pages; | 235 | unsigned long max_huge_pages; |
237 | unsigned long nr_huge_pages; | 236 | unsigned long nr_huge_pages; |
238 | unsigned long free_huge_pages; | 237 | unsigned long free_huge_pages; |
239 | unsigned long resv_huge_pages; | 238 | unsigned long resv_huge_pages; |
240 | unsigned long surplus_huge_pages; | 239 | unsigned long surplus_huge_pages; |
241 | unsigned long nr_overcommit_huge_pages; | 240 | unsigned long nr_overcommit_huge_pages; |
242 | struct list_head hugepage_activelist; | 241 | struct list_head hugepage_activelist; |
243 | struct list_head hugepage_freelists[MAX_NUMNODES]; | 242 | struct list_head hugepage_freelists[MAX_NUMNODES]; |
244 | unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 243 | unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
245 | unsigned int free_huge_pages_node[MAX_NUMNODES]; | 244 | unsigned int free_huge_pages_node[MAX_NUMNODES]; |
246 | unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | 245 | unsigned int surplus_huge_pages_node[MAX_NUMNODES]; |
247 | #ifdef CONFIG_CGROUP_HUGETLB | 246 | #ifdef CONFIG_CGROUP_HUGETLB |
248 | /* cgroup control files */ | 247 | /* cgroup control files */ |
249 | struct cftype cgroup_files[5]; | 248 | struct cftype cgroup_files[5]; |
250 | #endif | 249 | #endif |
251 | char name[HSTATE_NAME_LEN]; | 250 | char name[HSTATE_NAME_LEN]; |
252 | }; | 251 | }; |
253 | 252 | ||
254 | struct huge_bootmem_page { | 253 | struct huge_bootmem_page { |
255 | struct list_head list; | 254 | struct list_head list; |
256 | struct hstate *hstate; | 255 | struct hstate *hstate; |
257 | #ifdef CONFIG_HIGHMEM | 256 | #ifdef CONFIG_HIGHMEM |
258 | phys_addr_t phys; | 257 | phys_addr_t phys; |
259 | #endif | 258 | #endif |
260 | }; | 259 | }; |
261 | 260 | ||
262 | struct page *alloc_huge_page_node(struct hstate *h, int nid); | 261 | struct page *alloc_huge_page_node(struct hstate *h, int nid); |
263 | 262 | ||
264 | /* arch callback */ | 263 | /* arch callback */ |
265 | int __init alloc_bootmem_huge_page(struct hstate *h); | 264 | int __init alloc_bootmem_huge_page(struct hstate *h); |
266 | 265 | ||
267 | void __init hugetlb_add_hstate(unsigned order); | 266 | void __init hugetlb_add_hstate(unsigned order); |
268 | struct hstate *size_to_hstate(unsigned long size); | 267 | struct hstate *size_to_hstate(unsigned long size); |
269 | 268 | ||
270 | #ifndef HUGE_MAX_HSTATE | 269 | #ifndef HUGE_MAX_HSTATE |
271 | #define HUGE_MAX_HSTATE 1 | 270 | #define HUGE_MAX_HSTATE 1 |
272 | #endif | 271 | #endif |
273 | 272 | ||
274 | extern struct hstate hstates[HUGE_MAX_HSTATE]; | 273 | extern struct hstate hstates[HUGE_MAX_HSTATE]; |
275 | extern unsigned int default_hstate_idx; | 274 | extern unsigned int default_hstate_idx; |
276 | 275 | ||
277 | #define default_hstate (hstates[default_hstate_idx]) | 276 | #define default_hstate (hstates[default_hstate_idx]) |
278 | 277 | ||
279 | static inline struct hstate *hstate_inode(struct inode *i) | 278 | static inline struct hstate *hstate_inode(struct inode *i) |
280 | { | 279 | { |
281 | struct hugetlbfs_sb_info *hsb; | 280 | struct hugetlbfs_sb_info *hsb; |
282 | hsb = HUGETLBFS_SB(i->i_sb); | 281 | hsb = HUGETLBFS_SB(i->i_sb); |
283 | return hsb->hstate; | 282 | return hsb->hstate; |
284 | } | 283 | } |
285 | 284 | ||
286 | static inline struct hstate *hstate_file(struct file *f) | 285 | static inline struct hstate *hstate_file(struct file *f) |
287 | { | 286 | { |
288 | return hstate_inode(file_inode(f)); | 287 | return hstate_inode(file_inode(f)); |
289 | } | 288 | } |
290 | 289 | ||
290 | static inline struct hstate *hstate_sizelog(int page_size_log) | ||
291 | { | ||
292 | if (!page_size_log) | ||
293 | return &default_hstate; | ||
294 | return size_to_hstate(1 << page_size_log); | ||
295 | } | ||
296 | |||
291 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) | 297 | static inline struct hstate *hstate_vma(struct vm_area_struct *vma) |
292 | { | 298 | { |
293 | return hstate_file(vma->vm_file); | 299 | return hstate_file(vma->vm_file); |
294 | } | 300 | } |
295 | 301 | ||
296 | static inline unsigned long huge_page_size(struct hstate *h) | 302 | static inline unsigned long huge_page_size(struct hstate *h) |
297 | { | 303 | { |
298 | return (unsigned long)PAGE_SIZE << h->order; | 304 | return (unsigned long)PAGE_SIZE << h->order; |
299 | } | 305 | } |
300 | 306 | ||
301 | extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); | 307 | extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); |
302 | 308 | ||
303 | extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); | 309 | extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); |
304 | 310 | ||
305 | static inline unsigned long huge_page_mask(struct hstate *h) | 311 | static inline unsigned long huge_page_mask(struct hstate *h) |
306 | { | 312 | { |
307 | return h->mask; | 313 | return h->mask; |
308 | } | 314 | } |
309 | 315 | ||
310 | static inline unsigned int huge_page_order(struct hstate *h) | 316 | static inline unsigned int huge_page_order(struct hstate *h) |
311 | { | 317 | { |
312 | return h->order; | 318 | return h->order; |
313 | } | 319 | } |
314 | 320 | ||
315 | static inline unsigned huge_page_shift(struct hstate *h) | 321 | static inline unsigned huge_page_shift(struct hstate *h) |
316 | { | 322 | { |
317 | return h->order + PAGE_SHIFT; | 323 | return h->order + PAGE_SHIFT; |
318 | } | 324 | } |
319 | 325 | ||
320 | static inline unsigned int pages_per_huge_page(struct hstate *h) | 326 | static inline unsigned int pages_per_huge_page(struct hstate *h) |
321 | { | 327 | { |
322 | return 1 << h->order; | 328 | return 1 << h->order; |
323 | } | 329 | } |
324 | 330 | ||
325 | static inline unsigned int blocks_per_huge_page(struct hstate *h) | 331 | static inline unsigned int blocks_per_huge_page(struct hstate *h) |
326 | { | 332 | { |
327 | return huge_page_size(h) / 512; | 333 | return huge_page_size(h) / 512; |
328 | } | 334 | } |
329 | 335 | ||
330 | #include <asm/hugetlb.h> | 336 | #include <asm/hugetlb.h> |
331 | 337 | ||
332 | #ifndef arch_make_huge_pte | 338 | #ifndef arch_make_huge_pte |
333 | static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, | 339 | static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, |
334 | struct page *page, int writable) | 340 | struct page *page, int writable) |
335 | { | 341 | { |
336 | return entry; | 342 | return entry; |
337 | } | 343 | } |
338 | #endif | 344 | #endif |
339 | 345 | ||
340 | static inline struct hstate *page_hstate(struct page *page) | 346 | static inline struct hstate *page_hstate(struct page *page) |
341 | { | 347 | { |
342 | return size_to_hstate(PAGE_SIZE << compound_order(page)); | 348 | return size_to_hstate(PAGE_SIZE << compound_order(page)); |
343 | } | 349 | } |
344 | 350 | ||
345 | static inline unsigned hstate_index_to_shift(unsigned index) | 351 | static inline unsigned hstate_index_to_shift(unsigned index) |
346 | { | 352 | { |
347 | return hstates[index].order + PAGE_SHIFT; | 353 | return hstates[index].order + PAGE_SHIFT; |
348 | } | 354 | } |
349 | 355 | ||
350 | static inline int hstate_index(struct hstate *h) | 356 | static inline int hstate_index(struct hstate *h) |
351 | { | 357 | { |
352 | return h - hstates; | 358 | return h - hstates; |
353 | } | 359 | } |
354 | 360 | ||
355 | #else | 361 | #else /* CONFIG_HUGETLB_PAGE */ |
356 | struct hstate {}; | 362 | struct hstate {}; |
357 | #define alloc_huge_page_node(h, nid) NULL | 363 | #define alloc_huge_page_node(h, nid) NULL |
358 | #define alloc_bootmem_huge_page(h) NULL | 364 | #define alloc_bootmem_huge_page(h) NULL |
359 | #define hstate_file(f) NULL | 365 | #define hstate_file(f) NULL |
366 | #define hstate_sizelog(s) NULL | ||
360 | #define hstate_vma(v) NULL | 367 | #define hstate_vma(v) NULL |
361 | #define hstate_inode(i) NULL | 368 | #define hstate_inode(i) NULL |
362 | #define huge_page_size(h) PAGE_SIZE | 369 | #define huge_page_size(h) PAGE_SIZE |
363 | #define huge_page_mask(h) PAGE_MASK | 370 | #define huge_page_mask(h) PAGE_MASK |
364 | #define vma_kernel_pagesize(v) PAGE_SIZE | 371 | #define vma_kernel_pagesize(v) PAGE_SIZE |
365 | #define vma_mmu_pagesize(v) PAGE_SIZE | 372 | #define vma_mmu_pagesize(v) PAGE_SIZE |
366 | #define huge_page_order(h) 0 | 373 | #define huge_page_order(h) 0 |
367 | #define huge_page_shift(h) PAGE_SHIFT | 374 | #define huge_page_shift(h) PAGE_SHIFT |
368 | static inline unsigned int pages_per_huge_page(struct hstate *h) | 375 | static inline unsigned int pages_per_huge_page(struct hstate *h) |
369 | { | 376 | { |
370 | return 1; | 377 | return 1; |
371 | } | 378 | } |
372 | #define hstate_index_to_shift(index) 0 | 379 | #define hstate_index_to_shift(index) 0 |
373 | #define hstate_index(h) 0 | 380 | #define hstate_index(h) 0 |
374 | #endif | 381 | #endif /* CONFIG_HUGETLB_PAGE */ |
375 | 382 | ||
376 | #endif /* _LINUX_HUGETLB_H */ | 383 | #endif /* _LINUX_HUGETLB_H */ |
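The new hstate_sizelog() helper treats a page_size_log of 0 as "use the default hstate" and otherwise requires an exact match through size_to_hstate(); get_hstate_idx() in fs/hugetlbfs/inode.c turns the result into an hstate index, or -1 when no hstate of that size is configured (the !CONFIG_HUGETLB_PAGE branch simply stubs the helper out as NULL). The standalone model below mirrors that lookup; it is a sketch only, and the size table and the names model_get_hstate_idx/hstate_sizes are hypothetical.

/*
 * Standalone model (not kernel code) of the hstate_sizelog()/get_hstate_idx()
 * lookup: log2 page size 0 selects the default hstate, any other value must
 * match a configured huge page size exactly. The size table is hypothetical.
 */
#include <stdio.h>

static const unsigned long hstate_sizes[] = { 1UL << 21, 1UL << 30 }; /* 2 MB, 1 GB */
#define NR_HSTATES  (sizeof(hstate_sizes) / sizeof(hstate_sizes[0]))
#define DEFAULT_IDX 0

static int model_get_hstate_idx(int page_size_log)
{
	unsigned long size;
	unsigned int i;

	if (!page_size_log)
		return DEFAULT_IDX;              /* hstate_sizelog(0) -> default_hstate */

	size = 1UL << page_size_log;
	for (i = 0; i < NR_HSTATES; i++)         /* size_to_hstate() analogue */
		if (hstate_sizes[i] == size)
			return i;
	return -1;                               /* unknown size: caller returns -ENODEV */
}

int main(void)
{
	printf("%d %d %d\n",
	       model_get_hstate_idx(0),          /* 0: default hstate */
	       model_get_hstate_idx(21),         /* 0: the 2 MB hstate */
	       model_get_hstate_idx(16));        /* -1: no 64 KB hstate in the table */
	return 0;
}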
ipc/shm.c
1 | /* | 1 | /* |
2 | * linux/ipc/shm.c | 2 | * linux/ipc/shm.c |
3 | * Copyright (C) 1992, 1993 Krishna Balasubramanian | 3 | * Copyright (C) 1992, 1993 Krishna Balasubramanian |
4 | * Many improvements/fixes by Bruno Haible. | 4 | * Many improvements/fixes by Bruno Haible. |
5 | * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. | 5 | * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. |
6 | * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. | 6 | * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. |
7 | * | 7 | * |
8 | * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> | 8 | * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> |
9 | * BIGMEM support, Andrea Arcangeli <andrea@suse.de> | 9 | * BIGMEM support, Andrea Arcangeli <andrea@suse.de> |
10 | * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> | 10 | * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> |
11 | * HIGHMEM support, Ingo Molnar <mingo@redhat.com> | 11 | * HIGHMEM support, Ingo Molnar <mingo@redhat.com> |
12 | * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> | 12 | * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> |
13 | * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> | 13 | * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> |
14 | * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> | 14 | * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> |
15 | * | 15 | * |
16 | * support for audit of ipc object properties and permission changes | 16 | * support for audit of ipc object properties and permission changes |
17 | * Dustin Kirkland <dustin.kirkland@us.ibm.com> | 17 | * Dustin Kirkland <dustin.kirkland@us.ibm.com> |
18 | * | 18 | * |
19 | * namespaces support | 19 | * namespaces support |
20 | * OpenVZ, SWsoft Inc. | 20 | * OpenVZ, SWsoft Inc. |
21 | * Pavel Emelianov <xemul@openvz.org> | 21 | * Pavel Emelianov <xemul@openvz.org> |
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
27 | #include <linux/shm.h> | 27 | #include <linux/shm.h> |
28 | #include <linux/init.h> | 28 | #include <linux/init.h> |
29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
30 | #include <linux/mman.h> | 30 | #include <linux/mman.h> |
31 | #include <linux/shmem_fs.h> | 31 | #include <linux/shmem_fs.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/syscalls.h> | 33 | #include <linux/syscalls.h> |
34 | #include <linux/audit.h> | 34 | #include <linux/audit.h> |
35 | #include <linux/capability.h> | 35 | #include <linux/capability.h> |
36 | #include <linux/ptrace.h> | 36 | #include <linux/ptrace.h> |
37 | #include <linux/seq_file.h> | 37 | #include <linux/seq_file.h> |
38 | #include <linux/rwsem.h> | 38 | #include <linux/rwsem.h> |
39 | #include <linux/nsproxy.h> | 39 | #include <linux/nsproxy.h> |
40 | #include <linux/mount.h> | 40 | #include <linux/mount.h> |
41 | #include <linux/ipc_namespace.h> | 41 | #include <linux/ipc_namespace.h> |
42 | 42 | ||
43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
44 | 44 | ||
45 | #include "util.h" | 45 | #include "util.h" |
46 | 46 | ||
47 | struct shm_file_data { | 47 | struct shm_file_data { |
48 | int id; | 48 | int id; |
49 | struct ipc_namespace *ns; | 49 | struct ipc_namespace *ns; |
50 | struct file *file; | 50 | struct file *file; |
51 | const struct vm_operations_struct *vm_ops; | 51 | const struct vm_operations_struct *vm_ops; |
52 | }; | 52 | }; |
53 | 53 | ||
54 | #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) | 54 | #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) |
55 | 55 | ||
56 | static const struct file_operations shm_file_operations; | 56 | static const struct file_operations shm_file_operations; |
57 | static const struct vm_operations_struct shm_vm_ops; | 57 | static const struct vm_operations_struct shm_vm_ops; |
58 | 58 | ||
59 | #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) | 59 | #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) |
60 | 60 | ||
61 | #define shm_unlock(shp) \ | 61 | #define shm_unlock(shp) \ |
62 | ipc_unlock(&(shp)->shm_perm) | 62 | ipc_unlock(&(shp)->shm_perm) |
63 | 63 | ||
64 | static int newseg(struct ipc_namespace *, struct ipc_params *); | 64 | static int newseg(struct ipc_namespace *, struct ipc_params *); |
65 | static void shm_open(struct vm_area_struct *vma); | 65 | static void shm_open(struct vm_area_struct *vma); |
66 | static void shm_close(struct vm_area_struct *vma); | 66 | static void shm_close(struct vm_area_struct *vma); |
67 | static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); | 67 | static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); |
68 | #ifdef CONFIG_PROC_FS | 68 | #ifdef CONFIG_PROC_FS |
69 | static int sysvipc_shm_proc_show(struct seq_file *s, void *it); | 69 | static int sysvipc_shm_proc_show(struct seq_file *s, void *it); |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | void shm_init_ns(struct ipc_namespace *ns) | 72 | void shm_init_ns(struct ipc_namespace *ns) |
73 | { | 73 | { |
74 | ns->shm_ctlmax = SHMMAX; | 74 | ns->shm_ctlmax = SHMMAX; |
75 | ns->shm_ctlall = SHMALL; | 75 | ns->shm_ctlall = SHMALL; |
76 | ns->shm_ctlmni = SHMMNI; | 76 | ns->shm_ctlmni = SHMMNI; |
77 | ns->shm_rmid_forced = 0; | 77 | ns->shm_rmid_forced = 0; |
78 | ns->shm_tot = 0; | 78 | ns->shm_tot = 0; |
79 | ipc_init_ids(&shm_ids(ns)); | 79 | ipc_init_ids(&shm_ids(ns)); |
80 | } | 80 | } |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * Called with shm_ids.rw_mutex (writer) and the shp structure locked. | 83 | * Called with shm_ids.rw_mutex (writer) and the shp structure locked. |
84 | * Only shm_ids.rw_mutex remains locked on exit. | 84 | * Only shm_ids.rw_mutex remains locked on exit. |
85 | */ | 85 | */ |
86 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | 86 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) |
87 | { | 87 | { |
88 | struct shmid_kernel *shp; | 88 | struct shmid_kernel *shp; |
89 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 89 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
90 | 90 | ||
91 | if (shp->shm_nattch){ | 91 | if (shp->shm_nattch){ |
92 | shp->shm_perm.mode |= SHM_DEST; | 92 | shp->shm_perm.mode |= SHM_DEST; |
93 | /* Do not find it any more */ | 93 | /* Do not find it any more */ |
94 | shp->shm_perm.key = IPC_PRIVATE; | 94 | shp->shm_perm.key = IPC_PRIVATE; |
95 | shm_unlock(shp); | 95 | shm_unlock(shp); |
96 | } else | 96 | } else |
97 | shm_destroy(ns, shp); | 97 | shm_destroy(ns, shp); |
98 | } | 98 | } |
99 | 99 | ||
100 | #ifdef CONFIG_IPC_NS | 100 | #ifdef CONFIG_IPC_NS |
101 | void shm_exit_ns(struct ipc_namespace *ns) | 101 | void shm_exit_ns(struct ipc_namespace *ns) |
102 | { | 102 | { |
103 | free_ipcs(ns, &shm_ids(ns), do_shm_rmid); | 103 | free_ipcs(ns, &shm_ids(ns), do_shm_rmid); |
104 | idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); | 104 | idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); |
105 | } | 105 | } |
106 | #endif | 106 | #endif |
107 | 107 | ||
108 | static int __init ipc_ns_init(void) | 108 | static int __init ipc_ns_init(void) |
109 | { | 109 | { |
110 | shm_init_ns(&init_ipc_ns); | 110 | shm_init_ns(&init_ipc_ns); |
111 | return 0; | 111 | return 0; |
112 | } | 112 | } |
113 | 113 | ||
114 | pure_initcall(ipc_ns_init); | 114 | pure_initcall(ipc_ns_init); |
115 | 115 | ||
116 | void __init shm_init (void) | 116 | void __init shm_init (void) |
117 | { | 117 | { |
118 | ipc_init_proc_interface("sysvipc/shm", | 118 | ipc_init_proc_interface("sysvipc/shm", |
119 | #if BITS_PER_LONG <= 32 | 119 | #if BITS_PER_LONG <= 32 |
120 | " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", | 120 | " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", |
121 | #else | 121 | #else |
122 | " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", | 122 | " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", |
123 | #endif | 123 | #endif |
124 | IPC_SHM_IDS, sysvipc_shm_proc_show); | 124 | IPC_SHM_IDS, sysvipc_shm_proc_show); |
125 | } | 125 | } |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * shm_lock_(check_) routines are called in the paths where the rw_mutex | 128 | * shm_lock_(check_) routines are called in the paths where the rw_mutex |
129 | * is not necessarily held. | 129 | * is not necessarily held. |
130 | */ | 130 | */ |
131 | static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) | 131 | static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) |
132 | { | 132 | { |
133 | struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); | 133 | struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); |
134 | 134 | ||
135 | if (IS_ERR(ipcp)) | 135 | if (IS_ERR(ipcp)) |
136 | return (struct shmid_kernel *)ipcp; | 136 | return (struct shmid_kernel *)ipcp; |
137 | 137 | ||
138 | return container_of(ipcp, struct shmid_kernel, shm_perm); | 138 | return container_of(ipcp, struct shmid_kernel, shm_perm); |
139 | } | 139 | } |
140 | 140 | ||
141 | static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) | 141 | static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) |
142 | { | 142 | { |
143 | rcu_read_lock(); | 143 | rcu_read_lock(); |
144 | spin_lock(&ipcp->shm_perm.lock); | 144 | spin_lock(&ipcp->shm_perm.lock); |
145 | } | 145 | } |
146 | 146 | ||
147 | static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, | 147 | static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, |
148 | int id) | 148 | int id) |
149 | { | 149 | { |
150 | struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); | 150 | struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); |
151 | 151 | ||
152 | if (IS_ERR(ipcp)) | 152 | if (IS_ERR(ipcp)) |
153 | return (struct shmid_kernel *)ipcp; | 153 | return (struct shmid_kernel *)ipcp; |
154 | 154 | ||
155 | return container_of(ipcp, struct shmid_kernel, shm_perm); | 155 | return container_of(ipcp, struct shmid_kernel, shm_perm); |
156 | } | 156 | } |
157 | 157 | ||
158 | static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) | 158 | static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) |
159 | { | 159 | { |
160 | ipc_rmid(&shm_ids(ns), &s->shm_perm); | 160 | ipc_rmid(&shm_ids(ns), &s->shm_perm); |
161 | } | 161 | } |
162 | 162 | ||
163 | 163 | ||
164 | /* This is called by fork, once for every shm attach. */ | 164 | /* This is called by fork, once for every shm attach. */ |
165 | static void shm_open(struct vm_area_struct *vma) | 165 | static void shm_open(struct vm_area_struct *vma) |
166 | { | 166 | { |
167 | struct file *file = vma->vm_file; | 167 | struct file *file = vma->vm_file; |
168 | struct shm_file_data *sfd = shm_file_data(file); | 168 | struct shm_file_data *sfd = shm_file_data(file); |
169 | struct shmid_kernel *shp; | 169 | struct shmid_kernel *shp; |
170 | 170 | ||
171 | shp = shm_lock(sfd->ns, sfd->id); | 171 | shp = shm_lock(sfd->ns, sfd->id); |
172 | BUG_ON(IS_ERR(shp)); | 172 | BUG_ON(IS_ERR(shp)); |
173 | shp->shm_atim = get_seconds(); | 173 | shp->shm_atim = get_seconds(); |
174 | shp->shm_lprid = task_tgid_vnr(current); | 174 | shp->shm_lprid = task_tgid_vnr(current); |
175 | shp->shm_nattch++; | 175 | shp->shm_nattch++; |
176 | shm_unlock(shp); | 176 | shm_unlock(shp); |
177 | } | 177 | } |
178 | 178 | ||
179 | /* | 179 | /* |
180 | * shm_destroy - free the struct shmid_kernel | 180 | * shm_destroy - free the struct shmid_kernel |
181 | * | 181 | * |
182 | * @ns: namespace | 182 | * @ns: namespace |
183 | * @shp: struct to free | 183 | * @shp: struct to free |
184 | * | 184 | * |
185 | * It has to be called with shp and shm_ids.rw_mutex (writer) locked, | 185 | * It has to be called with shp and shm_ids.rw_mutex (writer) locked, |
186 | * but returns with shp unlocked and freed. | 186 | * but returns with shp unlocked and freed. |
187 | */ | 187 | */ |
188 | static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) | 188 | static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) |
189 | { | 189 | { |
190 | ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; | 190 | ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; |
191 | shm_rmid(ns, shp); | 191 | shm_rmid(ns, shp); |
192 | shm_unlock(shp); | 192 | shm_unlock(shp); |
193 | if (!is_file_hugepages(shp->shm_file)) | 193 | if (!is_file_hugepages(shp->shm_file)) |
194 | shmem_lock(shp->shm_file, 0, shp->mlock_user); | 194 | shmem_lock(shp->shm_file, 0, shp->mlock_user); |
195 | else if (shp->mlock_user) | 195 | else if (shp->mlock_user) |
196 | user_shm_unlock(file_inode(shp->shm_file)->i_size, | 196 | user_shm_unlock(file_inode(shp->shm_file)->i_size, |
197 | shp->mlock_user); | 197 | shp->mlock_user); |
198 | fput (shp->shm_file); | 198 | fput (shp->shm_file); |
199 | security_shm_free(shp); | 199 | security_shm_free(shp); |
200 | ipc_rcu_putref(shp); | 200 | ipc_rcu_putref(shp); |
201 | } | 201 | } |
202 | 202 | ||
203 | /* | 203 | /* |
204 | * shm_may_destroy - identifies whether shm segment should be destroyed now | 204 | * shm_may_destroy - identifies whether shm segment should be destroyed now |
205 | * | 205 | * |
206 | * Returns true if and only if there are no active users of the segment and | 206 | * Returns true if and only if there are no active users of the segment and |
207 | * one of the following is true: | 207 | * one of the following is true: |
208 | * | 208 | * |
209 | * 1) shmctl(id, IPC_RMID, NULL) was called for this shp | 209 | * 1) shmctl(id, IPC_RMID, NULL) was called for this shp |
210 | * | 210 | * |
211 | * 2) sysctl kernel.shm_rmid_forced is set to 1. | 211 | * 2) sysctl kernel.shm_rmid_forced is set to 1. |
212 | */ | 212 | */ |
213 | static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) | 213 | static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) |
214 | { | 214 | { |
215 | return (shp->shm_nattch == 0) && | 215 | return (shp->shm_nattch == 0) && |
216 | (ns->shm_rmid_forced || | 216 | (ns->shm_rmid_forced || |
217 | (shp->shm_perm.mode & SHM_DEST)); | 217 | (shp->shm_perm.mode & SHM_DEST)); |
218 | } | 218 | } |
219 | 219 | ||
220 | /* | 220 | /* |
221 | * remove the attach descriptor vma. | 221 | * remove the attach descriptor vma. |
222 | * free memory for segment if it is marked destroyed. | 222 | * free memory for segment if it is marked destroyed. |
223 | * The descriptor has already been removed from the current->mm->mmap list | 223 | * The descriptor has already been removed from the current->mm->mmap list |
224 | * and will later be kfree()d. | 224 | * and will later be kfree()d. |
225 | */ | 225 | */ |
226 | static void shm_close(struct vm_area_struct *vma) | 226 | static void shm_close(struct vm_area_struct *vma) |
227 | { | 227 | { |
228 | struct file * file = vma->vm_file; | 228 | struct file * file = vma->vm_file; |
229 | struct shm_file_data *sfd = shm_file_data(file); | 229 | struct shm_file_data *sfd = shm_file_data(file); |
230 | struct shmid_kernel *shp; | 230 | struct shmid_kernel *shp; |
231 | struct ipc_namespace *ns = sfd->ns; | 231 | struct ipc_namespace *ns = sfd->ns; |
232 | 232 | ||
233 | down_write(&shm_ids(ns).rw_mutex); | 233 | down_write(&shm_ids(ns).rw_mutex); |
234 | /* remove from the list of attaches of the shm segment */ | 234 | /* remove from the list of attaches of the shm segment */ |
235 | shp = shm_lock(ns, sfd->id); | 235 | shp = shm_lock(ns, sfd->id); |
236 | BUG_ON(IS_ERR(shp)); | 236 | BUG_ON(IS_ERR(shp)); |
237 | shp->shm_lprid = task_tgid_vnr(current); | 237 | shp->shm_lprid = task_tgid_vnr(current); |
238 | shp->shm_dtim = get_seconds(); | 238 | shp->shm_dtim = get_seconds(); |
239 | shp->shm_nattch--; | 239 | shp->shm_nattch--; |
240 | if (shm_may_destroy(ns, shp)) | 240 | if (shm_may_destroy(ns, shp)) |
241 | shm_destroy(ns, shp); | 241 | shm_destroy(ns, shp); |
242 | else | 242 | else |
243 | shm_unlock(shp); | 243 | shm_unlock(shp); |
244 | up_write(&shm_ids(ns).rw_mutex); | 244 | up_write(&shm_ids(ns).rw_mutex); |
245 | } | 245 | } |
246 | 246 | ||
247 | /* Called with ns->shm_ids(ns).rw_mutex locked */ | 247 | /* Called with ns->shm_ids(ns).rw_mutex locked */ |
248 | static int shm_try_destroy_current(int id, void *p, void *data) | 248 | static int shm_try_destroy_current(int id, void *p, void *data) |
249 | { | 249 | { |
250 | struct ipc_namespace *ns = data; | 250 | struct ipc_namespace *ns = data; |
251 | struct kern_ipc_perm *ipcp = p; | 251 | struct kern_ipc_perm *ipcp = p; |
252 | struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 252 | struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
253 | 253 | ||
254 | if (shp->shm_creator != current) | 254 | if (shp->shm_creator != current) |
255 | return 0; | 255 | return 0; |
256 | 256 | ||
257 | /* | 257 | /* |
258 | * Mark it as orphaned to destroy the segment when | 258 | * Mark it as orphaned to destroy the segment when |
259 | * kernel.shm_rmid_forced is changed. | 259 | * kernel.shm_rmid_forced is changed. |
260 | * It is noop if the following shm_may_destroy() returns true. | 260 | * It is noop if the following shm_may_destroy() returns true. |
261 | */ | 261 | */ |
262 | shp->shm_creator = NULL; | 262 | shp->shm_creator = NULL; |
263 | 263 | ||
264 | /* | 264 | /* |
265 | * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID | 265 | * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID |
266 | * is not set, it shouldn't be deleted here. | 266 | * is not set, it shouldn't be deleted here. |
267 | */ | 267 | */ |
268 | if (!ns->shm_rmid_forced) | 268 | if (!ns->shm_rmid_forced) |
269 | return 0; | 269 | return 0; |
270 | 270 | ||
271 | if (shm_may_destroy(ns, shp)) { | 271 | if (shm_may_destroy(ns, shp)) { |
272 | shm_lock_by_ptr(shp); | 272 | shm_lock_by_ptr(shp); |
273 | shm_destroy(ns, shp); | 273 | shm_destroy(ns, shp); |
274 | } | 274 | } |
275 | return 0; | 275 | return 0; |
276 | } | 276 | } |
277 | 277 | ||
278 | /* Called with ns->shm_ids(ns).rw_mutex locked */ | 278 | /* Called with ns->shm_ids(ns).rw_mutex locked */ |
279 | static int shm_try_destroy_orphaned(int id, void *p, void *data) | 279 | static int shm_try_destroy_orphaned(int id, void *p, void *data) |
280 | { | 280 | { |
281 | struct ipc_namespace *ns = data; | 281 | struct ipc_namespace *ns = data; |
282 | struct kern_ipc_perm *ipcp = p; | 282 | struct kern_ipc_perm *ipcp = p; |
283 | struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 283 | struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
284 | 284 | ||
285 | /* | 285 | /* |
286 | * We want to destroy segments without users and with already | 286 | * We want to destroy segments without users and with already |
287 | * exit'ed originating process. | 287 | * exit'ed originating process. |
288 | * | 288 | * |
289 | * As shp->* are changed under rw_mutex, it's safe to skip shp locking. | 289 | * As shp->* are changed under rw_mutex, it's safe to skip shp locking. |
290 | */ | 290 | */ |
291 | if (shp->shm_creator != NULL) | 291 | if (shp->shm_creator != NULL) |
292 | return 0; | 292 | return 0; |
293 | 293 | ||
294 | if (shm_may_destroy(ns, shp)) { | 294 | if (shm_may_destroy(ns, shp)) { |
295 | shm_lock_by_ptr(shp); | 295 | shm_lock_by_ptr(shp); |
296 | shm_destroy(ns, shp); | 296 | shm_destroy(ns, shp); |
297 | } | 297 | } |
298 | return 0; | 298 | return 0; |
299 | } | 299 | } |
300 | 300 | ||
301 | void shm_destroy_orphaned(struct ipc_namespace *ns) | 301 | void shm_destroy_orphaned(struct ipc_namespace *ns) |
302 | { | 302 | { |
303 | down_write(&shm_ids(ns).rw_mutex); | 303 | down_write(&shm_ids(ns).rw_mutex); |
304 | if (shm_ids(ns).in_use) | 304 | if (shm_ids(ns).in_use) |
305 | idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); | 305 | idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); |
306 | up_write(&shm_ids(ns).rw_mutex); | 306 | up_write(&shm_ids(ns).rw_mutex); |
307 | } | 307 | } |
308 | 308 | ||
309 | 309 | ||
310 | void exit_shm(struct task_struct *task) | 310 | void exit_shm(struct task_struct *task) |
311 | { | 311 | { |
312 | struct ipc_namespace *ns = task->nsproxy->ipc_ns; | 312 | struct ipc_namespace *ns = task->nsproxy->ipc_ns; |
313 | 313 | ||
314 | if (shm_ids(ns).in_use == 0) | 314 | if (shm_ids(ns).in_use == 0) |
315 | return; | 315 | return; |
316 | 316 | ||
317 | /* Destroy all segments that were already created but not yet mapped */ | 317 | /* Destroy all segments that were already created but not yet mapped */ |
318 | down_write(&shm_ids(ns).rw_mutex); | 318 | down_write(&shm_ids(ns).rw_mutex); |
319 | if (shm_ids(ns).in_use) | 319 | if (shm_ids(ns).in_use) |
320 | idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); | 320 | idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); |
321 | up_write(&shm_ids(ns).rw_mutex); | 321 | up_write(&shm_ids(ns).rw_mutex); |
322 | } | 322 | } |
323 | 323 | ||
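The orphan-destruction path above is driven by the kernel.shm_rmid_forced sysctl: shm_try_destroy_current() clears shm_creator for the exiting task's segments, and a write to the sysctl is what ends up calling shm_destroy_orphaned() for creator-less segments. A minimal userspace sketch of flipping that knob (assuming root and the usual /proc/sys mount; the sysctl is per IPC namespace):

#include <stdio.h>

/* Enable forced removal of orphaned SysV shm segments. Writing the
 * sysctl makes the kernel walk the namespace's segments via the
 * shm_destroy_orphaned() path shown in the listing above. Needs root. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/shm_rmid_forced", "w");

	if (!f) {
		perror("shm_rmid_forced");
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}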
324 | static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 324 | static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
325 | { | 325 | { |
326 | struct file *file = vma->vm_file; | 326 | struct file *file = vma->vm_file; |
327 | struct shm_file_data *sfd = shm_file_data(file); | 327 | struct shm_file_data *sfd = shm_file_data(file); |
328 | 328 | ||
329 | return sfd->vm_ops->fault(vma, vmf); | 329 | return sfd->vm_ops->fault(vma, vmf); |
330 | } | 330 | } |
331 | 331 | ||
332 | #ifdef CONFIG_NUMA | 332 | #ifdef CONFIG_NUMA |
333 | static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 333 | static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
334 | { | 334 | { |
335 | struct file *file = vma->vm_file; | 335 | struct file *file = vma->vm_file; |
336 | struct shm_file_data *sfd = shm_file_data(file); | 336 | struct shm_file_data *sfd = shm_file_data(file); |
337 | int err = 0; | 337 | int err = 0; |
338 | if (sfd->vm_ops->set_policy) | 338 | if (sfd->vm_ops->set_policy) |
339 | err = sfd->vm_ops->set_policy(vma, new); | 339 | err = sfd->vm_ops->set_policy(vma, new); |
340 | return err; | 340 | return err; |
341 | } | 341 | } |
342 | 342 | ||
343 | static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, | 343 | static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, |
344 | unsigned long addr) | 344 | unsigned long addr) |
345 | { | 345 | { |
346 | struct file *file = vma->vm_file; | 346 | struct file *file = vma->vm_file; |
347 | struct shm_file_data *sfd = shm_file_data(file); | 347 | struct shm_file_data *sfd = shm_file_data(file); |
348 | struct mempolicy *pol = NULL; | 348 | struct mempolicy *pol = NULL; |
349 | 349 | ||
350 | if (sfd->vm_ops->get_policy) | 350 | if (sfd->vm_ops->get_policy) |
351 | pol = sfd->vm_ops->get_policy(vma, addr); | 351 | pol = sfd->vm_ops->get_policy(vma, addr); |
352 | else if (vma->vm_policy) | 352 | else if (vma->vm_policy) |
353 | pol = vma->vm_policy; | 353 | pol = vma->vm_policy; |
354 | 354 | ||
355 | return pol; | 355 | return pol; |
356 | } | 356 | } |
357 | #endif | 357 | #endif |
358 | 358 | ||
359 | static int shm_mmap(struct file * file, struct vm_area_struct * vma) | 359 | static int shm_mmap(struct file * file, struct vm_area_struct * vma) |
360 | { | 360 | { |
361 | struct shm_file_data *sfd = shm_file_data(file); | 361 | struct shm_file_data *sfd = shm_file_data(file); |
362 | int ret; | 362 | int ret; |
363 | 363 | ||
364 | ret = sfd->file->f_op->mmap(sfd->file, vma); | 364 | ret = sfd->file->f_op->mmap(sfd->file, vma); |
365 | if (ret != 0) | 365 | if (ret != 0) |
366 | return ret; | 366 | return ret; |
367 | sfd->vm_ops = vma->vm_ops; | 367 | sfd->vm_ops = vma->vm_ops; |
368 | #ifdef CONFIG_MMU | 368 | #ifdef CONFIG_MMU |
369 | BUG_ON(!sfd->vm_ops->fault); | 369 | BUG_ON(!sfd->vm_ops->fault); |
370 | #endif | 370 | #endif |
371 | vma->vm_ops = &shm_vm_ops; | 371 | vma->vm_ops = &shm_vm_ops; |
372 | shm_open(vma); | 372 | shm_open(vma); |
373 | 373 | ||
374 | return ret; | 374 | return ret; |
375 | } | 375 | } |
376 | 376 | ||
377 | static int shm_release(struct inode *ino, struct file *file) | 377 | static int shm_release(struct inode *ino, struct file *file) |
378 | { | 378 | { |
379 | struct shm_file_data *sfd = shm_file_data(file); | 379 | struct shm_file_data *sfd = shm_file_data(file); |
380 | 380 | ||
381 | put_ipc_ns(sfd->ns); | 381 | put_ipc_ns(sfd->ns); |
382 | shm_file_data(file) = NULL; | 382 | shm_file_data(file) = NULL; |
383 | kfree(sfd); | 383 | kfree(sfd); |
384 | return 0; | 384 | return 0; |
385 | } | 385 | } |
386 | 386 | ||
387 | static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) | 387 | static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) |
388 | { | 388 | { |
389 | struct shm_file_data *sfd = shm_file_data(file); | 389 | struct shm_file_data *sfd = shm_file_data(file); |
390 | 390 | ||
391 | if (!sfd->file->f_op->fsync) | 391 | if (!sfd->file->f_op->fsync) |
392 | return -EINVAL; | 392 | return -EINVAL; |
393 | return sfd->file->f_op->fsync(sfd->file, start, end, datasync); | 393 | return sfd->file->f_op->fsync(sfd->file, start, end, datasync); |
394 | } | 394 | } |
395 | 395 | ||
396 | static long shm_fallocate(struct file *file, int mode, loff_t offset, | 396 | static long shm_fallocate(struct file *file, int mode, loff_t offset, |
397 | loff_t len) | 397 | loff_t len) |
398 | { | 398 | { |
399 | struct shm_file_data *sfd = shm_file_data(file); | 399 | struct shm_file_data *sfd = shm_file_data(file); |
400 | 400 | ||
401 | if (!sfd->file->f_op->fallocate) | 401 | if (!sfd->file->f_op->fallocate) |
402 | return -EOPNOTSUPP; | 402 | return -EOPNOTSUPP; |
403 | return sfd->file->f_op->fallocate(file, mode, offset, len); | 403 | return sfd->file->f_op->fallocate(file, mode, offset, len); |
404 | } | 404 | } |
405 | 405 | ||
406 | static unsigned long shm_get_unmapped_area(struct file *file, | 406 | static unsigned long shm_get_unmapped_area(struct file *file, |
407 | unsigned long addr, unsigned long len, unsigned long pgoff, | 407 | unsigned long addr, unsigned long len, unsigned long pgoff, |
408 | unsigned long flags) | 408 | unsigned long flags) |
409 | { | 409 | { |
410 | struct shm_file_data *sfd = shm_file_data(file); | 410 | struct shm_file_data *sfd = shm_file_data(file); |
411 | return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, | 411 | return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, |
412 | pgoff, flags); | 412 | pgoff, flags); |
413 | } | 413 | } |
414 | 414 | ||
415 | static const struct file_operations shm_file_operations = { | 415 | static const struct file_operations shm_file_operations = { |
416 | .mmap = shm_mmap, | 416 | .mmap = shm_mmap, |
417 | .fsync = shm_fsync, | 417 | .fsync = shm_fsync, |
418 | .release = shm_release, | 418 | .release = shm_release, |
419 | #ifndef CONFIG_MMU | 419 | #ifndef CONFIG_MMU |
420 | .get_unmapped_area = shm_get_unmapped_area, | 420 | .get_unmapped_area = shm_get_unmapped_area, |
421 | #endif | 421 | #endif |
422 | .llseek = noop_llseek, | 422 | .llseek = noop_llseek, |
423 | .fallocate = shm_fallocate, | 423 | .fallocate = shm_fallocate, |
424 | }; | 424 | }; |
425 | 425 | ||
426 | static const struct file_operations shm_file_operations_huge = { | 426 | static const struct file_operations shm_file_operations_huge = { |
427 | .mmap = shm_mmap, | 427 | .mmap = shm_mmap, |
428 | .fsync = shm_fsync, | 428 | .fsync = shm_fsync, |
429 | .release = shm_release, | 429 | .release = shm_release, |
430 | .get_unmapped_area = shm_get_unmapped_area, | 430 | .get_unmapped_area = shm_get_unmapped_area, |
431 | .llseek = noop_llseek, | 431 | .llseek = noop_llseek, |
432 | .fallocate = shm_fallocate, | 432 | .fallocate = shm_fallocate, |
433 | }; | 433 | }; |
434 | 434 | ||
435 | int is_file_shm_hugepages(struct file *file) | 435 | int is_file_shm_hugepages(struct file *file) |
436 | { | 436 | { |
437 | return file->f_op == &shm_file_operations_huge; | 437 | return file->f_op == &shm_file_operations_huge; |
438 | } | 438 | } |
439 | 439 | ||
440 | static const struct vm_operations_struct shm_vm_ops = { | 440 | static const struct vm_operations_struct shm_vm_ops = { |
441 | .open = shm_open, /* callback for a new vm-area open */ | 441 | .open = shm_open, /* callback for a new vm-area open */ |
442 | .close = shm_close, /* callback for when the vm-area is released */ | 442 | .close = shm_close, /* callback for when the vm-area is released */ |
443 | .fault = shm_fault, | 443 | .fault = shm_fault, |
444 | #if defined(CONFIG_NUMA) | 444 | #if defined(CONFIG_NUMA) |
445 | .set_policy = shm_set_policy, | 445 | .set_policy = shm_set_policy, |
446 | .get_policy = shm_get_policy, | 446 | .get_policy = shm_get_policy, |
447 | #endif | 447 | #endif |
448 | }; | 448 | }; |
449 | 449 | ||
450 | /** | 450 | /** |
451 | * newseg - Create a new shared memory segment | 451 | * newseg - Create a new shared memory segment |
452 | * @ns: namespace | 452 | * @ns: namespace |
453 | * @params: ptr to the structure that contains key, size and shmflg | 453 | * @params: ptr to the structure that contains key, size and shmflg |
454 | * | 454 | * |
455 | * Called with shm_ids.rw_mutex held as a writer. | 455 | * Called with shm_ids.rw_mutex held as a writer. |
456 | */ | 456 | */ |
457 | 457 | ||
458 | static int newseg(struct ipc_namespace *ns, struct ipc_params *params) | 458 | static int newseg(struct ipc_namespace *ns, struct ipc_params *params) |
459 | { | 459 | { |
460 | key_t key = params->key; | 460 | key_t key = params->key; |
461 | int shmflg = params->flg; | 461 | int shmflg = params->flg; |
462 | size_t size = params->u.size; | 462 | size_t size = params->u.size; |
463 | int error; | 463 | int error; |
464 | struct shmid_kernel *shp; | 464 | struct shmid_kernel *shp; |
465 | size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 465 | size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
466 | struct file * file; | 466 | struct file * file; |
467 | char name[13]; | 467 | char name[13]; |
468 | int id; | 468 | int id; |
469 | vm_flags_t acctflag = 0; | 469 | vm_flags_t acctflag = 0; |
470 | 470 | ||
471 | if (size < SHMMIN || size > ns->shm_ctlmax) | 471 | if (size < SHMMIN || size > ns->shm_ctlmax) |
472 | return -EINVAL; | 472 | return -EINVAL; |
473 | 473 | ||
474 | if (ns->shm_tot + numpages > ns->shm_ctlall) | 474 | if (ns->shm_tot + numpages > ns->shm_ctlall) |
475 | return -ENOSPC; | 475 | return -ENOSPC; |
476 | 476 | ||
477 | shp = ipc_rcu_alloc(sizeof(*shp)); | 477 | shp = ipc_rcu_alloc(sizeof(*shp)); |
478 | if (!shp) | 478 | if (!shp) |
479 | return -ENOMEM; | 479 | return -ENOMEM; |
480 | 480 | ||
481 | shp->shm_perm.key = key; | 481 | shp->shm_perm.key = key; |
482 | shp->shm_perm.mode = (shmflg & S_IRWXUGO); | 482 | shp->shm_perm.mode = (shmflg & S_IRWXUGO); |
483 | shp->mlock_user = NULL; | 483 | shp->mlock_user = NULL; |
484 | 484 | ||
485 | shp->shm_perm.security = NULL; | 485 | shp->shm_perm.security = NULL; |
486 | error = security_shm_alloc(shp); | 486 | error = security_shm_alloc(shp); |
487 | if (error) { | 487 | if (error) { |
488 | ipc_rcu_putref(shp); | 488 | ipc_rcu_putref(shp); |
489 | return error; | 489 | return error; |
490 | } | 490 | } |
491 | 491 | ||
492 | sprintf (name, "SYSV%08x", key); | 492 | sprintf (name, "SYSV%08x", key); |
493 | if (shmflg & SHM_HUGETLB) { | 493 | if (shmflg & SHM_HUGETLB) { |
494 | struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) | ||
495 | & SHM_HUGE_MASK); | ||
496 | size_t hugesize = ALIGN(size, huge_page_size(hs)); | ||
497 | |||
494 | /* hugetlb_file_setup applies strict accounting */ | 498 | /* hugetlb_file_setup applies strict accounting */ |
495 | if (shmflg & SHM_NORESERVE) | 499 | if (shmflg & SHM_NORESERVE) |
496 | acctflag = VM_NORESERVE; | 500 | acctflag = VM_NORESERVE; |
497 | file = hugetlb_file_setup(name, 0, size, acctflag, | 501 | file = hugetlb_file_setup(name, hugesize, acctflag, |
498 | &shp->mlock_user, HUGETLB_SHMFS_INODE, | 502 | &shp->mlock_user, HUGETLB_SHMFS_INODE, |
499 | (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); | 503 | (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); |
500 | } else { | 504 | } else { |
501 | /* | 505 | /* |
502 | * Do not allow no accounting for OVERCOMMIT_NEVER, even | 506 | * Do not allow no accounting for OVERCOMMIT_NEVER, even |
503 | * if it's asked for. | 507 | * if it's asked for. |
504 | */ | 508 | */ |
505 | if ((shmflg & SHM_NORESERVE) && | 509 | if ((shmflg & SHM_NORESERVE) && |
506 | sysctl_overcommit_memory != OVERCOMMIT_NEVER) | 510 | sysctl_overcommit_memory != OVERCOMMIT_NEVER) |
507 | acctflag = VM_NORESERVE; | 511 | acctflag = VM_NORESERVE; |
508 | file = shmem_file_setup(name, size, acctflag); | 512 | file = shmem_file_setup(name, size, acctflag); |
509 | } | 513 | } |
510 | error = PTR_ERR(file); | 514 | error = PTR_ERR(file); |
511 | if (IS_ERR(file)) | 515 | if (IS_ERR(file)) |
512 | goto no_file; | 516 | goto no_file; |
513 | 517 | ||
514 | id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); | 518 | id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); |
515 | if (id < 0) { | 519 | if (id < 0) { |
516 | error = id; | 520 | error = id; |
517 | goto no_id; | 521 | goto no_id; |
518 | } | 522 | } |
519 | 523 | ||
520 | shp->shm_cprid = task_tgid_vnr(current); | 524 | shp->shm_cprid = task_tgid_vnr(current); |
521 | shp->shm_lprid = 0; | 525 | shp->shm_lprid = 0; |
522 | shp->shm_atim = shp->shm_dtim = 0; | 526 | shp->shm_atim = shp->shm_dtim = 0; |
523 | shp->shm_ctim = get_seconds(); | 527 | shp->shm_ctim = get_seconds(); |
524 | shp->shm_segsz = size; | 528 | shp->shm_segsz = size; |
525 | shp->shm_nattch = 0; | 529 | shp->shm_nattch = 0; |
526 | shp->shm_file = file; | 530 | shp->shm_file = file; |
527 | shp->shm_creator = current; | 531 | shp->shm_creator = current; |
528 | /* | 532 | /* |
529 | * shmid gets reported as "inode#" in /proc/pid/maps. | 533 | * shmid gets reported as "inode#" in /proc/pid/maps. |
530 | * proc-ps tools use this. Changing this will break them. | 534 | * proc-ps tools use this. Changing this will break them. |
531 | */ | 535 | */ |
532 | file_inode(file)->i_ino = shp->shm_perm.id; | 536 | file_inode(file)->i_ino = shp->shm_perm.id; |
533 | 537 | ||
534 | ns->shm_tot += numpages; | 538 | ns->shm_tot += numpages; |
535 | error = shp->shm_perm.id; | 539 | error = shp->shm_perm.id; |
536 | shm_unlock(shp); | 540 | shm_unlock(shp); |
537 | return error; | 541 | return error; |
538 | 542 | ||
539 | no_id: | 543 | no_id: |
540 | if (is_file_hugepages(file) && shp->mlock_user) | 544 | if (is_file_hugepages(file) && shp->mlock_user) |
541 | user_shm_unlock(size, shp->mlock_user); | 545 | user_shm_unlock(size, shp->mlock_user); |
542 | fput(file); | 546 | fput(file); |
543 | no_file: | 547 | no_file: |
544 | security_shm_free(shp); | 548 | security_shm_free(shp); |
545 | ipc_rcu_putref(shp); | 549 | ipc_rcu_putref(shp); |
546 | return error; | 550 | return error; |
547 | } | 551 | } |
548 | 552 | ||
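In the SHM_HUGETLB branch above, newseg() now looks up the hstate from the size bits of shmflg and rounds the requested size up with ALIGN(size, huge_page_size(hs)) before calling hugetlb_file_setup(), so the backing file is sized in whole hugepages even when the caller's length is not aligned. A minimal userspace sketch exercising that path; the 5 MB length and the fallback SHM_HUGETLB definition are illustrative assumptions, and a hugepage pool (e.g. vm.nr_hugepages) must already be configured:

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000	/* <linux/shm.h> value; older libcs may not expose it */
#endif

int main(void)
{
	/* 5 MB is deliberately not a multiple of the common 2 MB hugepage
	 * size; newseg() rounds the backing file up to the boundary. */
	size_t len = 5 * 1024 * 1024;
	int id = shmget(IPC_PRIVATE, len, IPC_CREAT | SHM_HUGETLB | 0600);

	if (id < 0) {
		perror("shmget(SHM_HUGETLB)");
		return 1;
	}

	void *p = shmat(id, NULL, 0);
	if (p != (void *)-1) {
		memset(p, 0, len);	/* touch only the bytes we asked for */
		shmdt(p);
	} else {
		perror("shmat");
	}

	shmctl(id, IPC_RMID, NULL);	/* destroy on last detach */
	return 0;
}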
549 | /* | 553 | /* |
550 | * Called with shm_ids.rw_mutex and ipcp locked. | 554 | * Called with shm_ids.rw_mutex and ipcp locked. |
551 | */ | 555 | */ |
552 | static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) | 556 | static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) |
553 | { | 557 | { |
554 | struct shmid_kernel *shp; | 558 | struct shmid_kernel *shp; |
555 | 559 | ||
556 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 560 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
557 | return security_shm_associate(shp, shmflg); | 561 | return security_shm_associate(shp, shmflg); |
558 | } | 562 | } |
559 | 563 | ||
560 | /* | 564 | /* |
561 | * Called with shm_ids.rw_mutex and ipcp locked. | 565 | * Called with shm_ids.rw_mutex and ipcp locked. |
562 | */ | 566 | */ |
563 | static inline int shm_more_checks(struct kern_ipc_perm *ipcp, | 567 | static inline int shm_more_checks(struct kern_ipc_perm *ipcp, |
564 | struct ipc_params *params) | 568 | struct ipc_params *params) |
565 | { | 569 | { |
566 | struct shmid_kernel *shp; | 570 | struct shmid_kernel *shp; |
567 | 571 | ||
568 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 572 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
569 | if (shp->shm_segsz < params->u.size) | 573 | if (shp->shm_segsz < params->u.size) |
570 | return -EINVAL; | 574 | return -EINVAL; |
571 | 575 | ||
572 | return 0; | 576 | return 0; |
573 | } | 577 | } |
574 | 578 | ||
575 | SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) | 579 | SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) |
576 | { | 580 | { |
577 | struct ipc_namespace *ns; | 581 | struct ipc_namespace *ns; |
578 | struct ipc_ops shm_ops; | 582 | struct ipc_ops shm_ops; |
579 | struct ipc_params shm_params; | 583 | struct ipc_params shm_params; |
580 | 584 | ||
581 | ns = current->nsproxy->ipc_ns; | 585 | ns = current->nsproxy->ipc_ns; |
582 | 586 | ||
583 | shm_ops.getnew = newseg; | 587 | shm_ops.getnew = newseg; |
584 | shm_ops.associate = shm_security; | 588 | shm_ops.associate = shm_security; |
585 | shm_ops.more_checks = shm_more_checks; | 589 | shm_ops.more_checks = shm_more_checks; |
586 | 590 | ||
587 | shm_params.key = key; | 591 | shm_params.key = key; |
588 | shm_params.flg = shmflg; | 592 | shm_params.flg = shmflg; |
589 | shm_params.u.size = size; | 593 | shm_params.u.size = size; |
590 | 594 | ||
591 | return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); | 595 | return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); |
592 | } | 596 | } |
593 | 597 | ||
594 | static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) | 598 | static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) |
595 | { | 599 | { |
596 | switch(version) { | 600 | switch(version) { |
597 | case IPC_64: | 601 | case IPC_64: |
598 | return copy_to_user(buf, in, sizeof(*in)); | 602 | return copy_to_user(buf, in, sizeof(*in)); |
599 | case IPC_OLD: | 603 | case IPC_OLD: |
600 | { | 604 | { |
601 | struct shmid_ds out; | 605 | struct shmid_ds out; |
602 | 606 | ||
603 | memset(&out, 0, sizeof(out)); | 607 | memset(&out, 0, sizeof(out)); |
604 | ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); | 608 | ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); |
605 | out.shm_segsz = in->shm_segsz; | 609 | out.shm_segsz = in->shm_segsz; |
606 | out.shm_atime = in->shm_atime; | 610 | out.shm_atime = in->shm_atime; |
607 | out.shm_dtime = in->shm_dtime; | 611 | out.shm_dtime = in->shm_dtime; |
608 | out.shm_ctime = in->shm_ctime; | 612 | out.shm_ctime = in->shm_ctime; |
609 | out.shm_cpid = in->shm_cpid; | 613 | out.shm_cpid = in->shm_cpid; |
610 | out.shm_lpid = in->shm_lpid; | 614 | out.shm_lpid = in->shm_lpid; |
611 | out.shm_nattch = in->shm_nattch; | 615 | out.shm_nattch = in->shm_nattch; |
612 | 616 | ||
613 | return copy_to_user(buf, &out, sizeof(out)); | 617 | return copy_to_user(buf, &out, sizeof(out)); |
614 | } | 618 | } |
615 | default: | 619 | default: |
616 | return -EINVAL; | 620 | return -EINVAL; |
617 | } | 621 | } |
618 | } | 622 | } |
619 | 623 | ||
620 | static inline unsigned long | 624 | static inline unsigned long |
621 | copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) | 625 | copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) |
622 | { | 626 | { |
623 | switch(version) { | 627 | switch(version) { |
624 | case IPC_64: | 628 | case IPC_64: |
625 | if (copy_from_user(out, buf, sizeof(*out))) | 629 | if (copy_from_user(out, buf, sizeof(*out))) |
626 | return -EFAULT; | 630 | return -EFAULT; |
627 | return 0; | 631 | return 0; |
628 | case IPC_OLD: | 632 | case IPC_OLD: |
629 | { | 633 | { |
630 | struct shmid_ds tbuf_old; | 634 | struct shmid_ds tbuf_old; |
631 | 635 | ||
632 | if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) | 636 | if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) |
633 | return -EFAULT; | 637 | return -EFAULT; |
634 | 638 | ||
635 | out->shm_perm.uid = tbuf_old.shm_perm.uid; | 639 | out->shm_perm.uid = tbuf_old.shm_perm.uid; |
636 | out->shm_perm.gid = tbuf_old.shm_perm.gid; | 640 | out->shm_perm.gid = tbuf_old.shm_perm.gid; |
637 | out->shm_perm.mode = tbuf_old.shm_perm.mode; | 641 | out->shm_perm.mode = tbuf_old.shm_perm.mode; |
638 | 642 | ||
639 | return 0; | 643 | return 0; |
640 | } | 644 | } |
641 | default: | 645 | default: |
642 | return -EINVAL; | 646 | return -EINVAL; |
643 | } | 647 | } |
644 | } | 648 | } |
645 | 649 | ||
646 | static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) | 650 | static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) |
647 | { | 651 | { |
648 | switch(version) { | 652 | switch(version) { |
649 | case IPC_64: | 653 | case IPC_64: |
650 | return copy_to_user(buf, in, sizeof(*in)); | 654 | return copy_to_user(buf, in, sizeof(*in)); |
651 | case IPC_OLD: | 655 | case IPC_OLD: |
652 | { | 656 | { |
653 | struct shminfo out; | 657 | struct shminfo out; |
654 | 658 | ||
655 | if(in->shmmax > INT_MAX) | 659 | if(in->shmmax > INT_MAX) |
656 | out.shmmax = INT_MAX; | 660 | out.shmmax = INT_MAX; |
657 | else | 661 | else |
658 | out.shmmax = (int)in->shmmax; | 662 | out.shmmax = (int)in->shmmax; |
659 | 663 | ||
660 | out.shmmin = in->shmmin; | 664 | out.shmmin = in->shmmin; |
661 | out.shmmni = in->shmmni; | 665 | out.shmmni = in->shmmni; |
662 | out.shmseg = in->shmseg; | 666 | out.shmseg = in->shmseg; |
663 | out.shmall = in->shmall; | 667 | out.shmall = in->shmall; |
664 | 668 | ||
665 | return copy_to_user(buf, &out, sizeof(out)); | 669 | return copy_to_user(buf, &out, sizeof(out)); |
666 | } | 670 | } |
667 | default: | 671 | default: |
668 | return -EINVAL; | 672 | return -EINVAL; |
669 | } | 673 | } |
670 | } | 674 | } |
671 | 675 | ||
672 | /* | 676 | /* |
673 | * Calculate and add used RSS and swap pages of a shm. | 677 | * Calculate and add used RSS and swap pages of a shm. |
674 | * Called with shm_ids.rw_mutex held as a reader | 678 | * Called with shm_ids.rw_mutex held as a reader |
675 | */ | 679 | */ |
676 | static void shm_add_rss_swap(struct shmid_kernel *shp, | 680 | static void shm_add_rss_swap(struct shmid_kernel *shp, |
677 | unsigned long *rss_add, unsigned long *swp_add) | 681 | unsigned long *rss_add, unsigned long *swp_add) |
678 | { | 682 | { |
679 | struct inode *inode; | 683 | struct inode *inode; |
680 | 684 | ||
681 | inode = file_inode(shp->shm_file); | 685 | inode = file_inode(shp->shm_file); |
682 | 686 | ||
683 | if (is_file_hugepages(shp->shm_file)) { | 687 | if (is_file_hugepages(shp->shm_file)) { |
684 | struct address_space *mapping = inode->i_mapping; | 688 | struct address_space *mapping = inode->i_mapping; |
685 | struct hstate *h = hstate_file(shp->shm_file); | 689 | struct hstate *h = hstate_file(shp->shm_file); |
686 | *rss_add += pages_per_huge_page(h) * mapping->nrpages; | 690 | *rss_add += pages_per_huge_page(h) * mapping->nrpages; |
687 | } else { | 691 | } else { |
688 | #ifdef CONFIG_SHMEM | 692 | #ifdef CONFIG_SHMEM |
689 | struct shmem_inode_info *info = SHMEM_I(inode); | 693 | struct shmem_inode_info *info = SHMEM_I(inode); |
690 | spin_lock(&info->lock); | 694 | spin_lock(&info->lock); |
691 | *rss_add += inode->i_mapping->nrpages; | 695 | *rss_add += inode->i_mapping->nrpages; |
692 | *swp_add += info->swapped; | 696 | *swp_add += info->swapped; |
693 | spin_unlock(&info->lock); | 697 | spin_unlock(&info->lock); |
694 | #else | 698 | #else |
695 | *rss_add += inode->i_mapping->nrpages; | 699 | *rss_add += inode->i_mapping->nrpages; |
696 | #endif | 700 | #endif |
697 | } | 701 | } |
698 | } | 702 | } |
699 | 703 | ||
700 | /* | 704 | /* |
701 | * Called with shm_ids.rw_mutex held as a reader | 705 | * Called with shm_ids.rw_mutex held as a reader |
702 | */ | 706 | */ |
703 | static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, | 707 | static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, |
704 | unsigned long *swp) | 708 | unsigned long *swp) |
705 | { | 709 | { |
706 | int next_id; | 710 | int next_id; |
707 | int total, in_use; | 711 | int total, in_use; |
708 | 712 | ||
709 | *rss = 0; | 713 | *rss = 0; |
710 | *swp = 0; | 714 | *swp = 0; |
711 | 715 | ||
712 | in_use = shm_ids(ns).in_use; | 716 | in_use = shm_ids(ns).in_use; |
713 | 717 | ||
714 | for (total = 0, next_id = 0; total < in_use; next_id++) { | 718 | for (total = 0, next_id = 0; total < in_use; next_id++) { |
715 | struct kern_ipc_perm *ipc; | 719 | struct kern_ipc_perm *ipc; |
716 | struct shmid_kernel *shp; | 720 | struct shmid_kernel *shp; |
717 | 721 | ||
718 | ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); | 722 | ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); |
719 | if (ipc == NULL) | 723 | if (ipc == NULL) |
720 | continue; | 724 | continue; |
721 | shp = container_of(ipc, struct shmid_kernel, shm_perm); | 725 | shp = container_of(ipc, struct shmid_kernel, shm_perm); |
722 | 726 | ||
723 | shm_add_rss_swap(shp, rss, swp); | 727 | shm_add_rss_swap(shp, rss, swp); |
724 | 728 | ||
725 | total++; | 729 | total++; |
726 | } | 730 | } |
727 | } | 731 | } |
728 | 732 | ||
729 | /* | 733 | /* |
730 | * This function handles some shmctl commands which require the rw_mutex | 734 | * This function handles some shmctl commands which require the rw_mutex |
731 | * to be held in write mode. | 735 | * to be held in write mode. |
732 | * NOTE: no locks must be held, the rw_mutex is taken inside this function. | 736 | * NOTE: no locks must be held, the rw_mutex is taken inside this function. |
733 | */ | 737 | */ |
734 | static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, | 738 | static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, |
735 | struct shmid_ds __user *buf, int version) | 739 | struct shmid_ds __user *buf, int version) |
736 | { | 740 | { |
737 | struct kern_ipc_perm *ipcp; | 741 | struct kern_ipc_perm *ipcp; |
738 | struct shmid64_ds shmid64; | 742 | struct shmid64_ds shmid64; |
739 | struct shmid_kernel *shp; | 743 | struct shmid_kernel *shp; |
740 | int err; | 744 | int err; |
741 | 745 | ||
742 | if (cmd == IPC_SET) { | 746 | if (cmd == IPC_SET) { |
743 | if (copy_shmid_from_user(&shmid64, buf, version)) | 747 | if (copy_shmid_from_user(&shmid64, buf, version)) |
744 | return -EFAULT; | 748 | return -EFAULT; |
745 | } | 749 | } |
746 | 750 | ||
747 | ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, | 751 | ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, |
748 | &shmid64.shm_perm, 0); | 752 | &shmid64.shm_perm, 0); |
749 | if (IS_ERR(ipcp)) | 753 | if (IS_ERR(ipcp)) |
750 | return PTR_ERR(ipcp); | 754 | return PTR_ERR(ipcp); |
751 | 755 | ||
752 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 756 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
753 | 757 | ||
754 | err = security_shm_shmctl(shp, cmd); | 758 | err = security_shm_shmctl(shp, cmd); |
755 | if (err) | 759 | if (err) |
756 | goto out_unlock; | 760 | goto out_unlock; |
757 | switch (cmd) { | 761 | switch (cmd) { |
758 | case IPC_RMID: | 762 | case IPC_RMID: |
759 | do_shm_rmid(ns, ipcp); | 763 | do_shm_rmid(ns, ipcp); |
760 | goto out_up; | 764 | goto out_up; |
761 | case IPC_SET: | 765 | case IPC_SET: |
762 | err = ipc_update_perm(&shmid64.shm_perm, ipcp); | 766 | err = ipc_update_perm(&shmid64.shm_perm, ipcp); |
763 | if (err) | 767 | if (err) |
764 | goto out_unlock; | 768 | goto out_unlock; |
765 | shp->shm_ctim = get_seconds(); | 769 | shp->shm_ctim = get_seconds(); |
766 | break; | 770 | break; |
767 | default: | 771 | default: |
768 | err = -EINVAL; | 772 | err = -EINVAL; |
769 | } | 773 | } |
770 | out_unlock: | 774 | out_unlock: |
771 | shm_unlock(shp); | 775 | shm_unlock(shp); |
772 | out_up: | 776 | out_up: |
773 | up_write(&shm_ids(ns).rw_mutex); | 777 | up_write(&shm_ids(ns).rw_mutex); |
774 | return err; | 778 | return err; |
775 | } | 779 | } |
776 | 780 | ||
777 | SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) | 781 | SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) |
778 | { | 782 | { |
779 | struct shmid_kernel *shp; | 783 | struct shmid_kernel *shp; |
780 | int err, version; | 784 | int err, version; |
781 | struct ipc_namespace *ns; | 785 | struct ipc_namespace *ns; |
782 | 786 | ||
783 | if (cmd < 0 || shmid < 0) { | 787 | if (cmd < 0 || shmid < 0) { |
784 | err = -EINVAL; | 788 | err = -EINVAL; |
785 | goto out; | 789 | goto out; |
786 | } | 790 | } |
787 | 791 | ||
788 | version = ipc_parse_version(&cmd); | 792 | version = ipc_parse_version(&cmd); |
789 | ns = current->nsproxy->ipc_ns; | 793 | ns = current->nsproxy->ipc_ns; |
790 | 794 | ||
791 | switch (cmd) { /* replace with proc interface ? */ | 795 | switch (cmd) { /* replace with proc interface ? */ |
792 | case IPC_INFO: | 796 | case IPC_INFO: |
793 | { | 797 | { |
794 | struct shminfo64 shminfo; | 798 | struct shminfo64 shminfo; |
795 | 799 | ||
796 | err = security_shm_shmctl(NULL, cmd); | 800 | err = security_shm_shmctl(NULL, cmd); |
797 | if (err) | 801 | if (err) |
798 | return err; | 802 | return err; |
799 | 803 | ||
800 | memset(&shminfo, 0, sizeof(shminfo)); | 804 | memset(&shminfo, 0, sizeof(shminfo)); |
801 | shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; | 805 | shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; |
802 | shminfo.shmmax = ns->shm_ctlmax; | 806 | shminfo.shmmax = ns->shm_ctlmax; |
803 | shminfo.shmall = ns->shm_ctlall; | 807 | shminfo.shmall = ns->shm_ctlall; |
804 | 808 | ||
805 | shminfo.shmmin = SHMMIN; | 809 | shminfo.shmmin = SHMMIN; |
806 | if(copy_shminfo_to_user (buf, &shminfo, version)) | 810 | if(copy_shminfo_to_user (buf, &shminfo, version)) |
807 | return -EFAULT; | 811 | return -EFAULT; |
808 | 812 | ||
809 | down_read(&shm_ids(ns).rw_mutex); | 813 | down_read(&shm_ids(ns).rw_mutex); |
810 | err = ipc_get_maxid(&shm_ids(ns)); | 814 | err = ipc_get_maxid(&shm_ids(ns)); |
811 | up_read(&shm_ids(ns).rw_mutex); | 815 | up_read(&shm_ids(ns).rw_mutex); |
812 | 816 | ||
813 | if(err<0) | 817 | if(err<0) |
814 | err = 0; | 818 | err = 0; |
815 | goto out; | 819 | goto out; |
816 | } | 820 | } |
817 | case SHM_INFO: | 821 | case SHM_INFO: |
818 | { | 822 | { |
819 | struct shm_info shm_info; | 823 | struct shm_info shm_info; |
820 | 824 | ||
821 | err = security_shm_shmctl(NULL, cmd); | 825 | err = security_shm_shmctl(NULL, cmd); |
822 | if (err) | 826 | if (err) |
823 | return err; | 827 | return err; |
824 | 828 | ||
825 | memset(&shm_info, 0, sizeof(shm_info)); | 829 | memset(&shm_info, 0, sizeof(shm_info)); |
826 | down_read(&shm_ids(ns).rw_mutex); | 830 | down_read(&shm_ids(ns).rw_mutex); |
827 | shm_info.used_ids = shm_ids(ns).in_use; | 831 | shm_info.used_ids = shm_ids(ns).in_use; |
828 | shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); | 832 | shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); |
829 | shm_info.shm_tot = ns->shm_tot; | 833 | shm_info.shm_tot = ns->shm_tot; |
830 | shm_info.swap_attempts = 0; | 834 | shm_info.swap_attempts = 0; |
831 | shm_info.swap_successes = 0; | 835 | shm_info.swap_successes = 0; |
832 | err = ipc_get_maxid(&shm_ids(ns)); | 836 | err = ipc_get_maxid(&shm_ids(ns)); |
833 | up_read(&shm_ids(ns).rw_mutex); | 837 | up_read(&shm_ids(ns).rw_mutex); |
834 | if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { | 838 | if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { |
835 | err = -EFAULT; | 839 | err = -EFAULT; |
836 | goto out; | 840 | goto out; |
837 | } | 841 | } |
838 | 842 | ||
839 | err = err < 0 ? 0 : err; | 843 | err = err < 0 ? 0 : err; |
840 | goto out; | 844 | goto out; |
841 | } | 845 | } |
842 | case SHM_STAT: | 846 | case SHM_STAT: |
843 | case IPC_STAT: | 847 | case IPC_STAT: |
844 | { | 848 | { |
845 | struct shmid64_ds tbuf; | 849 | struct shmid64_ds tbuf; |
846 | int result; | 850 | int result; |
847 | 851 | ||
848 | if (cmd == SHM_STAT) { | 852 | if (cmd == SHM_STAT) { |
849 | shp = shm_lock(ns, shmid); | 853 | shp = shm_lock(ns, shmid); |
850 | if (IS_ERR(shp)) { | 854 | if (IS_ERR(shp)) { |
851 | err = PTR_ERR(shp); | 855 | err = PTR_ERR(shp); |
852 | goto out; | 856 | goto out; |
853 | } | 857 | } |
854 | result = shp->shm_perm.id; | 858 | result = shp->shm_perm.id; |
855 | } else { | 859 | } else { |
856 | shp = shm_lock_check(ns, shmid); | 860 | shp = shm_lock_check(ns, shmid); |
857 | if (IS_ERR(shp)) { | 861 | if (IS_ERR(shp)) { |
858 | err = PTR_ERR(shp); | 862 | err = PTR_ERR(shp); |
859 | goto out; | 863 | goto out; |
860 | } | 864 | } |
861 | result = 0; | 865 | result = 0; |
862 | } | 866 | } |
863 | err = -EACCES; | 867 | err = -EACCES; |
864 | if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) | 868 | if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) |
865 | goto out_unlock; | 869 | goto out_unlock; |
866 | err = security_shm_shmctl(shp, cmd); | 870 | err = security_shm_shmctl(shp, cmd); |
867 | if (err) | 871 | if (err) |
868 | goto out_unlock; | 872 | goto out_unlock; |
869 | memset(&tbuf, 0, sizeof(tbuf)); | 873 | memset(&tbuf, 0, sizeof(tbuf)); |
870 | kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); | 874 | kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); |
871 | tbuf.shm_segsz = shp->shm_segsz; | 875 | tbuf.shm_segsz = shp->shm_segsz; |
872 | tbuf.shm_atime = shp->shm_atim; | 876 | tbuf.shm_atime = shp->shm_atim; |
873 | tbuf.shm_dtime = shp->shm_dtim; | 877 | tbuf.shm_dtime = shp->shm_dtim; |
874 | tbuf.shm_ctime = shp->shm_ctim; | 878 | tbuf.shm_ctime = shp->shm_ctim; |
875 | tbuf.shm_cpid = shp->shm_cprid; | 879 | tbuf.shm_cpid = shp->shm_cprid; |
876 | tbuf.shm_lpid = shp->shm_lprid; | 880 | tbuf.shm_lpid = shp->shm_lprid; |
877 | tbuf.shm_nattch = shp->shm_nattch; | 881 | tbuf.shm_nattch = shp->shm_nattch; |
878 | shm_unlock(shp); | 882 | shm_unlock(shp); |
879 | if(copy_shmid_to_user (buf, &tbuf, version)) | 883 | if(copy_shmid_to_user (buf, &tbuf, version)) |
880 | err = -EFAULT; | 884 | err = -EFAULT; |
881 | else | 885 | else |
882 | err = result; | 886 | err = result; |
883 | goto out; | 887 | goto out; |
884 | } | 888 | } |
885 | case SHM_LOCK: | 889 | case SHM_LOCK: |
886 | case SHM_UNLOCK: | 890 | case SHM_UNLOCK: |
887 | { | 891 | { |
888 | struct file *shm_file; | 892 | struct file *shm_file; |
889 | 893 | ||
890 | shp = shm_lock_check(ns, shmid); | 894 | shp = shm_lock_check(ns, shmid); |
891 | if (IS_ERR(shp)) { | 895 | if (IS_ERR(shp)) { |
892 | err = PTR_ERR(shp); | 896 | err = PTR_ERR(shp); |
893 | goto out; | 897 | goto out; |
894 | } | 898 | } |
895 | 899 | ||
896 | audit_ipc_obj(&(shp->shm_perm)); | 900 | audit_ipc_obj(&(shp->shm_perm)); |
897 | 901 | ||
898 | if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { | 902 | if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { |
899 | kuid_t euid = current_euid(); | 903 | kuid_t euid = current_euid(); |
900 | err = -EPERM; | 904 | err = -EPERM; |
901 | if (!uid_eq(euid, shp->shm_perm.uid) && | 905 | if (!uid_eq(euid, shp->shm_perm.uid) && |
902 | !uid_eq(euid, shp->shm_perm.cuid)) | 906 | !uid_eq(euid, shp->shm_perm.cuid)) |
903 | goto out_unlock; | 907 | goto out_unlock; |
904 | if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) | 908 | if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) |
905 | goto out_unlock; | 909 | goto out_unlock; |
906 | } | 910 | } |
907 | 911 | ||
908 | err = security_shm_shmctl(shp, cmd); | 912 | err = security_shm_shmctl(shp, cmd); |
909 | if (err) | 913 | if (err) |
910 | goto out_unlock; | 914 | goto out_unlock; |
911 | 915 | ||
912 | shm_file = shp->shm_file; | 916 | shm_file = shp->shm_file; |
913 | if (is_file_hugepages(shm_file)) | 917 | if (is_file_hugepages(shm_file)) |
914 | goto out_unlock; | 918 | goto out_unlock; |
915 | 919 | ||
916 | if (cmd == SHM_LOCK) { | 920 | if (cmd == SHM_LOCK) { |
917 | struct user_struct *user = current_user(); | 921 | struct user_struct *user = current_user(); |
918 | err = shmem_lock(shm_file, 1, user); | 922 | err = shmem_lock(shm_file, 1, user); |
919 | if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { | 923 | if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { |
920 | shp->shm_perm.mode |= SHM_LOCKED; | 924 | shp->shm_perm.mode |= SHM_LOCKED; |
921 | shp->mlock_user = user; | 925 | shp->mlock_user = user; |
922 | } | 926 | } |
923 | goto out_unlock; | 927 | goto out_unlock; |
924 | } | 928 | } |
925 | 929 | ||
926 | /* SHM_UNLOCK */ | 930 | /* SHM_UNLOCK */ |
927 | if (!(shp->shm_perm.mode & SHM_LOCKED)) | 931 | if (!(shp->shm_perm.mode & SHM_LOCKED)) |
928 | goto out_unlock; | 932 | goto out_unlock; |
929 | shmem_lock(shm_file, 0, shp->mlock_user); | 933 | shmem_lock(shm_file, 0, shp->mlock_user); |
930 | shp->shm_perm.mode &= ~SHM_LOCKED; | 934 | shp->shm_perm.mode &= ~SHM_LOCKED; |
931 | shp->mlock_user = NULL; | 935 | shp->mlock_user = NULL; |
932 | get_file(shm_file); | 936 | get_file(shm_file); |
933 | shm_unlock(shp); | 937 | shm_unlock(shp); |
934 | shmem_unlock_mapping(shm_file->f_mapping); | 938 | shmem_unlock_mapping(shm_file->f_mapping); |
935 | fput(shm_file); | 939 | fput(shm_file); |
936 | goto out; | 940 | goto out; |
937 | } | 941 | } |
938 | case IPC_RMID: | 942 | case IPC_RMID: |
939 | case IPC_SET: | 943 | case IPC_SET: |
940 | err = shmctl_down(ns, shmid, cmd, buf, version); | 944 | err = shmctl_down(ns, shmid, cmd, buf, version); |
941 | return err; | 945 | return err; |
942 | default: | 946 | default: |
943 | return -EINVAL; | 947 | return -EINVAL; |
944 | } | 948 | } |
945 | 949 | ||
946 | out_unlock: | 950 | out_unlock: |
947 | shm_unlock(shp); | 951 | shm_unlock(shp); |
948 | out: | 952 | out: |
949 | return err; | 953 | return err; |
950 | } | 954 | } |
951 | 955 | ||
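The SHM_STAT/IPC_STAT branch above copies the segment's shmid64_ds (size, creator/last pids, attach count, timestamps) back to the caller. A hedged userspace sketch of reading those fields through shmctl(); the command-line id argument is an assumption for illustration and would normally come from shmget():

#include <stdio.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/shm.h>

/* Usage: ./shmstat <shmid> */
int main(int argc, char **argv)
{
	struct shmid_ds ds;
	int id;

	if (argc != 2)
		return 1;
	id = atoi(argv[1]);

	if (shmctl(id, IPC_STAT, &ds) < 0) {
		perror("shmctl(IPC_STAT)");
		return 1;
	}
	printf("segsz=%zu nattch=%lu cpid=%d lpid=%d\n",
	       (size_t)ds.shm_segsz,
	       (unsigned long)ds.shm_nattch,
	       (int)ds.shm_cpid, (int)ds.shm_lpid);
	return 0;
}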
952 | /* | 956 | /* |
953 | * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists. | 957 | * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists. |
954 | * | 958 | * |
955 | * NOTE! Despite the name, this is NOT a direct system call entrypoint. The | 959 | * NOTE! Despite the name, this is NOT a direct system call entrypoint. The |
956 | * "raddr" thing points to kernel space, and there has to be a wrapper around | 960 | * "raddr" thing points to kernel space, and there has to be a wrapper around |
957 | * this. | 961 | * this. |
958 | */ | 962 | */ |
959 | long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, | 963 | long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, |
960 | unsigned long shmlba) | 964 | unsigned long shmlba) |
961 | { | 965 | { |
962 | struct shmid_kernel *shp; | 966 | struct shmid_kernel *shp; |
963 | unsigned long addr; | 967 | unsigned long addr; |
964 | unsigned long size; | 968 | unsigned long size; |
965 | struct file * file; | 969 | struct file * file; |
966 | int err; | 970 | int err; |
967 | unsigned long flags; | 971 | unsigned long flags; |
968 | unsigned long prot; | 972 | unsigned long prot; |
969 | int acc_mode; | 973 | int acc_mode; |
970 | struct ipc_namespace *ns; | 974 | struct ipc_namespace *ns; |
971 | struct shm_file_data *sfd; | 975 | struct shm_file_data *sfd; |
972 | struct path path; | 976 | struct path path; |
973 | fmode_t f_mode; | 977 | fmode_t f_mode; |
974 | unsigned long populate = 0; | 978 | unsigned long populate = 0; |
975 | 979 | ||
976 | err = -EINVAL; | 980 | err = -EINVAL; |
977 | if (shmid < 0) | 981 | if (shmid < 0) |
978 | goto out; | 982 | goto out; |
979 | else if ((addr = (ulong)shmaddr)) { | 983 | else if ((addr = (ulong)shmaddr)) { |
980 | if (addr & (shmlba - 1)) { | 984 | if (addr & (shmlba - 1)) { |
981 | if (shmflg & SHM_RND) | 985 | if (shmflg & SHM_RND) |
982 | addr &= ~(shmlba - 1); /* round down */ | 986 | addr &= ~(shmlba - 1); /* round down */ |
983 | else | 987 | else |
984 | #ifndef __ARCH_FORCE_SHMLBA | 988 | #ifndef __ARCH_FORCE_SHMLBA |
985 | if (addr & ~PAGE_MASK) | 989 | if (addr & ~PAGE_MASK) |
986 | #endif | 990 | #endif |
987 | goto out; | 991 | goto out; |
988 | } | 992 | } |
989 | flags = MAP_SHARED | MAP_FIXED; | 993 | flags = MAP_SHARED | MAP_FIXED; |
990 | } else { | 994 | } else { |
991 | if ((shmflg & SHM_REMAP)) | 995 | if ((shmflg & SHM_REMAP)) |
992 | goto out; | 996 | goto out; |
993 | 997 | ||
994 | flags = MAP_SHARED; | 998 | flags = MAP_SHARED; |
995 | } | 999 | } |
996 | 1000 | ||
997 | if (shmflg & SHM_RDONLY) { | 1001 | if (shmflg & SHM_RDONLY) { |
998 | prot = PROT_READ; | 1002 | prot = PROT_READ; |
999 | acc_mode = S_IRUGO; | 1003 | acc_mode = S_IRUGO; |
1000 | f_mode = FMODE_READ; | 1004 | f_mode = FMODE_READ; |
1001 | } else { | 1005 | } else { |
1002 | prot = PROT_READ | PROT_WRITE; | 1006 | prot = PROT_READ | PROT_WRITE; |
1003 | acc_mode = S_IRUGO | S_IWUGO; | 1007 | acc_mode = S_IRUGO | S_IWUGO; |
1004 | f_mode = FMODE_READ | FMODE_WRITE; | 1008 | f_mode = FMODE_READ | FMODE_WRITE; |
1005 | } | 1009 | } |
1006 | if (shmflg & SHM_EXEC) { | 1010 | if (shmflg & SHM_EXEC) { |
1007 | prot |= PROT_EXEC; | 1011 | prot |= PROT_EXEC; |
1008 | acc_mode |= S_IXUGO; | 1012 | acc_mode |= S_IXUGO; |
1009 | } | 1013 | } |
1010 | 1014 | ||
1011 | /* | 1015 | /* |
1012 | * We cannot rely on the fs check since SYSV IPC does have an | 1016 | * We cannot rely on the fs check since SYSV IPC does have an |
1013 | * additional creator id... | 1017 | * additional creator id... |
1014 | */ | 1018 | */ |
1015 | ns = current->nsproxy->ipc_ns; | 1019 | ns = current->nsproxy->ipc_ns; |
1016 | shp = shm_lock_check(ns, shmid); | 1020 | shp = shm_lock_check(ns, shmid); |
1017 | if (IS_ERR(shp)) { | 1021 | if (IS_ERR(shp)) { |
1018 | err = PTR_ERR(shp); | 1022 | err = PTR_ERR(shp); |
1019 | goto out; | 1023 | goto out; |
1020 | } | 1024 | } |
1021 | 1025 | ||
1022 | err = -EACCES; | 1026 | err = -EACCES; |
1023 | if (ipcperms(ns, &shp->shm_perm, acc_mode)) | 1027 | if (ipcperms(ns, &shp->shm_perm, acc_mode)) |
1024 | goto out_unlock; | 1028 | goto out_unlock; |
1025 | 1029 | ||
1026 | err = security_shm_shmat(shp, shmaddr, shmflg); | 1030 | err = security_shm_shmat(shp, shmaddr, shmflg); |
1027 | if (err) | 1031 | if (err) |
1028 | goto out_unlock; | 1032 | goto out_unlock; |
1029 | 1033 | ||
1030 | path = shp->shm_file->f_path; | 1034 | path = shp->shm_file->f_path; |
1031 | path_get(&path); | 1035 | path_get(&path); |
1032 | shp->shm_nattch++; | 1036 | shp->shm_nattch++; |
1033 | size = i_size_read(path.dentry->d_inode); | 1037 | size = i_size_read(path.dentry->d_inode); |
1034 | shm_unlock(shp); | 1038 | shm_unlock(shp); |
1035 | 1039 | ||
1036 | err = -ENOMEM; | 1040 | err = -ENOMEM; |
1037 | sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); | 1041 | sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); |
1038 | if (!sfd) | 1042 | if (!sfd) |
1039 | goto out_put_dentry; | 1043 | goto out_put_dentry; |
1040 | 1044 | ||
1041 | file = alloc_file(&path, f_mode, | 1045 | file = alloc_file(&path, f_mode, |
1042 | is_file_hugepages(shp->shm_file) ? | 1046 | is_file_hugepages(shp->shm_file) ? |
1043 | &shm_file_operations_huge : | 1047 | &shm_file_operations_huge : |
1044 | &shm_file_operations); | 1048 | &shm_file_operations); |
1045 | err = PTR_ERR(file); | 1049 | err = PTR_ERR(file); |
1046 | if (IS_ERR(file)) | 1050 | if (IS_ERR(file)) |
1047 | goto out_free; | 1051 | goto out_free; |
1048 | 1052 | ||
1049 | file->private_data = sfd; | 1053 | file->private_data = sfd; |
1050 | file->f_mapping = shp->shm_file->f_mapping; | 1054 | file->f_mapping = shp->shm_file->f_mapping; |
1051 | sfd->id = shp->shm_perm.id; | 1055 | sfd->id = shp->shm_perm.id; |
1052 | sfd->ns = get_ipc_ns(ns); | 1056 | sfd->ns = get_ipc_ns(ns); |
1053 | sfd->file = shp->shm_file; | 1057 | sfd->file = shp->shm_file; |
1054 | sfd->vm_ops = NULL; | 1058 | sfd->vm_ops = NULL; |
1055 | 1059 | ||
1056 | err = security_mmap_file(file, prot, flags); | 1060 | err = security_mmap_file(file, prot, flags); |
1057 | if (err) | 1061 | if (err) |
1058 | goto out_fput; | 1062 | goto out_fput; |
1059 | 1063 | ||
1060 | down_write(¤t->mm->mmap_sem); | 1064 | down_write(¤t->mm->mmap_sem); |
1061 | if (addr && !(shmflg & SHM_REMAP)) { | 1065 | if (addr && !(shmflg & SHM_REMAP)) { |
1062 | err = -EINVAL; | 1066 | err = -EINVAL; |
1063 | if (find_vma_intersection(current->mm, addr, addr + size)) | 1067 | if (find_vma_intersection(current->mm, addr, addr + size)) |
1064 | goto invalid; | 1068 | goto invalid; |
1065 | /* | 1069 | /* |
1066 | * If shm segment goes below stack, make sure there is some | 1070 | * If shm segment goes below stack, make sure there is some |
1067 | * space left for the stack to grow (at least 4 pages). | 1071 | * space left for the stack to grow (at least 4 pages). |
1068 | */ | 1072 | */ |
1069 | if (addr < current->mm->start_stack && | 1073 | if (addr < current->mm->start_stack && |
1070 | addr > current->mm->start_stack - size - PAGE_SIZE * 5) | 1074 | addr > current->mm->start_stack - size - PAGE_SIZE * 5) |
1071 | goto invalid; | 1075 | goto invalid; |
1072 | } | 1076 | } |
1073 | 1077 | ||
1074 | addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); | 1078 | addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); |
1075 | *raddr = addr; | 1079 | *raddr = addr; |
1076 | err = 0; | 1080 | err = 0; |
1077 | if (IS_ERR_VALUE(addr)) | 1081 | if (IS_ERR_VALUE(addr)) |
1078 | err = (long)addr; | 1082 | err = (long)addr; |
1079 | invalid: | 1083 | invalid: |
1080 | up_write(¤t->mm->mmap_sem); | 1084 | up_write(¤t->mm->mmap_sem); |
1081 | if (populate) | 1085 | if (populate) |
1082 | mm_populate(addr, populate); | 1086 | mm_populate(addr, populate); |
1083 | 1087 | ||
1084 | out_fput: | 1088 | out_fput: |
1085 | fput(file); | 1089 | fput(file); |
1086 | 1090 | ||
1087 | out_nattch: | 1091 | out_nattch: |
1088 | down_write(&shm_ids(ns).rw_mutex); | 1092 | down_write(&shm_ids(ns).rw_mutex); |
1089 | shp = shm_lock(ns, shmid); | 1093 | shp = shm_lock(ns, shmid); |
1090 | BUG_ON(IS_ERR(shp)); | 1094 | BUG_ON(IS_ERR(shp)); |
1091 | shp->shm_nattch--; | 1095 | shp->shm_nattch--; |
1092 | if (shm_may_destroy(ns, shp)) | 1096 | if (shm_may_destroy(ns, shp)) |
1093 | shm_destroy(ns, shp); | 1097 | shm_destroy(ns, shp); |
1094 | else | 1098 | else |
1095 | shm_unlock(shp); | 1099 | shm_unlock(shp); |
1096 | up_write(&shm_ids(ns).rw_mutex); | 1100 | up_write(&shm_ids(ns).rw_mutex); |
1097 | 1101 | ||
1098 | out: | 1102 | out: |
1099 | return err; | 1103 | return err; |
1100 | 1104 | ||
1101 | out_unlock: | 1105 | out_unlock: |
1102 | shm_unlock(shp); | 1106 | shm_unlock(shp); |
1103 | goto out; | 1107 | goto out; |
1104 | 1108 | ||
1105 | out_free: | 1109 | out_free: |
1106 | kfree(sfd); | 1110 | kfree(sfd); |
1107 | out_put_dentry: | 1111 | out_put_dentry: |
1108 | path_put(&path); | 1112 | path_put(&path); |
1109 | goto out_nattch; | 1113 | goto out_nattch; |
1110 | } | 1114 | } |
1111 | 1115 | ||
1112 | SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) | 1116 | SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) |
1113 | { | 1117 | { |
1114 | unsigned long ret; | 1118 | unsigned long ret; |
1115 | long err; | 1119 | long err; |
1116 | 1120 | ||
1117 | err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); | 1121 | err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); |
1118 | if (err) | 1122 | if (err) |
1119 | return err; | 1123 | return err; |
1120 | force_successful_syscall_return(); | 1124 | force_successful_syscall_return(); |
1121 | return (long)ret; | 1125 | return (long)ret; |
1122 | } | 1126 | } |
1123 | 1127 | ||
1124 | /* | 1128 | /* |
1125 | * detach and kill segment if marked destroyed. | 1129 | * detach and kill segment if marked destroyed. |
1126 | * The work is done in shm_close. | 1130 | * The work is done in shm_close. |
1127 | */ | 1131 | */ |
1128 | SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | 1132 | SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) |
1129 | { | 1133 | { |
1130 | struct mm_struct *mm = current->mm; | 1134 | struct mm_struct *mm = current->mm; |
1131 | struct vm_area_struct *vma; | 1135 | struct vm_area_struct *vma; |
1132 | unsigned long addr = (unsigned long)shmaddr; | 1136 | unsigned long addr = (unsigned long)shmaddr; |
1133 | int retval = -EINVAL; | 1137 | int retval = -EINVAL; |
1134 | #ifdef CONFIG_MMU | 1138 | #ifdef CONFIG_MMU |
1135 | loff_t size = 0; | 1139 | loff_t size = 0; |
1136 | struct vm_area_struct *next; | 1140 | struct vm_area_struct *next; |
1137 | #endif | 1141 | #endif |
1138 | 1142 | ||
1139 | if (addr & ~PAGE_MASK) | 1143 | if (addr & ~PAGE_MASK) |
1140 | return retval; | 1144 | return retval; |
1141 | 1145 | ||
1142 | down_write(&mm->mmap_sem); | 1146 | down_write(&mm->mmap_sem); |
1143 | 1147 | ||
1144 | /* | 1148 | /* |
1145 | * This function tries to be smart and unmap shm segments that | 1149 | * This function tries to be smart and unmap shm segments that |
1146 | * were modified by partial mlock or munmap calls: | 1150 | * were modified by partial mlock or munmap calls: |
1147 | * - It first determines the size of the shm segment that should be | 1151 | * - It first determines the size of the shm segment that should be |
1148 | * unmapped: It searches for a vma that is backed by shm and that | 1152 | * unmapped: It searches for a vma that is backed by shm and that |
1149 | * started at address shmaddr. It records its size and then unmaps | 1153 | * started at address shmaddr. It records its size and then unmaps |
1150 | * it. | 1154 | * it. |
1151 | * - Then it unmaps all shm vmas that started at shmaddr and that | 1155 | * - Then it unmaps all shm vmas that started at shmaddr and that |
1152 | * are within the initially determined size. | 1156 | * are within the initially determined size. |
1153 | * Errors from do_munmap are ignored: the function only fails if | 1157 | * Errors from do_munmap are ignored: the function only fails if |
1154 | * it's called with invalid parameters or if it's called to unmap | 1158 | * it's called with invalid parameters or if it's called to unmap |
1155 | * a part of a vma. Both calls in this function are for full vmas, | 1159 | * a part of a vma. Both calls in this function are for full vmas, |
1156 | * the parameters are directly copied from the vma itself and always | 1160 | * the parameters are directly copied from the vma itself and always |
1157 | * valid - therefore do_munmap cannot fail. (famous last words?) | 1161 | * valid - therefore do_munmap cannot fail. (famous last words?) |
1158 | */ | 1162 | */ |
1159 | /* | 1163 | /* |
1160 | * If it had been mremap()'d, the starting address would not | 1164 | * If it had been mremap()'d, the starting address would not |
1161 | * match the usual checks anyway. So assume all vma's are | 1165 | * match the usual checks anyway. So assume all vma's are |
1162 | * above the starting address given. | 1166 | * above the starting address given. |
1163 | */ | 1167 | */ |
1164 | vma = find_vma(mm, addr); | 1168 | vma = find_vma(mm, addr); |
1165 | 1169 | ||
1166 | #ifdef CONFIG_MMU | 1170 | #ifdef CONFIG_MMU |
1167 | while (vma) { | 1171 | while (vma) { |
1168 | next = vma->vm_next; | 1172 | next = vma->vm_next; |
1169 | 1173 | ||
1170 | /* | 1174 | /* |
1171 | * Check if the starting address would match, i.e. it's | 1175 | * Check if the starting address would match, i.e. it's |
1172 | * a fragment created by mprotect() and/or munmap(), or it | 1176 | * a fragment created by mprotect() and/or munmap(), or it |
1173 | * otherwise starts at this address with no hassles. | 1177 | * otherwise starts at this address with no hassles. |
1174 | */ | 1178 | */ |
1175 | if ((vma->vm_ops == &shm_vm_ops) && | 1179 | if ((vma->vm_ops == &shm_vm_ops) && |
1176 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { | 1180 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { |
1177 | 1181 | ||
1178 | 1182 | ||
1179 | size = file_inode(vma->vm_file)->i_size; | 1183 | size = file_inode(vma->vm_file)->i_size; |
1180 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1184 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1181 | /* | 1185 | /* |
1182 | * We discovered the size of the shm segment, so | 1186 | * We discovered the size of the shm segment, so |
1183 | * break out of here and fall through to the next | 1187 | * break out of here and fall through to the next |
1184 | * loop that uses the size information to stop | 1188 | * loop that uses the size information to stop |
1185 | * searching for matching vma's. | 1189 | * searching for matching vma's. |
1186 | */ | 1190 | */ |
1187 | retval = 0; | 1191 | retval = 0; |
1188 | vma = next; | 1192 | vma = next; |
1189 | break; | 1193 | break; |
1190 | } | 1194 | } |
1191 | vma = next; | 1195 | vma = next; |
1192 | } | 1196 | } |
1193 | 1197 | ||
1194 | /* | 1198 | /* |
1195 | * We need look no further than the maximum address a fragment | 1199 | * We need look no further than the maximum address a fragment |
1196 | * could possibly have landed at. Also cast things to loff_t to | 1200 | * could possibly have landed at. Also cast things to loff_t to |
1197 | * prevent overflows and make comparisons vs. equal-width types. | 1201 | * prevent overflows and make comparisons vs. equal-width types. |
1198 | */ | 1202 | */ |
1199 | size = PAGE_ALIGN(size); | 1203 | size = PAGE_ALIGN(size); |
1200 | while (vma && (loff_t)(vma->vm_end - addr) <= size) { | 1204 | while (vma && (loff_t)(vma->vm_end - addr) <= size) { |
1201 | next = vma->vm_next; | 1205 | next = vma->vm_next; |
1202 | 1206 | ||
1203 | /* finding a matching vma now does not alter retval */ | 1207 | /* finding a matching vma now does not alter retval */ |
1204 | if ((vma->vm_ops == &shm_vm_ops) && | 1208 | if ((vma->vm_ops == &shm_vm_ops) && |
1205 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) | 1209 | (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) |
1206 | 1210 | ||
1207 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1211 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1208 | vma = next; | 1212 | vma = next; |
1209 | } | 1213 | } |
1210 | 1214 | ||
1211 | #else /* CONFIG_MMU */ | 1215 | #else /* CONFIG_MMU */ |
1212 | /* under NOMMU conditions, the exact address to be destroyed must be | 1216 | /* under NOMMU conditions, the exact address to be destroyed must be |
1213 | * given */ | 1217 | * given */ |
1214 | retval = -EINVAL; | 1218 | retval = -EINVAL; |
1215 | if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { | 1219 | if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { |
1216 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1220 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1217 | retval = 0; | 1221 | retval = 0; |
1218 | } | 1222 | } |
1219 | 1223 | ||
1220 | #endif | 1224 | #endif |
1221 | 1225 | ||
1222 | up_write(&mm->mmap_sem); | 1226 | up_write(&mm->mmap_sem); |
1223 | return retval; | 1227 | return retval; |
1224 | } | 1228 | } |
1225 | 1229 | ||
1226 | #ifdef CONFIG_PROC_FS | 1230 | #ifdef CONFIG_PROC_FS |
1227 | static int sysvipc_shm_proc_show(struct seq_file *s, void *it) | 1231 | static int sysvipc_shm_proc_show(struct seq_file *s, void *it) |
1228 | { | 1232 | { |
1229 | struct user_namespace *user_ns = seq_user_ns(s); | 1233 | struct user_namespace *user_ns = seq_user_ns(s); |
1230 | struct shmid_kernel *shp = it; | 1234 | struct shmid_kernel *shp = it; |
1231 | unsigned long rss = 0, swp = 0; | 1235 | unsigned long rss = 0, swp = 0; |
1232 | 1236 | ||
1233 | shm_add_rss_swap(shp, &rss, &swp); | 1237 | shm_add_rss_swap(shp, &rss, &swp); |
1234 | 1238 | ||
1235 | #if BITS_PER_LONG <= 32 | 1239 | #if BITS_PER_LONG <= 32 |
1236 | #define SIZE_SPEC "%10lu" | 1240 | #define SIZE_SPEC "%10lu" |
1237 | #else | 1241 | #else |
1238 | #define SIZE_SPEC "%21lu" | 1242 | #define SIZE_SPEC "%21lu" |
1239 | #endif | 1243 | #endif |
1240 | 1244 | ||
1241 | return seq_printf(s, | 1245 | return seq_printf(s, |
1242 | "%10d %10d %4o " SIZE_SPEC " %5u %5u " | 1246 | "%10d %10d %4o " SIZE_SPEC " %5u %5u " |
1243 | "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " | 1247 | "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " |
1244 | SIZE_SPEC " " SIZE_SPEC "\n", | 1248 | SIZE_SPEC " " SIZE_SPEC "\n", |
1245 | shp->shm_perm.key, | 1249 | shp->shm_perm.key, |
1246 | shp->shm_perm.id, | 1250 | shp->shm_perm.id, |
1247 | shp->shm_perm.mode, | 1251 | shp->shm_perm.mode, |
1248 | shp->shm_segsz, | 1252 | shp->shm_segsz, |
1249 | shp->shm_cprid, | 1253 | shp->shm_cprid, |
1250 | shp->shm_lprid, | 1254 | shp->shm_lprid, |
1251 | shp->shm_nattch, | 1255 | shp->shm_nattch, |
1252 | from_kuid_munged(user_ns, shp->shm_perm.uid), | 1256 | from_kuid_munged(user_ns, shp->shm_perm.uid), |
1253 | from_kgid_munged(user_ns, shp->shm_perm.gid), | 1257 | from_kgid_munged(user_ns, shp->shm_perm.gid), |
1254 | from_kuid_munged(user_ns, shp->shm_perm.cuid), | 1258 | from_kuid_munged(user_ns, shp->shm_perm.cuid), |
1255 | from_kgid_munged(user_ns, shp->shm_perm.cgid), | 1259 | from_kgid_munged(user_ns, shp->shm_perm.cgid), |
1256 | shp->shm_atim, | 1260 | shp->shm_atim, |
1257 | shp->shm_dtim, | 1261 | shp->shm_dtim, |
1258 | shp->shm_ctim, | 1262 | shp->shm_ctim, |
1259 | rss * PAGE_SIZE, | 1263 | rss * PAGE_SIZE, |
1260 | swp * PAGE_SIZE); | 1264 | swp * PAGE_SIZE); |
1261 | } | 1265 | } |
1262 | #endif | 1266 | #endif |
1263 | 1267 |
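The seq_printf() format above is what backs /proc/sysvipc/shm. A hedged reader sketch (assuming the usual single header row followed by one line per segment, columns in the order of the format string: key, id, mode, size, cpid, lpid, nattch, ...) that pulls out the id, byte size and attach count:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sysvipc/shm", "r");
	char line[512];

	if (!f) { perror("fopen"); return 1; }

	/* Skip the header row printed before the per-segment lines. */
	if (!fgets(line, sizeof(line), f)) { fclose(f); return 1; }

	while (fgets(line, sizeof(line), f)) {
		int key, shmid;
		unsigned int perms;
		unsigned long size, nattch;

		/* key id mode size cpid lpid nattch ... as in the seq_printf() above */
		if (sscanf(line, "%d %d %o %lu %*d %*d %lu",
			   &key, &shmid, &perms, &size, &nattch) == 5)
			printf("shmid %d: %lu bytes, %lu attaches\n",
			       shmid, size, nattch);
	}
	fclose(f);
	return 0;
}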
mm/mmap.c
1 | /* | 1 | /* |
2 | * mm/mmap.c | 2 | * mm/mmap.c |
3 | * | 3 | * |
4 | * Written by obz. | 4 | * Written by obz. |
5 | * | 5 | * |
6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> | 6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/backing-dev.h> | 11 | #include <linux/backing-dev.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/shm.h> | 13 | #include <linux/shm.h> |
14 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/swap.h> | 16 | #include <linux/swap.h> |
17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
18 | #include <linux/capability.h> | 18 | #include <linux/capability.h> |
19 | #include <linux/init.h> | 19 | #include <linux/init.h> |
20 | #include <linux/file.h> | 20 | #include <linux/file.h> |
21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
22 | #include <linux/personality.h> | 22 | #include <linux/personality.h> |
23 | #include <linux/security.h> | 23 | #include <linux/security.h> |
24 | #include <linux/hugetlb.h> | 24 | #include <linux/hugetlb.h> |
25 | #include <linux/profile.h> | 25 | #include <linux/profile.h> |
26 | #include <linux/export.h> | 26 | #include <linux/export.h> |
27 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
28 | #include <linux/mempolicy.h> | 28 | #include <linux/mempolicy.h> |
29 | #include <linux/rmap.h> | 29 | #include <linux/rmap.h> |
30 | #include <linux/mmu_notifier.h> | 30 | #include <linux/mmu_notifier.h> |
31 | #include <linux/perf_event.h> | 31 | #include <linux/perf_event.h> |
32 | #include <linux/audit.h> | 32 | #include <linux/audit.h> |
33 | #include <linux/khugepaged.h> | 33 | #include <linux/khugepaged.h> |
34 | #include <linux/uprobes.h> | 34 | #include <linux/uprobes.h> |
35 | #include <linux/rbtree_augmented.h> | 35 | #include <linux/rbtree_augmented.h> |
36 | #include <linux/sched/sysctl.h> | 36 | #include <linux/sched/sysctl.h> |
37 | #include <linux/notifier.h> | 37 | #include <linux/notifier.h> |
38 | #include <linux/memory.h> | 38 | #include <linux/memory.h> |
39 | 39 | ||
40 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
41 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
42 | #include <asm/tlb.h> | 42 | #include <asm/tlb.h> |
43 | #include <asm/mmu_context.h> | 43 | #include <asm/mmu_context.h> |
44 | 44 | ||
45 | #include "internal.h" | 45 | #include "internal.h" |
46 | 46 | ||
47 | #ifndef arch_mmap_check | 47 | #ifndef arch_mmap_check |
48 | #define arch_mmap_check(addr, len, flags) (0) | 48 | #define arch_mmap_check(addr, len, flags) (0) |
49 | #endif | 49 | #endif |
50 | 50 | ||
51 | #ifndef arch_rebalance_pgtables | 51 | #ifndef arch_rebalance_pgtables |
52 | #define arch_rebalance_pgtables(addr, len) (addr) | 52 | #define arch_rebalance_pgtables(addr, len) (addr) |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | static void unmap_region(struct mm_struct *mm, | 55 | static void unmap_region(struct mm_struct *mm, |
56 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 56 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
57 | unsigned long start, unsigned long end); | 57 | unsigned long start, unsigned long end); |
58 | 58 | ||
59 | /* description of effects of mapping type and prot in current implementation. | 59 | /* description of effects of mapping type and prot in current implementation. |
60 | * this is due to the limited x86 page protection hardware. The expected | 60 | * this is due to the limited x86 page protection hardware. The expected |
61 | * behavior is in parens: | 61 | * behavior is in parens: |
62 | * | 62 | * |
63 | * map_type prot | 63 | * map_type prot |
64 | * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC | 64 | * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC |
65 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 65 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
66 | * w: (no) no w: (no) no w: (yes) yes w: (no) no | 66 | * w: (no) no w: (no) no w: (yes) yes w: (no) no |
67 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 67 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
68 | * | 68 | * |
69 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 69 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
70 | * w: (no) no w: (no) no w: (copy) copy w: (no) no | 70 | * w: (no) no w: (no) no w: (copy) copy w: (no) no |
71 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 71 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
72 | * | 72 | * |
73 | */ | 73 | */ |
74 | pgprot_t protection_map[16] = { | 74 | pgprot_t protection_map[16] = { |
75 | __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, | 75 | __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, |
76 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 | 76 | __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 |
77 | }; | 77 | }; |
78 | 78 | ||
79 | pgprot_t vm_get_page_prot(unsigned long vm_flags) | 79 | pgprot_t vm_get_page_prot(unsigned long vm_flags) |
80 | { | 80 | { |
81 | return __pgprot(pgprot_val(protection_map[vm_flags & | 81 | return __pgprot(pgprot_val(protection_map[vm_flags & |
82 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | | 82 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | |
83 | pgprot_val(arch_vm_get_page_prot(vm_flags))); | 83 | pgprot_val(arch_vm_get_page_prot(vm_flags))); |
84 | } | 84 | } |
85 | EXPORT_SYMBOL(vm_get_page_prot); | 85 | EXPORT_SYMBOL(vm_get_page_prot); |
86 | 86 | ||
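As an aside, a simplified userspace model of the lookup vm_get_page_prot() performs: the low four vm_flags bits select one of the sixteen protection_map slots. The placeholder strings stand in for the architecture-specific __P*/__S* values, and model_vm_get_page_prot() is a hypothetical name, not a kernel symbol.

#include <stdio.h>

#define VM_READ   0x1
#define VM_WRITE  0x2
#define VM_EXEC   0x4
#define VM_SHARED 0x8

/* Slot names only; the real table holds arch-defined pgprot_t values. */
static const char *prot_map[16] = {
	"__P000", "__P001", "__P010", "__P011",
	"__P100", "__P101", "__P110", "__P111",
	"__S000", "__S001", "__S010", "__S011",
	"__S100", "__S101", "__S110", "__S111",
};

static const char *model_vm_get_page_prot(unsigned long vm_flags)
{
	return prot_map[vm_flags & (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];
}

int main(void)
{
	printf("%s\n", model_vm_get_page_prot(VM_READ | VM_WRITE));   /* __P011: private rw */
	printf("%s\n", model_vm_get_page_prot(VM_READ | VM_SHARED));  /* __S001: shared ro */
	return 0;
}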
87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | 91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
92 | /* | 92 | /* |
93 | * Make sure vm_committed_as is in one cacheline and not cacheline shared with | 93 | * Make sure vm_committed_as is in one cacheline and not cacheline shared with |
94 | * other variables. It can be updated by several CPUs frequently. | 94 | * other variables. It can be updated by several CPUs frequently. |
95 | */ | 95 | */ |
96 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; | 96 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; |
97 | 97 | ||
98 | /* | 98 | /* |
99 | * The global memory commitment made in the system can be a metric | 99 | * The global memory commitment made in the system can be a metric |
100 | * that can be used to drive ballooning decisions when Linux is hosted | 100 | * that can be used to drive ballooning decisions when Linux is hosted |
101 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically | 101 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically |
102 | * balancing memory across competing virtual machines that are hosted. | 102 | * balancing memory across competing virtual machines that are hosted. |
103 | * Several metrics drive this policy engine including the guest reported | 103 | * Several metrics drive this policy engine including the guest reported |
104 | * memory commitment. | 104 | * memory commitment. |
105 | */ | 105 | */ |
106 | unsigned long vm_memory_committed(void) | 106 | unsigned long vm_memory_committed(void) |
107 | { | 107 | { |
108 | return percpu_counter_read_positive(&vm_committed_as); | 108 | return percpu_counter_read_positive(&vm_committed_as); |
109 | } | 109 | } |
110 | EXPORT_SYMBOL_GPL(vm_memory_committed); | 110 | EXPORT_SYMBOL_GPL(vm_memory_committed); |
111 | 111 | ||
112 | /* | 112 | /* |
113 | * Check that a process has enough memory to allocate a new virtual | 113 | * Check that a process has enough memory to allocate a new virtual |
114 | * mapping. 0 means there is enough memory for the allocation to | 114 | * mapping. 0 means there is enough memory for the allocation to |
115 | * succeed and -ENOMEM implies there is not. | 115 | * succeed and -ENOMEM implies there is not. |
116 | * | 116 | * |
117 | * We currently support three overcommit policies, which are set via the | 117 | * We currently support three overcommit policies, which are set via the |
118 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting | 118 | * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting |
119 | * | 119 | * |
120 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. | 120 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. |
121 | * Additional code 2002 Jul 20 by Robert Love. | 121 | * Additional code 2002 Jul 20 by Robert Love. |
122 | * | 122 | * |
123 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. | 123 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. |
124 | * | 124 | * |
125 | * Note this is a helper function intended to be used by LSMs which | 125 | * Note this is a helper function intended to be used by LSMs which |
126 | * wish to use this logic. | 126 | * wish to use this logic. |
127 | */ | 127 | */ |
128 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 128 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
129 | { | 129 | { |
130 | unsigned long free, allowed, reserve; | 130 | unsigned long free, allowed, reserve; |
131 | 131 | ||
132 | vm_acct_memory(pages); | 132 | vm_acct_memory(pages); |
133 | 133 | ||
134 | /* | 134 | /* |
135 | * Sometimes we want to use more memory than we have | 135 | * Sometimes we want to use more memory than we have |
136 | */ | 136 | */ |
137 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) | 137 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) |
138 | return 0; | 138 | return 0; |
139 | 139 | ||
140 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 140 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
141 | free = global_page_state(NR_FREE_PAGES); | 141 | free = global_page_state(NR_FREE_PAGES); |
142 | free += global_page_state(NR_FILE_PAGES); | 142 | free += global_page_state(NR_FILE_PAGES); |
143 | 143 | ||
144 | /* | 144 | /* |
145 | * shmem pages shouldn't be counted as free in this | 145 | * shmem pages shouldn't be counted as free in this |
146 | * case, they can't be purged, only swapped out, and | 146 | * case, they can't be purged, only swapped out, and |
147 | * that won't affect the overall amount of available | 147 | * that won't affect the overall amount of available |
148 | * memory in the system. | 148 | * memory in the system. |
149 | */ | 149 | */ |
150 | free -= global_page_state(NR_SHMEM); | 150 | free -= global_page_state(NR_SHMEM); |
151 | 151 | ||
152 | free += get_nr_swap_pages(); | 152 | free += get_nr_swap_pages(); |
153 | 153 | ||
154 | /* | 154 | /* |
155 | * Any slabs which are created with the | 155 | * Any slabs which are created with the |
156 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents | 156 | * SLAB_RECLAIM_ACCOUNT flag claim to have contents |
157 | * which are reclaimable, under pressure. The dentry | 157 | * which are reclaimable, under pressure. The dentry |
158 | * cache and most inode caches should fall into this | 158 | * cache and most inode caches should fall into this |
159 | */ | 159 | */ |
160 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 160 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
161 | 161 | ||
162 | /* | 162 | /* |
163 | * Leave reserved pages. The pages are not for anonymous pages. | 163 | * Leave reserved pages. The pages are not for anonymous pages. |
164 | */ | 164 | */ |
165 | if (free <= totalreserve_pages) | 165 | if (free <= totalreserve_pages) |
166 | goto error; | 166 | goto error; |
167 | else | 167 | else |
168 | free -= totalreserve_pages; | 168 | free -= totalreserve_pages; |
169 | 169 | ||
170 | /* | 170 | /* |
171 | * Reserve some for root | 171 | * Reserve some for root |
172 | */ | 172 | */ |
173 | if (!cap_sys_admin) | 173 | if (!cap_sys_admin) |
174 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 174 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
175 | 175 | ||
176 | if (free > pages) | 176 | if (free > pages) |
177 | return 0; | 177 | return 0; |
178 | 178 | ||
179 | goto error; | 179 | goto error; |
180 | } | 180 | } |
181 | 181 | ||
182 | allowed = (totalram_pages - hugetlb_total_pages()) | 182 | allowed = (totalram_pages - hugetlb_total_pages()) |
183 | * sysctl_overcommit_ratio / 100; | 183 | * sysctl_overcommit_ratio / 100; |
184 | /* | 184 | /* |
185 | * Reserve some for root | 185 | * Reserve some for root |
186 | */ | 186 | */ |
187 | if (!cap_sys_admin) | 187 | if (!cap_sys_admin) |
188 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 188 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
189 | allowed += total_swap_pages; | 189 | allowed += total_swap_pages; |
190 | 190 | ||
191 | /* | 191 | /* |
192 | * Don't let a single process grow so big a user can't recover | 192 | * Don't let a single process grow so big a user can't recover |
193 | */ | 193 | */ |
194 | if (mm) { | 194 | if (mm) { |
195 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 195 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
196 | allowed -= min(mm->total_vm / 32, reserve); | 196 | allowed -= min(mm->total_vm / 32, reserve); |
197 | } | 197 | } |
198 | 198 | ||
199 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 199 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
200 | return 0; | 200 | return 0; |
201 | error: | 201 | error: |
202 | vm_unacct_memory(pages); | 202 | vm_unacct_memory(pages); |
203 | 203 | ||
204 | return -ENOMEM; | 204 | return -ENOMEM; |
205 | } | 205 | } |
206 | 206 | ||
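To see the policy and counters this function works with from userspace, a small hedged sketch: /proc/sys/vm/overcommit_memory selects the branch taken above, and the CommitLimit/Committed_AS lines in /proc/meminfo expose the commit accounting that vm_committed_as feeds.

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/overcommit_memory", "r");
	int mode = -1;
	char line[256];

	if (f) {
		if (fscanf(f, "%d", &mode) != 1)
			mode = -1;
		fclose(f);
	}
	printf("overcommit_memory = %d (0=guess, 1=always, 2=never)\n", mode);

	f = fopen("/proc/meminfo", "r");
	if (!f) { perror("fopen"); return 1; }
	while (fgets(line, sizeof(line), f)) {
		/* The system-wide commit limit and the current commitment. */
		if (!strncmp(line, "CommitLimit:", 12) ||
		    !strncmp(line, "Committed_AS:", 13))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}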
207 | /* | 207 | /* |
208 | * Requires inode->i_mapping->i_mmap_mutex | 208 | * Requires inode->i_mapping->i_mmap_mutex |
209 | */ | 209 | */ |
210 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, | 210 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, |
211 | struct file *file, struct address_space *mapping) | 211 | struct file *file, struct address_space *mapping) |
212 | { | 212 | { |
213 | if (vma->vm_flags & VM_DENYWRITE) | 213 | if (vma->vm_flags & VM_DENYWRITE) |
214 | atomic_inc(&file_inode(file)->i_writecount); | 214 | atomic_inc(&file_inode(file)->i_writecount); |
215 | if (vma->vm_flags & VM_SHARED) | 215 | if (vma->vm_flags & VM_SHARED) |
216 | mapping->i_mmap_writable--; | 216 | mapping->i_mmap_writable--; |
217 | 217 | ||
218 | flush_dcache_mmap_lock(mapping); | 218 | flush_dcache_mmap_lock(mapping); |
219 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 219 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
220 | list_del_init(&vma->shared.nonlinear); | 220 | list_del_init(&vma->shared.nonlinear); |
221 | else | 221 | else |
222 | vma_interval_tree_remove(vma, &mapping->i_mmap); | 222 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
223 | flush_dcache_mmap_unlock(mapping); | 223 | flush_dcache_mmap_unlock(mapping); |
224 | } | 224 | } |
225 | 225 | ||
226 | /* | 226 | /* |
227 | * Unlink a file-based vm structure from its interval tree, to hide | 227 | * Unlink a file-based vm structure from its interval tree, to hide |
228 | * vma from rmap and vmtruncate before freeing its page tables. | 228 | * vma from rmap and vmtruncate before freeing its page tables. |
229 | */ | 229 | */ |
230 | void unlink_file_vma(struct vm_area_struct *vma) | 230 | void unlink_file_vma(struct vm_area_struct *vma) |
231 | { | 231 | { |
232 | struct file *file = vma->vm_file; | 232 | struct file *file = vma->vm_file; |
233 | 233 | ||
234 | if (file) { | 234 | if (file) { |
235 | struct address_space *mapping = file->f_mapping; | 235 | struct address_space *mapping = file->f_mapping; |
236 | mutex_lock(&mapping->i_mmap_mutex); | 236 | mutex_lock(&mapping->i_mmap_mutex); |
237 | __remove_shared_vm_struct(vma, file, mapping); | 237 | __remove_shared_vm_struct(vma, file, mapping); |
238 | mutex_unlock(&mapping->i_mmap_mutex); | 238 | mutex_unlock(&mapping->i_mmap_mutex); |
239 | } | 239 | } |
240 | } | 240 | } |
241 | 241 | ||
242 | /* | 242 | /* |
243 | * Close a vm structure and free it, returning the next. | 243 | * Close a vm structure and free it, returning the next. |
244 | */ | 244 | */ |
245 | static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | 245 | static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) |
246 | { | 246 | { |
247 | struct vm_area_struct *next = vma->vm_next; | 247 | struct vm_area_struct *next = vma->vm_next; |
248 | 248 | ||
249 | might_sleep(); | 249 | might_sleep(); |
250 | if (vma->vm_ops && vma->vm_ops->close) | 250 | if (vma->vm_ops && vma->vm_ops->close) |
251 | vma->vm_ops->close(vma); | 251 | vma->vm_ops->close(vma); |
252 | if (vma->vm_file) | 252 | if (vma->vm_file) |
253 | fput(vma->vm_file); | 253 | fput(vma->vm_file); |
254 | mpol_put(vma_policy(vma)); | 254 | mpol_put(vma_policy(vma)); |
255 | kmem_cache_free(vm_area_cachep, vma); | 255 | kmem_cache_free(vm_area_cachep, vma); |
256 | return next; | 256 | return next; |
257 | } | 257 | } |
258 | 258 | ||
259 | static unsigned long do_brk(unsigned long addr, unsigned long len); | 259 | static unsigned long do_brk(unsigned long addr, unsigned long len); |
260 | 260 | ||
261 | SYSCALL_DEFINE1(brk, unsigned long, brk) | 261 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
262 | { | 262 | { |
263 | unsigned long rlim, retval; | 263 | unsigned long rlim, retval; |
264 | unsigned long newbrk, oldbrk; | 264 | unsigned long newbrk, oldbrk; |
265 | struct mm_struct *mm = current->mm; | 265 | struct mm_struct *mm = current->mm; |
266 | unsigned long min_brk; | 266 | unsigned long min_brk; |
267 | bool populate; | 267 | bool populate; |
268 | 268 | ||
269 | down_write(&mm->mmap_sem); | 269 | down_write(&mm->mmap_sem); |
270 | 270 | ||
271 | #ifdef CONFIG_COMPAT_BRK | 271 | #ifdef CONFIG_COMPAT_BRK |
272 | /* | 272 | /* |
273 | * CONFIG_COMPAT_BRK can still be overridden by setting | 273 | * CONFIG_COMPAT_BRK can still be overridden by setting |
274 | * randomize_va_space to 2, which will still cause mm->start_brk | 274 | * randomize_va_space to 2, which will still cause mm->start_brk |
275 | * to be arbitrarily shifted | 275 | * to be arbitrarily shifted |
276 | */ | 276 | */ |
277 | if (current->brk_randomized) | 277 | if (current->brk_randomized) |
278 | min_brk = mm->start_brk; | 278 | min_brk = mm->start_brk; |
279 | else | 279 | else |
280 | min_brk = mm->end_data; | 280 | min_brk = mm->end_data; |
281 | #else | 281 | #else |
282 | min_brk = mm->start_brk; | 282 | min_brk = mm->start_brk; |
283 | #endif | 283 | #endif |
284 | if (brk < min_brk) | 284 | if (brk < min_brk) |
285 | goto out; | 285 | goto out; |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * Check against rlimit here. If this check is done later after the test | 288 | * Check against rlimit here. If this check is done later after the test |
289 | * of oldbrk with newbrk then it can escape the test and let the data | 289 | * of oldbrk with newbrk then it can escape the test and let the data |
290 | * segment grow beyond its set limit in the case where the limit is | 290 | * segment grow beyond its set limit in the case where the limit is |
291 | * not page aligned -Ram Gupta | 291 | * not page aligned -Ram Gupta |
292 | */ | 292 | */ |
293 | rlim = rlimit(RLIMIT_DATA); | 293 | rlim = rlimit(RLIMIT_DATA); |
294 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 294 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + |
295 | (mm->end_data - mm->start_data) > rlim) | 295 | (mm->end_data - mm->start_data) > rlim) |
296 | goto out; | 296 | goto out; |
297 | 297 | ||
298 | newbrk = PAGE_ALIGN(brk); | 298 | newbrk = PAGE_ALIGN(brk); |
299 | oldbrk = PAGE_ALIGN(mm->brk); | 299 | oldbrk = PAGE_ALIGN(mm->brk); |
300 | if (oldbrk == newbrk) | 300 | if (oldbrk == newbrk) |
301 | goto set_brk; | 301 | goto set_brk; |
302 | 302 | ||
303 | /* Always allow shrinking brk. */ | 303 | /* Always allow shrinking brk. */ |
304 | if (brk <= mm->brk) { | 304 | if (brk <= mm->brk) { |
305 | if (!do_munmap(mm, newbrk, oldbrk-newbrk)) | 305 | if (!do_munmap(mm, newbrk, oldbrk-newbrk)) |
306 | goto set_brk; | 306 | goto set_brk; |
307 | goto out; | 307 | goto out; |
308 | } | 308 | } |
309 | 309 | ||
310 | /* Check against existing mmap mappings. */ | 310 | /* Check against existing mmap mappings. */ |
311 | if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) | 311 | if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) |
312 | goto out; | 312 | goto out; |
313 | 313 | ||
314 | /* Ok, looks good - let it rip. */ | 314 | /* Ok, looks good - let it rip. */ |
315 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) | 315 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) |
316 | goto out; | 316 | goto out; |
317 | 317 | ||
318 | set_brk: | 318 | set_brk: |
319 | mm->brk = brk; | 319 | mm->brk = brk; |
320 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; | 320 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; |
321 | up_write(&mm->mmap_sem); | 321 | up_write(&mm->mmap_sem); |
322 | if (populate) | 322 | if (populate) |
323 | mm_populate(oldbrk, newbrk - oldbrk); | 323 | mm_populate(oldbrk, newbrk - oldbrk); |
324 | return brk; | 324 | return brk; |
325 | 325 | ||
326 | out: | 326 | out: |
327 | retval = mm->brk; | 327 | retval = mm->brk; |
328 | up_write(&mm->mmap_sem); | 328 | up_write(&mm->mmap_sem); |
329 | return retval; | 329 | return retval; |
330 | } | 330 | } |
331 | 331 | ||
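A userspace illustration of the brk() path (not part of this commit): grow the program break by an amount that is not a page multiple, which the syscall above handles by PAGE_ALIGN()ing both the old and the new break before deciding whether any mapping work is needed.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *before = sbrk(0);              /* current program break */
	void *after;

	/* Grow by 1000 bytes, not a page multiple; the kernel rounds the
	 * old and new break to page boundaries internally. */
	if (sbrk(1000) == (void *)-1) {
		perror("sbrk");
		return 1;
	}
	after = sbrk(0);

	printf("page size %ld: break %p -> %p (%ld bytes)\n",
	       page, before, after, (long)((char *)after - (char *)before));

	/* brk(before) would shrink it again; shrinking is always allowed. */
	return 0;
}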
332 | static long vma_compute_subtree_gap(struct vm_area_struct *vma) | 332 | static long vma_compute_subtree_gap(struct vm_area_struct *vma) |
333 | { | 333 | { |
334 | unsigned long max, subtree_gap; | 334 | unsigned long max, subtree_gap; |
335 | max = vma->vm_start; | 335 | max = vma->vm_start; |
336 | if (vma->vm_prev) | 336 | if (vma->vm_prev) |
337 | max -= vma->vm_prev->vm_end; | 337 | max -= vma->vm_prev->vm_end; |
338 | if (vma->vm_rb.rb_left) { | 338 | if (vma->vm_rb.rb_left) { |
339 | subtree_gap = rb_entry(vma->vm_rb.rb_left, | 339 | subtree_gap = rb_entry(vma->vm_rb.rb_left, |
340 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | 340 | struct vm_area_struct, vm_rb)->rb_subtree_gap; |
341 | if (subtree_gap > max) | 341 | if (subtree_gap > max) |
342 | max = subtree_gap; | 342 | max = subtree_gap; |
343 | } | 343 | } |
344 | if (vma->vm_rb.rb_right) { | 344 | if (vma->vm_rb.rb_right) { |
345 | subtree_gap = rb_entry(vma->vm_rb.rb_right, | 345 | subtree_gap = rb_entry(vma->vm_rb.rb_right, |
346 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | 346 | struct vm_area_struct, vm_rb)->rb_subtree_gap; |
347 | if (subtree_gap > max) | 347 | if (subtree_gap > max) |
348 | max = subtree_gap; | 348 | max = subtree_gap; |
349 | } | 349 | } |
350 | return max; | 350 | return max; |
351 | } | 351 | } |
352 | 352 | ||
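A flat-array sketch of the quantity cached in rb_subtree_gap: for each vma, the gap between its start and the previous vma's end. The real code keeps the maximum of these per rbtree subtree so free-area searches stay logarithmic; struct toy_vma, gap_before() and the addresses below are purely illustrative.

#include <stdio.h>

struct toy_vma {
	unsigned long vm_start, vm_end;
};

/* Gap below a vma: distance to the previous vma's end (or to address 0). */
static unsigned long gap_before(const struct toy_vma *v, const struct toy_vma *prev)
{
	return prev ? v->vm_start - prev->vm_end : v->vm_start;
}

int main(void)
{
	struct toy_vma vmas[] = {
		{ 0x1000, 0x3000 },
		{ 0x8000, 0x9000 },
		{ 0xb000, 0xf000 },
	};
	unsigned long max_gap = 0;

	for (unsigned i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++) {
		unsigned long g = gap_before(&vmas[i], i ? &vmas[i - 1] : NULL);

		printf("vma [%#lx, %#lx): gap before = %#lx\n",
		       vmas[i].vm_start, vmas[i].vm_end, g);
		if (g > max_gap)
			max_gap = g;
	}
	printf("largest gap = %#lx\n", max_gap);
	return 0;
}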
353 | #ifdef CONFIG_DEBUG_VM_RB | 353 | #ifdef CONFIG_DEBUG_VM_RB |
354 | static int browse_rb(struct rb_root *root) | 354 | static int browse_rb(struct rb_root *root) |
355 | { | 355 | { |
356 | int i = 0, j, bug = 0; | 356 | int i = 0, j, bug = 0; |
357 | struct rb_node *nd, *pn = NULL; | 357 | struct rb_node *nd, *pn = NULL; |
358 | unsigned long prev = 0, pend = 0; | 358 | unsigned long prev = 0, pend = 0; |
359 | 359 | ||
360 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 360 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
361 | struct vm_area_struct *vma; | 361 | struct vm_area_struct *vma; |
362 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 362 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
363 | if (vma->vm_start < prev) { | 363 | if (vma->vm_start < prev) { |
364 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev); | 364 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev); |
365 | bug = 1; | 365 | bug = 1; |
366 | } | 366 | } |
367 | if (vma->vm_start < pend) { | 367 | if (vma->vm_start < pend) { |
368 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); | 368 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); |
369 | bug = 1; | 369 | bug = 1; |
370 | } | 370 | } |
371 | if (vma->vm_start > vma->vm_end) { | 371 | if (vma->vm_start > vma->vm_end) { |
372 | printk("vm_end %lx < vm_start %lx\n", | 372 | printk("vm_end %lx < vm_start %lx\n", |
373 | vma->vm_end, vma->vm_start); | 373 | vma->vm_end, vma->vm_start); |
374 | bug = 1; | 374 | bug = 1; |
375 | } | 375 | } |
376 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { | 376 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { |
377 | printk("free gap %lx, correct %lx\n", | 377 | printk("free gap %lx, correct %lx\n", |
378 | vma->rb_subtree_gap, | 378 | vma->rb_subtree_gap, |
379 | vma_compute_subtree_gap(vma)); | 379 | vma_compute_subtree_gap(vma)); |
380 | bug = 1; | 380 | bug = 1; |
381 | } | 381 | } |
382 | i++; | 382 | i++; |
383 | pn = nd; | 383 | pn = nd; |
384 | prev = vma->vm_start; | 384 | prev = vma->vm_start; |
385 | pend = vma->vm_end; | 385 | pend = vma->vm_end; |
386 | } | 386 | } |
387 | j = 0; | 387 | j = 0; |
388 | for (nd = pn; nd; nd = rb_prev(nd)) | 388 | for (nd = pn; nd; nd = rb_prev(nd)) |
389 | j++; | 389 | j++; |
390 | if (i != j) { | 390 | if (i != j) { |
391 | printk("backwards %d, forwards %d\n", j, i); | 391 | printk("backwards %d, forwards %d\n", j, i); |
392 | bug = 1; | 392 | bug = 1; |
393 | } | 393 | } |
394 | return bug ? -1 : i; | 394 | return bug ? -1 : i; |
395 | } | 395 | } |
396 | 396 | ||
397 | static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) | 397 | static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) |
398 | { | 398 | { |
399 | struct rb_node *nd; | 399 | struct rb_node *nd; |
400 | 400 | ||
401 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 401 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
402 | struct vm_area_struct *vma; | 402 | struct vm_area_struct *vma; |
403 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 403 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
404 | BUG_ON(vma != ignore && | 404 | BUG_ON(vma != ignore && |
405 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); | 405 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); |
406 | } | 406 | } |
407 | } | 407 | } |
408 | 408 | ||
409 | void validate_mm(struct mm_struct *mm) | 409 | void validate_mm(struct mm_struct *mm) |
410 | { | 410 | { |
411 | int bug = 0; | 411 | int bug = 0; |
412 | int i = 0; | 412 | int i = 0; |
413 | unsigned long highest_address = 0; | 413 | unsigned long highest_address = 0; |
414 | struct vm_area_struct *vma = mm->mmap; | 414 | struct vm_area_struct *vma = mm->mmap; |
415 | while (vma) { | 415 | while (vma) { |
416 | struct anon_vma_chain *avc; | 416 | struct anon_vma_chain *avc; |
417 | vma_lock_anon_vma(vma); | 417 | vma_lock_anon_vma(vma); |
418 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 418 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
419 | anon_vma_interval_tree_verify(avc); | 419 | anon_vma_interval_tree_verify(avc); |
420 | vma_unlock_anon_vma(vma); | 420 | vma_unlock_anon_vma(vma); |
421 | highest_address = vma->vm_end; | 421 | highest_address = vma->vm_end; |
422 | vma = vma->vm_next; | 422 | vma = vma->vm_next; |
423 | i++; | 423 | i++; |
424 | } | 424 | } |
425 | if (i != mm->map_count) { | 425 | if (i != mm->map_count) { |
426 | printk("map_count %d vm_next %d\n", mm->map_count, i); | 426 | printk("map_count %d vm_next %d\n", mm->map_count, i); |
427 | bug = 1; | 427 | bug = 1; |
428 | } | 428 | } |
429 | if (highest_address != mm->highest_vm_end) { | 429 | if (highest_address != mm->highest_vm_end) { |
430 | printk("mm->highest_vm_end %lx, found %lx\n", | 430 | printk("mm->highest_vm_end %lx, found %lx\n", |
431 | mm->highest_vm_end, highest_address); | 431 | mm->highest_vm_end, highest_address); |
432 | bug = 1; | 432 | bug = 1; |
433 | } | 433 | } |
434 | i = browse_rb(&mm->mm_rb); | 434 | i = browse_rb(&mm->mm_rb); |
435 | if (i != mm->map_count) { | 435 | if (i != mm->map_count) { |
436 | printk("map_count %d rb %d\n", mm->map_count, i); | 436 | printk("map_count %d rb %d\n", mm->map_count, i); |
437 | bug = 1; | 437 | bug = 1; |
438 | } | 438 | } |
439 | BUG_ON(bug); | 439 | BUG_ON(bug); |
440 | } | 440 | } |
441 | #else | 441 | #else |
442 | #define validate_mm_rb(root, ignore) do { } while (0) | 442 | #define validate_mm_rb(root, ignore) do { } while (0) |
443 | #define validate_mm(mm) do { } while (0) | 443 | #define validate_mm(mm) do { } while (0) |
444 | #endif | 444 | #endif |
445 | 445 | ||
446 | RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, | 446 | RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, |
447 | unsigned long, rb_subtree_gap, vma_compute_subtree_gap) | 447 | unsigned long, rb_subtree_gap, vma_compute_subtree_gap) |
448 | 448 | ||
449 | /* | 449 | /* |
450 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or | 450 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or |
451 | * vma->vm_prev->vm_end values changed, without modifying the vma's position | 451 | * vma->vm_prev->vm_end values changed, without modifying the vma's position |
452 | * in the rbtree. | 452 | * in the rbtree. |
453 | */ | 453 | */ |
454 | static void vma_gap_update(struct vm_area_struct *vma) | 454 | static void vma_gap_update(struct vm_area_struct *vma) |
455 | { | 455 | { |
456 | /* | 456 | /* |
457 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback | 457 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback |
458 | * function that does exactly what we want. | 458 | * function that does exactly what we want. |

459 | */ | 459 | */ |
460 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); | 460 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); |
461 | } | 461 | } |
462 | 462 | ||
463 | static inline void vma_rb_insert(struct vm_area_struct *vma, | 463 | static inline void vma_rb_insert(struct vm_area_struct *vma, |
464 | struct rb_root *root) | 464 | struct rb_root *root) |
465 | { | 465 | { |
466 | /* All rb_subtree_gap values must be consistent prior to insertion */ | 466 | /* All rb_subtree_gap values must be consistent prior to insertion */ |
467 | validate_mm_rb(root, NULL); | 467 | validate_mm_rb(root, NULL); |
468 | 468 | ||
469 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | 469 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
470 | } | 470 | } |
471 | 471 | ||
472 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) | 472 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) |
473 | { | 473 | { |
474 | /* | 474 | /* |
475 | * All rb_subtree_gap values must be consistent prior to erase, | 475 | * All rb_subtree_gap values must be consistent prior to erase, |
476 | * with the possible exception of the vma being erased. | 476 | * with the possible exception of the vma being erased. |
477 | */ | 477 | */ |
478 | validate_mm_rb(root, vma); | 478 | validate_mm_rb(root, vma); |
479 | 479 | ||
480 | /* | 480 | /* |
481 | * Note rb_erase_augmented is a fairly large inline function, | 481 | * Note rb_erase_augmented is a fairly large inline function, |
482 | * so make sure we instantiate it only once with our desired | 482 | * so make sure we instantiate it only once with our desired |
483 | * augmented rbtree callbacks. | 483 | * augmented rbtree callbacks. |
484 | */ | 484 | */ |
485 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | 485 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
486 | } | 486 | } |
487 | 487 | ||
488 | /* | 488 | /* |
489 | * vma has some anon_vma assigned, and is already inserted on that | 489 | * vma has some anon_vma assigned, and is already inserted on that |
490 | * anon_vma's interval trees. | 490 | * anon_vma's interval trees. |
491 | * | 491 | * |
492 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the | 492 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the |
493 | * vma must be removed from the anon_vma's interval trees using | 493 | * vma must be removed from the anon_vma's interval trees using |
494 | * anon_vma_interval_tree_pre_update_vma(). | 494 | * anon_vma_interval_tree_pre_update_vma(). |
495 | * | 495 | * |
496 | * After the update, the vma will be reinserted using | 496 | * After the update, the vma will be reinserted using |
497 | * anon_vma_interval_tree_post_update_vma(). | 497 | * anon_vma_interval_tree_post_update_vma(). |
498 | * | 498 | * |
499 | * The entire update must be protected by exclusive mmap_sem and by | 499 | * The entire update must be protected by exclusive mmap_sem and by |
500 | * the root anon_vma's mutex. | 500 | * the root anon_vma's mutex. |
501 | */ | 501 | */ |
502 | static inline void | 502 | static inline void |
503 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) | 503 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) |
504 | { | 504 | { |
505 | struct anon_vma_chain *avc; | 505 | struct anon_vma_chain *avc; |
506 | 506 | ||
507 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 507 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
508 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); | 508 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); |
509 | } | 509 | } |
510 | 510 | ||
511 | static inline void | 511 | static inline void |
512 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) | 512 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) |
513 | { | 513 | { |
514 | struct anon_vma_chain *avc; | 514 | struct anon_vma_chain *avc; |
515 | 515 | ||
516 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 516 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
517 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); | 517 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); |
518 | } | 518 | } |
519 | 519 | ||
520 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, | 520 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, |
521 | unsigned long end, struct vm_area_struct **pprev, | 521 | unsigned long end, struct vm_area_struct **pprev, |
522 | struct rb_node ***rb_link, struct rb_node **rb_parent) | 522 | struct rb_node ***rb_link, struct rb_node **rb_parent) |
523 | { | 523 | { |
524 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; | 524 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; |
525 | 525 | ||
526 | __rb_link = &mm->mm_rb.rb_node; | 526 | __rb_link = &mm->mm_rb.rb_node; |
527 | rb_prev = __rb_parent = NULL; | 527 | rb_prev = __rb_parent = NULL; |
528 | 528 | ||
529 | while (*__rb_link) { | 529 | while (*__rb_link) { |
530 | struct vm_area_struct *vma_tmp; | 530 | struct vm_area_struct *vma_tmp; |
531 | 531 | ||
532 | __rb_parent = *__rb_link; | 532 | __rb_parent = *__rb_link; |
533 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); | 533 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); |
534 | 534 | ||
535 | if (vma_tmp->vm_end > addr) { | 535 | if (vma_tmp->vm_end > addr) { |
536 | /* Fail if an existing vma overlaps the area */ | 536 | /* Fail if an existing vma overlaps the area */ |
537 | if (vma_tmp->vm_start < end) | 537 | if (vma_tmp->vm_start < end) |
538 | return -ENOMEM; | 538 | return -ENOMEM; |
539 | __rb_link = &__rb_parent->rb_left; | 539 | __rb_link = &__rb_parent->rb_left; |
540 | } else { | 540 | } else { |
541 | rb_prev = __rb_parent; | 541 | rb_prev = __rb_parent; |
542 | __rb_link = &__rb_parent->rb_right; | 542 | __rb_link = &__rb_parent->rb_right; |
543 | } | 543 | } |
544 | } | 544 | } |
545 | 545 | ||
546 | *pprev = NULL; | 546 | *pprev = NULL; |
547 | if (rb_prev) | 547 | if (rb_prev) |
548 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); | 548 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); |
549 | *rb_link = __rb_link; | 549 | *rb_link = __rb_link; |
550 | *rb_parent = __rb_parent; | 550 | *rb_parent = __rb_parent; |
551 | return 0; | 551 | return 0; |
552 | } | 552 | } |
553 | 553 | ||
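A toy, array-backed stand-in for find_vma_links() (hypothetical names, illustrative ranges): locate the insertion slot for [addr, end), report the predecessor, and fail when an existing range overlaps, mirroring the -ENOMEM return above. A lower-bound binary search over sorted, non-overlapping ranges gives the same answer the rbtree walk does.

#include <stdio.h>
#include <errno.h>

struct toy_vma { unsigned long vm_start, vm_end; };

static int toy_find_links(const struct toy_vma *v, int n,
			  unsigned long addr, unsigned long end,
			  int *slot, int *prev)
{
	int lo = 0, hi = n;

	while (lo < hi) {
		int mid = (lo + hi) / 2;

		if (v[mid].vm_end > addr) {
			/* Fail if an existing range overlaps the area. */
			if (v[mid].vm_start < end)
				return -ENOMEM;
			hi = mid;
		} else {
			lo = mid + 1;
		}
	}
	*slot = lo;
	*prev = lo - 1;		/* -1 means "no predecessor" */
	return 0;
}

int main(void)
{
	struct toy_vma v[] = { { 0x1000, 0x2000 }, { 0x5000, 0x6000 } };
	int slot, prev;

	if (!toy_find_links(v, 2, 0x3000, 0x4000, &slot, &prev))
		printf("insert at slot %d, predecessor index %d\n", slot, prev);
	if (toy_find_links(v, 2, 0x1800, 0x2800, &slot, &prev) == -ENOMEM)
		printf("overlap detected\n");
	return 0;
}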
554 | static unsigned long count_vma_pages_range(struct mm_struct *mm, | 554 | static unsigned long count_vma_pages_range(struct mm_struct *mm, |
555 | unsigned long addr, unsigned long end) | 555 | unsigned long addr, unsigned long end) |
556 | { | 556 | { |
557 | unsigned long nr_pages = 0; | 557 | unsigned long nr_pages = 0; |
558 | struct vm_area_struct *vma; | 558 | struct vm_area_struct *vma; |
559 | 559 | ||
560 | /* Find first overlapping mapping */ | 560 | /* Find first overlapping mapping */ |
561 | vma = find_vma_intersection(mm, addr, end); | 561 | vma = find_vma_intersection(mm, addr, end); |
562 | if (!vma) | 562 | if (!vma) |
563 | return 0; | 563 | return 0; |
564 | 564 | ||
565 | nr_pages = (min(end, vma->vm_end) - | 565 | nr_pages = (min(end, vma->vm_end) - |
566 | max(addr, vma->vm_start)) >> PAGE_SHIFT; | 566 | max(addr, vma->vm_start)) >> PAGE_SHIFT; |
567 | 567 | ||
568 | /* Iterate over the rest of the overlaps */ | 568 | /* Iterate over the rest of the overlaps */ |
569 | for (vma = vma->vm_next; vma; vma = vma->vm_next) { | 569 | for (vma = vma->vm_next; vma; vma = vma->vm_next) { |
570 | unsigned long overlap_len; | 570 | unsigned long overlap_len; |
571 | 571 | ||
572 | if (vma->vm_start > end) | 572 | if (vma->vm_start > end) |
573 | break; | 573 | break; |
574 | 574 | ||
575 | overlap_len = min(end, vma->vm_end) - vma->vm_start; | 575 | overlap_len = min(end, vma->vm_end) - vma->vm_start; |
576 | nr_pages += overlap_len >> PAGE_SHIFT; | 576 | nr_pages += overlap_len >> PAGE_SHIFT; |
577 | } | 577 | } |
578 | 578 | ||
579 | return nr_pages; | 579 | return nr_pages; |
580 | } | 580 | } |
581 | 581 | ||
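count_vma_pages_range() is plain interval arithmetic; a compact model over a sorted array (illustrative values, hypothetical names) sums the page-sized overlap of a query window with each mapped range.

#include <stdio.h>

#define TOY_PAGE_SHIFT 12UL

struct toy_vma { unsigned long vm_start, vm_end; };

static unsigned long count_pages(const struct toy_vma *v, int n,
				 unsigned long addr, unsigned long end)
{
	unsigned long nr_pages = 0;

	for (int i = 0; i < n; i++) {
		/* Clamp each range to the query window and count whole pages. */
		unsigned long lo = v[i].vm_start > addr ? v[i].vm_start : addr;
		unsigned long hi = v[i].vm_end < end ? v[i].vm_end : end;

		if (lo < hi)
			nr_pages += (hi - lo) >> TOY_PAGE_SHIFT;
	}
	return nr_pages;
}

int main(void)
{
	struct toy_vma v[] = { { 0x1000, 0x4000 }, { 0x6000, 0x8000 } };

	/* [0x2000, 0x7000) overlaps 2 pages of the first vma and 1 of the second. */
	printf("%lu pages\n", count_pages(v, 2, 0x2000, 0x7000));
	return 0;
}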
582 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 582 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
583 | struct rb_node **rb_link, struct rb_node *rb_parent) | 583 | struct rb_node **rb_link, struct rb_node *rb_parent) |
584 | { | 584 | { |
585 | /* Update tracking information for the gap following the new vma. */ | 585 | /* Update tracking information for the gap following the new vma. */ |
586 | if (vma->vm_next) | 586 | if (vma->vm_next) |
587 | vma_gap_update(vma->vm_next); | 587 | vma_gap_update(vma->vm_next); |
588 | else | 588 | else |
589 | mm->highest_vm_end = vma->vm_end; | 589 | mm->highest_vm_end = vma->vm_end; |
590 | 590 | ||
591 | /* | 591 | /* |
592 | * vma->vm_prev wasn't known when we followed the rbtree to find the | 592 | * vma->vm_prev wasn't known when we followed the rbtree to find the |
593 | * correct insertion point for that vma. As a result, we could not | 593 | * correct insertion point for that vma. As a result, we could not |
594 | * update the vma vm_rb parents rb_subtree_gap values on the way down. | 594 | * update the vma vm_rb parents rb_subtree_gap values on the way down. |
595 | * So, we first insert the vma with a zero rb_subtree_gap value | 595 | * So, we first insert the vma with a zero rb_subtree_gap value |
596 | * (to be consistent with what we did on the way down), and then | 596 | * (to be consistent with what we did on the way down), and then |
597 | * immediately update the gap to the correct value. Finally we | 597 | * immediately update the gap to the correct value. Finally we |
598 | * rebalance the rbtree after all augmented values have been set. | 598 | * rebalance the rbtree after all augmented values have been set. |
599 | */ | 599 | */ |
600 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); | 600 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); |
601 | vma->rb_subtree_gap = 0; | 601 | vma->rb_subtree_gap = 0; |
602 | vma_gap_update(vma); | 602 | vma_gap_update(vma); |
603 | vma_rb_insert(vma, &mm->mm_rb); | 603 | vma_rb_insert(vma, &mm->mm_rb); |
604 | } | 604 | } |
605 | 605 | ||
606 | static void __vma_link_file(struct vm_area_struct *vma) | 606 | static void __vma_link_file(struct vm_area_struct *vma) |
607 | { | 607 | { |
608 | struct file *file; | 608 | struct file *file; |
609 | 609 | ||
610 | file = vma->vm_file; | 610 | file = vma->vm_file; |
611 | if (file) { | 611 | if (file) { |
612 | struct address_space *mapping = file->f_mapping; | 612 | struct address_space *mapping = file->f_mapping; |
613 | 613 | ||
614 | if (vma->vm_flags & VM_DENYWRITE) | 614 | if (vma->vm_flags & VM_DENYWRITE) |
615 | atomic_dec(&file_inode(file)->i_writecount); | 615 | atomic_dec(&file_inode(file)->i_writecount); |
616 | if (vma->vm_flags & VM_SHARED) | 616 | if (vma->vm_flags & VM_SHARED) |
617 | mapping->i_mmap_writable++; | 617 | mapping->i_mmap_writable++; |
618 | 618 | ||
619 | flush_dcache_mmap_lock(mapping); | 619 | flush_dcache_mmap_lock(mapping); |
620 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 620 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
621 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 621 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
622 | else | 622 | else |
623 | vma_interval_tree_insert(vma, &mapping->i_mmap); | 623 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
624 | flush_dcache_mmap_unlock(mapping); | 624 | flush_dcache_mmap_unlock(mapping); |
625 | } | 625 | } |
626 | } | 626 | } |
627 | 627 | ||
628 | static void | 628 | static void |
629 | __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 629 | __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
630 | struct vm_area_struct *prev, struct rb_node **rb_link, | 630 | struct vm_area_struct *prev, struct rb_node **rb_link, |
631 | struct rb_node *rb_parent) | 631 | struct rb_node *rb_parent) |
632 | { | 632 | { |
633 | __vma_link_list(mm, vma, prev, rb_parent); | 633 | __vma_link_list(mm, vma, prev, rb_parent); |
634 | __vma_link_rb(mm, vma, rb_link, rb_parent); | 634 | __vma_link_rb(mm, vma, rb_link, rb_parent); |
635 | } | 635 | } |
636 | 636 | ||
637 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 637 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
638 | struct vm_area_struct *prev, struct rb_node **rb_link, | 638 | struct vm_area_struct *prev, struct rb_node **rb_link, |
639 | struct rb_node *rb_parent) | 639 | struct rb_node *rb_parent) |
640 | { | 640 | { |
641 | struct address_space *mapping = NULL; | 641 | struct address_space *mapping = NULL; |
642 | 642 | ||
643 | if (vma->vm_file) | 643 | if (vma->vm_file) |
644 | mapping = vma->vm_file->f_mapping; | 644 | mapping = vma->vm_file->f_mapping; |
645 | 645 | ||
646 | if (mapping) | 646 | if (mapping) |
647 | mutex_lock(&mapping->i_mmap_mutex); | 647 | mutex_lock(&mapping->i_mmap_mutex); |
648 | 648 | ||
649 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 649 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
650 | __vma_link_file(vma); | 650 | __vma_link_file(vma); |
651 | 651 | ||
652 | if (mapping) | 652 | if (mapping) |
653 | mutex_unlock(&mapping->i_mmap_mutex); | 653 | mutex_unlock(&mapping->i_mmap_mutex); |
654 | 654 | ||
655 | mm->map_count++; | 655 | mm->map_count++; |
656 | validate_mm(mm); | 656 | validate_mm(mm); |
657 | } | 657 | } |
658 | 658 | ||
659 | /* | 659 | /* |
660 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the | 660 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
661 | * mm's list and rbtree. It has already been inserted into the interval tree. | 661 | * mm's list and rbtree. It has already been inserted into the interval tree. |
662 | */ | 662 | */ |
663 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 663 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
664 | { | 664 | { |
665 | struct vm_area_struct *prev; | 665 | struct vm_area_struct *prev; |
666 | struct rb_node **rb_link, *rb_parent; | 666 | struct rb_node **rb_link, *rb_parent; |
667 | 667 | ||
668 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, | 668 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
669 | &prev, &rb_link, &rb_parent)) | 669 | &prev, &rb_link, &rb_parent)) |
670 | BUG(); | 670 | BUG(); |
671 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 671 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
672 | mm->map_count++; | 672 | mm->map_count++; |
673 | } | 673 | } |
674 | 674 | ||
675 | static inline void | 675 | static inline void |
676 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | 676 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, |
677 | struct vm_area_struct *prev) | 677 | struct vm_area_struct *prev) |
678 | { | 678 | { |
679 | struct vm_area_struct *next; | 679 | struct vm_area_struct *next; |
680 | 680 | ||
681 | vma_rb_erase(vma, &mm->mm_rb); | 681 | vma_rb_erase(vma, &mm->mm_rb); |
682 | prev->vm_next = next = vma->vm_next; | 682 | prev->vm_next = next = vma->vm_next; |
683 | if (next) | 683 | if (next) |
684 | next->vm_prev = prev; | 684 | next->vm_prev = prev; |
685 | if (mm->mmap_cache == vma) | 685 | if (mm->mmap_cache == vma) |
686 | mm->mmap_cache = prev; | 686 | mm->mmap_cache = prev; |
687 | } | 687 | } |
688 | 688 | ||
689 | /* | 689 | /* |
690 | * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that | 690 | * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that |
691 | * is already present in an i_mmap tree without adjusting the tree. | 691 | * is already present in an i_mmap tree without adjusting the tree. |
692 | * The following helper function should be used when such adjustments | 692 | * The following helper function should be used when such adjustments |
693 | * are necessary. The "insert" vma (if any) is to be inserted | 693 | * are necessary. The "insert" vma (if any) is to be inserted |
694 | * before we drop the necessary locks. | 694 | * before we drop the necessary locks. |
695 | */ | 695 | */ |
696 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, | 696 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
697 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 697 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
698 | { | 698 | { |
699 | struct mm_struct *mm = vma->vm_mm; | 699 | struct mm_struct *mm = vma->vm_mm; |
700 | struct vm_area_struct *next = vma->vm_next; | 700 | struct vm_area_struct *next = vma->vm_next; |
701 | struct vm_area_struct *importer = NULL; | 701 | struct vm_area_struct *importer = NULL; |
702 | struct address_space *mapping = NULL; | 702 | struct address_space *mapping = NULL; |
703 | struct rb_root *root = NULL; | 703 | struct rb_root *root = NULL; |
704 | struct anon_vma *anon_vma = NULL; | 704 | struct anon_vma *anon_vma = NULL; |
705 | struct file *file = vma->vm_file; | 705 | struct file *file = vma->vm_file; |
706 | bool start_changed = false, end_changed = false; | 706 | bool start_changed = false, end_changed = false; |
707 | long adjust_next = 0; | 707 | long adjust_next = 0; |
708 | int remove_next = 0; | 708 | int remove_next = 0; |
709 | 709 | ||
710 | if (next && !insert) { | 710 | if (next && !insert) { |
711 | struct vm_area_struct *exporter = NULL; | 711 | struct vm_area_struct *exporter = NULL; |
712 | 712 | ||
713 | if (end >= next->vm_end) { | 713 | if (end >= next->vm_end) { |
714 | /* | 714 | /* |
715 | * vma expands, overlapping all the next, and | 715 | * vma expands, overlapping all the next, and |
716 | * perhaps the one after too (mprotect case 6). | 716 | * perhaps the one after too (mprotect case 6). |
717 | */ | 717 | */ |
718 | again: remove_next = 1 + (end > next->vm_end); | 718 | again: remove_next = 1 + (end > next->vm_end); |
719 | end = next->vm_end; | 719 | end = next->vm_end; |
720 | exporter = next; | 720 | exporter = next; |
721 | importer = vma; | 721 | importer = vma; |
722 | } else if (end > next->vm_start) { | 722 | } else if (end > next->vm_start) { |
723 | /* | 723 | /* |
724 | * vma expands, overlapping part of the next: | 724 | * vma expands, overlapping part of the next: |
725 | * mprotect case 5 shifting the boundary up. | 725 | * mprotect case 5 shifting the boundary up. |
726 | */ | 726 | */ |
727 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; | 727 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; |
728 | exporter = next; | 728 | exporter = next; |
729 | importer = vma; | 729 | importer = vma; |
730 | } else if (end < vma->vm_end) { | 730 | } else if (end < vma->vm_end) { |
731 | /* | 731 | /* |
732 | * vma shrinks, and !insert tells it's not | 732 | * vma shrinks, and !insert tells it's not |
733 | * split_vma inserting another: so it must be | 733 | * split_vma inserting another: so it must be |
734 | * mprotect case 4 shifting the boundary down. | 734 | * mprotect case 4 shifting the boundary down. |
735 | */ | 735 | */ |
736 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); | 736 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); |
737 | exporter = vma; | 737 | exporter = vma; |
738 | importer = next; | 738 | importer = next; |
739 | } | 739 | } |
740 | 740 | ||
741 | /* | 741 | /* |
742 | * Easily overlooked: when mprotect shifts the boundary, | 742 | * Easily overlooked: when mprotect shifts the boundary, |
743 | * make sure the expanding vma has anon_vma set if the | 743 | * make sure the expanding vma has anon_vma set if the |
744 | * shrinking vma had, to cover any anon pages imported. | 744 | * shrinking vma had, to cover any anon pages imported. |
745 | */ | 745 | */ |
746 | if (exporter && exporter->anon_vma && !importer->anon_vma) { | 746 | if (exporter && exporter->anon_vma && !importer->anon_vma) { |
747 | if (anon_vma_clone(importer, exporter)) | 747 | if (anon_vma_clone(importer, exporter)) |
748 | return -ENOMEM; | 748 | return -ENOMEM; |
749 | importer->anon_vma = exporter->anon_vma; | 749 | importer->anon_vma = exporter->anon_vma; |
750 | } | 750 | } |
751 | } | 751 | } |
752 | 752 | ||
753 | if (file) { | 753 | if (file) { |
754 | mapping = file->f_mapping; | 754 | mapping = file->f_mapping; |
755 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 755 | if (!(vma->vm_flags & VM_NONLINEAR)) { |
756 | root = &mapping->i_mmap; | 756 | root = &mapping->i_mmap; |
757 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); | 757 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); |
758 | 758 | ||
759 | if (adjust_next) | 759 | if (adjust_next) |
760 | uprobe_munmap(next, next->vm_start, | 760 | uprobe_munmap(next, next->vm_start, |
761 | next->vm_end); | 761 | next->vm_end); |
762 | } | 762 | } |
763 | 763 | ||
764 | mutex_lock(&mapping->i_mmap_mutex); | 764 | mutex_lock(&mapping->i_mmap_mutex); |
765 | if (insert) { | 765 | if (insert) { |
766 | /* | 766 | /* |
767 | * Put into interval tree now, so instantiated pages | 767 | * Put into interval tree now, so instantiated pages |
768 | * are visible to arm/parisc __flush_dcache_page | 768 | * are visible to arm/parisc __flush_dcache_page |
769 | * throughout; but we cannot insert into address | 769 | * throughout; but we cannot insert into address |
770 | * space until vma start or end is updated. | 770 | * space until vma start or end is updated. |
771 | */ | 771 | */ |
772 | __vma_link_file(insert); | 772 | __vma_link_file(insert); |
773 | } | 773 | } |
774 | } | 774 | } |
775 | 775 | ||
776 | vma_adjust_trans_huge(vma, start, end, adjust_next); | 776 | vma_adjust_trans_huge(vma, start, end, adjust_next); |
777 | 777 | ||
778 | anon_vma = vma->anon_vma; | 778 | anon_vma = vma->anon_vma; |
779 | if (!anon_vma && adjust_next) | 779 | if (!anon_vma && adjust_next) |
780 | anon_vma = next->anon_vma; | 780 | anon_vma = next->anon_vma; |
781 | if (anon_vma) { | 781 | if (anon_vma) { |
782 | VM_BUG_ON(adjust_next && next->anon_vma && | 782 | VM_BUG_ON(adjust_next && next->anon_vma && |
783 | anon_vma != next->anon_vma); | 783 | anon_vma != next->anon_vma); |
784 | anon_vma_lock_write(anon_vma); | 784 | anon_vma_lock_write(anon_vma); |
785 | anon_vma_interval_tree_pre_update_vma(vma); | 785 | anon_vma_interval_tree_pre_update_vma(vma); |
786 | if (adjust_next) | 786 | if (adjust_next) |
787 | anon_vma_interval_tree_pre_update_vma(next); | 787 | anon_vma_interval_tree_pre_update_vma(next); |
788 | } | 788 | } |
789 | 789 | ||
790 | if (root) { | 790 | if (root) { |
791 | flush_dcache_mmap_lock(mapping); | 791 | flush_dcache_mmap_lock(mapping); |
792 | vma_interval_tree_remove(vma, root); | 792 | vma_interval_tree_remove(vma, root); |
793 | if (adjust_next) | 793 | if (adjust_next) |
794 | vma_interval_tree_remove(next, root); | 794 | vma_interval_tree_remove(next, root); |
795 | } | 795 | } |
796 | 796 | ||
797 | if (start != vma->vm_start) { | 797 | if (start != vma->vm_start) { |
798 | vma->vm_start = start; | 798 | vma->vm_start = start; |
799 | start_changed = true; | 799 | start_changed = true; |
800 | } | 800 | } |
801 | if (end != vma->vm_end) { | 801 | if (end != vma->vm_end) { |
802 | vma->vm_end = end; | 802 | vma->vm_end = end; |
803 | end_changed = true; | 803 | end_changed = true; |
804 | } | 804 | } |
805 | vma->vm_pgoff = pgoff; | 805 | vma->vm_pgoff = pgoff; |
806 | if (adjust_next) { | 806 | if (adjust_next) { |
807 | next->vm_start += adjust_next << PAGE_SHIFT; | 807 | next->vm_start += adjust_next << PAGE_SHIFT; |
808 | next->vm_pgoff += adjust_next; | 808 | next->vm_pgoff += adjust_next; |
809 | } | 809 | } |
810 | 810 | ||
811 | if (root) { | 811 | if (root) { |
812 | if (adjust_next) | 812 | if (adjust_next) |
813 | vma_interval_tree_insert(next, root); | 813 | vma_interval_tree_insert(next, root); |
814 | vma_interval_tree_insert(vma, root); | 814 | vma_interval_tree_insert(vma, root); |
815 | flush_dcache_mmap_unlock(mapping); | 815 | flush_dcache_mmap_unlock(mapping); |
816 | } | 816 | } |
817 | 817 | ||
818 | if (remove_next) { | 818 | if (remove_next) { |
819 | /* | 819 | /* |
820 | * vma_merge has merged next into vma, and needs | 820 | * vma_merge has merged next into vma, and needs |
821 | * us to remove next before dropping the locks. | 821 | * us to remove next before dropping the locks. |
822 | */ | 822 | */ |
823 | __vma_unlink(mm, next, vma); | 823 | __vma_unlink(mm, next, vma); |
824 | if (file) | 824 | if (file) |
825 | __remove_shared_vm_struct(next, file, mapping); | 825 | __remove_shared_vm_struct(next, file, mapping); |
826 | } else if (insert) { | 826 | } else if (insert) { |
827 | /* | 827 | /* |
828 | * split_vma has split insert from vma, and needs | 828 | * split_vma has split insert from vma, and needs |
829 | * us to insert it before dropping the locks | 829 | * us to insert it before dropping the locks |
830 | * (it may either follow vma or precede it). | 830 | * (it may either follow vma or precede it). |
831 | */ | 831 | */ |
832 | __insert_vm_struct(mm, insert); | 832 | __insert_vm_struct(mm, insert); |
833 | } else { | 833 | } else { |
834 | if (start_changed) | 834 | if (start_changed) |
835 | vma_gap_update(vma); | 835 | vma_gap_update(vma); |
836 | if (end_changed) { | 836 | if (end_changed) { |
837 | if (!next) | 837 | if (!next) |
838 | mm->highest_vm_end = end; | 838 | mm->highest_vm_end = end; |
839 | else if (!adjust_next) | 839 | else if (!adjust_next) |
840 | vma_gap_update(next); | 840 | vma_gap_update(next); |
841 | } | 841 | } |
842 | } | 842 | } |
843 | 843 | ||
844 | if (anon_vma) { | 844 | if (anon_vma) { |
845 | anon_vma_interval_tree_post_update_vma(vma); | 845 | anon_vma_interval_tree_post_update_vma(vma); |
846 | if (adjust_next) | 846 | if (adjust_next) |
847 | anon_vma_interval_tree_post_update_vma(next); | 847 | anon_vma_interval_tree_post_update_vma(next); |
848 | anon_vma_unlock_write(anon_vma); | 848 | anon_vma_unlock_write(anon_vma); |
849 | } | 849 | } |
850 | if (mapping) | 850 | if (mapping) |
851 | mutex_unlock(&mapping->i_mmap_mutex); | 851 | mutex_unlock(&mapping->i_mmap_mutex); |
852 | 852 | ||
853 | if (root) { | 853 | if (root) { |
854 | uprobe_mmap(vma); | 854 | uprobe_mmap(vma); |
855 | 855 | ||
856 | if (adjust_next) | 856 | if (adjust_next) |
857 | uprobe_mmap(next); | 857 | uprobe_mmap(next); |
858 | } | 858 | } |
859 | 859 | ||
860 | if (remove_next) { | 860 | if (remove_next) { |
861 | if (file) { | 861 | if (file) { |
862 | uprobe_munmap(next, next->vm_start, next->vm_end); | 862 | uprobe_munmap(next, next->vm_start, next->vm_end); |
863 | fput(file); | 863 | fput(file); |
864 | } | 864 | } |
865 | if (next->anon_vma) | 865 | if (next->anon_vma) |
866 | anon_vma_merge(vma, next); | 866 | anon_vma_merge(vma, next); |
867 | mm->map_count--; | 867 | mm->map_count--; |
868 | vma_set_policy(vma, vma_policy(next)); | 868 | vma_set_policy(vma, vma_policy(next)); |
869 | kmem_cache_free(vm_area_cachep, next); | 869 | kmem_cache_free(vm_area_cachep, next); |
870 | /* | 870 | /* |
871 | * In mprotect's case 6 (see comments on vma_merge), | 871 | * In mprotect's case 6 (see comments on vma_merge), |
872 | * we must remove another next too. It would clutter | 872 | * we must remove another next too. It would clutter |
873 | * up the code too much to do both in one go. | 873 | * up the code too much to do both in one go. |
874 | */ | 874 | */ |
875 | next = vma->vm_next; | 875 | next = vma->vm_next; |
876 | if (remove_next == 2) | 876 | if (remove_next == 2) |
877 | goto again; | 877 | goto again; |
878 | else if (next) | 878 | else if (next) |
879 | vma_gap_update(next); | 879 | vma_gap_update(next); |
880 | else | 880 | else |
881 | mm->highest_vm_end = end; | 881 | mm->highest_vm_end = end; |
882 | } | 882 | } |
883 | if (insert && file) | 883 | if (insert && file) |
884 | uprobe_mmap(insert); | 884 | uprobe_mmap(insert); |
885 | 885 | ||
886 | validate_mm(mm); | 886 | validate_mm(mm); |
887 | 887 | ||
888 | return 0; | 888 | return 0; |
889 | } | 889 | } |
890 | 890 | ||
891 | /* | 891 | /* |
892 | * If the vma has a ->close operation then the driver probably needs to release | 892 | * If the vma has a ->close operation then the driver probably needs to release |
893 | * per-vma resources, so we don't attempt to merge those. | 893 | * per-vma resources, so we don't attempt to merge those. |
894 | */ | 894 | */ |
895 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 895 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
896 | struct file *file, unsigned long vm_flags) | 896 | struct file *file, unsigned long vm_flags) |
897 | { | 897 | { |
898 | if (vma->vm_flags ^ vm_flags) | 898 | if (vma->vm_flags ^ vm_flags) |
899 | return 0; | 899 | return 0; |
900 | if (vma->vm_file != file) | 900 | if (vma->vm_file != file) |
901 | return 0; | 901 | return 0; |
902 | if (vma->vm_ops && vma->vm_ops->close) | 902 | if (vma->vm_ops && vma->vm_ops->close) |
903 | return 0; | 903 | return 0; |
904 | return 1; | 904 | return 1; |
905 | } | 905 | } |
906 | 906 | ||
907 | static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, | 907 | static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, |
908 | struct anon_vma *anon_vma2, | 908 | struct anon_vma *anon_vma2, |
909 | struct vm_area_struct *vma) | 909 | struct vm_area_struct *vma) |
910 | { | 910 | { |
911 | /* | 911 | /* |
912 | * The list_is_singular() test is to avoid merging VMA cloned from | 912 | * The list_is_singular() test is to avoid merging VMA cloned from |
913 | * parents. This can improve scalability caused by anon_vma lock. | 913 | * parents. This can improve scalability caused by anon_vma lock. |
914 | */ | 914 | */ |
915 | if ((!anon_vma1 || !anon_vma2) && (!vma || | 915 | if ((!anon_vma1 || !anon_vma2) && (!vma || |
916 | list_is_singular(&vma->anon_vma_chain))) | 916 | list_is_singular(&vma->anon_vma_chain))) |
917 | return 1; | 917 | return 1; |
918 | return anon_vma1 == anon_vma2; | 918 | return anon_vma1 == anon_vma2; |
919 | } | 919 | } |
920 | 920 | ||
921 | /* | 921 | /* |
922 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | 922 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) |
923 | * in front of (at a lower virtual address and file offset than) the vma. | 923 | * in front of (at a lower virtual address and file offset than) the vma. |
924 | * | 924 | * |
925 | * We cannot merge two vmas if they have differently assigned (non-NULL) | 925 | * We cannot merge two vmas if they have differently assigned (non-NULL) |
926 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | 926 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. |
927 | * | 927 | * |
928 | * We don't check here for the merged mmap wrapping around the end of pagecache | 928 | * We don't check here for the merged mmap wrapping around the end of pagecache |
929 | * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which | 929 | * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which |
930 | * wrap, nor mmaps which cover the final page at index -1UL. | 930 | * wrap, nor mmaps which cover the final page at index -1UL. |
931 | */ | 931 | */ |
932 | static int | 932 | static int |
933 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, | 933 | can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, |
934 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 934 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
935 | { | 935 | { |
936 | if (is_mergeable_vma(vma, file, vm_flags) && | 936 | if (is_mergeable_vma(vma, file, vm_flags) && |
937 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { | 937 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
938 | if (vma->vm_pgoff == vm_pgoff) | 938 | if (vma->vm_pgoff == vm_pgoff) |
939 | return 1; | 939 | return 1; |
940 | } | 940 | } |
941 | return 0; | 941 | return 0; |
942 | } | 942 | } |
943 | 943 | ||
944 | /* | 944 | /* |
945 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) | 945 | * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) |
946 | * beyond (at a higher virtual address and file offset than) the vma. | 946 | * beyond (at a higher virtual address and file offset than) the vma. |
947 | * | 947 | * |
948 | * We cannot merge two vmas if they have differently assigned (non-NULL) | 948 | * We cannot merge two vmas if they have differently assigned (non-NULL) |
949 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. | 949 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. |
950 | */ | 950 | */ |
951 | static int | 951 | static int |
952 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | 952 | can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, |
953 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 953 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
954 | { | 954 | { |
955 | if (is_mergeable_vma(vma, file, vm_flags) && | 955 | if (is_mergeable_vma(vma, file, vm_flags) && |
956 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { | 956 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
957 | pgoff_t vm_pglen; | 957 | pgoff_t vm_pglen; |
958 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | 958 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; |
959 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) | 959 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) |
960 | return 1; | 960 | return 1; |
961 | } | 961 | } |
962 | return 0; | 962 | return 0; |
963 | } | 963 | } |
964 | 964 | ||
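The offset check in can_vma_merge_after() above is easy to misread: a candidate region can only extend vma if its file offset equals vma->vm_pgoff plus vma's length in pages. A minimal user-space sketch of that arithmetic (illustrative names only, 4 KiB pages assumed, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumes 4 KiB pages */

struct demo_vma {
	unsigned long vm_start, vm_end;		/* byte addresses, page aligned */
	unsigned long vm_pgoff;			/* file offset in pages */
};

/* Would a region starting at 'addr' with file offset 'pgoff' extend 'vma'? */
static int fits_after(const struct demo_vma *vma, unsigned long addr,
		      unsigned long pgoff)
{
	unsigned long vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	return vma->vm_end == addr && vma->vm_pgoff + vm_pglen == pgoff;
}

int main(void)
{
	struct demo_vma v = { 0x400000, 0x402000, 0 };	/* two pages of a file */

	/* Page 2 of the same file mapped right behind it: offsets line up. */
	printf("%d\n", fits_after(&v, 0x402000, 2));	/* prints 1 */
	/* Same address but file offset 5: not mergeable. */
	printf("%d\n", fits_after(&v, 0x402000, 5));	/* prints 0 */
	return 0;
}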
965 | /* | 965 | /* |
966 | * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out | 966 | * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out |
967 | * whether that can be merged with its predecessor or its successor. | 967 | * whether that can be merged with its predecessor or its successor. |
968 | * Or both (it neatly fills a hole). | 968 | * Or both (it neatly fills a hole). |
969 | * | 969 | * |
970 | * In most cases - when called for mmap, brk or mremap - [addr,end) is | 970 | * In most cases - when called for mmap, brk or mremap - [addr,end) is |
971 | * certain not to be mapped by the time vma_merge is called; but when | 971 | * certain not to be mapped by the time vma_merge is called; but when |
972 | * called for mprotect, it is certain to be already mapped (either at | 972 | * called for mprotect, it is certain to be already mapped (either at |
973 | * an offset within prev, or at the start of next), and the flags of | 973 | * an offset within prev, or at the start of next), and the flags of |
974 | * this area are about to be changed to vm_flags - and the no-change | 974 | * this area are about to be changed to vm_flags - and the no-change |
975 | * case has already been eliminated. | 975 | * case has already been eliminated. |
976 | * | 976 | * |
977 | * The following mprotect cases have to be considered, where AAAA is | 977 | * The following mprotect cases have to be considered, where AAAA is |
978 | * the area passed down from mprotect_fixup, never extending beyond one | 978 | * the area passed down from mprotect_fixup, never extending beyond one |
979 | * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: | 979 | * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: |
980 | * | 980 | * |
981 | * AAAA AAAA AAAA AAAA | 981 | * AAAA AAAA AAAA AAAA |
982 | * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX | 982 | * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX |
983 | * cannot merge might become might become might become | 983 | * cannot merge might become might become might become |
984 | * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or | 984 | * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or |
985 | * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or | 985 | * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or |
986 | * mremap move: PPPPNNNNNNNN 8 | 986 | * mremap move: PPPPNNNNNNNN 8 |
987 | * AAAA | 987 | * AAAA |
988 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN | 988 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN |
989 | * might become case 1 below case 2 below case 3 below | 989 | * might become case 1 below case 2 below case 3 below |
990 | * | 990 | * |
991 | * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: | 991 | * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: |
992 | * mprotect_fixup updates vm_flags & vm_page_prot on successful return. | 992 | * mprotect_fixup updates vm_flags & vm_page_prot on successful return. |
993 | */ | 993 | */ |
994 | struct vm_area_struct *vma_merge(struct mm_struct *mm, | 994 | struct vm_area_struct *vma_merge(struct mm_struct *mm, |
995 | struct vm_area_struct *prev, unsigned long addr, | 995 | struct vm_area_struct *prev, unsigned long addr, |
996 | unsigned long end, unsigned long vm_flags, | 996 | unsigned long end, unsigned long vm_flags, |
997 | struct anon_vma *anon_vma, struct file *file, | 997 | struct anon_vma *anon_vma, struct file *file, |
998 | pgoff_t pgoff, struct mempolicy *policy) | 998 | pgoff_t pgoff, struct mempolicy *policy) |
999 | { | 999 | { |
1000 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 1000 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
1001 | struct vm_area_struct *area, *next; | 1001 | struct vm_area_struct *area, *next; |
1002 | int err; | 1002 | int err; |
1003 | 1003 | ||
1004 | /* | 1004 | /* |
1005 | * We later require that vma->vm_flags == vm_flags, | 1005 | * We later require that vma->vm_flags == vm_flags, |
1006 | * so this tests vma->vm_flags & VM_SPECIAL, too. | 1006 | * so this tests vma->vm_flags & VM_SPECIAL, too. |
1007 | */ | 1007 | */ |
1008 | if (vm_flags & VM_SPECIAL) | 1008 | if (vm_flags & VM_SPECIAL) |
1009 | return NULL; | 1009 | return NULL; |
1010 | 1010 | ||
1011 | if (prev) | 1011 | if (prev) |
1012 | next = prev->vm_next; | 1012 | next = prev->vm_next; |
1013 | else | 1013 | else |
1014 | next = mm->mmap; | 1014 | next = mm->mmap; |
1015 | area = next; | 1015 | area = next; |
1016 | if (next && next->vm_end == end) /* cases 6, 7, 8 */ | 1016 | if (next && next->vm_end == end) /* cases 6, 7, 8 */ |
1017 | next = next->vm_next; | 1017 | next = next->vm_next; |
1018 | 1018 | ||
1019 | /* | 1019 | /* |
1020 | * Can it merge with the predecessor? | 1020 | * Can it merge with the predecessor? |
1021 | */ | 1021 | */ |
1022 | if (prev && prev->vm_end == addr && | 1022 | if (prev && prev->vm_end == addr && |
1023 | mpol_equal(vma_policy(prev), policy) && | 1023 | mpol_equal(vma_policy(prev), policy) && |
1024 | can_vma_merge_after(prev, vm_flags, | 1024 | can_vma_merge_after(prev, vm_flags, |
1025 | anon_vma, file, pgoff)) { | 1025 | anon_vma, file, pgoff)) { |
1026 | /* | 1026 | /* |
1027 | * OK, it can. Can we now merge in the successor as well? | 1027 | * OK, it can. Can we now merge in the successor as well? |
1028 | */ | 1028 | */ |
1029 | if (next && end == next->vm_start && | 1029 | if (next && end == next->vm_start && |
1030 | mpol_equal(policy, vma_policy(next)) && | 1030 | mpol_equal(policy, vma_policy(next)) && |
1031 | can_vma_merge_before(next, vm_flags, | 1031 | can_vma_merge_before(next, vm_flags, |
1032 | anon_vma, file, pgoff+pglen) && | 1032 | anon_vma, file, pgoff+pglen) && |
1033 | is_mergeable_anon_vma(prev->anon_vma, | 1033 | is_mergeable_anon_vma(prev->anon_vma, |
1034 | next->anon_vma, NULL)) { | 1034 | next->anon_vma, NULL)) { |
1035 | /* cases 1, 6 */ | 1035 | /* cases 1, 6 */ |
1036 | err = vma_adjust(prev, prev->vm_start, | 1036 | err = vma_adjust(prev, prev->vm_start, |
1037 | next->vm_end, prev->vm_pgoff, NULL); | 1037 | next->vm_end, prev->vm_pgoff, NULL); |
1038 | } else /* cases 2, 5, 7 */ | 1038 | } else /* cases 2, 5, 7 */ |
1039 | err = vma_adjust(prev, prev->vm_start, | 1039 | err = vma_adjust(prev, prev->vm_start, |
1040 | end, prev->vm_pgoff, NULL); | 1040 | end, prev->vm_pgoff, NULL); |
1041 | if (err) | 1041 | if (err) |
1042 | return NULL; | 1042 | return NULL; |
1043 | khugepaged_enter_vma_merge(prev); | 1043 | khugepaged_enter_vma_merge(prev); |
1044 | return prev; | 1044 | return prev; |
1045 | } | 1045 | } |
1046 | 1046 | ||
1047 | /* | 1047 | /* |
1048 | * Can this new request be merged in front of next? | 1048 | * Can this new request be merged in front of next? |
1049 | */ | 1049 | */ |
1050 | if (next && end == next->vm_start && | 1050 | if (next && end == next->vm_start && |
1051 | mpol_equal(policy, vma_policy(next)) && | 1051 | mpol_equal(policy, vma_policy(next)) && |
1052 | can_vma_merge_before(next, vm_flags, | 1052 | can_vma_merge_before(next, vm_flags, |
1053 | anon_vma, file, pgoff+pglen)) { | 1053 | anon_vma, file, pgoff+pglen)) { |
1054 | if (prev && addr < prev->vm_end) /* case 4 */ | 1054 | if (prev && addr < prev->vm_end) /* case 4 */ |
1055 | err = vma_adjust(prev, prev->vm_start, | 1055 | err = vma_adjust(prev, prev->vm_start, |
1056 | addr, prev->vm_pgoff, NULL); | 1056 | addr, prev->vm_pgoff, NULL); |
1057 | else /* cases 3, 8 */ | 1057 | else /* cases 3, 8 */ |
1058 | err = vma_adjust(area, addr, next->vm_end, | 1058 | err = vma_adjust(area, addr, next->vm_end, |
1059 | next->vm_pgoff - pglen, NULL); | 1059 | next->vm_pgoff - pglen, NULL); |
1060 | if (err) | 1060 | if (err) |
1061 | return NULL; | 1061 | return NULL; |
1062 | khugepaged_enter_vma_merge(area); | 1062 | khugepaged_enter_vma_merge(area); |
1063 | return area; | 1063 | return area; |
1064 | } | 1064 | } |
1065 | 1065 | ||
1066 | return NULL; | 1066 | return NULL; |
1067 | } | 1067 | } |
1068 | 1068 | ||
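The merge cases above can be observed from user space: mprotect() on the middle of an anonymous mapping splits it into three VMAs, and restoring the original protection lets vma_merge() fold them back into one. A rough demonstration (assumes a Linux /proc filesystem; typically prints 1, 3, 1):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Count how many /proc/self/maps entries overlap the range [lo, hi). */
static int vmas_in_range(unsigned long lo, unsigned long hi)
{
	FILE *f = fopen("/proc/self/maps", "r");
	char line[256];
	int n = 0;

	while (f && fgets(line, sizeof(line), f)) {
		unsigned long s, e;

		if (sscanf(line, "%lx-%lx", &s, &e) == 2 && s < hi && e > lo)
			n++;
	}
	if (f)
		fclose(f);
	return n;
}

int main(void)
{
	unsigned long page = (unsigned long)sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long lo = (unsigned long)p, hi = lo + 3 * page;

	if (p == MAP_FAILED)
		return 1;
	printf("fresh mapping: %d VMA(s)\n", vmas_in_range(lo, hi));
	mprotect(p + page, page, PROT_READ);		  /* split into three */
	printf("after split:   %d VMA(s)\n", vmas_in_range(lo, hi));
	mprotect(p + page, page, PROT_READ | PROT_WRITE); /* merged back by vma_merge() */
	printf("after restore: %d VMA(s)\n", vmas_in_range(lo, hi));
	return 0;
}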
1069 | /* | 1069 | /* |
1070 | * Rough compatibility check to quickly see if it's even worth looking | 1070 | * Rough compatibility check to quickly see if it's even worth looking |
1071 | * at sharing an anon_vma. | 1071 | * at sharing an anon_vma. |
1072 | * | 1072 | * |
1073 | * They need to have the same vm_file, and the flags can only differ | 1073 | * They need to have the same vm_file, and the flags can only differ |
1074 | * in things that mprotect may change. | 1074 | * in things that mprotect may change. |
1075 | * | 1075 | * |
1076 | * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that | 1076 | * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that |
1077 | * we can merge the two vma's. For example, we refuse to merge a vma if | 1077 | * we can merge the two vma's. For example, we refuse to merge a vma if |
1078 | * there is a vm_ops->close() function, because that indicates that the | 1078 | * there is a vm_ops->close() function, because that indicates that the |
1079 | * driver is doing some kind of reference counting. But that doesn't | 1079 | * driver is doing some kind of reference counting. But that doesn't |
1080 | * really matter for the anon_vma sharing case. | 1080 | * really matter for the anon_vma sharing case. |
1081 | */ | 1081 | */ |
1082 | static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) | 1082 | static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) |
1083 | { | 1083 | { |
1084 | return a->vm_end == b->vm_start && | 1084 | return a->vm_end == b->vm_start && |
1085 | mpol_equal(vma_policy(a), vma_policy(b)) && | 1085 | mpol_equal(vma_policy(a), vma_policy(b)) && |
1086 | a->vm_file == b->vm_file && | 1086 | a->vm_file == b->vm_file && |
1087 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && | 1087 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && |
1088 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); | 1088 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); |
1089 | } | 1089 | } |
1090 | 1090 | ||
1091 | /* | 1091 | /* |
1092 | * Do some basic sanity checking to see if we can re-use the anon_vma | 1092 | * Do some basic sanity checking to see if we can re-use the anon_vma |
1093 | * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be | 1093 | * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be |
1094 | * the same as 'old', the other will be the new one that is trying | 1094 | * the same as 'old', the other will be the new one that is trying |
1095 | * to share the anon_vma. | 1095 | * to share the anon_vma. |
1096 | * | 1096 | * |
1097 | * NOTE! This runs with mm_sem held for reading, so it is possible that | 1097 | * NOTE! This runs with mm_sem held for reading, so it is possible that |
1098 | * the anon_vma of 'old' is concurrently in the process of being set up | 1098 | * the anon_vma of 'old' is concurrently in the process of being set up |
1099 | * by another page fault trying to merge _that_. But that's ok: if it | 1099 | * by another page fault trying to merge _that_. But that's ok: if it |
1100 | * is being set up, that automatically means that it will be a singleton | 1100 | * is being set up, that automatically means that it will be a singleton |
1101 | * acceptable for merging, so we can do all of this optimistically. But | 1101 | * acceptable for merging, so we can do all of this optimistically. But |
1102 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. | 1102 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. |
1103 | * | 1103 | * |
1104 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only | 1104 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only |
1105 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid | 1105 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid |
1106 | * is to return an anon_vma that is "complex" due to having gone through | 1106 | * is to return an anon_vma that is "complex" due to having gone through |
1107 | * a fork). | 1107 | * a fork). |
1108 | * | 1108 | * |
1109 | * We also make sure that the two vma's are compatible (adjacent, | 1109 | * We also make sure that the two vma's are compatible (adjacent, |
1110 | * and with the same memory policies). That's all stable, even with just | 1110 | * and with the same memory policies). That's all stable, even with just |
1111 | * a read lock on the mm_sem. | 1111 | * a read lock on the mm_sem. |
1112 | */ | 1112 | */ |
1113 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) | 1113 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) |
1114 | { | 1114 | { |
1115 | if (anon_vma_compatible(a, b)) { | 1115 | if (anon_vma_compatible(a, b)) { |
1116 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); | 1116 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); |
1117 | 1117 | ||
1118 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) | 1118 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) |
1119 | return anon_vma; | 1119 | return anon_vma; |
1120 | } | 1120 | } |
1121 | return NULL; | 1121 | return NULL; |
1122 | } | 1122 | } |
1123 | 1123 | ||
1124 | /* | 1124 | /* |
1125 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check | 1125 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check |
1126 | * neighbouring vmas for a suitable anon_vma, before it goes off | 1126 | * neighbouring vmas for a suitable anon_vma, before it goes off |
1127 | * to allocate a new anon_vma. It checks because a repetitive | 1127 | * to allocate a new anon_vma. It checks because a repetitive |
1128 | * sequence of mprotects and faults may otherwise lead to distinct | 1128 | * sequence of mprotects and faults may otherwise lead to distinct |
1129 | * anon_vmas being allocated, preventing vma merge in subsequent | 1129 | * anon_vmas being allocated, preventing vma merge in subsequent |
1130 | * mprotect. | 1130 | * mprotect. |
1131 | */ | 1131 | */ |
1132 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | 1132 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) |
1133 | { | 1133 | { |
1134 | struct anon_vma *anon_vma; | 1134 | struct anon_vma *anon_vma; |
1135 | struct vm_area_struct *near; | 1135 | struct vm_area_struct *near; |
1136 | 1136 | ||
1137 | near = vma->vm_next; | 1137 | near = vma->vm_next; |
1138 | if (!near) | 1138 | if (!near) |
1139 | goto try_prev; | 1139 | goto try_prev; |
1140 | 1140 | ||
1141 | anon_vma = reusable_anon_vma(near, vma, near); | 1141 | anon_vma = reusable_anon_vma(near, vma, near); |
1142 | if (anon_vma) | 1142 | if (anon_vma) |
1143 | return anon_vma; | 1143 | return anon_vma; |
1144 | try_prev: | 1144 | try_prev: |
1145 | near = vma->vm_prev; | 1145 | near = vma->vm_prev; |
1146 | if (!near) | 1146 | if (!near) |
1147 | goto none; | 1147 | goto none; |
1148 | 1148 | ||
1149 | anon_vma = reusable_anon_vma(near, near, vma); | 1149 | anon_vma = reusable_anon_vma(near, near, vma); |
1150 | if (anon_vma) | 1150 | if (anon_vma) |
1151 | return anon_vma; | 1151 | return anon_vma; |
1152 | none: | 1152 | none: |
1153 | /* | 1153 | /* |
1154 | * There's no absolute need to look only at touching neighbours: | 1154 | * There's no absolute need to look only at touching neighbours: |
1155 | * we could search further afield for "compatible" anon_vmas. | 1155 | * we could search further afield for "compatible" anon_vmas. |
1156 | * But it would probably just be a waste of time searching, | 1156 | * But it would probably just be a waste of time searching, |
1157 | * or lead to too many vmas hanging off the same anon_vma. | 1157 | * or lead to too many vmas hanging off the same anon_vma. |
1158 | * We're trying to allow mprotect remerging later on, | 1158 | * We're trying to allow mprotect remerging later on, |
1159 | * not trying to minimize memory used for anon_vmas. | 1159 | * not trying to minimize memory used for anon_vmas. |
1160 | */ | 1160 | */ |
1161 | return NULL; | 1161 | return NULL; |
1162 | } | 1162 | } |
1163 | 1163 | ||
1164 | #ifdef CONFIG_PROC_FS | 1164 | #ifdef CONFIG_PROC_FS |
1165 | void vm_stat_account(struct mm_struct *mm, unsigned long flags, | 1165 | void vm_stat_account(struct mm_struct *mm, unsigned long flags, |
1166 | struct file *file, long pages) | 1166 | struct file *file, long pages) |
1167 | { | 1167 | { |
1168 | const unsigned long stack_flags | 1168 | const unsigned long stack_flags |
1169 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 1169 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
1170 | 1170 | ||
1171 | mm->total_vm += pages; | 1171 | mm->total_vm += pages; |
1172 | 1172 | ||
1173 | if (file) { | 1173 | if (file) { |
1174 | mm->shared_vm += pages; | 1174 | mm->shared_vm += pages; |
1175 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 1175 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
1176 | mm->exec_vm += pages; | 1176 | mm->exec_vm += pages; |
1177 | } else if (flags & stack_flags) | 1177 | } else if (flags & stack_flags) |
1178 | mm->stack_vm += pages; | 1178 | mm->stack_vm += pages; |
1179 | } | 1179 | } |
1180 | #endif /* CONFIG_PROC_FS */ | 1180 | #endif /* CONFIG_PROC_FS */ |
1181 | 1181 | ||
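The counters maintained by vm_stat_account() surface in /proc/<pid>/status (VmSize, VmExe, VmStk and friends). A quick user-space sanity check, a sketch rather than anything from this patch, that VmSize grows by the mapped length:

#include <stdio.h>
#include <sys/mman.h>

static long vmsize_kb(void)
{
	FILE *f = fopen("/proc/self/status", "r");
	char line[128];
	long kb = -1;

	while (f && fgets(line, sizeof(line), f))
		if (sscanf(line, "VmSize: %ld kB", &kb) == 1)
			break;
	if (f)
		fclose(f);
	return kb;
}

int main(void)
{
	long before = vmsize_kb();
	long after;

	if (mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) == MAP_FAILED)
		return 1;
	after = vmsize_kb();
	printf("VmSize: %ld kB -> %ld kB (expect roughly +1024 kB)\n", before, after);
	return 0;
}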
1182 | /* | 1182 | /* |
1183 | * If a hint addr is less than mmap_min_addr change hint to be as | 1183 | * If a hint addr is less than mmap_min_addr change hint to be as |
1184 | * low as possible but still greater than mmap_min_addr | 1184 | * low as possible but still greater than mmap_min_addr |
1185 | */ | 1185 | */ |
1186 | static inline unsigned long round_hint_to_min(unsigned long hint) | 1186 | static inline unsigned long round_hint_to_min(unsigned long hint) |
1187 | { | 1187 | { |
1188 | hint &= PAGE_MASK; | 1188 | hint &= PAGE_MASK; |
1189 | if (((void *)hint != NULL) && | 1189 | if (((void *)hint != NULL) && |
1190 | (hint < mmap_min_addr)) | 1190 | (hint < mmap_min_addr)) |
1191 | return PAGE_ALIGN(mmap_min_addr); | 1191 | return PAGE_ALIGN(mmap_min_addr); |
1192 | return hint; | 1192 | return hint; |
1193 | } | 1193 | } |
1194 | 1194 | ||
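A worked example of round_hint_to_min(), under the assumption of 4 KiB pages and the common vm.mmap_min_addr default of 65536 (both values are assumptions here, not taken from this patch):

#include <stdio.h>

#define PAGE_SIZE     4096UL			/* assumed */
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)
#define MMAP_MIN_ADDR 65536UL			/* typical vm.mmap_min_addr */

static unsigned long round_hint_to_min(unsigned long hint)
{
	hint &= PAGE_MASK;
	if (hint != 0 && hint < MMAP_MIN_ADDR)
		return PAGE_ALIGN(MMAP_MIN_ADDR);
	return hint;
}

int main(void)
{
	printf("%#lx\n", round_hint_to_min(0x1234));	/* 0x10000: raised to the minimum */
	printf("%#lx\n", round_hint_to_min(0));		/* 0: NULL hint left alone */
	printf("%#lx\n", round_hint_to_min(0x7f0000));	/* 0x7f0000: already acceptable */
	return 0;
}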
1195 | /* | 1195 | /* |
1196 | * The caller must hold down_write(&current->mm->mmap_sem). | 1196 | * The caller must hold down_write(&current->mm->mmap_sem). |
1197 | */ | 1197 | */ |
1198 | 1198 | ||
1199 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 1199 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
1200 | unsigned long len, unsigned long prot, | 1200 | unsigned long len, unsigned long prot, |
1201 | unsigned long flags, unsigned long pgoff, | 1201 | unsigned long flags, unsigned long pgoff, |
1202 | unsigned long *populate) | 1202 | unsigned long *populate) |
1203 | { | 1203 | { |
1204 | struct mm_struct * mm = current->mm; | 1204 | struct mm_struct * mm = current->mm; |
1205 | struct inode *inode; | 1205 | struct inode *inode; |
1206 | vm_flags_t vm_flags; | 1206 | vm_flags_t vm_flags; |
1207 | 1207 | ||
1208 | *populate = 0; | 1208 | *populate = 0; |
1209 | 1209 | ||
1210 | /* | 1210 | /* |
1211 | * Does the application expect PROT_READ to imply PROT_EXEC? | 1211 | * Does the application expect PROT_READ to imply PROT_EXEC? |
1212 | * | 1212 | * |
1213 | * (the exception is when the underlying filesystem is noexec | 1213 | * (the exception is when the underlying filesystem is noexec |
1214 | * mounted, in which case we dont add PROT_EXEC.) | 1214 | * mounted, in which case we dont add PROT_EXEC.) |
1215 | */ | 1215 | */ |
1216 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) | 1216 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
1217 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) | 1217 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) |
1218 | prot |= PROT_EXEC; | 1218 | prot |= PROT_EXEC; |
1219 | 1219 | ||
1220 | if (!len) | 1220 | if (!len) |
1221 | return -EINVAL; | 1221 | return -EINVAL; |
1222 | 1222 | ||
1223 | if (!(flags & MAP_FIXED)) | 1223 | if (!(flags & MAP_FIXED)) |
1224 | addr = round_hint_to_min(addr); | 1224 | addr = round_hint_to_min(addr); |
1225 | 1225 | ||
1226 | /* Careful about overflows.. */ | 1226 | /* Careful about overflows.. */ |
1227 | len = PAGE_ALIGN(len); | 1227 | len = PAGE_ALIGN(len); |
1228 | if (!len) | 1228 | if (!len) |
1229 | return -ENOMEM; | 1229 | return -ENOMEM; |
1230 | 1230 | ||
1231 | /* offset overflow? */ | 1231 | /* offset overflow? */ |
1232 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 1232 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) |
1233 | return -EOVERFLOW; | 1233 | return -EOVERFLOW; |
1234 | 1234 | ||
1235 | /* Too many mappings? */ | 1235 | /* Too many mappings? */ |
1236 | if (mm->map_count > sysctl_max_map_count) | 1236 | if (mm->map_count > sysctl_max_map_count) |
1237 | return -ENOMEM; | 1237 | return -ENOMEM; |
1238 | 1238 | ||
1239 | /* Obtain the address to map to. we verify (or select) it and ensure | 1239 | /* Obtain the address to map to. we verify (or select) it and ensure |
1240 | * that it represents a valid section of the address space. | 1240 | * that it represents a valid section of the address space. |
1241 | */ | 1241 | */ |
1242 | addr = get_unmapped_area(file, addr, len, pgoff, flags); | 1242 | addr = get_unmapped_area(file, addr, len, pgoff, flags); |
1243 | if (addr & ~PAGE_MASK) | 1243 | if (addr & ~PAGE_MASK) |
1244 | return addr; | 1244 | return addr; |
1245 | 1245 | ||
1246 | /* Do simple checking here so the lower-level routines won't have | 1246 | /* Do simple checking here so the lower-level routines won't have |
1247 | * to. we assume access permissions have been handled by the open | 1247 | * to. we assume access permissions have been handled by the open |
1248 | * of the memory object, so we don't do any here. | 1248 | * of the memory object, so we don't do any here. |
1249 | */ | 1249 | */ |
1250 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | | 1250 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | |
1251 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 1251 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
1252 | 1252 | ||
1253 | if (flags & MAP_LOCKED) | 1253 | if (flags & MAP_LOCKED) |
1254 | if (!can_do_mlock()) | 1254 | if (!can_do_mlock()) |
1255 | return -EPERM; | 1255 | return -EPERM; |
1256 | 1256 | ||
1257 | /* mlock MCL_FUTURE? */ | 1257 | /* mlock MCL_FUTURE? */ |
1258 | if (vm_flags & VM_LOCKED) { | 1258 | if (vm_flags & VM_LOCKED) { |
1259 | unsigned long locked, lock_limit; | 1259 | unsigned long locked, lock_limit; |
1260 | locked = len >> PAGE_SHIFT; | 1260 | locked = len >> PAGE_SHIFT; |
1261 | locked += mm->locked_vm; | 1261 | locked += mm->locked_vm; |
1262 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 1262 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
1263 | lock_limit >>= PAGE_SHIFT; | 1263 | lock_limit >>= PAGE_SHIFT; |
1264 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 1264 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
1265 | return -EAGAIN; | 1265 | return -EAGAIN; |
1266 | } | 1266 | } |
1267 | 1267 | ||
1268 | inode = file ? file_inode(file) : NULL; | 1268 | inode = file ? file_inode(file) : NULL; |
1269 | 1269 | ||
1270 | if (file) { | 1270 | if (file) { |
1271 | switch (flags & MAP_TYPE) { | 1271 | switch (flags & MAP_TYPE) { |
1272 | case MAP_SHARED: | 1272 | case MAP_SHARED: |
1273 | if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) | 1273 | if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) |
1274 | return -EACCES; | 1274 | return -EACCES; |
1275 | 1275 | ||
1276 | /* | 1276 | /* |
1277 | * Make sure we don't allow writing to an append-only | 1277 | * Make sure we don't allow writing to an append-only |
1278 | * file.. | 1278 | * file.. |
1279 | */ | 1279 | */ |
1280 | if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) | 1280 | if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) |
1281 | return -EACCES; | 1281 | return -EACCES; |
1282 | 1282 | ||
1283 | /* | 1283 | /* |
1284 | * Make sure there are no mandatory locks on the file. | 1284 | * Make sure there are no mandatory locks on the file. |
1285 | */ | 1285 | */ |
1286 | if (locks_verify_locked(inode)) | 1286 | if (locks_verify_locked(inode)) |
1287 | return -EAGAIN; | 1287 | return -EAGAIN; |
1288 | 1288 | ||
1289 | vm_flags |= VM_SHARED | VM_MAYSHARE; | 1289 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
1290 | if (!(file->f_mode & FMODE_WRITE)) | 1290 | if (!(file->f_mode & FMODE_WRITE)) |
1291 | vm_flags &= ~(VM_MAYWRITE | VM_SHARED); | 1291 | vm_flags &= ~(VM_MAYWRITE | VM_SHARED); |
1292 | 1292 | ||
1293 | /* fall through */ | 1293 | /* fall through */ |
1294 | case MAP_PRIVATE: | 1294 | case MAP_PRIVATE: |
1295 | if (!(file->f_mode & FMODE_READ)) | 1295 | if (!(file->f_mode & FMODE_READ)) |
1296 | return -EACCES; | 1296 | return -EACCES; |
1297 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { | 1297 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
1298 | if (vm_flags & VM_EXEC) | 1298 | if (vm_flags & VM_EXEC) |
1299 | return -EPERM; | 1299 | return -EPERM; |
1300 | vm_flags &= ~VM_MAYEXEC; | 1300 | vm_flags &= ~VM_MAYEXEC; |
1301 | } | 1301 | } |
1302 | 1302 | ||
1303 | if (!file->f_op || !file->f_op->mmap) | 1303 | if (!file->f_op || !file->f_op->mmap) |
1304 | return -ENODEV; | 1304 | return -ENODEV; |
1305 | break; | 1305 | break; |
1306 | 1306 | ||
1307 | default: | 1307 | default: |
1308 | return -EINVAL; | 1308 | return -EINVAL; |
1309 | } | 1309 | } |
1310 | } else { | 1310 | } else { |
1311 | switch (flags & MAP_TYPE) { | 1311 | switch (flags & MAP_TYPE) { |
1312 | case MAP_SHARED: | 1312 | case MAP_SHARED: |
1313 | /* | 1313 | /* |
1314 | * Ignore pgoff. | 1314 | * Ignore pgoff. |
1315 | */ | 1315 | */ |
1316 | pgoff = 0; | 1316 | pgoff = 0; |
1317 | vm_flags |= VM_SHARED | VM_MAYSHARE; | 1317 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
1318 | break; | 1318 | break; |
1319 | case MAP_PRIVATE: | 1319 | case MAP_PRIVATE: |
1320 | /* | 1320 | /* |
1321 | * Set pgoff according to addr for anon_vma. | 1321 | * Set pgoff according to addr for anon_vma. |
1322 | */ | 1322 | */ |
1323 | pgoff = addr >> PAGE_SHIFT; | 1323 | pgoff = addr >> PAGE_SHIFT; |
1324 | break; | 1324 | break; |
1325 | default: | 1325 | default: |
1326 | return -EINVAL; | 1326 | return -EINVAL; |
1327 | } | 1327 | } |
1328 | } | 1328 | } |
1329 | 1329 | ||
1330 | /* | 1330 | /* |
1331 | * Set 'VM_NORESERVE' if we should not account for the | 1331 | * Set 'VM_NORESERVE' if we should not account for the |
1332 | * memory use of this mapping. | 1332 | * memory use of this mapping. |
1333 | */ | 1333 | */ |
1334 | if (flags & MAP_NORESERVE) { | 1334 | if (flags & MAP_NORESERVE) { |
1335 | /* We honor MAP_NORESERVE if allowed to overcommit */ | 1335 | /* We honor MAP_NORESERVE if allowed to overcommit */ |
1336 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) | 1336 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) |
1337 | vm_flags |= VM_NORESERVE; | 1337 | vm_flags |= VM_NORESERVE; |
1338 | 1338 | ||
1339 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ | 1339 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ |
1340 | if (file && is_file_hugepages(file)) | 1340 | if (file && is_file_hugepages(file)) |
1341 | vm_flags |= VM_NORESERVE; | 1341 | vm_flags |= VM_NORESERVE; |
1342 | } | 1342 | } |
1343 | 1343 | ||
1344 | addr = mmap_region(file, addr, len, vm_flags, pgoff); | 1344 | addr = mmap_region(file, addr, len, vm_flags, pgoff); |
1345 | if (!IS_ERR_VALUE(addr) && | 1345 | if (!IS_ERR_VALUE(addr) && |
1346 | ((vm_flags & VM_LOCKED) || | 1346 | ((vm_flags & VM_LOCKED) || |
1347 | (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) | 1347 | (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) |
1348 | *populate = len; | 1348 | *populate = len; |
1349 | return addr; | 1349 | return addr; |
1350 | } | 1350 | } |
1351 | 1351 | ||
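One consequence of the len = PAGE_ALIGN(len) step above that is visible from user space: asking mmap() for a single byte still yields a whole page. A small illustration (not part of this patch):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 1, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[page - 1] = 42;			/* within the rounded-up page */
	printf("last byte of the page: %d\n", p[page - 1]);
	return munmap(p, 1);			/* munmap rounds the length up the same way */
}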
1352 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1352 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
1353 | unsigned long, prot, unsigned long, flags, | 1353 | unsigned long, prot, unsigned long, flags, |
1354 | unsigned long, fd, unsigned long, pgoff) | 1354 | unsigned long, fd, unsigned long, pgoff) |
1355 | { | 1355 | { |
1356 | struct file *file = NULL; | 1356 | struct file *file = NULL; |
1357 | unsigned long retval = -EBADF; | 1357 | unsigned long retval = -EBADF; |
1358 | 1358 | ||
1359 | if (!(flags & MAP_ANONYMOUS)) { | 1359 | if (!(flags & MAP_ANONYMOUS)) { |
1360 | audit_mmap_fd(fd, flags); | 1360 | audit_mmap_fd(fd, flags); |
1361 | if (unlikely(flags & MAP_HUGETLB)) | 1361 | if (unlikely(flags & MAP_HUGETLB)) |
1362 | return -EINVAL; | 1362 | return -EINVAL; |
1363 | file = fget(fd); | 1363 | file = fget(fd); |
1364 | if (!file) | 1364 | if (!file) |
1365 | goto out; | 1365 | goto out; |
1366 | if (is_file_hugepages(file)) | ||
1367 | len = ALIGN(len, huge_page_size(hstate_file(file))); | ||
1366 | } else if (flags & MAP_HUGETLB) { | 1368 | } else if (flags & MAP_HUGETLB) { |
1367 | struct user_struct *user = NULL; | 1369 | struct user_struct *user = NULL; |
1370 | |||
1371 | len = ALIGN(len, huge_page_size(hstate_sizelog( | ||
1372 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))); | ||
1368 | /* | 1373 | /* |
1369 | * VM_NORESERVE is used because the reservations will be | 1374 | * VM_NORESERVE is used because the reservations will be |
1370 | * taken when vm_ops->mmap() is called | 1375 | * taken when vm_ops->mmap() is called |
1371 | * A dummy user value is used because we are not locking | 1376 | * A dummy user value is used because we are not locking |
1372 | * memory so no accounting is necessary | 1377 | * memory so no accounting is necessary |
1373 | */ | 1378 | */ |
1374 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, | 1379 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, |
1375 | VM_NORESERVE, | 1380 | VM_NORESERVE, |
1376 | &user, HUGETLB_ANONHUGE_INODE, | 1381 | &user, HUGETLB_ANONHUGE_INODE, |
1377 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); | 1382 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); |
1378 | if (IS_ERR(file)) | 1383 | if (IS_ERR(file)) |
1379 | return PTR_ERR(file); | 1384 | return PTR_ERR(file); |
1380 | } | 1385 | } |
1381 | 1386 | ||
1382 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1387 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1383 | 1388 | ||
1384 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); | 1389 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1385 | if (file) | 1390 | if (file) |
1386 | fput(file); | 1391 | fput(file); |
1387 | out: | 1392 | out: |
1388 | return retval; | 1393 | return retval; |
1389 | } | 1394 | } |
1390 | 1395 | ||
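The two ALIGN() calls added in this hunk round an unaligned MAP_HUGETLB length up to the huge page size in the caller, so a request with an odd length no longer has to be rejected for its size alone. A user-space check, assuming 2 MiB huge pages are configured and available (e.g. via /proc/sys/vm/nr_hugepages); the fallback MAP_HUGETLB define is the x86 value:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* x86 value; normally provided by <sys/mman.h> */
#endif

int main(void)
{
	size_t len = (2UL << 20) + 4096;	/* 2 MiB + 4 KiB: not hugepage aligned */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");		/* e.g. ENOMEM if no free huge pages are configured */
		return 1;
	}
	printf("got %p; the kernel rounded the length up to 4 MiB\n", p);
	return 0;
}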
1391 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | 1396 | #ifdef __ARCH_WANT_SYS_OLD_MMAP |
1392 | struct mmap_arg_struct { | 1397 | struct mmap_arg_struct { |
1393 | unsigned long addr; | 1398 | unsigned long addr; |
1394 | unsigned long len; | 1399 | unsigned long len; |
1395 | unsigned long prot; | 1400 | unsigned long prot; |
1396 | unsigned long flags; | 1401 | unsigned long flags; |
1397 | unsigned long fd; | 1402 | unsigned long fd; |
1398 | unsigned long offset; | 1403 | unsigned long offset; |
1399 | }; | 1404 | }; |
1400 | 1405 | ||
1401 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | 1406 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) |
1402 | { | 1407 | { |
1403 | struct mmap_arg_struct a; | 1408 | struct mmap_arg_struct a; |
1404 | 1409 | ||
1405 | if (copy_from_user(&a, arg, sizeof(a))) | 1410 | if (copy_from_user(&a, arg, sizeof(a))) |
1406 | return -EFAULT; | 1411 | return -EFAULT; |
1407 | if (a.offset & ~PAGE_MASK) | 1412 | if (a.offset & ~PAGE_MASK) |
1408 | return -EINVAL; | 1413 | return -EINVAL; |
1409 | 1414 | ||
1410 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | 1415 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, |
1411 | a.offset >> PAGE_SHIFT); | 1416 | a.offset >> PAGE_SHIFT); |
1412 | } | 1417 | } |
1413 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | 1418 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ |
1414 | 1419 | ||
1415 | /* | 1420 | /* |
1416 | * Some shared mappings will want the pages marked read-only | 1421 | * Some shared mappings will want the pages marked read-only |
1417 | * to track write events. If so, we'll downgrade vm_page_prot | 1422 | * to track write events. If so, we'll downgrade vm_page_prot |
1418 | * to the private version (using protection_map[] without the | 1423 | * to the private version (using protection_map[] without the |
1419 | * VM_SHARED bit). | 1424 | * VM_SHARED bit). |
1420 | */ | 1425 | */ |
1421 | int vma_wants_writenotify(struct vm_area_struct *vma) | 1426 | int vma_wants_writenotify(struct vm_area_struct *vma) |
1422 | { | 1427 | { |
1423 | vm_flags_t vm_flags = vma->vm_flags; | 1428 | vm_flags_t vm_flags = vma->vm_flags; |
1424 | 1429 | ||
1425 | /* If it was private or non-writable, the write bit is already clear */ | 1430 | /* If it was private or non-writable, the write bit is already clear */ |
1426 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) | 1431 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) |
1427 | return 0; | 1432 | return 0; |
1428 | 1433 | ||
1429 | /* The backer wishes to know when pages are first written to? */ | 1434 | /* The backer wishes to know when pages are first written to? */ |
1430 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | 1435 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) |
1431 | return 1; | 1436 | return 1; |
1432 | 1437 | ||
1433 | /* The open routine did something to the protections already? */ | 1438 | /* The open routine did something to the protections already? */ |
1434 | if (pgprot_val(vma->vm_page_prot) != | 1439 | if (pgprot_val(vma->vm_page_prot) != |
1435 | pgprot_val(vm_get_page_prot(vm_flags))) | 1440 | pgprot_val(vm_get_page_prot(vm_flags))) |
1436 | return 0; | 1441 | return 0; |
1437 | 1442 | ||
1438 | /* Specialty mapping? */ | 1443 | /* Specialty mapping? */ |
1439 | if (vm_flags & VM_PFNMAP) | 1444 | if (vm_flags & VM_PFNMAP) |
1440 | return 0; | 1445 | return 0; |
1441 | 1446 | ||
1442 | /* Can the mapping track the dirty pages? */ | 1447 | /* Can the mapping track the dirty pages? */ |
1443 | return vma->vm_file && vma->vm_file->f_mapping && | 1448 | return vma->vm_file && vma->vm_file->f_mapping && |
1444 | mapping_cap_account_dirty(vma->vm_file->f_mapping); | 1449 | mapping_cap_account_dirty(vma->vm_file->f_mapping); |
1445 | } | 1450 | } |
1446 | 1451 | ||
1447 | /* | 1452 | /* |
1448 | * We account for memory if it's a private writeable mapping, | 1453 | * We account for memory if it's a private writeable mapping, |
1449 | * not hugepages and VM_NORESERVE wasn't set. | 1454 | * not hugepages and VM_NORESERVE wasn't set. |
1450 | */ | 1455 | */ |
1451 | static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) | 1456 | static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) |
1452 | { | 1457 | { |
1453 | /* | 1458 | /* |
1454 | * hugetlb has its own accounting separate from the core VM | 1459 | * hugetlb has its own accounting separate from the core VM |
1455 | * VM_HUGETLB may not be set yet so we cannot check for that flag. | 1460 | * VM_HUGETLB may not be set yet so we cannot check for that flag. |
1456 | */ | 1461 | */ |
1457 | if (file && is_file_hugepages(file)) | 1462 | if (file && is_file_hugepages(file)) |
1458 | return 0; | 1463 | return 0; |
1459 | 1464 | ||
1460 | return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; | 1465 | return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; |
1461 | } | 1466 | } |
1462 | 1467 | ||
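accountable_mapping() decides whether a mapping is charged against the system-wide commit counter, which shows up as Committed_AS in /proc/meminfo. A rough user-space sketch (the counter is system-wide, so the numbers are approximate and assume an otherwise quiet machine with the default overcommit mode, where MAP_NORESERVE is honoured):

#include <stdio.h>
#include <sys/mman.h>

static long committed_kb(void)
{
	FILE *f = fopen("/proc/meminfo", "r");
	char line[128];
	long kb = -1;

	while (f && fgets(line, sizeof(line), f))
		if (sscanf(line, "Committed_AS: %ld kB", &kb) == 1)
			break;
	if (f)
		fclose(f);
	return kb;
}

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB */
	long a = committed_kb(), b, c;

	mmap(NULL, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	b = committed_kb();		/* accounted: roughly +65536 kB */
	mmap(NULL, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	c = committed_kb();		/* VM_NORESERVE: barely moves */
	printf("accounted mapping: +%ld kB, MAP_NORESERVE: +%ld kB\n", b - a, c - b);
	return 0;
}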
1463 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1468 | unsigned long mmap_region(struct file *file, unsigned long addr, |
1464 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) | 1469 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) |
1465 | { | 1470 | { |
1466 | struct mm_struct *mm = current->mm; | 1471 | struct mm_struct *mm = current->mm; |
1467 | struct vm_area_struct *vma, *prev; | 1472 | struct vm_area_struct *vma, *prev; |
1468 | int correct_wcount = 0; | 1473 | int correct_wcount = 0; |
1469 | int error; | 1474 | int error; |
1470 | struct rb_node **rb_link, *rb_parent; | 1475 | struct rb_node **rb_link, *rb_parent; |
1471 | unsigned long charged = 0; | 1476 | unsigned long charged = 0; |
1472 | struct inode *inode = file ? file_inode(file) : NULL; | 1477 | struct inode *inode = file ? file_inode(file) : NULL; |
1473 | 1478 | ||
1474 | /* Check against address space limit. */ | 1479 | /* Check against address space limit. */ |
1475 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { | 1480 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { |
1476 | unsigned long nr_pages; | 1481 | unsigned long nr_pages; |
1477 | 1482 | ||
1478 | /* | 1483 | /* |
1479 | * MAP_FIXED may remove pages of mappings that intersects with | 1484 | * MAP_FIXED may remove pages of mappings that intersects with |
1480 | * requested mapping. Account for the pages it would unmap. | 1485 | * requested mapping. Account for the pages it would unmap. |
1481 | */ | 1486 | */ |
1482 | if (!(vm_flags & MAP_FIXED)) | 1487 | if (!(vm_flags & MAP_FIXED)) |
1483 | return -ENOMEM; | 1488 | return -ENOMEM; |
1484 | 1489 | ||
1485 | nr_pages = count_vma_pages_range(mm, addr, addr + len); | 1490 | nr_pages = count_vma_pages_range(mm, addr, addr + len); |
1486 | 1491 | ||
1487 | if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) | 1492 | if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) |
1488 | return -ENOMEM; | 1493 | return -ENOMEM; |
1489 | } | 1494 | } |
1490 | 1495 | ||
1491 | /* Clear old maps */ | 1496 | /* Clear old maps */ |
1492 | error = -ENOMEM; | 1497 | error = -ENOMEM; |
1493 | munmap_back: | 1498 | munmap_back: |
1494 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 1499 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
1495 | if (do_munmap(mm, addr, len)) | 1500 | if (do_munmap(mm, addr, len)) |
1496 | return -ENOMEM; | 1501 | return -ENOMEM; |
1497 | goto munmap_back; | 1502 | goto munmap_back; |
1498 | } | 1503 | } |
1499 | 1504 | ||
1500 | /* | 1505 | /* |
1501 | * Private writable mapping: check memory availability | 1506 | * Private writable mapping: check memory availability |
1502 | */ | 1507 | */ |
1503 | if (accountable_mapping(file, vm_flags)) { | 1508 | if (accountable_mapping(file, vm_flags)) { |
1504 | charged = len >> PAGE_SHIFT; | 1509 | charged = len >> PAGE_SHIFT; |
1505 | if (security_vm_enough_memory_mm(mm, charged)) | 1510 | if (security_vm_enough_memory_mm(mm, charged)) |
1506 | return -ENOMEM; | 1511 | return -ENOMEM; |
1507 | vm_flags |= VM_ACCOUNT; | 1512 | vm_flags |= VM_ACCOUNT; |
1508 | } | 1513 | } |
1509 | 1514 | ||
1510 | /* | 1515 | /* |
1511 | * Can we just expand an old mapping? | 1516 | * Can we just expand an old mapping? |
1512 | */ | 1517 | */ |
1513 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); | 1518 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); |
1514 | if (vma) | 1519 | if (vma) |
1515 | goto out; | 1520 | goto out; |
1516 | 1521 | ||
1517 | /* | 1522 | /* |
1518 | * Determine the object being mapped and call the appropriate | 1523 | * Determine the object being mapped and call the appropriate |
1519 | * specific mapper. the address has already been validated, but | 1524 | * specific mapper. the address has already been validated, but |
1520 | * not unmapped, but the maps are removed from the list. | 1525 | * not unmapped, but the maps are removed from the list. |
1521 | */ | 1526 | */ |
1522 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 1527 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
1523 | if (!vma) { | 1528 | if (!vma) { |
1524 | error = -ENOMEM; | 1529 | error = -ENOMEM; |
1525 | goto unacct_error; | 1530 | goto unacct_error; |
1526 | } | 1531 | } |
1527 | 1532 | ||
1528 | vma->vm_mm = mm; | 1533 | vma->vm_mm = mm; |
1529 | vma->vm_start = addr; | 1534 | vma->vm_start = addr; |
1530 | vma->vm_end = addr + len; | 1535 | vma->vm_end = addr + len; |
1531 | vma->vm_flags = vm_flags; | 1536 | vma->vm_flags = vm_flags; |
1532 | vma->vm_page_prot = vm_get_page_prot(vm_flags); | 1537 | vma->vm_page_prot = vm_get_page_prot(vm_flags); |
1533 | vma->vm_pgoff = pgoff; | 1538 | vma->vm_pgoff = pgoff; |
1534 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 1539 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
1535 | 1540 | ||
1536 | error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ | 1541 | error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ |
1537 | 1542 | ||
1538 | if (file) { | 1543 | if (file) { |
1539 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) | 1544 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) |
1540 | goto free_vma; | 1545 | goto free_vma; |
1541 | if (vm_flags & VM_DENYWRITE) { | 1546 | if (vm_flags & VM_DENYWRITE) { |
1542 | error = deny_write_access(file); | 1547 | error = deny_write_access(file); |
1543 | if (error) | 1548 | if (error) |
1544 | goto free_vma; | 1549 | goto free_vma; |
1545 | correct_wcount = 1; | 1550 | correct_wcount = 1; |
1546 | } | 1551 | } |
1547 | vma->vm_file = get_file(file); | 1552 | vma->vm_file = get_file(file); |
1548 | error = file->f_op->mmap(file, vma); | 1553 | error = file->f_op->mmap(file, vma); |
1549 | if (error) | 1554 | if (error) |
1550 | goto unmap_and_free_vma; | 1555 | goto unmap_and_free_vma; |
1551 | 1556 | ||
1552 | /* Can addr have changed?? | 1557 | /* Can addr have changed?? |
1553 | * | 1558 | * |
1554 | * Answer: Yes, several device drivers can do it in their | 1559 | * Answer: Yes, several device drivers can do it in their |
1555 | * f_op->mmap method. -DaveM | 1560 | * f_op->mmap method. -DaveM |
1556 | * Bug: If addr is changed, prev, rb_link, rb_parent should | 1561 | * Bug: If addr is changed, prev, rb_link, rb_parent should |
1557 | * be updated for vma_link() | 1562 | * be updated for vma_link() |
1558 | */ | 1563 | */ |
1559 | WARN_ON_ONCE(addr != vma->vm_start); | 1564 | WARN_ON_ONCE(addr != vma->vm_start); |
1560 | 1565 | ||
1561 | addr = vma->vm_start; | 1566 | addr = vma->vm_start; |
1562 | pgoff = vma->vm_pgoff; | 1567 | pgoff = vma->vm_pgoff; |
1563 | vm_flags = vma->vm_flags; | 1568 | vm_flags = vma->vm_flags; |
1564 | } else if (vm_flags & VM_SHARED) { | 1569 | } else if (vm_flags & VM_SHARED) { |
1565 | if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) | 1570 | if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) |
1566 | goto free_vma; | 1571 | goto free_vma; |
1567 | error = shmem_zero_setup(vma); | 1572 | error = shmem_zero_setup(vma); |
1568 | if (error) | 1573 | if (error) |
1569 | goto free_vma; | 1574 | goto free_vma; |
1570 | } | 1575 | } |
1571 | 1576 | ||
1572 | if (vma_wants_writenotify(vma)) { | 1577 | if (vma_wants_writenotify(vma)) { |
1573 | pgprot_t pprot = vma->vm_page_prot; | 1578 | pgprot_t pprot = vma->vm_page_prot; |
1574 | 1579 | ||
1575 | /* Can vma->vm_page_prot have changed?? | 1580 | /* Can vma->vm_page_prot have changed?? |
1576 | * | 1581 | * |
1577 | * Answer: Yes, drivers may have changed it in their | 1582 | * Answer: Yes, drivers may have changed it in their |
1578 | * f_op->mmap method. | 1583 | * f_op->mmap method. |
1579 | * | 1584 | * |
1580 | * Ensures that vmas marked as uncached stay that way. | 1585 | * Ensures that vmas marked as uncached stay that way. |
1581 | */ | 1586 | */ |
1582 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1587 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
1583 | if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) | 1588 | if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) |
1584 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 1589 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
1585 | } | 1590 | } |
1586 | 1591 | ||
1587 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1592 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1588 | file = vma->vm_file; | 1593 | file = vma->vm_file; |
1589 | 1594 | ||
1590 | /* Once vma denies write, undo our temporary denial count */ | 1595 | /* Once vma denies write, undo our temporary denial count */ |
1591 | if (correct_wcount) | 1596 | if (correct_wcount) |
1592 | atomic_inc(&inode->i_writecount); | 1597 | atomic_inc(&inode->i_writecount); |
1593 | out: | 1598 | out: |
1594 | perf_event_mmap(vma); | 1599 | perf_event_mmap(vma); |
1595 | 1600 | ||
1596 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1601 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1597 | if (vm_flags & VM_LOCKED) { | 1602 | if (vm_flags & VM_LOCKED) { |
1598 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || | 1603 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || |
1599 | vma == get_gate_vma(current->mm))) | 1604 | vma == get_gate_vma(current->mm))) |
1600 | mm->locked_vm += (len >> PAGE_SHIFT); | 1605 | mm->locked_vm += (len >> PAGE_SHIFT); |
1601 | else | 1606 | else |
1602 | vma->vm_flags &= ~VM_LOCKED; | 1607 | vma->vm_flags &= ~VM_LOCKED; |
1603 | } | 1608 | } |
1604 | 1609 | ||
1605 | if (file) | 1610 | if (file) |
1606 | uprobe_mmap(vma); | 1611 | uprobe_mmap(vma); |
1607 | 1612 | ||
1608 | return addr; | 1613 | return addr; |
1609 | 1614 | ||
1610 | unmap_and_free_vma: | 1615 | unmap_and_free_vma: |
1611 | if (correct_wcount) | 1616 | if (correct_wcount) |
1612 | atomic_inc(&inode->i_writecount); | 1617 | atomic_inc(&inode->i_writecount); |
1613 | vma->vm_file = NULL; | 1618 | vma->vm_file = NULL; |
1614 | fput(file); | 1619 | fput(file); |
1615 | 1620 | ||
1616 | /* Undo any partial mapping done by a device driver. */ | 1621 | /* Undo any partial mapping done by a device driver. */ |
1617 | unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); | 1622 | unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); |
1618 | charged = 0; | 1623 | charged = 0; |
1619 | free_vma: | 1624 | free_vma: |
1620 | kmem_cache_free(vm_area_cachep, vma); | 1625 | kmem_cache_free(vm_area_cachep, vma); |
1621 | unacct_error: | 1626 | unacct_error: |
1622 | if (charged) | 1627 | if (charged) |
1623 | vm_unacct_memory(charged); | 1628 | vm_unacct_memory(charged); |
1624 | return error; | 1629 | return error; |
1625 | } | 1630 | } |
1626 | 1631 | ||
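The "Clear old maps" loop at the top of mmap_region() is what makes MAP_FIXED destructive: anything overlapping the request is unmapped first, so the overlapped range comes back zero-filled. A small user-space illustration (not from this patch):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 'A';
	p[page] = 'B';

	/* Map over the first page only; the old contents there are discarded. */
	if (mmap(p, page, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
		return 1;
	printf("first page: %d, second page: '%c'\n", p[0], p[page]);	/* 0, 'B' */
	return 0;
}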
1627 | unsigned long unmapped_area(struct vm_unmapped_area_info *info) | 1632 | unsigned long unmapped_area(struct vm_unmapped_area_info *info) |
1628 | { | 1633 | { |
1629 | /* | 1634 | /* |
1630 | * We implement the search by looking for an rbtree node that | 1635 | * We implement the search by looking for an rbtree node that |
1631 | * immediately follows a suitable gap. That is, | 1636 | * immediately follows a suitable gap. That is, |
1632 | * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; | 1637 | * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; |
1633 | * - gap_end = vma->vm_start >= info->low_limit + length; | 1638 | * - gap_end = vma->vm_start >= info->low_limit + length; |
1634 | * - gap_end - gap_start >= length | 1639 | * - gap_end - gap_start >= length |
1635 | */ | 1640 | */ |
1636 | 1641 | ||
1637 | struct mm_struct *mm = current->mm; | 1642 | struct mm_struct *mm = current->mm; |
1638 | struct vm_area_struct *vma; | 1643 | struct vm_area_struct *vma; |
1639 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | 1644 | unsigned long length, low_limit, high_limit, gap_start, gap_end; |
1640 | 1645 | ||
1641 | /* Adjust search length to account for worst case alignment overhead */ | 1646 | /* Adjust search length to account for worst case alignment overhead */ |
1642 | length = info->length + info->align_mask; | 1647 | length = info->length + info->align_mask; |
1643 | if (length < info->length) | 1648 | if (length < info->length) |
1644 | return -ENOMEM; | 1649 | return -ENOMEM; |
1645 | 1650 | ||
1646 | /* Adjust search limits by the desired length */ | 1651 | /* Adjust search limits by the desired length */ |
1647 | if (info->high_limit < length) | 1652 | if (info->high_limit < length) |
1648 | return -ENOMEM; | 1653 | return -ENOMEM; |
1649 | high_limit = info->high_limit - length; | 1654 | high_limit = info->high_limit - length; |
1650 | 1655 | ||
1651 | if (info->low_limit > high_limit) | 1656 | if (info->low_limit > high_limit) |
1652 | return -ENOMEM; | 1657 | return -ENOMEM; |
1653 | low_limit = info->low_limit + length; | 1658 | low_limit = info->low_limit + length; |
1654 | 1659 | ||
1655 | /* Check if rbtree root looks promising */ | 1660 | /* Check if rbtree root looks promising */ |
1656 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | 1661 | if (RB_EMPTY_ROOT(&mm->mm_rb)) |
1657 | goto check_highest; | 1662 | goto check_highest; |
1658 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | 1663 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); |
1659 | if (vma->rb_subtree_gap < length) | 1664 | if (vma->rb_subtree_gap < length) |
1660 | goto check_highest; | 1665 | goto check_highest; |
1661 | 1666 | ||
1662 | while (true) { | 1667 | while (true) { |
1663 | /* Visit left subtree if it looks promising */ | 1668 | /* Visit left subtree if it looks promising */ |
1664 | gap_end = vma->vm_start; | 1669 | gap_end = vma->vm_start; |
1665 | if (gap_end >= low_limit && vma->vm_rb.rb_left) { | 1670 | if (gap_end >= low_limit && vma->vm_rb.rb_left) { |
1666 | struct vm_area_struct *left = | 1671 | struct vm_area_struct *left = |
1667 | rb_entry(vma->vm_rb.rb_left, | 1672 | rb_entry(vma->vm_rb.rb_left, |
1668 | struct vm_area_struct, vm_rb); | 1673 | struct vm_area_struct, vm_rb); |
1669 | if (left->rb_subtree_gap >= length) { | 1674 | if (left->rb_subtree_gap >= length) { |
1670 | vma = left; | 1675 | vma = left; |
1671 | continue; | 1676 | continue; |
1672 | } | 1677 | } |
1673 | } | 1678 | } |
1674 | 1679 | ||
1675 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | 1680 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; |
1676 | check_current: | 1681 | check_current: |
1677 | /* Check if current node has a suitable gap */ | 1682 | /* Check if current node has a suitable gap */ |
1678 | if (gap_start > high_limit) | 1683 | if (gap_start > high_limit) |
1679 | return -ENOMEM; | 1684 | return -ENOMEM; |
1680 | if (gap_end >= low_limit && gap_end - gap_start >= length) | 1685 | if (gap_end >= low_limit && gap_end - gap_start >= length) |
1681 | goto found; | 1686 | goto found; |
1682 | 1687 | ||
1683 | /* Visit right subtree if it looks promising */ | 1688 | /* Visit right subtree if it looks promising */ |
1684 | if (vma->vm_rb.rb_right) { | 1689 | if (vma->vm_rb.rb_right) { |
1685 | struct vm_area_struct *right = | 1690 | struct vm_area_struct *right = |
1686 | rb_entry(vma->vm_rb.rb_right, | 1691 | rb_entry(vma->vm_rb.rb_right, |
1687 | struct vm_area_struct, vm_rb); | 1692 | struct vm_area_struct, vm_rb); |
1688 | if (right->rb_subtree_gap >= length) { | 1693 | if (right->rb_subtree_gap >= length) { |
1689 | vma = right; | 1694 | vma = right; |
1690 | continue; | 1695 | continue; |
1691 | } | 1696 | } |
1692 | } | 1697 | } |
1693 | 1698 | ||
1694 | /* Go back up the rbtree to find next candidate node */ | 1699 | /* Go back up the rbtree to find next candidate node */ |
1695 | while (true) { | 1700 | while (true) { |
1696 | struct rb_node *prev = &vma->vm_rb; | 1701 | struct rb_node *prev = &vma->vm_rb; |
1697 | if (!rb_parent(prev)) | 1702 | if (!rb_parent(prev)) |
1698 | goto check_highest; | 1703 | goto check_highest; |
1699 | vma = rb_entry(rb_parent(prev), | 1704 | vma = rb_entry(rb_parent(prev), |
1700 | struct vm_area_struct, vm_rb); | 1705 | struct vm_area_struct, vm_rb); |
1701 | if (prev == vma->vm_rb.rb_left) { | 1706 | if (prev == vma->vm_rb.rb_left) { |
1702 | gap_start = vma->vm_prev->vm_end; | 1707 | gap_start = vma->vm_prev->vm_end; |
1703 | gap_end = vma->vm_start; | 1708 | gap_end = vma->vm_start; |
1704 | goto check_current; | 1709 | goto check_current; |
1705 | } | 1710 | } |
1706 | } | 1711 | } |
1707 | } | 1712 | } |
1708 | 1713 | ||
1709 | check_highest: | 1714 | check_highest: |
1710 | /* Check highest gap, which does not precede any rbtree node */ | 1715 | /* Check highest gap, which does not precede any rbtree node */ |
1711 | gap_start = mm->highest_vm_end; | 1716 | gap_start = mm->highest_vm_end; |
1712 | gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ | 1717 | gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ |
1713 | if (gap_start > high_limit) | 1718 | if (gap_start > high_limit) |
1714 | return -ENOMEM; | 1719 | return -ENOMEM; |
1715 | 1720 | ||
1716 | found: | 1721 | found: |
1717 | /* We found a suitable gap. Clip it with the original low_limit. */ | 1722 | /* We found a suitable gap. Clip it with the original low_limit. */ |
1718 | if (gap_start < info->low_limit) | 1723 | if (gap_start < info->low_limit) |
1719 | gap_start = info->low_limit; | 1724 | gap_start = info->low_limit; |
1720 | 1725 | ||
1721 | /* Adjust gap address to the desired alignment */ | 1726 | /* Adjust gap address to the desired alignment */ |
1722 | gap_start += (info->align_offset - gap_start) & info->align_mask; | 1727 | gap_start += (info->align_offset - gap_start) & info->align_mask; |
1723 | 1728 | ||
1724 | VM_BUG_ON(gap_start + info->length > info->high_limit); | 1729 | VM_BUG_ON(gap_start + info->length > info->high_limit); |
1725 | VM_BUG_ON(gap_start + info->length > gap_end); | 1730 | VM_BUG_ON(gap_start + info->length > gap_end); |
1726 | return gap_start; | 1731 | return gap_start; |
1727 | } | 1732 | } |
1728 | 1733 | ||
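The alignment arithmetic at the end of unmapped_area() is compact: the search uses a worst-case padded length (info->length + info->align_mask), then rounds the chosen gap_start up to the requested alignment. A minimal standalone sketch (plain userspace C with made-up values, not kernel code) shows why the rounded-up start always still leaves room for the real length:

/* Standalone sketch (not kernel code): verify the round-up arithmetic
 * used by unmapped_area().  All values below are invented for the demo. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long align_mask = 0x1fffff;   /* e.g. 2 MiB alignment - 1 */
	unsigned long align_offset = 0;
	unsigned long len = 0x300000;          /* requested length */

	/* Worst-case padded length, as in the kernel search. */
	unsigned long length = len + align_mask;

	/* Pretend the search found a gap [gap_start, gap_end) of that size. */
	unsigned long gap_start = 0x7f1234567000UL;
	unsigned long gap_end = gap_start + length;

	/* Round gap_start up to the requested alignment/offset. */
	gap_start += (align_offset - gap_start) & align_mask;

	assert(((gap_start - align_offset) & align_mask) == 0); /* aligned */
	assert(gap_start + len <= gap_end);                     /* still fits */
	printf("aligned start: %#lx\n", gap_start);
	return 0;
}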
1729 | unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) | 1734 | unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) |
1730 | { | 1735 | { |
1731 | struct mm_struct *mm = current->mm; | 1736 | struct mm_struct *mm = current->mm; |
1732 | struct vm_area_struct *vma; | 1737 | struct vm_area_struct *vma; |
1733 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | 1738 | unsigned long length, low_limit, high_limit, gap_start, gap_end; |
1734 | 1739 | ||
1735 | /* Adjust search length to account for worst case alignment overhead */ | 1740 | /* Adjust search length to account for worst case alignment overhead */ |
1736 | length = info->length + info->align_mask; | 1741 | length = info->length + info->align_mask; |
1737 | if (length < info->length) | 1742 | if (length < info->length) |
1738 | return -ENOMEM; | 1743 | return -ENOMEM; |
1739 | 1744 | ||
1740 | /* | 1745 | /* |
1741 | * Adjust search limits by the desired length. | 1746 | * Adjust search limits by the desired length. |
1742 | * See implementation comment at top of unmapped_area(). | 1747 | * See implementation comment at top of unmapped_area(). |
1743 | */ | 1748 | */ |
1744 | gap_end = info->high_limit; | 1749 | gap_end = info->high_limit; |
1745 | if (gap_end < length) | 1750 | if (gap_end < length) |
1746 | return -ENOMEM; | 1751 | return -ENOMEM; |
1747 | high_limit = gap_end - length; | 1752 | high_limit = gap_end - length; |
1748 | 1753 | ||
1749 | if (info->low_limit > high_limit) | 1754 | if (info->low_limit > high_limit) |
1750 | return -ENOMEM; | 1755 | return -ENOMEM; |
1751 | low_limit = info->low_limit + length; | 1756 | low_limit = info->low_limit + length; |
1752 | 1757 | ||
1753 | /* Check highest gap, which does not precede any rbtree node */ | 1758 | /* Check highest gap, which does not precede any rbtree node */ |
1754 | gap_start = mm->highest_vm_end; | 1759 | gap_start = mm->highest_vm_end; |
1755 | if (gap_start <= high_limit) | 1760 | if (gap_start <= high_limit) |
1756 | goto found_highest; | 1761 | goto found_highest; |
1757 | 1762 | ||
1758 | /* Check if rbtree root looks promising */ | 1763 | /* Check if rbtree root looks promising */ |
1759 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | 1764 | if (RB_EMPTY_ROOT(&mm->mm_rb)) |
1760 | return -ENOMEM; | 1765 | return -ENOMEM; |
1761 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | 1766 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); |
1762 | if (vma->rb_subtree_gap < length) | 1767 | if (vma->rb_subtree_gap < length) |
1763 | return -ENOMEM; | 1768 | return -ENOMEM; |
1764 | 1769 | ||
1765 | while (true) { | 1770 | while (true) { |
1766 | /* Visit right subtree if it looks promising */ | 1771 | /* Visit right subtree if it looks promising */ |
1767 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | 1772 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; |
1768 | if (gap_start <= high_limit && vma->vm_rb.rb_right) { | 1773 | if (gap_start <= high_limit && vma->vm_rb.rb_right) { |
1769 | struct vm_area_struct *right = | 1774 | struct vm_area_struct *right = |
1770 | rb_entry(vma->vm_rb.rb_right, | 1775 | rb_entry(vma->vm_rb.rb_right, |
1771 | struct vm_area_struct, vm_rb); | 1776 | struct vm_area_struct, vm_rb); |
1772 | if (right->rb_subtree_gap >= length) { | 1777 | if (right->rb_subtree_gap >= length) { |
1773 | vma = right; | 1778 | vma = right; |
1774 | continue; | 1779 | continue; |
1775 | } | 1780 | } |
1776 | } | 1781 | } |
1777 | 1782 | ||
1778 | check_current: | 1783 | check_current: |
1779 | /* Check if current node has a suitable gap */ | 1784 | /* Check if current node has a suitable gap */ |
1780 | gap_end = vma->vm_start; | 1785 | gap_end = vma->vm_start; |
1781 | if (gap_end < low_limit) | 1786 | if (gap_end < low_limit) |
1782 | return -ENOMEM; | 1787 | return -ENOMEM; |
1783 | if (gap_start <= high_limit && gap_end - gap_start >= length) | 1788 | if (gap_start <= high_limit && gap_end - gap_start >= length) |
1784 | goto found; | 1789 | goto found; |
1785 | 1790 | ||
1786 | /* Visit left subtree if it looks promising */ | 1791 | /* Visit left subtree if it looks promising */ |
1787 | if (vma->vm_rb.rb_left) { | 1792 | if (vma->vm_rb.rb_left) { |
1788 | struct vm_area_struct *left = | 1793 | struct vm_area_struct *left = |
1789 | rb_entry(vma->vm_rb.rb_left, | 1794 | rb_entry(vma->vm_rb.rb_left, |
1790 | struct vm_area_struct, vm_rb); | 1795 | struct vm_area_struct, vm_rb); |
1791 | if (left->rb_subtree_gap >= length) { | 1796 | if (left->rb_subtree_gap >= length) { |
1792 | vma = left; | 1797 | vma = left; |
1793 | continue; | 1798 | continue; |
1794 | } | 1799 | } |
1795 | } | 1800 | } |
1796 | 1801 | ||
1797 | /* Go back up the rbtree to find next candidate node */ | 1802 | /* Go back up the rbtree to find next candidate node */ |
1798 | while (true) { | 1803 | while (true) { |
1799 | struct rb_node *prev = &vma->vm_rb; | 1804 | struct rb_node *prev = &vma->vm_rb; |
1800 | if (!rb_parent(prev)) | 1805 | if (!rb_parent(prev)) |
1801 | return -ENOMEM; | 1806 | return -ENOMEM; |
1802 | vma = rb_entry(rb_parent(prev), | 1807 | vma = rb_entry(rb_parent(prev), |
1803 | struct vm_area_struct, vm_rb); | 1808 | struct vm_area_struct, vm_rb); |
1804 | if (prev == vma->vm_rb.rb_right) { | 1809 | if (prev == vma->vm_rb.rb_right) { |
1805 | gap_start = vma->vm_prev ? | 1810 | gap_start = vma->vm_prev ? |
1806 | vma->vm_prev->vm_end : 0; | 1811 | vma->vm_prev->vm_end : 0; |
1807 | goto check_current; | 1812 | goto check_current; |
1808 | } | 1813 | } |
1809 | } | 1814 | } |
1810 | } | 1815 | } |
1811 | 1816 | ||
1812 | found: | 1817 | found: |
1813 | /* We found a suitable gap. Clip it with the original high_limit. */ | 1818 | /* We found a suitable gap. Clip it with the original high_limit. */ |
1814 | if (gap_end > info->high_limit) | 1819 | if (gap_end > info->high_limit) |
1815 | gap_end = info->high_limit; | 1820 | gap_end = info->high_limit; |
1816 | 1821 | ||
1817 | found_highest: | 1822 | found_highest: |
1818 | /* Compute highest gap address at the desired alignment */ | 1823 | /* Compute highest gap address at the desired alignment */ |
1819 | gap_end -= info->length; | 1824 | gap_end -= info->length; |
1820 | gap_end -= (gap_end - info->align_offset) & info->align_mask; | 1825 | gap_end -= (gap_end - info->align_offset) & info->align_mask; |
1821 | 1826 | ||
1822 | VM_BUG_ON(gap_end < info->low_limit); | 1827 | VM_BUG_ON(gap_end < info->low_limit); |
1823 | VM_BUG_ON(gap_end < gap_start); | 1828 | VM_BUG_ON(gap_end < gap_start); |
1824 | return gap_end; | 1829 | return gap_end; |
1825 | } | 1830 | } |
1826 | 1831 | ||
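The top-down variant ends with the mirror-image adjustment at found_highest: subtract the length, then round the candidate *down* to the alignment. The same kind of standalone sketch (userspace C, invented values, not kernel code) shows the rounded-down start never drops below the gap:

/* Standalone sketch (not kernel code): the top-down rounding step. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long align_mask = 0xffff;     /* 64 KiB alignment - 1 */
	unsigned long align_offset = 0;
	unsigned long len = 0x30000;

	/* A gap big enough for the padded length, as the search guarantees. */
	unsigned long gap_start = 0x7f0000012000UL;
	unsigned long gap_end = gap_start + len + align_mask;

	gap_end -= len;                                   /* highest possible start */
	gap_end -= (gap_end - align_offset) & align_mask; /* round down to alignment */

	assert(((gap_end - align_offset) & align_mask) == 0);
	assert(gap_end >= gap_start);
	printf("top-down aligned start: %#lx\n", gap_end);
	return 0;
}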
1827 | /* Get an address range which is currently unmapped. | 1832 | /* Get an address range which is currently unmapped. |
1828 | * For shmat() with addr=0. | 1833 | * For shmat() with addr=0. |
1829 | * | 1834 | * |
1830 | * Ugly calling convention alert: | 1835 | * Ugly calling convention alert: |
1831 | * Return value with the low bits set means error value, | 1836 | * Return value with the low bits set means error value, |
1832 | * ie | 1837 | * ie |
1833 | * if (ret & ~PAGE_MASK) | 1838 | * if (ret & ~PAGE_MASK) |
1834 | * error = ret; | 1839 | * error = ret; |
1835 | * | 1840 | * |
1836 | * This function "knows" that -ENOMEM has the bits set. | 1841 | * This function "knows" that -ENOMEM has the bits set. |
1837 | */ | 1842 | */ |
1838 | #ifndef HAVE_ARCH_UNMAPPED_AREA | 1843 | #ifndef HAVE_ARCH_UNMAPPED_AREA |
1839 | unsigned long | 1844 | unsigned long |
1840 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | 1845 | arch_get_unmapped_area(struct file *filp, unsigned long addr, |
1841 | unsigned long len, unsigned long pgoff, unsigned long flags) | 1846 | unsigned long len, unsigned long pgoff, unsigned long flags) |
1842 | { | 1847 | { |
1843 | struct mm_struct *mm = current->mm; | 1848 | struct mm_struct *mm = current->mm; |
1844 | struct vm_area_struct *vma; | 1849 | struct vm_area_struct *vma; |
1845 | struct vm_unmapped_area_info info; | 1850 | struct vm_unmapped_area_info info; |
1846 | 1851 | ||
1847 | if (len > TASK_SIZE) | 1852 | if (len > TASK_SIZE) |
1848 | return -ENOMEM; | 1853 | return -ENOMEM; |
1849 | 1854 | ||
1850 | if (flags & MAP_FIXED) | 1855 | if (flags & MAP_FIXED) |
1851 | return addr; | 1856 | return addr; |
1852 | 1857 | ||
1853 | if (addr) { | 1858 | if (addr) { |
1854 | addr = PAGE_ALIGN(addr); | 1859 | addr = PAGE_ALIGN(addr); |
1855 | vma = find_vma(mm, addr); | 1860 | vma = find_vma(mm, addr); |
1856 | if (TASK_SIZE - len >= addr && | 1861 | if (TASK_SIZE - len >= addr && |
1857 | (!vma || addr + len <= vma->vm_start)) | 1862 | (!vma || addr + len <= vma->vm_start)) |
1858 | return addr; | 1863 | return addr; |
1859 | } | 1864 | } |
1860 | 1865 | ||
1861 | info.flags = 0; | 1866 | info.flags = 0; |
1862 | info.length = len; | 1867 | info.length = len; |
1863 | info.low_limit = TASK_UNMAPPED_BASE; | 1868 | info.low_limit = TASK_UNMAPPED_BASE; |
1864 | info.high_limit = TASK_SIZE; | 1869 | info.high_limit = TASK_SIZE; |
1865 | info.align_mask = 0; | 1870 | info.align_mask = 0; |
1866 | return vm_unmapped_area(&info); | 1871 | return vm_unmapped_area(&info); |
1867 | } | 1872 | } |
1868 | #endif | 1873 | #endif |
1869 | 1874 | ||
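For context on how userspace reaches arch_get_unmapped_area(): without MAP_FIXED the addr argument to mmap() is only a hint, and the address the kernel hands back is always page aligned. A minimal sketch, assuming a 64-bit Linux system (the hint value is arbitrary, not kernel code):

/* Standalone sketch: the mmap() hint is advisory without MAP_FIXED. */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *hint = (void *)0x70000000000UL;       /* arbitrary hint (64-bit) */
	void *p = mmap(hint, 4 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("hint %p -> got %p (page aligned: %s)\n", hint, p,
	       ((unsigned long)p % page) ? "no" : "yes");
	munmap(p, 4 * page);
	return 0;
}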
1870 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | 1875 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) |
1871 | { | 1876 | { |
1872 | /* | 1877 | /* |
1873 | * Is this a new hole at the lowest possible address? | 1878 | * Is this a new hole at the lowest possible address? |
1874 | */ | 1879 | */ |
1875 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) | 1880 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) |
1876 | mm->free_area_cache = addr; | 1881 | mm->free_area_cache = addr; |
1877 | } | 1882 | } |
1878 | 1883 | ||
1879 | /* | 1884 | /* |
1880 | * This mmap-allocator allocates new areas top-down from below the | 1885 | * This mmap-allocator allocates new areas top-down from below the |
1881 | * stack's low limit (the base): | 1886 | * stack's low limit (the base): |
1882 | */ | 1887 | */ |
1883 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN | 1888 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN |
1884 | unsigned long | 1889 | unsigned long |
1885 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | 1890 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, |
1886 | const unsigned long len, const unsigned long pgoff, | 1891 | const unsigned long len, const unsigned long pgoff, |
1887 | const unsigned long flags) | 1892 | const unsigned long flags) |
1888 | { | 1893 | { |
1889 | struct vm_area_struct *vma; | 1894 | struct vm_area_struct *vma; |
1890 | struct mm_struct *mm = current->mm; | 1895 | struct mm_struct *mm = current->mm; |
1891 | unsigned long addr = addr0; | 1896 | unsigned long addr = addr0; |
1892 | struct vm_unmapped_area_info info; | 1897 | struct vm_unmapped_area_info info; |
1893 | 1898 | ||
1894 | /* requested length too big for entire address space */ | 1899 | /* requested length too big for entire address space */ |
1895 | if (len > TASK_SIZE) | 1900 | if (len > TASK_SIZE) |
1896 | return -ENOMEM; | 1901 | return -ENOMEM; |
1897 | 1902 | ||
1898 | if (flags & MAP_FIXED) | 1903 | if (flags & MAP_FIXED) |
1899 | return addr; | 1904 | return addr; |
1900 | 1905 | ||
1901 | /* requesting a specific address */ | 1906 | /* requesting a specific address */ |
1902 | if (addr) { | 1907 | if (addr) { |
1903 | addr = PAGE_ALIGN(addr); | 1908 | addr = PAGE_ALIGN(addr); |
1904 | vma = find_vma(mm, addr); | 1909 | vma = find_vma(mm, addr); |
1905 | if (TASK_SIZE - len >= addr && | 1910 | if (TASK_SIZE - len >= addr && |
1906 | (!vma || addr + len <= vma->vm_start)) | 1911 | (!vma || addr + len <= vma->vm_start)) |
1907 | return addr; | 1912 | return addr; |
1908 | } | 1913 | } |
1909 | 1914 | ||
1910 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; | 1915 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
1911 | info.length = len; | 1916 | info.length = len; |
1912 | info.low_limit = PAGE_SIZE; | 1917 | info.low_limit = PAGE_SIZE; |
1913 | info.high_limit = mm->mmap_base; | 1918 | info.high_limit = mm->mmap_base; |
1914 | info.align_mask = 0; | 1919 | info.align_mask = 0; |
1915 | addr = vm_unmapped_area(&info); | 1920 | addr = vm_unmapped_area(&info); |
1916 | 1921 | ||
1917 | /* | 1922 | /* |
1918 | * A failed mmap() very likely causes application failure, | 1923 | * A failed mmap() very likely causes application failure, |
1919 | * so fall back to the bottom-up function here. This scenario | 1924 | * so fall back to the bottom-up function here. This scenario |
1920 | * can happen with large stack limits and large mmap() | 1925 | * can happen with large stack limits and large mmap() |
1921 | * allocations. | 1926 | * allocations. |
1922 | */ | 1927 | */ |
1923 | if (addr & ~PAGE_MASK) { | 1928 | if (addr & ~PAGE_MASK) { |
1924 | VM_BUG_ON(addr != -ENOMEM); | 1929 | VM_BUG_ON(addr != -ENOMEM); |
1925 | info.flags = 0; | 1930 | info.flags = 0; |
1926 | info.low_limit = TASK_UNMAPPED_BASE; | 1931 | info.low_limit = TASK_UNMAPPED_BASE; |
1927 | info.high_limit = TASK_SIZE; | 1932 | info.high_limit = TASK_SIZE; |
1928 | addr = vm_unmapped_area(&info); | 1933 | addr = vm_unmapped_area(&info); |
1929 | } | 1934 | } |
1930 | 1935 | ||
1931 | return addr; | 1936 | return addr; |
1932 | } | 1937 | } |
1933 | #endif | 1938 | #endif |
1934 | 1939 | ||
1935 | void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) | 1940 | void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) |
1936 | { | 1941 | { |
1937 | /* | 1942 | /* |
1938 | * Is this a new hole at the highest possible address? | 1943 | * Is this a new hole at the highest possible address? |
1939 | */ | 1944 | */ |
1940 | if (addr > mm->free_area_cache) | 1945 | if (addr > mm->free_area_cache) |
1941 | mm->free_area_cache = addr; | 1946 | mm->free_area_cache = addr; |
1942 | 1947 | ||
1943 | /* dont allow allocations above current base */ | 1948 | /* dont allow allocations above current base */ |
1944 | if (mm->free_area_cache > mm->mmap_base) | 1949 | if (mm->free_area_cache > mm->mmap_base) |
1945 | mm->free_area_cache = mm->mmap_base; | 1950 | mm->free_area_cache = mm->mmap_base; |
1946 | } | 1951 | } |
1947 | 1952 | ||
1948 | unsigned long | 1953 | unsigned long |
1949 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | 1954 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, |
1950 | unsigned long pgoff, unsigned long flags) | 1955 | unsigned long pgoff, unsigned long flags) |
1951 | { | 1956 | { |
1952 | unsigned long (*get_area)(struct file *, unsigned long, | 1957 | unsigned long (*get_area)(struct file *, unsigned long, |
1953 | unsigned long, unsigned long, unsigned long); | 1958 | unsigned long, unsigned long, unsigned long); |
1954 | 1959 | ||
1955 | unsigned long error = arch_mmap_check(addr, len, flags); | 1960 | unsigned long error = arch_mmap_check(addr, len, flags); |
1956 | if (error) | 1961 | if (error) |
1957 | return error; | 1962 | return error; |
1958 | 1963 | ||
1959 | /* Careful about overflows.. */ | 1964 | /* Careful about overflows.. */ |
1960 | if (len > TASK_SIZE) | 1965 | if (len > TASK_SIZE) |
1961 | return -ENOMEM; | 1966 | return -ENOMEM; |
1962 | 1967 | ||
1963 | get_area = current->mm->get_unmapped_area; | 1968 | get_area = current->mm->get_unmapped_area; |
1964 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1969 | if (file && file->f_op && file->f_op->get_unmapped_area) |
1965 | get_area = file->f_op->get_unmapped_area; | 1970 | get_area = file->f_op->get_unmapped_area; |
1966 | addr = get_area(file, addr, len, pgoff, flags); | 1971 | addr = get_area(file, addr, len, pgoff, flags); |
1967 | if (IS_ERR_VALUE(addr)) | 1972 | if (IS_ERR_VALUE(addr)) |
1968 | return addr; | 1973 | return addr; |
1969 | 1974 | ||
1970 | if (addr > TASK_SIZE - len) | 1975 | if (addr > TASK_SIZE - len) |
1971 | return -ENOMEM; | 1976 | return -ENOMEM; |
1972 | if (addr & ~PAGE_MASK) | 1977 | if (addr & ~PAGE_MASK) |
1973 | return -EINVAL; | 1978 | return -EINVAL; |
1974 | 1979 | ||
1975 | addr = arch_rebalance_pgtables(addr, len); | 1980 | addr = arch_rebalance_pgtables(addr, len); |
1976 | error = security_mmap_addr(addr); | 1981 | error = security_mmap_addr(addr); |
1977 | return error ? error : addr; | 1982 | return error ? error : addr; |
1978 | } | 1983 | } |
1979 | 1984 | ||
1980 | EXPORT_SYMBOL(get_unmapped_area); | 1985 | EXPORT_SYMBOL(get_unmapped_area); |
1981 | 1986 | ||
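The "low bits set means error value" convention documented above get_unmapped_area() can be checked directly: a negative errno cast to unsigned long always has bits below the page mask set, while any legitimate return address is page aligned. A standalone sketch (not kernel code; the example address assumes 64-bit):

/* Standalone sketch: telling an encoded error from a valid address. */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long page_mask = ~((unsigned long)sysconf(_SC_PAGESIZE) - 1);
	unsigned long good = 0x7f0000000000UL;        /* page-aligned address */
	unsigned long bad  = (unsigned long)-ENOMEM;  /* error encoded as address */

	printf("good & ~PAGE_MASK = %#lx (no error)\n", good & ~page_mask);
	printf("bad  & ~PAGE_MASK = %#lx (error %ld)\n",
	       bad & ~page_mask, (long)bad);
	return 0;
}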
1982 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1987 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1983 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 1988 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1984 | { | 1989 | { |
1985 | struct vm_area_struct *vma = NULL; | 1990 | struct vm_area_struct *vma = NULL; |
1986 | 1991 | ||
1987 | /* Check the cache first. */ | 1992 | /* Check the cache first. */ |
1988 | /* (Cache hit rate is typically around 35%.) */ | 1993 | /* (Cache hit rate is typically around 35%.) */ |
1989 | vma = ACCESS_ONCE(mm->mmap_cache); | 1994 | vma = ACCESS_ONCE(mm->mmap_cache); |
1990 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 1995 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { |
1991 | struct rb_node *rb_node; | 1996 | struct rb_node *rb_node; |
1992 | 1997 | ||
1993 | rb_node = mm->mm_rb.rb_node; | 1998 | rb_node = mm->mm_rb.rb_node; |
1994 | vma = NULL; | 1999 | vma = NULL; |
1995 | 2000 | ||
1996 | while (rb_node) { | 2001 | while (rb_node) { |
1997 | struct vm_area_struct *vma_tmp; | 2002 | struct vm_area_struct *vma_tmp; |
1998 | 2003 | ||
1999 | vma_tmp = rb_entry(rb_node, | 2004 | vma_tmp = rb_entry(rb_node, |
2000 | struct vm_area_struct, vm_rb); | 2005 | struct vm_area_struct, vm_rb); |
2001 | 2006 | ||
2002 | if (vma_tmp->vm_end > addr) { | 2007 | if (vma_tmp->vm_end > addr) { |
2003 | vma = vma_tmp; | 2008 | vma = vma_tmp; |
2004 | if (vma_tmp->vm_start <= addr) | 2009 | if (vma_tmp->vm_start <= addr) |
2005 | break; | 2010 | break; |
2006 | rb_node = rb_node->rb_left; | 2011 | rb_node = rb_node->rb_left; |
2007 | } else | 2012 | } else |
2008 | rb_node = rb_node->rb_right; | 2013 | rb_node = rb_node->rb_right; |
2009 | } | 2014 | } |
2010 | if (vma) | 2015 | if (vma) |
2011 | mm->mmap_cache = vma; | 2016 | mm->mmap_cache = vma; |
2012 | } | 2017 | } |
2013 | return vma; | 2018 | return vma; |
2014 | } | 2019 | } |
2015 | 2020 | ||
2016 | EXPORT_SYMBOL(find_vma); | 2021 | EXPORT_SYMBOL(find_vma); |
2017 | 2022 | ||
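Note the find_vma() contract: it returns the first VMA whose vm_end lies above addr, which may well start *above* addr when the address falls in a hole. A toy userspace model of that contract (a plain sorted array instead of the rbtree; not kernel code):

/* Standalone sketch (not kernel code): the find_vma() lookup semantics. */
#include <stdio.h>

struct range { unsigned long start, end; };   /* [start, end) */

/* Return the first range with addr < end, or NULL if none. */
static const struct range *toy_find_vma(const struct range *r, int n,
					unsigned long addr)
{
	for (int i = 0; i < n; i++)
		if (addr < r[i].end)
			return &r[i];
	return NULL;
}

int main(void)
{
	const struct range maps[] = {
		{ 0x1000, 0x3000 },
		{ 0x8000, 0x9000 },
	};
	const struct range *r = toy_find_vma(maps, 2, 0x5000);

	/* 0x5000 is unmapped, so we get the *next* range, not NULL. */
	if (r)
		printf("found [%#lx, %#lx)\n", r->start, r->end);
	return 0;
}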
2018 | /* | 2023 | /* |
2019 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. | 2024 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. |
2020 | */ | 2025 | */ |
2021 | struct vm_area_struct * | 2026 | struct vm_area_struct * |
2022 | find_vma_prev(struct mm_struct *mm, unsigned long addr, | 2027 | find_vma_prev(struct mm_struct *mm, unsigned long addr, |
2023 | struct vm_area_struct **pprev) | 2028 | struct vm_area_struct **pprev) |
2024 | { | 2029 | { |
2025 | struct vm_area_struct *vma; | 2030 | struct vm_area_struct *vma; |
2026 | 2031 | ||
2027 | vma = find_vma(mm, addr); | 2032 | vma = find_vma(mm, addr); |
2028 | if (vma) { | 2033 | if (vma) { |
2029 | *pprev = vma->vm_prev; | 2034 | *pprev = vma->vm_prev; |
2030 | } else { | 2035 | } else { |
2031 | struct rb_node *rb_node = mm->mm_rb.rb_node; | 2036 | struct rb_node *rb_node = mm->mm_rb.rb_node; |
2032 | *pprev = NULL; | 2037 | *pprev = NULL; |
2033 | while (rb_node) { | 2038 | while (rb_node) { |
2034 | *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); | 2039 | *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
2035 | rb_node = rb_node->rb_right; | 2040 | rb_node = rb_node->rb_right; |
2036 | } | 2041 | } |
2037 | } | 2042 | } |
2038 | return vma; | 2043 | return vma; |
2039 | } | 2044 | } |
2040 | 2045 | ||
2041 | /* | 2046 | /* |
2042 | * Verify that the stack growth is acceptable and | 2047 | * Verify that the stack growth is acceptable and |
2043 | * update accounting. This is shared with both the | 2048 | * update accounting. This is shared with both the |
2044 | * grow-up and grow-down cases. | 2049 | * grow-up and grow-down cases. |
2045 | */ | 2050 | */ |
2046 | static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) | 2051 | static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) |
2047 | { | 2052 | { |
2048 | struct mm_struct *mm = vma->vm_mm; | 2053 | struct mm_struct *mm = vma->vm_mm; |
2049 | struct rlimit *rlim = current->signal->rlim; | 2054 | struct rlimit *rlim = current->signal->rlim; |
2050 | unsigned long new_start; | 2055 | unsigned long new_start; |
2051 | 2056 | ||
2052 | /* address space limit tests */ | 2057 | /* address space limit tests */ |
2053 | if (!may_expand_vm(mm, grow)) | 2058 | if (!may_expand_vm(mm, grow)) |
2054 | return -ENOMEM; | 2059 | return -ENOMEM; |
2055 | 2060 | ||
2056 | /* Stack limit test */ | 2061 | /* Stack limit test */ |
2057 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | 2062 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
2058 | return -ENOMEM; | 2063 | return -ENOMEM; |
2059 | 2064 | ||
2060 | /* mlock limit tests */ | 2065 | /* mlock limit tests */ |
2061 | if (vma->vm_flags & VM_LOCKED) { | 2066 | if (vma->vm_flags & VM_LOCKED) { |
2062 | unsigned long locked; | 2067 | unsigned long locked; |
2063 | unsigned long limit; | 2068 | unsigned long limit; |
2064 | locked = mm->locked_vm + grow; | 2069 | locked = mm->locked_vm + grow; |
2065 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); | 2070 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
2066 | limit >>= PAGE_SHIFT; | 2071 | limit >>= PAGE_SHIFT; |
2067 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 2072 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
2068 | return -ENOMEM; | 2073 | return -ENOMEM; |
2069 | } | 2074 | } |
2070 | 2075 | ||
2071 | /* Check to ensure the stack will not grow into a hugetlb-only region */ | 2076 | /* Check to ensure the stack will not grow into a hugetlb-only region */ |
2072 | new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : | 2077 | new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : |
2073 | vma->vm_end - size; | 2078 | vma->vm_end - size; |
2074 | if (is_hugepage_only_range(vma->vm_mm, new_start, size)) | 2079 | if (is_hugepage_only_range(vma->vm_mm, new_start, size)) |
2075 | return -EFAULT; | 2080 | return -EFAULT; |
2076 | 2081 | ||
2077 | /* | 2082 | /* |
2078 | * Overcommit.. This must be the final test, as it will | 2083 | * Overcommit.. This must be the final test, as it will |
2079 | * update security statistics. | 2084 | * update security statistics. |
2080 | */ | 2085 | */ |
2081 | if (security_vm_enough_memory_mm(mm, grow)) | 2086 | if (security_vm_enough_memory_mm(mm, grow)) |
2082 | return -ENOMEM; | 2087 | return -ENOMEM; |
2083 | 2088 | ||
2084 | /* Ok, everything looks good - let it rip */ | 2089 | /* Ok, everything looks good - let it rip */ |
2085 | if (vma->vm_flags & VM_LOCKED) | 2090 | if (vma->vm_flags & VM_LOCKED) |
2086 | mm->locked_vm += grow; | 2091 | mm->locked_vm += grow; |
2087 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 2092 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
2088 | return 0; | 2093 | return 0; |
2089 | } | 2094 | } |
2090 | 2095 | ||
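Both stack-growth paths funnel through acct_stack_growth(), and its first refusals mirror limits that are visible from userspace. A small sketch reading the same rlimits with getrlimit() (not kernel code):

/* Standalone sketch: the rlimits acct_stack_growth() compares against. */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit stack, memlock;

	if (getrlimit(RLIMIT_STACK, &stack) == 0)
		printf("RLIMIT_STACK:   soft %llu bytes\n",
		       (unsigned long long)stack.rlim_cur);
	if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0)
		printf("RLIMIT_MEMLOCK: soft %llu bytes\n",
		       (unsigned long long)memlock.rlim_cur);

	/* The kernel refuses growth when the new stack size exceeds
	 * RLIMIT_STACK, or (for VM_LOCKED stacks) when the locked page
	 * count would exceed RLIMIT_MEMLOCK >> PAGE_SHIFT without
	 * CAP_IPC_LOCK. */
	return 0;
}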
2091 | #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) | 2096 | #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) |
2092 | /* | 2097 | /* |
2093 | * PA-RISC uses this for its stack; IA64 for its Register Backing Store. | 2098 | * PA-RISC uses this for its stack; IA64 for its Register Backing Store. |
2094 | * vma is the last one with address > vma->vm_end. Have to extend vma. | 2099 | * vma is the last one with address > vma->vm_end. Have to extend vma. |
2095 | */ | 2100 | */ |
2096 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) | 2101 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) |
2097 | { | 2102 | { |
2098 | int error; | 2103 | int error; |
2099 | 2104 | ||
2100 | if (!(vma->vm_flags & VM_GROWSUP)) | 2105 | if (!(vma->vm_flags & VM_GROWSUP)) |
2101 | return -EFAULT; | 2106 | return -EFAULT; |
2102 | 2107 | ||
2103 | /* | 2108 | /* |
2104 | * We must make sure the anon_vma is allocated | 2109 | * We must make sure the anon_vma is allocated |
2105 | * so that the anon_vma locking is not a noop. | 2110 | * so that the anon_vma locking is not a noop. |
2106 | */ | 2111 | */ |
2107 | if (unlikely(anon_vma_prepare(vma))) | 2112 | if (unlikely(anon_vma_prepare(vma))) |
2108 | return -ENOMEM; | 2113 | return -ENOMEM; |
2109 | vma_lock_anon_vma(vma); | 2114 | vma_lock_anon_vma(vma); |
2110 | 2115 | ||
2111 | /* | 2116 | /* |
2112 | * vma->vm_start/vm_end cannot change under us because the caller | 2117 | * vma->vm_start/vm_end cannot change under us because the caller |
2113 | * is required to hold the mmap_sem in read mode. We need the | 2118 | * is required to hold the mmap_sem in read mode. We need the |
2114 | * anon_vma lock to serialize against concurrent expand_stacks. | 2119 | * anon_vma lock to serialize against concurrent expand_stacks. |
2115 | * Also guard against wrapping around to address 0. | 2120 | * Also guard against wrapping around to address 0. |
2116 | */ | 2121 | */ |
2117 | if (address < PAGE_ALIGN(address+4)) | 2122 | if (address < PAGE_ALIGN(address+4)) |
2118 | address = PAGE_ALIGN(address+4); | 2123 | address = PAGE_ALIGN(address+4); |
2119 | else { | 2124 | else { |
2120 | vma_unlock_anon_vma(vma); | 2125 | vma_unlock_anon_vma(vma); |
2121 | return -ENOMEM; | 2126 | return -ENOMEM; |
2122 | } | 2127 | } |
2123 | error = 0; | 2128 | error = 0; |
2124 | 2129 | ||
2125 | /* Somebody else might have raced and expanded it already */ | 2130 | /* Somebody else might have raced and expanded it already */ |
2126 | if (address > vma->vm_end) { | 2131 | if (address > vma->vm_end) { |
2127 | unsigned long size, grow; | 2132 | unsigned long size, grow; |
2128 | 2133 | ||
2129 | size = address - vma->vm_start; | 2134 | size = address - vma->vm_start; |
2130 | grow = (address - vma->vm_end) >> PAGE_SHIFT; | 2135 | grow = (address - vma->vm_end) >> PAGE_SHIFT; |
2131 | 2136 | ||
2132 | error = -ENOMEM; | 2137 | error = -ENOMEM; |
2133 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 2138 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
2134 | error = acct_stack_growth(vma, size, grow); | 2139 | error = acct_stack_growth(vma, size, grow); |
2135 | if (!error) { | 2140 | if (!error) { |
2136 | /* | 2141 | /* |
2137 | * vma_gap_update() doesn't support concurrent | 2142 | * vma_gap_update() doesn't support concurrent |
2138 | * updates, but we only hold a shared mmap_sem | 2143 | * updates, but we only hold a shared mmap_sem |
2139 | * lock here, so we need to protect against | 2144 | * lock here, so we need to protect against |
2140 | * concurrent vma expansions. | 2145 | * concurrent vma expansions. |
2141 | * vma_lock_anon_vma() doesn't help here, as | 2146 | * vma_lock_anon_vma() doesn't help here, as |
2142 | * we don't guarantee that all growable vmas | 2147 | * we don't guarantee that all growable vmas |
2143 | * in a mm share the same root anon vma. | 2148 | * in a mm share the same root anon vma. |
2144 | * So, we reuse mm->page_table_lock to guard | 2149 | * So, we reuse mm->page_table_lock to guard |
2145 | * against concurrent vma expansions. | 2150 | * against concurrent vma expansions. |
2146 | */ | 2151 | */ |
2147 | spin_lock(&vma->vm_mm->page_table_lock); | 2152 | spin_lock(&vma->vm_mm->page_table_lock); |
2148 | anon_vma_interval_tree_pre_update_vma(vma); | 2153 | anon_vma_interval_tree_pre_update_vma(vma); |
2149 | vma->vm_end = address; | 2154 | vma->vm_end = address; |
2150 | anon_vma_interval_tree_post_update_vma(vma); | 2155 | anon_vma_interval_tree_post_update_vma(vma); |
2151 | if (vma->vm_next) | 2156 | if (vma->vm_next) |
2152 | vma_gap_update(vma->vm_next); | 2157 | vma_gap_update(vma->vm_next); |
2153 | else | 2158 | else |
2154 | vma->vm_mm->highest_vm_end = address; | 2159 | vma->vm_mm->highest_vm_end = address; |
2155 | spin_unlock(&vma->vm_mm->page_table_lock); | 2160 | spin_unlock(&vma->vm_mm->page_table_lock); |
2156 | 2161 | ||
2157 | perf_event_mmap(vma); | 2162 | perf_event_mmap(vma); |
2158 | } | 2163 | } |
2159 | } | 2164 | } |
2160 | } | 2165 | } |
2161 | vma_unlock_anon_vma(vma); | 2166 | vma_unlock_anon_vma(vma); |
2162 | khugepaged_enter_vma_merge(vma); | 2167 | khugepaged_enter_vma_merge(vma); |
2163 | validate_mm(vma->vm_mm); | 2168 | validate_mm(vma->vm_mm); |
2164 | return error; | 2169 | return error; |
2165 | } | 2170 | } |
2166 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 2171 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
2167 | 2172 | ||
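The "guard against wrapping around to address 0" test in expand_upwards() works because PAGE_ALIGN(address + 4) wraps to a small value near the top of the address space, making the comparison fail. A standalone sketch of that arithmetic, assuming 4 KiB pages and a local PAGE_ALIGN macro (not the kernel's headers):

/* Standalone sketch (not kernel code): the wrap-around guard. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long ok   = 0x7ffffffff000UL;
	unsigned long wrap = (unsigned long)-2;     /* almost ULONG_MAX */

	printf("%#lx -> %#lx (no wrap, growth allowed)\n", ok, PAGE_ALIGN(ok + 4));
	printf("%#lx -> %#lx (wrapped, so the kernel bails out)\n",
	       wrap, PAGE_ALIGN(wrap + 4));
	return 0;
}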
2168 | /* | 2173 | /* |
2169 | * vma is the first one with address < vma->vm_start. Have to extend vma. | 2174 | * vma is the first one with address < vma->vm_start. Have to extend vma. |
2170 | */ | 2175 | */ |
2171 | int expand_downwards(struct vm_area_struct *vma, | 2176 | int expand_downwards(struct vm_area_struct *vma, |
2172 | unsigned long address) | 2177 | unsigned long address) |
2173 | { | 2178 | { |
2174 | int error; | 2179 | int error; |
2175 | 2180 | ||
2176 | /* | 2181 | /* |
2177 | * We must make sure the anon_vma is allocated | 2182 | * We must make sure the anon_vma is allocated |
2178 | * so that the anon_vma locking is not a noop. | 2183 | * so that the anon_vma locking is not a noop. |
2179 | */ | 2184 | */ |
2180 | if (unlikely(anon_vma_prepare(vma))) | 2185 | if (unlikely(anon_vma_prepare(vma))) |
2181 | return -ENOMEM; | 2186 | return -ENOMEM; |
2182 | 2187 | ||
2183 | address &= PAGE_MASK; | 2188 | address &= PAGE_MASK; |
2184 | error = security_mmap_addr(address); | 2189 | error = security_mmap_addr(address); |
2185 | if (error) | 2190 | if (error) |
2186 | return error; | 2191 | return error; |
2187 | 2192 | ||
2188 | vma_lock_anon_vma(vma); | 2193 | vma_lock_anon_vma(vma); |
2189 | 2194 | ||
2190 | /* | 2195 | /* |
2191 | * vma->vm_start/vm_end cannot change under us because the caller | 2196 | * vma->vm_start/vm_end cannot change under us because the caller |
2192 | * is required to hold the mmap_sem in read mode. We need the | 2197 | * is required to hold the mmap_sem in read mode. We need the |
2193 | * anon_vma lock to serialize against concurrent expand_stacks. | 2198 | * anon_vma lock to serialize against concurrent expand_stacks. |
2194 | */ | 2199 | */ |
2195 | 2200 | ||
2196 | /* Somebody else might have raced and expanded it already */ | 2201 | /* Somebody else might have raced and expanded it already */ |
2197 | if (address < vma->vm_start) { | 2202 | if (address < vma->vm_start) { |
2198 | unsigned long size, grow; | 2203 | unsigned long size, grow; |
2199 | 2204 | ||
2200 | size = vma->vm_end - address; | 2205 | size = vma->vm_end - address; |
2201 | grow = (vma->vm_start - address) >> PAGE_SHIFT; | 2206 | grow = (vma->vm_start - address) >> PAGE_SHIFT; |
2202 | 2207 | ||
2203 | error = -ENOMEM; | 2208 | error = -ENOMEM; |
2204 | if (grow <= vma->vm_pgoff) { | 2209 | if (grow <= vma->vm_pgoff) { |
2205 | error = acct_stack_growth(vma, size, grow); | 2210 | error = acct_stack_growth(vma, size, grow); |
2206 | if (!error) { | 2211 | if (!error) { |
2207 | /* | 2212 | /* |
2208 | * vma_gap_update() doesn't support concurrent | 2213 | * vma_gap_update() doesn't support concurrent |
2209 | * updates, but we only hold a shared mmap_sem | 2214 | * updates, but we only hold a shared mmap_sem |
2210 | * lock here, so we need to protect against | 2215 | * lock here, so we need to protect against |
2211 | * concurrent vma expansions. | 2216 | * concurrent vma expansions. |
2212 | * vma_lock_anon_vma() doesn't help here, as | 2217 | * vma_lock_anon_vma() doesn't help here, as |
2213 | * we don't guarantee that all growable vmas | 2218 | * we don't guarantee that all growable vmas |
2214 | * in a mm share the same root anon vma. | 2219 | * in a mm share the same root anon vma. |
2215 | * So, we reuse mm->page_table_lock to guard | 2220 | * So, we reuse mm->page_table_lock to guard |
2216 | * against concurrent vma expansions. | 2221 | * against concurrent vma expansions. |
2217 | */ | 2222 | */ |
2218 | spin_lock(&vma->vm_mm->page_table_lock); | 2223 | spin_lock(&vma->vm_mm->page_table_lock); |
2219 | anon_vma_interval_tree_pre_update_vma(vma); | 2224 | anon_vma_interval_tree_pre_update_vma(vma); |
2220 | vma->vm_start = address; | 2225 | vma->vm_start = address; |
2221 | vma->vm_pgoff -= grow; | 2226 | vma->vm_pgoff -= grow; |
2222 | anon_vma_interval_tree_post_update_vma(vma); | 2227 | anon_vma_interval_tree_post_update_vma(vma); |
2223 | vma_gap_update(vma); | 2228 | vma_gap_update(vma); |
2224 | spin_unlock(&vma->vm_mm->page_table_lock); | 2229 | spin_unlock(&vma->vm_mm->page_table_lock); |
2225 | 2230 | ||
2226 | perf_event_mmap(vma); | 2231 | perf_event_mmap(vma); |
2227 | } | 2232 | } |
2228 | } | 2233 | } |
2229 | } | 2234 | } |
2230 | vma_unlock_anon_vma(vma); | 2235 | vma_unlock_anon_vma(vma); |
2231 | khugepaged_enter_vma_merge(vma); | 2236 | khugepaged_enter_vma_merge(vma); |
2232 | validate_mm(vma->vm_mm); | 2237 | validate_mm(vma->vm_mm); |
2233 | return error; | 2238 | return error; |
2234 | } | 2239 | } |
2235 | 2240 | ||
2236 | /* | 2241 | /* |
2237 | * Note how expand_stack() refuses to expand the stack all the way to | 2242 | * Note how expand_stack() refuses to expand the stack all the way to |
2238 | * abut the next virtual mapping, *unless* that mapping itself is also | 2243 | * abut the next virtual mapping, *unless* that mapping itself is also |
2239 | * a stack mapping. We want to leave room for a guard page, after all | 2244 | * a stack mapping. We want to leave room for a guard page, after all |
2240 | * (the guard page itself is not added here, that is done by the | 2245 | * (the guard page itself is not added here, that is done by the |
2241 | * actual page faulting logic) | 2246 | * actual page faulting logic) |
2242 | * | 2247 | * |
2243 | * This matches the behavior of the guard page logic (see mm/memory.c: | 2248 | * This matches the behavior of the guard page logic (see mm/memory.c: |
2244 | * check_stack_guard_page()), which only allows the guard page to be | 2249 | * check_stack_guard_page()), which only allows the guard page to be |
2245 | * removed under these circumstances. | 2250 | * removed under these circumstances. |
2246 | */ | 2251 | */ |
2247 | #ifdef CONFIG_STACK_GROWSUP | 2252 | #ifdef CONFIG_STACK_GROWSUP |
2248 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 2253 | int expand_stack(struct vm_area_struct *vma, unsigned long address) |
2249 | { | 2254 | { |
2250 | struct vm_area_struct *next; | 2255 | struct vm_area_struct *next; |
2251 | 2256 | ||
2252 | address &= PAGE_MASK; | 2257 | address &= PAGE_MASK; |
2253 | next = vma->vm_next; | 2258 | next = vma->vm_next; |
2254 | if (next && next->vm_start == address + PAGE_SIZE) { | 2259 | if (next && next->vm_start == address + PAGE_SIZE) { |
2255 | if (!(next->vm_flags & VM_GROWSUP)) | 2260 | if (!(next->vm_flags & VM_GROWSUP)) |
2256 | return -ENOMEM; | 2261 | return -ENOMEM; |
2257 | } | 2262 | } |
2258 | return expand_upwards(vma, address); | 2263 | return expand_upwards(vma, address); |
2259 | } | 2264 | } |
2260 | 2265 | ||
2261 | struct vm_area_struct * | 2266 | struct vm_area_struct * |
2262 | find_extend_vma(struct mm_struct *mm, unsigned long addr) | 2267 | find_extend_vma(struct mm_struct *mm, unsigned long addr) |
2263 | { | 2268 | { |
2264 | struct vm_area_struct *vma, *prev; | 2269 | struct vm_area_struct *vma, *prev; |
2265 | 2270 | ||
2266 | addr &= PAGE_MASK; | 2271 | addr &= PAGE_MASK; |
2267 | vma = find_vma_prev(mm, addr, &prev); | 2272 | vma = find_vma_prev(mm, addr, &prev); |
2268 | if (vma && (vma->vm_start <= addr)) | 2273 | if (vma && (vma->vm_start <= addr)) |
2269 | return vma; | 2274 | return vma; |
2270 | if (!prev || expand_stack(prev, addr)) | 2275 | if (!prev || expand_stack(prev, addr)) |
2271 | return NULL; | 2276 | return NULL; |
2272 | if (prev->vm_flags & VM_LOCKED) | 2277 | if (prev->vm_flags & VM_LOCKED) |
2273 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); | 2278 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); |
2274 | return prev; | 2279 | return prev; |
2275 | } | 2280 | } |
2276 | #else | 2281 | #else |
2277 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 2282 | int expand_stack(struct vm_area_struct *vma, unsigned long address) |
2278 | { | 2283 | { |
2279 | struct vm_area_struct *prev; | 2284 | struct vm_area_struct *prev; |
2280 | 2285 | ||
2281 | address &= PAGE_MASK; | 2286 | address &= PAGE_MASK; |
2282 | prev = vma->vm_prev; | 2287 | prev = vma->vm_prev; |
2283 | if (prev && prev->vm_end == address) { | 2288 | if (prev && prev->vm_end == address) { |
2284 | if (!(prev->vm_flags & VM_GROWSDOWN)) | 2289 | if (!(prev->vm_flags & VM_GROWSDOWN)) |
2285 | return -ENOMEM; | 2290 | return -ENOMEM; |
2286 | } | 2291 | } |
2287 | return expand_downwards(vma, address); | 2292 | return expand_downwards(vma, address); |
2288 | } | 2293 | } |
2289 | 2294 | ||
2290 | struct vm_area_struct * | 2295 | struct vm_area_struct * |
2291 | find_extend_vma(struct mm_struct * mm, unsigned long addr) | 2296 | find_extend_vma(struct mm_struct * mm, unsigned long addr) |
2292 | { | 2297 | { |
2293 | struct vm_area_struct * vma; | 2298 | struct vm_area_struct * vma; |
2294 | unsigned long start; | 2299 | unsigned long start; |
2295 | 2300 | ||
2296 | addr &= PAGE_MASK; | 2301 | addr &= PAGE_MASK; |
2297 | vma = find_vma(mm,addr); | 2302 | vma = find_vma(mm,addr); |
2298 | if (!vma) | 2303 | if (!vma) |
2299 | return NULL; | 2304 | return NULL; |
2300 | if (vma->vm_start <= addr) | 2305 | if (vma->vm_start <= addr) |
2301 | return vma; | 2306 | return vma; |
2302 | if (!(vma->vm_flags & VM_GROWSDOWN)) | 2307 | if (!(vma->vm_flags & VM_GROWSDOWN)) |
2303 | return NULL; | 2308 | return NULL; |
2304 | start = vma->vm_start; | 2309 | start = vma->vm_start; |
2305 | if (expand_stack(vma, addr)) | 2310 | if (expand_stack(vma, addr)) |
2306 | return NULL; | 2311 | return NULL; |
2307 | if (vma->vm_flags & VM_LOCKED) | 2312 | if (vma->vm_flags & VM_LOCKED) |
2308 | __mlock_vma_pages_range(vma, addr, start, NULL); | 2313 | __mlock_vma_pages_range(vma, addr, start, NULL); |
2309 | return vma; | 2314 | return vma; |
2310 | } | 2315 | } |
2311 | #endif | 2316 | #endif |
2312 | 2317 | ||
2313 | /* | 2318 | /* |
2314 | * Ok - we have the memory areas we should free on the vma list, | 2319 | * Ok - we have the memory areas we should free on the vma list, |
2315 | * so release them, and do the vma updates. | 2320 | * so release them, and do the vma updates. |
2316 | * | 2321 | * |
2317 | * Called with the mm semaphore held. | 2322 | * Called with the mm semaphore held. |
2318 | */ | 2323 | */ |
2319 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 2324 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
2320 | { | 2325 | { |
2321 | unsigned long nr_accounted = 0; | 2326 | unsigned long nr_accounted = 0; |
2322 | 2327 | ||
2323 | /* Update high watermark before we lower total_vm */ | 2328 | /* Update high watermark before we lower total_vm */ |
2324 | update_hiwater_vm(mm); | 2329 | update_hiwater_vm(mm); |
2325 | do { | 2330 | do { |
2326 | long nrpages = vma_pages(vma); | 2331 | long nrpages = vma_pages(vma); |
2327 | 2332 | ||
2328 | if (vma->vm_flags & VM_ACCOUNT) | 2333 | if (vma->vm_flags & VM_ACCOUNT) |
2329 | nr_accounted += nrpages; | 2334 | nr_accounted += nrpages; |
2330 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 2335 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
2331 | vma = remove_vma(vma); | 2336 | vma = remove_vma(vma); |
2332 | } while (vma); | 2337 | } while (vma); |
2333 | vm_unacct_memory(nr_accounted); | 2338 | vm_unacct_memory(nr_accounted); |
2334 | validate_mm(mm); | 2339 | validate_mm(mm); |
2335 | } | 2340 | } |
2336 | 2341 | ||
2337 | /* | 2342 | /* |
2338 | * Get rid of page table information in the indicated region. | 2343 | * Get rid of page table information in the indicated region. |
2339 | * | 2344 | * |
2340 | * Called with the mm semaphore held. | 2345 | * Called with the mm semaphore held. |
2341 | */ | 2346 | */ |
2342 | static void unmap_region(struct mm_struct *mm, | 2347 | static void unmap_region(struct mm_struct *mm, |
2343 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 2348 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
2344 | unsigned long start, unsigned long end) | 2349 | unsigned long start, unsigned long end) |
2345 | { | 2350 | { |
2346 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | 2351 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; |
2347 | struct mmu_gather tlb; | 2352 | struct mmu_gather tlb; |
2348 | 2353 | ||
2349 | lru_add_drain(); | 2354 | lru_add_drain(); |
2350 | tlb_gather_mmu(&tlb, mm, 0); | 2355 | tlb_gather_mmu(&tlb, mm, 0); |
2351 | update_hiwater_rss(mm); | 2356 | update_hiwater_rss(mm); |
2352 | unmap_vmas(&tlb, vma, start, end); | 2357 | unmap_vmas(&tlb, vma, start, end); |
2353 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, | 2358 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, |
2354 | next ? next->vm_start : USER_PGTABLES_CEILING); | 2359 | next ? next->vm_start : USER_PGTABLES_CEILING); |
2355 | tlb_finish_mmu(&tlb, start, end); | 2360 | tlb_finish_mmu(&tlb, start, end); |
2356 | } | 2361 | } |
2357 | 2362 | ||
2358 | /* | 2363 | /* |
2359 | * Create a list of vma's touched by the unmap, removing them from the mm's | 2364 | * Create a list of vma's touched by the unmap, removing them from the mm's |
2360 | * vma list as we go.. | 2365 | * vma list as we go.. |
2361 | */ | 2366 | */ |
2362 | static void | 2367 | static void |
2363 | detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | 2368 | detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, |
2364 | struct vm_area_struct *prev, unsigned long end) | 2369 | struct vm_area_struct *prev, unsigned long end) |
2365 | { | 2370 | { |
2366 | struct vm_area_struct **insertion_point; | 2371 | struct vm_area_struct **insertion_point; |
2367 | struct vm_area_struct *tail_vma = NULL; | 2372 | struct vm_area_struct *tail_vma = NULL; |
2368 | unsigned long addr; | 2373 | unsigned long addr; |
2369 | 2374 | ||
2370 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); | 2375 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); |
2371 | vma->vm_prev = NULL; | 2376 | vma->vm_prev = NULL; |
2372 | do { | 2377 | do { |
2373 | vma_rb_erase(vma, &mm->mm_rb); | 2378 | vma_rb_erase(vma, &mm->mm_rb); |
2374 | mm->map_count--; | 2379 | mm->map_count--; |
2375 | tail_vma = vma; | 2380 | tail_vma = vma; |
2376 | vma = vma->vm_next; | 2381 | vma = vma->vm_next; |
2377 | } while (vma && vma->vm_start < end); | 2382 | } while (vma && vma->vm_start < end); |
2378 | *insertion_point = vma; | 2383 | *insertion_point = vma; |
2379 | if (vma) { | 2384 | if (vma) { |
2380 | vma->vm_prev = prev; | 2385 | vma->vm_prev = prev; |
2381 | vma_gap_update(vma); | 2386 | vma_gap_update(vma); |
2382 | } else | 2387 | } else |
2383 | mm->highest_vm_end = prev ? prev->vm_end : 0; | 2388 | mm->highest_vm_end = prev ? prev->vm_end : 0; |
2384 | tail_vma->vm_next = NULL; | 2389 | tail_vma->vm_next = NULL; |
2385 | if (mm->unmap_area == arch_unmap_area) | 2390 | if (mm->unmap_area == arch_unmap_area) |
2386 | addr = prev ? prev->vm_end : mm->mmap_base; | 2391 | addr = prev ? prev->vm_end : mm->mmap_base; |
2387 | else | 2392 | else |
2388 | addr = vma ? vma->vm_start : mm->mmap_base; | 2393 | addr = vma ? vma->vm_start : mm->mmap_base; |
2389 | mm->unmap_area(mm, addr); | 2394 | mm->unmap_area(mm, addr); |
2390 | mm->mmap_cache = NULL; /* Kill the cache. */ | 2395 | mm->mmap_cache = NULL; /* Kill the cache. */ |
2391 | } | 2396 | } |
2392 | 2397 | ||
2393 | /* | 2398 | /* |
2394 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the | 2399 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the |
2395 | * munmap path where it doesn't make sense to fail. | 2400 | * munmap path where it doesn't make sense to fail. |
2396 | */ | 2401 | */ |
2397 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 2402 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
2398 | unsigned long addr, int new_below) | 2403 | unsigned long addr, int new_below) |
2399 | { | 2404 | { |
2400 | struct mempolicy *pol; | 2405 | struct mempolicy *pol; |
2401 | struct vm_area_struct *new; | 2406 | struct vm_area_struct *new; |
2402 | int err = -ENOMEM; | 2407 | int err = -ENOMEM; |
2403 | 2408 | ||
2404 | if (is_vm_hugetlb_page(vma) && (addr & | 2409 | if (is_vm_hugetlb_page(vma) && (addr & |
2405 | ~(huge_page_mask(hstate_vma(vma))))) | 2410 | ~(huge_page_mask(hstate_vma(vma))))) |
2406 | return -EINVAL; | 2411 | return -EINVAL; |
2407 | 2412 | ||
2408 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2413 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2409 | if (!new) | 2414 | if (!new) |
2410 | goto out_err; | 2415 | goto out_err; |
2411 | 2416 | ||
2412 | /* most fields are the same, copy all, and then fixup */ | 2417 | /* most fields are the same, copy all, and then fixup */ |
2413 | *new = *vma; | 2418 | *new = *vma; |
2414 | 2419 | ||
2415 | INIT_LIST_HEAD(&new->anon_vma_chain); | 2420 | INIT_LIST_HEAD(&new->anon_vma_chain); |
2416 | 2421 | ||
2417 | if (new_below) | 2422 | if (new_below) |
2418 | new->vm_end = addr; | 2423 | new->vm_end = addr; |
2419 | else { | 2424 | else { |
2420 | new->vm_start = addr; | 2425 | new->vm_start = addr; |
2421 | new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); | 2426 | new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); |
2422 | } | 2427 | } |
2423 | 2428 | ||
2424 | pol = mpol_dup(vma_policy(vma)); | 2429 | pol = mpol_dup(vma_policy(vma)); |
2425 | if (IS_ERR(pol)) { | 2430 | if (IS_ERR(pol)) { |
2426 | err = PTR_ERR(pol); | 2431 | err = PTR_ERR(pol); |
2427 | goto out_free_vma; | 2432 | goto out_free_vma; |
2428 | } | 2433 | } |
2429 | vma_set_policy(new, pol); | 2434 | vma_set_policy(new, pol); |
2430 | 2435 | ||
2431 | if (anon_vma_clone(new, vma)) | 2436 | if (anon_vma_clone(new, vma)) |
2432 | goto out_free_mpol; | 2437 | goto out_free_mpol; |
2433 | 2438 | ||
2434 | if (new->vm_file) | 2439 | if (new->vm_file) |
2435 | get_file(new->vm_file); | 2440 | get_file(new->vm_file); |
2436 | 2441 | ||
2437 | if (new->vm_ops && new->vm_ops->open) | 2442 | if (new->vm_ops && new->vm_ops->open) |
2438 | new->vm_ops->open(new); | 2443 | new->vm_ops->open(new); |
2439 | 2444 | ||
2440 | if (new_below) | 2445 | if (new_below) |
2441 | err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + | 2446 | err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + |
2442 | ((addr - new->vm_start) >> PAGE_SHIFT), new); | 2447 | ((addr - new->vm_start) >> PAGE_SHIFT), new); |
2443 | else | 2448 | else |
2444 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | 2449 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); |
2445 | 2450 | ||
2446 | /* Success. */ | 2451 | /* Success. */ |
2447 | if (!err) | 2452 | if (!err) |
2448 | return 0; | 2453 | return 0; |
2449 | 2454 | ||
2450 | /* Clean everything up if vma_adjust failed. */ | 2455 | /* Clean everything up if vma_adjust failed. */ |
2451 | if (new->vm_ops && new->vm_ops->close) | 2456 | if (new->vm_ops && new->vm_ops->close) |
2452 | new->vm_ops->close(new); | 2457 | new->vm_ops->close(new); |
2453 | if (new->vm_file) | 2458 | if (new->vm_file) |
2454 | fput(new->vm_file); | 2459 | fput(new->vm_file); |
2455 | unlink_anon_vmas(new); | 2460 | unlink_anon_vmas(new); |
2456 | out_free_mpol: | 2461 | out_free_mpol: |
2457 | mpol_put(pol); | 2462 | mpol_put(pol); |
2458 | out_free_vma: | 2463 | out_free_vma: |
2459 | kmem_cache_free(vm_area_cachep, new); | 2464 | kmem_cache_free(vm_area_cachep, new); |
2460 | out_err: | 2465 | out_err: |
2461 | return err; | 2466 | return err; |
2462 | } | 2467 | } |
2463 | 2468 | ||
2464 | /* | 2469 | /* |
2465 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 2470 | * Split a vma into two pieces at address 'addr', a new vma is allocated |
2466 | * either for the first part or the tail. | 2471 | * either for the first part or the tail. |
2467 | */ | 2472 | */ |
2468 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | 2473 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, |
2469 | unsigned long addr, int new_below) | 2474 | unsigned long addr, int new_below) |
2470 | { | 2475 | { |
2471 | if (mm->map_count >= sysctl_max_map_count) | 2476 | if (mm->map_count >= sysctl_max_map_count) |
2472 | return -ENOMEM; | 2477 | return -ENOMEM; |
2473 | 2478 | ||
2474 | return __split_vma(mm, vma, addr, new_below); | 2479 | return __split_vma(mm, vma, addr, new_below); |
2475 | } | 2480 | } |
2476 | 2481 | ||
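Splitting is easy to observe from userspace: munmap()ing the middle of a larger mapping sends do_munmap() through __split_vma() and leaves two separate VMAs in /proc/<pid>/maps. A minimal sketch, assuming Linux and an anonymous mapping (not kernel code):

/* Standalone sketch: punch a hole in a mapping and inspect the split. */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Punch a one-page hole in the middle: [p + page, p + 2*page). */
	if (munmap(p + page, page) != 0) {
		perror("munmap");
		return 1;
	}

	printf("mapping at %p split around hole at %p..%p\n",
	       (void *)p, (void *)(p + page), (void *)(p + 2 * page));
	printf("inspect /proc/%d/maps from another terminal, then press enter\n",
	       (int)getpid());
	getchar();	/* pause so the two resulting VMAs can be seen */
	return 0;
}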
2477 | /* Munmap is split into 2 main parts -- this part which finds | 2482 | /* Munmap is split into 2 main parts -- this part which finds |
2478 | * what needs doing, and the areas themselves, which do the | 2483 | * what needs doing, and the areas themselves, which do the |
2479 | * work. This now handles partial unmappings. | 2484 | * work. This now handles partial unmappings. |
2480 | * Jeremy Fitzhardinge <jeremy@goop.org> | 2485 | * Jeremy Fitzhardinge <jeremy@goop.org> |
2481 | */ | 2486 | */ |
2482 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | 2487 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) |
2483 | { | 2488 | { |
2484 | unsigned long end; | 2489 | unsigned long end; |
2485 | struct vm_area_struct *vma, *prev, *last; | 2490 | struct vm_area_struct *vma, *prev, *last; |
2486 | 2491 | ||
2487 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) | 2492 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) |
2488 | return -EINVAL; | 2493 | return -EINVAL; |
2489 | 2494 | ||
2490 | if ((len = PAGE_ALIGN(len)) == 0) | 2495 | if ((len = PAGE_ALIGN(len)) == 0) |
2491 | return -EINVAL; | 2496 | return -EINVAL; |
2492 | 2497 | ||
2493 | /* Find the first overlapping VMA */ | 2498 | /* Find the first overlapping VMA */ |
2494 | vma = find_vma(mm, start); | 2499 | vma = find_vma(mm, start); |
2495 | if (!vma) | 2500 | if (!vma) |
2496 | return 0; | 2501 | return 0; |
2497 | prev = vma->vm_prev; | 2502 | prev = vma->vm_prev; |
2498 | /* we have start < vma->vm_end */ | 2503 | /* we have start < vma->vm_end */ |
2499 | 2504 | ||
2500 | /* if it doesn't overlap, we have nothing.. */ | 2505 | /* if it doesn't overlap, we have nothing.. */ |
2501 | end = start + len; | 2506 | end = start + len; |
2502 | if (vma->vm_start >= end) | 2507 | if (vma->vm_start >= end) |
2503 | return 0; | 2508 | return 0; |
2504 | 2509 | ||
2505 | /* | 2510 | /* |
2506 | * If we need to split any vma, do it now to save pain later. | 2511 | * If we need to split any vma, do it now to save pain later. |
2507 | * | 2512 | * |
2508 | * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially | 2513 | * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially |
2509 | * unmapped vm_area_struct will remain in use: so lower split_vma | 2514 | * unmapped vm_area_struct will remain in use: so lower split_vma |
2510 | * places tmp vma above, and higher split_vma places tmp vma below. | 2515 | * places tmp vma above, and higher split_vma places tmp vma below. |
2511 | */ | 2516 | */ |
2512 | if (start > vma->vm_start) { | 2517 | if (start > vma->vm_start) { |
2513 | int error; | 2518 | int error; |
2514 | 2519 | ||
2515 | /* | 2520 | /* |
2516 | * Make sure that map_count on return from munmap() will | 2521 | * Make sure that map_count on return from munmap() will |
2517 | * not exceed its limit; but let map_count go just above | 2522 | * not exceed its limit; but let map_count go just above |
2518 | * its limit temporarily, to help free resources as expected. | 2523 | * its limit temporarily, to help free resources as expected. |
2519 | */ | 2524 | */ |
2520 | if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) | 2525 | if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) |
2521 | return -ENOMEM; | 2526 | return -ENOMEM; |
2522 | 2527 | ||
2523 | error = __split_vma(mm, vma, start, 0); | 2528 | error = __split_vma(mm, vma, start, 0); |
2524 | if (error) | 2529 | if (error) |
2525 | return error; | 2530 | return error; |
2526 | prev = vma; | 2531 | prev = vma; |
2527 | } | 2532 | } |
2528 | 2533 | ||
2529 | /* Does it split the last one? */ | 2534 | /* Does it split the last one? */ |
2530 | last = find_vma(mm, end); | 2535 | last = find_vma(mm, end); |
2531 | if (last && end > last->vm_start) { | 2536 | if (last && end > last->vm_start) { |
2532 | int error = __split_vma(mm, last, end, 1); | 2537 | int error = __split_vma(mm, last, end, 1); |
2533 | if (error) | 2538 | if (error) |
2534 | return error; | 2539 | return error; |
2535 | } | 2540 | } |
2536 | vma = prev? prev->vm_next: mm->mmap; | 2541 | vma = prev? prev->vm_next: mm->mmap; |
2537 | 2542 | ||
2538 | /* | 2543 | /* |
2539 | * unlock any mlock()ed ranges before detaching vmas | 2544 | * unlock any mlock()ed ranges before detaching vmas |
2540 | */ | 2545 | */ |
2541 | if (mm->locked_vm) { | 2546 | if (mm->locked_vm) { |
2542 | struct vm_area_struct *tmp = vma; | 2547 | struct vm_area_struct *tmp = vma; |
2543 | while (tmp && tmp->vm_start < end) { | 2548 | while (tmp && tmp->vm_start < end) { |
2544 | if (tmp->vm_flags & VM_LOCKED) { | 2549 | if (tmp->vm_flags & VM_LOCKED) { |
2545 | mm->locked_vm -= vma_pages(tmp); | 2550 | mm->locked_vm -= vma_pages(tmp); |
2546 | munlock_vma_pages_all(tmp); | 2551 | munlock_vma_pages_all(tmp); |
2547 | } | 2552 | } |
2548 | tmp = tmp->vm_next; | 2553 | tmp = tmp->vm_next; |
2549 | } | 2554 | } |
2550 | } | 2555 | } |
2551 | 2556 | ||
2552 | /* | 2557 | /* |
2553 | * Remove the vmas, and unmap the actual pages | 2558 | * Remove the vmas, and unmap the actual pages |
2554 | */ | 2559 | */ |
2555 | detach_vmas_to_be_unmapped(mm, vma, prev, end); | 2560 | detach_vmas_to_be_unmapped(mm, vma, prev, end); |
2556 | unmap_region(mm, vma, prev, start, end); | 2561 | unmap_region(mm, vma, prev, start, end); |
2557 | 2562 | ||
2558 | /* Fix up all other VM information */ | 2563 | /* Fix up all other VM information */ |
2559 | remove_vma_list(mm, vma); | 2564 | remove_vma_list(mm, vma); |
2560 | 2565 | ||
2561 | return 0; | 2566 | return 0; |
2562 | } | 2567 | } |
2563 | 2568 | ||
2564 | int vm_munmap(unsigned long start, size_t len) | 2569 | int vm_munmap(unsigned long start, size_t len) |
2565 | { | 2570 | { |
2566 | int ret; | 2571 | int ret; |
2567 | struct mm_struct *mm = current->mm; | 2572 | struct mm_struct *mm = current->mm; |
2568 | 2573 | ||
2569 | down_write(&mm->mmap_sem); | 2574 | down_write(&mm->mmap_sem); |
2570 | ret = do_munmap(mm, start, len); | 2575 | ret = do_munmap(mm, start, len); |
2571 | up_write(&mm->mmap_sem); | 2576 | up_write(&mm->mmap_sem); |
2572 | return ret; | 2577 | return ret; |
2573 | } | 2578 | } |
2574 | EXPORT_SYMBOL(vm_munmap); | 2579 | EXPORT_SYMBOL(vm_munmap); |
2575 | 2580 | ||
2576 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | 2581 | SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) |
2577 | { | 2582 | { |
2578 | profile_munmap(addr); | 2583 | profile_munmap(addr); |
2579 | return vm_munmap(addr, len); | 2584 | return vm_munmap(addr, len); |
2580 | } | 2585 | } |
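For reference, a minimal user-space sketch (not part of this diff) that exercises the partial-unmap path above: unmapping the middle page of a three-page anonymous mapping makes do_munmap() call __split_vma(), so the surviving head and tail show up as two separate entries in /proc/self/maps.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Punch a hole in the middle page; the kernel splits the vma. */
        if (munmap(p + psz, psz)) {
                perror("munmap");
                return 1;
        }
        printf("head %p and tail %p are now separate mappings\n",
               (void *)p, (void *)(p + 2 * psz));
        pause();        /* keep the process alive so /proc/self/maps can be inspected */
        return 0;
}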
2581 | 2586 | ||
2582 | static inline void verify_mm_writelocked(struct mm_struct *mm) | 2587 | static inline void verify_mm_writelocked(struct mm_struct *mm) |
2583 | { | 2588 | { |
2584 | #ifdef CONFIG_DEBUG_VM | 2589 | #ifdef CONFIG_DEBUG_VM |
2585 | if (unlikely(down_read_trylock(&mm->mmap_sem))) { | 2590 | if (unlikely(down_read_trylock(&mm->mmap_sem))) { |
2586 | WARN_ON(1); | 2591 | WARN_ON(1); |
2587 | up_read(&mm->mmap_sem); | 2592 | up_read(&mm->mmap_sem); |
2588 | } | 2593 | } |
2589 | #endif | 2594 | #endif |
2590 | } | 2595 | } |
2591 | 2596 | ||
2592 | /* | 2597 | /* |
2593 | * this is really a simplified "do_mmap". it only handles | 2598 | * this is really a simplified "do_mmap". it only handles |
2594 | * anonymous maps. eventually we may be able to do some | 2599 | * anonymous maps. eventually we may be able to do some |
2595 | * brk-specific accounting here. | 2600 | * brk-specific accounting here. |
2596 | */ | 2601 | */ |
2597 | static unsigned long do_brk(unsigned long addr, unsigned long len) | 2602 | static unsigned long do_brk(unsigned long addr, unsigned long len) |
2598 | { | 2603 | { |
2599 | struct mm_struct * mm = current->mm; | 2604 | struct mm_struct * mm = current->mm; |
2600 | struct vm_area_struct * vma, * prev; | 2605 | struct vm_area_struct * vma, * prev; |
2601 | unsigned long flags; | 2606 | unsigned long flags; |
2602 | struct rb_node ** rb_link, * rb_parent; | 2607 | struct rb_node ** rb_link, * rb_parent; |
2603 | pgoff_t pgoff = addr >> PAGE_SHIFT; | 2608 | pgoff_t pgoff = addr >> PAGE_SHIFT; |
2604 | int error; | 2609 | int error; |
2605 | 2610 | ||
2606 | len = PAGE_ALIGN(len); | 2611 | len = PAGE_ALIGN(len); |
2607 | if (!len) | 2612 | if (!len) |
2608 | return addr; | 2613 | return addr; |
2609 | 2614 | ||
2610 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2615 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
2611 | 2616 | ||
2612 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); | 2617 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
2613 | if (error & ~PAGE_MASK) | 2618 | if (error & ~PAGE_MASK) |
2614 | return error; | 2619 | return error; |
2615 | 2620 | ||
2616 | /* | 2621 | /* |
2617 | * mlock MCL_FUTURE? | 2622 | * mlock MCL_FUTURE? |
2618 | */ | 2623 | */ |
2619 | if (mm->def_flags & VM_LOCKED) { | 2624 | if (mm->def_flags & VM_LOCKED) { |
2620 | unsigned long locked, lock_limit; | 2625 | unsigned long locked, lock_limit; |
2621 | locked = len >> PAGE_SHIFT; | 2626 | locked = len >> PAGE_SHIFT; |
2622 | locked += mm->locked_vm; | 2627 | locked += mm->locked_vm; |
2623 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 2628 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
2624 | lock_limit >>= PAGE_SHIFT; | 2629 | lock_limit >>= PAGE_SHIFT; |
2625 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 2630 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
2626 | return -EAGAIN; | 2631 | return -EAGAIN; |
2627 | } | 2632 | } |
2628 | 2633 | ||
2629 | /* | 2634 | /* |
2630 | * mm->mmap_sem is required to protect against another thread | 2635 | * mm->mmap_sem is required to protect against another thread |
2631 | * changing the mappings in case we sleep. | 2636 | * changing the mappings in case we sleep. |
2632 | */ | 2637 | */ |
2633 | verify_mm_writelocked(mm); | 2638 | verify_mm_writelocked(mm); |
2634 | 2639 | ||
2635 | /* | 2640 | /* |
2636 | * Clear old maps. this also does some error checking for us | 2641 | * Clear old maps. this also does some error checking for us |
2637 | */ | 2642 | */ |
2638 | munmap_back: | 2643 | munmap_back: |
2639 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 2644 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
2640 | if (do_munmap(mm, addr, len)) | 2645 | if (do_munmap(mm, addr, len)) |
2641 | return -ENOMEM; | 2646 | return -ENOMEM; |
2642 | goto munmap_back; | 2647 | goto munmap_back; |
2643 | } | 2648 | } |
2644 | 2649 | ||
2645 | /* Check against address space limits *after* clearing old maps... */ | 2650 | /* Check against address space limits *after* clearing old maps... */ |
2646 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) | 2651 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
2647 | return -ENOMEM; | 2652 | return -ENOMEM; |
2648 | 2653 | ||
2649 | if (mm->map_count > sysctl_max_map_count) | 2654 | if (mm->map_count > sysctl_max_map_count) |
2650 | return -ENOMEM; | 2655 | return -ENOMEM; |
2651 | 2656 | ||
2652 | if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) | 2657 | if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) |
2653 | return -ENOMEM; | 2658 | return -ENOMEM; |
2654 | 2659 | ||
2655 | /* Can we just expand an old private anonymous mapping? */ | 2660 | /* Can we just expand an old private anonymous mapping? */ |
2656 | vma = vma_merge(mm, prev, addr, addr + len, flags, | 2661 | vma = vma_merge(mm, prev, addr, addr + len, flags, |
2657 | NULL, NULL, pgoff, NULL); | 2662 | NULL, NULL, pgoff, NULL); |
2658 | if (vma) | 2663 | if (vma) |
2659 | goto out; | 2664 | goto out; |
2660 | 2665 | ||
2661 | /* | 2666 | /* |
2662 | * create a vma struct for an anonymous mapping | 2667 | * create a vma struct for an anonymous mapping |
2663 | */ | 2668 | */ |
2664 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2669 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
2665 | if (!vma) { | 2670 | if (!vma) { |
2666 | vm_unacct_memory(len >> PAGE_SHIFT); | 2671 | vm_unacct_memory(len >> PAGE_SHIFT); |
2667 | return -ENOMEM; | 2672 | return -ENOMEM; |
2668 | } | 2673 | } |
2669 | 2674 | ||
2670 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 2675 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
2671 | vma->vm_mm = mm; | 2676 | vma->vm_mm = mm; |
2672 | vma->vm_start = addr; | 2677 | vma->vm_start = addr; |
2673 | vma->vm_end = addr + len; | 2678 | vma->vm_end = addr + len; |
2674 | vma->vm_pgoff = pgoff; | 2679 | vma->vm_pgoff = pgoff; |
2675 | vma->vm_flags = flags; | 2680 | vma->vm_flags = flags; |
2676 | vma->vm_page_prot = vm_get_page_prot(flags); | 2681 | vma->vm_page_prot = vm_get_page_prot(flags); |
2677 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2682 | vma_link(mm, vma, prev, rb_link, rb_parent); |
2678 | out: | 2683 | out: |
2679 | perf_event_mmap(vma); | 2684 | perf_event_mmap(vma); |
2680 | mm->total_vm += len >> PAGE_SHIFT; | 2685 | mm->total_vm += len >> PAGE_SHIFT; |
2681 | if (flags & VM_LOCKED) | 2686 | if (flags & VM_LOCKED) |
2682 | mm->locked_vm += (len >> PAGE_SHIFT); | 2687 | mm->locked_vm += (len >> PAGE_SHIFT); |
2683 | return addr; | 2688 | return addr; |
2684 | } | 2689 | } |
2685 | 2690 | ||
2686 | unsigned long vm_brk(unsigned long addr, unsigned long len) | 2691 | unsigned long vm_brk(unsigned long addr, unsigned long len) |
2687 | { | 2692 | { |
2688 | struct mm_struct *mm = current->mm; | 2693 | struct mm_struct *mm = current->mm; |
2689 | unsigned long ret; | 2694 | unsigned long ret; |
2690 | bool populate; | 2695 | bool populate; |
2691 | 2696 | ||
2692 | down_write(&mm->mmap_sem); | 2697 | down_write(&mm->mmap_sem); |
2693 | ret = do_brk(addr, len); | 2698 | ret = do_brk(addr, len); |
2694 | populate = ((mm->def_flags & VM_LOCKED) != 0); | 2699 | populate = ((mm->def_flags & VM_LOCKED) != 0); |
2695 | up_write(&mm->mmap_sem); | 2700 | up_write(&mm->mmap_sem); |
2696 | if (populate) | 2701 | if (populate) |
2697 | mm_populate(addr, len); | 2702 | mm_populate(addr, len); |
2698 | return ret; | 2703 | return ret; |
2699 | } | 2704 | } |
2700 | EXPORT_SYMBOL(vm_brk); | 2705 | EXPORT_SYMBOL(vm_brk); |
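do_brk() above also backs the brk(2) system call, while vm_brk() is the in-kernel entry point used by binary loaders to set up regions such as bss. A small user-space sketch (not part of this diff) that grows the program break and therefore runs through this path:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        void *before = sbrk(0);                 /* current program break */
        if (sbrk(1 << 20) == (void *)-1) {      /* ask brk(2) for 1 MB more */
                perror("sbrk");
                return 1;
        }
        printf("program break moved from %p to %p\n", before, sbrk(0));
        return 0;
}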
2701 | 2706 | ||
2702 | /* Release all mmaps. */ | 2707 | /* Release all mmaps. */ |
2703 | void exit_mmap(struct mm_struct *mm) | 2708 | void exit_mmap(struct mm_struct *mm) |
2704 | { | 2709 | { |
2705 | struct mmu_gather tlb; | 2710 | struct mmu_gather tlb; |
2706 | struct vm_area_struct *vma; | 2711 | struct vm_area_struct *vma; |
2707 | unsigned long nr_accounted = 0; | 2712 | unsigned long nr_accounted = 0; |
2708 | 2713 | ||
2709 | /* mm's last user has gone, and it's about to be pulled down */ | 2714 | /* mm's last user has gone, and it's about to be pulled down */ |
2710 | mmu_notifier_release(mm); | 2715 | mmu_notifier_release(mm); |
2711 | 2716 | ||
2712 | if (mm->locked_vm) { | 2717 | if (mm->locked_vm) { |
2713 | vma = mm->mmap; | 2718 | vma = mm->mmap; |
2714 | while (vma) { | 2719 | while (vma) { |
2715 | if (vma->vm_flags & VM_LOCKED) | 2720 | if (vma->vm_flags & VM_LOCKED) |
2716 | munlock_vma_pages_all(vma); | 2721 | munlock_vma_pages_all(vma); |
2717 | vma = vma->vm_next; | 2722 | vma = vma->vm_next; |
2718 | } | 2723 | } |
2719 | } | 2724 | } |
2720 | 2725 | ||
2721 | arch_exit_mmap(mm); | 2726 | arch_exit_mmap(mm); |
2722 | 2727 | ||
2723 | vma = mm->mmap; | 2728 | vma = mm->mmap; |
2724 | if (!vma) /* Can happen if dup_mmap() received an OOM */ | 2729 | if (!vma) /* Can happen if dup_mmap() received an OOM */ |
2725 | return; | 2730 | return; |
2726 | 2731 | ||
2727 | lru_add_drain(); | 2732 | lru_add_drain(); |
2728 | flush_cache_mm(mm); | 2733 | flush_cache_mm(mm); |
2729 | tlb_gather_mmu(&tlb, mm, 1); | 2734 | tlb_gather_mmu(&tlb, mm, 1); |
2730 | /* update_hiwater_rss(mm) here? but nobody should be looking */ | 2735 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2731 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2736 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2732 | unmap_vmas(&tlb, vma, 0, -1); | 2737 | unmap_vmas(&tlb, vma, 0, -1); |
2733 | 2738 | ||
2734 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); | 2739 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); |
2735 | tlb_finish_mmu(&tlb, 0, -1); | 2740 | tlb_finish_mmu(&tlb, 0, -1); |
2736 | 2741 | ||
2737 | /* | 2742 | /* |
2738 | * Walk the list again, actually closing and freeing it, | 2743 | * Walk the list again, actually closing and freeing it, |
2739 | * with preemption enabled, without holding any MM locks. | 2744 | * with preemption enabled, without holding any MM locks. |
2740 | */ | 2745 | */ |
2741 | while (vma) { | 2746 | while (vma) { |
2742 | if (vma->vm_flags & VM_ACCOUNT) | 2747 | if (vma->vm_flags & VM_ACCOUNT) |
2743 | nr_accounted += vma_pages(vma); | 2748 | nr_accounted += vma_pages(vma); |
2744 | vma = remove_vma(vma); | 2749 | vma = remove_vma(vma); |
2745 | } | 2750 | } |
2746 | vm_unacct_memory(nr_accounted); | 2751 | vm_unacct_memory(nr_accounted); |
2747 | 2752 | ||
2748 | WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | 2753 | WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); |
2749 | } | 2754 | } |
2750 | 2755 | ||
2751 | /* Insert vm structure into process list sorted by address | 2756 | /* Insert vm structure into process list sorted by address |
2752 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2757 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2753 | * then i_mmap_mutex is taken here. | 2758 | * then i_mmap_mutex is taken here. |
2754 | */ | 2759 | */ |
2755 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 2760 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2756 | { | 2761 | { |
2757 | struct vm_area_struct *prev; | 2762 | struct vm_area_struct *prev; |
2758 | struct rb_node **rb_link, *rb_parent; | 2763 | struct rb_node **rb_link, *rb_parent; |
2759 | 2764 | ||
2760 | /* | 2765 | /* |
2761 | * The vm_pgoff of a purely anonymous vma should be irrelevant | 2766 | * The vm_pgoff of a purely anonymous vma should be irrelevant |
2762 | * until its first write fault, when page's anon_vma and index | 2767 | * until its first write fault, when page's anon_vma and index |
2763 | * are set. But now set the vm_pgoff it will almost certainly | 2768 | * are set. But now set the vm_pgoff it will almost certainly |
2764 | * end up with (unless mremap moves it elsewhere before that | 2769 | * end up with (unless mremap moves it elsewhere before that |
2765 | * first wfault), so /proc/pid/maps tells a consistent story. | 2770 | * first wfault), so /proc/pid/maps tells a consistent story. |
2766 | * | 2771 | * |
2767 | * By setting it to reflect the virtual start address of the | 2772 | * By setting it to reflect the virtual start address of the |
2768 | * vma, merges and splits can happen in a seamless way, just | 2773 | * vma, merges and splits can happen in a seamless way, just |
2769 | * using the existing file pgoff checks and manipulations. | 2774 | * using the existing file pgoff checks and manipulations. |
2770 | * Similarly in do_mmap_pgoff and in do_brk. | 2775 | * Similarly in do_mmap_pgoff and in do_brk. |
2771 | */ | 2776 | */ |
2772 | if (!vma->vm_file) { | 2777 | if (!vma->vm_file) { |
2773 | BUG_ON(vma->anon_vma); | 2778 | BUG_ON(vma->anon_vma); |
2774 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | 2779 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; |
2775 | } | 2780 | } |
2776 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, | 2781 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
2777 | &prev, &rb_link, &rb_parent)) | 2782 | &prev, &rb_link, &rb_parent)) |
2778 | return -ENOMEM; | 2783 | return -ENOMEM; |
2779 | if ((vma->vm_flags & VM_ACCOUNT) && | 2784 | if ((vma->vm_flags & VM_ACCOUNT) && |
2780 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2785 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
2781 | return -ENOMEM; | 2786 | return -ENOMEM; |
2782 | 2787 | ||
2783 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2788 | vma_link(mm, vma, prev, rb_link, rb_parent); |
2784 | return 0; | 2789 | return 0; |
2785 | } | 2790 | } |
2786 | 2791 | ||
2787 | /* | 2792 | /* |
2788 | * Copy the vma structure to a new location in the same mm, | 2793 | * Copy the vma structure to a new location in the same mm, |
2789 | * prior to moving page table entries, to effect an mremap move. | 2794 | * prior to moving page table entries, to effect an mremap move. |
2790 | */ | 2795 | */ |
2791 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | 2796 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, |
2792 | unsigned long addr, unsigned long len, pgoff_t pgoff, | 2797 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
2793 | bool *need_rmap_locks) | 2798 | bool *need_rmap_locks) |
2794 | { | 2799 | { |
2795 | struct vm_area_struct *vma = *vmap; | 2800 | struct vm_area_struct *vma = *vmap; |
2796 | unsigned long vma_start = vma->vm_start; | 2801 | unsigned long vma_start = vma->vm_start; |
2797 | struct mm_struct *mm = vma->vm_mm; | 2802 | struct mm_struct *mm = vma->vm_mm; |
2798 | struct vm_area_struct *new_vma, *prev; | 2803 | struct vm_area_struct *new_vma, *prev; |
2799 | struct rb_node **rb_link, *rb_parent; | 2804 | struct rb_node **rb_link, *rb_parent; |
2800 | struct mempolicy *pol; | 2805 | struct mempolicy *pol; |
2801 | bool faulted_in_anon_vma = true; | 2806 | bool faulted_in_anon_vma = true; |
2802 | 2807 | ||
2803 | /* | 2808 | /* |
2804 | * If anonymous vma has not yet been faulted, update new pgoff | 2809 | * If anonymous vma has not yet been faulted, update new pgoff |
2805 | * to match new location, to increase its chance of merging. | 2810 | * to match new location, to increase its chance of merging. |
2806 | */ | 2811 | */ |
2807 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { | 2812 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { |
2808 | pgoff = addr >> PAGE_SHIFT; | 2813 | pgoff = addr >> PAGE_SHIFT; |
2809 | faulted_in_anon_vma = false; | 2814 | faulted_in_anon_vma = false; |
2810 | } | 2815 | } |
2811 | 2816 | ||
2812 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) | 2817 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
2813 | return NULL; /* should never get here */ | 2818 | return NULL; /* should never get here */ |
2814 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2819 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2815 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 2820 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); |
2816 | if (new_vma) { | 2821 | if (new_vma) { |
2817 | /* | 2822 | /* |
2818 | * Source vma may have been merged into new_vma | 2823 | * Source vma may have been merged into new_vma |
2819 | */ | 2824 | */ |
2820 | if (unlikely(vma_start >= new_vma->vm_start && | 2825 | if (unlikely(vma_start >= new_vma->vm_start && |
2821 | vma_start < new_vma->vm_end)) { | 2826 | vma_start < new_vma->vm_end)) { |
2822 | /* | 2827 | /* |
2823 | * The only way we can get a vma_merge with | 2828 | * The only way we can get a vma_merge with |
2824 | * self during an mremap is if the vma hasn't | 2829 | * self during an mremap is if the vma hasn't |
2825 | * been faulted in yet and we were allowed to | 2830 | * been faulted in yet and we were allowed to |
2826 | * reset the dst vma->vm_pgoff to the | 2831 | * reset the dst vma->vm_pgoff to the |
2827 | * destination address of the mremap to allow | 2832 | * destination address of the mremap to allow |
2828 | * the merge to happen. mremap must change the | 2833 | * the merge to happen. mremap must change the |
2829 | * vm_pgoff linearity between src and dst vmas | 2834 | * vm_pgoff linearity between src and dst vmas |
2830 | * (in turn preventing a vma_merge) to be | 2835 | * (in turn preventing a vma_merge) to be |
2831 | * safe. It is only safe to keep the vm_pgoff | 2836 | * safe. It is only safe to keep the vm_pgoff |
2832 | * linear if there are no pages mapped yet. | 2837 | * linear if there are no pages mapped yet. |
2833 | */ | 2838 | */ |
2834 | VM_BUG_ON(faulted_in_anon_vma); | 2839 | VM_BUG_ON(faulted_in_anon_vma); |
2835 | *vmap = vma = new_vma; | 2840 | *vmap = vma = new_vma; |
2836 | } | 2841 | } |
2837 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); | 2842 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
2838 | } else { | 2843 | } else { |
2839 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2844 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2840 | if (new_vma) { | 2845 | if (new_vma) { |
2841 | *new_vma = *vma; | 2846 | *new_vma = *vma; |
2842 | new_vma->vm_start = addr; | 2847 | new_vma->vm_start = addr; |
2843 | new_vma->vm_end = addr + len; | 2848 | new_vma->vm_end = addr + len; |
2844 | new_vma->vm_pgoff = pgoff; | 2849 | new_vma->vm_pgoff = pgoff; |
2845 | pol = mpol_dup(vma_policy(vma)); | 2850 | pol = mpol_dup(vma_policy(vma)); |
2846 | if (IS_ERR(pol)) | 2851 | if (IS_ERR(pol)) |
2847 | goto out_free_vma; | 2852 | goto out_free_vma; |
2848 | vma_set_policy(new_vma, pol); | 2853 | vma_set_policy(new_vma, pol); |
2849 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2854 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2850 | if (anon_vma_clone(new_vma, vma)) | 2855 | if (anon_vma_clone(new_vma, vma)) |
2851 | goto out_free_mempol; | 2856 | goto out_free_mempol; |
2852 | if (new_vma->vm_file) | 2857 | if (new_vma->vm_file) |
2853 | get_file(new_vma->vm_file); | 2858 | get_file(new_vma->vm_file); |
2854 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2859 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2855 | new_vma->vm_ops->open(new_vma); | 2860 | new_vma->vm_ops->open(new_vma); |
2856 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2861 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
2857 | *need_rmap_locks = false; | 2862 | *need_rmap_locks = false; |
2858 | } | 2863 | } |
2859 | } | 2864 | } |
2860 | return new_vma; | 2865 | return new_vma; |
2861 | 2866 | ||
2862 | out_free_mempol: | 2867 | out_free_mempol: |
2863 | mpol_put(pol); | 2868 | mpol_put(pol); |
2864 | out_free_vma: | 2869 | out_free_vma: |
2865 | kmem_cache_free(vm_area_cachep, new_vma); | 2870 | kmem_cache_free(vm_area_cachep, new_vma); |
2866 | return NULL; | 2871 | return NULL; |
2867 | } | 2872 | } |
2868 | 2873 | ||
2869 | /* | 2874 | /* |
2870 | * Return true if the calling process may expand its vm space by the passed | 2875 | * Return true if the calling process may expand its vm space by the passed |
2871 | * number of pages | 2876 | * number of pages |
2872 | */ | 2877 | */ |
2873 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) | 2878 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) |
2874 | { | 2879 | { |
2875 | unsigned long cur = mm->total_vm; /* pages */ | 2880 | unsigned long cur = mm->total_vm; /* pages */ |
2876 | unsigned long lim; | 2881 | unsigned long lim; |
2877 | 2882 | ||
2878 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; | 2883 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; |
2879 | 2884 | ||
2880 | if (cur + npages > lim) | 2885 | if (cur + npages > lim) |
2881 | return 0; | 2886 | return 0; |
2882 | return 1; | 2887 | return 1; |
2883 | } | 2888 | } |
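may_expand_vm() is what turns an RLIMIT_AS overrun into -ENOMEM further up the call chain. A hedged user-space sketch (not part of this diff) that trips the check by shrinking the address-space limit and then asking for more than it allows:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl = { 64UL << 20, 64UL << 20 };  /* 64 MB address-space cap */

        if (setrlimit(RLIMIT_AS, &rl)) {
                perror("setrlimit");
                return 1;
        }
        void *p = mmap(NULL, 256UL << 20, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                printf("mmap failed as expected: %s\n", strerror(errno));
        else
                printf("mmap unexpectedly succeeded\n");
        return 0;
}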
2884 | 2889 | ||
2885 | 2890 | ||
2886 | static int special_mapping_fault(struct vm_area_struct *vma, | 2891 | static int special_mapping_fault(struct vm_area_struct *vma, |
2887 | struct vm_fault *vmf) | 2892 | struct vm_fault *vmf) |
2888 | { | 2893 | { |
2889 | pgoff_t pgoff; | 2894 | pgoff_t pgoff; |
2890 | struct page **pages; | 2895 | struct page **pages; |
2891 | 2896 | ||
2892 | /* | 2897 | /* |
2893 | * special mappings have no vm_file, and in that case, the mm | 2898 | * special mappings have no vm_file, and in that case, the mm |
2894 | * uses vm_pgoff internally. So we have to subtract it from here. | 2899 | * uses vm_pgoff internally. So we have to subtract it from here. |
2895 | * We are allowed to do this because we are the mm; do not copy | 2900 | * We are allowed to do this because we are the mm; do not copy |
2896 | * this code into drivers! | 2901 | * this code into drivers! |
2897 | */ | 2902 | */ |
2898 | pgoff = vmf->pgoff - vma->vm_pgoff; | 2903 | pgoff = vmf->pgoff - vma->vm_pgoff; |
2899 | 2904 | ||
2900 | for (pages = vma->vm_private_data; pgoff && *pages; ++pages) | 2905 | for (pages = vma->vm_private_data; pgoff && *pages; ++pages) |
2901 | pgoff--; | 2906 | pgoff--; |
2902 | 2907 | ||
2903 | if (*pages) { | 2908 | if (*pages) { |
2904 | struct page *page = *pages; | 2909 | struct page *page = *pages; |
2905 | get_page(page); | 2910 | get_page(page); |
2906 | vmf->page = page; | 2911 | vmf->page = page; |
2907 | return 0; | 2912 | return 0; |
2908 | } | 2913 | } |
2909 | 2914 | ||
2910 | return VM_FAULT_SIGBUS; | 2915 | return VM_FAULT_SIGBUS; |
2911 | } | 2916 | } |
2912 | 2917 | ||
2913 | /* | 2918 | /* |
2914 | * Having a close hook prevents vma merging regardless of flags. | 2919 | * Having a close hook prevents vma merging regardless of flags. |
2915 | */ | 2920 | */ |
2916 | static void special_mapping_close(struct vm_area_struct *vma) | 2921 | static void special_mapping_close(struct vm_area_struct *vma) |
2917 | { | 2922 | { |
2918 | } | 2923 | } |
2919 | 2924 | ||
2920 | static const struct vm_operations_struct special_mapping_vmops = { | 2925 | static const struct vm_operations_struct special_mapping_vmops = { |
2921 | .close = special_mapping_close, | 2926 | .close = special_mapping_close, |
2922 | .fault = special_mapping_fault, | 2927 | .fault = special_mapping_fault, |
2923 | }; | 2928 | }; |
2924 | 2929 | ||
2925 | /* | 2930 | /* |
2926 | * Called with mm->mmap_sem held for writing. | 2931 | * Called with mm->mmap_sem held for writing. |
2927 | * Insert a new vma covering the given region, with the given flags. | 2932 | * Insert a new vma covering the given region, with the given flags. |
2928 | * Its pages are supplied by the given array of struct page *. | 2933 | * Its pages are supplied by the given array of struct page *. |
2929 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. | 2934 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. |
2930 | * The region past the last page supplied will always produce SIGBUS. | 2935 | * The region past the last page supplied will always produce SIGBUS. |
2931 | * The array pointer and the pages it points to are assumed to stay alive | 2936 | * The array pointer and the pages it points to are assumed to stay alive |
2932 | * for as long as this mapping might exist. | 2937 | * for as long as this mapping might exist. |
2933 | */ | 2938 | */ |
2934 | int install_special_mapping(struct mm_struct *mm, | 2939 | int install_special_mapping(struct mm_struct *mm, |
2935 | unsigned long addr, unsigned long len, | 2940 | unsigned long addr, unsigned long len, |
2936 | unsigned long vm_flags, struct page **pages) | 2941 | unsigned long vm_flags, struct page **pages) |
2937 | { | 2942 | { |
2938 | int ret; | 2943 | int ret; |
2939 | struct vm_area_struct *vma; | 2944 | struct vm_area_struct *vma; |
2940 | 2945 | ||
2941 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2946 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
2942 | if (unlikely(vma == NULL)) | 2947 | if (unlikely(vma == NULL)) |
2943 | return -ENOMEM; | 2948 | return -ENOMEM; |
2944 | 2949 | ||
2945 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 2950 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
2946 | vma->vm_mm = mm; | 2951 | vma->vm_mm = mm; |
2947 | vma->vm_start = addr; | 2952 | vma->vm_start = addr; |
2948 | vma->vm_end = addr + len; | 2953 | vma->vm_end = addr + len; |
2949 | 2954 | ||
2950 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; | 2955 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; |
2951 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 2956 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
2952 | 2957 | ||
2953 | vma->vm_ops = &special_mapping_vmops; | 2958 | vma->vm_ops = &special_mapping_vmops; |
2954 | vma->vm_private_data = pages; | 2959 | vma->vm_private_data = pages; |
2955 | 2960 | ||
2956 | ret = insert_vm_struct(mm, vma); | 2961 | ret = insert_vm_struct(mm, vma); |
2957 | if (ret) | 2962 | if (ret) |
2958 | goto out; | 2963 | goto out; |
2959 | 2964 | ||
2960 | mm->total_vm += len >> PAGE_SHIFT; | 2965 | mm->total_vm += len >> PAGE_SHIFT; |
2961 | 2966 | ||
2962 | perf_event_mmap(vma); | 2967 | perf_event_mmap(vma); |
2963 | 2968 | ||
2964 | return 0; | 2969 | return 0; |
2965 | 2970 | ||
2966 | out: | 2971 | out: |
2967 | kmem_cache_free(vm_area_cachep, vma); | 2972 | kmem_cache_free(vm_area_cachep, vma); |
2968 | return ret; | 2973 | return ret; |
2969 | } | 2974 | } |
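install_special_mapping() is typically called by architecture code to map objects such as the vDSO. The sketch below is a hedged illustration only (the helper name, page array, and flags are made up for this example and are not part of this diff); as the comment above notes, the caller must hold mm->mmap_sem for writing and keep the page array alive for the lifetime of the mapping.

#include <linux/gfp.h>
#include <linux/mm.h>

/* One real page plus a NULL terminator, as special_mapping_fault() expects. */
static struct page *example_pages[2];

static int map_one_special_page(struct mm_struct *mm, unsigned long addr)
{
        if (!example_pages[0]) {
                example_pages[0] = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!example_pages[0])
                        return -ENOMEM;
        }
        /* Read-only mapping backed by example_pages[0]; VM_DONTEXPAND is
         * added by install_special_mapping() itself. */
        return install_special_mapping(mm, addr, PAGE_SIZE,
                                       VM_READ | VM_MAYREAD, example_pages);
}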
2970 | 2975 | ||
2971 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2976 | static DEFINE_MUTEX(mm_all_locks_mutex); |
2972 | 2977 | ||
2973 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | 2978 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) |
2974 | { | 2979 | { |
2975 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { | 2980 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2976 | /* | 2981 | /* |
2977 | * The LSB of head.next can't change from under us | 2982 | * The LSB of head.next can't change from under us |
2978 | * because we hold the mm_all_locks_mutex. | 2983 | * because we hold the mm_all_locks_mutex. |
2979 | */ | 2984 | */ |
2980 | down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); | 2985 | down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); |
2981 | /* | 2986 | /* |
2982 | * We can safely modify head.next after taking the | 2987 | * We can safely modify head.next after taking the |
2983 | * anon_vma->root->rwsem. If some other vma in this mm shares | 2988 | * anon_vma->root->rwsem. If some other vma in this mm shares |
2984 | * the same anon_vma we won't take it again. | 2989 | * the same anon_vma we won't take it again. |
2985 | * | 2990 | * |
2986 | * No need of atomic instructions here, head.next | 2991 | * No need of atomic instructions here, head.next |
2987 | * can't change from under us thanks to the | 2992 | * can't change from under us thanks to the |
2988 | * anon_vma->root->rwsem. | 2993 | * anon_vma->root->rwsem. |
2989 | */ | 2994 | */ |
2990 | if (__test_and_set_bit(0, (unsigned long *) | 2995 | if (__test_and_set_bit(0, (unsigned long *) |
2991 | &anon_vma->root->rb_root.rb_node)) | 2996 | &anon_vma->root->rb_root.rb_node)) |
2992 | BUG(); | 2997 | BUG(); |
2993 | } | 2998 | } |
2994 | } | 2999 | } |
2995 | 3000 | ||
2996 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | 3001 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) |
2997 | { | 3002 | { |
2998 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | 3003 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { |
2999 | /* | 3004 | /* |
3000 | * AS_MM_ALL_LOCKS can't change from under us because | 3005 | * AS_MM_ALL_LOCKS can't change from under us because |
3001 | * we hold the mm_all_locks_mutex. | 3006 | * we hold the mm_all_locks_mutex. |
3002 | * | 3007 | * |
3003 | * Operations on ->flags have to be atomic because | 3008 | * Operations on ->flags have to be atomic because |
3004 | * even if AS_MM_ALL_LOCKS is stable thanks to the | 3009 | * even if AS_MM_ALL_LOCKS is stable thanks to the |
3005 | * mm_all_locks_mutex, there may be other cpus | 3010 | * mm_all_locks_mutex, there may be other cpus |
3006 | * changing other bitflags in parallel to us. | 3011 | * changing other bitflags in parallel to us. |
3007 | */ | 3012 | */ |
3008 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | 3013 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
3009 | BUG(); | 3014 | BUG(); |
3010 | mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); | 3015 | mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); |
3011 | } | 3016 | } |
3012 | } | 3017 | } |
3013 | 3018 | ||
3014 | /* | 3019 | /* |
3015 | * This operation locks against the VM for all pte/vma/mm related | 3020 | * This operation locks against the VM for all pte/vma/mm related |
3016 | * operations that could ever happen on a certain mm. This includes | 3021 | * operations that could ever happen on a certain mm. This includes |
3017 | * vmtruncate, try_to_unmap, and all page faults. | 3022 | * vmtruncate, try_to_unmap, and all page faults. |
3018 | * | 3023 | * |
3019 | * The caller must take the mmap_sem in write mode before calling | 3024 | * The caller must take the mmap_sem in write mode before calling |
3020 | * mm_take_all_locks(). The caller isn't allowed to release the | 3025 | * mm_take_all_locks(). The caller isn't allowed to release the |
3021 | * mmap_sem until mm_drop_all_locks() returns. | 3026 | * mmap_sem until mm_drop_all_locks() returns. |
3022 | * | 3027 | * |
3023 | * mmap_sem in write mode is required in order to block all operations | 3028 | * mmap_sem in write mode is required in order to block all operations |
3024 | * that could modify pagetables and free pages without need of | 3029 | * that could modify pagetables and free pages without need of |
3025 | * altering the vma layout (for example populate_range() with | 3030 | * altering the vma layout (for example populate_range() with |
3026 | * nonlinear vmas). It's also needed in write mode to avoid new | 3031 | * nonlinear vmas). It's also needed in write mode to avoid new |
3027 | * anon_vmas to be associated with existing vmas. | 3032 | * anon_vmas to be associated with existing vmas. |
3028 | * | 3033 | * |
3029 | * A single task can't take more than one mm_take_all_locks() in a row | 3034 | * A single task can't take more than one mm_take_all_locks() in a row |
3030 | * or it would deadlock. | 3035 | * or it would deadlock. |
3031 | * | 3036 | * |
3032 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in | 3037 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in |
3033 | * mapping->flags avoid taking the same lock twice, if more than one | 3038 | * mapping->flags avoid taking the same lock twice, if more than one |
3034 | * vma in this mm is backed by the same anon_vma or address_space. | 3039 | * vma in this mm is backed by the same anon_vma or address_space. |
3035 | * | 3040 | * |
3036 | * We can take all the locks in random order because the VM code | 3041 | * We can take all the locks in random order because the VM code |
3037 | * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never | 3042 | * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never |
3038 | * takes more than one of them in a row. Secondly we're protected | 3043 | * takes more than one of them in a row. Secondly we're protected |
3039 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 3044 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. |
3040 | * | 3045 | * |
3041 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations | 3046 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations |
3042 | * that may have to take thousands of locks. | 3047 | * that may have to take thousands of locks. |
3043 | * | 3048 | * |
3044 | * mm_take_all_locks() can fail if it's interrupted by signals. | 3049 | * mm_take_all_locks() can fail if it's interrupted by signals. |
3045 | */ | 3050 | */ |
3046 | int mm_take_all_locks(struct mm_struct *mm) | 3051 | int mm_take_all_locks(struct mm_struct *mm) |
3047 | { | 3052 | { |
3048 | struct vm_area_struct *vma; | 3053 | struct vm_area_struct *vma; |
3049 | struct anon_vma_chain *avc; | 3054 | struct anon_vma_chain *avc; |
3050 | 3055 | ||
3051 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 3056 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
3052 | 3057 | ||
3053 | mutex_lock(&mm_all_locks_mutex); | 3058 | mutex_lock(&mm_all_locks_mutex); |
3054 | 3059 | ||
3055 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 3060 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
3056 | if (signal_pending(current)) | 3061 | if (signal_pending(current)) |
3057 | goto out_unlock; | 3062 | goto out_unlock; |
3058 | if (vma->vm_file && vma->vm_file->f_mapping) | 3063 | if (vma->vm_file && vma->vm_file->f_mapping) |
3059 | vm_lock_mapping(mm, vma->vm_file->f_mapping); | 3064 | vm_lock_mapping(mm, vma->vm_file->f_mapping); |
3060 | } | 3065 | } |
3061 | 3066 | ||
3062 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 3067 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
3063 | if (signal_pending(current)) | 3068 | if (signal_pending(current)) |
3064 | goto out_unlock; | 3069 | goto out_unlock; |
3065 | if (vma->anon_vma) | 3070 | if (vma->anon_vma) |
3066 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 3071 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
3067 | vm_lock_anon_vma(mm, avc->anon_vma); | 3072 | vm_lock_anon_vma(mm, avc->anon_vma); |
3068 | } | 3073 | } |
3069 | 3074 | ||
3070 | return 0; | 3075 | return 0; |
3071 | 3076 | ||
3072 | out_unlock: | 3077 | out_unlock: |
3073 | mm_drop_all_locks(mm); | 3078 | mm_drop_all_locks(mm); |
3074 | return -EINTR; | 3079 | return -EINTR; |
3075 | } | 3080 | } |
3076 | 3081 | ||
3077 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 3082 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
3078 | { | 3083 | { |
3079 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { | 3084 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
3080 | /* | 3085 | /* |
3081 | * The LSB of head.next can't change to 0 from under | 3086 | * The LSB of head.next can't change to 0 from under |
3082 | * us because we hold the mm_all_locks_mutex. | 3087 | * us because we hold the mm_all_locks_mutex. |
3083 | * | 3088 | * |
3084 | * We must however clear the bitflag before unlocking | 3089 | * We must however clear the bitflag before unlocking |
3085 | * the vma so the users using the anon_vma->rb_root will | 3090 | * the vma so the users using the anon_vma->rb_root will |
3086 | * never see our bitflag. | 3091 | * never see our bitflag. |
3087 | * | 3092 | * |
3088 | * No need of atomic instructions here, head.next | 3093 | * No need of atomic instructions here, head.next |
3089 | * can't change from under us until we release the | 3094 | * can't change from under us until we release the |
3090 | * anon_vma->root->rwsem. | 3095 | * anon_vma->root->rwsem. |
3091 | */ | 3096 | */ |
3092 | if (!__test_and_clear_bit(0, (unsigned long *) | 3097 | if (!__test_and_clear_bit(0, (unsigned long *) |
3093 | &anon_vma->root->rb_root.rb_node)) | 3098 | &anon_vma->root->rb_root.rb_node)) |
3094 | BUG(); | 3099 | BUG(); |
3095 | anon_vma_unlock_write(anon_vma); | 3100 | anon_vma_unlock_write(anon_vma); |
3096 | } | 3101 | } |
3097 | } | 3102 | } |
3098 | 3103 | ||
3099 | static void vm_unlock_mapping(struct address_space *mapping) | 3104 | static void vm_unlock_mapping(struct address_space *mapping) |
3100 | { | 3105 | { |
3101 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | 3106 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { |
3102 | /* | 3107 | /* |
3103 | * AS_MM_ALL_LOCKS can't change to 0 from under us | 3108 | * AS_MM_ALL_LOCKS can't change to 0 from under us |
3104 | * because we hold the mm_all_locks_mutex. | 3109 | * because we hold the mm_all_locks_mutex. |
3105 | */ | 3110 | */ |
3106 | mutex_unlock(&mapping->i_mmap_mutex); | 3111 | mutex_unlock(&mapping->i_mmap_mutex); |
3107 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | 3112 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, |
3108 | &mapping->flags)) | 3113 | &mapping->flags)) |
3109 | BUG(); | 3114 | BUG(); |
3110 | } | 3115 | } |
3111 | } | 3116 | } |
3112 | 3117 | ||
3113 | /* | 3118 | /* |
3114 | * The mmap_sem cannot be released by the caller until | 3119 | * The mmap_sem cannot be released by the caller until |
3115 | * mm_drop_all_locks() returns. | 3120 | * mm_drop_all_locks() returns. |
3116 | */ | 3121 | */ |
3117 | void mm_drop_all_locks(struct mm_struct *mm) | 3122 | void mm_drop_all_locks(struct mm_struct *mm) |
3118 | { | 3123 | { |
3119 | struct vm_area_struct *vma; | 3124 | struct vm_area_struct *vma; |
3120 | struct anon_vma_chain *avc; | 3125 | struct anon_vma_chain *avc; |
3121 | 3126 | ||
3122 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 3127 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
3123 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 3128 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
3124 | 3129 | ||
3125 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 3130 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
3126 | if (vma->anon_vma) | 3131 | if (vma->anon_vma) |
3127 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 3132 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
3128 | vm_unlock_anon_vma(avc->anon_vma); | 3133 | vm_unlock_anon_vma(avc->anon_vma); |
3129 | if (vma->vm_file && vma->vm_file->f_mapping) | 3134 | if (vma->vm_file && vma->vm_file->f_mapping) |
3130 | vm_unlock_mapping(vma->vm_file->f_mapping); | 3135 | vm_unlock_mapping(vma->vm_file->f_mapping); |
3131 | } | 3136 | } |
3132 | 3137 | ||
3133 | mutex_unlock(&mm_all_locks_mutex); | 3138 | mutex_unlock(&mm_all_locks_mutex); |
3134 | } | 3139 | } |
3135 | 3140 | ||
3136 | /* | 3141 | /* |
3137 | * initialise the VMA slab | 3142 | * initialise the VMA slab |
3138 | */ | 3143 | */ |
3139 | void __init mmap_init(void) | 3144 | void __init mmap_init(void) |
3140 | { | 3145 | { |
3141 | int ret; | 3146 | int ret; |
3142 | 3147 | ||
3143 | ret = percpu_counter_init(&vm_committed_as, 0); | 3148 | ret = percpu_counter_init(&vm_committed_as, 0); |
3144 | VM_BUG_ON(ret); | 3149 | VM_BUG_ON(ret); |
3145 | } | 3150 | } |
3146 | 3151 | ||
3147 | /* | 3152 | /* |
3148 | * Initialise sysctl_user_reserve_kbytes. | 3153 | * Initialise sysctl_user_reserve_kbytes. |
3149 | * | 3154 | * |
3150 | * This is intended to prevent a user from starting a single memory hogging | 3155 | * This is intended to prevent a user from starting a single memory hogging |
3151 | * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER | 3156 | * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER |
3152 | * mode. | 3157 | * mode. |
3153 | * | 3158 | * |
3154 | * The default value is min(3% of free memory, 128MB) | 3159 | * The default value is min(3% of free memory, 128MB) |
3155 | * 128MB is enough to recover with sshd/login, bash, and top/kill. | 3160 | * 128MB is enough to recover with sshd/login, bash, and top/kill. |
3156 | */ | 3161 | */ |
3157 | static int init_user_reserve(void) | 3162 | static int init_user_reserve(void) |
3158 | { | 3163 | { |
3159 | unsigned long free_kbytes; | 3164 | unsigned long free_kbytes; |
3160 | 3165 | ||
3161 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3166 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3162 | 3167 | ||
3163 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | 3168 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); |
3164 | return 0; | 3169 | return 0; |
3165 | } | 3170 | } |
3166 | module_init(init_user_reserve) | 3171 | module_init(init_user_reserve) |
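To make the clamp above concrete: free_kbytes / 32 is 3.125% of free memory, and 1UL << 17 kB is 131072 kB (128 MB). On a machine with 8 GB free (8388608 kB), 8388608 / 32 = 262144 kB, so the 128 MB cap wins and vm.user_reserve_kbytes defaults to 131072; with 2 GB free the result is 65536 kB (64 MB), which is below the cap and used as-is. (Worked numbers added for illustration; they are not part of the commit.)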
3167 | 3172 | ||
3168 | /* | 3173 | /* |
3169 | * Initialise sysctl_admin_reserve_kbytes. | 3174 | * Initialise sysctl_admin_reserve_kbytes. |
3170 | * | 3175 | * |
3171 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin | 3176 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin |
3172 | * to log in and kill a memory hogging process. | 3177 | * to log in and kill a memory hogging process. |
3173 | * | 3178 | * |
3174 | * Systems with more than 256MB will reserve 8MB, enough to recover | 3179 | * Systems with more than 256MB will reserve 8MB, enough to recover |
3175 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will | 3180 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will |
3176 | * only reserve 3% of free pages by default. | 3181 | * only reserve 3% of free pages by default. |
3177 | */ | 3182 | */ |
3178 | static int init_admin_reserve(void) | 3183 | static int init_admin_reserve(void) |
3179 | { | 3184 | { |
3180 | unsigned long free_kbytes; | 3185 | unsigned long free_kbytes; |
3181 | 3186 | ||
3182 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3187 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3183 | 3188 | ||
3184 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | 3189 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); |
3185 | return 0; | 3190 | return 0; |
3186 | } | 3191 | } |
3187 | module_init(init_admin_reserve) | 3192 | module_init(init_admin_reserve) |
3188 | 3193 | ||
3189 | /* | 3194 | /* |
3190 | * Reinitialise user and admin reserves if memory is added or removed. | 3195 | * Reinitialise user and admin reserves if memory is added or removed. |
3191 | * | 3196 | * |
3192 | * The default user reserve max is 128MB, and the default max for the | 3197 | * The default user reserve max is 128MB, and the default max for the |
3193 | * admin reserve is 8MB. These are usually, but not always, enough to | 3198 | * admin reserve is 8MB. These are usually, but not always, enough to |
3194 | * enable recovery from a memory hogging process using login/sshd, a shell, | 3199 | * enable recovery from a memory hogging process using login/sshd, a shell, |
3195 | * and tools like top. It may make sense to increase or even disable the | 3200 | * and tools like top. It may make sense to increase or even disable the |
3196 | * reserve depending on the existence of swap or variations in the recovery | 3201 | * reserve depending on the existence of swap or variations in the recovery |
3197 | * tools. So, the admin may have changed them. | 3202 | * tools. So, the admin may have changed them. |
3198 | * | 3203 | * |
3199 | * If memory is added and the reserves have been eliminated or increased above | 3204 | * If memory is added and the reserves have been eliminated or increased above |
3200 | * the default max, then we'll trust the admin. | 3205 | * the default max, then we'll trust the admin. |
3201 | * | 3206 | * |
3202 | * If memory is removed and there isn't enough free memory, then we | 3207 | * If memory is removed and there isn't enough free memory, then we |
3203 | * need to reset the reserves. | 3208 | * need to reset the reserves. |
3204 | * | 3209 | * |
3205 | * Otherwise keep the reserve set by the admin. | 3210 | * Otherwise keep the reserve set by the admin. |
3206 | */ | 3211 | */ |
3207 | static int reserve_mem_notifier(struct notifier_block *nb, | 3212 | static int reserve_mem_notifier(struct notifier_block *nb, |
3208 | unsigned long action, void *data) | 3213 | unsigned long action, void *data) |
3209 | { | 3214 | { |
3210 | unsigned long tmp, free_kbytes; | 3215 | unsigned long tmp, free_kbytes; |
3211 | 3216 | ||
3212 | switch (action) { | 3217 | switch (action) { |
3213 | case MEM_ONLINE: | 3218 | case MEM_ONLINE: |
3214 | /* Default max is 128MB. Leave alone if modified by operator. */ | 3219 | /* Default max is 128MB. Leave alone if modified by operator. */ |
3215 | tmp = sysctl_user_reserve_kbytes; | 3220 | tmp = sysctl_user_reserve_kbytes; |
3216 | if (0 < tmp && tmp < (1UL << 17)) | 3221 | if (0 < tmp && tmp < (1UL << 17)) |
3217 | init_user_reserve(); | 3222 | init_user_reserve(); |
3218 | 3223 | ||
3219 | /* Default max is 8MB. Leave alone if modified by operator. */ | 3224 | /* Default max is 8MB. Leave alone if modified by operator. */ |
3220 | tmp = sysctl_admin_reserve_kbytes; | 3225 | tmp = sysctl_admin_reserve_kbytes; |
3221 | if (0 < tmp && tmp < (1UL << 13)) | 3226 | if (0 < tmp && tmp < (1UL << 13)) |
3222 | init_admin_reserve(); | 3227 | init_admin_reserve(); |
3223 | 3228 | ||
3224 | break; | 3229 | break; |
3225 | case MEM_OFFLINE: | 3230 | case MEM_OFFLINE: |
3226 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | 3231 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); |
3227 | 3232 | ||
3228 | if (sysctl_user_reserve_kbytes > free_kbytes) { | 3233 | if (sysctl_user_reserve_kbytes > free_kbytes) { |
3229 | init_user_reserve(); | 3234 | init_user_reserve(); |
3230 | pr_info("vm.user_reserve_kbytes reset to %lu\n", | 3235 | pr_info("vm.user_reserve_kbytes reset to %lu\n", |
3231 | sysctl_user_reserve_kbytes); | 3236 | sysctl_user_reserve_kbytes); |
3232 | } | 3237 | } |
3233 | 3238 | ||
3234 | if (sysctl_admin_reserve_kbytes > free_kbytes) { | 3239 | if (sysctl_admin_reserve_kbytes > free_kbytes) { |
3235 | init_admin_reserve(); | 3240 | init_admin_reserve(); |
3236 | pr_info("vm.admin_reserve_kbytes reset to %lu\n", | 3241 | pr_info("vm.admin_reserve_kbytes reset to %lu\n", |
3237 | sysctl_admin_reserve_kbytes); | 3242 | sysctl_admin_reserve_kbytes); |
3238 | } | 3243 | } |
3239 | break; | 3244 | break; |
3240 | default: | 3245 | default: |
3241 | break; | 3246 | break; |
3242 | } | 3247 | } |
3243 | return NOTIFY_OK; | 3248 | return NOTIFY_OK; |
3244 | } | 3249 | } |
3245 | 3250 | ||
3246 | static struct notifier_block reserve_mem_nb = { | 3251 | static struct notifier_block reserve_mem_nb = { |
3247 | .notifier_call = reserve_mem_notifier, | 3252 | .notifier_call = reserve_mem_notifier, |
3248 | }; | 3253 | }; |
3249 | 3254 | ||
3250 | static int __meminit init_reserve_notifier(void) | 3255 | static int __meminit init_reserve_notifier(void) |
3251 | { | 3256 | { |
3252 | if (register_hotmemory_notifier(&reserve_mem_nb)) | 3257 | if (register_hotmemory_notifier(&reserve_mem_nb)) |
3253 | printk("Failed registering memory add/remove notifier for admin reserve"); | 3258 | printk("Failed registering memory add/remove notifier for admin reserve"); |
3254 | 3259 | ||
3255 | return 0; | 3260 | return 0; |
3256 | } | 3261 | } |
3257 | module_init(init_reserve_notifier) | 3262 | module_init(init_reserve_notifier) |
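Both reserves are exposed as sysctls, so their current values (including any reset logged by the notifier above) can be read back from /proc/sys/vm. A minimal user-space sketch, not part of this diff:

#include <stdio.h>

static long read_kbytes(const char *path)
{
        long val = -1;
        FILE *f = fopen(path, "r");

        if (f) {
                if (fscanf(f, "%ld", &val) != 1)
                        val = -1;
                fclose(f);
        }
        return val;
}

int main(void)
{
        printf("vm.user_reserve_kbytes  = %ld\n",
               read_kbytes("/proc/sys/vm/user_reserve_kbytes"));
        printf("vm.admin_reserve_kbytes = %ld\n",
               read_kbytes("/proc/sys/vm/admin_reserve_kbytes"));
        return 0;
}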
3258 | 3263 |