Commit 5a6fe125950676015f5108fb71b2a67441755003

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent 4c098bcd55

Do not account for the address space used by hugetlbfs using VM_ACCOUNT

When overcommit is disabled, the core VM accounts for pages used by anonymous
shared, private mappings and special mappings. It keeps track of VMAs that
should be accounted for with VM_ACCOUNT and VMAs that never had a reserve
with VM_NORESERVE.

Overcommit for hugetlbfs is much riskier than overcommit for base pages
due to contiguity requirements. It avoids overcommitting on both shared and
private mappings using reservation counters that are checked and updated
during mmap(). This ensures (within limits) that hugepages exist in the
future when faults occur; otherwise it would be too easy for applications to be SIGKILLed.

As hugetlbfs makes its own reservations of a different unit to the base page
size, VM_ACCOUNT should never be set. Even if the units were correct, we would
double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may
be set because an application can request no reserves be made for hugetlbfs
at the risk of getting killed later.

With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and
VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This
breaks the accounting for both the core VM and hugetlbfs: it can trigger an
OOM storm when hugepage pools are too small, and lockups and corrupted
counters otherwise result. This patch brings hugetlbfs more in line with how the
core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 8 changed files with 65 additions and 43 deletions Side-by-side Diff

fs/hugetlbfs/inode.c
... ... @@ -108,7 +108,8 @@
108 108  
109 109 if (hugetlb_reserve_pages(inode,
110 110 vma->vm_pgoff >> huge_page_order(h),
111   - len >> huge_page_shift(h), vma))
  111 + len >> huge_page_shift(h), vma,
  112 + vma->vm_flags))
112 113 goto out;
113 114  
114 115 ret = 0;
... ... @@ -947,7 +948,7 @@
947 948 can_do_mlock());
948 949 }
949 950  
950   -struct file *hugetlb_file_setup(const char *name, size_t size)
  951 +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
951 952 {
952 953 int error = -ENOMEM;
953 954 struct file *file;
... ... @@ -981,7 +982,8 @@
981 982  
982 983 error = -ENOMEM;
983 984 if (hugetlb_reserve_pages(inode, 0,
984   - size >> huge_page_shift(hstate_inode(inode)), NULL))
  985 + size >> huge_page_shift(hstate_inode(inode)), NULL,
  986 + acctflag))
985 987 goto out_inode;
986 988  
987 989 d_instantiate(dentry, inode);
include/linux/hugetlb.h
... ... @@ -33,7 +33,8 @@
33 33 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
34 34 unsigned long address, int write_access);
35 35 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
36   - struct vm_area_struct *vma);
  36 + struct vm_area_struct *vma,
  37 + int acctflags);
37 38 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
38 39  
39 40 extern unsigned long hugepages_treat_as_movable;
... ... @@ -138,7 +139,7 @@
138 139  
139 140 extern const struct file_operations hugetlbfs_file_operations;
140 141 extern struct vm_operations_struct hugetlb_vm_ops;
141   -struct file *hugetlb_file_setup(const char *name, size_t);
  142 +struct file *hugetlb_file_setup(const char *name, size_t, int);
142 143 int hugetlb_get_quota(struct address_space *mapping, long delta);
143 144 void hugetlb_put_quota(struct address_space *mapping, long delta);
144 145  
... ... @@ -1129,8 +1129,7 @@
1129 1129 unsigned long flag, unsigned long pgoff);
1130 1130 extern unsigned long mmap_region(struct file *file, unsigned long addr,
1131 1131 unsigned long len, unsigned long flags,
1132   - unsigned int vm_flags, unsigned long pgoff,
1133   - int accountable);
  1132 + unsigned int vm_flags, unsigned long pgoff);
1134 1133  
1135 1134 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
1136 1135 unsigned long len, unsigned long prot,
... ... @@ -340,6 +340,7 @@
340 340 struct file * file;
341 341 char name[13];
342 342 int id;
  343 + int acctflag = 0;
343 344  
344 345 if (size < SHMMIN || size > ns->shm_ctlmax)
345 346 return -EINVAL;
346 347  
... ... @@ -364,11 +365,12 @@
364 365  
365 366 sprintf (name, "SYSV%08x", key);
366 367 if (shmflg & SHM_HUGETLB) {
367   - /* hugetlb_file_setup takes care of mlock user accounting */
368   - file = hugetlb_file_setup(name, size);
  368 + /* hugetlb_file_setup applies strict accounting */
  369 + if (shmflg & SHM_NORESERVE)
  370 + acctflag = VM_NORESERVE;
  371 + file = hugetlb_file_setup(name, size, acctflag);
369 372 shp->mlock_user = current_user();
370 373 } else {
371   - int acctflag = 0;
372 374 /*
373 375 * Do not allow no accounting for OVERCOMMIT_NEVER, even
374 376 * if it's asked for.
... ... @@ -198,7 +198,7 @@
198 198 flags &= MAP_NONBLOCK;
199 199 get_file(file);
200 200 addr = mmap_region(file, start, size,
201   - flags, vma->vm_flags, pgoff, 1);
  201 + flags, vma->vm_flags, pgoff);
202 202 fput(file);
203 203 if (IS_ERR_VALUE(addr)) {
204 204 err = addr;
... ... @@ -2269,14 +2269,12 @@
2269 2269  
2270 2270 int hugetlb_reserve_pages(struct inode *inode,
2271 2271 long from, long to,
2272   - struct vm_area_struct *vma)
  2272 + struct vm_area_struct *vma,
  2273 + int acctflag)
2273 2274 {
2274   - long ret, chg;
  2275 + long ret = 0, chg;
2275 2276 struct hstate *h = hstate_inode(inode);
2276 2277  
2277   - if (vma && vma->vm_flags & VM_NORESERVE)
2278   - return 0;
2279   -
2280 2278 /*
2281 2279 * Shared mappings base their reservation on the number of pages that
2282 2280 * are already allocated on behalf of the file. Private mappings need
2283 2281  
2284 2282  
... ... @@ -2285,22 +2283,25 @@
2285 2283 */
2286 2284 if (!vma || vma->vm_flags & VM_SHARED)
2287 2285 chg = region_chg(&inode->i_mapping->private_list, from, to);
2288   - else {
2289   - struct resv_map *resv_map = resv_map_alloc();
2290   - if (!resv_map)
2291   - return -ENOMEM;
2292   -
  2286 + else
2293 2287 chg = to - from;
2294 2288  
2295   - set_vma_resv_map(vma, resv_map);
2296   - set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2297   - }
2298   -
2299 2289 if (chg < 0)
2300 2290 return chg;
2301 2291  
2302 2292 if (hugetlb_get_quota(inode->i_mapping, chg))
2303 2293 return -ENOSPC;
  2294 +
  2295 + /*
  2296 + * Only apply hugepage reservation if asked. We still have to
  2297 + * take the filesystem quota because it is an upper limit
  2298 + * defined for the mount and not necessarily memory as a whole
  2299 + */
  2300 + if (acctflag & VM_NORESERVE) {
  2301 + reset_vma_resv_huge_pages(vma);
  2302 + return 0;
  2303 + }
  2304 +
2304 2305 ret = hugetlb_acct_memory(h, chg);
2305 2306 if (ret < 0) {
2306 2307 hugetlb_put_quota(inode->i_mapping, chg);
... ... @@ -2308,6 +2309,16 @@
2308 2309 }
2309 2310 if (!vma || vma->vm_flags & VM_SHARED)
2310 2311 region_add(&inode->i_mapping->private_list, from, to);
  2312 + else {
  2313 + struct resv_map *resv_map = resv_map_alloc();
  2314 +
  2315 + if (!resv_map)
  2316 + return -ENOMEM;
  2317 +
  2318 + set_vma_resv_map(vma, resv_map);
  2319 + set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
  2320 + }
  2321 +
2311 2322 return 0;
2312 2323 }
2313 2324  
... ... @@ -918,7 +918,6 @@
918 918 struct inode *inode;
919 919 unsigned int vm_flags;
920 920 int error;
921   - int accountable = 1;
922 921 unsigned long reqprot = prot;
923 922  
924 923 /*
... ... @@ -1019,8 +1018,6 @@
1019 1018 return -EPERM;
1020 1019 vm_flags &= ~VM_MAYEXEC;
1021 1020 }
1022   - if (is_file_hugepages(file))
1023   - accountable = 0;
1024 1021  
1025 1022 if (!file->f_op || !file->f_op->mmap)
1026 1023 return -ENODEV;
... ... @@ -1053,8 +1050,7 @@
1053 1050 if (error)
1054 1051 return error;
1055 1052  
1056   - return mmap_region(file, addr, len, flags, vm_flags, pgoff,
1057   - accountable);
  1053 + return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1058 1054 }
1059 1055 EXPORT_SYMBOL(do_mmap_pgoff);
1060 1056  
1061 1057  
1062 1058  
1063 1059  
... ... @@ -1092,17 +1088,23 @@
1092 1088  
1093 1089 /*
1094 1090 * We account for memory if it's a private writeable mapping,
1095   - * and VM_NORESERVE wasn't set.
  1091 + * not hugepages and VM_NORESERVE wasn't set.
1096 1092 */
1097   -static inline int accountable_mapping(unsigned int vm_flags)
  1093 +static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1098 1094 {
  1095 + /*
  1096 + * hugetlb has its own accounting separate from the core VM
  1097 + * VM_HUGETLB may not be set yet so we cannot check for that flag.
  1098 + */
  1099 + if (file && is_file_hugepages(file))
  1100 + return 0;
  1101 +
1099 1102 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1100 1103 }
1101 1104  
1102 1105 unsigned long mmap_region(struct file *file, unsigned long addr,
1103 1106 unsigned long len, unsigned long flags,
1104   - unsigned int vm_flags, unsigned long pgoff,
1105   - int accountable)
  1107 + unsigned int vm_flags, unsigned long pgoff)
1106 1108 {
1107 1109 struct mm_struct *mm = current->mm;
1108 1110 struct vm_area_struct *vma, *prev;
1109 1111  
1110 1112  
1111 1113  
... ... @@ -1128,18 +1130,22 @@
1128 1130  
1129 1131 /*
1130 1132 * Set 'VM_NORESERVE' if we should not account for the
1131   - * memory use of this mapping. We only honor MAP_NORESERVE
1132   - * if we're allowed to overcommit memory.
  1133 + * memory use of this mapping.
1133 1134 */
1134   - if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1135   - vm_flags |= VM_NORESERVE;
1136   - if (!accountable)
1137   - vm_flags |= VM_NORESERVE;
  1135 + if ((flags & MAP_NORESERVE)) {
  1136 + /* We honor MAP_NORESERVE if allowed to overcommit */
  1137 + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
  1138 + vm_flags |= VM_NORESERVE;
1138 1139  
  1140 + /* hugetlb applies strict overcommit unless MAP_NORESERVE */
  1141 + if (file && is_file_hugepages(file))
  1142 + vm_flags |= VM_NORESERVE;
  1143 + }
  1144 +
1139 1145 /*
1140 1146 * Private writable mapping: check memory availability
1141 1147 */
1142   - if (accountable_mapping(vm_flags)) {
  1148 + if (accountable_mapping(file, vm_flags)) {
1143 1149 charged = len >> PAGE_SHIFT;
1144 1150 if (security_vm_enough_memory(charged))
1145 1151 return -ENOMEM;
... ... @@ -151,10 +151,11 @@
151 151 /*
152 152 * If we make a private mapping writable we increase our commit;
153 153 * but (without finer accounting) cannot reduce our commit if we
154   - * make it unwritable again.
  154 + * make it unwritable again. hugetlb mapping were accounted for
  155 + * even if read-only so there is no need to account for them here
155 156 */
156 157 if (newflags & VM_WRITE) {
157   - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
  158 + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
158 159 VM_SHARED|VM_NORESERVE))) {
159 160 charged = nrpages;
160 161 if (security_vm_enough_memory(charged))