fs/hugetlbfs/inode.c
/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 * License: GPL
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <linux/uio.h>

#include <asm/uaccess.h>

static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

struct hugetlbfs_config {
	kuid_t   uid;
	kgid_t   gid;
	umode_t mode;
	long	max_hpages;
	long	nr_inodes;
	struct hstate *hstate;
	long    min_hpages;
};

struct hugetlbfs_inode_info {
	struct shared_policy policy;
	struct inode vfs_inode;
};

static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

int sysctl_hugetlb_shm_group;

enum {
	Opt_size, Opt_nr_inodes,
	Opt_mode, Opt_uid, Opt_gid,
	Opt_pagesize, Opt_min_size,
	Opt_err,
};

static const match_table_t tokens = {
	{Opt_size,	"size=%s"},
	{Opt_nr_inodes,	"nr_inodes=%s"},
	{Opt_mode,	"mode=%o"},
	{Opt_uid,	"uid=%u"},
	{Opt_gid,	"gid=%u"},
	{Opt_pagesize,	"pagesize=%s"},
	{Opt_min_size,	"min_size=%s"},
	{Opt_err,	NULL},
};

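/*
 * Example (editor's illustration, not part of the original source): the
 * tokens above correspond to a mount invocation such as
 *
 *	mount -t hugetlbfs -o size=1G,min_size=512M,pagesize=2M,mode=01777 \
 *		none /mnt/huge
 *
 * where size= and min_size= may also be given as a percentage of the huge
 * page pool, e.g. size=50% (see hugetlbfs_parse_options() below).
 */
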
#ifdef CONFIG_NUMA
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
					struct inode *inode, pgoff_t index)
{
	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
							index);
}

static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
	mpol_cond_put(vma->vm_policy);
}
#else
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
					struct inode *inode, pgoff_t index)
{
}

static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
}
#endif

static void huge_pagevec_release(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); ++i)
		put_page(pvec->pages[i]);

	pagevec_reinit(pvec);
}

static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	loff_t len, vma_len;
	int ret;
	struct hstate *h = hstate_file(file);

	/*
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap_pgoff unwinds (may be important on powerpc
	 * and ia64).
	 */
	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
	vma->vm_ops = &hugetlb_vm_ops;

	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
		return -EINVAL;
	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
	inode_lock(inode);
	file_accessed(file);

	ret = -ENOMEM;
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	if (hugetlb_reserve_pages(inode,
				vma->vm_pgoff >> huge_page_order(h),
				len >> huge_page_shift(h), vma,
				vma->vm_flags))
		goto out;

	ret = 0;
	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
		inode->i_size = len;
out:
	inode_unlock(inode);

	return ret;
}
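/*
 * Example (editor's illustration, not part of the original source): a
 * userspace mapping served by the ->mmap handler above, assuming a file
 * on a hugetlbfs mount with 2 MB huge pages:
 *
 *	int fd = open("/mnt/huge/buf", O_CREAT | O_RDWR, 0600);
 *	size_t len = 4 * 2097152;	// multiple of the huge page size
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * The vm_pgoff check above is why the file offset must also be huge page
 * aligned; a misaligned offset fails with -EINVAL.
 */
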
/*
 * Called under down_write(mmap_sem).
 */

#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}

	info.flags = 0;
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
}
#endif

static size_t
hugetlbfs_read_actor(struct page *page, unsigned long offset,
			struct iov_iter *to, unsigned long size)
{
	size_t copied = 0;
	int i, chunksize;

	/* Find which 4k chunk and offset within that chunk */
	i = offset >> PAGE_CACHE_SHIFT;
	offset = offset & ~PAGE_CACHE_MASK;

	while (size) {
		size_t n;
		chunksize = PAGE_CACHE_SIZE;
		if (offset)
			chunksize -= offset;
		if (chunksize > size)
			chunksize = size;
		n = copy_page_to_iter(&page[i], offset, chunksize, to);
		copied += n;
		if (n != chunksize)
			return copied;
		offset = 0;
		size -= chunksize;
		i++;
	}
	return copied;
}

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data.  It's *very* similar to do_generic_mapping_read(), but we can't use
 * that since it has PAGE_CACHE_SIZE assumptions.
 */
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

	while (iov_iter_count(to)) {
		struct page *page;
		size_t nr, copied;

		/* nr is the maximum number of bytes to copy from this page */
		nr = huge_page_size(h);
		isize = i_size_read(inode);
		if (!isize)
			break;
		end_index = (isize - 1) >> huge_page_shift(h);
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
			if (nr <= offset)
				break;
		}
		nr = nr - offset;

		/* Find the page */
		page = find_lock_page(mapping, index);
		if (unlikely(page == NULL)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
			copied = iov_iter_zero(nr, to);
		} else {
			unlock_page(page);
			/*
			 * We have the page, copy it to user space buffer.
			 */
			copied = hugetlbfs_read_actor(page, offset, to, nr);
			page_cache_release(page);
		}
		offset += copied;
		retval += copied;
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
		}
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
	}
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
	return retval;
}
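/*
 * Example (editor's illustration, not part of the original source): the
 * hole handling above means a read(2) of an unwritten region simply
 * yields zeroes instead of faulting in huge pages:
 *
 *	char buf[4096];
 *	ssize_t n = read(fd, buf, sizeof(buf));	// fd on hugetlbfs
 *	// n > 0 with buf all zeroes if [pos, pos + n) is a hole
 */
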
static int hugetlbfs_write_begin(struct file *file,
			struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return -EINVAL;
}

static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	BUG();
	return -EINVAL;
}

static void remove_huge_page(struct page *page)
{
	ClearPageDirty(page);
	ClearPageUptodate(page);
	delete_from_page_cache(page);
}

static void
hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
	struct vm_area_struct *vma;

	/*
	 * end == 0 indicates that the entire range after
	 * start should be unmapped.
	 */
	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
		unsigned long v_offset;
		unsigned long v_end;

		/*
		 * Can the expression below overflow on 32-bit arches?
		 * No, because the interval tree returns us only those vmas
		 * which overlap the truncated area starting at pgoff,
		 * and no vma on a 32-bit arch can span beyond the 4GB.
		 */
		if (vma->vm_pgoff < start)
			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
		else
			v_offset = 0;

		if (!end)
			v_end = vma->vm_end;
		else {
			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
							+ vma->vm_start;
			if (v_end > vma->vm_end)
				v_end = vma->vm_end;
		}

		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
									NULL);
	}
}
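/*
 * Worked example (editor's illustration, not part of the original source):
 * with 4 KB base pages, truncating at start = 512 a vma that maps file
 * pages [256, 768) at [vm_start, vm_start + 2M) gives
 *
 *	v_offset = (512 - 256) << PAGE_SHIFT = 1M
 *
 * so only the second half of the mapping, [vm_start + 1M, vm_end), is
 * unmapped.
 */
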
/*
 * remove_inode_hugepages handles two distinct cases: truncation and hole
 * punch.  There are subtle differences in operation for each case.
 *
 * truncation is indicated by end of range being LLONG_MAX
 *	In this case, we first scan the range and release found pages.
 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
 *	maps and global counts.  Page faults can not race with truncation
 *	in this routine.  hugetlb_no_page() prevents page faults in the
 *	truncated range.  It checks i_size before allocation, and again after
 *	with the page table lock for the page held.  The same lock must be
 *	acquired to unmap a page.
 * hole punch is indicated if end is not LLONG_MAX
 *	In the hole punch case we scan the range and release found pages.
 *	Only when releasing a page is the associated region/reserv map
 *	deleted.  The region/reserv map for ranges without associated
 *	pages are not modified.  Page faults can race with hole punch.
 *	This is indicated if we find a mapped page.
 * Note: If the passed end of range value is beyond the end of file, but
 * not LLONG_MAX this routine still performs a hole punch operation.
 */
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
				   loff_t lend)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t start = lstart >> huge_page_shift(h);
	const pgoff_t end = lend >> huge_page_shift(h);
	struct vm_area_struct pseudo_vma;
	struct pagevec pvec;
	pgoff_t next;
	int i, freed = 0;
	long lookup_nr = PAGEVEC_SIZE;
	bool truncate_op = (lend == LLONG_MAX);

	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pagevec_init(&pvec, 0);
	next = start;
	while (next < end) {
		/*
		 * Don't grab more pages than the number left in the range.
		 */
		if (end - next < lookup_nr)
			lookup_nr = end - next;

		/*
		 * When no more pages are found, we are done.
		 */
		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
			break;

		for (i = 0; i < pagevec_count(&pvec); ++i) {
			struct page *page = pvec.pages[i];
			bool rsv_on_error;
			u32 hash;

			/*
			 * The page (index) could be beyond end.  This is
			 * only possible in the punch hole case as end is
			 * max page offset in the truncate case.
			 */
			next = page->index;
			if (next >= end)
				break;

			hash = hugetlb_fault_mutex_hash(h, current->mm,
							&pseudo_vma,
							mapping, next, 0);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);

			/*
			 * If page is mapped, it was faulted in after being
			 * unmapped in caller.  Unmap (again) now after taking
			 * the fault mutex.  The mutex will prevent faults
			 * until we finish removing the page.
			 *
			 * This race can only happen in the hole punch case.
			 * Getting here in a truncate operation is a bug.
			 */
			if (unlikely(page_mapped(page))) {
				BUG_ON(truncate_op);

				i_mmap_lock_write(mapping);
				hugetlb_vmdelete_list(&mapping->i_mmap,
					next * pages_per_huge_page(h),
					(next + 1) * pages_per_huge_page(h));
				i_mmap_unlock_write(mapping);
			}

			lock_page(page);
			/*
			 * We must free the huge page and remove from page
			 * cache (remove_huge_page) BEFORE removing the
			 * region/reserve map (hugetlb_unreserve_pages).  In
			 * rare out of memory conditions, removal of the
			 * region/reserve map could fail.  Before freeing
			 * the page, note PagePrivate which is used in case
			 * of error.
			 */
			rsv_on_error = !PagePrivate(page);
			remove_huge_page(page);
			freed++;
			if (!truncate_op) {
				if (unlikely(hugetlb_unreserve_pages(inode,
							next, next + 1, 1)))
					hugetlb_fix_reserve_counts(inode,
								rsv_on_error);
			}

			unlock_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		}
		++next;
		huge_pagevec_release(&pvec);
		cond_resched();
	}

	if (truncate_op)
		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
	struct resv_map *resv_map;

	remove_inode_hugepages(inode, 0, LLONG_MAX);
	resv_map = (struct resv_map *)inode->i_mapping->private_data;
	/* root inode doesn't have the resv_map, so we should check it */
	if (resv_map)
		resv_map_release(&resv_map->refs);
	clear_inode(inode);
}

static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	pgoff_t pgoff;
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
	i_mmap_unlock_write(mapping);
	remove_inode_hugepages(inode, offset, LLONG_MAX);
	return 0;
}
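/*
 * Example (editor's illustration, not part of the original source):
 * hugetlbfs_setattr() below only accepts sizes aligned to the huge page
 * size, so with 2 MB pages:
 *
 *	ftruncate(fd, 4 * 2097152);	// OK, releases pages at/after 8M
 *	ftruncate(fd, 1048576);		// -EINVAL, not huge page aligned
 */
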
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hstate *h = hstate_inode(inode);
	loff_t hpage_size = huge_page_size(h);
	loff_t hole_start, hole_end;

	/*
	 * For hole punch round up the beginning offset of the hole and
	 * round down the end.
	 */
	hole_start = round_up(offset, hpage_size);
	hole_end = round_down(offset + len, hpage_size);

	if (hole_end > hole_start) {
		struct address_space *mapping = inode->i_mapping;

		inode_lock(inode);
		i_mmap_lock_write(mapping);
		if (!RB_EMPTY_ROOT(&mapping->i_mmap))
			hugetlb_vmdelete_list(&mapping->i_mmap,
						hole_start >> PAGE_SHIFT,
						hole_end >> PAGE_SHIFT);
		i_mmap_unlock_write(mapping);
		remove_inode_hugepages(inode, hole_start, hole_end);
		inode_unlock(inode);
	}

	return 0;
}

static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return hugetlbfs_punch_hole(inode, offset, len);

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	inode_lock(inode);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.  If NUMA is configured, use page index
	 * as input to create an allocation policy.
	 */
	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		struct page *page;
		unsigned long addr;
		int avoid_reserve = 0;

		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Set numa allocation policy based on index */
		hugetlb_set_vma_policy(&pseudo_vma, inode, index);

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
						index, addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		page = find_get_page(mapping, index);
		if (page) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			hugetlb_drop_vma_policy(&pseudo_vma);
			continue;
		}

		/* Allocate page and add to page cache */
		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
		hugetlb_drop_vma_policy(&pseudo_vma);
		if (IS_ERR(page)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, addr, pages_per_huge_page(h));
		__SetPageUptodate(page);
		error = huge_add_to_page_cache(page, mapping, index);
		if (unlikely(error)) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		/*
		 * put_page() due to reference from alloc_huge_page()
		 * unlock_page because locked by add_to_page_cache()
		 */
		put_page(page);
		unlock_page(page);
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = CURRENT_TIME;
out:
	inode_unlock(inode);
	return error;
}
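/*
 * Example (editor's illustration, not part of the original source):
 * preallocating and later hole punching huge pages from userspace,
 * assuming 2 MB huge pages:
 *
 *	fallocate(fd, 0, 0, 8 * 2097152);	// allocate 8 huge pages
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  0, 2 * 2097152);		// free the first two again
 *
 * Note that hugetlbfs_punch_hole() rounds the hole inward, so a partial
 * huge page at either end of the requested range is left intact.
 */
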
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct hstate *h = hstate_inode(inode);
	int error;
	unsigned int ia_valid = attr->ia_valid;

	BUG_ON(!inode);

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (ia_valid & ATTR_SIZE) {
		error = -EINVAL;
		if (attr->ia_size & ~huge_page_mask(h))
			return -EINVAL;
		error = hugetlb_vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_config *config)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_ino = get_next_ino();
		inode->i_mode = S_IFDIR | config->mode;
		inode->i_uid = config->uid;
		inode->i_gid = config->gid;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = HUGETLBFS_I(inode);
		mpol_shared_policy_init(&info->policy, NULL);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
		lockdep_annotate_inode_mutex_key(inode);
	}
	return inode;
}

/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under hugetlb's
 * i_mmap_rwsem.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					struct inode *dir,
					umode_t mode, dev_t dev)
{
	struct inode *inode;
	struct resv_map *resv_map;

	resv_map = resv_map_alloc();
	if (!resv_map)
		return NULL;

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info;
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				&hugetlbfs_i_mmap_rwsem_key);
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_mapping->private_data = resv_map;
		info = HUGETLBFS_I(inode);
		/*
		 * The policy is initialized here even if we are creating a
		 * private inode because initialization simply creates an
		 * empty rb tree and calls rwlock_init(), later when we
		 * call mpol_free_shared_policy() it will just return because
		 * the rb tree will still be empty.
		 */
		mpol_shared_policy_init(&info->policy, NULL);
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
			inc_nlink(inode);
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			inode_nohighmem(inode);
			break;
		}
		lockdep_annotate_inode_mutex_key(inode);
	} else
		kref_put(&resv_map->refs, resv_map_release);

	return inode;
}

/*
 * File creation. Allocate an inode, and we're done..
 */
static int hugetlbfs_mknod(struct inode *dir,
			struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;
	int error = -ENOSPC;

	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
	if (inode) {
		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
		d_instantiate(dentry, inode);
		dget(dentry);	/* Extra count - pin the dentry in core */
		error = 0;
	}
	return error;
}

static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
	if (!retval)
		inc_nlink(dir);
	return retval;
}

static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_symlink(struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	struct inode *inode;
	int error = -ENOSPC;

	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;

	return error;
}

/*
 * mark the head page dirty
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
	struct page *head = compound_head(page);

	SetPageDirty(head);
	return 0;
}

static int hugetlbfs_migrate_page(struct address_space *mapping,
				struct page *newpage, struct page *page,
				enum migrate_mode mode)
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;
	migrate_page_copy(newpage, page);

	return MIGRATEPAGE_SUCCESS;
}

static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
	struct hstate *h = hstate_inode(d_inode(dentry));

	buf->f_type = HUGETLBFS_MAGIC;
	buf->f_bsize = huge_page_size(h);
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
		/* If no limits set, just report 0 for max/free/used
		 * blocks, like simple_statfs() */
		if (sbinfo->spool) {
			long free_pages;

			spin_lock(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock(&sbinfo->spool->lock);
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

		kfree(sbi);
	}
}

static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		if (unlikely(!sbinfo->free_inodes)) {
			spin_unlock(&sbinfo->stat_lock);
			return 0;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

static struct kmem_cache *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
	struct hugetlbfs_inode_info *p;

	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
		return NULL;
	}
	return &p->vfs_inode;
}

static void hugetlbfs_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
}

static const struct address_space_operations hugetlbfs_aops = {
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
	.set_page_dirty	= hugetlbfs_set_page_dirty,
	.migratepage    = hugetlbfs_migrate_page,
};

static void init_once(void *foo)
{
	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

	inode_init_once(&ei->vfs_inode);
}

const struct file_operations hugetlbfs_file_operations = {
	.read_iter		= hugetlbfs_read_iter,
	.mmap			= hugetlbfs_file_mmap,
	.fsync			= noop_fsync,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
	.llseek			= default_llseek,
	.fallocate		= hugetlbfs_fallocate,
};

static const struct inode_operations hugetlbfs_dir_inode_operations = {
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
};

static const struct inode_operations hugetlbfs_inode_operations = {
	.setattr	= hugetlbfs_setattr,
};

static const struct super_operations hugetlbfs_ops = {
	.alloc_inode    = hugetlbfs_alloc_inode,
	.destroy_inode  = hugetlbfs_destroy_inode,
	.evict_inode	= hugetlbfs_evict_inode,
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
	.show_options	= generic_show_options,
};

enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };

/*
 * Convert size option passed from command line to number of huge pages
 * in the pool specified by hstate.  Size option could be in bytes
 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
 */
static long long
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
								int val_type)
{
	if (val_type == NO_SIZE)
		return -1;

	if (val_type == SIZE_PERCENT) {
		size_opt <<= huge_page_shift(h);
		size_opt *= h->max_huge_pages;
		do_div(size_opt, 100);
	}

	size_opt >>= huge_page_shift(h);
	return size_opt;
}
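/*
 * Worked example (editor's illustration, not part of the original source):
 * with a 2 MB hstate whose pool holds max_huge_pages = 1024, the option
 * size=25% is converted as
 *
 *	size_opt = 25 << 21;	// 25 "pages" worth of bytes
 *	size_opt *= 1024;	// scale by the pool size
 *	size_opt /= 100;	// 25% of the pool, in bytes
 *	size_opt >>= 21;	// -> 256 huge pages
 */
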
static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
	char *p, *rest;
	substring_t args[MAX_OPT_ARGS];
	int option;
	unsigned long long max_size_opt = 0, min_size_opt = 0;
	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;
		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_uid:
			if (match_int(&args[0], &option))
				goto bad_val;
			pconfig->uid = make_kuid(current_user_ns(), option);
			if (!uid_valid(pconfig->uid))
				goto bad_val;
			break;

		case Opt_gid:
			if (match_int(&args[0], &option))
				goto bad_val;
			pconfig->gid = make_kgid(current_user_ns(), option);
			if (!gid_valid(pconfig->gid))
				goto bad_val;
			break;

		case Opt_mode:
			if (match_octal(&args[0], &option))
				goto bad_val;
			pconfig->mode = option & 01777U;
			break;

		case Opt_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			max_size_opt = memparse(args[0].from, &rest);
			max_val_type = SIZE_STD;
			if (*rest == '%')
				max_val_type = SIZE_PERCENT;
			break;
		}

		case Opt_nr_inodes:
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			pconfig->nr_inodes = memparse(args[0].from, &rest);
			break;

		case Opt_pagesize: {
			unsigned long ps;
			ps = memparse(args[0].from, &rest);
			pconfig->hstate = size_to_hstate(ps);
			if (!pconfig->hstate) {
				pr_err("Unsupported page size %lu MB\n",
					ps >> 20);
				return -EINVAL;
			}
			break;
		}

		case Opt_min_size: {
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			min_size_opt = memparse(args[0].from, &rest);
			min_val_type = SIZE_STD;
			if (*rest == '%')
				min_val_type = SIZE_PERCENT;
			break;
		}

		default:
			pr_err("Bad mount option: \"%s\"\n", p);
			return -EINVAL;
			break;
		}
	}

	/*
	 * Use huge page pool size (in hstate) to convert the size
	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
	 */
	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
						max_size_opt, max_val_type);
	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
						min_size_opt, min_val_type);

	/*
	 * If max_size was specified, then min_size must be smaller
	 */
	if (max_val_type > NO_SIZE &&
	    pconfig->min_hpages > pconfig->max_hpages) {
		pr_err("minimum size can not be greater than maximum size\n");
		return -EINVAL;
	}

	return 0;

bad_val:
	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
	return -EINVAL;
}

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
	int ret;
	struct hugetlbfs_config config;
	struct hugetlbfs_sb_info *sbinfo;

	save_mount_options(sb, data);
	config.max_hpages = -1; /* No limit on size by default */
	config.nr_inodes = -1; /* No limit on number of inodes by default */
	config.uid = current_fsuid();
	config.gid = current_fsgid();
	config.mode = 0755;
	config.hstate = &default_hstate;
	config.min_hpages = -1; /* No default minimum size */
	ret = hugetlbfs_parse_options(data, &config);
	if (ret)
		return ret;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
	sbinfo->hstate = config.hstate;
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->max_inodes = config.nr_inodes;
	sbinfo->free_inodes = config.nr_inodes;
	sbinfo->spool = NULL;

	/*
	 * Allocate and initialize subpool if maximum or minimum size is
	 * specified.  Any needed reservations (for minimum size) are taken
	 * when the subpool is created.
	 */
	if (config.max_hpages != -1 || config.min_hpages != -1) {
		sbinfo->spool = hugepage_new_subpool(config.hstate,
							config.max_hpages,
							config.min_hpages);
		if (!sbinfo->spool)
			goto out_free;
	}
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = huge_page_size(config.hstate);
	sb->s_blocksize_bits = huge_page_shift(config.hstate);
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_time_gran = 1;
	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
	if (!sb->s_root)
		goto out_free;
	return 0;
out_free:
	kfree(sbinfo->spool);
	kfree(sbinfo);
	return -ENOMEM;
}

static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
}

static struct file_system_type hugetlbfs_fs_type = {
	.name		= "hugetlbfs",
	.mount		= hugetlbfs_mount,
	.kill_sb	= kill_litter_super,
};

static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

static int can_do_hugetlb_shm(void)
{
	kgid_t shm_group;
	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}

static int get_hstate_idx(int page_size_log)
{
	struct hstate *h = hstate_sizelog(page_size_log);

	if (!h)
		return -1;
	return h - hstates;
}

static const struct dentry_operations anon_ops = {
	.d_dname = simple_dname
};

/*
 * Note that size should be aligned to proper hugepage size in caller side,
 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
				vm_flags_t acctflag, struct user_struct **user,
				int creat_flags, int page_size_log)
{
	struct file *file = ERR_PTR(-ENOMEM);
	struct inode *inode;
	struct path path;
	struct super_block *sb;
	struct qstr quick_string;
	int hstate_idx;

	hstate_idx = get_hstate_idx(page_size_log);
	if (hstate_idx < 0)
		return ERR_PTR(-ENODEV);

	*user = NULL;
	if (!hugetlbfs_vfsmount[hstate_idx])
		return ERR_PTR(-ENOENT);

	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
		*user = current_user();
		if (user_shm_lock(size, *user)) {
			task_lock(current);
			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
				current->comm, current->pid);
			task_unlock(current);
		} else {
			*user = NULL;
			return ERR_PTR(-EPERM);
		}
	}

	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
	quick_string.name = name;
	quick_string.len = strlen(quick_string.name);
	quick_string.hash = 0;
	path.dentry = d_alloc_pseudo(sb, &quick_string);
	if (!path.dentry)
		goto out_shm_unlock;

	d_set_d_op(path.dentry, &anon_ops);
	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
	file = ERR_PTR(-ENOSPC);
	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto out_dentry;
	if (creat_flags == HUGETLB_SHMFS_INODE)
		inode->i_flags |= S_PRIVATE;

	file = ERR_PTR(-ENOMEM);
	if (hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag))
		goto out_inode;

	d_instantiate(path.dentry, inode);
	inode->i_size = size;
	clear_nlink(inode);

	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
			&hugetlbfs_file_operations);
	if (IS_ERR(file))
		goto out_dentry; /* inode is already attached */

	return file;

out_inode:
	iput(inode);
out_dentry:
	path_put(&path);
out_shm_unlock:
	if (*user) {
		user_shm_unlock(size, *user);
		*user = NULL;
	}
	return file;
}
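/*
 * Example (editor's illustration, not part of the original source):
 * hugetlb_file_setup() is the backend for SysV shared memory created with
 * SHM_HUGETLB; from userspace that path is exercised with e.g.
 *
 *	int id = shmget(IPC_PRIVATE, 8 * 2097152,
 *			IPC_CREAT | SHM_HUGETLB | 0600);
 *	void *p = shmat(id, NULL, 0);
 *
 * As the comment above hugetlb_file_setup() warns, the size passed in
 * should already be huge page aligned.
 */
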
static int __init init_hugetlbfs_fs(void)
{
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	error = -ENOMEM;
	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, SLAB_ACCOUNT, init_once);
	if (hugetlbfs_inode_cachep == NULL)
		goto out2;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out;

	i = 0;
	for_each_hstate(h) {
		char buf[50];
		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
							buf);

		if (IS_ERR(hugetlbfs_vfsmount[i])) {
			pr_err("Cannot mount internal hugetlbfs for "
				"page size %uK", ps_kb);
			error = PTR_ERR(hugetlbfs_vfsmount[i]);
			hugetlbfs_vfsmount[i] = NULL;
		}
		i++;
	}
	/* Non default hstates are optional */
	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
		return 0;

 out:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
	return error;
}
fs_initcall(init_hugetlbfs_fs)