Commit 25d9e2d15286281ec834b829a4aaf8969011f1cd

Authored by npiggin@suse.de
Committed by al
1 parent eca6f534e6

truncate: new helpers

Introduce new truncate helpers truncate_pagecache and inode_newsize_ok.
vmtruncate is also consolidated from mm/memory.c and mm/nommu.c
into mm/truncate.c.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 9 changed files with 120 additions and 108 deletions Side-by-side Diff

Documentation/vm/locking
... ... @@ -80,7 +80,7 @@
80 80 mm start up ... this is a loose form of stability on mm_users. For
81 81 example, it is used in copy_mm to protect against a racing tlb_gather_mmu
82 82 single address space optimization, so that the zap_page_range (from
83   -vmtruncate) does not lose sending ipi's to cloned threads that might
  83 +truncate) does not lose sending ipi's to cloned threads that might
84 84 be spawned underneath it and go to user mode to drag in pte's into tlbs.
85 85  
86 86 swap_lock
... ... @@ -18,7 +18,7 @@
18 18 /* Taken over from the old code... */
19 19  
20 20 /* POSIX UID/GID verification for setting inode attributes. */
21   -int inode_change_ok(struct inode *inode, struct iattr *attr)
  21 +int inode_change_ok(const struct inode *inode, struct iattr *attr)
22 22 {
23 23 int retval = -EPERM;
24 24 unsigned int ia_valid = attr->ia_valid;
25 25  
... ... @@ -60,8 +60,50 @@
60 60 error:
61 61 return retval;
62 62 }
63   -
64 63 EXPORT_SYMBOL(inode_change_ok);
  64 +
  65 +/**
  66 + * inode_newsize_ok - may this inode be truncated to a given size
  67 + * @inode: the inode to be truncated
  68 + * @offset: the new size to assign to the inode
  69 + * @Returns: 0 on success, -ve errno on failure
  70 + *
  71 + * inode_newsize_ok will check filesystem limits and ulimits to check that the
  72 + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
  73 + * when necessary. Caller must not proceed with inode size change if failure is
  74 + * returned. @inode must be a file (not directory), with appropriate
  75 + * permissions to allow truncate (inode_newsize_ok does NOT check these
  76 + * conditions).
  77 + *
  78 + * inode_newsize_ok must be called with i_mutex held.
  79 + */
  80 +int inode_newsize_ok(const struct inode *inode, loff_t offset)
  81 +{
  82 + if (inode->i_size < offset) {
  83 + unsigned long limit;
  84 +
  85 + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
  86 + if (limit != RLIM_INFINITY && offset > limit)
  87 + goto out_sig;
  88 + if (offset > inode->i_sb->s_maxbytes)
  89 + goto out_big;
  90 + } else {
  91 + /*
  92 + * truncation of in-use swapfiles is disallowed - it would
  93 + * cause subsequent swapout to scribble on the now-freed
  94 + * blocks.
  95 + */
  96 + if (IS_SWAPFILE(inode))
  97 + return -ETXTBSY;
  98 + }
  99 +
  100 + return 0;
  101 +out_sig:
  102 + send_sig(SIGXFSZ, current, 0);
  103 +out_big:
  104 + return -EFBIG;
  105 +}
  106 +EXPORT_SYMBOL(inode_newsize_ok);
65 107  
66 108 int inode_setattr(struct inode * inode, struct iattr * attr)
67 109 {
... ... @@ -2382,7 +2382,8 @@
2382 2382 #define buffer_migrate_page NULL
2383 2383 #endif
2384 2384  
2385   -extern int inode_change_ok(struct inode *, struct iattr *);
  2385 +extern int inode_change_ok(const struct inode *, struct iattr *);
  2386 +extern int inode_newsize_ok(const struct inode *, loff_t offset);
2386 2387 extern int __must_check inode_setattr(struct inode *, struct iattr *);
2387 2388  
2388 2389 extern void file_update_time(struct file *file);
... ... @@ -791,8 +791,9 @@
791 791 unmap_mapping_range(mapping, holebegin, holelen, 0);
792 792 }
793 793  
794   -extern int vmtruncate(struct inode * inode, loff_t offset);
795   -extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
  794 +extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
  795 +extern int vmtruncate(struct inode *inode, loff_t offset);
  796 +extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
796 797  
797 798 #ifdef CONFIG_MMU
798 799 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
... ... @@ -58,7 +58,7 @@
58 58 /*
59 59 * Lock ordering:
60 60 *
61   - * ->i_mmap_lock (vmtruncate)
  61 + * ->i_mmap_lock (truncate_pagecache)
62 62 * ->private_lock (__free_pte->__set_page_dirty_buffers)
63 63 * ->swap_lock (exclusive_swap_page, others)
64 64 * ->mapping->tree_lock
... ... @@ -297,7 +297,8 @@
297 297 unsigned long addr = vma->vm_start;
298 298  
299 299 /*
300   - * Hide vma from rmap and vmtruncate before freeing pgtables
  300 + * Hide vma from rmap and truncate_pagecache before freeing
  301 + * pgtables
301 302 */
302 303 anon_vma_unlink(vma);
303 304 unlink_file_vma(vma);
... ... @@ -2407,7 +2408,7 @@
2407 2408 * @mapping: the address space containing mmaps to be unmapped.
2408 2409 * @holebegin: byte in first page to unmap, relative to the start of
2409 2410 * the underlying file. This will be rounded down to a PAGE_SIZE
2410   - * boundary. Note that this is different from vmtruncate(), which
  2411 + * boundary. Note that this is different from truncate_pagecache(), which
2411 2412 * must keep the partial page. In contrast, we must get rid of
2412 2413 * partial pages.
2413 2414 * @holelen: size of prospective hole in bytes. This will be rounded
... ... @@ -2457,63 +2458,6 @@
2457 2458 spin_unlock(&mapping->i_mmap_lock);
2458 2459 }
2459 2460 EXPORT_SYMBOL(unmap_mapping_range);
2460   -
2461   -/**
2462   - * vmtruncate - unmap mappings "freed" by truncate() syscall
2463   - * @inode: inode of the file used
2464   - * @offset: file offset to start truncating
2465   - *
2466   - * NOTE! We have to be ready to update the memory sharing
2467   - * between the file and the memory map for a potential last
2468   - * incomplete page. Ugly, but necessary.
2469   - */
2470   -int vmtruncate(struct inode * inode, loff_t offset)
2471   -{
2472   - if (inode->i_size < offset) {
2473   - unsigned long limit;
2474   -
2475   - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2476   - if (limit != RLIM_INFINITY && offset > limit)
2477   - goto out_sig;
2478   - if (offset > inode->i_sb->s_maxbytes)
2479   - goto out_big;
2480   - i_size_write(inode, offset);
2481   - } else {
2482   - struct address_space *mapping = inode->i_mapping;
2483   -
2484   - /*
2485   - * truncation of in-use swapfiles is disallowed - it would
2486   - * cause subsequent swapout to scribble on the now-freed
2487   - * blocks.
2488   - */
2489   - if (IS_SWAPFILE(inode))
2490   - return -ETXTBSY;
2491   - i_size_write(inode, offset);
2492   -
2493   - /*
2494   - * unmap_mapping_range is called twice, first simply for
2495   - * efficiency so that truncate_inode_pages does fewer
2496   - * single-page unmaps. However after this first call, and
2497   - * before truncate_inode_pages finishes, it is possible for
2498   - * private pages to be COWed, which remain after
2499   - * truncate_inode_pages finishes, hence the second
2500   - * unmap_mapping_range call must be made for correctness.
2501   - */
2502   - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2503   - truncate_inode_pages(mapping, offset);
2504   - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2505   - }
2506   -
2507   - if (inode->i_op->truncate)
2508   - inode->i_op->truncate(inode);
2509   - return 0;
2510   -
2511   -out_sig:
2512   - send_sig(SIGXFSZ, current, 0);
2513   -out_big:
2514   - return -EFBIG;
2515   -}
2516   -EXPORT_SYMBOL(vmtruncate);
2517 2461  
2518 2462 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2519 2463 {
... ... @@ -86,8 +86,8 @@
86 86 if (vma->vm_file) {
87 87 /*
88 88 * Subtle point from Rajesh Venkatasubramanian: before
89   - * moving file-based ptes, we must lock vmtruncate out,
90   - * since it might clean the dst vma before the src vma,
  89 + * moving file-based ptes, we must lock truncate_pagecache
  90 + * out, since it might clean the dst vma before the src vma,
91 91 * and we propagate stale pages into the dst afterward.
92 92 */
93 93 mapping = vma->vm_file->f_mapping;
... ... @@ -83,46 +83,6 @@
83 83 };
84 84  
85 85 /*
86   - * Handle all mappings that got truncated by a "truncate()"
87   - * system call.
88   - *
89   - * NOTE! We have to be ready to update the memory sharing
90   - * between the file and the memory map for a potential last
91   - * incomplete page. Ugly, but necessary.
92   - */
93   -int vmtruncate(struct inode *inode, loff_t offset)
94   -{
95   - struct address_space *mapping = inode->i_mapping;
96   - unsigned long limit;
97   -
98   - if (inode->i_size < offset)
99   - goto do_expand;
100   - i_size_write(inode, offset);
101   -
102   - truncate_inode_pages(mapping, offset);
103   - goto out_truncate;
104   -
105   -do_expand:
106   - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
107   - if (limit != RLIM_INFINITY && offset > limit)
108   - goto out_sig;
109   - if (offset > inode->i_sb->s_maxbytes)
110   - goto out;
111   - i_size_write(inode, offset);
112   -
113   -out_truncate:
114   - if (inode->i_op->truncate)
115   - inode->i_op->truncate(inode);
116   - return 0;
117   -out_sig:
118   - send_sig(SIGXFSZ, current, 0);
119   -out:
120   - return -EFBIG;
121   -}
122   -
123   -EXPORT_SYMBOL(vmtruncate);
124   -
125   -/*
126 86 * Return the total memory allocated for this pointer, not
127 87 * just what the caller asked for.
128 88 *
... ... @@ -465,4 +465,68 @@
465 465 return invalidate_inode_pages2_range(mapping, 0, -1);
466 466 }
467 467 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
  468 +
  469 +/**
  470 + * truncate_pagecache - unmap and remove pagecache that has been truncated
  471 + * @inode: inode
  472 + * @old: old file offset
  473 + * @new: new file offset
  474 + *
  475 + * inode's new i_size must already be written before truncate_pagecache
  476 + * is called.
  477 + *
  478 + * This function should typically be called before the filesystem
  479 + * releases resources associated with the freed range (eg. deallocates
  480 + * blocks). This way, pagecache will always stay logically coherent
  481 + * with on-disk format, and the filesystem would not have to deal with
  482 + * situations such as writepage being called for a page that has already
  483 + * had its underlying blocks deallocated.
  484 + */
  485 +void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
  486 +{
  487 + if (new < old) {
  488 + struct address_space *mapping = inode->i_mapping;
  489 +
  490 + /*
  491 + * unmap_mapping_range is called twice, first simply for
  492 + * efficiency so that truncate_inode_pages does fewer
  493 + * single-page unmaps. However after this first call, and
  494 + * before truncate_inode_pages finishes, it is possible for
  495 + * private pages to be COWed, which remain after
  496 + * truncate_inode_pages finishes, hence the second
  497 + * unmap_mapping_range call must be made for correctness.
  498 + */
  499 + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
  500 + truncate_inode_pages(mapping, new);
  501 + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
  502 + }
  503 +}
  504 +EXPORT_SYMBOL(truncate_pagecache);
  505 +
  506 +/**
  507 + * vmtruncate - unmap mappings "freed" by truncate() syscall
  508 + * @inode: inode of the file used
  509 + * @offset: file offset to start truncating
  510 + *
  511 + * NOTE! We have to be ready to update the memory sharing
  512 + * between the file and the memory map for a potential last
  513 + * incomplete page. Ugly, but necessary.
  514 + */
  515 +int vmtruncate(struct inode *inode, loff_t offset)
  516 +{
  517 + loff_t oldsize;
  518 + int error;
  519 +
  520 + error = inode_newsize_ok(inode, offset);
  521 + if (error)
  522 + return error;
  523 + oldsize = inode->i_size;
  524 + i_size_write(inode, offset);
  525 + truncate_pagecache(inode, oldsize, offset);
  526 + if (inode->i_op->truncate)
  527 + inode->i_op->truncate(inode);
  528 +
  529 + return error;
  530 +}
  531 +EXPORT_SYMBOL(vmtruncate);