Commit 4a3fd211ccfc08a88edc824300e25a87785c6a5f

Authored by Dave Hansen
Committed by Al Viro
1 parent 42a74f206b

[PATCH] r/o bind mounts: elevate write count for open()s

This is the first really tricky patch in the series.  It elevates the writer
count on a mount each time a non-special file is opened for write.

We used to do this in may_open(), but Miklos pointed out that __dentry_open()
is used as well to create filps.  This will cover even those cases, while a
call in may_open() would not have.

There is also an elevated count around the vfs_create() call in open_namei().
See the comments for more details, but we need this to fix a 'create, remount,
fail r/w open()' race.

Some filesystems forgo the use of normal vfs calls to create
struct files.  Make sure that these users elevate the mnt
writer count because they will get __fput(), and we need
to make sure the counts stay balanced.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 4 changed files with 127 additions and 14 deletions Side-by-side Diff

... ... @@ -199,6 +199,17 @@
199 199 file->f_mapping = dentry->d_inode->i_mapping;
200 200 file->f_mode = mode;
201 201 file->f_op = fop;
  202 +
  203 + /*
  204 + * These mounts don't really matter in practice
  205 + * for r/o bind mounts. They aren't userspace-
  206 + * visible. We do this for consistency, and so
  207 + * that we can do debugging checks at __fput()
  208 + */
  209 + if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
  210 + error = mnt_want_write(mnt);
  211 + WARN_ON(error);
  212 + }
202 213 return error;
203 214 }
204 215 EXPORT_SYMBOL(init_file);
205 216  
... ... @@ -221,10 +232,13 @@
221 232 */
222 233 void drop_file_write_access(struct file *file)
223 234 {
  235 + struct vfsmount *mnt = file->f_path.mnt;
224 236 struct dentry *dentry = file->f_path.dentry;
225 237 struct inode *inode = dentry->d_inode;
226 238  
227 239 put_write_access(inode);
  240 + if (!special_file(inode->i_mode))
  241 + mnt_drop_write(mnt);
228 242 }
229 243 EXPORT_SYMBOL_GPL(drop_file_write_access);
230 244  
... ... @@ -1623,8 +1623,7 @@
1623 1623 return -EACCES;
1624 1624  
1625 1625 flag &= ~O_TRUNC;
1626   - } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
1627   - return -EROFS;
  1626 + }
1628 1627  
1629 1628 error = vfs_permission(nd, acc_mode);
1630 1629 if (error)
1631 1630  
1632 1631  
1633 1632  
... ... @@ -1724,18 +1723,32 @@
1724 1723 return flag;
1725 1724 }
1726 1725  
  1726 +static int open_will_write_to_fs(int flag, struct inode *inode)
  1727 +{
  1728 + /*
  1729 + * We'll never write to the fs underlying
  1730 + * a device file.
  1731 + */
  1732 + if (special_file(inode->i_mode))
  1733 + return 0;
  1734 + return (flag & O_TRUNC);
  1735 +}
  1736 +
1727 1737 /*
1728   - * Note that the low bits of "flag" aren't the same as in the open
1729   - * system call. See open_to_namei_flags().
  1738 + * Note that the low bits of the passed in "open_flag"
  1739 + * are not the same as in the local variable "flag". See
  1740 + * open_to_namei_flags() for more details.
1730 1741 */
1731 1742 struct file *do_filp_open(int dfd, const char *pathname,
1732 1743 int open_flag, int mode)
1733 1744 {
  1745 + struct file *filp;
1734 1746 struct nameidata nd;
1735 1747 int acc_mode, error;
1736 1748 struct path path;
1737 1749 struct dentry *dir;
1738 1750 int count = 0;
  1751 + int will_write;
1739 1752 int flag = open_to_namei_flags(open_flag);
1740 1753  
1741 1754 acc_mode = ACC_MODE(flag);
1742 1755  
1743 1756  
1744 1757  
1745 1758  
... ... @@ -1791,17 +1804,30 @@
1791 1804 }
1792 1805  
1793 1806 if (IS_ERR(nd.intent.open.file)) {
1794   - mutex_unlock(&dir->d_inode->i_mutex);
1795 1807 error = PTR_ERR(nd.intent.open.file);
1796   - goto exit_dput;
  1808 + goto exit_mutex_unlock;
1797 1809 }
1798 1810  
1799 1811 /* Negative dentry, just create the file */
1800 1812 if (!path.dentry->d_inode) {
1801   - error = __open_namei_create(&nd, &path, flag, mode);
  1813 + /*
  1814 + * This write is needed to ensure that a
  1815 + * ro->rw transition does not occur between
  1816 + * the time when the file is created and when
  1817 + * a permanent write count is taken through
  1818 + * the 'struct file' in nameidata_to_filp().
  1819 + */
  1820 + error = mnt_want_write(nd.path.mnt);
1802 1821 if (error)
  1822 + goto exit_mutex_unlock;
  1823 + error = __open_namei_create(&nd, &path, flag, mode);
  1824 + if (error) {
  1825 + mnt_drop_write(nd.path.mnt);
1803 1826 goto exit;
1804   - return nameidata_to_filp(&nd, open_flag);
  1827 + }
  1828 + filp = nameidata_to_filp(&nd, open_flag);
  1829 + mnt_drop_write(nd.path.mnt);
  1830 + return filp;
1805 1831 }
1806 1832  
1807 1833 /*
1808 1834  
1809 1835  
1810 1836  
... ... @@ -1831,11 +1857,40 @@
1831 1857 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
1832 1858 goto exit;
1833 1859 ok:
  1860 + /*
  1861 + * Consider:
  1862 + * 1. may_open() truncates a file
  1863 + * 2. a rw->ro mount transition occurs
  1864 + * 3. nameidata_to_filp() fails due to
  1865 + * the ro mount.
  1866 + * That would be inconsistent, and should
  1867 + * be avoided. Taking this mnt write here
  1868 + * ensures that (2) can not occur.
  1869 + */
  1870 + will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
  1871 + if (will_write) {
  1872 + error = mnt_want_write(nd.path.mnt);
  1873 + if (error)
  1874 + goto exit;
  1875 + }
1834 1876 error = may_open(&nd, acc_mode, flag);
1835   - if (error)
  1877 + if (error) {
  1878 + if (will_write)
  1879 + mnt_drop_write(nd.path.mnt);
1836 1880 goto exit;
1837   - return nameidata_to_filp(&nd, open_flag);
  1881 + }
  1882 + filp = nameidata_to_filp(&nd, open_flag);
  1883 + /*
  1884 + * It is now safe to drop the mnt write
  1885 + * because the filp has had a write taken
  1886 + * on its behalf.
  1887 + */
  1888 + if (will_write)
  1889 + mnt_drop_write(nd.path.mnt);
  1890 + return filp;
1838 1891  
  1892 +exit_mutex_unlock:
  1893 + mutex_unlock(&dir->d_inode->i_mutex);
1839 1894 exit_dput:
1840 1895 path_put_conditional(&path, &nd);
1841 1896 exit:
... ... @@ -730,6 +730,35 @@
730 730 return error;
731 731 }
732 732  
  733 +/*
  734 + * You have to be very careful that these write
  735 + * counts get cleaned up in error cases and
  736 + * upon __fput(). This should probably never
  737 + * be called outside of __dentry_open().
  738 + */
  739 +static inline int __get_file_write_access(struct inode *inode,
  740 + struct vfsmount *mnt)
  741 +{
  742 + int error;
  743 + error = get_write_access(inode);
  744 + if (error)
  745 + return error;
  746 + /*
  747 + * Do not take mount writer counts on
  748 + * special files since no writes to
  749 + * the mount itself will occur.
  750 + */
  751 + if (!special_file(inode->i_mode)) {
  752 + /*
  753 + * Balanced in __fput()
  754 + */
  755 + error = mnt_want_write(mnt);
  756 + if (error)
  757 + put_write_access(inode);
  758 + }
  759 + return error;
  760 +}
  761 +
733 762 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
734 763 int flags, struct file *f,
735 764 int (*open)(struct inode *, struct file *))
... ... @@ -742,7 +771,7 @@
742 771 FMODE_PREAD | FMODE_PWRITE;
743 772 inode = dentry->d_inode;
744 773 if (f->f_mode & FMODE_WRITE) {
745   - error = get_write_access(inode);
  774 + error = __get_file_write_access(inode, mnt);
746 775 if (error)
747 776 goto cleanup_file;
748 777 }
749 778  
... ... @@ -784,8 +813,11 @@
784 813  
785 814 cleanup_all:
786 815 fops_put(f->f_op);
787   - if (f->f_mode & FMODE_WRITE)
  816 + if (f->f_mode & FMODE_WRITE) {
788 817 put_write_access(inode);
  818 + if (!special_file(inode->i_mode))
  819 + mnt_drop_write(mnt);
  820 + }
789 821 file_kill(f);
790 822 f->f_path.dentry = NULL;
791 823 f->f_path.mnt = NULL;
... ... @@ -598,6 +598,7 @@
598 598 int oflag, mode_t mode, struct mq_attr __user *u_attr)
599 599 {
600 600 struct mq_attr attr;
  601 + struct file *result;
601 602 int ret;
602 603  
603 604 if (u_attr) {
604 605  
605 606  
606 607  
... ... @@ -612,13 +613,24 @@
612 613 }
613 614  
614 615 mode &= ~current->fs->umask;
  616 + ret = mnt_want_write(mqueue_mnt);
  617 + if (ret)
  618 + goto out;
615 619 ret = vfs_create(dir->d_inode, dentry, mode, NULL);
616 620 dentry->d_fsdata = NULL;
617 621 if (ret)
618   - goto out;
  622 + goto out_drop_write;
619 623  
620   - return dentry_open(dentry, mqueue_mnt, oflag);
  624 + result = dentry_open(dentry, mqueue_mnt, oflag);
  625 + /*
  626 + * dentry_open() took a persistent mnt_want_write(),
  627 + * so we can now drop this one.
  628 + */
  629 + mnt_drop_write(mqueue_mnt);
  630 + return result;
621 631  
  632 +out_drop_write:
  633 + mnt_drop_write(mqueue_mnt);
622 634 out:
623 635 dput(dentry);
624 636 mntput(mqueue_mnt);