Commit 4a3fd211ccfc08a88edc824300e25a87785c6a5f
Committed by
Al Viro
1 parent
42a74f206b
Exists in
master
and in
4 other branches
[PATCH] r/o bind mounts: elevate write count for open()s
This is the first really tricky patch in the series. It elevates the writer count on a mount each time a non-special file is opened for write.

We used to do this in may_open(), but Miklos pointed out that __dentry_open() is used as well to create filps. This will cover even those cases, while a call in may_open() would not have.

There is also an elevated count around the vfs_create() call in open_namei(). See the comments for more details, but we need this to fix a 'create, remount, fail r/w open()' race.

Some filesystems forgo the use of normal vfs calls to create struct files. Make sure that these users elevate the mnt writer count because they will get __fput(), and we need to make sure they're balanced.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Showing 4 changed files with 127 additions and 14 deletions Side-by-side Diff
fs/file_table.c
... | ... | @@ -199,6 +199,17 @@ |
199 | 199 | file->f_mapping = dentry->d_inode->i_mapping; |
200 | 200 | file->f_mode = mode; |
201 | 201 | file->f_op = fop; |
202 | + | |
203 | + /* | |
204 | + * These mounts don't really matter in practice | |
205 | + * for r/o bind mounts. They aren't userspace- | |
206 | + * visible. We do this for consistency, and so | |
207 | + * that we can do debugging checks at __fput() | |
208 | + */ | |
209 | + if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { | |
210 | + error = mnt_want_write(mnt); | |
211 | + WARN_ON(error); | |
212 | + } | |
202 | 213 | return error; |
203 | 214 | } |
204 | 215 | EXPORT_SYMBOL(init_file); |
205 | 216 | |
... | ... | @@ -221,10 +232,13 @@ |
221 | 232 | */ |
222 | 233 | void drop_file_write_access(struct file *file) |
223 | 234 | { |
235 | + struct vfsmount *mnt = file->f_path.mnt; | |
224 | 236 | struct dentry *dentry = file->f_path.dentry; |
225 | 237 | struct inode *inode = dentry->d_inode; |
226 | 238 | |
227 | 239 | put_write_access(inode); |
240 | + if (!special_file(inode->i_mode)) | |
241 | + mnt_drop_write(mnt); | |
228 | 242 | } |
229 | 243 | EXPORT_SYMBOL_GPL(drop_file_write_access); |
230 | 244 |
fs/namei.c
... | ... | @@ -1623,8 +1623,7 @@ |
1623 | 1623 | return -EACCES; |
1624 | 1624 | |
1625 | 1625 | flag &= ~O_TRUNC; |
1626 | - } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) | |
1627 | - return -EROFS; | |
1626 | + } | |
1628 | 1627 | |
1629 | 1628 | error = vfs_permission(nd, acc_mode); |
1630 | 1629 | if (error) |
1631 | 1630 | |
1632 | 1631 | |
1633 | 1632 | |
... | ... | @@ -1724,18 +1723,32 @@ |
1724 | 1723 | return flag; |
1725 | 1724 | } |
1726 | 1725 | |
1726 | +static int open_will_write_to_fs(int flag, struct inode *inode) | |
1727 | +{ | |
1728 | + /* | |
1729 | + * We'll never write to the fs underlying | |
1730 | + * a device file. | |
1731 | + */ | |
1732 | + if (special_file(inode->i_mode)) | |
1733 | + return 0; | |
1734 | + return (flag & O_TRUNC); | |
1735 | +} | |
1736 | + | |
1727 | 1737 | /* |
1728 | - * Note that the low bits of "flag" aren't the same as in the open | |
1729 | - * system call. See open_to_namei_flags(). | |
1738 | + * Note that the low bits of the passed in "open_flag" | |
1739 | + * are not the same as in the local variable "flag". See | |
1740 | + * open_to_namei_flags() for more details. | |
1730 | 1741 | */ |
1731 | 1742 | struct file *do_filp_open(int dfd, const char *pathname, |
1732 | 1743 | int open_flag, int mode) |
1733 | 1744 | { |
1745 | + struct file *filp; | |
1734 | 1746 | struct nameidata nd; |
1735 | 1747 | int acc_mode, error; |
1736 | 1748 | struct path path; |
1737 | 1749 | struct dentry *dir; |
1738 | 1750 | int count = 0; |
1751 | + int will_write; | |
1739 | 1752 | int flag = open_to_namei_flags(open_flag); |
1740 | 1753 | |
1741 | 1754 | acc_mode = ACC_MODE(flag); |
1742 | 1755 | |
1743 | 1756 | |
1744 | 1757 | |
1745 | 1758 | |
... | ... | @@ -1791,17 +1804,30 @@ |
1791 | 1804 | } |
1792 | 1805 | |
1793 | 1806 | if (IS_ERR(nd.intent.open.file)) { |
1794 | - mutex_unlock(&dir->d_inode->i_mutex); | |
1795 | 1807 | error = PTR_ERR(nd.intent.open.file); |
1796 | - goto exit_dput; | |
1808 | + goto exit_mutex_unlock; | |
1797 | 1809 | } |
1798 | 1810 | |
1799 | 1811 | /* Negative dentry, just create the file */ |
1800 | 1812 | if (!path.dentry->d_inode) { |
1801 | - error = __open_namei_create(&nd, &path, flag, mode); | |
1813 | + /* | |
1814 | + * This write is needed to ensure that a | |
1815 | + * ro->rw transition does not occur between | |
1816 | + * the time when the file is created and when | |
1817 | + * a permanent write count is taken through | |
1818 | + * the 'struct file' in nameidata_to_filp(). | |
1819 | + */ | |
1820 | + error = mnt_want_write(nd.path.mnt); | |
1802 | 1821 | if (error) |
1822 | + goto exit_mutex_unlock; | |
1823 | + error = __open_namei_create(&nd, &path, flag, mode); | |
1824 | + if (error) { | |
1825 | + mnt_drop_write(nd.path.mnt); | |
1803 | 1826 | goto exit; |
1804 | - return nameidata_to_filp(&nd, open_flag); | |
1827 | + } | |
1828 | + filp = nameidata_to_filp(&nd, open_flag); | |
1829 | + mnt_drop_write(nd.path.mnt); | |
1830 | + return filp; | |
1805 | 1831 | } |
1806 | 1832 | |
1807 | 1833 | /* |
1808 | 1834 | |
1809 | 1835 | |
1810 | 1836 | |
... | ... | @@ -1831,11 +1857,40 @@ |
1831 | 1857 | if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) |
1832 | 1858 | goto exit; |
1833 | 1859 | ok: |
1860 | + /* | |
1861 | + * Consider: | |
1862 | + * 1. may_open() truncates a file | |
1863 | + * 2. a rw->ro mount transition occurs | |
1864 | + * 3. nameidata_to_filp() fails due to | |
1865 | + * the ro mount. | |
1866 | + * That would be inconsistent, and should | |
1867 | + * be avoided. Taking this mnt write here | |
1868 | + * ensures that (2) can not occur. | |
1869 | + */ | |
1870 | + will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); | |
1871 | + if (will_write) { | |
1872 | + error = mnt_want_write(nd.path.mnt); | |
1873 | + if (error) | |
1874 | + goto exit; | |
1875 | + } | |
1834 | 1876 | error = may_open(&nd, acc_mode, flag); |
1835 | - if (error) | |
1877 | + if (error) { | |
1878 | + if (will_write) | |
1879 | + mnt_drop_write(nd.path.mnt); | |
1836 | 1880 | goto exit; |
1837 | - return nameidata_to_filp(&nd, open_flag); | |
1881 | + } | |
1882 | + filp = nameidata_to_filp(&nd, open_flag); | |
1883 | + /* | |
1884 | + * It is now safe to drop the mnt write | |
1885 | + * because the filp has had a write taken | |
1886 | + * on its behalf. | |
1887 | + */ | |
1888 | + if (will_write) | |
1889 | + mnt_drop_write(nd.path.mnt); | |
1890 | + return filp; | |
1838 | 1891 | |
1892 | +exit_mutex_unlock: | |
1893 | + mutex_unlock(&dir->d_inode->i_mutex); | |
1839 | 1894 | exit_dput: |
1840 | 1895 | path_put_conditional(&path, &nd); |
1841 | 1896 | exit: |
fs/open.c
... | ... | @@ -730,6 +730,35 @@ |
730 | 730 | return error; |
731 | 731 | } |
732 | 732 | |
733 | +/* | |
734 | + * You have to be very careful that these write | |
735 | + * counts get cleaned up in error cases and | |
736 | + * upon __fput(). This should probably never | |
737 | + * be called outside of __dentry_open(). | |
738 | + */ | |
739 | +static inline int __get_file_write_access(struct inode *inode, | |
740 | + struct vfsmount *mnt) | |
741 | +{ | |
742 | + int error; | |
743 | + error = get_write_access(inode); | |
744 | + if (error) | |
745 | + return error; | |
746 | + /* | |
747 | + * Do not take mount writer counts on | |
748 | + * special files since no writes to | |
749 | + * the mount itself will occur. | |
750 | + */ | |
751 | + if (!special_file(inode->i_mode)) { | |
752 | + /* | |
753 | + * Balanced in __fput() | |
754 | + */ | |
755 | + error = mnt_want_write(mnt); | |
756 | + if (error) | |
757 | + put_write_access(inode); | |
758 | + } | |
759 | + return error; | |
760 | +} | |
761 | + | |
733 | 762 | static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, |
734 | 763 | int flags, struct file *f, |
735 | 764 | int (*open)(struct inode *, struct file *)) |
... | ... | @@ -742,7 +771,7 @@ |
742 | 771 | FMODE_PREAD | FMODE_PWRITE; |
743 | 772 | inode = dentry->d_inode; |
744 | 773 | if (f->f_mode & FMODE_WRITE) { |
745 | - error = get_write_access(inode); | |
774 | + error = __get_file_write_access(inode, mnt); | |
746 | 775 | if (error) |
747 | 776 | goto cleanup_file; |
748 | 777 | } |
749 | 778 | |
... | ... | @@ -784,8 +813,11 @@ |
784 | 813 | |
785 | 814 | cleanup_all: |
786 | 815 | fops_put(f->f_op); |
787 | - if (f->f_mode & FMODE_WRITE) | |
816 | + if (f->f_mode & FMODE_WRITE) { | |
788 | 817 | put_write_access(inode); |
818 | + if (!special_file(inode->i_mode)) | |
819 | + mnt_drop_write(mnt); | |
820 | + } | |
789 | 821 | file_kill(f); |
790 | 822 | f->f_path.dentry = NULL; |
791 | 823 | f->f_path.mnt = NULL; |
ipc/mqueue.c
... | ... | @@ -598,6 +598,7 @@ |
598 | 598 | int oflag, mode_t mode, struct mq_attr __user *u_attr) |
599 | 599 | { |
600 | 600 | struct mq_attr attr; |
601 | + struct file *result; | |
601 | 602 | int ret; |
602 | 603 | |
603 | 604 | if (u_attr) { |
604 | 605 | |
605 | 606 | |
606 | 607 | |
... | ... | @@ -612,13 +613,24 @@ |
612 | 613 | } |
613 | 614 | |
614 | 615 | mode &= ~current->fs->umask; |
616 | + ret = mnt_want_write(mqueue_mnt); | |
617 | + if (ret) | |
618 | + goto out; | |
615 | 619 | ret = vfs_create(dir->d_inode, dentry, mode, NULL); |
616 | 620 | dentry->d_fsdata = NULL; |
617 | 621 | if (ret) |
618 | - goto out; | |
622 | + goto out_drop_write; | |
619 | 623 | |
620 | - return dentry_open(dentry, mqueue_mnt, oflag); | |
624 | + result = dentry_open(dentry, mqueue_mnt, oflag); | |
625 | + /* | |
626 | + * dentry_open() took a persistent mnt_want_write(), | |
627 | + * so we can now drop this one. | |
628 | + */ | |
629 | + mnt_drop_write(mqueue_mnt); | |
630 | + return result; | |
621 | 631 | |
632 | +out_drop_write: | |
633 | + mnt_drop_write(mqueue_mnt); | |
622 | 634 | out: |
623 | 635 | dput(dentry); |
624 | 636 | mntput(mqueue_mnt); |