[PATCH] r/o bind mounts: elevate write count for open()s

This is the first really tricky patch in the series. It elevates the writer count on a mount each time a non-special file is opened for write. We used to do this in may_open(), but Miklos pointed out that __dentry_open() is used as well to create filps. This will cover even those cases, while a call in may_open() would not have. There is also an elevated count around the vfs_create() call in open_namei(). See the comments for more details, but we need this to fix a 'create, remount, fail r/w open()' race. Some filesystems forego the use of normal vfs calls to create struct files. Make sure that these users elevate the mnt writer count because they will get __fput(), and we need to make sure they're balanced. Acked-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

[PATCH] r/o bind mounts: elevate write count for open()s
This is the first really tricky patch in the series. It elevates the writer count on a mount each time a non-special file is opened for write. We used to do this in may_open(), but Miklos pointed out that __dentry_open() is used as well to create filps. This will cover even those cases, while a call in may_open() would not have. There is also an elevated count around the vfs_create() call in open_namei(). See the comments for more details, but we need this to fix a 'create, remount, fail r/w open()' race. Some filesystems forego the use of normal vfs calls to create struct files. Make sure that these users elevate the mnt writer count because they will get __fput(), and we need to make sure they're balanced. Acked-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Dave Hansen · Al Viro
1 parent 42a74f206b
Showing 4 changed files with 127 additions and 14 deletions Side-by-side Diff
fs/file_table.c
fs/namei.c
fs/open.c
ipc/mqueue.c
@@ -199,6 +199,17 @@
 	file->f_mapping = dentry->d_inode->i_mapping;
 	file->f_mode = mode;
 	file->f_op = fop;
+
+	/*
+	 * These mounts don't really matter in practice
+	 * for r/o bind mounts.  They aren't userspace-
+	 * visible.  We do this for consistency, and so
+	 * that we can do debugging checks at __fput()
+	 */
+	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+		error = mnt_want_write(mnt);
+		WARN_ON(error);
+	}
 	return error;
 }
 EXPORT_SYMBOL(init_file);
  
@@ -221,10 +232,13 @@
  */
 void drop_file_write_access(struct file *file)
 {
+	struct vfsmount *mnt = file->f_path.mnt;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
  
 	put_write_access(inode);
+	if (!special_file(inode->i_mode))
+		mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL_GPL(drop_file_write_access);
  
@@ -1623,8 +1623,7 @@
 			return -EACCES;
  
 		flag &= ~O_TRUNC;
-	} else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
-		return -EROFS;
+	}
  
 	error = vfs_permission(nd, acc_mode);
 	if (error)
  
  
  
@@ -1724,18 +1723,32 @@
 	return flag;
 }
  
+static int open_will_write_to_fs(int flag, struct inode *inode)
+{
+	/*
+	 * We'll never write to the fs underlying
+	 * a device file.
+	 */
+	if (special_file(inode->i_mode))
+		return 0;
+	return (flag & O_TRUNC);
+}
+
 /*
- * Note that the low bits of "flag" aren't the same as in the open
- * system call.  See open_to_namei_flags().
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
  */
 struct file *do_filp_open(int dfd, const char *pathname,
 		int open_flag, int mode)
 {
+	struct file *filp;
 	struct nameidata nd;
 	int acc_mode, error;
 	struct path path;
 	struct dentry *dir;
 	int count = 0;
+	int will_write;
 	int flag = open_to_namei_flags(open_flag);
  
 	acc_mode = ACC_MODE(flag);
  
  
  
  
@@ -1791,17 +1804,30 @@
 	}
  
 	if (IS_ERR(nd.intent.open.file)) {
-		mutex_unlock(&dir->d_inode->i_mutex);
 		error = PTR_ERR(nd.intent.open.file);
-		goto exit_dput;
+		goto exit_mutex_unlock;
 	}
  
 	/* Negative dentry, just create the file */
 	if (!path.dentry->d_inode) {
-		error = __open_namei_create(&nd, &path, flag, mode);
+		/*
+		 * This write is needed to ensure that a
+		 * rw->ro transition does not occur between
+		 * the time when the file is created and when
+		 * a permanent write count is taken through
+		 * the 'struct file' in nameidata_to_filp().
+		 */
+		error = mnt_want_write(nd.path.mnt);
 		if (error)
+			goto exit_mutex_unlock;
+		error = __open_namei_create(&nd, &path, flag, mode);
+		if (error) {
+			mnt_drop_write(nd.path.mnt);
 			goto exit;
-		return nameidata_to_filp(&nd, open_flag);
+		}
+		filp = nameidata_to_filp(&nd, open_flag);
+		mnt_drop_write(nd.path.mnt);
+		return filp;
 	}
  
 	/*
  
  
  
@@ -1831,11 +1857,40 @@
 	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
 		goto exit;
 ok:
+	/*
+	 * Consider:
+	 * 1. may_open() truncates a file
+	 * 2. a rw->ro mount transition occurs
+	 * 3. nameidata_to_filp() fails due to
+	 *    the ro mount.
+	 * That would be inconsistent, and should
+	 * be avoided. Taking this mnt write here
+	 * ensures that (2) can not occur.
+	 */
+	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+	if (will_write) {
+		error = mnt_want_write(nd.path.mnt);
+		if (error)
+			goto exit;
+	}
 	error = may_open(&nd, acc_mode, flag);
-	if (error)
+	if (error) {
+		if (will_write)
+			mnt_drop_write(nd.path.mnt);
 		goto exit;
-	return nameidata_to_filp(&nd, open_flag);
+	}
+	filp = nameidata_to_filp(&nd, open_flag);
+	/*
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
+	 */
+	if (will_write)
+		mnt_drop_write(nd.path.mnt);
+	return filp;
  
+exit_mutex_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
 	path_put_conditional(&path, &nd);
 exit:
@@ -730,6 +730,35 @@
 	return error;
 }
  
+/*
+ * You have to be very careful that these write
+ * counts get cleaned up in error cases and
+ * upon __fput().  This should probably never
+ * be called outside of __dentry_open().
+ */
+static inline int __get_file_write_access(struct inode *inode,
+					  struct vfsmount *mnt)
+{
+	int error;
+	error = get_write_access(inode);
+	if (error)
+		return error;
+	/*
+	 * Do not take mount writer counts on
+	 * special files since no writes to
+	 * the mount itself will occur.
+	 */
+	if (!special_file(inode->i_mode)) {
+		/*
+		 * Balanced in __fput()
+		 */
+		error = mnt_want_write(mnt);
+		if (error)
+			put_write_access(inode);
+	}
+	return error;
+}
+
 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 					int flags, struct file *f,
 					int (*open)(struct inode *, struct file *))
@@ -742,7 +771,7 @@
 				FMODE_PREAD | FMODE_PWRITE;
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
-		error = get_write_access(inode);
+		error = __get_file_write_access(inode, mnt);
 		if (error)
 			goto cleanup_file;
 	}
  
@@ -784,8 +813,11 @@
  
 cleanup_all:
 	fops_put(f->f_op);
-	if (f->f_mode & FMODE_WRITE)
+	if (f->f_mode & FMODE_WRITE) {
 		put_write_access(inode);
+		if (!special_file(inode->i_mode))
+			mnt_drop_write(mnt);
+	}
 	file_kill(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
@@ -598,6 +598,7 @@
 			int oflag, mode_t mode, struct mq_attr __user *u_attr)
 {
 	struct mq_attr attr;
+	struct file *result;
 	int ret;
  
 	if (u_attr) {
  
  
  
@@ -612,13 +613,24 @@
 	}
  
 	mode &= ~current->fs->umask;
+	ret = mnt_want_write(mqueue_mnt);
+	if (ret)
+		goto out;
 	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
 	dentry->d_fsdata = NULL;
 	if (ret)
-		goto out;
+		goto out_drop_write;
  
-	return dentry_open(dentry, mqueue_mnt, oflag);
+	result = dentry_open(dentry, mqueue_mnt, oflag);
+	/*
+	 * dentry_open() took a persistent mnt_want_write(),
+	 * so we can now drop this one.
+	 */
+	mnt_drop_write(mqueue_mnt);
+	return result;
  
+out_drop_write:
+	mnt_drop_write(mqueue_mnt);
 out:
 	dput(dentry);
 	mntput(mqueue_mnt);
...	...	@@ -199,6 +199,17 @@
199	199	file->f_mapping = dentry->d_inode->i_mapping;
200	200	file->f_mode = mode;
201	201	file->f_op = fop;
	202	+
	203	+ /*
	204	+ * These mounts don't really matter in practice
	205	+ * for r/o bind mounts. They aren't userspace-
	206	+ * visible. We do this for consistency, and so
	207	+ * that we can do debugging checks at __fput()
	208	+ */
	209	+ if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
	210	+ error = mnt_want_write(mnt);
	211	+ WARN_ON(error);
	212	+ }
202	213	return error;
203	214	}
204	215	EXPORT_SYMBOL(init_file);
205	216
...	...	@@ -221,10 +232,13 @@
221	232	*/
222	233	void drop_file_write_access(struct file *file)
223	234	{
	235	+ struct vfsmount *mnt = file->f_path.mnt;
224	236	struct dentry *dentry = file->f_path.dentry;
225	237	struct inode *inode = dentry->d_inode;
226	238
227	239	put_write_access(inode);
	240	+ if (!special_file(inode->i_mode))
	241	+ mnt_drop_write(mnt);
228	242	}
229	243	EXPORT_SYMBOL_GPL(drop_file_write_access);
230	244
...	...	@@ -1623,8 +1623,7 @@
1623	1623	return -EACCES;
1624	1624
1625	1625	flag &= ~O_TRUNC;
1626		- } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
1627		- return -EROFS;
	1626	+ }
1628	1627
1629	1628	error = vfs_permission(nd, acc_mode);
1630	1629	if (error)
1631	1630
1632	1631
1633	1632
...	...	@@ -1724,18 +1723,32 @@
1724	1723	return flag;
1725	1724	}
1726	1725
	1726	+static int open_will_write_to_fs(int flag, struct inode *inode)
	1727	+{
	1728	+ /*
	1729	+ * We'll never write to the fs underlying
	1730	+ * a device file.
	1731	+ */
	1732	+ if (special_file(inode->i_mode))
	1733	+ return 0;
	1734	+ return (flag & O_TRUNC);
	1735	+}
	1736	+
1727	1737	/*
1728		- * Note that the low bits of "flag" aren't the same as in the open
1729		- * system call. See open_to_namei_flags().
	1738	+ * Note that the low bits of the passed in "open_flag"
	1739	+ * are not the same as in the local variable "flag". See
	1740	+ * open_to_namei_flags() for more details.
1730	1741	*/
1731	1742	struct file *do_filp_open(int dfd, const char *pathname,
1732	1743	int open_flag, int mode)
1733	1744	{
	1745	+ struct file *filp;
1734	1746	struct nameidata nd;
1735	1747	int acc_mode, error;
1736	1748	struct path path;
1737	1749	struct dentry *dir;
1738	1750	int count = 0;
	1751	+ int will_write;
1739	1752	int flag = open_to_namei_flags(open_flag);
1740	1753
1741	1754	acc_mode = ACC_MODE(flag);
1742	1755
1743	1756
1744	1757
1745	1758
...	...	@@ -1791,17 +1804,30 @@
1791	1804	}
1792	1805
1793	1806	if (IS_ERR(nd.intent.open.file)) {
1794		- mutex_unlock(&dir->d_inode->i_mutex);
1795	1807	error = PTR_ERR(nd.intent.open.file);
1796		- goto exit_dput;
	1808	+ goto exit_mutex_unlock;
1797	1809	}
1798	1810
1799	1811	/* Negative dentry, just create the file */
1800	1812	if (!path.dentry->d_inode) {
1801		- error = __open_namei_create(&nd, &path, flag, mode);
	1813	+ /*
	1814	+ * This write is needed to ensure that a
	1815	+ * rw->ro transition does not occur between
	1816	+ * the time when the file is created and when
	1817	+ * a permanent write count is taken through
	1818	+ * the 'struct file' in nameidata_to_filp().
	1819	+ */
	1820	+ error = mnt_want_write(nd.path.mnt);
1802	1821	if (error)
	1822	+ goto exit_mutex_unlock;
	1823	+ error = __open_namei_create(&nd, &path, flag, mode);
	1824	+ if (error) {
	1825	+ mnt_drop_write(nd.path.mnt);
1803	1826	goto exit;
1804		- return nameidata_to_filp(&nd, open_flag);
	1827	+ }
	1828	+ filp = nameidata_to_filp(&nd, open_flag);
	1829	+ mnt_drop_write(nd.path.mnt);
	1830	+ return filp;
1805	1831	}
1806	1832
1807	1833	/*
1808	1834
1809	1835
1810	1836
...	...	@@ -1831,11 +1857,40 @@
1831	1857	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
1832	1858	goto exit;
1833	1859	ok:
	1860	+ /*
	1861	+ * Consider:
	1862	+ * 1. may_open() truncates a file
	1863	+ * 2. a rw->ro mount transition occurs
	1864	+ * 3. nameidata_to_filp() fails due to
	1865	+ * the ro mount.
	1866	+ * That would be inconsistent, and should
	1867	+ * be avoided. Taking this mnt write here
	1868	+ * ensures that (2) can not occur.
	1869	+ */
	1870	+ will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
	1871	+ if (will_write) {
	1872	+ error = mnt_want_write(nd.path.mnt);
	1873	+ if (error)
	1874	+ goto exit;
	1875	+ }
1834	1876	error = may_open(&nd, acc_mode, flag);
1835		- if (error)
	1877	+ if (error) {
	1878	+ if (will_write)
	1879	+ mnt_drop_write(nd.path.mnt);
1836	1880	goto exit;
1837		- return nameidata_to_filp(&nd, open_flag);
	1881	+ }
	1882	+ filp = nameidata_to_filp(&nd, open_flag);
	1883	+ /*
	1884	+ * It is now safe to drop the mnt write
	1885	+ * because the filp has had a write taken
	1886	+ * on its behalf.
	1887	+ */
	1888	+ if (will_write)
	1889	+ mnt_drop_write(nd.path.mnt);
	1890	+ return filp;
1838	1891
	1892	+exit_mutex_unlock:
	1893	+ mutex_unlock(&dir->d_inode->i_mutex);
1839	1894	exit_dput:
1840	1895	path_put_conditional(&path, &nd);
1841	1896	exit:
...	...	@@ -730,6 +730,35 @@
730	730	return error;
731	731	}
732	732
	733	+/*
	734	+ * You have to be very careful that these write
	735	+ * counts get cleaned up in error cases and
	736	+ * upon __fput(). This should probably never
	737	+ * be called outside of __dentry_open().
	738	+ */
	739	+static inline int __get_file_write_access(struct inode *inode,
	740	+ struct vfsmount *mnt)
	741	+{
	742	+ int error;
	743	+ error = get_write_access(inode);
	744	+ if (error)
	745	+ return error;
	746	+ /*
	747	+ * Do not take mount writer counts on
	748	+ * special files since no writes to
	749	+ * the mount itself will occur.
	750	+ */
	751	+ if (!special_file(inode->i_mode)) {
	752	+ /*
	753	+ * Balanced in __fput()
	754	+ */
	755	+ error = mnt_want_write(mnt);
	756	+ if (error)
	757	+ put_write_access(inode);
	758	+ }
	759	+ return error;
	760	+}
	761	+
733	762	static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
734	763	int flags, struct file *f,
735	764	int (*open)(struct inode *, struct file *))
...	...	@@ -742,7 +771,7 @@
742	771	FMODE_PREAD | FMODE_PWRITE;
743	772	inode = dentry->d_inode;
744	773	if (f->f_mode & FMODE_WRITE) {
745		- error = get_write_access(inode);
	774	+ error = __get_file_write_access(inode, mnt);
746	775	if (error)
747	776	goto cleanup_file;
748	777	}
749	778
...	...	@@ -784,8 +813,11 @@
784	813
785	814	cleanup_all:
786	815	fops_put(f->f_op);
787		- if (f->f_mode & FMODE_WRITE)
	816	+ if (f->f_mode & FMODE_WRITE) {
788	817	put_write_access(inode);
	818	+ if (!special_file(inode->i_mode))
	819	+ mnt_drop_write(mnt);
	820	+ }
789	821	file_kill(f);
790	822	f->f_path.dentry = NULL;
791	823	f->f_path.mnt = NULL;
...	...	@@ -598,6 +598,7 @@
598	598	int oflag, mode_t mode, struct mq_attr __user *u_attr)
599	599	{
600	600	struct mq_attr attr;
	601	+ struct file *result;
601	602	int ret;
602	603
603	604	if (u_attr) {
604	605
605	606
606	607
...	...	@@ -612,13 +613,24 @@
612	613	}
613	614
614	615	mode &= ~current->fs->umask;
	616	+ ret = mnt_want_write(mqueue_mnt);
	617	+ if (ret)
	618	+ goto out;
615	619	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
616	620	dentry->d_fsdata = NULL;
617	621	if (ret)
618		- goto out;
	622	+ goto out_drop_write;
619	623
620		- return dentry_open(dentry, mqueue_mnt, oflag);
	624	+ result = dentry_open(dentry, mqueue_mnt, oflag);
	625	+ /*
	626	+ * dentry_open() took a persistent mnt_want_write(),
	627	+ * so we can now drop this one.
	628	+ */
	629	+ mnt_drop_write(mqueue_mnt);
	630	+ return result;
621	631
	632	+out_drop_write:
	633	+ mnt_drop_write(mqueue_mnt);
622	634	out:
623	635	dput(dentry);
624	636	mntput(mqueue_mnt);