xfs: merge fsync and O_SYNC handling

The guarantees for O_SYNC are exactly the same as the ones we need to make for an fsync call (and given that Linux O_SYNC is O_DSYNC the equivalent is fdatasync, but we treat both the same in XFS), except with a range data writeout. Jan Kara has started unifying these two paths for filesystems using the generic helpers, and I've started to look at XFS. The actual transaction committed by xfs_fsync and xfs_write_sync_logforce has a different transaction number, but actually is exactly the same. We'll only use the fsync transaction going forward. One major difference is that xfs_write_sync_logforce never issues a cache flush unless we commit a transaction causing that as a side-effect, which is an obvious bug in the O_SYNC handling. Second, all the locking and i_update_size vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934 never made it to xfs_write_sync_logforce, so we add them back. To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait call is moved up to xfs_file_fsync, so that we don't wait on the whole file after we already waited for our portion in xfs_write. We'll also use a plain call to filemap_write_and_wait_range instead of the previous sync_page_range which did it in two steps including a half-hearted inode write out that doesn't help us. Once we're done with this, also remove the now useless i_update_size tracking. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Felix Blyakher <felixb@sgi.com> Signed-off-by: Felix Blyakher <felixb@sgi.com>

xfs: merge fsync and O_SYNC handling
The guarantees for O_SYNC are exactly the same as the ones we need to make for an fsync call (and given that Linux O_SYNC is O_DSYNC the equivalent is fdatasync, but we treat both the same in XFS), except with a range data writeout. Jan Kara has started unifying these two paths for filesystems using the generic helpers, and I've started to look at XFS. The actual transaction committed by xfs_fsync and xfs_write_sync_logforce has a different transaction number, but actually is exactly the same. We'll only use the fsync transaction going forward. One major difference is that xfs_write_sync_logforce never issues a cache flush unless we commit a transaction causing that as a side-effect, which is an obvious bug in the O_SYNC handling. Second, all the locking and i_update_size vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934 never made it to xfs_write_sync_logforce, so we add them back. To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait call is moved up to xfs_file_fsync, so that we don't wait on the whole file after we already waited for our portion in xfs_write. We'll also use a plain call to filemap_write_and_wait_range instead of the previous sync_page_range which did it in two steps including a half-hearted inode write out that doesn't help us. Once we're done with this, also remove the now useless i_update_size tracking. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Felix Blyakher <felixb@sgi.com> Signed-off-by: Felix Blyakher <felixb@sgi.com>
Christoph Hellwig · Felix Blyakher
1 parent bd16956599
Showing 10 changed files with 23 additions and 112 deletions Side-by-side Diff
fs/xfs/linux-2.6/xfs_aops.c
fs/xfs/linux-2.6/xfs_file.c
fs/xfs/linux-2.6/xfs_lrw.c
fs/xfs/xfs_iget.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_rw.c
fs/xfs/xfs_rw.h
fs/xfs/xfs_trans.h
fs/xfs/xfs_vnodeops.c
@@ -216,7 +216,6 @@
 	if (ip->i_d.di_size < isize) {
 		ip->i_d.di_size = isize;
 		ip->i_update_core = 1;
-		ip->i_update_size = 1;
 		xfs_mark_inode_dirty_sync(ip);
 	}
  
@@ -172,12 +172,21 @@
  */
 STATIC int
 xfs_file_fsync(
-	struct file	*filp,
-	struct dentry	*dentry,
-	int		datasync)
+	struct file		*file,
+	struct dentry		*dentry,
+	int			datasync)
 {
-	xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
-	return -xfs_fsync(XFS_I(dentry->d_inode));
+	struct inode		*inode = dentry->d_inode;
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
+
+	/* capture size updates in I/O completion before writing the inode. */
+	error = filemap_fdatawait(inode->i_mapping);
+	if (error)
+		return error;
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	return -xfs_fsync(ip);
 }
  
 STATIC int
@@ -812,18 +812,21 @@
  
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+		loff_t end = pos + ret - 1;
 		int error2;
  
 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error2 = sync_page_range(inode, mapping, pos, ret);
+
+		error2 = filemap_write_and_wait_range(mapping, pos, end);
 		if (!error)
 			error = error2;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_ilock(xip, iolock);
-		error2 = xfs_write_sync_logforce(mp, xip);
+
+		error2 = xfs_fsync(xip);
 		if (!error)
 			error = error2;
 	}
@@ -82,7 +82,6 @@
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
 	ip->i_update_core = 0;
-	ip->i_update_size = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
 	ip->i_size = 0;
@@ -261,7 +261,6 @@
 	/* Miscellaneous state. */
 	unsigned short		i_flags;	/* see defined flags below */
 	unsigned char		i_update_core;	/* timestamps/size is dirty */
-	unsigned char		i_update_size;	/* di_size field is dirty */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
  
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
@@ -263,14 +263,6 @@
 	}
  
 	/*
-	 * We don't have to worry about re-ordering here because
-	 * the update_size field is protected by the inode lock
-	 * and we have that held in exclusive mode.
-	 */
-	if (ip->i_update_size)
-		ip->i_update_size = 0;
-
-	/*
 	 * Make sure to get the latest atime from the Linux inode.
 	 */
 	xfs_synchronize_atime(ip);
@@ -88,90 +88,6 @@
 }
  
 /*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	int		error = 0;
-
-	/*
-	 * If we're treating this as O_DSYNC and we have not updated the
-	 * size, force the log.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-	    !(ip->i_update_size)) {
-		xfs_inode_log_item_t	*iip = ip->i_itemp;
-
-		/*
-		 * If an allocation transaction occurred
-		 * without extending the size, then we have to force
-		 * the log up the proper point to ensure that the
-		 * allocation is permanent.  We can't count on
-		 * the fact that buffered writes lock out direct I/O
-		 * writes - the direct I/O write could have extended
-		 * the size nontransactionally, then finished before
-		 * we started.  xfs_write_file will think that the file
-		 * didn't grow but the update isn't safe unless the
-		 * size change is logged.
-		 *
-		 * Force the log if we've committed a transaction
-		 * against the inode or if someone else has and
-		 * the commit record hasn't gone to disk (e.g.
-		 * the inode is pinned).  This guarantees that
-		 * all changes affecting the inode are permanent
-		 * when we return.
-		 */
-		if (iip && iip->ili_last_lsn) {
-			error = _xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		} else if (xfs_ipincount(ip) > 0) {
-			error = _xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		}
-
-	} else {
-		xfs_trans_t	*tp;
-
-		/*
-		 * O_SYNC or O_DSYNC _with_ a size update are handled
-		 * the same way.
-		 *
-		 * If the write was synchronous then we need to make
-		 * sure that the inode modification time is permanent.
-		 * We'll have updated the timestamp above, so here
-		 * we use a synchronous transaction to log the inode.
-		 * It's not fast, but it's necessary.
-		 *
-		 * If this a dsync write and the size got changed
-		 * non-transactionally, then we need to ensure that
-		 * the size change gets logged in a synchronous
-		 * transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-		if ((error = xfs_trans_reserve(tp, 0,
-						XFS_SWRITE_LOG_RES(mp),
-						0, 0, 0))) {
-			/* Transaction reserve failed */
-			xfs_trans_cancel(tp, 0);
-		} else {
-			/* Transaction reserve successful */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-	}
-
-	return error;
-}
-
-/*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
  * the shop, make sure that absolutely nothing persistent happens to
@@ -68,7 +68,6 @@
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -68,7 +68,7 @@
 #define XFS_TRANS_GROWFS		14
 #define XFS_TRANS_STRAT_WRITE		15
 #define XFS_TRANS_DIOSTRAT		16
-#define	XFS_TRANS_WRITE_SYNC		17
+/* 17 was XFS_TRANS_WRITE_SYNC */
 #define	XFS_TRANS_WRITEID		18
 #define	XFS_TRANS_ADDAFORK		19
 #define	XFS_TRANS_ATTRINVAL		20
@@ -611,7 +611,7 @@
 	xfs_inode_t	*ip)
 {
 	xfs_trans_t	*tp;
-	int		error;
+	int		error = 0;
 	int		log_flushed = 0, changed = 1;
  
 	xfs_itrace_entry(ip);
  
@@ -619,14 +619,9 @@
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
  
-	/* capture size updates in I/O completion before writing the inode. */
-	error = xfs_wait_on_pages(ip, 0, -1);
-	if (error)
-		return XFS_ERROR(error);
-
 	/*
 	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The vnode might be clean but we still might need to force the
+	 * disk.  The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
 	 * Likewise, there could be unflushed non-transactional changes to the
 	 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
  
-	if (!(ip->i_update_size || ip->i_update_core)) {
+	if (!ip->i_update_core) {
 		/*
 		 * Timestamps/size haven't changed since last inode flush or
 		 * inode transaction commit.  That means either nothing got
...	...	@@ -216,7 +216,6 @@
216	216	if (ip->i_d.di_size < isize) {
217	217	ip->i_d.di_size = isize;
218	218	ip->i_update_core = 1;
219		- ip->i_update_size = 1;
220	219	xfs_mark_inode_dirty_sync(ip);
221	220	}
222	221
...	...	@@ -172,12 +172,21 @@
172	172	*/
173	173	STATIC int
174	174	xfs_file_fsync(
175		- struct file *filp,
176		- struct dentry *dentry,
177		- int datasync)
	175	+ struct file *file,
	176	+ struct dentry *dentry,
	177	+ int datasync)
178	178	{
179		- xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
180		- return -xfs_fsync(XFS_I(dentry->d_inode));
	179	+ struct inode *inode = dentry->d_inode;
	180	+ struct xfs_inode *ip = XFS_I(inode);
	181	+ int error;
	182	+
	183	+ /* capture size updates in I/O completion before writing the inode. */
	184	+ error = filemap_fdatawait(inode->i_mapping);
	185	+ if (error)
	186	+ return error;
	187	+
	188	+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
	189	+ return -xfs_fsync(ip);
181	190	}
182	191
183	192	STATIC int
...	...	@@ -812,18 +812,21 @@
812	812
813	813	/* Handle various SYNC-type writes */
814	814	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
	815	+ loff_t end = pos + ret - 1;
815	816	int error2;
816	817
817	818	xfs_iunlock(xip, iolock);
818	819	if (need_i_mutex)
819	820	mutex_unlock(&inode->i_mutex);
820		- error2 = sync_page_range(inode, mapping, pos, ret);
	821	+
	822	+ error2 = filemap_write_and_wait_range(mapping, pos, end);
821	823	if (!error)
822	824	error = error2;
823	825	if (need_i_mutex)
824	826	mutex_lock(&inode->i_mutex);
825	827	xfs_ilock(xip, iolock);
826		- error2 = xfs_write_sync_logforce(mp, xip);
	828	+
	829	+ error2 = xfs_fsync(xip);
827	830	if (!error)
828	831	error = error2;
829	832	}
...	...	@@ -82,7 +82,6 @@
82	82	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83	83	ip->i_flags = 0;
84	84	ip->i_update_core = 0;
85		- ip->i_update_size = 0;
86	85	ip->i_delayed_blks = 0;
87	86	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
88	87	ip->i_size = 0;
...	...	@@ -261,7 +261,6 @@
261	261	/* Miscellaneous state. */
262	262	unsigned short i_flags; /* see defined flags below */
263	263	unsigned char i_update_core; /* timestamps/size is dirty */
264		- unsigned char i_update_size; /* di_size field is dirty */
265	264	unsigned int i_delayed_blks; /* count of delay alloc blks */
266	265
267	266	xfs_icdinode_t i_d; /* most of ondisk inode */
...	...	@@ -263,14 +263,6 @@
263	263	}
264	264
265	265	/*
266		- * We don't have to worry about re-ordering here because
267		- * the update_size field is protected by the inode lock
268		- * and we have that held in exclusive mode.
269		- */
270		- if (ip->i_update_size)
271		- ip->i_update_size = 0;
272		-
273		- /*
274	266	* Make sure to get the latest atime from the Linux inode.
275	267	*/
276	268	xfs_synchronize_atime(ip);
...	...	@@ -88,90 +88,6 @@
88	88	}
89	89
90	90	/*
91		- * Handle logging requirements of various synchronous types of write.
92		- */
93		-int
94		-xfs_write_sync_logforce(
95		- xfs_mount_t *mp,
96		- xfs_inode_t *ip)
97		-{
98		- int error = 0;
99		-
100		- /*
101		- * If we're treating this as O_DSYNC and we have not updated the
102		- * size, force the log.
103		- */
104		- if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
105		- !(ip->i_update_size)) {
106		- xfs_inode_log_item_t *iip = ip->i_itemp;
107		-
108		- /*
109		- * If an allocation transaction occurred
110		- * without extending the size, then we have to force
111		- * the log up the proper point to ensure that the
112		- * allocation is permanent. We can't count on
113		- * the fact that buffered writes lock out direct I/O
114		- * writes - the direct I/O write could have extended
115		- * the size nontransactionally, then finished before
116		- * we started. xfs_write_file will think that the file
117		- * didn't grow but the update isn't safe unless the
118		- * size change is logged.
119		- *
120		- * Force the log if we've committed a transaction
121		- * against the inode or if someone else has and
122		- * the commit record hasn't gone to disk (e.g.
123		- * the inode is pinned). This guarantees that
124		- * all changes affecting the inode are permanent
125		- * when we return.
126		- */
127		- if (iip && iip->ili_last_lsn) {
128		- error = _xfs_log_force(mp, iip->ili_last_lsn,
129		- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
130		- } else if (xfs_ipincount(ip) > 0) {
131		- error = _xfs_log_force(mp, (xfs_lsn_t)0,
132		- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
133		- }
134		-
135		- } else {
136		- xfs_trans_t *tp;
137		-
138		- /*
139		- * O_SYNC or O_DSYNC _with_ a size update are handled
140		- * the same way.
141		- *
142		- * If the write was synchronous then we need to make
143		- * sure that the inode modification time is permanent.
144		- * We'll have updated the timestamp above, so here
145		- * we use a synchronous transaction to log the inode.
146		- * It's not fast, but it's necessary.
147		- *
148		- * If this a dsync write and the size got changed
149		- * non-transactionally, then we need to ensure that
150		- * the size change gets logged in a synchronous
151		- * transaction.
152		- */
153		- tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
154		- if ((error = xfs_trans_reserve(tp, 0,
155		- XFS_SWRITE_LOG_RES(mp),
156		- 0, 0, 0))) {
157		- /* Transaction reserve failed */
158		- xfs_trans_cancel(tp, 0);
159		- } else {
160		- /* Transaction reserve successful */
161		- xfs_ilock(ip, XFS_ILOCK_EXCL);
162		- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
163		- xfs_trans_ihold(tp, ip);
164		- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
165		- xfs_trans_set_sync(tp);
166		- error = xfs_trans_commit(tp, 0);
167		- xfs_iunlock(ip, XFS_ILOCK_EXCL);
168		- }
169		- }
170		-
171		- return error;
172		-}
173		-
174		-/*
175	91	* Force a shutdown of the filesystem instantly while keeping
176	92	* the filesystem consistent. We don't do an unmount here; just shutdown
177	93	* the shop, make sure that absolutely nothing persistent happens to
...	...	@@ -68,7 +68,6 @@
68	68	* Prototypes for functions in xfs_rw.c.
69	69	*/
70	70	extern int xfs_write_clear_setuid(struct xfs_inode *ip);
71		-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
72	71	extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
73	72	extern int xfs_bioerror(struct xfs_buf *bp);
74	73	extern int xfs_bioerror_relse(struct xfs_buf *bp);
...	...	@@ -68,7 +68,7 @@
68	68	#define XFS_TRANS_GROWFS 14
69	69	#define XFS_TRANS_STRAT_WRITE 15
70	70	#define XFS_TRANS_DIOSTRAT 16
71		-#define XFS_TRANS_WRITE_SYNC 17
	71	+/* 17 was XFS_TRANS_WRITE_SYNC */
72	72	#define XFS_TRANS_WRITEID 18
73	73	#define XFS_TRANS_ADDAFORK 19
74	74	#define XFS_TRANS_ATTRINVAL 20
...	...	@@ -611,7 +611,7 @@
611	611	xfs_inode_t *ip)
612	612	{
613	613	xfs_trans_t *tp;
614		- int error;
	614	+ int error = 0;
615	615	int log_flushed = 0, changed = 1;
616	616
617	617	xfs_itrace_entry(ip);
618	618
...	...	@@ -619,14 +619,9 @@
619	619	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
620	620	return XFS_ERROR(EIO);
621	621
622		- /* capture size updates in I/O completion before writing the inode. */
623		- error = xfs_wait_on_pages(ip, 0, -1);
624		- if (error)
625		- return XFS_ERROR(error);
626		-
627	622	/*
628	623	* We always need to make sure that the required inode state is safe on
629		- * disk. The vnode might be clean but we still might need to force the
	624	+ * disk. The inode might be clean but we still might need to force the
630	625	* log because of committed transactions that haven't hit the disk yet.
631	626	* Likewise, there could be unflushed non-transactional changes to the
632	627	* inode core that have to go to disk and this requires us to issue
...	...	@@ -638,7 +633,7 @@
638	633	*/
639	634	xfs_ilock(ip, XFS_ILOCK_SHARED);
640	635
641		- if (!(ip->i_update_size || ip->i_update_core)) {
	636	+ if (!ip->i_update_core) {
642	637	/*
643	638	* Timestamps/size haven't changed since last inode flush or
644	639	* inode transaction commit. That means either nothing got