Commit be4f1ac828776bbc7868a68b465cd8eedb733cfd

Authored by Christoph Hellwig
Committed by Ben Myers
1 parent 0b8fd3033c

xfs: log all dirty inodes in xfs_fs_sync_fs

Since Linux 2.6.36 the writeback code has introduced various measures
for livelock prevention during sync().  Unfortunately some of these are
actively harmful to the XFS model, where the inode gets marked dirty
for metadata from the data I/O completion handler.
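
To illustrate what "marked dirty for metadata from the data I/O
completion handler" means, here is a minimal user-space sketch (not
kernel code; struct toy_inode, toy_ioend_completion and the other names
are invented for illustration): the completion callback, not the write
submission, is what learns the new on-disk size and flags the inode
metadata-dirty.  In XFS the real work runs from the I/O completion
workqueues, i.e. asynchronously and possibly late.

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-in for an in-core inode; fields are invented. */
    struct toy_inode {
    	long long disk_size;	/* size recorded in on-disk metadata */
    	bool metadata_dirty;	/* loosely models i_update_core */
    };

    /*
     * Toy data-I/O completion handler: only once the data has hit the
     * device do we know the new EOF, so only here does the inode
     * become metadata-dirty.
     */
    static void toy_ioend_completion(struct toy_inode *ip, long long new_size)
    {
    	if (new_size > ip->disk_size) {
    		ip->disk_size = new_size;
    		ip->metadata_dirty = true;	/* dirtied after the data I/O */
    	}
    }

    int main(void)
    {
    	struct toy_inode ip = { .disk_size = 4096, .metadata_dirty = false };

    	/* The appending write was submitted earlier; completion arrives now. */
    	toy_ioend_completion(&ip, 8192);
    	printf("metadata_dirty = %d\n", ip.metadata_dirty);
    	return 0;
    }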

The older_than_this checks are now enforced more strictly since

    writeback: avoid livelocking WB_SYNC_ALL writeback

by only calling into __writeback_inodes_sb and thus sampling the
current cut-off time only once.  But on a slow enough device the
previous asynchronous sync pass might not have fully completed yet, and
thus XFS might mark metadata dirty only after the cut-off time for the
blocking pass has already been sampled.  I have not reproduced this
myself on a real system, but it can be reproduced easily by introducing
an artificial delay into the XFS I/O completion workqueues.
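
As a hedged sketch of the race itself, again a self-contained
user-space model with invented names (toy_blocking_sync; dirtied_when
here only loosely models the kernel's per-inode dirty timestamp): the
blocking pass samples its cut-off once, so an inode whose metadata is
dirtied by a slow I/O completion after that sample is simply skipped.

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented toy type; it only models dirtied_when vs. the cut-off. */
    struct toy_inode {
    	unsigned long dirtied_when;	/* "time" the inode was last dirtied */
    	bool metadata_dirty;
    };

    /*
     * Toy version of the blocking (WB_SYNC_ALL) pass: the cut-off is
     * sampled once, and anything dirtied after it is skipped to avoid
     * livelocking on inodes that keep getting redirtied while sync runs.
     */
    static void toy_blocking_sync(struct toy_inode *ip, unsigned long cutoff)
    {
    	if (!ip->metadata_dirty)
    		return;
    	if (ip->dirtied_when <= cutoff) {
    		printf("inode written back\n");
    		ip->metadata_dirty = false;
    	} else {
    		printf("inode skipped: dirtied after the cut-off\n");
    	}
    }

    int main(void)
    {
    	struct toy_inode ip = { .dirtied_when = 0, .metadata_dirty = false };
    	unsigned long cutoff = 100;	/* sampled once for the blocking pass */

    	/*
    	 * A slow I/O completion from the earlier non-blocking pass only
    	 * now marks the inode metadata-dirty, after the cut-off was taken.
    	 */
    	ip.metadata_dirty = true;
    	ip.dirtied_when = cutoff + 1;

    	toy_blocking_sync(&ip, cutoff);	/* prints "inode skipped ..." */
    	return 0;
    }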

Fix this by iterating over all XFS inodes in ->sync_fs and logging all
that are dirty.  This might log inodes that only got redirtied after
the previous pass, but given how cheap delayed logging of inodes is, it
is not a major concern for performance.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>

Showing 3 changed files with 42 additions and 24 deletions

... ... @@ -869,27 +869,6 @@
869 869 }
870 870  
871 871 STATIC int
872   -xfs_log_inode(
873   - struct xfs_inode *ip)
874   -{
875   - struct xfs_mount *mp = ip->i_mount;
876   - struct xfs_trans *tp;
877   - int error;
878   -
879   - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
880   - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
881   - if (error) {
882   - xfs_trans_cancel(tp, 0);
883   - return error;
884   - }
885   -
886   - xfs_ilock(ip, XFS_ILOCK_EXCL);
887   - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
888   - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
889   - return xfs_trans_commit(tp, 0);
890   -}
891   -
892   -STATIC int
893 872 xfs_fs_write_inode(
894 873 struct inode *inode,
895 874 struct writeback_control *wbc)
... ... @@ -902,8 +881,6 @@
902 881  
903 882 if (XFS_FORCED_SHUTDOWN(mp))
904 883 return -XFS_ERROR(EIO);
905   - if (!ip->i_update_core)
906   - return 0;
907 884  
908 885 if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
909 886 /*
910 887  
... ... @@ -913,11 +890,14 @@
913 890 * ->sync_fs call do that for thus, which reduces the number
914 891 * of synchronous log forces dramatically.
915 892 */
916   - error = xfs_log_inode(ip);
  893 + error = xfs_log_dirty_inode(ip, NULL, 0);
917 894 if (error)
918 895 goto out;
919 896 return 0;
920 897 } else {
  898 + if (!ip->i_update_core)
  899 + return 0;
  900 +
921 901 /*
922 902 * We make this non-blocking if the inode is contended, return
923 903 * EAGAIN to indicate to the caller that they did not succeed.
... ... @@ -336,6 +336,32 @@
336 336 return error;
337 337 }
338 338  
  339 +int
  340 +xfs_log_dirty_inode(
  341 + struct xfs_inode *ip,
  342 + struct xfs_perag *pag,
  343 + int flags)
  344 +{
  345 + struct xfs_mount *mp = ip->i_mount;
  346 + struct xfs_trans *tp;
  347 + int error;
  348 +
  349 + if (!ip->i_update_core)
  350 + return 0;
  351 +
  352 + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
  353 + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
  354 + if (error) {
  355 + xfs_trans_cancel(tp, 0);
  356 + return error;
  357 + }
  358 +
  359 + xfs_ilock(ip, XFS_ILOCK_EXCL);
  360 + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
  361 + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  362 + return xfs_trans_commit(tp, 0);
  363 +}
  364 +
339 365 /*
340 366 * When remounting a filesystem read-only or freezing the filesystem, we have
341 367 * two phases to execute. This first phase is syncing the data before we
... ... @@ -358,6 +384,16 @@
358 384 struct xfs_mount *mp)
359 385 {
360 386 int error, error2 = 0;
  387 +
  388 + /*
  389 + * Log all pending size and timestamp updates. The vfs writeback
  390 + * code is supposed to do this, but due to its overagressive
  391 + * livelock detection it will skip inodes where appending writes
  392 + * were written out in the first non-blocking sync phase if their
  393 + * completion took long enough that it happened after taking the
  394 + * timestamp for the cut-off in the blocking phase.
  395 + */
  396 + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
361 397  
362 398 xfs_qm_sync(mp, SYNC_TRYLOCK);
363 399 xfs_qm_sync(mp, SYNC_WAIT);
... ... @@ -34,6 +34,8 @@
34 34  
35 35 void xfs_flush_inodes(struct xfs_inode *ip);
36 36  
  37 +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
  38 +
37 39 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38 40 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39 41 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);