xfs: make log devices with write back caches work

There's no reason not to support cache flushing on external log devices. The only thing this really requires is flushing the data device first both in fsync and log commits. A side effect is that we also have to remove the barrier write test during mount, which has been superfluous since the new FLUSH+FUA code anyway. Also use the chance to flush the RT subvolume write cache before the fsync commit, which is required for correct semantics. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>

xfs: make log devices with write back caches work
There's no reason not to support cache flushing on external log devices. The only thing this really requires is flushing the data device first both in fsync and log commits. A side effect is that we also have to remove the barrier write test during mount, which has been superfluous since the new FLUSH+FUA code anyway. Also use the chance to flush the RT subvolume write cache before the fsync commit, which is required for correct semantics. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
Christoph Hellwig · Alex Elder
1 parent c46a131c0c
Showing 3 changed files with 41 additions and 95 deletions Side-by-side Diff
fs/xfs/linux-2.6/xfs_file.c
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/xfs_log.c
@@ -131,19 +131,34 @@
 {
 	struct inode		*inode = file->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	int			error = 0;
 	int			log_flushed = 0;
  
 	trace_xfs_file_fsync(ip);
  
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+	if (XFS_FORCED_SHUTDOWN(mp))
 		return -XFS_ERROR(EIO);
  
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
  
 	xfs_ioend_wait(ip);
  
+	if (mp->m_flags & XFS_MOUNT_BARRIER) {
+		/*
+		 * If we have an RT and/or log subvolume we need to make sure
+		 * to flush the write cache the device used for file data
+		 * first.  This is to ensure newly written file data make
+		 * it to disk before logging the new inode size in case of
+		 * an extending write.
+		 */
+		if (XFS_IS_REALTIME_INODE(ip))
+			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+		else if (mp->m_logdev_targp != mp->m_ddev_targp)
+			xfs_blkdev_issue_flush(mp->m_ddev_targp);
+	}
+
 	/*
 	 * We always need to make sure that the required inode state is safe on
 	 * disk.  The inode might be clean but we still might need to force the
  
@@ -175,9 +190,9 @@
 		 * updates.  The sync transaction will also force the log.
 		 */
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
 		error = xfs_trans_reserve(tp, 0,
-				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+				XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
 		if (error) {
 			xfs_trans_cancel(tp, 0);
 			return -error;
  
@@ -209,28 +224,25 @@
 		 * force the log.
 		 */
 		if (xfs_ipincount(ip)) {
-			error = _xfs_log_force_lsn(ip->i_mount,
+			error = _xfs_log_force_lsn(mp,
 					ip->i_itemp->ili_last_lsn,
 					XFS_LOG_SYNC, &log_flushed);
 		}
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	}
  
-	if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
-		/*
-		 * If the log write didn't issue an ordered tag we need
-		 * to flush the disk cache for the data device now.
-		 */
-		if (!log_flushed)
-			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
-
-		/*
-		 * If this inode is on the RT dev we need to flush that
-		 * cache as well.
-		 */
-		if (XFS_IS_REALTIME_INODE(ip))
-			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
-	}
+	/*
+	 * If we only have a single device, and the log force about was
+	 * a no-op we might have to flush the data device cache here.
+	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
+	 * an already allocated file and thus do not have any metadata to
+	 * commit.
+	 */
+	if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
+	    mp->m_logdev_targp == mp->m_ddev_targp &&
+	    !XFS_IS_REALTIME_INODE(ip) &&
+	    !log_flushed)
+		xfs_blkdev_issue_flush(mp->m_ddev_targp);
  
 	return -error;
 }
@@ -627,68 +627,6 @@
 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
  
-/*
- * Try to write out the superblock using barriers.
- */
-STATIC int
-xfs_barrier_test(
-	xfs_mount_t	*mp)
-{
-	xfs_buf_t	*sbp = xfs_getsb(mp, 0);
-	int		error;
-
-	XFS_BUF_UNDONE(sbp);
-	XFS_BUF_UNREAD(sbp);
-	XFS_BUF_UNDELAYWRITE(sbp);
-	XFS_BUF_WRITE(sbp);
-	XFS_BUF_UNASYNC(sbp);
-	XFS_BUF_ORDERED(sbp);
-
-	xfsbdstrat(mp, sbp);
-	error = xfs_buf_iowait(sbp);
-
-	/*
-	 * Clear all the flags we set and possible error state in the
-	 * buffer.  We only did the write to try out whether barriers
-	 * worked and shouldn't leave any traces in the superblock
-	 * buffer.
-	 */
-	XFS_BUF_DONE(sbp);
-	XFS_BUF_ERROR(sbp, 0);
-	XFS_BUF_UNORDERED(sbp);
-
-	xfs_buf_relse(sbp);
-	return error;
-}
-
-STATIC void
-xfs_mountfs_check_barriers(xfs_mount_t *mp)
-{
-	int error;
-
-	if (mp->m_logdev_targp != mp->m_ddev_targp) {
-		xfs_notice(mp,
-		  "Disabling barriers, not supported with external log device");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-
-	if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
-		xfs_notice(mp,
-			"Disabling barriers, underlying device is readonly");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-
-	error = xfs_barrier_test(mp);
-	if (error) {
-		xfs_notice(mp,
-			"Disabling barriers, trial barrier write failed");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-}
-
 void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
@@ -1240,14 +1178,6 @@
 		switch (token) {
 		case Opt_barrier:
 			mp->m_flags |= XFS_MOUNT_BARRIER;
-
-			/*
-			 * Test if barriers are actually working if we can,
-			 * else delay this check until the filesystem is
-			 * marked writeable.
-			 */
-			if (!(mp->m_flags & XFS_MOUNT_RDONLY))
-				xfs_mountfs_check_barriers(mp);
 			break;
 		case Opt_nobarrier:
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
@@ -1282,8 +1212,6 @@
 	/* ro -> rw */
 	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
 		mp->m_flags &= ~XFS_MOUNT_RDONLY;
-		if (mp->m_flags & XFS_MOUNT_BARRIER)
-			xfs_mountfs_check_barriers(mp);
  
 		/*
 		 * If this is the first remount to writeable state we
@@ -1464,9 +1392,6 @@
 	error = xfs_setup_devices(mp);
 	if (error)
 		goto out_free_sb;
-
-	if (mp->m_flags & XFS_MOUNT_BARRIER)
-		xfs_mountfs_check_barriers(mp);
  
 	error = xfs_filestream_mount(mp);
 	if (error)
@@ -1372,8 +1372,17 @@
 	XFS_BUF_ASYNC(bp);
 	bp->b_flags |= XBF_LOG_BUFFER;
  
-	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
+		/*
+		 * If we have an external log device, flush the data device
+		 * before flushing the log to make sure all meta data
+		 * written back from the AIL actually made it to disk
+		 * before writing out the new log tail LSN in the log buffer.
+		 */
+		if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+			xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
 		XFS_BUF_ORDERED(bp);
+	}
  
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
...	...	@@ -131,19 +131,34 @@
131	131	{
132	132	struct inode *inode = file->f_mapping->host;
133	133	struct xfs_inode *ip = XFS_I(inode);
	134	+ struct xfs_mount *mp = ip->i_mount;
134	135	struct xfs_trans *tp;
135	136	int error = 0;
136	137	int log_flushed = 0;
137	138
138	139	trace_xfs_file_fsync(ip);
139	140
140		- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
	141	+ if (XFS_FORCED_SHUTDOWN(mp))
141	142	return -XFS_ERROR(EIO);
142	143
143	144	xfs_iflags_clear(ip, XFS_ITRUNCATED);
144	145
145	146	xfs_ioend_wait(ip);
146	147
	148	+ if (mp->m_flags & XFS_MOUNT_BARRIER) {
	149	+ /*
	150	+ * If we have an RT and/or log subvolume we need to make sure
	151	+ * to flush the write cache the device used for file data
	152	+ * first. This is to ensure newly written file data make
	153	+ * it to disk before logging the new inode size in case of
	154	+ * an extending write.
	155	+ */
	156	+ if (XFS_IS_REALTIME_INODE(ip))
	157	+ xfs_blkdev_issue_flush(mp->m_rtdev_targp);
	158	+ else if (mp->m_logdev_targp != mp->m_ddev_targp)
	159	+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
	160	+ }
	161	+
147	162	/*
148	163	* We always need to make sure that the required inode state is safe on
149	164	* disk. The inode might be clean but we still might need to force the
150	165
...	...	@@ -175,9 +190,9 @@
175	190	* updates. The sync transaction will also force the log.
176	191	*/
177	192	xfs_iunlock(ip, XFS_ILOCK_SHARED);
178		- tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
	193	+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
179	194	error = xfs_trans_reserve(tp, 0,
180		- XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
	195	+ XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
181	196	if (error) {
182	197	xfs_trans_cancel(tp, 0);
183	198	return -error;
184	199
...	...	@@ -209,28 +224,25 @@
209	224	* force the log.
210	225	*/
211	226	if (xfs_ipincount(ip)) {
212		- error = _xfs_log_force_lsn(ip->i_mount,
	227	+ error = _xfs_log_force_lsn(mp,
213	228	ip->i_itemp->ili_last_lsn,
214	229	XFS_LOG_SYNC, &log_flushed);
215	230	}
216	231	xfs_iunlock(ip, XFS_ILOCK_SHARED);
217	232	}
218	233
219		- if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
220		- /*
221		- * If the log write didn't issue an ordered tag we need
222		- * to flush the disk cache for the data device now.
223		- */
224		- if (!log_flushed)
225		- xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
226		-
227		- /*
228		- * If this inode is on the RT dev we need to flush that
229		- * cache as well.
230		- */
231		- if (XFS_IS_REALTIME_INODE(ip))
232		- xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
233		- }
	234	+ /*
	235	+ * If we only have a single device, and the log force about was
	236	+ * a no-op we might have to flush the data device cache here.
	237	+ * This can only happen for fdatasync/O_DSYNC if we were overwriting
	238	+ * an already allocated file and thus do not have any metadata to
	239	+ * commit.
	240	+ */
	241	+ if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
	242	+ mp->m_logdev_targp == mp->m_ddev_targp &&
	243	+ !XFS_IS_REALTIME_INODE(ip) &&
	244	+ !log_flushed)
	245	+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
234	246
235	247	return -error;
236	248	}
...	...	@@ -627,68 +627,6 @@
627	627	blkdev_put(bdev, FMODE_READ\|FMODE_WRITE\|FMODE_EXCL);
628	628	}
629	629
630		-/*
631		- * Try to write out the superblock using barriers.
632		- */
633		-STATIC int
634		-xfs_barrier_test(
635		- xfs_mount_t *mp)
636		-{
637		- xfs_buf_t *sbp = xfs_getsb(mp, 0);
638		- int error;
639		-
640		- XFS_BUF_UNDONE(sbp);
641		- XFS_BUF_UNREAD(sbp);
642		- XFS_BUF_UNDELAYWRITE(sbp);
643		- XFS_BUF_WRITE(sbp);
644		- XFS_BUF_UNASYNC(sbp);
645		- XFS_BUF_ORDERED(sbp);
646		-
647		- xfsbdstrat(mp, sbp);
648		- error = xfs_buf_iowait(sbp);
649		-
650		- /*
651		- * Clear all the flags we set and possible error state in the
652		- * buffer. We only did the write to try out whether barriers
653		- * worked and shouldn't leave any traces in the superblock
654		- * buffer.
655		- */
656		- XFS_BUF_DONE(sbp);
657		- XFS_BUF_ERROR(sbp, 0);
658		- XFS_BUF_UNORDERED(sbp);
659		-
660		- xfs_buf_relse(sbp);
661		- return error;
662		-}
663		-
664		-STATIC void
665		-xfs_mountfs_check_barriers(xfs_mount_t *mp)
666		-{
667		- int error;
668		-
669		- if (mp->m_logdev_targp != mp->m_ddev_targp) {
670		- xfs_notice(mp,
671		- "Disabling barriers, not supported with external log device");
672		- mp->m_flags &= ~XFS_MOUNT_BARRIER;
673		- return;
674		- }
675		-
676		- if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
677		- xfs_notice(mp,
678		- "Disabling barriers, underlying device is readonly");
679		- mp->m_flags &= ~XFS_MOUNT_BARRIER;
680		- return;
681		- }
682		-
683		- error = xfs_barrier_test(mp);
684		- if (error) {
685		- xfs_notice(mp,
686		- "Disabling barriers, trial barrier write failed");
687		- mp->m_flags &= ~XFS_MOUNT_BARRIER;
688		- return;
689		- }
690		-}
691		-
692	630	void
693	631	xfs_blkdev_issue_flush(
694	632	xfs_buftarg_t *buftarg)
...	...	@@ -1240,14 +1178,6 @@
1240	1178	switch (token) {
1241	1179	case Opt_barrier:
1242	1180	mp->m_flags \|= XFS_MOUNT_BARRIER;
1243		-
1244		- /*
1245		- * Test if barriers are actually working if we can,
1246		- * else delay this check until the filesystem is
1247		- * marked writeable.
1248		- */
1249		- if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1250		- xfs_mountfs_check_barriers(mp);
1251	1181	break;
1252	1182	case Opt_nobarrier:
1253	1183	mp->m_flags &= ~XFS_MOUNT_BARRIER;
...	...	@@ -1282,8 +1212,6 @@
1282	1212	/* ro -> rw */
1283	1213	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1284	1214	mp->m_flags &= ~XFS_MOUNT_RDONLY;
1285		- if (mp->m_flags & XFS_MOUNT_BARRIER)
1286		- xfs_mountfs_check_barriers(mp);
1287	1215
1288	1216	/*
1289	1217	* If this is the first remount to writeable state we
...	...	@@ -1464,9 +1392,6 @@
1464	1392	error = xfs_setup_devices(mp);
1465	1393	if (error)
1466	1394	goto out_free_sb;
1467		-
1468		- if (mp->m_flags & XFS_MOUNT_BARRIER)
1469		- xfs_mountfs_check_barriers(mp);
1470	1395
1471	1396	error = xfs_filestream_mount(mp);
1472	1397	if (error)
...	...	@@ -1372,8 +1372,17 @@
1372	1372	XFS_BUF_ASYNC(bp);
1373	1373	bp->b_flags \|= XBF_LOG_BUFFER;
1374	1374
1375		- if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
	1375	+ if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
	1376	+ /*
	1377	+ * If we have an external log device, flush the data device
	1378	+ * before flushing the log to make sure all meta data
	1379	+ * written back from the AIL actually made it to disk
	1380	+ * before writing out the new log tail LSN in the log buffer.
	1381	+ */
	1382	+ if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
	1383	+ xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
1376	1384	XFS_BUF_ORDERED(bp);
	1385	+ }
1377	1386
1378	1387	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1379	1388	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);