Commit 9aa05000f2b7cab4be582afba64af10b2d74727e

Authored by Dave Chinner
Committed by Ben Myers
1 parent cf2931db2d

xfs: xfs_sync_data is redundant.

We don't do any data writeback from XFS any more - the VFS is
completely responsible for that, including for freeze. We can
replace the remaining caller with a VFS level function that
achieves the same thing, but without conflicting with current
writeback work.

This means we can remove the flush work and the old xfs_flush_inodes()
implementation - the VFS functionality completely replaces the
internal flush queue that was used to do this writeback work in a
separate context to avoid stack overruns.

This does have one complication - it cannot be called with page
locks held.  Hence move the flushing of delalloc space when ENOSPC
occurs back up into xfs_file_buffered_aio_write(), where we don't
hold any locks that will stall writeback.
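
Condensed from the xfs_file.c hunk below, the write path now does
(a sketch; surrounding setup and error handling elided):

write_retry:
	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
			pos, &iocb->ki_pos, count, 0);
	if (ret == -ENOSPC && !enospc) {
		/* flush delalloc across the whole mount, retry once */
		enospc = 1;
		xfs_flush_inodes(ip->i_mount);
		goto write_retry;
	}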

Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to
trigger delalloc conversion fast enough to prevent spurious ENOSPC
when there are hundreds of writers, thousands of small files and GBs
of free RAM.  Hence we need to use sync_inodes_sb() to block callers
while we wait for writeback, like the previous xfs_flush_inodes
implementation did.

That means we have to hold the s_umount lock here, but because this
call can nest inside i_mutex (the parent directory in the create
case, held by the VFS), we have to use down_read_trylock() to avoid
potential deadlocks. In practice, this trylock will succeed on
almost every attempt as unmount/remount type operations are
exceedingly rare.
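
For reference, the complete new implementation reduces to this
(reproduced from the xfs_super.c hunk below, with comments added):

void
xfs_flush_inodes(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;

	/* trylock: we may be nested inside i_mutex, so never block here */
	if (down_read_trylock(&sb->s_umount)) {
		sync_inodes_sb(sb);	/* waits for IO, throttling callers */
		up_read(&sb->s_umount);
	}
}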

Note: we always need to pass a count of zero to
generic_file_buffered_write() as the previously written byte count.
Before this patch we only did this by accident, by virtue of ret
always being zero when there were no errors.  Make this explicit
rather than having to specifically zero ret in the ENOSPC retry
case.
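
Concretely, only the last argument of the call changes (first hunk
below), from ret to a literal 0:

-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);

Since the retry is now taken unconditionally after xfs_flush_inodes(),
ret holds -ENOSPC on the second pass, so passing 0 explicitly avoids
having to reset ret before the goto.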

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>

Showing 8 changed files with 34 additions and 108 deletions

fs/xfs/xfs_file.c
... ... @@ -728,16 +728,17 @@
728 728 write_retry:
729 729 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
730 730 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
731   - pos, &iocb->ki_pos, count, ret);
  731 + pos, &iocb->ki_pos, count, 0);
  732 +
732 733 /*
733   - * if we just got an ENOSPC, flush the inode now we aren't holding any
734   - * page locks and retry *once*
  734 + * If we just got an ENOSPC, try to write back all dirty inodes to
  735 + * convert delalloc space to free up some of the excess reserved
  736 + * metadata space.
735 737 */
736 738 if (ret == -ENOSPC && !enospc) {
737 739 enospc = 1;
738   - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
739   - if (!ret)
740   - goto write_retry;
  740 + xfs_flush_inodes(ip->i_mount);
  741 + goto write_retry;
741 742 }
742 743  
743 744 current->backing_dev_info = NULL;
fs/xfs/xfs_iomap.c
... ... @@ -373,7 +373,7 @@
373 373 xfs_extlen_t extsz;
374 374 int nimaps;
375 375 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
376   - int prealloc, flushed = 0;
  376 + int prealloc;
377 377 int error;
378 378  
379 379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
... ... @@ -434,26 +434,17 @@
434 434 }
435 435  
436 436 /*
437   - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
438   - * ENOSPC, * flush all other inodes with delalloc blocks to free up
439   - * some of the excess reserved metadata space. For both cases, retry
  437 + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
440 438 * without EOF preallocation.
441 439 */
442 440 if (nimaps == 0) {
443 441 trace_xfs_delalloc_enospc(ip, offset, count);
444   - if (flushed)
445   - return XFS_ERROR(error ? error : ENOSPC);
446   -
447   - if (error == ENOSPC) {
448   - xfs_iunlock(ip, XFS_ILOCK_EXCL);
449   - xfs_flush_inodes(ip);
450   - xfs_ilock(ip, XFS_ILOCK_EXCL);
  442 + if (prealloc) {
  443 + prealloc = 0;
  444 + error = 0;
  445 + goto retry;
451 446 }
452   -
453   - flushed = 1;
454   - error = 0;
455   - prealloc = 0;
456   - goto retry;
  447 + return XFS_ERROR(error ? error : ENOSPC);
457 448 }
458 449  
459 450 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
fs/xfs/xfs_mount.h
... ... @@ -198,7 +198,6 @@
198 198 #endif
199 199 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
200 200 struct delayed_work m_reclaim_work; /* background inode reclaim */
201   - struct work_struct m_flush_work; /* background inode flush */
202 201 __int64_t m_update_flags; /* sb flags we need to update
203 202 on the next remount,rw */
204 203 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
fs/xfs/xfs_super.c
... ... @@ -882,6 +882,24 @@
882 882 destroy_workqueue(mp->m_unwritten_workqueue);
883 883 }
884 884  
  885 +/*
  886 + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
  887 + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
  888 + * for IO to complete so that we effectively throttle multiple callers to the
  889 + * rate at which IO is completing.
  890 + */
  891 +void
  892 +xfs_flush_inodes(
  893 + struct xfs_mount *mp)
  894 +{
  895 + struct super_block *sb = mp->m_super;
  896 +
  897 + if (down_read_trylock(&sb->s_umount)) {
  898 + sync_inodes_sb(sb);
  899 + up_read(&sb->s_umount);
  900 + }
  901 +}
  902 +
885 903 /* Catch misguided souls that try to use this interface on XFS */
886 904 STATIC struct inode *
887 905 xfs_fs_alloc_inode(
... ... @@ -1005,8 +1023,6 @@
1005 1023 {
1006 1024 struct xfs_mount *mp = XFS_M(sb);
1007 1025  
1008   - cancel_work_sync(&mp->m_flush_work);
1009   -
1010 1026 xfs_filestream_unmount(mp);
1011 1027 xfs_unmountfs(mp);
1012 1028  
... ... @@ -1324,7 +1340,6 @@
1324 1340 spin_lock_init(&mp->m_sb_lock);
1325 1341 mutex_init(&mp->m_growlock);
1326 1342 atomic_set(&mp->m_active_trans, 0);
1327   - INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
1328 1343 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1329 1344  
1330 1345 mp->m_super = sb;
fs/xfs/xfs_super.h
... ... @@ -74,6 +74,7 @@
74 74  
75 75 extern __uint64_t xfs_max_file_offset(unsigned int);
76 76  
  77 +extern void xfs_flush_inodes(struct xfs_mount *mp);
77 78 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78 79 extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79 80 extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
fs/xfs/xfs_sync.c
... ... @@ -217,51 +217,6 @@
217 217 }
218 218  
219 219 STATIC int
220   -xfs_sync_inode_data(
221   - struct xfs_inode *ip,
222   - struct xfs_perag *pag,
223   - int flags)
224   -{
225   - struct inode *inode = VFS_I(ip);
226   - struct address_space *mapping = inode->i_mapping;
227   - int error = 0;
228   -
229   - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
230   - return 0;
231   -
232   - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
233   - if (flags & SYNC_TRYLOCK)
234   - return 0;
235   - xfs_ilock(ip, XFS_IOLOCK_SHARED);
236   - }
237   -
238   - error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
239   - 0 : XBF_ASYNC, FI_NONE);
240   - xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241   - return error;
242   -}
243   -
244   -/*
245   - * Write out pagecache data for the whole filesystem.
246   - */
247   -STATIC int
248   -xfs_sync_data(
249   - struct xfs_mount *mp,
250   - int flags)
251   -{
252   - int error;
253   -
254   - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
255   -
256   - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
257   - if (error)
258   - return XFS_ERROR(error);
259   -
260   - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
261   - return 0;
262   -}
263   -
264   -STATIC int
265 220 xfs_sync_fsdata(
266 221 struct xfs_mount *mp)
267 222 {
... ... @@ -413,39 +368,6 @@
413 368  
414 369 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
415 370 xfs_syncd_queue_reclaim(mp);
416   -}
417   -
418   -/*
419   - * Flush delayed allocate data, attempting to free up reserved space
420   - * from existing allocations. At this point a new allocation attempt
421   - * has failed with ENOSPC and we are in the process of scratching our
422   - * heads, looking about for more room.
423   - *
424   - * Queue a new data flush if there isn't one already in progress and
425   - * wait for completion of the flush. This means that we only ever have one
426   - * inode flush in progress no matter how many ENOSPC events are occurring and
427   - * so will prevent the system from bogging down due to every concurrent
428   - * ENOSPC event scanning all the active inodes in the system for writeback.
429   - */
430   -void
431   -xfs_flush_inodes(
432   - struct xfs_inode *ip)
433   -{
434   - struct xfs_mount *mp = ip->i_mount;
435   -
436   - queue_work(xfs_syncd_wq, &mp->m_flush_work);
437   - flush_work(&mp->m_flush_work);
438   -}
439   -
440   -void
441   -xfs_flush_worker(
442   - struct work_struct *work)
443   -{
444   - struct xfs_mount *mp = container_of(work,
445   - struct xfs_mount, m_flush_work);
446   -
447   - xfs_sync_data(mp, SYNC_TRYLOCK);
448   - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
449 371 }
450 372  
451 373 void
fs/xfs/xfs_sync.h
... ... @@ -26,13 +26,10 @@
26 26  
27 27 extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
28 28  
29   -void xfs_flush_worker(struct work_struct *work);
30 29 void xfs_reclaim_worker(struct work_struct *work);
31 30  
32 31 int xfs_quiesce_data(struct xfs_mount *mp);
33 32 void xfs_quiesce_attr(struct xfs_mount *mp);
34   -
35   -void xfs_flush_inodes(struct xfs_inode *ip);
36 33  
37 34 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38 35 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
fs/xfs/xfs_vnodeops.c
... ... @@ -777,7 +777,7 @@
777 777 XFS_TRANS_PERM_LOG_RES, log_count);
778 778 if (error == ENOSPC) {
779 779 /* flush outstanding delalloc blocks and retry */
780   - xfs_flush_inodes(dp);
  780 + xfs_flush_inodes(mp);
781 781 error = xfs_trans_reserve(tp, resblks, log_res, 0,
782 782 XFS_TRANS_PERM_LOG_RES, log_count);
783 783 }