Commit 9aa05000f2b7cab4be582afba64af10b2d74727e

Authored by Dave Chinner
Committed by Ben Myers
1 parent cf2931db2d

xfs: xfs_sync_data is redundant.

We don't do any data writeback from XFS any more - the VFS is
completely responsible for that, including for freeze. We can
replace the remaining caller with a VFS level function that
achieves the same thing, but without conflicting with current
writeback work.

This means we can remove the flush work and the old xfs_flush_inodes()
implementation - the VFS functionality completely replaces the
internal flush queue that was used to do this writeback work in a
separate context to avoid stack overruns.

This does have one complication - it cannot be called with page
locks held.  Hence move the flushing of delalloc space when ENOSPC
occurs back up into xfs_file_buffered_aio_write(), where we don't
hold any locks that will stall writeback.
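
Condensed from the xfs_file.c hunk below, the write path now does
(a sketch; surrounding setup and error handling elided):

write_retry:
	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
			pos, &iocb->ki_pos, count, 0);
	if (ret == -ENOSPC && !enospc) {
		/* flush delalloc across the whole mount, retry once */
		enospc = 1;
		xfs_flush_inodes(ip->i_mount);
		goto write_retry;
	}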

Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to
trigger delalloc conversion fast enough to prevent spurious ENOSPC
when there are hundreds of writers, thousands of small files and GBs
of free RAM.  Hence we need to use sync_inodes_sb() to block callers
while we wait for writeback, like the previous xfs_flush_inodes
implementation did.

That means we have to hold the s_umount lock here, but because this
call can nest inside i_mutex (the parent directory in the create
case, held by the VFS), we have to use down_read_trylock() to avoid
potential deadlocks. In practice, this trylock will succeed on
almost every attempt as unmount/remount type operations are
exceedingly rare.
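
For reference, the complete new implementation reduces to this
(reproduced from the xfs_super.c hunk below, with comments added):

void
xfs_flush_inodes(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;

	/* trylock: we may be nested inside i_mutex, so never block here */
	if (down_read_trylock(&sb->s_umount)) {
		sync_inodes_sb(sb);	/* waits for IO, throttling callers */
		up_read(&sb->s_umount);
	}
}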

Note: we always need to pass a count of zero to
generic_file_buffered_write() as the previously written byte count.
Before this patch we only did this by accident, by virtue of ret
always being zero when there were no errors.  Make this explicit
rather than having to specifically zero ret in the ENOSPC retry
case.
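
Concretely, only the last argument of the call changes (first hunk
below), from ret to a literal 0:

-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);

Since the retry is now taken unconditionally after xfs_flush_inodes(),
ret holds -ENOSPC on the second pass, so passing 0 explicitly avoids
having to reset ret before the goto.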

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>

Showing 8 changed files with 34 additions and 108 deletions

fs/xfs/xfs_file.c
... ... @@ -728,16 +728,17 @@
728 728 write_retry:
729 729 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
730 730 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
731   - pos, &iocb->ki_pos, count, ret);
  731 + pos, &iocb->ki_pos, count, 0);
  732 +
732 733 /*
733   - * if we just got an ENOSPC, flush the inode now we aren't holding any
734   - * page locks and retry *once*
  734 + * If we just got an ENOSPC, try to write back all dirty inodes to
  735 + * convert delalloc space to free up some of the excess reserved
  736 + * metadata space.
735 737 */
736 738 if (ret == -ENOSPC && !enospc) {
737 739 enospc = 1;
738   - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
739   - if (!ret)
740   - goto write_retry;
  740 + xfs_flush_inodes(ip->i_mount);
  741 + goto write_retry;
741 742 }
742 743  
743 744 current->backing_dev_info = NULL;
fs/xfs/xfs_iomap.c
... ... @@ -373,7 +373,7 @@
373 373 xfs_extlen_t extsz;
374 374 int nimaps;
375 375 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
376   - int prealloc, flushed = 0;
  376 + int prealloc;
377 377 int error;
378 378  
379 379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
... ... @@ -434,26 +434,17 @@
434 434 }
435 435  
436 436 /*
437   - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
438   - * ENOSPC, * flush all other inodes with delalloc blocks to free up
439   - * some of the excess reserved metadata space. For both cases, retry
  437 + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
440 438 * without EOF preallocation.
441 439 */
442 440 if (nimaps == 0) {
443 441 trace_xfs_delalloc_enospc(ip, offset, count);
444   - if (flushed)
445   - return XFS_ERROR(error ? error : ENOSPC);
446   -
447   - if (error == ENOSPC) {
448   - xfs_iunlock(ip, XFS_ILOCK_EXCL);
449   - xfs_flush_inodes(ip);
450   - xfs_ilock(ip, XFS_ILOCK_EXCL);
  442 + if (prealloc) {
  443 + prealloc = 0;
  444 + error = 0;
  445 + goto retry;
451 446 }
452   -
453   - flushed = 1;
454   - error = 0;
455   - prealloc = 0;
456   - goto retry;
  447 + return XFS_ERROR(error ? error : ENOSPC);
457 448 }
458 449  
459 450 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
fs/xfs/xfs_mount.h
... ... @@ -198,7 +198,6 @@
198 198 #endif
199 199 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
200 200 struct delayed_work m_reclaim_work; /* background inode reclaim */
201   - struct work_struct m_flush_work; /* background inode flush */
202 201 __int64_t m_update_flags; /* sb flags we need to update
203 202 on the next remount,rw */
204 203 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
fs/xfs/xfs_super.c
... ... @@ -882,6 +882,24 @@
882 882 destroy_workqueue(mp->m_unwritten_workqueue);
883 883 }
884 884  
  885 +/*
  886 + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
  887 + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
  888 + * for IO to complete so that we effectively throttle multiple callers to the
  889 + * rate at which IO is completing.
  890 + */
  891 +void
  892 +xfs_flush_inodes(
  893 + struct xfs_mount *mp)
  894 +{
  895 + struct super_block *sb = mp->m_super;
  896 +
  897 + if (down_read_trylock(&sb->s_umount)) {
  898 + sync_inodes_sb(sb);
  899 + up_read(&sb->s_umount);
  900 + }
  901 +}
  902 +
885 903 /* Catch misguided souls that try to use this interface on XFS */
886 904 STATIC struct inode *
887 905 xfs_fs_alloc_inode(
... ... @@ -1005,8 +1023,6 @@
1005 1023 {
1006 1024 struct xfs_mount *mp = XFS_M(sb);
1007 1025  
1008   - cancel_work_sync(&mp->m_flush_work);
1009   -
1010 1026 xfs_filestream_unmount(mp);
1011 1027 xfs_unmountfs(mp);
1012 1028  
... ... @@ -1324,7 +1340,6 @@
1324 1340 spin_lock_init(&mp->m_sb_lock);
1325 1341 mutex_init(&mp->m_growlock);
1326 1342 atomic_set(&mp->m_active_trans, 0);
1327   - INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
1328 1343 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1329 1344  
1330 1345 mp->m_super = sb;
fs/xfs/xfs_super.h
... ... @@ -74,6 +74,7 @@
74 74  
75 75 extern __uint64_t xfs_max_file_offset(unsigned int);
76 76  
  77 +extern void xfs_flush_inodes(struct xfs_mount *mp);
77 78 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78 79 extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79 80 extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
fs/xfs/xfs_sync.c
... ... @@ -217,51 +217,6 @@
217 217 }
218 218  
219 219 STATIC int
220   -xfs_sync_inode_data(
221   - struct xfs_inode *ip,
222   - struct xfs_perag *pag,
223   - int flags)
224   -{
225   - struct inode *inode = VFS_I(ip);
226   - struct address_space *mapping = inode->i_mapping;
227   - int error = 0;
228   -
229   - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
230   - return 0;
231   -
232   - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
233   - if (flags & SYNC_TRYLOCK)
234   - return 0;
235   - xfs_ilock(ip, XFS_IOLOCK_SHARED);
236   - }
237   -
238   - error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
239   - 0 : XBF_ASYNC, FI_NONE);
240   - xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241   - return error;
242   -}
243   -
244   -/*
245   - * Write out pagecache data for the whole filesystem.
246   - */
247   -STATIC int
248   -xfs_sync_data(
249   - struct xfs_mount *mp,
250   - int flags)
251   -{
252   - int error;
253   -
254   - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
255   -
256   - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
257   - if (error)
258   - return XFS_ERROR(error);
259   -
260   - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
261   - return 0;
262   -}
263   -
264   -STATIC int
265 220 xfs_sync_fsdata(
266 221 struct xfs_mount *mp)
267 222 {
... ... @@ -413,39 +368,6 @@
413 368  
414 369 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
415 370 xfs_syncd_queue_reclaim(mp);
416   -}
417   -
418   -/*
419   - * Flush delayed allocate data, attempting to free up reserved space
420   - * from existing allocations. At this point a new allocation attempt
421   - * has failed with ENOSPC and we are in the process of scratching our
422   - * heads, looking about for more room.
423   - *
424   - * Queue a new data flush if there isn't one already in progress and
425   - * wait for completion of the flush. This means that we only ever have one
426   - * inode flush in progress no matter how many ENOSPC events are occurring and
427   - * so will prevent the system from bogging down due to every concurrent
428   - * ENOSPC event scanning all the active inodes in the system for writeback.
429   - */
430   -void
431   -xfs_flush_inodes(
432   - struct xfs_inode *ip)
433   -{
434   - struct xfs_mount *mp = ip->i_mount;
435   -
436   - queue_work(xfs_syncd_wq, &mp->m_flush_work);
437   - flush_work(&mp->m_flush_work);
438   -}
439   -
440   -void
441   -xfs_flush_worker(
442   - struct work_struct *work)
443   -{
444   - struct xfs_mount *mp = container_of(work,
445   - struct xfs_mount, m_flush_work);
446   -
447   - xfs_sync_data(mp, SYNC_TRYLOCK);
448   - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
449 371 }
450 372  
451 373 void
fs/xfs/xfs_sync.h
... ... @@ -26,13 +26,10 @@
26 26  
27 27 extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
28 28  
29   -void xfs_flush_worker(struct work_struct *work);
30 29 void xfs_reclaim_worker(struct work_struct *work);
31 30  
32 31 int xfs_quiesce_data(struct xfs_mount *mp);
33 32 void xfs_quiesce_attr(struct xfs_mount *mp);
34   -
35   -void xfs_flush_inodes(struct xfs_inode *ip);
36 33  
37 34 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38 35 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
fs/xfs/xfs_vnodeops.c
... ... @@ -777,7 +777,7 @@
777 777 XFS_TRANS_PERM_LOG_RES, log_count);
778 778 if (error == ENOSPC) {
779 779 /* flush outstanding delalloc blocks and retry */
780   - xfs_flush_inodes(dp);
  780 + xfs_flush_inodes(mp);
781 781 error = xfs_trans_reserve(tp, resblks, log_res, 0,
782 782 XFS_TRANS_PERM_LOG_RES, log_count);
783 783 }