Commit be4f1ac828776bbc7868a68b465cd8eedb733cfd

Authored by Christoph Hellwig
Committed by Ben Myers
1 parent 0b8fd3033c

xfs: log all dirty inodes in xfs_fs_sync_fs

Since Linux 2.6.36 the writeback code has introduced various measures
for livelock prevention during sync().  Unfortunately some of these are
actively harmful to the XFS model, where the inode gets marked dirty
for metadata from the data I/O completion handler.
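
To illustrate what "marked dirty for metadata from the data I/O
completion handler" means, here is a minimal user-space sketch (not
kernel code; struct toy_inode, toy_ioend_completion and the other names
are invented for illustration): the completion callback, not the write
submission, is what learns the new on-disk size and flags the inode
metadata-dirty.  In XFS the real work runs from the I/O completion
workqueues, i.e. asynchronously and possibly late.

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-in for an in-core inode; fields are invented. */
    struct toy_inode {
    	long long disk_size;	/* size recorded in on-disk metadata */
    	bool metadata_dirty;	/* loosely models i_update_core */
    };

    /*
     * Toy data-I/O completion handler: only once the data has hit the
     * device do we know the new EOF, so only here does the inode
     * become metadata-dirty.
     */
    static void toy_ioend_completion(struct toy_inode *ip, long long new_size)
    {
    	if (new_size > ip->disk_size) {
    		ip->disk_size = new_size;
    		ip->metadata_dirty = true;	/* dirtied after the data I/O */
    	}
    }

    int main(void)
    {
    	struct toy_inode ip = { .disk_size = 4096, .metadata_dirty = false };

    	/* The appending write was submitted earlier; completion arrives now. */
    	toy_ioend_completion(&ip, 8192);
    	printf("metadata_dirty = %d\n", ip.metadata_dirty);
    	return 0;
    }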

The older_than_this checks are now enforced more strictly since

    writeback: avoid livelocking WB_SYNC_ALL writeback

by only calling into __writeback_inodes_sb and thus sampling the
current cut-off time only once.  But on a slow enough device the
previous asynchronous sync pass might not have fully completed yet, and
thus XFS might mark metadata dirty only after the cut-off time for the
blocking pass has already been sampled.  I have not reproduced this
myself on a real system, but it can be reproduced easily by introducing
an artificial delay into the XFS I/O completion workqueues.
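
As a hedged sketch of the race itself, again a self-contained
user-space model with invented names (toy_blocking_sync; dirtied_when
here only loosely models the kernel's per-inode dirty timestamp): the
blocking pass samples its cut-off once, so an inode whose metadata is
dirtied by a slow I/O completion after that sample is simply skipped.

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented toy type; it only models dirtied_when vs. the cut-off. */
    struct toy_inode {
    	unsigned long dirtied_when;	/* "time" the inode was last dirtied */
    	bool metadata_dirty;
    };

    /*
     * Toy version of the blocking (WB_SYNC_ALL) pass: the cut-off is
     * sampled once, and anything dirtied after it is skipped to avoid
     * livelocking on inodes that keep getting redirtied while sync runs.
     */
    static void toy_blocking_sync(struct toy_inode *ip, unsigned long cutoff)
    {
    	if (!ip->metadata_dirty)
    		return;
    	if (ip->dirtied_when <= cutoff) {
    		printf("inode written back\n");
    		ip->metadata_dirty = false;
    	} else {
    		printf("inode skipped: dirtied after the cut-off\n");
    	}
    }

    int main(void)
    {
    	struct toy_inode ip = { .dirtied_when = 0, .metadata_dirty = false };
    	unsigned long cutoff = 100;	/* sampled once for the blocking pass */

    	/*
    	 * A slow I/O completion from the earlier non-blocking pass only
    	 * now marks the inode metadata-dirty, after the cut-off was taken.
    	 */
    	ip.metadata_dirty = true;
    	ip.dirtied_when = cutoff + 1;

    	toy_blocking_sync(&ip, cutoff);	/* prints "inode skipped ..." */
    	return 0;
    }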

Fix this by iterating over all XFS inodes in ->sync_fs and logging all
that are dirty.  This might log inodes that only got redirtied after
the previous pass, but given how cheap delayed logging of inodes is, it
is not a major concern for performance.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>

Showing 3 changed files with 42 additions and 24 deletions

... ... @@ -869,27 +869,6 @@
869 869 }
870 870  
871 871 STATIC int
872   -xfs_log_inode(
873   - struct xfs_inode *ip)
874   -{
875   - struct xfs_mount *mp = ip->i_mount;
876   - struct xfs_trans *tp;
877   - int error;
878   -
879   - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
880   - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
881   - if (error) {
882   - xfs_trans_cancel(tp, 0);
883   - return error;
884   - }
885   -
886   - xfs_ilock(ip, XFS_ILOCK_EXCL);
887   - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
888   - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
889   - return xfs_trans_commit(tp, 0);
890   -}
891   -
892   -STATIC int
893 872 xfs_fs_write_inode(
894 873 struct inode *inode,
895 874 struct writeback_control *wbc)
... ... @@ -902,8 +881,6 @@
902 881  
903 882 if (XFS_FORCED_SHUTDOWN(mp))
904 883 return -XFS_ERROR(EIO);
905   - if (!ip->i_update_core)
906   - return 0;
907 884  
908 885 if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
909 886 /*
910 887  
... ... @@ -913,11 +890,14 @@
913 890 * ->sync_fs call do that for thus, which reduces the number
914 891 * of synchronous log forces dramatically.
915 892 */
916   - error = xfs_log_inode(ip);
  893 + error = xfs_log_dirty_inode(ip, NULL, 0);
917 894 if (error)
918 895 goto out;
919 896 return 0;
920 897 } else {
  898 + if (!ip->i_update_core)
  899 + return 0;
  900 +
921 901 /*
922 902 * We make this non-blocking if the inode is contended, return
923 903 * EAGAIN to indicate to the caller that they did not succeed.
... ... @@ -336,6 +336,32 @@
336 336 return error;
337 337 }
338 338  
  339 +int
  340 +xfs_log_dirty_inode(
  341 + struct xfs_inode *ip,
  342 + struct xfs_perag *pag,
  343 + int flags)
  344 +{
  345 + struct xfs_mount *mp = ip->i_mount;
  346 + struct xfs_trans *tp;
  347 + int error;
  348 +
  349 + if (!ip->i_update_core)
  350 + return 0;
  351 +
  352 + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
  353 + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
  354 + if (error) {
  355 + xfs_trans_cancel(tp, 0);
  356 + return error;
  357 + }
  358 +
  359 + xfs_ilock(ip, XFS_ILOCK_EXCL);
  360 + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
  361 + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  362 + return xfs_trans_commit(tp, 0);
  363 +}
  364 +
339 365 /*
340 366 * When remounting a filesystem read-only or freezing the filesystem, we have
341 367 * two phases to execute. This first phase is syncing the data before we
... ... @@ -358,6 +384,16 @@
358 384 struct xfs_mount *mp)
359 385 {
360 386 int error, error2 = 0;
  387 +
  388 + /*
  389 + * Log all pending size and timestamp updates. The vfs writeback
  390 + * code is supposed to do this, but due to its overagressive
  391 + * livelock detection it will skip inodes where appending writes
  392 + * were written out in the first non-blocking sync phase if their
  393 + * completion took long enough that it happened after taking the
  394 + * timestamp for the cut-off in the blocking phase.
  395 + */
  396 + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
361 397  
362 398 xfs_qm_sync(mp, SYNC_TRYLOCK);
363 399 xfs_qm_sync(mp, SYNC_WAIT);
... ... @@ -34,6 +34,8 @@
34 34  
35 35 void xfs_flush_inodes(struct xfs_inode *ip);
36 36  
  37 +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
  38 +
37 39 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38 40 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39 41 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);