Commit 13e6d5cdde0e785aa943810f08b801cadd0935df

Authored by Christoph Hellwig
Committed by Felix Blyakher
1 parent bd16956599

xfs: merge fsync and O_SYNC handling

The guarantees for O_SYNC are exactly the same as the ones we need to
make for an fsync call (and given that Linux O_SYNC is O_DSYNC the
equivalent is fdatasync, but we treat both the same in XFS), except
with a range data writeout.  Jan Kara has started unifying these two
paths for filesystems using the generic helpers, and I've started to
look at XFS.

The actual transaction committed by xfs_fsync and xfs_write_sync_logforce
has a different transaction number, but actually is exactly the same.
We'll only use the fsync transaction going forward.  One major difference
is that xfs_write_sync_logforce never issues a cache flush unless we
commit a transaction causing that as a side-effect, which is an obvious
bug in the O_SYNC handling.  Second, all the locking and i_update_size
vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934
never made it to xfs_write_sync_logforce, so we add them back.

To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait
call is moved up to xfs_file_fsync, so that we don't wait on the whole
file after we already waited for our portion in xfs_write.

We'll also use a plain call to filemap_write_and_wait_range instead
of the previous sync_page_range which did it in two steps including
a half-hearted inode write out that doesn't help us.

Once we're done with this also remove the now useless i_update_size
tracking.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

Showing 10 changed files with 23 additions and 112 deletions Side-by-side Diff

fs/xfs/linux-2.6/xfs_aops.c
... ... @@ -216,7 +216,6 @@
216 216 if (ip->i_d.di_size < isize) {
217 217 ip->i_d.di_size = isize;
218 218 ip->i_update_core = 1;
219   - ip->i_update_size = 1;
220 219 xfs_mark_inode_dirty_sync(ip);
221 220 }
222 221  
fs/xfs/linux-2.6/xfs_file.c
... ... @@ -172,12 +172,21 @@
172 172 */
173 173 STATIC int
174 174 xfs_file_fsync(
175   - struct file *filp,
176   - struct dentry *dentry,
177   - int datasync)
  175 + struct file *file,
  176 + struct dentry *dentry,
  177 + int datasync)
178 178 {
179   - xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
180   - return -xfs_fsync(XFS_I(dentry->d_inode));
  179 + struct inode *inode = dentry->d_inode;
  180 + struct xfs_inode *ip = XFS_I(inode);
  181 + int error;
  182 +
  183 + /* capture size updates in I/O completion before writing the inode. */
  184 + error = filemap_fdatawait(inode->i_mapping);
  185 + if (error)
  186 + return error;
  187 +
  188 + xfs_iflags_clear(ip, XFS_ITRUNCATED);
  189 + return -xfs_fsync(ip);
181 190 }
182 191  
183 192 STATIC int
fs/xfs/linux-2.6/xfs_lrw.c
... ... @@ -812,18 +812,21 @@
812 812  
813 813 /* Handle various SYNC-type writes */
814 814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
  815 + loff_t end = pos + ret - 1;
815 816 int error2;
816 817  
817 818 xfs_iunlock(xip, iolock);
818 819 if (need_i_mutex)
819 820 mutex_unlock(&inode->i_mutex);
820   - error2 = sync_page_range(inode, mapping, pos, ret);
  821 +
  822 + error2 = filemap_write_and_wait_range(mapping, pos, end);
821 823 if (!error)
822 824 error = error2;
823 825 if (need_i_mutex)
824 826 mutex_lock(&inode->i_mutex);
825 827 xfs_ilock(xip, iolock);
826   - error2 = xfs_write_sync_logforce(mp, xip);
  828 +
  829 + error2 = xfs_fsync(xip);
827 830 if (!error)
828 831 error = error2;
829 832 }
... ... @@ -82,7 +82,6 @@
82 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 83 ip->i_flags = 0;
84 84 ip->i_update_core = 0;
85   - ip->i_update_size = 0;
86 85 ip->i_delayed_blks = 0;
87 86 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
88 87 ip->i_size = 0;
... ... @@ -261,7 +261,6 @@
261 261 /* Miscellaneous state. */
262 262 unsigned short i_flags; /* see defined flags below */
263 263 unsigned char i_update_core; /* timestamps/size is dirty */
264   - unsigned char i_update_size; /* di_size field is dirty */
265 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
266 265  
267 266 xfs_icdinode_t i_d; /* most of ondisk inode */
fs/xfs/xfs_inode_item.c
... ... @@ -263,14 +263,6 @@
263 263 }
264 264  
265 265 /*
266   - * We don't have to worry about re-ordering here because
267   - * the update_size field is protected by the inode lock
268   - * and we have that held in exclusive mode.
269   - */
270   - if (ip->i_update_size)
271   - ip->i_update_size = 0;
272   -
273   - /*
274 266 * Make sure to get the latest atime from the Linux inode.
275 267 */
276 268 xfs_synchronize_atime(ip);
... ... @@ -88,90 +88,6 @@
88 88 }
89 89  
90 90 /*
91   - * Handle logging requirements of various synchronous types of write.
92   - */
93   -int
94   -xfs_write_sync_logforce(
95   - xfs_mount_t *mp,
96   - xfs_inode_t *ip)
97   -{
98   - int error = 0;
99   -
100   - /*
101   - * If we're treating this as O_DSYNC and we have not updated the
102   - * size, force the log.
103   - */
104   - if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
105   - !(ip->i_update_size)) {
106   - xfs_inode_log_item_t *iip = ip->i_itemp;
107   -
108   - /*
109   - * If an allocation transaction occurred
110   - * without extending the size, then we have to force
111   - * the log up the proper point to ensure that the
112   - * allocation is permanent. We can't count on
113   - * the fact that buffered writes lock out direct I/O
114   - * writes - the direct I/O write could have extended
115   - * the size nontransactionally, then finished before
116   - * we started. xfs_write_file will think that the file
117   - * didn't grow but the update isn't safe unless the
118   - * size change is logged.
119   - *
120   - * Force the log if we've committed a transaction
121   - * against the inode or if someone else has and
122   - * the commit record hasn't gone to disk (e.g.
123   - * the inode is pinned). This guarantees that
124   - * all changes affecting the inode are permanent
125   - * when we return.
126   - */
127   - if (iip && iip->ili_last_lsn) {
128   - error = _xfs_log_force(mp, iip->ili_last_lsn,
129   - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
130   - } else if (xfs_ipincount(ip) > 0) {
131   - error = _xfs_log_force(mp, (xfs_lsn_t)0,
132   - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
133   - }
134   -
135   - } else {
136   - xfs_trans_t *tp;
137   -
138   - /*
139   - * O_SYNC or O_DSYNC _with_ a size update are handled
140   - * the same way.
141   - *
142   - * If the write was synchronous then we need to make
143   - * sure that the inode modification time is permanent.
144   - * We'll have updated the timestamp above, so here
145   - * we use a synchronous transaction to log the inode.
146   - * It's not fast, but it's necessary.
147   - *
148   - * If this a dsync write and the size got changed
149   - * non-transactionally, then we need to ensure that
150   - * the size change gets logged in a synchronous
151   - * transaction.
152   - */
153   - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
154   - if ((error = xfs_trans_reserve(tp, 0,
155   - XFS_SWRITE_LOG_RES(mp),
156   - 0, 0, 0))) {
157   - /* Transaction reserve failed */
158   - xfs_trans_cancel(tp, 0);
159   - } else {
160   - /* Transaction reserve successful */
161   - xfs_ilock(ip, XFS_ILOCK_EXCL);
162   - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
163   - xfs_trans_ihold(tp, ip);
164   - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
165   - xfs_trans_set_sync(tp);
166   - error = xfs_trans_commit(tp, 0);
167   - xfs_iunlock(ip, XFS_ILOCK_EXCL);
168   - }
169   - }
170   -
171   - return error;
172   -}
173   -
174   -/*
175 91 * Force a shutdown of the filesystem instantly while keeping
176 92 * the filesystem consistent. We don't do an unmount here; just shutdown
177 93 * the shop, make sure that absolutely nothing persistent happens to
... ... @@ -68,7 +68,6 @@
68 68 * Prototypes for functions in xfs_rw.c.
69 69 */
70 70 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
71   -extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
72 71 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
73 72 extern int xfs_bioerror(struct xfs_buf *bp);
74 73 extern int xfs_bioerror_relse(struct xfs_buf *bp);
... ... @@ -68,7 +68,7 @@
68 68 #define XFS_TRANS_GROWFS 14
69 69 #define XFS_TRANS_STRAT_WRITE 15
70 70 #define XFS_TRANS_DIOSTRAT 16
71   -#define XFS_TRANS_WRITE_SYNC 17
  71 +/* 17 was XFS_TRANS_WRITE_SYNC */
72 72 #define XFS_TRANS_WRITEID 18
73 73 #define XFS_TRANS_ADDAFORK 19
74 74 #define XFS_TRANS_ATTRINVAL 20
fs/xfs/xfs_vnodeops.c
... ... @@ -611,7 +611,7 @@
611 611 xfs_inode_t *ip)
612 612 {
613 613 xfs_trans_t *tp;
614   - int error;
  614 + int error = 0;
615 615 int log_flushed = 0, changed = 1;
616 616  
617 617 xfs_itrace_entry(ip);
618 618  
... ... @@ -619,14 +619,9 @@
619 619 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
620 620 return XFS_ERROR(EIO);
621 621  
622   - /* capture size updates in I/O completion before writing the inode. */
623   - error = xfs_wait_on_pages(ip, 0, -1);
624   - if (error)
625   - return XFS_ERROR(error);
626   -
627 622 /*
628 623 * We always need to make sure that the required inode state is safe on
629   - * disk. The vnode might be clean but we still might need to force the
  624 + * disk. The inode might be clean but we still might need to force the
630 625 * log because of committed transactions that haven't hit the disk yet.
631 626 * Likewise, there could be unflushed non-transactional changes to the
632 627 * inode core that have to go to disk and this requires us to issue
... ... @@ -638,7 +633,7 @@
638 633 */
639 634 xfs_ilock(ip, XFS_ILOCK_SHARED);
640 635  
641   - if (!(ip->i_update_size || ip->i_update_core)) {
  636 + if (!ip->i_update_core) {
642 637 /*
643 638 * Timestamps/size haven't changed since last inode flush or
644 639 * inode transaction commit. That means either nothing got