Commit 13e6d5cdde0e785aa943810f08b801cadd0935df
Committed by
Felix Blyakher
1 parent
bd16956599
Exists in
master
and in
7 other branches
xfs: merge fsync and O_SYNC handling
The guarantees for O_SYNC are exactly the same as the ones we need to make for an fsync call (and given that Linux O_SYNC is O_DSYNC, the equivalent is fdatasync, but we treat both the same in XFS), except with a range data writeout. Jan Kara has started unifying these two paths for filesystems using the generic helpers, and I've started to look at XFS. The actual transaction committed by xfs_fsync and xfs_write_sync_logforce has a different transaction number, but actually is exactly the same. We'll only use the fsync transaction going forward. One major difference is that xfs_write_sync_logforce never issues a cache flush unless we commit a transaction causing that as a side-effect, which is an obvious bug in the O_SYNC handling. Second, all the locking and i_update_size vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934 never made it to xfs_write_sync_logforce, so we add them back. To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait call is moved up to xfs_file_fsync, so that we don't wait on the whole file after we already waited for our portion in xfs_write. We'll also use a plain call to filemap_write_and_wait_range instead of the previous sync_page_range, which did it in two steps including a half-hearted inode write out that doesn't help us. Once we're done with this, also remove the now useless i_update_size tracking. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Felix Blyakher <felixb@sgi.com> Signed-off-by: Felix Blyakher <felixb@sgi.com>
Showing 10 changed files with 23 additions and 112 deletions Side-by-side Diff
fs/xfs/linux-2.6/xfs_aops.c
fs/xfs/linux-2.6/xfs_file.c
... | ... | @@ -172,12 +172,21 @@ |
172 | 172 | */ |
173 | 173 | STATIC int |
174 | 174 | xfs_file_fsync( |
175 | - struct file *filp, | |
176 | - struct dentry *dentry, | |
177 | - int datasync) | |
175 | + struct file *file, | |
176 | + struct dentry *dentry, | |
177 | + int datasync) | |
178 | 178 | { |
179 | - xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); | |
180 | - return -xfs_fsync(XFS_I(dentry->d_inode)); | |
179 | + struct inode *inode = dentry->d_inode; | |
180 | + struct xfs_inode *ip = XFS_I(inode); | |
181 | + int error; | |
182 | + | |
183 | + /* capture size updates in I/O completion before writing the inode. */ | |
184 | + error = filemap_fdatawait(inode->i_mapping); | |
185 | + if (error) | |
186 | + return error; | |
187 | + | |
188 | + xfs_iflags_clear(ip, XFS_ITRUNCATED); | |
189 | + return -xfs_fsync(ip); | |
181 | 190 | } |
182 | 191 | |
183 | 192 | STATIC int |
fs/xfs/linux-2.6/xfs_lrw.c
... | ... | @@ -812,18 +812,21 @@ |
812 | 812 | |
813 | 813 | /* Handle various SYNC-type writes */ |
814 | 814 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { |
815 | + loff_t end = pos + ret - 1; | |
815 | 816 | int error2; |
816 | 817 | |
817 | 818 | xfs_iunlock(xip, iolock); |
818 | 819 | if (need_i_mutex) |
819 | 820 | mutex_unlock(&inode->i_mutex); |
820 | - error2 = sync_page_range(inode, mapping, pos, ret); | |
821 | + | |
822 | + error2 = filemap_write_and_wait_range(mapping, pos, end); | |
821 | 823 | if (!error) |
822 | 824 | error = error2; |
823 | 825 | if (need_i_mutex) |
824 | 826 | mutex_lock(&inode->i_mutex); |
825 | 827 | xfs_ilock(xip, iolock); |
826 | - error2 = xfs_write_sync_logforce(mp, xip); | |
828 | + | |
829 | + error2 = xfs_fsync(xip); | |
827 | 830 | if (!error) |
828 | 831 | error = error2; |
829 | 832 | } |
fs/xfs/xfs_iget.c
fs/xfs/xfs_inode.h
... | ... | @@ -261,7 +261,6 @@ |
261 | 261 | /* Miscellaneous state. */ |
262 | 262 | unsigned short i_flags; /* see defined flags below */ |
263 | 263 | unsigned char i_update_core; /* timestamps/size is dirty */ |
264 | - unsigned char i_update_size; /* di_size field is dirty */ | |
265 | 264 | unsigned int i_delayed_blks; /* count of delay alloc blks */ |
266 | 265 | |
267 | 266 | xfs_icdinode_t i_d; /* most of ondisk inode */ |
fs/xfs/xfs_inode_item.c
... | ... | @@ -263,14 +263,6 @@ |
263 | 263 | } |
264 | 264 | |
265 | 265 | /* |
266 | - * We don't have to worry about re-ordering here because | |
267 | - * the update_size field is protected by the inode lock | |
268 | - * and we have that held in exclusive mode. | |
269 | - */ | |
270 | - if (ip->i_update_size) | |
271 | - ip->i_update_size = 0; | |
272 | - | |
273 | - /* | |
274 | 266 | * Make sure to get the latest atime from the Linux inode. |
275 | 267 | */ |
276 | 268 | xfs_synchronize_atime(ip); |
fs/xfs/xfs_rw.c
... | ... | @@ -88,90 +88,6 @@ |
88 | 88 | } |
89 | 89 | |
90 | 90 | /* |
91 | - * Handle logging requirements of various synchronous types of write. | |
92 | - */ | |
93 | -int | |
94 | -xfs_write_sync_logforce( | |
95 | - xfs_mount_t *mp, | |
96 | - xfs_inode_t *ip) | |
97 | -{ | |
98 | - int error = 0; | |
99 | - | |
100 | - /* | |
101 | - * If we're treating this as O_DSYNC and we have not updated the | |
102 | - * size, force the log. | |
103 | - */ | |
104 | - if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && | |
105 | - !(ip->i_update_size)) { | |
106 | - xfs_inode_log_item_t *iip = ip->i_itemp; | |
107 | - | |
108 | - /* | |
109 | - * If an allocation transaction occurred | |
110 | - * without extending the size, then we have to force | |
111 | - * the log up the proper point to ensure that the | |
112 | - * allocation is permanent. We can't count on | |
113 | - * the fact that buffered writes lock out direct I/O | |
114 | - * writes - the direct I/O write could have extended | |
115 | - * the size nontransactionally, then finished before | |
116 | - * we started. xfs_write_file will think that the file | |
117 | - * didn't grow but the update isn't safe unless the | |
118 | - * size change is logged. | |
119 | - * | |
120 | - * Force the log if we've committed a transaction | |
121 | - * against the inode or if someone else has and | |
122 | - * the commit record hasn't gone to disk (e.g. | |
123 | - * the inode is pinned). This guarantees that | |
124 | - * all changes affecting the inode are permanent | |
125 | - * when we return. | |
126 | - */ | |
127 | - if (iip && iip->ili_last_lsn) { | |
128 | - error = _xfs_log_force(mp, iip->ili_last_lsn, | |
129 | - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | |
130 | - } else if (xfs_ipincount(ip) > 0) { | |
131 | - error = _xfs_log_force(mp, (xfs_lsn_t)0, | |
132 | - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | |
133 | - } | |
134 | - | |
135 | - } else { | |
136 | - xfs_trans_t *tp; | |
137 | - | |
138 | - /* | |
139 | - * O_SYNC or O_DSYNC _with_ a size update are handled | |
140 | - * the same way. | |
141 | - * | |
142 | - * If the write was synchronous then we need to make | |
143 | - * sure that the inode modification time is permanent. | |
144 | - * We'll have updated the timestamp above, so here | |
145 | - * we use a synchronous transaction to log the inode. | |
146 | - * It's not fast, but it's necessary. | |
147 | - * | |
148 | - * If this a dsync write and the size got changed | |
149 | - * non-transactionally, then we need to ensure that | |
150 | - * the size change gets logged in a synchronous | |
151 | - * transaction. | |
152 | - */ | |
153 | - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); | |
154 | - if ((error = xfs_trans_reserve(tp, 0, | |
155 | - XFS_SWRITE_LOG_RES(mp), | |
156 | - 0, 0, 0))) { | |
157 | - /* Transaction reserve failed */ | |
158 | - xfs_trans_cancel(tp, 0); | |
159 | - } else { | |
160 | - /* Transaction reserve successful */ | |
161 | - xfs_ilock(ip, XFS_ILOCK_EXCL); | |
162 | - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | |
163 | - xfs_trans_ihold(tp, ip); | |
164 | - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | |
165 | - xfs_trans_set_sync(tp); | |
166 | - error = xfs_trans_commit(tp, 0); | |
167 | - xfs_iunlock(ip, XFS_ILOCK_EXCL); | |
168 | - } | |
169 | - } | |
170 | - | |
171 | - return error; | |
172 | -} | |
173 | - | |
174 | -/* | |
175 | 91 | * Force a shutdown of the filesystem instantly while keeping |
176 | 92 | * the filesystem consistent. We don't do an unmount here; just shutdown |
177 | 93 | * the shop, make sure that absolutely nothing persistent happens to |
fs/xfs/xfs_rw.h
... | ... | @@ -68,7 +68,6 @@ |
68 | 68 | * Prototypes for functions in xfs_rw.c. |
69 | 69 | */ |
70 | 70 | extern int xfs_write_clear_setuid(struct xfs_inode *ip); |
71 | -extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip); | |
72 | 71 | extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); |
73 | 72 | extern int xfs_bioerror(struct xfs_buf *bp); |
74 | 73 | extern int xfs_bioerror_relse(struct xfs_buf *bp); |
fs/xfs/xfs_trans.h
... | ... | @@ -68,7 +68,7 @@ |
68 | 68 | #define XFS_TRANS_GROWFS 14 |
69 | 69 | #define XFS_TRANS_STRAT_WRITE 15 |
70 | 70 | #define XFS_TRANS_DIOSTRAT 16 |
71 | -#define XFS_TRANS_WRITE_SYNC 17 | |
71 | +/* 17 was XFS_TRANS_WRITE_SYNC */ | |
72 | 72 | #define XFS_TRANS_WRITEID 18 |
73 | 73 | #define XFS_TRANS_ADDAFORK 19 |
74 | 74 | #define XFS_TRANS_ATTRINVAL 20 |
fs/xfs/xfs_vnodeops.c
... | ... | @@ -611,7 +611,7 @@ |
611 | 611 | xfs_inode_t *ip) |
612 | 612 | { |
613 | 613 | xfs_trans_t *tp; |
614 | - int error; | |
614 | + int error = 0; | |
615 | 615 | int log_flushed = 0, changed = 1; |
616 | 616 | |
617 | 617 | xfs_itrace_entry(ip); |
618 | 618 | |
... | ... | @@ -619,14 +619,9 @@ |
619 | 619 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
620 | 620 | return XFS_ERROR(EIO); |
621 | 621 | |
622 | - /* capture size updates in I/O completion before writing the inode. */ | |
623 | - error = xfs_wait_on_pages(ip, 0, -1); | |
624 | - if (error) | |
625 | - return XFS_ERROR(error); | |
626 | - | |
627 | 622 | /* |
628 | 623 | * We always need to make sure that the required inode state is safe on |
629 | - * disk. The vnode might be clean but we still might need to force the | |
624 | + * disk. The inode might be clean but we still might need to force the | |
630 | 625 | * log because of committed transactions that haven't hit the disk yet. |
631 | 626 | * Likewise, there could be unflushed non-transactional changes to the |
632 | 627 | * inode core that have to go to disk and this requires us to issue |
... | ... | @@ -638,7 +633,7 @@ |
638 | 633 | */ |
639 | 634 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
640 | 635 | |
641 | - if (!(ip->i_update_size || ip->i_update_core)) { | |
636 | + if (!ip->i_update_core) { | |
642 | 637 | /* |
643 | 638 | * Timestamps/size haven't changed since last inode flush or |
644 | 639 | * inode transaction commit. That means either nothing got |