Commit bc990f5cb424cdca9dda866785d088e2c2110ecc
Committed by
Felix Blyakher
1 parent
894ef820b1
Exists in
master
and in
20 other branches
xfs: fix locking in xfs_iget_cache_hit
The locking in xfs_iget_cache_hit currently has numerous problems: - we clear the reclaim tag without i_flags_lock which protects modifications to it - we call inode_init_always which can sleep with pag_ici_lock held (this is oss.sgi.com BZ #819) - we acquire and drop i_flags_lock a lot and thus provide no consistency between the various flags we set/clear under it This patch fixes all that with a major revamp of the locking in the function. The new version acquires i_flags_lock early and only drops it once we need to call into inode_init_always or before calling xfs_ilock. This patch fixes a bug seen in the wild where we race modifying the reclaim tag. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Felix Blyakher <felixb@sgi.com> Reviewed-by: Eric Sandeen <sandeen@sandeen.net> Signed-off-by: Felix Blyakher <felixb@sgi.com>
Showing 3 changed files with 70 additions and 57 deletions Side-by-side Diff
fs/xfs/linux-2.6/xfs_sync.c
... | ... | @@ -708,6 +708,16 @@ |
708 | 708 | return 0; |
709 | 709 | } |
710 | 710 | |
711 | +void | |
712 | +__xfs_inode_set_reclaim_tag( | |
713 | + struct xfs_perag *pag, | |
714 | + struct xfs_inode *ip) | |
715 | +{ | |
716 | + radix_tree_tag_set(&pag->pag_ici_root, | |
717 | + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), | |
718 | + XFS_ICI_RECLAIM_TAG); | |
719 | +} | |
720 | + | |
711 | 721 | /* |
712 | 722 | * We set the inode flag atomically with the radix tree tag. |
713 | 723 | * Once we get tag lookups on the radix tree, this inode flag |
... | ... | @@ -722,8 +732,7 @@ |
722 | 732 | |
723 | 733 | read_lock(&pag->pag_ici_lock); |
724 | 734 | spin_lock(&ip->i_flags_lock); |
725 | - radix_tree_tag_set(&pag->pag_ici_root, | |
726 | - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | |
735 | + __xfs_inode_set_reclaim_tag(pag, ip); | |
727 | 736 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); |
728 | 737 | spin_unlock(&ip->i_flags_lock); |
729 | 738 | read_unlock(&pag->pag_ici_lock); |
fs/xfs/linux-2.6/xfs_sync.h
... | ... | @@ -48,6 +48,7 @@ |
48 | 48 | int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); |
49 | 49 | |
50 | 50 | void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); |
51 | +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); | |
51 | 52 | void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); |
52 | 53 | void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, |
53 | 54 | struct xfs_inode *ip); |
fs/xfs/xfs_iget.c
... | ... | @@ -191,80 +191,82 @@ |
191 | 191 | int flags, |
192 | 192 | int lock_flags) __releases(pag->pag_ici_lock) |
193 | 193 | { |
194 | + struct inode *inode = VFS_I(ip); | |
194 | 195 | struct xfs_mount *mp = ip->i_mount; |
195 | - int error = EAGAIN; | |
196 | + int error; | |
196 | 197 | |
198 | + spin_lock(&ip->i_flags_lock); | |
199 | + | |
197 | 200 | /* |
198 | - * If INEW is set this inode is being set up | |
199 | - * If IRECLAIM is set this inode is being torn down | |
200 | - * Pause and try again. | |
201 | + * If we are racing with another cache hit that is currently | |
202 | + * instantiating this inode or currently recycling it out of | |
203 | + * reclaimabe state, wait for the initialisation to complete | |
204 | + * before continuing. | |
205 | + * | |
206 | + * XXX(hch): eventually we should do something equivalent to | |
207 | + * wait_on_inode to wait for these flags to be cleared | |
208 | + * instead of polling for it. | |
201 | 209 | */ |
202 | - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { | |
210 | + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { | |
203 | 211 | XFS_STATS_INC(xs_ig_frecycle); |
212 | + error = EAGAIN; | |
204 | 213 | goto out_error; |
205 | 214 | } |
206 | 215 | |
207 | - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ | |
208 | - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { | |
216 | + /* | |
217 | + * If lookup is racing with unlink return an error immediately. | |
218 | + */ | |
219 | + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { | |
220 | + error = ENOENT; | |
221 | + goto out_error; | |
222 | + } | |
209 | 223 | |
210 | - /* | |
211 | - * If lookup is racing with unlink, then we should return an | |
212 | - * error immediately so we don't remove it from the reclaim | |
213 | - * list and potentially leak the inode. | |
214 | - */ | |
215 | - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { | |
216 | - error = ENOENT; | |
217 | - goto out_error; | |
218 | - } | |
219 | - | |
224 | + /* | |
225 | + * If IRECLAIMABLE is set, we've torn down the VFS inode already. | |
226 | + * Need to carefully get it back into useable state. | |
227 | + */ | |
228 | + if (ip->i_flags & XFS_IRECLAIMABLE) { | |
220 | 229 | xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); |
221 | 230 | |
222 | 231 | /* |
223 | - * We need to re-initialise the VFS inode as it has been | |
224 | - * 'freed' by the VFS. Do this here so we can deal with | |
225 | - * errors cleanly, then tag it so it can be set up correctly | |
226 | - * later. | |
232 | + * We need to set XFS_INEW atomically with clearing the | |
233 | + * reclaimable tag so that we do have an indicator of the | |
234 | + * inode still being initialized. | |
227 | 235 | */ |
228 | - if (inode_init_always(mp->m_super, VFS_I(ip))) { | |
229 | - error = ENOMEM; | |
230 | - goto out_error; | |
231 | - } | |
236 | + ip->i_flags |= XFS_INEW; | |
237 | + ip->i_flags &= ~XFS_IRECLAIMABLE; | |
238 | + __xfs_inode_clear_reclaim_tag(mp, pag, ip); | |
232 | 239 | |
233 | - /* | |
234 | - * We must set the XFS_INEW flag before clearing the | |
235 | - * XFS_IRECLAIMABLE flag so that if a racing lookup does | |
236 | - * not find the XFS_IRECLAIMABLE above but has the igrab() | |
237 | - * below succeed we can safely check XFS_INEW to detect | |
238 | - * that this inode is still being initialised. | |
239 | - */ | |
240 | - xfs_iflags_set(ip, XFS_INEW); | |
241 | - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); | |
240 | + spin_unlock(&ip->i_flags_lock); | |
241 | + read_unlock(&pag->pag_ici_lock); | |
242 | 242 | |
243 | - /* clear the radix tree reclaim flag as well. */ | |
244 | - __xfs_inode_clear_reclaim_tag(mp, pag, ip); | |
245 | - } else if (!igrab(VFS_I(ip))) { | |
243 | + error = -inode_init_always(mp->m_super, inode); | |
244 | + if (error) { | |
245 | + /* | |
246 | + * Re-initializing the inode failed, and we are in deep | |
247 | + * trouble. Try to re-add it to the reclaim list. | |
248 | + */ | |
249 | + read_lock(&pag->pag_ici_lock); | |
250 | + spin_lock(&ip->i_flags_lock); | |
251 | + | |
252 | + ip->i_flags &= ~XFS_INEW; | |
253 | + ip->i_flags |= XFS_IRECLAIMABLE; | |
254 | + __xfs_inode_set_reclaim_tag(pag, ip); | |
255 | + goto out_error; | |
256 | + } | |
257 | + inode->i_state = I_LOCK|I_NEW; | |
258 | + } else { | |
246 | 259 | /* If the VFS inode is being torn down, pause and try again. */ |
247 | - XFS_STATS_INC(xs_ig_frecycle); | |
248 | - goto out_error; | |
249 | - } else if (xfs_iflags_test(ip, XFS_INEW)) { | |
250 | - /* | |
251 | - * We are racing with another cache hit that is | |
252 | - * currently recycling this inode out of the XFS_IRECLAIMABLE | |
253 | - * state. Wait for the initialisation to complete before | |
254 | - * continuing. | |
255 | - */ | |
256 | - wait_on_inode(VFS_I(ip)); | |
257 | - } | |
260 | + if (!igrab(inode)) { | |
261 | + error = EAGAIN; | |
262 | + goto out_error; | |
263 | + } | |
258 | 264 | |
259 | - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { | |
260 | - error = ENOENT; | |
261 | - iput(VFS_I(ip)); | |
262 | - goto out_error; | |
265 | + /* We've got a live one. */ | |
266 | + spin_unlock(&ip->i_flags_lock); | |
267 | + read_unlock(&pag->pag_ici_lock); | |
263 | 268 | } |
264 | 269 | |
265 | - /* We've got a live one. */ | |
266 | - read_unlock(&pag->pag_ici_lock); | |
267 | - | |
268 | 270 | if (lock_flags != 0) |
269 | 271 | xfs_ilock(ip, lock_flags); |
270 | 272 | |
... | ... | @@ -274,6 +276,7 @@ |
274 | 276 | return 0; |
275 | 277 | |
276 | 278 | out_error: |
279 | + spin_unlock(&ip->i_flags_lock); | |
277 | 280 | read_unlock(&pag->pag_ici_lock); |
278 | 281 | return error; |
279 | 282 | } |