Commit bc990f5cb424cdca9dda866785d088e2c2110ecc

Authored by Christoph Hellwig
Committed by Felix Blyakher
1 parent 894ef820b1

xfs: fix locking in xfs_iget_cache_hit

The locking in xfs_iget_cache_hit currently has numerous problems:

 - we clear the reclaim tag without i_flags_lock which protects
   modifications to it
 - we call inode_init_always which can sleep with pag_ici_lock
   held (this is oss.sgi.com BZ #819)
 - we acquire and drop i_flags_lock a lot and thus provide no
   consistency between the various flags we set/clear under it

This patch fixes all that with a major revamp of the locking in
the function.  The new version acquires i_flags_lock early and
only drops it once we need to call into inode_init_always or before
calling xfs_ilock.

This patch fixes a bug seen in the wild where we race modifying the
reclaim tag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Felix Blyakher <felixb@sgi.com>

Showing 3 changed files with 70 additions and 57 deletions Side-by-side Diff

fs/xfs/linux-2.6/xfs_sync.c
... ... @@ -708,6 +708,16 @@
708 708 return 0;
709 709 }
710 710  
  711 +void
  712 +__xfs_inode_set_reclaim_tag(
  713 + struct xfs_perag *pag,
  714 + struct xfs_inode *ip)
  715 +{
  716 + radix_tree_tag_set(&pag->pag_ici_root,
  717 + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
  718 + XFS_ICI_RECLAIM_TAG);
  719 +}
  720 +
711 721 /*
712 722 * We set the inode flag atomically with the radix tree tag.
713 723 * Once we get tag lookups on the radix tree, this inode flag
... ... @@ -722,8 +732,7 @@
722 732  
723 733 read_lock(&pag->pag_ici_lock);
724 734 spin_lock(&ip->i_flags_lock);
725   - radix_tree_tag_set(&pag->pag_ici_root,
726   - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
  735 + __xfs_inode_set_reclaim_tag(pag, ip);
727 736 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
728 737 spin_unlock(&ip->i_flags_lock);
729 738 read_unlock(&pag->pag_ici_lock);
fs/xfs/linux-2.6/xfs_sync.h
... ... @@ -48,6 +48,7 @@
48 48 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 49  
50 50 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
  51 +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
51 52 void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
52 53 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
53 54 struct xfs_inode *ip);
... ... @@ -191,80 +191,82 @@
191 191 int flags,
192 192 int lock_flags) __releases(pag->pag_ici_lock)
193 193 {
  194 + struct inode *inode = VFS_I(ip);
194 195 struct xfs_mount *mp = ip->i_mount;
195   - int error = EAGAIN;
  196 + int error;
196 197  
  198 + spin_lock(&ip->i_flags_lock);
  199 +
197 200 /*
198   - * If INEW is set this inode is being set up
199   - * If IRECLAIM is set this inode is being torn down
200   - * Pause and try again.
  201 + * If we are racing with another cache hit that is currently
  202 + * instantiating this inode or currently recycling it out of
  203 + * reclaimabe state, wait for the initialisation to complete
  204 + * before continuing.
  205 + *
  206 + * XXX(hch): eventually we should do something equivalent to
  207 + * wait_on_inode to wait for these flags to be cleared
  208 + * instead of polling for it.
201 209 */
202   - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
  210 + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
203 211 XFS_STATS_INC(xs_ig_frecycle);
  212 + error = EAGAIN;
204 213 goto out_error;
205 214 }
206 215  
207   - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
208   - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
  216 + /*
  217 + * If lookup is racing with unlink return an error immediately.
  218 + */
  219 + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
  220 + error = ENOENT;
  221 + goto out_error;
  222 + }
209 223  
210   - /*
211   - * If lookup is racing with unlink, then we should return an
212   - * error immediately so we don't remove it from the reclaim
213   - * list and potentially leak the inode.
214   - */
215   - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
216   - error = ENOENT;
217   - goto out_error;
218   - }
219   -
  224 + /*
  225 + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
  226 + * Need to carefully get it back into useable state.
  227 + */
  228 + if (ip->i_flags & XFS_IRECLAIMABLE) {
220 229 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
221 230  
222 231 /*
223   - * We need to re-initialise the VFS inode as it has been
224   - * 'freed' by the VFS. Do this here so we can deal with
225   - * errors cleanly, then tag it so it can be set up correctly
226   - * later.
  232 + * We need to set XFS_INEW atomically with clearing the
  233 + * reclaimable tag so that we do have an indicator of the
  234 + * inode still being initialized.
227 235 */
228   - if (inode_init_always(mp->m_super, VFS_I(ip))) {
229   - error = ENOMEM;
230   - goto out_error;
231   - }
  236 + ip->i_flags |= XFS_INEW;
  237 + ip->i_flags &= ~XFS_IRECLAIMABLE;
  238 + __xfs_inode_clear_reclaim_tag(mp, pag, ip);
232 239  
233   - /*
234   - * We must set the XFS_INEW flag before clearing the
235   - * XFS_IRECLAIMABLE flag so that if a racing lookup does
236   - * not find the XFS_IRECLAIMABLE above but has the igrab()
237   - * below succeed we can safely check XFS_INEW to detect
238   - * that this inode is still being initialised.
239   - */
240   - xfs_iflags_set(ip, XFS_INEW);
241   - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
  240 + spin_unlock(&ip->i_flags_lock);
  241 + read_unlock(&pag->pag_ici_lock);
242 242  
243   - /* clear the radix tree reclaim flag as well. */
244   - __xfs_inode_clear_reclaim_tag(mp, pag, ip);
245   - } else if (!igrab(VFS_I(ip))) {
  243 + error = -inode_init_always(mp->m_super, inode);
  244 + if (error) {
  245 + /*
  246 + * Re-initializing the inode failed, and we are in deep
  247 + * trouble. Try to re-add it to the reclaim list.
  248 + */
  249 + read_lock(&pag->pag_ici_lock);
  250 + spin_lock(&ip->i_flags_lock);
  251 +
  252 + ip->i_flags &= ~XFS_INEW;
  253 + ip->i_flags |= XFS_IRECLAIMABLE;
  254 + __xfs_inode_set_reclaim_tag(pag, ip);
  255 + goto out_error;
  256 + }
  257 + inode->i_state = I_LOCK|I_NEW;
  258 + } else {
246 259 /* If the VFS inode is being torn down, pause and try again. */
247   - XFS_STATS_INC(xs_ig_frecycle);
248   - goto out_error;
249   - } else if (xfs_iflags_test(ip, XFS_INEW)) {
250   - /*
251   - * We are racing with another cache hit that is
252   - * currently recycling this inode out of the XFS_IRECLAIMABLE
253   - * state. Wait for the initialisation to complete before
254   - * continuing.
255   - */
256   - wait_on_inode(VFS_I(ip));
257   - }
  260 + if (!igrab(inode)) {
  261 + error = EAGAIN;
  262 + goto out_error;
  263 + }
258 264  
259   - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
260   - error = ENOENT;
261   - iput(VFS_I(ip));
262   - goto out_error;
  265 + /* We've got a live one. */
  266 + spin_unlock(&ip->i_flags_lock);
  267 + read_unlock(&pag->pag_ici_lock);
263 268 }
264 269  
265   - /* We've got a live one. */
266   - read_unlock(&pag->pag_ici_lock);
267   -
268 270 if (lock_flags != 0)
269 271 xfs_ilock(ip, lock_flags);
270 272  
... ... @@ -274,6 +276,7 @@
274 276 return 0;
275 277  
276 278 out_error:
  279 + spin_unlock(&ip->i_flags_lock);
277 280 read_unlock(&pag->pag_ici_lock);
278 281 return error;
279 282 }