Commit 4be536debe3f7b0c62283e77fd6bd8bdb9f83c6f

Authored by David Chinner
Committed by David Chatterton
1 parent 10387e5eb4

[XFS] Prevent free space oversubscription and xfssyncd looping.

The fix for recent ENOSPC deadlocks introduced certain limitations on
allocations. The fix could cause xfssyncd to loop endlessly if we did not
leave some space free for the allocator to work correctly. Basically, we
needed to ensure that we had at least 4 blocks free for an AG free list
and a block for the inode bmap btree at all times.

However, this did not take into account the fact that each AG has a free
list that needs 4 blocks. Hence any filesystem with more than one AG could
cause oversubscription of free space and make xfssyncd spin forever trying
to allocate space needed for AG freelists that was not available in the
AG.

The following patch reserves space for the free lists in all AGs plus the
inode bmap btree, which prevents oversubscription. It also prevents those
blocks from being reported as free space (as they can never be used) and
makes the SMP in-core superblock accounting code and the reserved block
ioctl respect this requirement.

SGI-PV: 955674
SGI-Modid: xfs-linux-melb:xfs-kern:26894a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: David Chatterton <chatz@sgi.com>

Showing 4 changed files with 40 additions and 31 deletions Side-by-side Diff

... ... @@ -44,6 +44,26 @@
44 44 #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
45 45  
46 46 /*
  47 + * In order to avoid ENOSPC-related deadlock caused by
  48 + * out-of-order locking of AGF buffer (PV 947395), we place
  49 + * constraints on the relationship among actual allocations for
  50 + * data blocks, freelist blocks, and potential file data bmap
  51 + * btree blocks. However, these restrictions may result in no
  52 + * actual space allocated for a delayed extent, for example, a data
  53 + * block in a certain AG is allocated but there is no additional
  54 + * block for the additional bmap btree block due to a split of the
  55 + * bmap btree of the file. The result of this may lead to an
  56 + * infinite loop in xfssyncd when the file gets flushed to disk and
  57 + * all delayed extents need to be actually allocated. To get around
  58 + * this, we explicitly set aside a few blocks which will not be
  59 + * reserved in delayed allocation. Considering that the minimum number
  60 + * of needed freelist blocks is 4 fsbs _per AG_ and that a potential split
  61 + * of a file's bmap btree requires 1 fsb, we set the number of set-aside
  62 + * blocks to 4 + 4*agcount.
  63 + */
  64 +#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
  65 +
  66 +/*
47 67 * Argument structure for xfs_alloc routines.
48 68 * This is turned into a structure to avoid having 20 arguments passed
49 69 * down several levels of the stack.
... ... @@ -462,7 +462,7 @@
462 462  
463 463 xfs_icsb_sync_counters_lazy(mp);
464 464 s = XFS_SB_LOCK(mp);
465   - cnt->freedata = mp->m_sb.sb_fdblocks;
  465 + cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
466 466 cnt->freertx = mp->m_sb.sb_frextents;
467 467 cnt->freeino = mp->m_sb.sb_ifree;
468 468 cnt->allocino = mp->m_sb.sb_icount;
469 469  
470 470  
471 471  
... ... @@ -519,15 +519,19 @@
519 519 }
520 520 mp->m_resblks = request;
521 521 } else {
  522 + __int64_t free;
  523 +
  524 + free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
522 525 delta = request - mp->m_resblks;
523   - lcounter = mp->m_sb.sb_fdblocks - delta;
  526 + lcounter = free - delta;
524 527 if (lcounter < 0) {
525 528 /* We can't satisfy the request, just get what we can */
526   - mp->m_resblks += mp->m_sb.sb_fdblocks;
527   - mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
528   - mp->m_sb.sb_fdblocks = 0;
  529 + mp->m_resblks += free;
  530 + mp->m_resblks_avail += free;
  531 + mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp);
529 532 } else {
530   - mp->m_sb.sb_fdblocks = lcounter;
  533 + mp->m_sb.sb_fdblocks =
  534 + lcounter + XFS_ALLOC_SET_ASIDE(mp);
531 535 mp->m_resblks = request;
532 536 mp->m_resblks_avail += delta;
533 537 }
... ... @@ -1243,24 +1243,6 @@
1243 1243 xfs_trans_log_buf(tp, bp, first, last);
1244 1244 }
1245 1245  
1246   -/*
1247   - * In order to avoid ENOSPC-related deadlock caused by
1248   - * out-of-order locking of AGF buffer (PV 947395), we place
1249   - * constraints on the relationship among actual allocations for
1250   - * data blocks, freelist blocks, and potential file data bmap
1251   - * btree blocks. However, these restrictions may result in no
1252   - * actual space allocated for a delayed extent, for example, a data
1253   - * block in a certain AG is allocated but there is no additional
1254   - * block for the additional bmap btree block due to a split of the
1255   - * bmap btree of the file. The result of this may lead to an
1256   - * infinite loop in xfssyncd when the file gets flushed to disk and
1257   - * all delayed extents need to be actually allocated. To get around
1258   - * this, we explicitly set aside a few blocks which will not be
1259   - * reserved in delayed allocation. Considering the minimum number of
1260   - * needed freelist blocks is 4 fsbs, a potential split of file's bmap
1261   - * btree requires 1 fsb, so we set the number of set-aside blocks to 8.
1262   -*/
1263   -#define SET_ASIDE_BLOCKS 8
1264 1246  
1265 1247 /*
1266 1248 * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
... ... @@ -1306,7 +1288,8 @@
1306 1288 return 0;
1307 1289 case XFS_SBS_FDBLOCKS:
1308 1290  
1309   - lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
  1291 + lcounter = (long long)
  1292 + mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1310 1293 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1311 1294  
1312 1295 if (delta > 0) { /* Putting blocks back */
... ... @@ -1340,7 +1323,7 @@
1340 1323 }
1341 1324 }
1342 1325  
1343   - mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
  1326 + mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1344 1327 return 0;
1345 1328 case XFS_SBS_FREXTENTS:
1346 1329 lcounter = (long long)mp->m_sb.sb_frextents;
... ... @@ -2021,7 +2004,8 @@
2021 2004 * when we get near ENOSPC.
2022 2005 */
2023 2006 #define XFS_ICSB_INO_CNTR_REENABLE 64
2024   -#define XFS_ICSB_FDBLK_CNTR_REENABLE 512
  2007 +#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
  2008 + (512 + XFS_ALLOC_SET_ASIDE(mp))
2025 2009 STATIC void
2026 2010 xfs_icsb_balance_counter(
2027 2011 xfs_mount_t *mp,
... ... @@ -2055,7 +2039,7 @@
2055 2039 case XFS_SBS_FDBLOCKS:
2056 2040 count = mp->m_sb.sb_fdblocks;
2057 2041 resid = do_div(count, weight);
2058   - if (count < XFS_ICSB_FDBLK_CNTR_REENABLE)
  2042 + if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp))
2059 2043 goto out;
2060 2044 break;
2061 2045 default:
2062 2046  
... ... @@ -2110,11 +2094,11 @@
2110 2094 case XFS_SBS_FDBLOCKS:
2111 2095 BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
2112 2096  
2113   - lcounter = icsbp->icsb_fdblocks;
  2097 + lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
2114 2098 lcounter += delta;
2115 2099 if (unlikely(lcounter < 0))
2116 2100 goto slow_path;
2117   - icsbp->icsb_fdblocks = lcounter;
  2101 + icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
2118 2102 break;
2119 2103 default:
2120 2104 BUG();
... ... @@ -811,7 +811,8 @@
811 811 statp->f_bsize = sbp->sb_blocksize;
812 812 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
813 813 statp->f_blocks = sbp->sb_dblocks - lsize;
814   - statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks;
  814 + statp->f_bfree = statp->f_bavail =
  815 + sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
815 816 fakeinos = statp->f_bfree << sbp->sb_inopblog;
816 817 #if XFS_BIG_INUMS
817 818 fakeinos += mp->m_inoadd;