Commit ba87ea699ebd9dd577bf055ebc4a98200e337542

Authored by Lachlan McIlroy
Committed by Tim Shimmin
1 parent 2a32963130

[XFS] Fix to prevent the notorious 'NULL files' problem after a crash.

The problem addressed here is synchronising updates of the file size with
writes that extend a file. Without the fix, the update of a file's size as
a result of a write beyond eof is independent of when the cached data is
flushed to disk. Often the file size update is written to the filesystem
log before the data is flushed to disk. If the system crashes between
these two events and the filesystem log is replayed on mount, the file's
size is set, but since the contents never made it to disk the file is full
of holes. If some of the cached data was flushed to disk before the crash,
the holes may be confined to a section at the end of the file.

There are existing fixes that help alleviate this problem, particularly in
the case where a file has been truncated, by forcing cached data to be
flushed to disk when the file is closed. If the system crashes while the
file(s) are still open, however, this flushing never occurs.

The fix we have implemented is to introduce a second file size, called the
in-memory file size, that represents the current file size as viewed by
the user. The existing file size, called the on-disk file size, is the one
that gets written to the filesystem log, and we only update it when it is
safe to do so. When we write to a file beyond eof, the write operation
only updates the in-memory file size. Later, when the I/O operation that
flushes the cached data to disk completes, an I/O completion routine
updates the on-disk file size. The on-disk file size is updated to the
maximum offset of the I/O, or to the value of the in-memory file size if
the I/O includes eof.
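As a rough illustration of the completion-time rule described above, here is a
minimal sketch of how the new on-disk size could be derived once an I/O has
reached disk. The helper and its parameter names (in_memory_size for ip->i_size,
intended_size for io_new_size) are illustrative only, not the exact kernel code
in xfs_setfilesize() shown in the diff below.

	/*
	 * Illustrative sketch only: the on-disk size is raised to the
	 * larger of the in-memory size and the size intended at write
	 * time, but never past the end of the data that actually hit
	 * the disk in this I/O.
	 */
	typedef long long fsize_t;

	static fsize_t
	ondisk_size_after_io(fsize_t in_memory_size, fsize_t intended_size,
			     fsize_t io_offset, fsize_t io_size)
	{
		fsize_t io_end = io_offset + io_size;
		fsize_t isize;

		/* The user-visible size may already exceed the intended size. */
		isize = (in_memory_size > intended_size) ?
				in_memory_size : intended_size;

		/* Never advance past data that has actually been written. */
		return (isize < io_end) ? isize : io_end;
	}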

SGI-PV: 958522
SGI-Modid: xfs-linux-melb:xfs-kern:28322a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>

Showing 9 changed files with 208 additions and 88 deletions

fs/xfs/linux-2.6/xfs_aops.c
... ... @@ -141,9 +141,46 @@
141 141 }
142 142  
143 143 /*
  144 + * Update on-disk file size now that data has been written to disk.
  145 + * The current in-memory file size is i_size. If a write is beyond
  146 + * eof io_new_size will be the intended file size until i_size is
  147 + * updated. If this write does not extend all the way to the valid
  148 + * file size then restrict this update to the end of the write.
  149 + */
  150 +STATIC void
  151 +xfs_setfilesize(
  152 + xfs_ioend_t *ioend)
  153 +{
  154 + xfs_inode_t *ip;
  155 + xfs_fsize_t isize;
  156 + xfs_fsize_t bsize;
  157 +
  158 + ip = xfs_vtoi(ioend->io_vnode);
  159 +
  160 + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
  161 + ASSERT(ioend->io_type != IOMAP_READ);
  162 +
  163 + if (unlikely(ioend->io_error))
  164 + return;
  165 +
  166 + bsize = ioend->io_offset + ioend->io_size;
  167 +
  168 + xfs_ilock(ip, XFS_ILOCK_EXCL);
  169 +
  170 + isize = MAX(ip->i_size, ip->i_iocore.io_new_size);
  171 + isize = MIN(isize, bsize);
  172 +
  173 + if (ip->i_d.di_size < isize) {
  174 + ip->i_d.di_size = isize;
  175 + ip->i_update_core = 1;
  176 + ip->i_update_size = 1;
  177 + }
  178 +
  179 + xfs_iunlock(ip, XFS_ILOCK_EXCL);
  180 +}
  181 +
  182 +/*
144 183 * Buffered IO write completion for delayed allocate extents.
145   - * TODO: Update ondisk isize now that we know the file data
146   - * has been flushed (i.e. the notorious "NULL file" problem).
147 184 */
148 185 STATIC void
149 186 xfs_end_bio_delalloc(
... ... @@ -152,6 +189,7 @@
152 189 xfs_ioend_t *ioend =
153 190 container_of(work, xfs_ioend_t, io_work);
154 191  
  192 + xfs_setfilesize(ioend);
155 193 xfs_destroy_ioend(ioend);
156 194 }
157 195  
... ... @@ -165,6 +203,7 @@
165 203 xfs_ioend_t *ioend =
166 204 container_of(work, xfs_ioend_t, io_work);
167 205  
  206 + xfs_setfilesize(ioend);
168 207 xfs_destroy_ioend(ioend);
169 208 }
170 209  
171 210  
172 211  
... ... @@ -184,12 +223,27 @@
184 223 xfs_off_t offset = ioend->io_offset;
185 224 size_t size = ioend->io_size;
186 225  
187   - if (likely(!ioend->io_error))
  226 + if (likely(!ioend->io_error)) {
188 227 bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
  228 + xfs_setfilesize(ioend);
  229 + }
189 230 xfs_destroy_ioend(ioend);
190 231 }
191 232  
192 233 /*
  234 + * IO read completion for regular, written extents.
  235 + */
  236 +STATIC void
  237 +xfs_end_bio_read(
  238 + struct work_struct *work)
  239 +{
  240 + xfs_ioend_t *ioend =
  241 + container_of(work, xfs_ioend_t, io_work);
  242 +
  243 + xfs_destroy_ioend(ioend);
  244 +}
  245 +
  246 +/*
193 247 * Allocate and initialise an IO completion structure.
194 248 * We need to track unwritten extent write completion here initially.
195 249 * We'll need to extend this for updating the ondisk inode size later
... ... @@ -224,6 +278,8 @@
224 278 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
225 279 else if (type == IOMAP_DELAY)
226 280 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
  281 + else if (type == IOMAP_READ)
  282 + INIT_WORK(&ioend->io_work, xfs_end_bio_read);
227 283 else
228 284 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
229 285  
... ... @@ -913,7 +969,7 @@
913 969 bh = head = page_buffers(page);
914 970 offset = page_offset(page);
915 971 flags = -1;
916   - type = 0;
  972 + type = IOMAP_READ;
917 973  
918 974 /* TODO: cleanup count and page_dirty */
919 975  
... ... @@ -999,7 +1055,7 @@
999 1055 * That means it must already have extents allocated
1000 1056 * underneath it. Map the extent by reading it.
1001 1057 */
1002   - if (!iomap_valid || type != 0) {
  1058 + if (!iomap_valid || type != IOMAP_READ) {
1003 1059 flags = BMAPI_READ;
1004 1060 size = xfs_probe_cluster(inode, page, bh,
1005 1061 head, 1);
... ... @@ -1010,7 +1066,7 @@
1010 1066 iomap_valid = xfs_iomap_valid(&iomap, offset);
1011 1067 }
1012 1068  
1013   - type = 0;
  1069 + type = IOMAP_READ;
1014 1070 if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
1015 1071 ASSERT(buffer_mapped(bh));
1016 1072 if (iomap_valid)
1017 1073  
1018 1074  
... ... @@ -1356,12 +1412,21 @@
1356 1412 * completion handler in the future, in which case all this can
1357 1413 * go away.
1358 1414 */
1359   - if (private && size > 0) {
1360   - ioend->io_offset = offset;
1361   - ioend->io_size = size;
  1415 + ioend->io_offset = offset;
  1416 + ioend->io_size = size;
  1417 + if (ioend->io_type == IOMAP_READ) {
1362 1418 xfs_finish_ioend(ioend);
  1419 + } else if (private && size > 0) {
  1420 + xfs_finish_ioend(ioend);
1363 1421 } else {
1364   - xfs_destroy_ioend(ioend);
  1422 + /*
  1423 + * A direct I/O write ioend starts its life in unwritten
  1424 + * state in case it maps an unwritten extent. This write
  1425 + * didn't map an unwritten extent so switch its completion
  1426 + * handler.
  1427 + */
  1428 + INIT_WORK(&ioend->io_work, xfs_end_bio_written);
  1429 + xfs_finish_ioend(ioend);
1365 1430 }
1366 1431  
1367 1432 /*
1368 1433  
1369 1434  
... ... @@ -1392,15 +1457,15 @@
1392 1457 if (error)
1393 1458 return -error;
1394 1459  
1395   - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1396   -
1397 1460 if (rw == WRITE) {
  1461 + iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1398 1462 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1399 1463 iomap.iomap_target->bt_bdev,
1400 1464 iov, offset, nr_segs,
1401 1465 xfs_get_blocks_direct,
1402 1466 xfs_end_io_direct);
1403 1467 } else {
  1468 + iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1404 1469 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1405 1470 iomap.iomap_target->bt_bdev,
1406 1471 iov, offset, nr_segs,
fs/xfs/linux-2.6/xfs_lrw.c
... ... @@ -224,7 +224,7 @@
224 224 mp->m_rtdev_targp : mp->m_ddev_targp;
225 225 if ((*offset & target->bt_smask) ||
226 226 (size & target->bt_smask)) {
227   - if (*offset == ip->i_d.di_size) {
  227 + if (*offset == ip->i_size) {
228 228 return (0);
229 229 }
230 230 return -XFS_ERROR(EINVAL);
231 231  
... ... @@ -387,9 +387,10 @@
387 387 {
388 388 xfs_inode_t *ip = XFS_BHVTOI(bdp);
389 389 xfs_mount_t *mp = ip->i_mount;
  390 + xfs_iocore_t *io = &ip->i_iocore;
390 391 ssize_t ret;
391 392 struct inode *inode = outfilp->f_mapping->host;
392   - xfs_fsize_t isize;
  393 + xfs_fsize_t isize, new_size;
393 394  
394 395 XFS_STATS_INC(xs_write_calls);
395 396 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
... ... @@ -410,6 +411,14 @@
410 411 return -error;
411 412 }
412 413 }
  414 +
  415 + new_size = *ppos + count;
  416 +
  417 + xfs_ilock(ip, XFS_ILOCK_EXCL);
  418 + if (new_size > ip->i_size)
  419 + io->io_new_size = new_size;
  420 + xfs_iunlock(ip, XFS_ILOCK_EXCL);
  421 +
413 422 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
414 423 pipe, count, *ppos, ioflags);
415 424 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
416 425  
417 426  
... ... @@ -420,16 +429,20 @@
420 429 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
421 430 *ppos = isize;
422 431  
423   - if (*ppos > ip->i_d.di_size) {
  432 + if (*ppos > ip->i_size) {
424 433 xfs_ilock(ip, XFS_ILOCK_EXCL);
425   - if (*ppos > ip->i_d.di_size) {
426   - ip->i_d.di_size = *ppos;
427   - i_size_write(inode, *ppos);
428   - ip->i_update_core = 1;
429   - ip->i_update_size = 1;
430   - }
  434 + if (*ppos > ip->i_size)
  435 + ip->i_size = *ppos;
431 436 xfs_iunlock(ip, XFS_ILOCK_EXCL);
432 437 }
  438 +
  439 + if (io->io_new_size) {
  440 + xfs_ilock(ip, XFS_ILOCK_EXCL);
  441 + io->io_new_size = 0;
  442 + if (ip->i_d.di_size > ip->i_size)
  443 + ip->i_d.di_size = ip->i_size;
  444 + xfs_iunlock(ip, XFS_ILOCK_EXCL);
  445 + }
433 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
434 447 return ret;
435 448 }
... ... @@ -711,8 +724,6 @@
711 724 goto out_unlock_mutex;
712 725 }
713 726  
714   - isize = i_size_read(inode);
715   -
716 727 if (ioflags & IO_ISDIRECT) {
717 728 xfs_buftarg_t *target =
718 729 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
... ... @@ -723,7 +734,7 @@
723 734 return XFS_ERROR(-EINVAL);
724 735 }
725 736  
726   - if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) {
  737 + if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
727 738 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
728 739 iolock = XFS_IOLOCK_EXCL;
729 740 locktype = VRWLOCK_WRITE;
... ... @@ -735,7 +746,7 @@
735 746 }
736 747  
737 748 new_size = pos + count;
738   - if (new_size > isize)
  749 + if (new_size > xip->i_size)
739 750 io->io_new_size = new_size;
740 751  
741 752 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
... ... @@ -751,8 +762,7 @@
751 762 pos, count,
752 763 dmflags, &locktype);
753 764 if (error) {
754   - xfs_iunlock(xip, iolock);
755   - goto out_unlock_mutex;
  765 + goto out_unlock_internal;
756 766 }
757 767 xfs_ilock(xip, XFS_ILOCK_EXCL);
758 768 eventsent = 1;
759 769  
... ... @@ -764,9 +774,8 @@
764 774 * event prevents another call to XFS_SEND_DATA, which is
765 775 * what allows the size to change in the first place.
766 776 */
767   - if ((file->f_flags & O_APPEND) && savedsize != isize) {
  777 + if ((file->f_flags & O_APPEND) && savedsize != xip->i_size)
768 778 goto start;
769   - }
770 779 }
771 780  
772 781 if (likely(!(ioflags & IO_INVIS))) {
773 782  
... ... @@ -784,11 +793,11 @@
784 793 * to zero it out up to the new size.
785 794 */
786 795  
787   - if (pos > isize) {
788   - error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize);
  796 + if (pos > xip->i_size) {
  797 + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size);
789 798 if (error) {
790   - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
791   - goto out_unlock_mutex;
  799 + xfs_iunlock(xip, XFS_ILOCK_EXCL);
  800 + goto out_unlock_internal;
792 801 }
793 802 }
794 803 xfs_iunlock(xip, XFS_ILOCK_EXCL);
... ... @@ -808,8 +817,7 @@
808 817 if (likely(!error))
809 818 error = -remove_suid(file->f_path.dentry);
810 819 if (unlikely(error)) {
811   - xfs_iunlock(xip, iolock);
812   - goto out_unlock_mutex;
  820 + goto out_unlock_internal;
813 821 }
814 822 }
815 823  
816 824  
... ... @@ -879,12 +887,12 @@
879 887 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
880 888 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
881 889 0, 0, 0); /* Delay flag intentionally unused */
882   - if (error)
883   - goto out_nounlocks;
884 890 if (need_i_mutex)
885 891 mutex_lock(&inode->i_mutex);
886 892 xfs_rwlock(bdp, locktype);
887   - pos = xip->i_d.di_size;
  893 + if (error)
  894 + goto out_unlock_internal;
  895 + pos = xip->i_size;
888 896 ret = 0;
889 897 goto retry;
890 898 }
891 899  
... ... @@ -893,14 +901,10 @@
893 901 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
894 902 *offset = isize;
895 903  
896   - if (*offset > xip->i_d.di_size) {
  904 + if (*offset > xip->i_size) {
897 905 xfs_ilock(xip, XFS_ILOCK_EXCL);
898   - if (*offset > xip->i_d.di_size) {
899   - xip->i_d.di_size = *offset;
900   - i_size_write(inode, *offset);
901   - xip->i_update_core = 1;
902   - xip->i_update_size = 1;
903   - }
  906 + if (*offset > xip->i_size)
  907 + xip->i_size = *offset;
904 908 xfs_iunlock(xip, XFS_ILOCK_EXCL);
905 909 }
906 910  
907 911  
908 912  
... ... @@ -922,16 +926,31 @@
922 926  
923 927 error = sync_page_range(inode, mapping, pos, ret);
924 928 if (!error)
925   - error = ret;
926   - return error;
  929 + error = -ret;
  930 + if (need_i_mutex)
  931 + mutex_lock(&inode->i_mutex);
  932 + xfs_rwlock(bdp, locktype);
927 933 }
928 934  
929 935 out_unlock_internal:
  936 + if (io->io_new_size) {
  937 + xfs_ilock(xip, XFS_ILOCK_EXCL);
  938 + io->io_new_size = 0;
  939 + /*
  940 + * If this was a direct or synchronous I/O that failed (such
  941 + * as ENOSPC) then part of the I/O may have been written to
  942 + * disk before the error occurred. In this case the on-disk
  943 + * file size may have been adjusted beyond the in-memory file
  944 + * size and now needs to be truncated back.
  945 + */
  946 + if (xip->i_d.di_size > xip->i_size)
  947 + xip->i_d.di_size = xip->i_size;
  948 + xfs_iunlock(xip, XFS_ILOCK_EXCL);
  949 + }
930 950 xfs_rwunlock(bdp, locktype);
931 951 out_unlock_mutex:
932 952 if (need_i_mutex)
933 953 mutex_unlock(&inode->i_mutex);
934   - out_nounlocks:
935 954 return -error;
936 955 }
937 956  
fs/xfs/xfs_bmap.c
... ... @@ -4444,8 +4444,11 @@
4444 4444 xfs_bmbt_irec_t s; /* internal version of extent */
4445 4445  
4446 4446 #ifndef DEBUG
4447   - if (whichfork == XFS_DATA_FORK)
4448   - return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
  4447 + if (whichfork == XFS_DATA_FORK) {
  4448 + return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ?
  4449 + (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
  4450 + (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
  4451 + }
4449 4452 #endif /* !DEBUG */
4450 4453 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
4451 4454 return 0;
... ... @@ -4457,7 +4460,7 @@
4457 4460 xfs_bmbt_get_all(ep, &s);
4458 4461 rval = s.br_startoff == 0 && s.br_blockcount == 1;
4459 4462 if (rval && whichfork == XFS_DATA_FORK)
4460   - ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
  4463 + ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
4461 4464 return rval;
4462 4465 }
4463 4466  
... ... @@ -5817,7 +5820,7 @@
5817 5820 fixlen = XFS_MAXIOFFSET(mp);
5818 5821 } else {
5819 5822 prealloced = 0;
5820   - fixlen = ip->i_d.di_size;
  5823 + fixlen = ip->i_size;
5821 5824 }
5822 5825 } else {
5823 5826 prealloced = 0;
... ... @@ -5841,7 +5844,8 @@
5841 5844  
5842 5845 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5843 5846  
5844   - if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
  5847 + if (whichfork == XFS_DATA_FORK &&
  5848 + (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
5845 5849 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
5846 5850 error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
5847 5851 }
fs/xfs/xfs_inode.c
... ... @@ -442,6 +442,7 @@
442 442 return XFS_ERROR(EFSCORRUPTED);
443 443 }
444 444 ip->i_d.di_size = 0;
  445 + ip->i_size = 0;
445 446 ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
446 447 break;
447 448  
... ... @@ -980,6 +981,7 @@
980 981 }
981 982  
982 983 ip->i_delayed_blks = 0;
  984 + ip->i_size = ip->i_d.di_size;
983 985  
984 986 /*
985 987 * Mark the buffer containing the inode as something to keep
... ... @@ -1170,6 +1172,7 @@
1170 1172 }
1171 1173  
1172 1174 ip->i_d.di_size = 0;
  1175 + ip->i_size = 0;
1173 1176 ip->i_d.di_nextents = 0;
1174 1177 ASSERT(ip->i_d.di_nblocks == 0);
1175 1178 xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
... ... @@ -1340,7 +1343,7 @@
1340 1343 } else {
1341 1344 last_block = 0;
1342 1345 }
1343   - size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
  1346 + size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
1344 1347 last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1345 1348  
1346 1349 last_byte = XFS_FSB_TO_B(mp, last_block);
... ... @@ -1434,7 +1437,7 @@
1434 1437 int error = 0;
1435 1438  
1436 1439 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1437   - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
  1440 + ASSERT((new_size == 0) || (new_size <= ip->i_size));
1438 1441 ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
1439 1442 (flags == XFS_ITRUNC_MAYBE));
1440 1443  
... ... @@ -1558,7 +1561,7 @@
1558 1561  
1559 1562 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1560 1563 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
1561   - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
  1564 + ASSERT((new_size == 0) || (new_size <= ip->i_size));
1562 1565 ASSERT(*tp != NULL);
1563 1566 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1564 1567 ASSERT(ip->i_transp == *tp);
... ... @@ -1632,8 +1635,20 @@
1632 1635 */
1633 1636 if (fork == XFS_DATA_FORK) {
1634 1637 if (ip->i_d.di_nextents > 0) {
1635   - ip->i_d.di_size = new_size;
1636   - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
  1638 + /*
  1639 + * If we are not changing the file size then do
  1640 + * not update the on-disk file size - we may be
  1641 + * called from xfs_inactive_free_eofblocks(). If we
  1642 + * update the on-disk file size and then the system
  1643 + * crashes before the contents of the file are
  1644 + * flushed to disk then the files may be full of
  1645 + * holes (ie NULL files bug).
  1646 + */
  1647 + if (ip->i_size != new_size) {
  1648 + ip->i_d.di_size = new_size;
  1649 + ip->i_size = new_size;
  1650 + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
  1651 + }
1637 1652 }
1638 1653 } else if (sync) {
1639 1654 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
... ... @@ -1769,7 +1784,19 @@
1769 1784 */
1770 1785 if (fork == XFS_DATA_FORK) {
1771 1786 xfs_isize_check(mp, ip, new_size);
1772   - ip->i_d.di_size = new_size;
  1787 + /*
  1788 + * If we are not changing the file size then do
  1789 + * not update the on-disk file size - we may be
  1790 + * called from xfs_inactive_free_eofblocks(). If we
  1791 + * update the on-disk file size and then the system
  1792 + * crashes before the contents of the file are
  1793 + * flushed to disk then the files may be full of
  1794 + * holes (ie NULL files bug).
  1795 + */
  1796 + if (ip->i_size != new_size) {
  1797 + ip->i_d.di_size = new_size;
  1798 + ip->i_size = new_size;
  1799 + }
1773 1800 }
1774 1801 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1775 1802 ASSERT((new_size != 0) ||
... ... @@ -1802,7 +1829,7 @@
1802 1829  
1803 1830 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1804 1831 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1805   - ASSERT(new_size > ip->i_d.di_size);
  1832 + ASSERT(new_size > ip->i_size);
1806 1833  
1807 1834 /*
1808 1835 * Zero any pages that may have been created by
... ... @@ -1810,7 +1837,7 @@
1810 1837 * and any blocks between the old and new file sizes.
1811 1838 */
1812 1839 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size,
1813   - ip->i_d.di_size);
  1840 + ip->i_size);
1814 1841 return error;
1815 1842 }
1816 1843  
1817 1844  
... ... @@ -1834,13 +1861,14 @@
1834 1861 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1835 1862 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1836 1863 ASSERT(ip->i_transp == tp);
1837   - ASSERT(new_size > ip->i_d.di_size);
  1864 + ASSERT(new_size > ip->i_size);
1838 1865  
1839 1866 /*
1840 1867 * Update the file size. Update the inode change timestamp
1841 1868 * if change_flag set.
1842 1869 */
1843 1870 ip->i_d.di_size = new_size;
  1871 + ip->i_size = new_size;
1844 1872 if (change_flag)
1845 1873 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1846 1874 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
... ... @@ -2323,7 +2351,7 @@
2323 2351 ASSERT(ip->i_d.di_nlink == 0);
2324 2352 ASSERT(ip->i_d.di_nextents == 0);
2325 2353 ASSERT(ip->i_d.di_anextents == 0);
2326   - ASSERT((ip->i_d.di_size == 0) ||
  2354 + ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
2327 2355 ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
2328 2356 ASSERT(ip->i_d.di_nblocks == 0);
2329 2357  
fs/xfs/xfs_inode.h
... ... @@ -287,6 +287,7 @@
287 287 struct xfs_inode *i_cnext; /* cluster hash link forward */
288 288 struct xfs_inode *i_cprev; /* cluster hash link backward */
289 289  
  290 + xfs_fsize_t i_size; /* in-memory size */
290 291 /* Trace buffers per inode. */
291 292 #ifdef XFS_BMAP_TRACE
292 293 struct ktrace *i_xtrace; /* inode extent list trace */
... ... @@ -305,6 +306,8 @@
305 306 #endif
306 307 } xfs_inode_t;
307 308  
  309 +#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
  310 + (ip)->i_size : (ip)->i_d.di_size;
308 311  
309 312 /*
310 313 * i_flags helper functions
fs/xfs/xfs_iocore.c
... ... @@ -52,7 +52,7 @@
52 52 xfs_size_fn(
53 53 xfs_inode_t *ip)
54 54 {
55   - return (ip->i_d.di_size);
  55 + return XFS_ISIZE(ip);
56 56 }
57 57  
58 58 STATIC int
fs/xfs/xfs_iomap.c
... ... @@ -458,7 +458,7 @@
458 458 extsz = ip->i_d.di_extsize;
459 459 }
460 460  
461   - isize = ip->i_d.di_size;
  461 + isize = ip->i_size;
462 462 if (io->io_new_size > isize)
463 463 isize = io->io_new_size;
464 464  
... ... @@ -524,7 +524,7 @@
524 524 xfs_trans_ihold(tp, ip);
525 525  
526 526 bmapi_flag = XFS_BMAPI_WRITE;
527   - if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz))
  527 + if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
528 528 bmapi_flag |= XFS_BMAPI_PREALLOC;
529 529  
530 530 /*
... ... @@ -676,7 +676,7 @@
676 676 offset_fsb = XFS_B_TO_FSBT(mp, offset);
677 677  
678 678 retry:
679   - isize = ip->i_d.di_size;
  679 + isize = ip->i_size;
680 680 if (io->io_new_size > isize)
681 681 isize = io->io_new_size;
682 682  
... ... @@ -817,7 +817,7 @@
817 817 * we dropped the ilock in the interim.
818 818 */
819 819  
820   - end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
  820 + end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
821 821 xfs_bmap_last_offset(NULL, ip, &last_block,
822 822 XFS_DATA_FORK);
823 823 last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
fs/xfs/xfs_iomap.h
... ... @@ -22,6 +22,7 @@
22 22  
23 23  
24 24 typedef enum { /* iomap_flags values */
  25 + IOMAP_READ = 0, /* mapping for a read */
25 26 IOMAP_EOF = 0x01, /* mapping contains EOF */
26 27 IOMAP_HOLE = 0x02, /* mapping covers a hole */
27 28 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
fs/xfs/xfs_vnodeops.c
... ... @@ -133,7 +133,7 @@
133 133 if (!(flags & ATTR_LAZY))
134 134 xfs_ilock(ip, XFS_ILOCK_SHARED);
135 135  
136   - vap->va_size = ip->i_d.di_size;
  136 + vap->va_size = XFS_ISIZE(ip);
137 137 if (vap->va_mask == XFS_AT_SIZE)
138 138 goto all_done;
139 139  
... ... @@ -496,7 +496,7 @@
496 496 if (mask & XFS_AT_SIZE) {
497 497 /* Short circuit the truncate case for zero length files */
498 498 if ((vap->va_size == 0) &&
499   - (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
  499 + (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
500 500 xfs_iunlock(ip, XFS_ILOCK_EXCL);
501 501 lock_flags &= ~XFS_ILOCK_EXCL;
502 502 if (mask & XFS_AT_CTIME)
... ... @@ -614,7 +614,7 @@
614 614 */
615 615 if (mask & XFS_AT_SIZE) {
616 616 code = 0;
617   - if ((vap->va_size > ip->i_d.di_size) &&
  617 + if ((vap->va_size > ip->i_size) &&
618 618 (flags & ATTR_NOSIZETOK) == 0) {
619 619 code = xfs_igrow_start(ip, vap->va_size, credp);
620 620 }
621 621  
... ... @@ -654,10 +654,10 @@
654 654 * Truncate file. Must have write permission and not be a directory.
655 655 */
656 656 if (mask & XFS_AT_SIZE) {
657   - if (vap->va_size > ip->i_d.di_size) {
  657 + if (vap->va_size > ip->i_size) {
658 658 xfs_igrow_finish(tp, ip, vap->va_size,
659 659 !(flags & ATTR_DMI));
660   - } else if ((vap->va_size <= ip->i_d.di_size) ||
  660 + } else if ((vap->va_size <= ip->i_size) ||
661 661 ((vap->va_size == 0) && ip->i_d.di_nextents)) {
662 662 /*
663 663 * signal a sync transaction unless
... ... @@ -1221,7 +1221,7 @@
1221 1221 * Figure out if there are any blocks beyond the end
1222 1222 * of the file. If not, then there is nothing to do.
1223 1223 */
1224   - end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
  1224 + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1225 1225 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1226 1226 map_len = last_fsb - end_fsb;
1227 1227 if (map_len <= 0)
... ... @@ -1258,7 +1258,7 @@
1258 1258 */
1259 1259 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1260 1260 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1261   - ip->i_d.di_size);
  1261 + ip->i_size);
1262 1262 if (error) {
1263 1263 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1264 1264 return error;
... ... @@ -1282,7 +1282,7 @@
1282 1282 xfs_trans_ihold(tp, ip);
1283 1283  
1284 1284 error = xfs_itruncate_finish(&tp, ip,
1285   - ip->i_d.di_size,
  1285 + ip->i_size,
1286 1286 XFS_DATA_FORK,
1287 1287 0);
1288 1288 /*
... ... @@ -1568,7 +1568,7 @@
1568 1568  
1569 1569 if (ip->i_d.di_nlink != 0) {
1570 1570 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1571   - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
  1571 + ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1572 1572 ip->i_delayed_blks > 0)) &&
1573 1573 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1574 1574 (!(ip->i_d.di_flags &
... ... @@ -1629,8 +1629,8 @@
1629 1629 * only one with a reference to the inode.
1630 1630 */
1631 1631 truncate = ((ip->i_d.di_nlink == 0) &&
1632   - ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) ||
1633   - (ip->i_delayed_blks > 0)) &&
  1632 + ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
  1633 + (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1634 1634 ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1635 1635  
1636 1636 mp = ip->i_mount;
... ... @@ -1648,7 +1648,7 @@
1648 1648  
1649 1649 if (ip->i_d.di_nlink != 0) {
1650 1650 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1651   - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
  1651 + ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1652 1652 ip->i_delayed_blks > 0)) &&
1653 1653 (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1654 1654 (!(ip->i_d.di_flags &
1655 1655  
... ... @@ -4055,14 +4055,14 @@
4055 4055 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4056 4056  
4057 4057 /* Generate a DMAPI event if needed. */
4058   - if (alloc_type != 0 && offset < ip->i_d.di_size &&
  4058 + if (alloc_type != 0 && offset < ip->i_size &&
4059 4059 (attr_flags&ATTR_DMI) == 0 &&
4060 4060 DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4061 4061 xfs_off_t end_dmi_offset;
4062 4062  
4063 4063 end_dmi_offset = offset+len;
4064   - if (end_dmi_offset > ip->i_d.di_size)
4065   - end_dmi_offset = ip->i_d.di_size;
  4064 + if (end_dmi_offset > ip->i_size)
  4065 + end_dmi_offset = ip->i_size;
4066 4066 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4067 4067 offset, end_dmi_offset - offset,
4068 4068 0, NULL);
4069 4069  
... ... @@ -4318,11 +4318,11 @@
4318 4318 end_dmi_offset = offset + len;
4319 4319 endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4320 4320  
4321   - if (offset < ip->i_d.di_size &&
  4321 + if (offset < ip->i_size &&
4322 4322 (attr_flags & ATTR_DMI) == 0 &&
4323 4323 DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4324   - if (end_dmi_offset > ip->i_d.di_size)
4325   - end_dmi_offset = ip->i_d.di_size;
  4324 + if (end_dmi_offset > ip->i_size)
  4325 + end_dmi_offset = ip->i_size;
4326 4326 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4327 4327 offset, end_dmi_offset - offset,
4328 4328 AT_DELAY_FLAG(attr_flags), NULL);
... ... @@ -4541,7 +4541,7 @@
4541 4541 bf->l_start += offset;
4542 4542 break;
4543 4543 case 2: /*SEEK_END*/
4544   - bf->l_start += ip->i_d.di_size;
  4544 + bf->l_start += ip->i_size;
4545 4545 break;
4546 4546 default:
4547 4547 return XFS_ERROR(EINVAL);
... ... @@ -4558,7 +4558,7 @@
4558 4558 bf->l_whence = 0;
4559 4559  
4560 4560 startoffset = bf->l_start;
4561   - fsize = ip->i_d.di_size;
  4561 + fsize = ip->i_size;
4562 4562  
4563 4563 /*
4564 4564 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve