Commit a601e63717a269b9171a7164ab9e285788362d1b
Merge tag 'for-linus-v3.7-rc5' of git://oss.sgi.com/xfs/xfs
Pull xfs bugfixes from Ben Myers: - fix for large transactions spanning multiple iclog buffers - zero the allocation_args structure on the stack before using it to determine whether to use a worker for allocation - move allocation stack switch to xfs_bmapi_allocate in order to prevent deadlock on AGF buffers - growfs no longer reads in garbage for new secondary superblocks - silence a build warning - ensure that invalid buffers never get written to disk while on free list - don't vmap inode cluster buffers during free - fix buffer shutdown reference count mismatch - fix reading of wrapped log data * tag 'for-linus-v3.7-rc5' of git://oss.sgi.com/xfs/xfs: xfs: fix reading of wrapped log data xfs: fix buffer shudown reference count mismatch xfs: don't vmap inode cluster buffers during free xfs: invalidate allocbt blocks moved to the free list xfs: silence uninitialised f.file warning. xfs: growfs: don't read garbage for new secondary superblocks xfs: move allocation stack switch up to xfs_bmapi_allocate xfs: introduce XFS_BMAPI_STACK_SWITCH xfs: zero allocation_args on the kernel stack xfs: only update the last_sync_lsn when a transaction completes
Showing 13 changed files Side-by-side Diff
fs/xfs/xfs_alloc.c
... | ... | @@ -1866,6 +1866,7 @@ |
1866 | 1866 | /* |
1867 | 1867 | * Initialize the args structure. |
1868 | 1868 | */ |
1869 | + memset(&targs, 0, sizeof(targs)); | |
1869 | 1870 | targs.tp = tp; |
1870 | 1871 | targs.mp = mp; |
1871 | 1872 | targs.agbp = agbp; |
... | ... | @@ -2207,7 +2208,7 @@ |
2207 | 2208 | * group or loop over the allocation groups to find the result. |
2208 | 2209 | */ |
2209 | 2210 | int /* error */ |
2210 | -__xfs_alloc_vextent( | |
2211 | +xfs_alloc_vextent( | |
2211 | 2212 | xfs_alloc_arg_t *args) /* allocation argument structure */ |
2212 | 2213 | { |
2213 | 2214 | xfs_agblock_t agsize; /* allocation group size */ |
... | ... | @@ -2415,46 +2416,6 @@ |
2415 | 2416 | error0: |
2416 | 2417 | xfs_perag_put(args->pag); |
2417 | 2418 | return error; |
2418 | -} | |
2419 | - | |
2420 | -static void | |
2421 | -xfs_alloc_vextent_worker( | |
2422 | - struct work_struct *work) | |
2423 | -{ | |
2424 | - struct xfs_alloc_arg *args = container_of(work, | |
2425 | - struct xfs_alloc_arg, work); | |
2426 | - unsigned long pflags; | |
2427 | - | |
2428 | - /* we are in a transaction context here */ | |
2429 | - current_set_flags_nested(&pflags, PF_FSTRANS); | |
2430 | - | |
2431 | - args->result = __xfs_alloc_vextent(args); | |
2432 | - complete(args->done); | |
2433 | - | |
2434 | - current_restore_flags_nested(&pflags, PF_FSTRANS); | |
2435 | -} | |
2436 | - | |
2437 | -/* | |
2438 | - * Data allocation requests often come in with little stack to work on. Push | |
2439 | - * them off to a worker thread so there is lots of stack to use. Metadata | |
2440 | - * requests, OTOH, are generally from low stack usage paths, so avoid the | |
2441 | - * context switch overhead here. | |
2442 | - */ | |
2443 | -int | |
2444 | -xfs_alloc_vextent( | |
2445 | - struct xfs_alloc_arg *args) | |
2446 | -{ | |
2447 | - DECLARE_COMPLETION_ONSTACK(done); | |
2448 | - | |
2449 | - if (!args->userdata) | |
2450 | - return __xfs_alloc_vextent(args); | |
2451 | - | |
2452 | - | |
2453 | - args->done = &done; | |
2454 | - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); | |
2455 | - queue_work(xfs_alloc_wq, &args->work); | |
2456 | - wait_for_completion(&done); | |
2457 | - return args->result; | |
2458 | 2419 | } |
2459 | 2420 | |
2460 | 2421 | /* |
fs/xfs/xfs_alloc.h
... | ... | @@ -120,9 +120,6 @@ |
120 | 120 | char isfl; /* set if is freelist blocks - !acctg */ |
121 | 121 | char userdata; /* set if this is user data */ |
122 | 122 | xfs_fsblock_t firstblock; /* io first block allocated */ |
123 | - struct completion *done; | |
124 | - struct work_struct work; | |
125 | - int result; | |
126 | 123 | } xfs_alloc_arg_t; |
127 | 124 | |
128 | 125 | /* |
fs/xfs/xfs_alloc_btree.c
fs/xfs/xfs_bmap.c
... | ... | @@ -2437,6 +2437,7 @@ |
2437 | 2437 | * Normal allocation, done through xfs_alloc_vextent. |
2438 | 2438 | */ |
2439 | 2439 | tryagain = isaligned = 0; |
2440 | + memset(&args, 0, sizeof(args)); | |
2440 | 2441 | args.tp = ap->tp; |
2441 | 2442 | args.mp = mp; |
2442 | 2443 | args.fsbno = ap->blkno; |
... | ... | @@ -3082,6 +3083,7 @@ |
3082 | 3083 | * Convert to a btree with two levels, one record in root. |
3083 | 3084 | */ |
3084 | 3085 | XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); |
3086 | + memset(&args, 0, sizeof(args)); | |
3085 | 3087 | args.tp = tp; |
3086 | 3088 | args.mp = mp; |
3087 | 3089 | args.firstblock = *firstblock; |
... | ... | @@ -3237,6 +3239,7 @@ |
3237 | 3239 | xfs_buf_t *bp; /* buffer for extent block */ |
3238 | 3240 | xfs_bmbt_rec_host_t *ep;/* extent record pointer */ |
3239 | 3241 | |
3242 | + memset(&args, 0, sizeof(args)); | |
3240 | 3243 | args.tp = tp; |
3241 | 3244 | args.mp = ip->i_mount; |
3242 | 3245 | args.firstblock = *firstblock; |
3243 | 3246 | |
... | ... | @@ -4616,12 +4619,11 @@ |
4616 | 4619 | |
4617 | 4620 | |
4618 | 4621 | STATIC int |
4619 | -xfs_bmapi_allocate( | |
4620 | - struct xfs_bmalloca *bma, | |
4621 | - int flags) | |
4622 | +__xfs_bmapi_allocate( | |
4623 | + struct xfs_bmalloca *bma) | |
4622 | 4624 | { |
4623 | 4625 | struct xfs_mount *mp = bma->ip->i_mount; |
4624 | - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | |
4626 | + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? | |
4625 | 4627 | XFS_ATTR_FORK : XFS_DATA_FORK; |
4626 | 4628 | struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); |
4627 | 4629 | int tmp_logflags = 0; |
4628 | 4630 | |
4629 | 4631 | |
4630 | 4632 | |
... | ... | @@ -4654,24 +4656,27 @@ |
4654 | 4656 | * Indicate if this is the first user data in the file, or just any |
4655 | 4657 | * user data. |
4656 | 4658 | */ |
4657 | - if (!(flags & XFS_BMAPI_METADATA)) { | |
4659 | + if (!(bma->flags & XFS_BMAPI_METADATA)) { | |
4658 | 4660 | bma->userdata = (bma->offset == 0) ? |
4659 | 4661 | XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; |
4660 | 4662 | } |
4661 | 4663 | |
4662 | - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; | |
4664 | + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; | |
4663 | 4665 | |
4664 | 4666 | /* |
4665 | 4667 | * Only want to do the alignment at the eof if it is userdata and |
4666 | 4668 | * allocation length is larger than a stripe unit. |
4667 | 4669 | */ |
4668 | 4670 | if (mp->m_dalign && bma->length >= mp->m_dalign && |
4669 | - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { | |
4671 | + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { | |
4670 | 4672 | error = xfs_bmap_isaeof(bma, whichfork); |
4671 | 4673 | if (error) |
4672 | 4674 | return error; |
4673 | 4675 | } |
4674 | 4676 | |
4677 | + if (bma->flags & XFS_BMAPI_STACK_SWITCH) | |
4678 | + bma->stack_switch = 1; | |
4679 | + | |
4675 | 4680 | error = xfs_bmap_alloc(bma); |
4676 | 4681 | if (error) |
4677 | 4682 | return error; |
... | ... | @@ -4706,7 +4711,7 @@ |
4706 | 4711 | * A wasdelay extent has been initialized, so shouldn't be flagged |
4707 | 4712 | * as unwritten. |
4708 | 4713 | */ |
4709 | - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && | |
4714 | + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && | |
4710 | 4715 | xfs_sb_version_hasextflgbit(&mp->m_sb)) |
4711 | 4716 | bma->got.br_state = XFS_EXT_UNWRITTEN; |
4712 | 4717 | |
... | ... | @@ -4734,6 +4739,45 @@ |
4734 | 4739 | return 0; |
4735 | 4740 | } |
4736 | 4741 | |
4742 | +static void | |
4743 | +xfs_bmapi_allocate_worker( | |
4744 | + struct work_struct *work) | |
4745 | +{ | |
4746 | + struct xfs_bmalloca *args = container_of(work, | |
4747 | + struct xfs_bmalloca, work); | |
4748 | + unsigned long pflags; | |
4749 | + | |
4750 | + /* we are in a transaction context here */ | |
4751 | + current_set_flags_nested(&pflags, PF_FSTRANS); | |
4752 | + | |
4753 | + args->result = __xfs_bmapi_allocate(args); | |
4754 | + complete(args->done); | |
4755 | + | |
4756 | + current_restore_flags_nested(&pflags, PF_FSTRANS); | |
4757 | +} | |
4758 | + | |
4759 | +/* | |
4760 | + * Some allocation requests often come in with little stack to work on. Push | |
4761 | + * them off to a worker thread so there is lots of stack to use. Otherwise just | |
4762 | + * call directly to avoid the context switch overhead here. | |
4763 | + */ | |
4764 | +int | |
4765 | +xfs_bmapi_allocate( | |
4766 | + struct xfs_bmalloca *args) | |
4767 | +{ | |
4768 | + DECLARE_COMPLETION_ONSTACK(done); | |
4769 | + | |
4770 | + if (!args->stack_switch) | |
4771 | + return __xfs_bmapi_allocate(args); | |
4772 | + | |
4773 | + | |
4774 | + args->done = &done; | |
4775 | + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); | |
4776 | + queue_work(xfs_alloc_wq, &args->work); | |
4777 | + wait_for_completion(&done); | |
4778 | + return args->result; | |
4779 | +} | |
4780 | + | |
4737 | 4781 | STATIC int |
4738 | 4782 | xfs_bmapi_convert_unwritten( |
4739 | 4783 | struct xfs_bmalloca *bma, |
... | ... | @@ -4919,6 +4963,7 @@ |
4919 | 4963 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); |
4920 | 4964 | bma.wasdel = wasdelay; |
4921 | 4965 | bma.offset = bno; |
4966 | + bma.flags = flags; | |
4922 | 4967 | |
4923 | 4968 | /* |
4924 | 4969 | * There's a 32/64 bit type mismatch between the |
... | ... | @@ -4934,7 +4979,7 @@ |
4934 | 4979 | |
4935 | 4980 | ASSERT(len > 0); |
4936 | 4981 | ASSERT(bma.length > 0); |
4937 | - error = xfs_bmapi_allocate(&bma, flags); | |
4982 | + error = xfs_bmapi_allocate(&bma); | |
4938 | 4983 | if (error) |
4939 | 4984 | goto error0; |
4940 | 4985 | if (bma.blkno == NULLFSBLOCK) |
fs/xfs/xfs_bmap.h
... | ... | @@ -77,6 +77,7 @@ |
77 | 77 | * from written to unwritten, otherwise convert from unwritten to written. |
78 | 78 | */ |
79 | 79 | #define XFS_BMAPI_CONVERT 0x040 |
80 | +#define XFS_BMAPI_STACK_SWITCH 0x080 | |
80 | 81 | |
81 | 82 | #define XFS_BMAPI_FLAGS \ |
82 | 83 | { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ |
... | ... | @@ -85,7 +86,8 @@ |
85 | 86 | { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ |
86 | 87 | { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ |
87 | 88 | { XFS_BMAPI_CONTIG, "CONTIG" }, \ |
88 | - { XFS_BMAPI_CONVERT, "CONVERT" } | |
89 | + { XFS_BMAPI_CONVERT, "CONVERT" }, \ | |
90 | + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } | |
89 | 91 | |
90 | 92 | |
91 | 93 | static inline int xfs_bmapi_aflag(int w) |
... | ... | @@ -133,6 +135,11 @@ |
133 | 135 | char userdata;/* set if is user data */ |
134 | 136 | char aeof; /* allocated space at eof */ |
135 | 137 | char conv; /* overwriting unwritten extents */ |
138 | + char stack_switch; | |
139 | + int flags; | |
140 | + struct completion *done; | |
141 | + struct work_struct work; | |
142 | + int result; | |
136 | 143 | } xfs_bmalloca_t; |
137 | 144 | |
138 | 145 | /* |
fs/xfs/xfs_buf_item.c
... | ... | @@ -526,7 +526,25 @@ |
526 | 526 | } |
527 | 527 | xfs_buf_relse(bp); |
528 | 528 | } else if (freed && remove) { |
529 | + /* | |
530 | + * There are currently two references to the buffer - the active | |
531 | + * LRU reference and the buf log item. What we are about to do | |
532 | + * here - simulate a failed IO completion - requires 3 | |
533 | + * references. | |
534 | + * | |
535 | + * The LRU reference is removed by the xfs_buf_stale() call. The | |
536 | + * buf item reference is removed by the xfs_buf_iodone() | |
537 | + * callback that is run by xfs_buf_do_callbacks() during ioend | |
538 | + * processing (via the bp->b_iodone callback), and then finally | |
539 | + * the ioend processing will drop the IO reference if the buffer | |
540 | + * is marked XBF_ASYNC. | |
541 | + * | |
542 | + * Hence we need to take an additional reference here so that IO | |
543 | + * completion processing doesn't free the buffer prematurely. | |
544 | + */ | |
529 | 545 | xfs_buf_lock(bp); |
546 | + xfs_buf_hold(bp); | |
547 | + bp->b_flags |= XBF_ASYNC; | |
530 | 548 | xfs_buf_ioerror(bp, EIO); |
531 | 549 | XFS_BUF_UNDONE(bp); |
532 | 550 | xfs_buf_stale(bp); |
fs/xfs/xfs_fsops.c
... | ... | @@ -399,9 +399,26 @@ |
399 | 399 | |
400 | 400 | /* update secondary superblocks. */ |
401 | 401 | for (agno = 1; agno < nagcount; agno++) { |
402 | - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, | |
402 | + error = 0; | |
403 | + /* | |
404 | + * new secondary superblocks need to be zeroed, not read from | |
405 | + * disk as the contents of the new area we are growing into is | |
406 | + * completely unknown. | |
407 | + */ | |
408 | + if (agno < oagcount) { | |
409 | + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, | |
403 | 410 | XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), |
404 | 411 | XFS_FSS_TO_BB(mp, 1), 0, &bp); |
412 | + } else { | |
413 | + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, | |
414 | + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), | |
415 | + XFS_FSS_TO_BB(mp, 1), 0); | |
416 | + if (bp) | |
417 | + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); | |
418 | + else | |
419 | + error = ENOMEM; | |
420 | + } | |
421 | + | |
405 | 422 | if (error) { |
406 | 423 | xfs_warn(mp, |
407 | 424 | "error %d reading secondary superblock for ag %d", |
... | ... | @@ -423,7 +440,7 @@ |
423 | 440 | break; /* no point in continuing */ |
424 | 441 | } |
425 | 442 | } |
426 | - return 0; | |
443 | + return error; | |
427 | 444 | |
428 | 445 | error0: |
429 | 446 | xfs_trans_cancel(tp, XFS_TRANS_ABORT); |
fs/xfs/xfs_ialloc.c
fs/xfs/xfs_inode.c
... | ... | @@ -1509,7 +1509,8 @@ |
1509 | 1509 | * to mark all the active inodes on the buffer stale. |
1510 | 1510 | */ |
1511 | 1511 | bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, |
1512 | - mp->m_bsize * blks_per_cluster, 0); | |
1512 | + mp->m_bsize * blks_per_cluster, | |
1513 | + XBF_UNMAPPED); | |
1513 | 1514 | |
1514 | 1515 | if (!bp) |
1515 | 1516 | return ENOMEM; |
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
... | ... | @@ -584,7 +584,9 @@ |
584 | 584 | * pointer that the caller gave to us. |
585 | 585 | */ |
586 | 586 | error = xfs_bmapi_write(tp, ip, map_start_fsb, |
587 | - count_fsb, 0, &first_block, 1, | |
587 | + count_fsb, | |
588 | + XFS_BMAPI_STACK_SWITCH, | |
589 | + &first_block, 1, | |
588 | 590 | imap, &nimaps, &free_list); |
589 | 591 | if (error) |
590 | 592 | goto trans_cancel; |
fs/xfs/xfs_log.c
... | ... | @@ -2387,14 +2387,27 @@ |
2387 | 2387 | |
2388 | 2388 | |
2389 | 2389 | /* |
2390 | - * update the last_sync_lsn before we drop the | |
2390 | + * Completion of an iclog IO does not imply that | 
2391 | + * a transaction has completed, as transactions | |
2392 | + * can be large enough to span many iclogs. We | |
2393 | + * cannot change the tail of the log half way | |
2394 | + * through a transaction as this may be the only | |
2395 | + * transaction in the log and moving the tail to | 
2396 | + * point to the middle of it will prevent | |
2397 | + * recovery from finding the start of the | |
2398 | + * transaction. Hence we should only update the | |
2399 | + * last_sync_lsn if this iclog contains | |
2400 | + * transaction completion callbacks on it. | |
2401 | + * | |
2402 | + * We have to do this before we drop the | |
2391 | 2403 | * icloglock to ensure we are the only one that |
2392 | 2404 | * can update it. |
2393 | 2405 | */ |
2394 | 2406 | ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), |
2395 | 2407 | be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); |
2396 | - atomic64_set(&log->l_last_sync_lsn, | |
2397 | - be64_to_cpu(iclog->ic_header.h_lsn)); | |
2408 | + if (iclog->ic_callback) | |
2409 | + atomic64_set(&log->l_last_sync_lsn, | |
2410 | + be64_to_cpu(iclog->ic_header.h_lsn)); | |
2398 | 2411 | |
2399 | 2412 | } else |
2400 | 2413 | ioerrors++; |
fs/xfs/xfs_log_recover.c