Commit 79b6fad5 authored by Linus Torvalds

Merge tag 'xfs-6.4-rc5-fixes' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Dave Chinner:
 "These are a set of regression fixes discovered on recent kernels. I
  was hoping to send this to you a week and a half ago, but events out of
  my control delayed finalising the changes until early this week.

  Whilst the diffstat looks large for this stage of the merge window, a
  large chunk of it comes from moving the guts of one function from one
  file to another, i.e. it's the same code, just run in a different
  context where it is safe to hold a specific lock. Otherwise the
  individual changes are relatively small and straightforward.

  Summary:

   - Propagate unlinked inode list corruption back up to log recovery
     (regression fix)

   - Improve corruption detection for AGFL entries, AGFL indexes and
     XEFI extents (syzkaller fuzzer oops report)

   - Avoid double perag reference release (regression fix)

   - Improve extent merging detection in scrub (regression fix)

   - Fix a new undefined high bit shift (regression fix)

   - Fix for AGF vs inode cluster buffer deadlock (regression fix)"

* tag 'xfs-6.4-rc5-fixes' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: collect errors from inodegc for unlinked inode recovery
  xfs: validate block number being freed before adding to xefi
  xfs: validity check agbnos on the AGFL
  xfs: fix agf/agfl verification on v4 filesystems
  xfs: fix double xfs_perag_rele() in xfs_filestream_pick_ag()
  xfs: fix broken logic when detecting mergeable bmap records
  xfs: Fix undefined behavior of shift into sign bit
  xfs: fix AGF vs inode cluster buffer deadlock
  xfs: defered work could create precommits
  xfs: restore allocation trylock iteration
  xfs: buffer pins need to hold a buffer reference
parents 5f63595e d4d12c02
......@@ -984,7 +984,10 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
__xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
true);
if (err2)
goto resv_err;
/*
* Roll the transaction before trying to re-init the per-ag
......
......@@ -628,6 +628,25 @@ xfs_alloc_fixup_trees(
return 0;
}
/*
* We do not verify the AGFL contents against AGF-based index counters here,
* even though we may have access to the perag that contains shadow copies. We
* don't know if the AGF-based counters have been checked, and even if they have,
* they may still be inconsistent because they haven't yet been reset on the first
* allocation after the AGF has been read in.
*
* This means we can only check that all agfl entries contain valid or null
* values because we can't reliably determine the active range to exclude
* NULLAGBNO as a valid value.
*
* However, we can't even do that for v4 format filesystems because there are
* old versions of mkfs out there that do not initialise the AGFL to known,
* verifiable values. Hence we can't tell the difference between an AGFL block
* allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
*
* As a result, we can only fully validate AGFL block numbers when we pull them
* from the freelist in xfs_alloc_get_freelist().
*/
static xfs_failaddr_t
xfs_agfl_verify(
struct xfs_buf *bp)
......@@ -637,12 +656,6 @@ xfs_agfl_verify(
__be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
int i;
/*
* There is no verification of non-crc AGFLs because mkfs does not
* initialise the AGFL to zero or NULL. Hence the only valid part of the
* AGFL is what the AGF says is active. We can't get to the AGF, so we
* can't verify just those entries are valid.
*/
if (!xfs_has_crc(mp))
return NULL;
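
The "valid or null" entry check described in the comment above reduces to a per-slot bounds test. A minimal sketch of that idea (illustrative only; the rest of xfs_agfl_verify() is elided from this hunk, so this is not a quote of the patch):

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		xfs_agblock_t agbno = be32_to_cpu(agfl_bno[i]);

		/* Each slot must be the NULL sentinel or a block inside the AG. */
		if (agbno != NULLAGBLOCK && agbno >= mp->m_sb.sb_agblocks)
			return __this_address;
	}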
......@@ -2321,12 +2334,16 @@ xfs_free_agfl_block(
}
/*
* Check the agfl fields of the agf for inconsistency or corruption. The purpose
* is to detect an agfl header padding mismatch between current and early v5
* kernels. This problem manifests as a 1-slot size difference between the
* on-disk flcount and the active [first, last] range of a wrapped agfl. This
* may also catch variants of agfl count corruption unrelated to padding. Either
* way, we'll reset the agfl and warn the user.
* Check the agfl fields of the agf for inconsistency or corruption.
*
* The original purpose was to detect an agfl header padding mismatch between
* current and early v5 kernels. This problem manifests as a 1-slot size
* difference between the on-disk flcount and the active [first, last] range of
* a wrapped agfl.
*
* However, we need to use these same checks to catch agfl count corruptions
* unrelated to padding. This could occur on any v4 or v5 filesystem, so either
* way, we need to reset the agfl and warn the user.
*
* Return true if a reset is required before the agfl can be used, false
* otherwise.
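
The "1-slot size difference" above is easiest to see in the arithmetic that derives the active range length from the circular [first, last] indexes. A minimal sketch using the fields visible elsewhere in this diff (agf_flfirst, agf_fllast, agf_flcount, xfs_agfl_size()); the elided function body may differ in detail:

	/* Length of the active region of a circular AGFL with agfl_size slots. */
	active = be32_to_cpu(agf->agf_fllast) - be32_to_cpu(agf->agf_flfirst) + 1;
	if (active <= 0)
		active += agfl_size;	/* the [first, last] range has wrapped */

	/* A padding mismatch (or other count corruption) shows up here. */
	if (active != be32_to_cpu(agf->agf_flcount))
		return true;		/* caller must reset the AGFL before use */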
......@@ -2342,10 +2359,6 @@ xfs_agfl_needs_reset(
int agfl_size = xfs_agfl_size(mp);
int active;
/* no agfl header on v4 supers */
if (!xfs_has_crc(mp))
return false;
/*
* The agf read verifier catches severe corruption of these fields.
* Repeat some sanity checks to cover a packed -> unpacked mismatch if
......@@ -2418,7 +2431,7 @@ xfs_agfl_reset(
* the real allocation can proceed. Deferring the free disconnects freeing up
* the AGFL slot from freeing the block.
*/
STATIC void
static int
xfs_defer_agfl_block(
struct xfs_trans *tp,
xfs_agnumber_t agno,
......@@ -2437,17 +2450,21 @@ xfs_defer_agfl_block(
xefi->xefi_blockcount = 1;
xefi->xefi_owner = oinfo->oi_owner;
if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock)))
return -EFSCORRUPTED;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_extent_free_get_group(mp, xefi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
return 0;
}
/*
* Add the extent to the list of extents to be freed at transaction end.
* The list is maintained sorted (by block number).
*/
void
int
__xfs_free_extent_later(
struct xfs_trans *tp,
xfs_fsblock_t bno,
......@@ -2474,6 +2491,9 @@ __xfs_free_extent_later(
#endif
ASSERT(xfs_extfree_item_cache != NULL);
if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
return -EFSCORRUPTED;
xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
xefi->xefi_startblock = bno;
......@@ -2497,6 +2517,7 @@ __xfs_free_extent_later(
xfs_extent_free_get_group(mp, xefi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
return 0;
}
#ifdef DEBUG
......@@ -2657,7 +2678,9 @@ xfs_alloc_fix_freelist(
goto out_agbp_relse;
/* defer agfl frees */
xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
if (error)
goto out_agbp_relse;
}
targs.tp = tp;
......@@ -2767,6 +2790,9 @@ xfs_alloc_get_freelist(
*/
agfl_bno = xfs_buf_to_agfl_bno(agflbp);
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
return -EFSCORRUPTED;
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
......@@ -2889,6 +2915,19 @@ xfs_alloc_put_freelist(
return 0;
}
/*
* Verify the AGF is consistent.
*
* We do not verify the AGFL indexes in the AGF are fully consistent here
* because of issues with variable on-disk structure sizes. Instead, we check
* the agfl indexes for consistency when we initialise the perag from the AGF
* information after a read completes.
*
* If the index is inconsistent, then we mark the perag as needing an AGFL
* reset. The first AGFL update performed then resets the AGFL indexes and
* refills the AGFL with known good free blocks, allowing the filesystem to
* continue operating normally at the cost of a few leaked free space blocks.
*/
static xfs_failaddr_t
xfs_agf_verify(
struct xfs_buf *bp)
......@@ -2962,7 +3001,6 @@ xfs_agf_verify(
return __this_address;
return NULL;
}
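
To connect the comment above xfs_agf_verify() with the reset machinery named in the hunk headers earlier in this file (xfs_agfl_needs_reset(), xfs_agfl_reset()), the consumer side looks roughly like the sketch below. The argument lists are assumptions based on those names, and in the real code the result of the check is cached in the perag when the AGF is read; it is shown inline here purely for illustration:

	/*
	 * Sketch only: before the first AGFL modification, reset an AGFL whose
	 * indexes disagree with the on-disk flcount instead of failing.
	 */
	if (xfs_agfl_needs_reset(mp, agf))
		xfs_agfl_reset(tp, agbp, pag);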
static void
......@@ -3187,7 +3225,8 @@ xfs_alloc_vextent_check_args(
*/
static int
xfs_alloc_vextent_prepare_ag(
struct xfs_alloc_arg *args)
struct xfs_alloc_arg *args,
uint32_t flags)
{
bool need_pag = !args->pag;
int error;
......@@ -3196,7 +3235,7 @@ xfs_alloc_vextent_prepare_ag(
args->pag = xfs_perag_get(args->mp, args->agno);
args->agbp = NULL;
error = xfs_alloc_fix_freelist(args, 0);
error = xfs_alloc_fix_freelist(args, flags);
if (error) {
trace_xfs_alloc_vextent_nofix(args);
if (need_pag)
......@@ -3336,7 +3375,7 @@ xfs_alloc_vextent_this_ag(
return error;
}
error = xfs_alloc_vextent_prepare_ag(args);
error = xfs_alloc_vextent_prepare_ag(args, 0);
if (!error && args->agbp)
error = xfs_alloc_ag_vextent_size(args);
......@@ -3380,7 +3419,7 @@ xfs_alloc_vextent_iterate_ags(
for_each_perag_wrap_range(mp, start_agno, restart_agno,
mp->m_sb.sb_agcount, agno, args->pag) {
args->agno = agno;
error = xfs_alloc_vextent_prepare_ag(args);
error = xfs_alloc_vextent_prepare_ag(args, flags);
if (error)
break;
if (!args->agbp) {
......@@ -3546,7 +3585,7 @@ xfs_alloc_vextent_exact_bno(
return error;
}
error = xfs_alloc_vextent_prepare_ag(args);
error = xfs_alloc_vextent_prepare_ag(args, 0);
if (!error && args->agbp)
error = xfs_alloc_ag_vextent_exact(args);
......@@ -3587,7 +3626,7 @@ xfs_alloc_vextent_near_bno(
if (needs_perag)
args->pag = xfs_perag_grab(mp, args->agno);
error = xfs_alloc_vextent_prepare_ag(args);
error = xfs_alloc_vextent_prepare_ag(args, 0);
if (!error && args->agbp)
error = xfs_alloc_ag_vextent_near(args);
......
......@@ -230,7 +230,7 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
......@@ -254,14 +254,14 @@ void xfs_extent_free_get_group(struct xfs_mount *mp,
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
static inline void
static inline int
xfs_free_extent_later(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
const struct xfs_owner_info *oinfo)
{
__xfs_free_extent_later(tp, bno, len, oinfo, false);
return __xfs_free_extent_later(tp, bno, len, oinfo, false);
}
......
......@@ -572,8 +572,12 @@ xfs_bmap_btree_to_extents(
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
if (error)
return error;
ip->i_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
......@@ -5230,10 +5234,12 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
__xfs_free_extent_later(tp, del->br_startblock,
error = __xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
(bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN);
if (error)
goto done;
}
}
......
......@@ -268,11 +268,14 @@ xfs_bmbt_free_block(
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
struct xfs_owner_info oinfo;
int error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
ip->i_nblocks--;
error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
if (error)
return error;
ip->i_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
return 0;
......
......@@ -1834,7 +1834,7 @@ xfs_dialloc(
* might be sparse and only free the regions that are allocated as part of the
* chunk.
*/
STATIC void
static int
xfs_difree_inode_chunk(
struct xfs_trans *tp,
xfs_agnumber_t agno,
......@@ -1851,10 +1851,10 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
return xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks,
&XFS_RMAP_OINFO_INODES);
return;
}
/* holemask is only 16-bits (fits in an unsigned long) */
......@@ -1871,6 +1871,8 @@ xfs_difree_inode_chunk(
XFS_INOBT_HOLEMASK_BITS);
nextbit = startidx + 1;
while (startidx < XFS_INOBT_HOLEMASK_BITS) {
int error;
nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
nextbit);
/*
......@@ -1896,8 +1898,11 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
error = xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, agbno),
contigblk, &XFS_RMAP_OINFO_INODES);
if (error)
return error;
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
......@@ -1905,6 +1910,7 @@ xfs_difree_inode_chunk(
next:
nextbit++;
}
return 0;
}
STATIC int
......@@ -2003,7 +2009,9 @@ xfs_difree_inobt(
goto error0;
}
xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
if (error)
goto error0;
} else {
xic->deleted = false;
......
......@@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 {
#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
/*
* The timestamps are dirty, but not necessarily anything else in the inode
* core. Unlike the other fields above this one must never make it to disk
......@@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 {
*/
#define XFS_ILOG_TIMESTAMP 0x4000
/*
* The version field has been changed, but not necessarily anything else of
* interest. This must never make it to disk - it is used purely to ensure that
* the inode item ->precommit operation can update the fsync flag triggers
* in the inode item correctly.
*/
#define XFS_ILOG_IVERSION 0x8000
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
......
......@@ -1151,8 +1151,10 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
xfs_free_extent_later(cur->bc_tp, fsbno,
error = xfs_free_extent_later(cur->bc_tp, fsbno,
tmp.rc_blockcount, NULL);
if (error)
goto out_error;
}
(*agbno) += tmp.rc_blockcount;
......@@ -1210,8 +1212,10 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
xfs_free_extent_later(cur->bc_tp, fsbno,
error = xfs_free_extent_later(cur->bc_tp, fsbno,
ext.rc_blockcount, NULL);
if (error)
goto out_error;
}
skip:
......@@ -1976,7 +1980,10 @@ xfs_refcount_recover_cow_leftovers(
rr->rr_rrec.rc_blockcount);
/* Free the block. */
xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL);
if (error)
goto out_trans;
error = xfs_trans_commit(tp);
if (error)
......
......@@ -40,9 +40,8 @@ xfs_trans_ijoin(
iip->ili_lock_flags = lock_flags;
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
/*
* Get a log_item_desc to point at the new item.
*/
/* Reset the per-tx dirty context and add the item to the tx. */
iip->ili_dirty_flags = 0;
xfs_trans_add_item(tp, &iip->ili_item);
}
......@@ -76,17 +75,10 @@ xfs_trans_ichgtime(
/*
* This is called to mark the fields indicated in fieldmask as needing to be
* logged when the transaction is committed. The inode must already be
* associated with the given transaction.
*
* The values for fieldmask are defined in xfs_inode_item.h. We always log all
* of the core inode if any of it has changed, and we always log all of the
* inline data/extents/b-tree root if any of them has changed.
*
* Grab and pin the cluster buffer associated with this inode to avoid RMW
* cycles at inode writeback time. Avoid the need to add error handling to every
* xfs_trans_log_inode() call by shutting down on read error. This will cause
* transactions to fail and everything to error out, just like if we return a
* read error in a dirty transaction and cancel it.
* associated with the given transaction. All we do here is record where the
* inode was dirtied and mark the transaction and inode log item dirty;
* everything else is done in the ->precommit log item operation after the
* changes in the transaction have been completed.
*/
void
xfs_trans_log_inode(
......@@ -96,7 +88,6 @@ xfs_trans_log_inode(
{
struct xfs_inode_log_item *iip = ip->i_itemp;
struct inode *inode = VFS_I(ip);
uint iversion_flags = 0;
ASSERT(iip);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
......@@ -104,18 +95,6 @@ xfs_trans_log_inode(
tp->t_flags |= XFS_TRANS_DIRTY;
/*
* Don't bother with i_lock for the I_DIRTY_TIME check here, as races
* don't matter - we either will need an extra transaction in 24 hours
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
}
/*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. While we have the
......@@ -128,86 +107,10 @@ xfs_trans_log_inode(
if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
if (IS_I_VERSION(inode) &&
inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
iversion_flags = XFS_ILOG_CORE;
}
/*
* If we're updating the inode core or the timestamps and it's possible
* to upgrade this inode to bigtime format, do so now.
*/
if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
xfs_has_bigtime(ip->i_mount) &&
!xfs_inode_has_bigtime(ip)) {
ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
flags |= XFS_ILOG_CORE;
}
/*
* Inode verifiers do not check that the extent size hint is an integer
* multiple of the rt extent size on a directory with both rtinherit
* and extszinherit flags set. If we're logging a directory that is
* misconfigured in this way, clear the hint.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
(ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
flags |= XFS_ILOG_CORE;
flags |= XFS_ILOG_IVERSION;
}
/*
* Record the specific change for fdatasync optimisation. This allows
* fdatasync to skip log forces for inodes that are only timestamp
* dirty.
*/
spin_lock(&iip->ili_lock);
iip->ili_fsync_fields |= flags;
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
/*
* We hold the ILOCK here, so this inode is not going to be
* flushed while we are here. Further, because there is no
* buffer attached to the item, we know that there is no IO in
* progress, so nothing will clear the ili_fields while we read
* in the buffer. Hence we can safely drop the spin lock and
* read the buffer knowing that the state will not change from
* here.
*/
spin_unlock(&iip->ili_lock);
error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
if (error) {
xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
return;
}
/*
* We need an explicit buffer reference for the log item but
* don't want the buffer to remain attached to the transaction.
* Hold the buffer but release the transaction reference once
* we've attached the inode log item to the buffer log item
* list.
*/
xfs_buf_hold(bp);
spin_lock(&iip->ili_lock);
iip->ili_item.li_buf = bp;
bp->b_flags |= _XBF_INODES;
list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
xfs_trans_brelse(tp, bp);
}
/*
* Always OR in the bits from the ili_last_fields field. This is to
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
* in the eventual clearing of the ili_fields bits. See the big comment
* in xfs_iflush() for an explanation of this coordination mechanism.
*/
iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
spin_unlock(&iip->ili_lock);
iip->ili_dirty_flags |= flags;
}
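
Taken together with the xfs_inode_item changes later in this diff, the new dirty-inode path can be summarised as the following descriptive sketch:

	/*
	 * xfs_trans_log_inode(tp, ip, flags)
	 *	-> iip->ili_dirty_flags |= flags;	(record only, under ILOCK)
	 *
	 * xfs_trans_commit(tp)
	 *	-> ->iop_precommit == xfs_inode_item_precommit()
	 *	   applies ili_dirty_flags to ili_fields/ili_fsync_fields,
	 *	   handles the bigtime/extsize/iversion fixups, and pins the
	 *	   inode cluster buffer last, so the lock order is always
	 *	   AGI -> AGF -> inode cluster buffer.
	 */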
int
......
......@@ -769,14 +769,14 @@ xchk_are_bmaps_contiguous(
* mapping or false if there are no more mappings. Caller must ensure that
* @info.icur is zeroed before the first call.
*/
static int
static bool
xchk_bmap_iext_iter(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_bmbt_irec got;
struct xfs_ifork *ifp;
xfs_filblks_t prev_len;
unsigned int nr = 0;
ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
......@@ -790,12 +790,12 @@ xchk_bmap_iext_iter(
irec->br_startoff);
return false;
}
nr++;
/*
* Iterate subsequent iextent records and merge them with the one
* that we just read, if possible.
*/
prev_len = irec->br_blockcount;
while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
if (!xchk_are_bmaps_contiguous(irec, &got))
break;
......@@ -805,20 +805,21 @@ xchk_bmap_iext_iter(
got.br_startoff);
return false;
}
/*
* Notify the user of mergeable records in the data or attr
* forks. CoW forks only exist in memory so we ignore them.
*/
if (info->whichfork != XFS_COW_FORK &&
prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK)
xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
nr++;
irec->br_blockcount += got.br_blockcount;
prev_len = got.br_blockcount;
xfs_iext_next(ifp, &info->icur);
}
/*
* If the merged mapping could be expressed with fewer bmbt records
* than we actually found, notify the user that this fork could be
* optimized. CoW forks only exist in memory so we ignore them.
*/
if (nr > 1 && info->whichfork != XFS_COW_FORK &&
howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) < nr)
xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
return true;
}
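
A quick worked example of the preen heuristic above (numbers chosen only for illustration; the single assumption is that XFS_MAX_BMBT_EXTLEN caps the length of one on-disk bmbt record):

	/*
	 * Suppose the iterator merged nr = 3 contiguous in-core mappings into
	 * one irec, and the combined br_blockcount still fits in a single
	 * on-disk record. Then
	 *
	 *	howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) == 1 < nr
	 *
	 * so the fork could be described with fewer bmbt records than it
	 * currently uses, and the inode is flagged for preening (an
	 * optimisation hint), not marked corrupt.
	 */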
......
......@@ -105,10 +105,10 @@ struct xfs_scrub {
};
/* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */
#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */
#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
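
This hunk is the "undefined high bit shift" fix from the pull summary: shifting 1 into bit 31 of a signed int is undefined behaviour in C, so the literals gain a U suffix (the low-bit flags are changed too, purely for consistency). A standalone illustration, not XFS code:

	unsigned int bad  = 1 << 31;	/* undefined: 1 << 31 overflows a signed int */
	unsigned int good = 1U << 31;	/* well defined: shift of an unsigned operand */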
/*
* The XCHK_FSGATES* flags reflect functionality in the main filesystem that
......
......@@ -452,10 +452,18 @@ xfs_buf_item_format(
* This is called to pin the buffer associated with the buf log item in memory
* so it cannot be written out.
*
* We also always take a reference to the buffer log item here so that the bli
* is held while the item is pinned in memory. This means that we can
* unconditionally drop the reference count a transaction holds when the
* transaction is completed.
* We take a reference to the buffer log item here so that the BLI life cycle
* extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
* inserted into the AIL.
*
* We also need to take a reference to the buffer itself as the BLI unpin
* processing requires accessing the buffer after the BLI has dropped the final
* BLI reference. See xfs_buf_item_unpin() for an explanation.
* If unpins race to drop the final BLI reference and only the BLI owns a
* reference to the buffer, the loser of the race can have the buffer freed from
* under it (e.g. on shutdown). Taking a buffer reference per pin count ensures
* the life cycle of the buffer extends for as long as we hold the buffer pin
* reference in xfs_buf_item_unpin().
*/
STATIC void
xfs_buf_item_pin(
......@@ -470,13 +478,30 @@ xfs_buf_item_pin(
trace_xfs_buf_item_pin(bip);
xfs_buf_hold(bip->bli_buf);
atomic_inc(&bip->bli_refcount);
atomic_inc(&bip->bli_buf->b_pin_count);
}
/*
* This is called to unpin the buffer associated with the buf log item which
* was previously pinned with a call to xfs_buf_item_pin().
* This is called to unpin the buffer associated with the buf log item which was
* previously pinned with a call to xfs_buf_item_pin(). We enter this function
* with a buffer pin count, a buffer reference and a BLI reference.
*
* We must drop the BLI reference before we unpin the buffer because the AIL
* doesn't acquire a BLI reference whenever it accesses it. Therefore if the
* refcount drops to zero, the bli could still be AIL resident and the buffer
* submitted for I/O at any point before we return. This can result in IO
* completion freeing the buffer while we are still trying to access it here.
* This race condition can also occur in shutdown situations where we abort and
* unpin buffers from contexts other than journal IO completion.
*
* Hence we have to hold a buffer reference per pin count to ensure that the
* buffer cannot be freed until we have finished processing the unpin operation.
* The reference is taken in xfs_buf_item_pin(), and we must hold it until we
* are done processing the buffer state. In the case of an abort (remove =
* true), we re-use the current pin reference as the IO reference we hand
* off to IO failure handling.
*/
STATIC void
xfs_buf_item_unpin(
......@@ -493,24 +518,18 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin(bip);
/*
* Drop the bli ref associated with the pin and grab the hold required
* for the I/O simulation failure in the abort case. We have to do this
* before the pin count drops because the AIL doesn't acquire a bli
* reference. Therefore if the refcount drops to zero, the bli could
* still be AIL resident and the buffer submitted for I/O (and freed on
* completion) at any point before we return. This can be removed once
* the AIL properly holds a reference on the bli.
*/
freed = atomic_dec_and_test(&bip->bli_refcount);
if (freed && !stale && remove)
xfs_buf_hold(bp);
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
/* nothing to do but drop the pin count if the bli is active */
if (!freed)
/*
* Nothing to do but drop the buffer pin reference if the BLI is
* still active.
*/
if (!freed) {
xfs_buf_rele(bp);
return;
}
if (stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
......@@ -522,6 +541,15 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin_stale(bip);
/*
* The buffer has been locked and referenced since it was marked
* stale so we own both lock and reference exclusively here. We
* do not need the pin reference any more, so drop it now so
* that we only have one reference to drop once item completion
* processing is complete.
*/
xfs_buf_rele(bp);
/*
* If we get called here because of an IO error, we may or may
* not have the item on the AIL. xfs_trans_ail_delete() will
......@@ -538,16 +566,30 @@ xfs_buf_item_unpin(
ASSERT(bp->b_log_item == NULL);
}
xfs_buf_relse(bp);
} else if (remove) {
return;
}
if (remove) {
/*
* The buffer must be locked and held by the caller to simulate
* an async I/O failure. We acquired the hold for this case
* before the buffer was unpinned.
* We need to simulate an async IO failure here to ensure that
* the correct error completion is run on this buffer. This
* requires a reference to the buffer and for the buffer to be
* locked. We can safely pass ownership of the pin reference to
* the IO to ensure that nothing can free the buffer while we
* wait for the lock and then run the IO failure completion.
*/
xfs_buf_lock(bp);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
return;
}
/*
* BLI has no more active references - it will be moved to the AIL to
* manage the remaining BLI/buffer life cycle. There is nothing left for
* us to do here so drop the pin reference to the buffer.
*/
xfs_buf_rele(bp);
}
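
For reference, the ownership established by the two functions above can be summarised as follows (a descriptive sketch of what the code in this hunk already does):

	/*
	 * xfs_buf_item_pin():   takes a buffer hold, a BLI reference, and
	 *			 bumps b_pin_count.
	 * xfs_buf_item_unpin(): drops the BLI reference and b_pin_count; the
	 *			 matching buffer hold is released on each exit
	 *			 path, except in the abort (remove == true)
	 *			 case, where it is handed to
	 *			 xfs_buf_ioend_fail() as the IO reference.
	 */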
STATIC uint
......
......@@ -78,7 +78,6 @@ xfs_filestream_pick_ag(
*longest = 0;
err = xfs_bmap_longest_free_extent(pag, NULL, longest);
if (err) {
xfs_perag_rele(pag);
if (err != -EAGAIN)
break;
/* Couldn't lock the AGF, skip this AG. */
......
......@@ -454,6 +454,27 @@ xfs_inodegc_queue_all(
return ret;
}
/* Wait for all queued work and collect errors */
static int
xfs_inodegc_wait_all(
struct xfs_mount *mp)
{
int cpu;
int error = 0;
flush_workqueue(mp->m_inodegc_wq);
for_each_online_cpu(cpu) {
struct xfs_inodegc *gc;
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (gc->error && !error)
error = gc->error;
gc->error = 0;
}
return error;
}
/*
* Check the validity of the inode we just found in the cache
*/
......@@ -1491,15 +1512,14 @@ xfs_blockgc_free_space(
if (error)
return error;
xfs_inodegc_flush(mp);
return 0;
return xfs_inodegc_flush(mp);
}
/*
* Reclaim all the free space that we can by scheduling the background blockgc
* and inodegc workers immediately and waiting for them all to clear.
*/
void
int
xfs_blockgc_flush_all(
struct xfs_mount *mp)
{
......@@ -1520,7 +1540,7 @@ xfs_blockgc_flush_all(
for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
flush_delayed_work(&pag->pag_blockgc_work);
xfs_inodegc_flush(mp);
return xfs_inodegc_flush(mp);
}
/*
......@@ -1842,13 +1862,17 @@ xfs_inodegc_set_reclaimable(
* This is the last chance to make changes to an otherwise unreferenced file
* before incore reclamation happens.
*/
static void
static int
xfs_inodegc_inactivate(
struct xfs_inode *ip)
{
int error;
trace_xfs_inode_inactivating(ip);
xfs_inactive(ip);
error = xfs_inactive(ip);
xfs_inodegc_set_reclaimable(ip);
return error;
}
void
......@@ -1880,8 +1904,12 @@ xfs_inodegc_worker(
WRITE_ONCE(gc->shrinker_hits, 0);
llist_for_each_entry_safe(ip, n, node, i_gclist) {
int error;
xfs_iflags_set(ip, XFS_INACTIVATING);
xfs_inodegc_inactivate(ip);
error = xfs_inodegc_inactivate(ip);
if (error && !gc->error)
gc->error = error;
}
memalloc_nofs_restore(nofs_flag);
......@@ -1905,13 +1933,13 @@ xfs_inodegc_push(
* Force all currently queued inode inactivation work to run immediately and
* wait for the work to finish.
*/
void
int
xfs_inodegc_flush(
struct xfs_mount *mp)
{
xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
flush_workqueue(mp->m_inodegc_wq);
return xfs_inodegc_wait_all(mp);
}
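
Putting the inodegc pieces of this series together, the error plumbing works roughly as sketched below (names taken from the hunks in this file and in xfs_mount.h):

	/*
	 * xfs_inodegc_worker():
	 *	error = xfs_inodegc_inactivate(ip);	// from xfs_inactive()
	 *	if (error && !gc->error)
	 *		gc->error = error;		// first error per CPU wins
	 *
	 * xfs_inodegc_flush() -> xfs_inodegc_wait_all():
	 *	flush_workqueue(mp->m_inodegc_wq), then collect and clear
	 *	gc->error from each online CPU, returning the first error seen
	 *	(e.g. to log recovery's unlinked inode processing).
	 */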
/*
......
......@@ -62,7 +62,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
unsigned int iwalk_flags);
int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
void xfs_blockgc_flush_all(struct xfs_mount *mp);
int xfs_blockgc_flush_all(struct xfs_mount *mp);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
......@@ -80,7 +80,7 @@ void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work);
void xfs_inodegc_push(struct xfs_mount *mp);
void xfs_inodegc_flush(struct xfs_mount *mp);
int xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
......
......@@ -1620,16 +1620,7 @@ xfs_inactive_ifree(
*/
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
/*
* Just ignore errors at this point. There is nothing we can do except
* to try to keep going. Make sure it's not a silent error.
*/
error = xfs_trans_commit(tp);
if (error)
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
__func__, error);
return 0;
return xfs_trans_commit(tp);
}
/*
......@@ -1693,12 +1684,12 @@ xfs_inode_needs_inactive(
* now be truncated. Also, we clear all of the read-ahead state
* kept for the inode here since the file is now closed.
*/
void
int
xfs_inactive(
xfs_inode_t *ip)
{
struct xfs_mount *mp;
int error;
int error = 0;
int truncate = 0;
/*
......@@ -1736,7 +1727,7 @@ xfs_inactive(
* reference to the inode at this point anyways.
*/
if (xfs_can_free_eofblocks(ip, true))
xfs_free_eofblocks(ip);
error = xfs_free_eofblocks(ip);
goto out;
}
......@@ -1773,7 +1764,7 @@ xfs_inactive(
/*
* Free the inode.
*/
xfs_inactive_ifree(ip);
error = xfs_inactive_ifree(ip);
out:
/*
......@@ -1781,6 +1772,7 @@ xfs_inactive(
* the attached dquots.
*/
xfs_qm_dqdetach(ip);
return error;
}
/*
......
......@@ -470,7 +470,7 @@ enum layout_break_reason {
(xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct mnt_idmap *idmap,
......
......@@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
static uint64_t
xfs_inode_item_sort(
struct xfs_log_item *lip)
{
return INODE_ITEM(lip)->ili_inode->i_ino;
}
/*
* Prior to finally logging the inode, we have to ensure that all the
* per-modification inode state changes are applied. This includes VFS inode
* state updates, format conversions, verifier state synchronisation and
* ensuring the inode buffer remains in memory whilst the inode is dirty.
*
* We have to be careful when we grab the inode cluster buffer due to lock
* ordering constraints. The unlinked inode modifications (xfs_iunlink_item)
* require AGI -> inode cluster buffer lock order. The inode cluster buffer is
* not locked until ->precommit, so it happens after everything else has been
* modified.
*
* Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we
* have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we
* cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because
* it can be called on an inode (e.g. via bumplink/droplink) before we take the
* AGF lock modifying directory blocks.
*
* Rather than force a complete rework of all the transactions to call
* xfs_trans_log_inode() once and once only at the end of every transaction, we
* move the pinning of the inode cluster buffer to a ->precommit operation. This
* matches how the xfs_iunlink_item locks the inode cluster buffer, and it
* ensures that the inode cluster buffer locking is always done last in a
* transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode
* cluster buffer.
*
* If we return the inode number as the precommit sort key then we'll also
* guarantee that the inode cluster buffer locking order is the same for all the
* inodes and unlink items in the transaction.
*/
static int
xfs_inode_item_precommit(
struct xfs_trans *tp,
struct xfs_log_item *lip)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
struct inode *inode = VFS_I(ip);
unsigned int flags = iip->ili_dirty_flags;
/*
* Don't bother with i_lock for the I_DIRTY_TIME check here, as races
* don't matter - we either will need an extra transaction in 24 hours
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
}
/*
* If we're updating the inode core or the timestamps and it's possible
* to upgrade this inode to bigtime format, do so now.
*/
if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
xfs_has_bigtime(ip->i_mount) &&
!xfs_inode_has_bigtime(ip)) {
ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
flags |= XFS_ILOG_CORE;
}
/*
* Inode verifiers do not check that the extent size hint is an integer
* multiple of the rt extent size on a directory with both rtinherit
* and extszinherit flags set. If we're logging a directory that is
* misconfigured in this way, clear the hint.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
(ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
flags |= XFS_ILOG_CORE;
}
/*
* Record the specific change for fdatasync optimisation. This allows
* fdatasync to skip log forces for inodes that are only timestamp
* dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
* to XFS_ILOG_CORE so that the actual on-disk dirty tracking
* (ili_fields) correctly tracks that the version has changed.
*/
spin_lock(&iip->ili_lock);
iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
if (flags & XFS_ILOG_IVERSION)
flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
/*
* We hold the ILOCK here, so this inode is not going to be
* flushed while we are here. Further, because there is no
* buffer attached to the item, we know that there is no IO in
* progress, so nothing will clear the ili_fields while we read
* in the buffer. Hence we can safely drop the spin lock and
* read the buffer knowing that the state will not change from
* here.
*/
spin_unlock(&iip->ili_lock);
error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
if (error)
return error;
/*
* We need an explicit buffer reference for the log item but
* don't want the buffer to remain attached to the transaction.
* Hold the buffer but release the transaction reference once
* we've attached the inode log item to the buffer log item
* list.
*/
xfs_buf_hold(bp);
spin_lock(&iip->ili_lock);
iip->ili_item.li_buf = bp;
bp->b_flags |= _XBF_INODES;
list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
xfs_trans_brelse(tp, bp);
}
/*
* Always OR in the bits from the ili_last_fields field. This is to
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
* in the eventual clearing of the ili_fields bits. See the big comment
* in xfs_iflush() for an explanation of this coordination mechanism.
*/
iip->ili_fields |= (flags | iip->ili_last_fields);
spin_unlock(&iip->ili_lock);
/*
* We are done with the log item transaction dirty state, so clear it so
* that it doesn't pollute future transactions.
*/
iip->ili_dirty_flags = 0;
return 0;
}
/*
* The logged size of an inode fork is always the current size of the inode
* fork. This means that when an inode fork is relogged, the size of the logged
......@@ -662,6 +809,8 @@ xfs_inode_item_committing(
}
static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_sort = xfs_inode_item_sort,
.iop_precommit = xfs_inode_item_precommit,
.iop_size = xfs_inode_item_size,
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
......
......@@ -17,6 +17,7 @@ struct xfs_inode_log_item {
struct xfs_log_item ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
unsigned short ili_lock_flags; /* inode lock flags */
unsigned int ili_dirty_flags; /* dirty in current tx */
/*
* The ili_lock protects the interactions between the dirty state and
* the flush state of the inode log item. This allows us to do atomic
......
......@@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket(
* just to flush the inodegc queue and wait for it to
* complete.
*/
xfs_inodegc_flush(mp);
error = xfs_inodegc_flush(mp);
if (error)
break;
}
prev_agino = agino;
......@@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket(
}
if (prev_ip) {
int error2;
ip->i_prev_unlinked = prev_agino;
xfs_irele(prev_ip);
error2 = xfs_inodegc_flush(mp);
if (error2 && !error)
return error2;
}
xfs_inodegc_flush(mp);
return error;
}
......@@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag(
* bucket and remaining inodes on it unreferenced and
* unfreeable.
*/
xfs_inodegc_flush(pag->pag_mount);
xlog_recover_clear_agi_bucket(pag, bucket);
}
}
......@@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks(
for_each_perag(log->l_mp, agno, pag)
xlog_recover_iunlink_ag(pag);
/*
* Flush the pending unlinked inodes to ensure that the inactivations
* are fully completed on disk and the incore inodes can be reclaimed
* before we signal that recovery is complete.
*/
xfs_inodegc_flush(log->l_mp);
}
STATIC void
......
......@@ -62,6 +62,7 @@ struct xfs_error_cfg {
struct xfs_inodegc {
struct llist_head list;
struct delayed_work work;
int error;
/* approximate count of inodes in the list */
unsigned int items;
......
......@@ -616,8 +616,10 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
xfs_free_extent_later(*tpp, del.br_startblock,
error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL);
if (error)
break;
/* Roll the transaction */
error = xfs_defer_finish(tpp);
......
......@@ -1100,6 +1100,7 @@ xfs_inodegc_init_percpu(
#endif
init_llist_head(&gc->list);
gc->items = 0;
gc->error = 0;
INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
}
return 0;
......
......@@ -290,7 +290,9 @@ xfs_trans_alloc(
* Do not perform a synchronous scan because callers can hold
* other locks.
*/
xfs_blockgc_flush_all(mp);
error = xfs_blockgc_flush_all(mp);
if (error)
return error;
want_retry = false;
goto retry;
}
......@@ -970,6 +972,11 @@ __xfs_trans_commit(
error = xfs_defer_finish_noroll(&tp);
if (error)
goto out_unreserve;
/* Run precommits from final tx in defer chain. */
error = xfs_trans_run_precommits(tp);
if (error)
goto out_unreserve;
}
/*
......