Commit 9ba8e658 authored by Chandan Babu R's avatar Chandan Babu R

Merge tag 'inode-repair-improvements-6.10_2024-04-15' of...

Merge tag 'inode-repair-improvements-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.10-mergeA

xfs: inode-related repair fixes

While doing QA of the online fsck code, I made a few observations:
First, nobody was checking that the di_onlink field is actually zero;
Second, that allocating a temporary file for repairs can fail (and
thus bring down the entire fs) if the inode cluster is corrupt; and
Third, that file link counts do not pin at ~0U to prevent integer
overflows.  Fourth, the x{chk,rep}_metadata_inode_fork functions
should be subclassing the main scrub context, not modifying the
parent's setup willy-nilly.

This scattered patchset fixes those three problems.
Signed-off-by: default avatarDarrick J. Wong <djwong@kernel.org>
Signed-off-by: default avatarChandan Babu R <chandanbabu@kernel.org>

* tag 'inode-repair-improvements-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: create subordinate scrub contexts for xchk_metadata_inode_subtype
  xfs: pin inodes that would otherwise overflow link count
  xfs: try to avoid allocating from sick inode clusters
  xfs: check unused nlink fields in the ondisk inode
parents 1eef0125 1a5f6e08
...@@ -899,6 +899,12 @@ static inline uint xfs_dinode_size(int version) ...@@ -899,6 +899,12 @@ static inline uint xfs_dinode_size(int version)
*/ */
#define XFS_MAXLINK ((1U << 31) - 1U) #define XFS_MAXLINK ((1U << 31) - 1U)
/*
* Any file that hits the maximum ondisk link count should be pinned to avoid
* a use-after-free situation.
*/
#define XFS_NLINK_PINNED (~0U)
/* /*
* Values for di_format * Values for di_format
* *
......
...@@ -1057,6 +1057,33 @@ xfs_inobt_first_free_inode( ...@@ -1057,6 +1057,33 @@ xfs_inobt_first_free_inode(
return xfs_lowbit64(realfree); return xfs_lowbit64(realfree);
} }
/*
* If this AG has corrupt inodes, check if allocating this inode would fail
* with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
* somewhere else.
*/
static int
xfs_dialloc_check_ino(
struct xfs_perag *pag,
struct xfs_trans *tp,
xfs_ino_t ino)
{
struct xfs_imap imap;
struct xfs_buf *bp;
int error;
error = xfs_imap(pag, tp, ino, &imap, 0);
if (error)
return -EAGAIN;
error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
if (error)
return -EAGAIN;
xfs_trans_brelse(tp, bp);
return 0;
}
/* /*
* Allocate an inode using the inobt-only algorithm. * Allocate an inode using the inobt-only algorithm.
*/ */
...@@ -1309,6 +1336,13 @@ xfs_dialloc_ag_inobt( ...@@ -1309,6 +1336,13 @@ xfs_dialloc_ag_inobt(
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0); XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
if (error)
goto error0;
}
rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--; rec.ir_freecount--;
error = xfs_inobt_update(cur, &rec); error = xfs_inobt_update(cur, &rec);
...@@ -1584,6 +1618,12 @@ xfs_dialloc_ag( ...@@ -1584,6 +1618,12 @@ xfs_dialloc_ag(
XFS_INODES_PER_CHUNK) == 0); XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
if (error)
goto error_cur;
}
/* /*
* Modify or remove the finobt record. * Modify or remove the finobt record.
*/ */
......
...@@ -491,6 +491,14 @@ xfs_dinode_verify( ...@@ -491,6 +491,14 @@ xfs_dinode_verify(
return __this_address; return __this_address;
} }
if (dip->di_version > 1) {
if (dip->di_onlink)
return __this_address;
} else {
if (dip->di_nlink)
return __this_address;
}
/* don't allow invalid i_size */ /* don't allow invalid i_size */
di_size = be64_to_cpu(dip->di_size); di_size = be64_to_cpu(dip->di_size);
if (di_size & (1ULL << 63)) if (di_size & (1ULL << 63))
......
...@@ -1203,27 +1203,12 @@ xchk_metadata_inode_subtype( ...@@ -1203,27 +1203,12 @@ xchk_metadata_inode_subtype(
struct xfs_scrub *sc, struct xfs_scrub *sc,
unsigned int scrub_type) unsigned int scrub_type)
{ {
__u32 smtype = sc->sm->sm_type; struct xfs_scrub_subord *sub;
unsigned int sick_mask = sc->sick_mask;
int error; int error;
sc->sm->sm_type = scrub_type; sub = xchk_scrub_create_subord(sc, scrub_type);
error = sub->sc.ops->scrub(&sub->sc);
switch (scrub_type) { xchk_scrub_free_subord(sub);
case XFS_SCRUB_TYPE_INODE:
error = xchk_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xchk_bmap_data(sc);
break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
break;
}
sc->sick_mask = sick_mask;
sc->sm->sm_type = smtype;
return error; return error;
} }
......
...@@ -1145,7 +1145,9 @@ xrep_dir_set_nlink( ...@@ -1145,7 +1145,9 @@ xrep_dir_set_nlink(
struct xfs_scrub *sc = rd->sc; struct xfs_scrub *sc = rd->sc;
struct xfs_inode *dp = sc->ip; struct xfs_inode *dp = sc->ip;
struct xfs_perag *pag; struct xfs_perag *pag;
unsigned int new_nlink = rd->subdirs + 2; unsigned int new_nlink = min_t(unsigned long long,
rd->subdirs + 2,
XFS_NLINK_PINNED);
int error; int error;
/* /*
...@@ -1201,13 +1203,6 @@ xrep_dir_swap( ...@@ -1201,13 +1203,6 @@ xrep_dir_swap(
bool ip_local, temp_local; bool ip_local, temp_local;
int error = 0; int error = 0;
/*
* If we found enough subdirs to overflow this directory's link count,
* bail out to userspace before we modify anything.
*/
if (rd->subdirs + 2 > XFS_MAXLINK)
return -EFSCORRUPTED;
/* /*
* If we never found the parent for this directory, temporarily assign * If we never found the parent for this directory, temporarily assign
* the root dir as the parent; we'll move this to the orphanage after * the root dir as the parent; we'll move this to the orphanage after
......
...@@ -516,6 +516,17 @@ xrep_dinode_mode( ...@@ -516,6 +516,17 @@ xrep_dinode_mode(
return 0; return 0;
} }
/* Fix unused link count fields having nonzero values. */
STATIC void
xrep_dinode_nlinks(
struct xfs_dinode *dip)
{
if (dip->di_version > 1)
dip->di_onlink = 0;
else
dip->di_nlink = 0;
}
/* Fix any conflicting flags that the verifiers complain about. */ /* Fix any conflicting flags that the verifiers complain about. */
STATIC void STATIC void
xrep_dinode_flags( xrep_dinode_flags(
...@@ -1377,6 +1388,7 @@ xrep_dinode_core( ...@@ -1377,6 +1388,7 @@ xrep_dinode_core(
iget_error = xrep_dinode_mode(ri, dip); iget_error = xrep_dinode_mode(ri, dip);
if (iget_error) if (iget_error)
goto write; goto write;
xrep_dinode_nlinks(dip);
xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip); xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip); xrep_dinode_extsize_hints(sc, dip);
......
...@@ -607,9 +607,11 @@ xchk_nlinks_compare_inode( ...@@ -607,9 +607,11 @@ xchk_nlinks_compare_inode(
* this as a corruption. The VFS won't let users increase the link * this as a corruption. The VFS won't let users increase the link
* count, but it will let them decrease it. * count, but it will let them decrease it.
*/ */
if (total_links > XFS_MAXLINK) { if (total_links > XFS_NLINK_PINNED) {
xchk_ino_set_corrupt(sc, ip->i_ino); xchk_ino_set_corrupt(sc, ip->i_ino);
goto out_corrupt; goto out_corrupt;
} else if (total_links > XFS_MAXLINK) {
xchk_ino_set_warning(sc, ip->i_ino);
} }
/* Link counts should match. */ /* Link counts should match. */
......
...@@ -238,14 +238,10 @@ xrep_nlinks_repair_inode( ...@@ -238,14 +238,10 @@ xrep_nlinks_repair_inode(
/* Commit the new link count if it changed. */ /* Commit the new link count if it changed. */
if (total_links != actual_nlink) { if (total_links != actual_nlink) {
if (total_links > XFS_MAXLINK) {
trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
goto out_trans;
}
trace_xrep_nlinks_update_inode(mp, ip, &obs); trace_xrep_nlinks_update_inode(mp, ip, &obs);
set_nlink(VFS_I(ip), total_links); set_nlink(VFS_I(ip), min_t(unsigned long long, total_links,
XFS_NLINK_PINNED));
dirty = true; dirty = true;
} }
......
...@@ -1009,55 +1009,27 @@ xrep_metadata_inode_subtype( ...@@ -1009,55 +1009,27 @@ xrep_metadata_inode_subtype(
struct xfs_scrub *sc, struct xfs_scrub *sc,
unsigned int scrub_type) unsigned int scrub_type)
{ {
__u32 smtype = sc->sm->sm_type; struct xfs_scrub_subord *sub;
__u32 smflags = sc->sm->sm_flags;
unsigned int sick_mask = sc->sick_mask;
int error; int error;
/* /*
* Let's see if the inode needs repair. We're going to open-code calls * Let's see if the inode needs repair. Use a subordinate scrub context
* to the scrub and repair functions so that we can hang on to the * to call the scrub and repair functions so that we can hang on to the
* resources that we already acquired instead of using the standard * resources that we already acquired instead of using the standard
* setup/teardown routines. * setup/teardown routines.
*/ */
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sub = xchk_scrub_create_subord(sc, scrub_type);
sc->sm->sm_type = scrub_type; error = sub->sc.ops->scrub(&sub->sc);
switch (scrub_type) {
case XFS_SCRUB_TYPE_INODE:
error = xchk_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xchk_bmap_data(sc);
break;
case XFS_SCRUB_TYPE_BMBTA:
error = xchk_bmap_attr(sc);
break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
}
if (error) if (error)
goto out; goto out;
if (!xrep_will_attempt(&sub->sc))
if (!xrep_will_attempt(sc))
goto out; goto out;
/* /*
* Repair some part of the inode. This will potentially join the inode * Repair some part of the inode. This will potentially join the inode
* to the transaction. * to the transaction.
*/ */
switch (scrub_type) { error = sub->sc.ops->repair(&sub->sc);
case XFS_SCRUB_TYPE_INODE:
error = xrep_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xrep_bmap(sc, XFS_DATA_FORK, false);
break;
case XFS_SCRUB_TYPE_BMBTA:
error = xrep_bmap(sc, XFS_ATTR_FORK, false);
break;
}
if (error) if (error)
goto out; goto out;
...@@ -1066,10 +1038,10 @@ xrep_metadata_inode_subtype( ...@@ -1066,10 +1038,10 @@ xrep_metadata_inode_subtype(
* that the inode will not be joined to the transaction when we exit * that the inode will not be joined to the transaction when we exit
* the function. * the function.
*/ */
error = xfs_defer_finish(&sc->tp); error = xfs_defer_finish(&sub->sc.tp);
if (error) if (error)
goto out; goto out;
error = xfs_trans_roll(&sc->tp); error = xfs_trans_roll(&sub->sc.tp);
if (error) if (error)
goto out; goto out;
...@@ -1077,31 +1049,18 @@ xrep_metadata_inode_subtype( ...@@ -1077,31 +1049,18 @@ xrep_metadata_inode_subtype(
* Clear the corruption flags and re-check the metadata that we just * Clear the corruption flags and re-check the metadata that we just
* repaired. * repaired.
*/ */
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
error = sub->sc.ops->scrub(&sub->sc);
switch (scrub_type) {
case XFS_SCRUB_TYPE_INODE:
error = xchk_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xchk_bmap_data(sc);
break;
case XFS_SCRUB_TYPE_BMBTA:
error = xchk_bmap_attr(sc);
break;
}
if (error) if (error)
goto out; goto out;
/* If corruption persists, the repair has failed. */ /* If corruption persists, the repair has failed. */
if (xchk_needs_repair(sc->sm)) { if (xchk_needs_repair(sub->sc.sm)) {
error = -EFSCORRUPTED; error = -EFSCORRUPTED;
goto out; goto out;
} }
out: out:
sc->sick_mask = sick_mask; xchk_scrub_free_subord(sub);
sc->sm->sm_type = smtype;
sc->sm->sm_flags = smflags;
return error; return error;
} }
......
...@@ -177,6 +177,39 @@ xchk_fsgates_disable( ...@@ -177,6 +177,39 @@ xchk_fsgates_disable(
} }
#undef FSGATES_MASK #undef FSGATES_MASK
/* Free the resources associated with a scrub subtype. */
void
xchk_scrub_free_subord(
struct xfs_scrub_subord *sub)
{
struct xfs_scrub *sc = sub->parent_sc;
ASSERT(sc->ip == sub->sc.ip);
ASSERT(sc->orphanage == sub->sc.orphanage);
ASSERT(sc->tempip == sub->sc.tempip);
sc->sm->sm_type = sub->old_smtype;
sc->sm->sm_flags = sub->old_smflags |
(sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
sc->tp = sub->sc.tp;
if (sub->sc.buf) {
if (sub->sc.buf_cleanup)
sub->sc.buf_cleanup(sub->sc.buf);
kvfree(sub->sc.buf);
}
if (sub->sc.xmbtp)
xmbuf_free(sub->sc.xmbtp);
if (sub->sc.xfile)
xfile_destroy(sub->sc.xfile);
sc->ilock_flags = sub->sc.ilock_flags;
sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
sc->temp_ilock_flags = sub->sc.temp_ilock_flags;
kfree(sub);
}
/* Free all the resources and finish the transactions. */ /* Free all the resources and finish the transactions. */
STATIC int STATIC int
xchk_teardown( xchk_teardown(
...@@ -505,6 +538,36 @@ static inline void xchk_postmortem(struct xfs_scrub *sc) ...@@ -505,6 +538,36 @@ static inline void xchk_postmortem(struct xfs_scrub *sc)
} }
#endif /* CONFIG_XFS_ONLINE_REPAIR */ #endif /* CONFIG_XFS_ONLINE_REPAIR */
/*
* Create a new scrub context from an existing one, but with a different scrub
* type.
*/
struct xfs_scrub_subord *
xchk_scrub_create_subord(
struct xfs_scrub *sc,
unsigned int subtype)
{
struct xfs_scrub_subord *sub;
sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
if (!sub)
return ERR_PTR(-ENOMEM);
sub->old_smtype = sc->sm->sm_type;
sub->old_smflags = sc->sm->sm_flags;
sub->parent_sc = sc;
memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
sub->sc.ops = &meta_scrub_ops[subtype];
sub->sc.sm->sm_type = subtype;
sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
sub->sc.buf = NULL;
sub->sc.buf_cleanup = NULL;
sub->sc.xfile = NULL;
sub->sc.xmbtp = NULL;
return sub;
}
/* Dispatch metadata scrubbing. */ /* Dispatch metadata scrubbing. */
int int
xfs_scrub_metadata( xfs_scrub_metadata(
......
...@@ -156,6 +156,17 @@ struct xfs_scrub { ...@@ -156,6 +156,17 @@ struct xfs_scrub {
*/ */
#define XREP_FSGATES_ALL (XREP_FSGATES_EXCHANGE_RANGE) #define XREP_FSGATES_ALL (XREP_FSGATES_EXCHANGE_RANGE)
struct xfs_scrub_subord {
struct xfs_scrub sc;
struct xfs_scrub *parent_sc;
unsigned int old_smtype;
unsigned int old_smflags;
};
struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc,
unsigned int subtype);
void xchk_scrub_free_subord(struct xfs_scrub_subord *sub);
/* Metadata scrubbers */ /* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc); int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc);
......
...@@ -890,22 +890,25 @@ xfs_init_new_inode( ...@@ -890,22 +890,25 @@ xfs_init_new_inode(
*/ */
static int /* error */ static int /* error */
xfs_droplink( xfs_droplink(
xfs_trans_t *tp, struct xfs_trans *tp,
xfs_inode_t *ip) struct xfs_inode *ip)
{ {
if (VFS_I(ip)->i_nlink == 0) { struct inode *inode = VFS_I(ip);
xfs_alert(ip->i_mount,
"%s: Attempt to drop inode (%llu) with nlink zero.",
__func__, ip->i_ino);
return -EFSCORRUPTED;
}
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
drop_nlink(VFS_I(ip)); if (inode->i_nlink == 0) {
xfs_info_ratelimited(tp->t_mountp,
"Inode 0x%llx link count dropped below zero. Pinning link count.",
ip->i_ino);
set_nlink(inode, XFS_NLINK_PINNED);
}
if (inode->i_nlink != XFS_NLINK_PINNED)
drop_nlink(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (VFS_I(ip)->i_nlink) if (inode->i_nlink)
return 0; return 0;
return xfs_iunlink(tp, ip); return xfs_iunlink(tp, ip);
...@@ -919,9 +922,17 @@ xfs_bumplink( ...@@ -919,9 +922,17 @@ xfs_bumplink(
struct xfs_trans *tp, struct xfs_trans *tp,
struct xfs_inode *ip) struct xfs_inode *ip)
{ {
struct inode *inode = VFS_I(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
inc_nlink(VFS_I(ip)); if (inode->i_nlink == XFS_NLINK_PINNED - 1)
xfs_info_ratelimited(tp->t_mountp,
"Inode 0x%llx link count exceeded maximum. Pinning link count.",
ip->i_ino);
if (inode->i_nlink != XFS_NLINK_PINNED)
inc_nlink(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment