Commit 9ba8e658 authored by Chandan Babu R's avatar Chandan Babu R

Merge tag 'inode-repair-improvements-6.10_2024-04-15' of...

Merge tag 'inode-repair-improvements-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.10-mergeA

xfs: inode-related repair fixes

While doing QA of the online fsck code, I made a few observations:
First, nobody was checking that the di_onlink field is actually zero;
Second, that allocating a temporary file for repairs can fail (and
thus bring down the entire fs) if the inode cluster is corrupt; and
Third, that file link counts do not pin at ~0U to prevent integer
overflows.  Fourth, the x{chk,rep}_metadata_inode_fork functions
should be subclassing the main scrub context, not modifying the
parent's setup willy-nilly.

This scattered patchset fixes those four problems.
Signed-off-by: default avatarDarrick J. Wong <djwong@kernel.org>
Signed-off-by: default avatarChandan Babu R <chandanbabu@kernel.org>

* tag 'inode-repair-improvements-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: create subordinate scrub contexts for xchk_metadata_inode_subtype
  xfs: pin inodes that would otherwise overflow link count
  xfs: try to avoid allocating from sick inode clusters
  xfs: check unused nlink fields in the ondisk inode
parents 1eef0125 1a5f6e08
......@@ -899,6 +899,12 @@ static inline uint xfs_dinode_size(int version)
*/
#define XFS_MAXLINK ((1U << 31) - 1U)
/*
* Any file that hits the maximum ondisk link count should be pinned to avoid
* a use-after-free situation.
*/
#define XFS_NLINK_PINNED (~0U)
/*
* Values for di_format
*
......
......@@ -1057,6 +1057,33 @@ xfs_inobt_first_free_inode(
return xfs_lowbit64(realfree);
}
/*
 * If this AG has corrupt inodes, check if allocating this inode would fail
 * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
 * somewhere else.
 */
static int
xfs_dialloc_check_ino(
struct xfs_perag *pag,
struct xfs_trans *tp,
xfs_ino_t ino)
{
struct xfs_imap imap;
struct xfs_buf *bp;
int error;
/* Locate the inode cluster buffer for this inode number. */
error = xfs_imap(pag, tp, ino, &imap, 0);
if (error)
/* Any failure here means the caller should retry elsewhere. */
return -EAGAIN;
/* Read the cluster buffer; a failed read likely means corruption. */
error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
if (error)
return -EAGAIN;
/* Buffer read cleanly; release it and report the inode as usable. */
xfs_trans_brelse(tp, bp);
return 0;
}
/*
* Allocate an inode using the inobt-only algorithm.
*/
......@@ -1309,6 +1336,13 @@ xfs_dialloc_ag_inobt(
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
if (error)
goto error0;
}
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
error = xfs_inobt_update(cur, &rec);
......@@ -1584,6 +1618,12 @@ xfs_dialloc_ag(
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
if (error)
goto error_cur;
}
/*
* Modify or remove the finobt record.
*/
......
......@@ -491,6 +491,14 @@ xfs_dinode_verify(
return __this_address;
}
if (dip->di_version > 1) {
if (dip->di_onlink)
return __this_address;
} else {
if (dip->di_nlink)
return __this_address;
}
/* don't allow invalid i_size */
di_size = be64_to_cpu(dip->di_size);
if (di_size & (1ULL << 63))
......
......@@ -1203,27 +1203,12 @@ xchk_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
__u32 smtype = sc->sm->sm_type;
unsigned int sick_mask = sc->sick_mask;
struct xfs_scrub_subord *sub;
int error;
sc->sm->sm_type = scrub_type;
switch (scrub_type) {
case XFS_SCRUB_TYPE_INODE:
error = xchk_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xchk_bmap_data(sc);
break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
break;
}
sc->sick_mask = sick_mask;
sc->sm->sm_type = smtype;
sub = xchk_scrub_create_subord(sc, scrub_type);
error = sub->sc.ops->scrub(&sub->sc);
xchk_scrub_free_subord(sub);
return error;
}
......
......@@ -1145,7 +1145,9 @@ xrep_dir_set_nlink(
struct xfs_scrub *sc = rd->sc;
struct xfs_inode *dp = sc->ip;
struct xfs_perag *pag;
unsigned int new_nlink = rd->subdirs + 2;
unsigned int new_nlink = min_t(unsigned long long,
rd->subdirs + 2,
XFS_NLINK_PINNED);
int error;
/*
......@@ -1201,13 +1203,6 @@ xrep_dir_swap(
bool ip_local, temp_local;
int error = 0;
/*
* If we found enough subdirs to overflow this directory's link count,
* bail out to userspace before we modify anything.
*/
if (rd->subdirs + 2 > XFS_MAXLINK)
return -EFSCORRUPTED;
/*
* If we never found the parent for this directory, temporarily assign
* the root dir as the parent; we'll move this to the orphanage after
......
......@@ -516,6 +516,17 @@ xrep_dinode_mode(
return 0;
}
/* Fix unused link count fields having nonzero values. */
STATIC void
xrep_dinode_nlinks(
struct xfs_dinode *dip)
{
/*
 * Zero out whichever link count field is unused for this ondisk inode
 * version: v1 inodes do not use di_onlink's slot the way later
 * versions do, so clear di_nlink there; v2+ inodes leave di_onlink
 * unused, so clear that instead.
 */
if (dip->di_version <= 1)
dip->di_nlink = 0;
else
dip->di_onlink = 0;
}
/* Fix any conflicting flags that the verifiers complain about. */
STATIC void
xrep_dinode_flags(
......@@ -1377,6 +1388,7 @@ xrep_dinode_core(
iget_error = xrep_dinode_mode(ri, dip);
if (iget_error)
goto write;
xrep_dinode_nlinks(dip);
xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip);
......
......@@ -607,9 +607,11 @@ xchk_nlinks_compare_inode(
* this as a corruption. The VFS won't let users increase the link
* count, but it will let them decrease it.
*/
if (total_links > XFS_MAXLINK) {
if (total_links > XFS_NLINK_PINNED) {
xchk_ino_set_corrupt(sc, ip->i_ino);
goto out_corrupt;
} else if (total_links > XFS_MAXLINK) {
xchk_ino_set_warning(sc, ip->i_ino);
}
/* Link counts should match. */
......
......@@ -238,14 +238,10 @@ xrep_nlinks_repair_inode(
/* Commit the new link count if it changed. */
if (total_links != actual_nlink) {
if (total_links > XFS_MAXLINK) {
trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
goto out_trans;
}
trace_xrep_nlinks_update_inode(mp, ip, &obs);
set_nlink(VFS_I(ip), total_links);
set_nlink(VFS_I(ip), min_t(unsigned long long, total_links,
XFS_NLINK_PINNED));
dirty = true;
}
......
......@@ -1009,55 +1009,27 @@ xrep_metadata_inode_subtype(
struct xfs_scrub *sc,
unsigned int scrub_type)
{
__u32 smtype = sc->sm->sm_type;
__u32 smflags = sc->sm->sm_flags;
unsigned int sick_mask = sc->sick_mask;
struct xfs_scrub_subord *sub;
int error;
/*
* Let's see if the inode needs repair. We're going to open-code calls
* to the scrub and repair functions so that we can hang on to the
* Let's see if the inode needs repair. Use a subordinate scrub context
* to call the scrub and repair functions so that we can hang on to the
* resources that we already acquired instead of using the standard
* setup/teardown routines.
*/
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
sc->sm->sm_type = scrub_type;
switch (scrub_type) {
case XFS_SCRUB_TYPE_INODE:
error = xchk_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xchk_bmap_data(sc);
break;
case XFS_SCRUB_TYPE_BMBTA:
error = xchk_bmap_attr(sc);
break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
}
sub = xchk_scrub_create_subord(sc, scrub_type);
error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
if (!xrep_will_attempt(sc))
if (!xrep_will_attempt(&sub->sc))
goto out;
/*
* Repair some part of the inode. This will potentially join the inode
* to the transaction.
*/
switch (scrub_type) {
case XFS_SCRUB_TYPE_INODE:
error = xrep_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xrep_bmap(sc, XFS_DATA_FORK, false);
break;
case XFS_SCRUB_TYPE_BMBTA:
error = xrep_bmap(sc, XFS_ATTR_FORK, false);
break;
}
error = sub->sc.ops->repair(&sub->sc);
if (error)
goto out;
......@@ -1066,10 +1038,10 @@ xrep_metadata_inode_subtype(
* that the inode will not be joined to the transaction when we exit
* the function.
*/
error = xfs_defer_finish(&sc->tp);
error = xfs_defer_finish(&sub->sc.tp);
if (error)
goto out;
error = xfs_trans_roll(&sc->tp);
error = xfs_trans_roll(&sub->sc.tp);
if (error)
goto out;
......@@ -1077,31 +1049,18 @@ xrep_metadata_inode_subtype(
* Clear the corruption flags and re-check the metadata that we just
* repaired.
*/
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
switch (scrub_type) {
case XFS_SCRUB_TYPE_INODE:
error = xchk_inode(sc);
break;
case XFS_SCRUB_TYPE_BMBTD:
error = xchk_bmap_data(sc);
break;
case XFS_SCRUB_TYPE_BMBTA:
error = xchk_bmap_attr(sc);
break;
}
sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
error = sub->sc.ops->scrub(&sub->sc);
if (error)
goto out;
/* If corruption persists, the repair has failed. */
if (xchk_needs_repair(sc->sm)) {
if (xchk_needs_repair(sub->sc.sm)) {
error = -EFSCORRUPTED;
goto out;
}
out:
sc->sick_mask = sick_mask;
sc->sm->sm_type = smtype;
sc->sm->sm_flags = smflags;
xchk_scrub_free_subord(sub);
return error;
}
......
......@@ -177,6 +177,39 @@ xchk_fsgates_disable(
}
#undef FSGATES_MASK
/* Free the resources associated with a scrub subtype. */
void
xchk_scrub_free_subord(
struct xfs_scrub_subord *sub)
{
struct xfs_scrub *sc = sub->parent_sc;
/*
 * The subordinate context shares the parent's inodes; it must not have
 * swapped any of them out from under us.
 */
ASSERT(sc->ip == sub->sc.ip);
ASSERT(sc->orphanage == sub->sc.orphanage);
ASSERT(sc->tempip == sub->sc.tempip);
/*
 * Restore the parent's saved scrub type and flags, but keep any OUT
 * flags (e.g. corruption reports) that the subordinate scrub set.
 */
sc->sm->sm_type = sub->old_smtype;
sc->sm->sm_flags = sub->old_smflags |
(sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
/* The subordinate may have rolled the transaction; take the new one. */
sc->tp = sub->sc.tp;
/* Free buffers that the subordinate allocated for its own use. */
if (sub->sc.buf) {
if (sub->sc.buf_cleanup)
sub->sc.buf_cleanup(sub->sc.buf);
kvfree(sub->sc.buf);
}
if (sub->sc.xmbtp)
xmbuf_free(sub->sc.xmbtp);
if (sub->sc.xfile)
xfile_destroy(sub->sc.xfile);
/* Propagate any inode lock state changes back to the parent. */
sc->ilock_flags = sub->sc.ilock_flags;
sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
sc->temp_ilock_flags = sub->sc.temp_ilock_flags;
kfree(sub);
}
/* Free all the resources and finish the transactions. */
STATIC int
xchk_teardown(
......@@ -505,6 +538,36 @@ static inline void xchk_postmortem(struct xfs_scrub *sc)
}
#endif /* CONFIG_XFS_ONLINE_REPAIR */
/*
 * Create a new scrub context from an existing one, but with a different scrub
 * type.
 */
struct xfs_scrub_subord *
xchk_scrub_create_subord(
struct xfs_scrub *sc,
unsigned int subtype)
{
struct xfs_scrub_subord *sub;
sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
if (!sub)
return ERR_PTR(-ENOMEM);
/* Save the parent's type/flags so they can be restored at free time. */
sub->old_smtype = sc->sm->sm_type;
sub->old_smflags = sc->sm->sm_flags;
sub->parent_sc = sc;
/*
 * Clone the parent context wholesale.  Note that this copies pointers
 * (sm, ip, tp, ...), so the subordinate shares the parent's scrub
 * metadata structure and resources.
 */
memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
sub->sc.ops = &meta_scrub_ops[subtype];
/* This writes through the shared sm; the old type was saved above. */
sub->sc.sm->sm_type = subtype;
sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
/*
 * Detach per-scrub buffers from the clone so the subordinate allocates
 * its own and we don't double-free the parent's at teardown.
 */
sub->sc.buf = NULL;
sub->sc.buf_cleanup = NULL;
sub->sc.xfile = NULL;
sub->sc.xmbtp = NULL;
return sub;
}
/* Dispatch metadata scrubbing. */
int
xfs_scrub_metadata(
......
......@@ -156,6 +156,17 @@ struct xfs_scrub {
*/
#define XREP_FSGATES_ALL (XREP_FSGATES_EXCHANGE_RANGE)
/*
 * A subordinate scrub context, cloned from a parent context so that one
 * scrubber can invoke another without disturbing the parent's setup.
 */
struct xfs_scrub_subord {
/* Cloned scrub context used to run the subordinate scrub type. */
struct xfs_scrub sc;
/* Context this subordinate was cloned from. */
struct xfs_scrub *parent_sc;
/* Parent's sm_type/sm_flags, restored when the subordinate is freed. */
unsigned int old_smtype;
unsigned int old_smflags;
};
struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc,
unsigned int subtype);
void xchk_scrub_free_subord(struct xfs_scrub_subord *sub);
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
......
......@@ -890,22 +890,25 @@ xfs_init_new_inode(
*/
static int /* error */
xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
struct xfs_trans *tp,
struct xfs_inode *ip)
{
if (VFS_I(ip)->i_nlink == 0) {
xfs_alert(ip->i_mount,
"%s: Attempt to drop inode (%llu) with nlink zero.",
__func__, ip->i_ino);
return -EFSCORRUPTED;
}
struct inode *inode = VFS_I(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
drop_nlink(VFS_I(ip));
if (inode->i_nlink == 0) {
xfs_info_ratelimited(tp->t_mountp,
"Inode 0x%llx link count dropped below zero. Pinning link count.",
ip->i_ino);
set_nlink(inode, XFS_NLINK_PINNED);
}
if (inode->i_nlink != XFS_NLINK_PINNED)
drop_nlink(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (VFS_I(ip)->i_nlink)
if (inode->i_nlink)
return 0;
return xfs_iunlink(tp, ip);
......@@ -919,9 +922,17 @@ xfs_bumplink(
struct xfs_trans *tp,
struct xfs_inode *ip)
{
struct inode *inode = VFS_I(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
inc_nlink(VFS_I(ip));
if (inode->i_nlink == XFS_NLINK_PINNED - 1)
xfs_info_ratelimited(tp->t_mountp,
"Inode 0x%llx link count exceeded maximum. Pinning link count.",
ip->i_ino);
if (inode->i_nlink != XFS_NLINK_PINNED)
inc_nlink(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment