Commit 496baa2c authored by Chandan Babu R

Merge tag 'vectorized-scrub-6.10_2024-04-23' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.10-mergeC

xfs: vectorize scrub kernel calls

Create a vectorized version of the metadata scrub and repair ioctl, and
adapt xfs_scrub to use that.  This mitigates the impact of system call
overhead on xfs_scrub runtime.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'vectorized-scrub-6.10_2024-04-23' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: introduce vectored scrub mode
  xfs: move xfs_ioc_scrub_metadata to scrub.c
  xfs: reduce the rate of cond_resched calls inside scrub
parents f7cea946 c77b3758
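
Roughly, userspace packs an array of struct xfs_scrub_vec, points svh_vectors at it, and makes one ioctl call; the kernel fills in each vector's sv_flags and sv_ret on the way out. A minimal caller sketch follows (illustrative only: the helper name and the <xfs/xfs.h> include path are assumptions, while the structs, scrub type codes, and OFLAG bits come from the uapi changes in the diff below):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* assumed to carry the xfs_fs.h uapi definitions */

/*
 * Scrub one AG's AGF and AGI in a single kernel call, stopping early on
 * corruption via a barrier vector.
 */
static int scrubv_ag_headers(int fd, uint32_t agno)
{
	struct xfs_scrub_vec		vecs[3];
	struct xfs_scrub_vec_head	head;
	unsigned int			i;

	memset(vecs, 0, sizeof(vecs));
	memset(&head, 0, sizeof(head));

	vecs[0].sv_type = XFS_SCRUB_TYPE_AGF;
	vecs[1].sv_type = XFS_SCRUB_TYPE_AGI;
	/* Cancel here if a previous vector reported corruption. */
	vecs[2].sv_type = XFS_SCRUB_TYPE_BARRIER;
	vecs[2].sv_flags = XFS_SCRUB_OFLAG_CORRUPT;

	head.svh_agno = agno;
	head.svh_nr = 3;
	head.svh_vectors = (uint64_t)(uintptr_t)vecs;

	if (ioctl(fd, XFS_IOC_SCRUBV_METADATA, &head))
		return -1;

	for (i = 0; i < head.svh_nr; i++)
		printf("vec[%u] type %u flags 0x%x ret %d\n",
				i, vecs[i].sv_type, vecs[i].sv_flags,
				vecs[i].sv_ret);
	return 0;
}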

fs/xfs/libxfs/xfs_fs.h
@@ -725,6 +725,15 @@ struct xfs_scrub_metadata {
/* Number of scrub subcommands. */
#define XFS_SCRUB_TYPE_NR 29
/*
* This special type code only applies to the vectored scrub implementation.
*
* If any of the previous scrub vectors recorded runtime errors or have
* sv_flags bits set that match the OFLAG bits in the barrier vector's
* sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace.
*/
#define XFS_SCRUB_TYPE_BARRIER (0xFFFFFFFF)
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
@@ -769,6 +778,29 @@ struct xfs_scrub_metadata {
XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
/* Vectored scrub calls to reduce the number of kernel transitions. */
struct xfs_scrub_vec {
__u32 sv_type; /* XFS_SCRUB_TYPE_* */
__u32 sv_flags; /* XFS_SCRUB_FLAGS_* */
__s32 sv_ret; /* 0 or a negative error code */
__u32 sv_reserved; /* must be zero */
};
/* Vectored metadata scrub control structure. */
struct xfs_scrub_vec_head {
__u64 svh_ino; /* inode number. */
__u32 svh_gen; /* inode generation. */
__u32 svh_agno; /* ag number. */
__u32 svh_flags; /* XFS_SCRUB_VEC_FLAGS_* */
__u16 svh_rest_us; /* wait this much time between vector items */
__u16 svh_nr; /* number of svh_vectors */
__u64 svh_reserved; /* must be zero */
__u64 svh_vectors; /* pointer to buffer of xfs_scrub_vec */
};
#define XFS_SCRUB_VEC_FLAGS_ALL (0)
/*
* ioctl limits
*/
@@ -928,6 +960,7 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry)
#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents)
#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head)
/*
* ioctl commands that replace IRIX syssgi()'s

fs/xfs/scrub/common.h
@@ -6,31 +6,6 @@
#ifndef __XFS_SCRUB_COMMON_H__
#define __XFS_SCRUB_COMMON_H__
-/*
- * We /could/ terminate a scrub/repair operation early.  If we're not
- * in a good place to continue (fatal signal, etc.) then bail out.
- * Note that we're careful not to make any judgements about *error.
- */
-static inline bool
-xchk_should_terminate(
-	struct xfs_scrub	*sc,
-	int			*error)
-{
-	/*
-	 * If preemption is disabled, we need to yield to the scheduler every
-	 * few seconds so that we don't run afoul of the soft lockup watchdog
-	 * or RCU stall detector.
-	 */
-	cond_resched();
-
-	if (fatal_signal_pending(current)) {
-		if (*error == 0)
-			*error = -EINTR;
-		return true;
-	}
-	return false;
-}
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
int xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);

fs/xfs/scrub/scrub.c
@@ -21,6 +21,7 @@
#include "xfs_exchmaps.h"
#include "xfs_dir2.h"
#include "xfs_parent.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -578,7 +579,7 @@ xchk_scrub_create_subord(
}
/* Dispatch metadata scrubbing. */
-int
+STATIC int
xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
@@ -620,6 +621,7 @@ xfs_scrub_metadata(
sc->sm = sm;
sc->ops = &meta_scrub_ops[sm->sm_type];
sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
sc->relax = INIT_XCHK_RELAX;
retry_op:
/*
* When repairs are allowed, prevent freezing or readonly remount while
@@ -723,3 +725,176 @@ xfs_scrub_metadata(
run.retries++;
goto retry_op;
}
/* Scrub one aspect of one piece of metadata. */
int
xfs_ioc_scrub_metadata(
struct file *file,
void __user *arg)
{
struct xfs_scrub_metadata scrub;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(&scrub, arg, sizeof(scrub)))
return -EFAULT;
error = xfs_scrub_metadata(file, &scrub);
if (error)
return error;
if (copy_to_user(arg, &scrub, sizeof(scrub)))
return -EFAULT;
return 0;
}
/* Decide if there have been any scrub failures up to this point. */
static inline int
xfs_scrubv_check_barrier(
struct xfs_mount *mp,
const struct xfs_scrub_vec *vectors,
const struct xfs_scrub_vec *stop_vec)
{
const struct xfs_scrub_vec *v;
__u32 failmask;
failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
for (v = vectors; v < stop_vec; v++) {
if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
continue;
/*
* Runtime errors count as a previous failure, except the ones
* used to ask userspace to retry.
*/
switch (v->sv_ret) {
case -EBUSY:
case -ENOENT:
case -EUSERS:
case 0:
break;
default:
return -ECANCELED;
}
/*
* If any of the out-flags on the scrub vector match the mask
* that was set on the barrier vector, that's a previous fail.
*/
if (v->sv_flags & failmask)
return -ECANCELED;
}
return 0;
}
/* Vectored scrub implementation to reduce ioctl calls. */
int
xfs_ioc_scrubv_metadata(
struct file *file,
void __user *arg)
{
struct xfs_scrub_vec_head head;
struct xfs_scrub_vec_head __user *uhead = arg;
struct xfs_scrub_vec *vectors;
struct xfs_scrub_vec __user *uvectors;
struct xfs_inode *ip_in = XFS_I(file_inode(file));
struct xfs_mount *mp = ip_in->i_mount;
struct xfs_scrub_vec *v;
size_t vec_bytes;
unsigned int i;
int error = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(&head, uhead, sizeof(head)))
return -EFAULT;
if (head.svh_reserved)
return -EINVAL;
if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
return -EINVAL;
if (head.svh_nr == 0)
return 0;
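	/*
	 * Cap the buffer at one page; e.g. with 16-byte vectors and 4 KiB
	 * pages, that allows at most 256 vectors per call.
	 */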
vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
if (vec_bytes > PAGE_SIZE)
return -ENOMEM;
uvectors = (void __user *)(uintptr_t)head.svh_vectors;
vectors = memdup_user(uvectors, vec_bytes);
if (IS_ERR(vectors))
return PTR_ERR(vectors);
trace_xchk_scrubv_start(ip_in, &head);
for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
if (v->sv_reserved) {
error = -EINVAL;
goto out_free;
}
if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
(v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
error = -EINVAL;
goto out_free;
}
trace_xchk_scrubv_item(mp, &head, i, v);
}
/* Run all the scrubbers. */
for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
struct xfs_scrub_metadata sm = {
.sm_type = v->sv_type,
.sm_flags = v->sv_flags,
.sm_ino = head.svh_ino,
.sm_gen = head.svh_gen,
.sm_agno = head.svh_agno,
};
if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
if (v->sv_ret) {
trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
break;
}
continue;
}
v->sv_ret = xfs_scrub_metadata(file, &sm);
v->sv_flags = sm.sm_flags;
trace_xchk_scrubv_outcome(mp, &head, i, v);
if (head.svh_rest_us) {
ktime_t expires;
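			/* svh_rest_us is in microseconds; convert to ns. */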
expires = ktime_add_ns(ktime_get(),
head.svh_rest_us * 1000);
set_current_state(TASK_KILLABLE);
schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
}
if (fatal_signal_pending(current)) {
error = -EINTR;
goto out_free;
}
}
if (copy_to_user(uvectors, vectors, vec_bytes) ||
copy_to_user(uhead, &head, sizeof(head))) {
error = -EFAULT;
goto out_free;
}
out_free:
kfree(vectors);
return error;
}

fs/xfs/scrub/scrub.h
@@ -8,6 +8,49 @@
struct xfs_scrub;
struct xchk_relax {
unsigned long next_resched;
unsigned int resched_nr;
bool interruptible;
};
/* Yield to the scheduler at most 10x per second. */
#define XCHK_RELAX_NEXT (jiffies + (HZ / 10))
#define INIT_XCHK_RELAX \
(struct xchk_relax){ \
.next_resched = XCHK_RELAX_NEXT, \
.resched_nr = 0, \
.interruptible = true, \
}
/*
* Relax during a scrub operation and exit if there's a fatal signal pending.
*
* If preemption is disabled, we need to yield to the scheduler every now and
* then so that we don't run afoul of the soft lockup watchdog or RCU stall
* detector. cond_resched calls are somewhat expensive (~5ns) so we want to
* ratelimit this to 10x per second. Amortize the cost of the other checks by
* only doing it once every 100 calls.
*/
static inline int xchk_maybe_relax(struct xchk_relax *widget)
{
/* Amortize the cost of scheduling and checking signals. */
if (likely(++widget->resched_nr < 100))
return 0;
widget->resched_nr = 0;
if (unlikely(widget->next_resched <= jiffies)) {
cond_resched();
widget->next_resched = XCHK_RELAX_NEXT;
}
if (widget->interruptible && fatal_signal_pending(current))
return -EINTR;
return 0;
}
/*
* Standard flags for allocating memory within scrub. NOFS context is
* configured by the process allocation scope. Scrub and repair must be able
@@ -123,6 +166,9 @@ struct xfs_scrub {
*/
unsigned int sick_mask;
/* next time we want to cond_resched() */
struct xchk_relax relax;
/* State tracking for single-AG operations. */
struct xchk_ag sa;
};
@@ -167,6 +213,24 @@ struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc,
unsigned int subtype);
void xchk_scrub_free_subord(struct xfs_scrub_subord *sub);
/*
* We /could/ terminate a scrub/repair operation early. If we're not
* in a good place to continue (fatal signal, etc.) then bail out.
* Note that we're careful not to make any judgements about *error.
*/
static inline bool
xchk_should_terminate(
struct xfs_scrub *sc,
int *error)
{
if (xchk_maybe_relax(&sc->relax)) {
if (*error == 0)
*error = -EINTR;
return true;
}
return false;
}
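
The caller pattern is unchanged; a hypothetical sketch (xchk_has_more_records and xchk_check_one_record are made-up names, but real scrubbers poll this helper once per record in the same way):

	int			error = 0;

	while (xchk_has_more_records(sc)) {	/* hypothetical iterator */
		if (xchk_should_terminate(sc, &error))
			break;		/* error is now -EINTR if it was 0 */
		error = xchk_check_one_record(sc);	/* hypothetical */
		if (error)
			break;
	}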
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);

fs/xfs/scrub/trace.h
@@ -69,6 +69,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -99,7 +100,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
{ XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \
{ XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
{ XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \
-	{ XFS_SCRUB_TYPE_DIRTREE,	"dirtree" }
+	{ XFS_SCRUB_TYPE_DIRTREE,	"dirtree" }, \
+	{ XFS_SCRUB_TYPE_BARRIER,	"barrier" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -208,6 +210,81 @@ DEFINE_EVENT(xchk_fsgate_class, name, \
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable);
DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable);
DECLARE_EVENT_CLASS(xchk_vector_head_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead),
TP_ARGS(ip, vhead),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_agnumber_t, agno)
__field(xfs_ino_t, inum)
__field(unsigned int, gen)
__field(unsigned int, flags)
__field(unsigned short, rest_us)
__field(unsigned short, nr_vecs)
),
TP_fast_assign(
__entry->dev = ip->i_mount->m_super->s_dev;
__entry->ino = ip->i_ino;
__entry->agno = vhead->svh_agno;
__entry->inum = vhead->svh_ino;
__entry->gen = vhead->svh_gen;
__entry->flags = vhead->svh_flags;
__entry->rest_us = vhead->svh_rest_us;
__entry->nr_vecs = vhead->svh_nr;
),
TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->agno,
__entry->inum,
__entry->gen,
__entry->flags,
__entry->rest_us,
__entry->nr_vecs)
)
#define DEFINE_SCRUBV_HEAD_EVENT(name) \
DEFINE_EVENT(xchk_vector_head_class, name, \
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \
TP_ARGS(ip, vhead))
DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start);
DECLARE_EVENT_CLASS(xchk_vector_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead,
unsigned int vec_nr, struct xfs_scrub_vec *v),
TP_ARGS(mp, vhead, vec_nr, v),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, vec_nr)
__field(unsigned int, vec_type)
__field(unsigned int, vec_flags)
__field(int, vec_ret)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->vec_nr = vec_nr;
__entry->vec_type = v->sv_type;
__entry->vec_flags = v->sv_flags;
__entry->vec_ret = v->sv_ret;
),
TP_printk("dev %d:%d vec[%u] type %s flags %s ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->vec_nr,
__print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS),
__print_flags(__entry->vec_flags, "|", XFS_SCRUB_FLAG_STRINGS),
__entry->vec_ret)
)
#define DEFINE_SCRUBV_EVENT(name) \
DEFINE_EVENT(xchk_vector_class, name, \
TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \
unsigned int vec_nr, struct xfs_scrub_vec *v), \
TP_ARGS(mp, vhead, vec_nr, v))
DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail);
DEFINE_SCRUBV_EVENT(xchk_scrubv_item);
DEFINE_SCRUBV_EVENT(xchk_scrubv_outcome);
TRACE_EVENT(xchk_op_error,
TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int error, void *ret_ip),

fs/xfs/scrub/xfarray.c
@@ -7,9 +7,9 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
/*
@@ -486,6 +486,9 @@ xfarray_sortinfo_alloc(
xfarray_sortinfo_lo(si)[0] = 0;
xfarray_sortinfo_hi(si)[0] = array->nr - 1;
si->relax = INIT_XCHK_RELAX;
if (flags & XFARRAY_SORT_KILLABLE)
si->relax.interruptible = false;
trace_xfarray_sort(si, nr_bytes);
*infop = si;
@@ -503,10 +506,7 @@ xfarray_sort_terminated(
* few seconds so that we don't run afoul of the soft lockup watchdog
* or RCU stall detector.
*/
-	cond_resched();
-
-	if ((si->flags & XFARRAY_SORT_KILLABLE) &&
-	    fatal_signal_pending(current)) {
+	if (xchk_maybe_relax(&si->relax)) {
if (*error == 0)
*error = -EINTR;
return true;

fs/xfs/scrub/xfarray.h
@@ -127,6 +127,9 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
/* next time we want to cond_resched() */
struct xchk_relax relax;
/* Cache a folio here for faster scanning for pivots */
struct folio *folio;

fs/xfs/scrub/xfile.c
@@ -10,9 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>

fs/xfs/scrub/xfs_scrub.h
@@ -7,9 +7,11 @@
#define __XFS_SCRUB_H__
#ifndef CONFIG_XFS_ONLINE_SCRUB
# define xfs_scrub_metadata(file, sm) (-ENOTTY)
# define xfs_ioc_scrub_metadata(f, a) (-ENOTTY)
# define xfs_ioc_scrubv_metadata(f, a) (-ENOTTY)
#else
int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
int xfs_ioc_scrub_metadata(struct file *file, void __user *arg);
int xfs_ioc_scrubv_metadata(struct file *file, void __user *arg);
#endif /* CONFIG_XFS_ONLINE_SCRUB */
#endif /* __XFS_SCRUB_H__ */

fs/xfs/xfs_ioctl.c
@@ -1055,30 +1055,6 @@ xfs_ioc_getfsmap(
return error;
}
-STATIC int
-xfs_ioc_scrub_metadata(
-	struct file		*file,
-	void __user		*arg)
-{
-	struct xfs_scrub_metadata	scrub;
-	int				error;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (copy_from_user(&scrub, arg, sizeof(scrub)))
-		return -EFAULT;
-
-	error = xfs_scrub_metadata(file, &scrub);
-	if (error)
-		return error;
-
-	if (copy_to_user(arg, &scrub, sizeof(scrub)))
-		return -EFAULT;
-	return 0;
-}
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -1437,6 +1413,8 @@ xfs_file_ioctl(
case FS_IOC_GETFSMAP:
return xfs_ioc_getfsmap(ip, arg);
case XFS_IOC_SCRUBV_METADATA:
return xfs_ioc_scrubv_metadata(filp, arg);
case XFS_IOC_SCRUB_METADATA:
return xfs_ioc_scrub_metadata(filp, arg);