Commit e32cde8d authored by Linus Torvalds

Merge tag 'sched_ext-for-6.12-rc1-fixes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - When sched_ext is in bypass mode (e.g. while disabling the BPF
   scheduler), it was using one DSQ to implement global FIFO scheduling,
   as all it has to do is guarantee reasonable forward progress.

   On multi-socket machines, this can lead to live-lock conditions under
   certain workloads. Fixed by splitting the queue used for FIFO
   scheduling per NUMA node (a sketch of the idea follows this list).
   This required several preparation patches.

 - Hotplug tests on powerpc could reliably trigger deadlock while
   enabling a BPF scheduler.

   This was caused by cpu_hotplug_lock nesting inside scx_fork_rwsem,
   with the CPU hotplug path then trying to fork a new thread while
   holding cpu_hotplug_lock.

   Fixed by restructuring locking in enable and disable paths so that
   the two locks are not coupled. This required several preparation
   patches which also fixed a couple other issues in the enable path.

 - A build fix for !CONFIG_SMP

 - Userspace tooling sync and updates
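
A minimal sketch of the per-node idea above, for illustration only (this is
not the in-kernel bypass implementation): create one user DSQ per NUMA node
at init, queue each task on its node's DSQ, and let each CPU consume only
its own node's DSQ. NR_NODES, MAX_CPUS, cpu_node[] and the pernode_* names
are hypothetical stand-ins; the scx_bpf_*() calls are the regular sched_ext
kfuncs declared in tools/sched_ext/include/scx/common.bpf.h.

/* Sketch only: the bypass-mode global FIFO split into one DSQ per NUMA node. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define NR_NODES	2			/* hypothetical node count */
#define MAX_CPUS	512

/* hypothetical CPU -> node table, filled in by the userspace loader */
const volatile u32 cpu_node[MAX_CPUS];

static u64 node_dsq(s32 cpu)
{
	/* user DSQ IDs 0 .. NR_NODES - 1, one per node */
	return (cpu >= 0 && cpu < MAX_CPUS) ? cpu_node[cpu] : 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(pernode_init)
{
	s32 node, ret;

	for (node = 0; node < NR_NODES; node++) {
		/* create the node's DSQ on that node's memory */
		ret = scx_bpf_create_dsq(node, node);
		if (ret)
			return ret;
	}
	return 0;
}

void BPF_STRUCT_OPS(pernode_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* FIFO-queue the task on the DSQ of the node it last ran on */
	scx_bpf_dispatch(p, node_dsq(scx_bpf_task_cpu(p)), SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(pernode_dispatch, s32 cpu, struct task_struct *prev)
{
	/* each CPU consumes only its own node's DSQ: no cross-node queue contention */
	scx_bpf_consume(node_dsq(cpu));
}

SCX_OPS_DEFINE(pernode_ops,
	       .enqueue		= (void *)pernode_enqueue,
	       .dispatch	= (void *)pernode_dispatch,
	       .init		= (void *)pernode_init,
	       .name		= "pernode_fifo");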

* tag 'sched_ext-for-6.12-rc1-fixes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Remove redundant p->nr_cpus_allowed checker
  sched_ext: Decouple locks in scx_ops_enable()
  sched_ext: Decouple locks in scx_ops_disable_workfn()
  sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()
  sched_ext: Enable scx_ops_init_task() separately
  sched_ext: Fix SCX_TASK_INIT -> SCX_TASK_READY transitions in scx_ops_enable()
  sched_ext: Initialize in bypass mode
  sched_ext: Remove SCX_OPS_PREPPING
  sched_ext: Relocate check_hotplug_seq() call in scx_ops_enable()
  sched_ext: Use shorter slice while bypassing
  sched_ext: Split the global DSQ per NUMA node
  sched_ext: Relocate find_user_dsq()
  sched_ext: Allow only user DSQs for scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new()
  scx_flatcg: Use a user DSQ for fallback instead of SCX_DSQ_GLOBAL
  tools/sched_ext: Receive misc updates from SCX repo
  sched_ext: Add __COMPAT helpers for features added during v6.12 devel cycle
  sched_ext: Build fix for !CONFIG_SMP
parents 190ecde7 95b87369
@@ -7,7 +7,13 @@
 #ifndef __SCX_COMMON_BPF_H
 #define __SCX_COMMON_BPF_H
 
+#ifdef LSP
+#define __bpf__
+#include "../vmlinux/vmlinux.h"
+#else
 #include "vmlinux.h"
+#endif
+
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <asm-generic/errno.h>
@@ -309,6 +315,15 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym
 u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
 				   const struct cpumask *src2) __ksym;
+u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym;
+
+/*
+ * Access a cpumask in read-only mode (typically to check bits).
+ */
+const struct cpumask *cast_mask(struct bpf_cpumask *mask)
+{
+	return (const struct cpumask *)mask;
+}
 
 /* rcu */
 void bpf_rcu_read_lock(void) __ksym;
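
The cast_mask() helper added above exists because BPF-side masks are
struct bpf_cpumask * while many cpumask kfuncs take const struct cpumask *.
A short usage sketch; probe_cpu0() is illustrative, the kfuncs are the ones
declared in this header:

/* Sketch: convert a bpf_cpumask before handing it to a cpumask kfunc. */
static bool probe_cpu0(void)
{
	struct bpf_cpumask *mask = bpf_cpumask_create();
	bool ret;

	if (!mask)
		return false;

	bpf_cpumask_set_cpu(0, mask);
	/* bpf_cpumask_test_cpu() wants const struct cpumask *, hence cast_mask() */
	ret = bpf_cpumask_test_cpu(0, cast_mask(mask));
	bpf_cpumask_release(mask);
	return ret;
}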
@@ -15,6 +15,25 @@
 	__ret;								\
 })
 
+/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
+#define __COMPAT_scx_bpf_task_cgroup(p)					\
+	(bpf_ksym_exists(scx_bpf_task_cgroup) ?				\
+	 scx_bpf_task_cgroup((p)) : NULL)
+
+/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */
+#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice)	\
+	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ?	\
+	 scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0)
+#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime)	\
+	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ?	\
+	 scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0)
+#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags)	\
+	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ?			\
+	 scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
+#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \
+	(bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ?		\
+	 scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
+
 /*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
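
The wrappers above use bpf_ksym_exists() so a single BPF binary can load on
kernels both with and without the v6.12 kfuncs, degrading to NULL, false or
a no-op when a kfunc is missing. A hedged usage sketch; the example_runnable
callback name is illustrative:

/* Sketch: a cgroup lookup that still loads on kernels without cgroup support. */
void BPF_STRUCT_OPS(example_runnable, struct task_struct *p, u64 enq_flags)
{
	/* NULL when the kernel does not provide scx_bpf_task_cgroup() */
	struct cgroup *cgrp = __COMPAT_scx_bpf_task_cgroup(p);

	if (!cgrp)
		return;

	/* ... consult per-cgroup state here ... */

	bpf_cgroup_release(cgrp);
}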
@@ -25,7 +25,11 @@ struct user_exit_info {
 
 #ifdef __bpf__
 
+#ifdef LSP
+#include "../vmlinux/vmlinux.h"
+#else
 #include "vmlinux.h"
+#endif
 #include <bpf/bpf_core_read.h>
 
 #define UEI_DEFINE(__name)						\
@@ -49,7 +49,10 @@
 /*
  * Maximum amount of retries to find a valid cgroup.
  */
-#define CGROUP_MAX_RETRIES 1024
+enum {
+	FALLBACK_DSQ		= 0,
+	CGROUP_MAX_RETRIES	= 1024,
+};
 
 char _license[] SEC("license") = "GPL";
@@ -225,7 +228,7 @@ static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
 			break;
 
 		/*
-		 * We can be oppotunistic here and not grab the
+		 * We can be opportunistic here and not grab the
 		 * cgv_tree_lock and deal with the occasional races.
 		 * However, hweight updates are already cached and
 		 * relatively low-frequency. Let's just do the
@@ -258,8 +261,7 @@ static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
 	 * and thus can't be updated and repositioned. Instead, we collect the
 	 * vtime deltas separately and apply it asynchronously here.
 	 */
-	delta = cgc->cvtime_delta;
-	__sync_fetch_and_sub(&cgc->cvtime_delta, delta);
+	delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta);
 	cvtime = cgv_node->cvtime + delta;
 
 	/*
@@ -378,12 +380,12 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
 			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
 		} else {
 			stat_inc(FCG_STAT_GLOBAL);
-			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+			scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags);
 		}
 		return;
 	}
 
-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (!cgc)
 		goto out_release;
@@ -509,7 +511,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
 {
 	struct cgroup *cgrp;
 
-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	update_active_weight_sums(cgrp, true);
 	bpf_cgroup_release(cgrp);
 }
@@ -522,7 +524,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
 	if (fifo_sched)
 		return;
 
-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (cgc) {
 		/*
@@ -565,7 +567,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
 	if (!taskc->bypassed_at)
 		return;
 
-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (cgc) {
 		__sync_fetch_and_add(&cgc->cvtime_delta,
@@ -579,7 +581,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
 {
 	struct cgroup *cgrp;
 
-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	update_active_weight_sums(cgrp, false);
 	bpf_cgroup_release(cgrp);
 }
@@ -781,7 +783,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 pick_next_cgroup:
 	cpuc->cur_at = now;
 
-	if (scx_bpf_consume(SCX_DSQ_GLOBAL)) {
+	if (scx_bpf_consume(FALLBACK_DSQ)) {
 		cpuc->cur_cgid = 0;
 		return;
 	}
@@ -838,7 +840,7 @@ int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
 	int ret;
 
 	/*
-	 * Technically incorrect as cgroup ID is full 64bit while dq ID is
+	 * Technically incorrect as cgroup ID is full 64bit while dsq ID is
 	 * 63bit. Should not be a problem in practice and easy to spot in the
 	 * unlikely case that it breaks.
 	 */
@@ -926,6 +928,11 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
 	p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
 }
 
+s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
+{
+	return scx_bpf_create_dsq(FALLBACK_DSQ, -1);
+}
+
 void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
 {
 	UEI_RECORD(uei, ei);
@@ -944,6 +951,7 @@ SCX_OPS_DEFINE(flatcg_ops,
 	       .cgroup_init = (void *)fcg_cgroup_init,
 	       .cgroup_exit = (void *)fcg_cgroup_exit,
 	       .cgroup_move = (void *)fcg_cgroup_move,
+	       .init = (void *)fcg_init,
 	       .exit = (void *)fcg_exit,
 	       .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
 	       .name = "flatcg");
@@ -318,11 +318,11 @@ static bool dispatch_highpri(bool from_timer)
 		if (tctx->highpri) {
 			/* exercise the set_*() and vtime interface too */
-			scx_bpf_dispatch_from_dsq_set_slice(
+			__COMPAT_scx_bpf_dispatch_from_dsq_set_slice(
 				BPF_FOR_EACH_ITER, slice_ns * 2);
-			scx_bpf_dispatch_from_dsq_set_vtime(
+			__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(
 				BPF_FOR_EACH_ITER, highpri_seq++);
-			scx_bpf_dispatch_vtime_from_dsq(
+			__COMPAT_scx_bpf_dispatch_vtime_from_dsq(
 				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
 		}
 	}
@@ -340,9 +340,9 @@ static bool dispatch_highpri(bool from_timer)
 		else
 			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 
-		if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
-					      SCX_DSQ_LOCAL_ON | cpu,
-					      SCX_ENQ_PREEMPT)) {
+		if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
+						       SCX_DSQ_LOCAL_ON | cpu,
+						       SCX_ENQ_PREEMPT)) {
 			if (cpu == this_cpu) {
 				dispatched = true;
 				__sync_fetch_and_add(&nr_expedited_local, 1);