scx: Allow calling sleepable kfuncs from BPF_PROG_TYPE_SYSCALL

We currently only allow calling sleepable scx kfuncs (i.e. scx_bpf_create_dsq()) from BPF_PROG_TYPE_STRUCT_OPS progs. The idea here was that we'd never have to call scx_bpf_create_dsq() outside of a sched_ext struct_ops callback, but that might not actually be true. For example, a scheduler could do something like the following: 1. Open and load (not yet attach) a scheduler skel 2. Synchronously call into a BPF_PROG_TYPE_SYSCALL prog from user space. For example, to initialize an LLC domain, or some other global, read-only state. 3. Attach the skel, which actually enables the scheduler The advantage of doing this is that it can preclude having to do pretty ugly boilerplate like initializing a read-only, statically sized array of u64[]'s which the kernel consumes literally once at init time to then create struct bpf_cpumask objects which are actually queried at runtime. Doing the above is already possible given that we can invoke core BPF kfuncs, such as bpf_cpumask_create(), from BPF_PROG_TYPE_SYSCALL progs. We already allow many scx kfuncs to be called from BPF_PROG_TYPE_SYSCALL progs (e.g. scx_bpf_kick_cpu()). Let's allow the sleepable kfuncs as well. Signed-off-by: David Vernet <void@manifault.com> Signed-off-by: Tejun Heo <tj@kernel.org>

scx: Allow calling sleepable kfuncs from BPF_PROG_TYPE_SYSCALL
We currently only allow calling sleepable scx kfuncs (i.e. scx_bpf_create_dsq()) from BPF_PROG_TYPE_STRUCT_OPS progs. The idea here was that we'd never have to call scx_bpf_create_dsq() outside of a sched_ext struct_ops callback, but that might not actually be true. For example, a scheduler could do something like the following: 1. Open and load (not yet attach) a scheduler skel 2. Synchronously call into a BPF_PROG_TYPE_SYSCALL prog from user space. For example, to initialize an LLC domain, or some other global, read-only state. 3. Attach the skel, which actually enables the scheduler The advantage of doing this is that it can preclude having to do pretty ugly boilerplate like initializing a read-only, statically sized array of u64[]'s which the kernel consumes literally once at init time to then create struct bpf_cpumask objects which are actually queried at runtime. Doing the above is already possible given that we can invoke core BPF kfuncs, such as bpf_cpumask_create(), from BPF_PROG_TYPE_SYSCALL progs. We already allow many scx kfuncs to be called from BPF_PROG_TYPE_SYSCALL progs (e.g. scx_bpf_kick_cpu()). Let's allow the sleepable kfuncs as well. Signed-off-by: David Vernet <void@manifault.com> Signed-off-by: Tejun Heo <tj@kernel.org>
298dec19 · David Vernet · Tejun Heo · c8faf11c · 298dec19 · 298dec19
Commit 298dec19 authored Jul 31, 2024 by David Vernet Committed by Tejun Heo Jul 31, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 24 deletions

include/linux/sched/ext.h include/linux/sched/ext.h +6 -8

kernel/sched/ext.c kernel/sched/ext.c +11 -16

No files found.
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -106,16 +106,14 @@ enum scx_ent_dsq_flags {
 * mechanism. See scx_kf_allow().
 */
 enum scx_kf_mask {
-	SCX_KF_UNLOCKED		= 0,	  /* not sleepable, not rq locked */
-	/* all non-sleepables may be nested inside SLEEPABLE */
-	SCX_KF_SLEEPABLE	= 1 << 0, /* sleepable init operations */
+	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
 	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
-	SCX_KF_CPU_RELEASE	= 1 << 1, /* ops.cpu_release() */
+	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
 	/* ops.dequeue (in REST) may be nested inside DISPATCH */
-	SCX_KF_DISPATCH		= 1 << 2, /* ops.dispatch() */
-	SCX_KF_ENQUEUE		= 1 << 3, /* ops.enqueue() and ops.select_cpu() */
-	SCX_KF_SELECT_CPU	= 1 << 4, /* ops.select_cpu() */
-	SCX_KF_REST		= 1 << 5, /* other rq-locked operations */
+	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
+	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
+	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
+	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */

 	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
 				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,

--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1029,16 +1029,12 @@ static __always_inline bool scx_kf_allowed(u32 mask)
 		return false;
 	}

-	if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) {
-		scx_ops_error("sleepable kfunc called from non-sleepable context");
-		return false;
-	}
-
 	/*
 	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
 	 * DISPATCH must not be called if we're running DEQUEUE which is nested
-	 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
-	 * boundary thanks to the above in_interrupt() check.
+	 * inside ops.dispatch(). We don't need to check boundaries for any
+	 * blocking kfuncs as the verifier ensures they're only called from
+	 * sleepable progs.
 	 */
 	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
 		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
@@ -3224,9 +3220,9 @@ static void handle_hotplug(struct rq *rq, bool online)
 	atomic_long_inc(&scx_hotplug_seq);

 	if (online && SCX_HAS_OP(cpu_online))
-		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
 	else if (!online && SCX_HAS_OP(cpu_offline))
-		SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu);
+		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
 	else
 		scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
 			     "cpu %d going %s, exiting scheduler", cpu,
@@ -3390,7 +3386,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
 			.fork = fork,
 		};

-		ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args);
+		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
 		if (unlikely(ret)) {
 			ret = ops_sanitize_err("init_task", ret);
 			return ret;
@@ -4648,7 +4644,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	cpus_read_lock();

 	if (scx_ops.init) {
-		ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init);
+		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
 			goto err_disable_unlock_cpus;
@@ -5424,14 +5420,11 @@ __bpf_kfunc_start_defs();
 * @dsq_id: DSQ to create
 * @node: NUMA node to allocate from
 *
- * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and
- * ops.init_task().
+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
 */
 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 {
-	if (!scx_kf_allowed(SCX_KF_SLEEPABLE))
-		return -EINVAL;
-
 	if (unlikely(node >= (int)nr_node_ids ||
 		     (node < 0 && node != NUMA_NO_NODE)))
 		return -EINVAL;
@@ -6490,6 +6483,8 @@ static int __init scx_init(void)
 	 */
 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_sleepable)) ||
+	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+					     &scx_kfunc_set_sleepable)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_select_cpu)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,