Commit 88264981 authored by Linus Torvalds

Merge tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext support from Tejun Heo:
 "This implements a new scheduler class called ‘ext_sched_class’, or
  sched_ext, which allows scheduling policies to be implemented as BPF
  programs.

  The goals of this are:

   - Ease of experimentation and exploration: Enabling rapid iteration
     of new scheduling policies.

   - Customization: Building application-specific schedulers which
     implement policies that are not applicable to general-purpose
     schedulers.

   - Rapid scheduler deployments: Non-disruptive swap outs of scheduling
     policies in production environments"

See individual commits for more documentation, but also the cover letter
for the latest series:

Link: https://lore.kernel.org/all/20240618212056.2833381-1-tj@kernel.org/

* tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (110 commits)
  sched: Move update_other_load_avgs() to kernel/sched/pelt.c
  sched_ext: Don't trigger ops.quiescent/runnable() on migrations
  sched_ext: Synchronize bypass state changes with rq lock
  scx_qmap: Implement highpri boosting
  sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()
  sched_ext: Compact struct bpf_iter_scx_dsq_kern
  sched_ext: Replace consume_local_task() with move_local_task_to_local_dsq()
  sched_ext: Move consume_local_task() upward
  sched_ext: Move sanity check and dsq_mod_nr() into task_unlink_from_dsq()
  sched_ext: Reorder args for consume_local/remote_task()
  sched_ext: Restructure dispatch_to_local_dsq()
  sched_ext: Fix processs_ddsp_deferred_locals() by unifying DTL_INVALID handling
  sched_ext: Make find_dsq_for_dispatch() handle SCX_DSQ_LOCAL_ON
  sched_ext: Refactor consume_remote_task()
  sched_ext: Rename scx_kfunc_set_sleepable to unlocked and relocate
  sched_ext: Add missing static to scx_dump_data
  sched_ext: Add missing static to scx_has_op[]
  sched_ext: Temporarily work around pick_task_scx() being called without balance_scx()
  sched_ext: Add a cgroup scheduler which uses flattened hierarchy
  sched_ext: Add cgroup support
  ...
parents 440b6523 902d67a2
@@ -21,6 +21,7 @@ Scheduler
    sched-nice-design
    sched-rt-group
    sched-stats
+   sched-ext
    sched-debug
    text_files
......
This diff is collapsed.
@@ -20511,6 +20511,19 @@ F: include/linux/wait.h
F:	include/uapi/linux/sched.h
F:	kernel/sched/

+SCHEDULER - SCHED_EXT
+R:	Tejun Heo <tj@kernel.org>
+R:	David Vernet <void@manifault.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+W:	https://github.com/sched-ext/scx
+T:	git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
+F:	include/linux/sched/ext.h
+F:	kernel/sched/ext.h
+F:	kernel/sched/ext.c
+F:	tools/sched_ext/
+F:	tools/testing/selftests/sched_ext
+
SCIOSENSE ENS160 MULTI-GAS SENSOR DRIVER
M:	Gustavo Silva <gustavograzs@gmail.com>
S:	Maintained
......
@@ -531,6 +531,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = {
	NULL,				/* P */
	NULL,				/* Q */
	&sysrq_replay_logs_op,		/* R */
+	/* S: May be registered by sched_ext for resetting */
	NULL,				/* S */
	NULL,				/* T */
	NULL,				/* U */
......
@@ -133,6 +133,7 @@
	*(__dl_sched_class)		\
	*(__rt_sched_class)		\
	*(__fair_sched_class)		\
+	*(__ext_sched_class)		\
	*(__idle_sched_class)		\
	__sched_class_lowest = .;
......
@@ -29,8 +29,6 @@
struct kernel_clone_args;

-#ifdef CONFIG_CGROUPS
-
/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values. The default value is the logarithmic center of
@@ -40,6 +38,8 @@ struct kernel_clone_args;
#define CGROUP_WEIGHT_DFL		100
#define CGROUP_WEIGHT_MAX		10000

+#ifdef CONFIG_CGROUPS
+
enum {
	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
......
@@ -82,6 +82,8 @@ struct task_group;
struct task_struct;
struct user_event_mm;

+#include <linux/sched/ext.h>
+
/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
@@ -830,6 +832,9 @@ struct task_struct {
	struct sched_rt_entity		rt;
	struct sched_dl_entity		dl;
	struct sched_dl_entity		*dl_server;
+#ifdef CONFIG_SCHED_CLASS_EXT
+	struct sched_ext_entity		scx;
+#endif
	const struct sched_class	*sched_class;

#ifdef CONFIG_SCHED_CORE
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H
#ifdef CONFIG_SCHED_CLASS_EXT
#include <linux/llist.h>
#include <linux/rhashtable-types.h>
enum scx_public_consts {
SCX_OPS_NAME_LEN = 128,
SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
};
/*
* DSQ (dispatch queue) IDs are 64bit of the format:
*
* Bits: [63] [62 .. 0]
* [ B] [ ID ]
*
* B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
* ID: 63 bit ID
*
* Built-in IDs:
*
* Bits: [63] [62] [61..32] [31 .. 0]
* [ 1] [ L] [ R ] [ V ]
*
* 1: 1 for built-in DSQs.
* L: 1 for LOCAL_ON DSQ IDs, 0 for others
* V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
*/
enum scx_dsq_id_flags {
SCX_DSQ_FLAG_BUILTIN = 1LLU << 63,
SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62,
SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
};
/*
* A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
* queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
* buffer between the scheduler core and the BPF scheduler. See the
* documentation for more details.
*/
struct scx_dispatch_q {
raw_spinlock_t lock;
struct list_head list; /* tasks in dispatch order */
struct rb_root priq; /* used to order by p->scx.dsq_vtime */
u32 nr;
u32 seq; /* used by BPF iter */
u64 id;
struct rhash_head hash_node;
struct llist_node free_node;
struct rcu_head rcu;
};
/* scx_entity.flags */
enum scx_ent_flags {
SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
SCX_TASK_STATE_SHIFT	= 8,		/* bits 8 and 9 are used to carry scx_task_state */
SCX_TASK_STATE_BITS = 2,
SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */
};
/* scx_entity.flags & SCX_TASK_STATE_MASK */
enum scx_task_state {
SCX_TASK_NONE, /* ops.init_task() not called yet */
SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */
SCX_TASK_READY, /* fully initialized, but not in sched_ext */
SCX_TASK_ENABLED, /* fully initialized and in sched_ext */
SCX_TASK_NR_STATES,
};
/* scx_entity.dsq_flags */
enum scx_ent_dsq_flags {
SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */
};
/*
* Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
* everywhere and the following bits track which kfunc sets are currently
* allowed for %current. This simple per-task tracking works because SCX ops
* nest in a limited way. BPF will likely implement a way to allow and disallow
* kfuncs depending on the calling context which will replace this manual
* mechanism. See scx_kf_allow().
*/
enum scx_kf_mask {
SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */
/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */
/* ops.dequeue (in REST) may be nested inside DISPATCH */
SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */
SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */
SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */
SCX_KF_REST = 1 << 4, /* other rq-locked operations */
__SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
__SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};
enum scx_dsq_lnode_flags {
SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
/* high 16 bits can be for iter cursor flags */
__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};
struct scx_dsq_list_node {
struct list_head node;
u32 flags;
u32 priv; /* can be used by iter cursor */
};
/*
* The following is embedded in task_struct and contains all fields necessary
* for a task to be scheduled by SCX.
*/
struct sched_ext_entity {
struct scx_dispatch_q *dsq;
struct scx_dsq_list_node dsq_list; /* dispatch order */
struct rb_node dsq_priq; /* p->scx.dsq_vtime order */
u32 dsq_seq;
u32 dsq_flags; /* protected by DSQ lock */
u32 flags; /* protected by rq lock */
u32 weight;
s32 sticky_cpu;
s32 holding_cpu;
u32 kf_mask; /* see scx_kf_mask above */
struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
atomic_long_t ops_state;
struct list_head runnable_node; /* rq->scx.runnable_list */
unsigned long runnable_at;
#ifdef CONFIG_SCHED_CORE
u64 core_sched_at; /* see scx_prio_less() */
#endif
u64 ddsp_dsq_id;
u64 ddsp_enq_flags;
/* BPF scheduler modifiable fields */
/*
* Runtime budget in nsecs. This is usually set through
* scx_bpf_dispatch() but can also be modified directly by the BPF
* scheduler. Automatically decreased by SCX as the task executes. On
* depletion, a scheduling event is triggered.
*
* This value is cleared to zero if the task is preempted by
* %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
* task ran. Use p->se.sum_exec_runtime instead.
*/
u64 slice;
/*
* Used to order tasks when dispatching to the vtime-ordered priority
* queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
* but can also be modified directly by the BPF scheduler. Modifying it
* while a task is queued on a dsq may mangle the ordering and is not
* recommended.
*/
u64 dsq_vtime;
/*
* If set, reject future sched_setscheduler(2) calls updating the policy
* to %SCHED_EXT with -%EACCES.
*
* Can be set from ops.init_task() while the BPF scheduler is being
* loaded (!scx_init_task_args->fork). If set and the task's policy is
* already %SCHED_EXT, the task's policy is rejected and forcefully
* reverted to %SCHED_NORMAL. The number of such events is reported
* through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
* during fork is not allowed.
*/
bool disallow; /* reject switching into SCX */
/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
struct cgroup *cgrp_moving_from;
#endif
/* must be the last field, see init_scx_entity() */
struct list_head tasks_node;
};
void sched_ext_free(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
#else /* !CONFIG_SCHED_CLASS_EXT */
static inline void sched_ext_free(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
#endif /* _LINUX_SCHED_EXT_H */
@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_cancel_fork(struct task_struct *p);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
@@ -119,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
	return t;
}

+static inline struct task_struct *tryget_task_struct(struct task_struct *t)
+{
+	return refcount_inc_not_zero(&t->usage) ? t : NULL;
+}
+
extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
......
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sched_ext
#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCHED_EXT_H
#include <linux/tracepoint.h>
TRACE_EVENT(sched_ext_dump,
TP_PROTO(const char *line),
TP_ARGS(line),
TP_STRUCT__entry(
__string(line, line)
),
TP_fast_assign(
__assign_str(line);
),
TP_printk("%s",
__get_str(line)
)
);
#endif /* _TRACE_SCHED_EXT_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
@@ -118,6 +118,7 @@ struct clone_args {
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE		5
#define SCHED_DEADLINE		6
+#define SCHED_EXT		7

/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK     0x40000000
......
@@ -1025,9 +1025,13 @@ menuconfig CGROUP_SCHED
	  tasks.

if CGROUP_SCHED
+config GROUP_SCHED_WEIGHT
+	def_bool n
+
config FAIR_GROUP_SCHED
	bool "Group scheduling for SCHED_OTHER"
	depends on CGROUP_SCHED
+	select GROUP_SCHED_WEIGHT
	default CGROUP_SCHED

config CFS_BANDWIDTH
@@ -1052,6 +1056,12 @@ config RT_GROUP_SCHED
	  realtime bandwidth for them.
	  See Documentation/scheduler/sched-rt-group.rst for more information.

+config EXT_GROUP_SCHED
+	bool
+	depends on SCHED_CLASS_EXT && CGROUP_SCHED
+	select GROUP_SCHED_WEIGHT
+	default y
+
endif #CGROUP_SCHED

config SCHED_MM_CID
......
@@ -6,6 +6,7 @@
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
+#include <linux/sched/ext.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -98,6 +99,17 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#endif
#ifdef CONFIG_CGROUP_SCHED
	.sched_task_group = &root_task_group,
+#endif
+#ifdef CONFIG_SCHED_CLASS_EXT
+	.scx		= {
+		.dsq_list.node	= LIST_HEAD_INIT(init_task.scx.dsq_list.node),
+		.sticky_cpu	= -1,
+		.holding_cpu	= -1,
+		.runnable_node	= LIST_HEAD_INIT(init_task.scx.runnable_node),
+		.runnable_at	= INITIAL_JIFFIES,
+		.ddsp_dsq_id	= SCX_DSQ_INVALID,
+		.slice		= SCX_SLICE_DFL,
+	},
#endif
	.ptraced	= LIST_HEAD_INIT(init_task.ptraced),
	.ptrace_entry	= LIST_HEAD_INIT(init_task.ptrace_entry),
......
@@ -133,4 +133,29 @@ config SCHED_CORE
	  which is the likely usage by Linux distributions, there should
	  be no measurable impact on performance.

+config SCHED_CLASS_EXT
+	bool "Extensible Scheduling Class"
+	depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+	select STACKTRACE if STACKTRACE_SUPPORT
+	help
+	  This option enables a new scheduler class sched_ext (SCX), which
+	  allows scheduling policies to be implemented as BPF programs to
+	  achieve the following:
+
+	  - Ease of experimentation and exploration: Enabling rapid
+	    iteration of new scheduling policies.
+	  - Customization: Building application-specific schedulers which
+	    implement policies that are not applicable to general-purpose
+	    schedulers.
+	  - Rapid scheduler deployments: Non-disruptive swap outs of
+	    scheduling policies in production environments.
+
+	  sched_ext leverages BPF struct_ops feature to define a structure
+	  which exports function callbacks and flags to BPF programs that
+	  wish to implement scheduling policies. The struct_ops structure
+	  exported by sched_ext is struct sched_ext_ops, and is conceptually
+	  similar to struct sched_class.
+
+	  For more information:
+	  Documentation/scheduler/sched-ext.rst
+	  https://github.com/sched-ext/scx
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/ext.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -969,6 +970,7 @@ void __put_task_struct(struct task_struct *tsk)
	WARN_ON(refcount_read(&tsk->usage));
	WARN_ON(tsk == current);

+	sched_ext_free(tsk);
	io_uring_free(tsk);
	cgroup_free(tsk);
	task_numa_free(tsk, true);
@@ -2346,7 +2348,7 @@ __latent_entropy struct task_struct *copy_process(
	retval = perf_event_init_task(p, clone_flags);
	if (retval)
-		goto bad_fork_cleanup_policy;
+		goto bad_fork_sched_cancel_fork;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
@@ -2479,7 +2481,9 @@ __latent_entropy struct task_struct *copy_process(
	 * cgroup specific, it unconditionally needs to place the task on a
	 * runqueue.
	 */
-	sched_cgroup_fork(p, args);
+	retval = sched_cgroup_fork(p, args);
+	if (retval)
+		goto bad_fork_cancel_cgroup;

	/*
	 * From this point on we must avoid any synchronous user-space
@@ -2525,13 +2529,13 @@ __latent_entropy struct task_struct *copy_process(
	/* Don't start children in a dying pid namespace */
	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
		retval = -ENOMEM;
-		goto bad_fork_cancel_cgroup;
+		goto bad_fork_core_free;
	}

	/* Let kill terminate clone/fork in the middle */
	if (fatal_signal_pending(current)) {
		retval = -EINTR;
-		goto bad_fork_cancel_cgroup;
+		goto bad_fork_core_free;
	}

	/* No more failure paths after this point. */
@@ -2605,10 +2609,11 @@ __latent_entropy struct task_struct *copy_process(
	return p;

-bad_fork_cancel_cgroup:
+bad_fork_core_free:
	sched_core_free(p);
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
+bad_fork_cancel_cgroup:
	cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
	if (clone_flags & CLONE_PIDFD) {
@@ -2647,6 +2652,8 @@ __latent_entropy struct task_struct *copy_process(
	audit_free(p);
bad_fork_cleanup_perf:
	perf_event_free_task(p);
+bad_fork_sched_cancel_fork:
+	sched_cancel_fork(p);
bad_fork_cleanup_policy:
	lockdep_free_task(p);
#ifdef CONFIG_NUMA
......
@@ -16,18 +16,25 @@
#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h>

#include <linux/cpuidle.h>
#include <linux/jiffies.h>
+#include <linux/kobject.h>
#include <linux/livepatch.h>
+#include <linux/pm.h>
#include <linux/psi.h>
+#include <linux/rhashtable.h>
+#include <linux/seq_buf.h>
#include <linux/seqlock_api.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/tsacct_kern.h>
#include <linux/vtime.h>
+#include <linux/sysrq.h>
+#include <linux/percpu-rwsem.h>

#include <uapi/linux/sched/types.h>
@@ -52,4 +59,8 @@
#include "cputime.c"
#include "deadline.c"

+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext.c"
+#endif
+
#include "syscalls.c"
This diff is collapsed.
@@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,

static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
-	unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+	unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
+
+	if (!scx_switched_all())
+		util += cpu_util_cfs_boost(sg_cpu->cpu);

	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
	util = max(util, boost);
	sg_cpu->bw_min = min;
@@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
}

#ifdef CONFIG_NO_HZ_COMMON
-static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
-	unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
-	bool ret = idle_calls == sg_cpu->saved_idle_calls;
+	unsigned long idle_calls;
+	bool ret;
+
+	/*
+	 * The heuristics in this function is for the fair class. For SCX, the
+	 * performance target comes directly from the BPF scheduler. Let's just
+	 * follow it.
+	 */
+	if (scx_switched_all())
+		return false;
+
+	/* if capped by uclamp_max, always update to be in compliance */
+	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
+		return false;
+
+	/*
+	 * Maintain the frequency if the CPU has not been idle recently, as
+	 * reduction is likely to be premature.
+	 */
+	idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
+	ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
-static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
@@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
		return;

	next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
-	/*
-	 * Do not reduce the frequency if the CPU has not been idle
-	 * recently, as the reduction is likely to be premature then.
-	 *
-	 * Except when the rq is capped by uclamp_max.
-	 */
-	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+
+	if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
	    !sg_policy->need_freq_update) {
		next_f = sg_policy->next_freq;
@@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

-	/*
-	 * Do not reduce the target performance level if the CPU has not been
-	 * idle recently, as the reduction is likely to be premature then.
-	 *
-	 * Except when the rq is capped by uclamp_max.
-	 */
-	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+	if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
		sg_cpu->util = prev_util;

	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
......
@@ -1264,6 +1264,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
		P(dl.runtime);
		P(dl.deadline);
	}
+#ifdef CONFIG_SCHED_CLASS_EXT
+	__PS("ext.enabled", task_on_scx(p));
+#endif
#undef PN_SCHEDSTAT
#undef P_SCHEDSTAT
......
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
/*
* BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#ifdef CONFIG_SCHED_CLASS_EXT
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
int scx_fork(struct task_struct *p);
void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
bool scx_can_stop_tick(struct rq *rq);
void scx_rq_activate(struct rq *rq);
void scx_rq_deactivate(struct rq *rq);
int scx_check_setscheduler(struct task_struct *p, int policy);
bool task_should_scx(struct task_struct *p);
void init_sched_ext_class(void);
static inline u32 scx_cpuperf_target(s32 cpu)
{
if (scx_enabled())
return cpu_rq(cpu)->scx.cpuperf_target;
else
return 0;
}
static inline bool task_on_scx(const struct task_struct *p)
{
return scx_enabled() && p->sched_class == &ext_sched_class;
}
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
#endif
#else /* CONFIG_SCHED_CLASS_EXT */
static inline void scx_tick(struct rq *rq) {}
static inline void scx_pre_fork(struct task_struct *p) {}
static inline int scx_fork(struct task_struct *p) { return 0; }
static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
static inline void scx_rq_activate(struct rq *rq) {}
static inline void scx_rq_deactivate(struct rq *rq) {}
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
void __scx_update_idle(struct rq *rq, bool idle);
static inline void scx_update_idle(struct rq *rq, bool idle)
{
if (scx_enabled())
__scx_update_idle(rq, idle);
}
#else
static inline void scx_update_idle(struct rq *rq, bool idle) {}
#endif
#ifdef CONFIG_CGROUP_SCHED
#ifdef CONFIG_EXT_GROUP_SCHED
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
void scx_move_task(struct task_struct *p);
void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
#else /* CONFIG_EXT_GROUP_SCHED */
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
static inline void scx_move_task(struct task_struct *p) {}
static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
#endif /* CONFIG_CGROUP_SCHED */
@@ -3924,7 +3924,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
	}
}

-void reweight_task(struct task_struct *p, const struct load_weight *lw)
+static void reweight_task_fair(struct rq *rq, struct task_struct *p,
+			       const struct load_weight *lw)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -8807,7 +8808,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
	/*
	 * BATCH and IDLE tasks do not preempt others.
	 */
-	if (unlikely(p->policy != SCHED_NORMAL))
+	if (unlikely(!normal_policy(p->policy)))
		return;

	cfs_rq = cfs_rq_of(se);
@@ -9716,29 +9717,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {
static bool __update_blocked_others(struct rq *rq, bool *done)
{
-	const struct sched_class *curr_class;
-	u64 now = rq_clock_pelt(rq);
-	unsigned long hw_pressure;
-	bool decayed;
+	bool updated;

	/*
	 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
	 * DL and IRQ signals have been updated before updating CFS.
	 */
-	curr_class = rq->curr->sched_class;
-	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
-	/* hw_pressure doesn't care about invariance */
-	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
-		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-		  update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
-		  update_irq_load_avg(rq, 0);
+	updated = update_other_load_avgs(rq);

	if (others_have_blocked(rq))
		*done = false;

-	return decayed;
+	return updated;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13612,6 +13602,7 @@ DEFINE_SCHED_CLASS(fair) = {
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,
+	.reweight_task		= reweight_task_fair,
	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,
......
@@ -453,11 +453,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
	dl_server_update_idle_time(rq, prev);
+	scx_update_idle(rq, false);
}

static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
	update_idle_core(rq);
+	scx_update_idle(rq, true);
	schedstat_inc(rq->sched_goidle);
	next->se.exec_start = rq_clock_task(rq);
}
......
@@ -467,3 +467,23 @@ int update_irq_load_avg(struct rq *rq, u64 running)
	return ret;
}
#endif
+
+/*
+ * Load avg and utilization metrics need to be updated periodically and before
+ * consumption. This function updates the metrics for all subsystems except for
+ * the fair class. @rq must be locked and have its clock updated.
+ */
+bool update_other_load_avgs(struct rq *rq)
+{
+	u64 now = rq_clock_pelt(rq);
+	const struct sched_class *curr_class = rq->curr->sched_class;
+	unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+
+	lockdep_assert_rq_held(rq);
+
+	/* hw_pressure doesn't care about invariance */
+	return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+		update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+		update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
+		update_irq_load_avg(rq, 0);
+}
@@ -6,6 +6,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+bool update_other_load_avgs(struct rq *rq);

#ifdef CONFIG_SCHED_HW_PRESSURE
int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
......
This diff is collapsed.
@@ -612,6 +612,10 @@ int __sched_setscheduler(struct task_struct *p,
			goto unlock;
	}

+	retval = scx_check_setscheduler(p, policy);
+	if (retval)
+		goto unlock;
+
	/*
	 * If not changing anything there's no need to proceed further,
	 * but store a possible modification of reset_on_fork.
@@ -716,6 +720,7 @@ int __sched_setscheduler(struct task_struct *p,
		__setscheduler_prio(p, newprio);
	}
	__setscheduler_uclamp(p, attr);
+	check_class_changing(rq, p, prev_class);

	if (queued) {
		/*
@@ -1526,6 +1531,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
+	case SCHED_EXT:
		ret = 0;
		break;
	}
@@ -1553,6 +1559,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
+	case SCHED_EXT:
		ret = 0;
	}
	return ret;
......
@@ -73,6 +73,7 @@ void dump_stack_print_info(const char *log_lvl)
	print_worker_info(log_lvl, current);
	print_stop_info(log_lvl, current);
+	print_scx_info(log_lvl, current);
}

/**
......
@@ -28,6 +28,7 @@ help:
	@echo '  pci - PCI tools'
	@echo '  perf - Linux performance measurement and analysis tool'
	@echo '  selftests - various kernel selftests'
+	@echo '  sched_ext - sched_ext example schedulers'
	@echo '  bootconfig - boot config tool'
	@echo '  spi - spi tools'
	@echo '  tmon - thermal monitoring and tuning tool'
@@ -91,6 +92,9 @@ perf: FORCE
	$(Q)mkdir -p $(PERF_O) .
	$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir=

+sched_ext: FORCE
+	$(call descend,sched_ext)
+
selftests: FORCE
	$(call descend,testing/$@)
@@ -184,6 +188,9 @@ perf_clean:
	$(Q)mkdir -p $(PERF_O) .
	$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean

+sched_ext_clean:
+	$(call descend,sched_ext,clean)
+
selftests_clean:
	$(call descend,testing/$(@:_clean=),clean)
@@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \
	mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
	freefall_clean build_clean libbpf_clean libsubcmd_clean \
	gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \
-	intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean
+	intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \
+	sched_ext_clean

.PHONY: FORCE
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
include ../build/Build.include
include ../scripts/Makefile.arch
include ../scripts/Makefile.include
all: all_targets
ifneq ($(LLVM),)
ifneq ($(filter %/,$(LLVM)),)
LLVM_PREFIX := $(LLVM)
else ifneq ($(filter -%,$(LLVM)),)
LLVM_SUFFIX := $(LLVM)
endif
CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi
CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu
CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl
CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu
CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu
CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu
CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu
CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu
CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu
CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH))
ifeq ($(CROSS_COMPILE),)
ifeq ($(CLANG_TARGET_FLAGS),)
$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk)
else
CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS)
endif # CLANG_TARGET_FLAGS
else
CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
endif # CROSS_COMPILE
CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as
else
CC := $(CROSS_COMPILE)gcc
endif # LLVM
CURDIR := $(abspath .)
TOOLSDIR := $(abspath ..)
LIBDIR := $(TOOLSDIR)/lib
BPFDIR := $(LIBDIR)/bpf
TOOLSINCDIR := $(TOOLSDIR)/include
BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
APIDIR := $(TOOLSINCDIR)/uapi
GENDIR := $(abspath ../../include/generated)
GENHDR := $(GENDIR)/autoconf.h
ifeq ($(O),)
OUTPUT_DIR := $(CURDIR)/build
else
OUTPUT_DIR := $(O)/build
endif # O
OBJ_DIR := $(OUTPUT_DIR)/obj
INCLUDE_DIR := $(OUTPUT_DIR)/include
BPFOBJ_DIR := $(OBJ_DIR)/libbpf
SCXOBJ_DIR := $(OBJ_DIR)/sched_ext
BINDIR := $(OUTPUT_DIR)/bin
BPFOBJ := $(BPFOBJ_DIR)/libbpf.a
ifneq ($(CROSS_COMPILE),)
HOST_BUILD_DIR := $(OBJ_DIR)/host
HOST_OUTPUT_DIR := host-tools
HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include
else
HOST_BUILD_DIR := $(OBJ_DIR)
HOST_OUTPUT_DIR := $(OUTPUT_DIR)
HOST_INCLUDE_DIR := $(INCLUDE_DIR)
endif
HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool
VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
$(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
../../vmlinux \
/sys/kernel/btf/vmlinux \
/boot/vmlinux-$(shell uname -r)
VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
ifeq ($(VMLINUX_BTF),)
$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
endif
BPFTOOL ?= $(DEFAULT_BPFTOOL)
ifneq ($(wildcard $(GENHDR)),)
GENFLAGS := -DHAVE_GENHDR
endif
CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \
-I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
-I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include
# Silence some warnings when compiled with clang
ifneq ($(LLVM),)
CFLAGS += -Wno-unused-command-line-argument
endif
LDFLAGS = -lelf -lz -lpthread
IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
# Get Clang's default includes on this system, as opposed to those seen by
# '-target bpf'. This fixes "missing" files on some architectures/distros,
# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
define get_sys_includes
$(shell $(1) -v -E - </dev/null 2>&1 \
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
endef
BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \
-I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \
-I$(INCLUDE_DIR) -I$(APIDIR) \
-I../../include \
$(call get_sys_includes,$(CLANG)) \
-Wall -Wno-compare-distinct-pointer-types \
-O2 -mcpu=v3
# sort removes libbpf duplicates when not cross-building
MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \
$(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \
$(INCLUDE_DIR) $(SCXOBJ_DIR) $(BINDIR))
$(MAKE_DIRS):
$(call msg,MKDIR,,$@)
$(Q)mkdir -p $@
$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \
| $(OBJ_DIR)/libbpf
$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \
EXTRA_CFLAGS='-g -O0 -fPIC' \
DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \
$(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
$(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \
ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \
EXTRA_CFLAGS='-g -O0' \
OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \
LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \
LIBBPF_DESTDIR=$(HOST_OUTPUT_DIR)/ \
prefix= DESTDIR=$(HOST_OUTPUT_DIR)/ install-bin
$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR)
ifeq ($(VMLINUX_H),)
$(call msg,GEN,,$@)
$(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
else
$(call msg,CP,,$@)
$(Q)cp "$(VMLINUX_H)" $@
endif
$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \
| $(BPFOBJ) $(SCXOBJ_DIR)
$(call msg,CLNG-BPF,,$(notdir $@))
$(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@
$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL)
$(eval sched=$(notdir $@))
$(call msg,GEN-SKEL,,$(sched))
$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
$(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
$(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
$(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o)
$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@
$(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h)
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
$(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(BINDIR)/%: \
$(filter-out %.bpf.c,%.c) \
$(INCLUDE_DIR)/%.bpf.skel.h \
$(SCX_COMMON_DEPS)
$(eval sched=$(notdir $@))
$(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o
$(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS)
$(c-sched-targets): %: $(BINDIR)/%
install: all
$(Q)mkdir -p $(DESTDIR)/usr/local/bin/
$(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/
clean:
rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR)
rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h
rm -f $(c-sched-targets)
help:
@echo 'Building targets'
@echo '================'
@echo ''
@echo ' all - Compile all schedulers'
@echo ''
@echo 'Alternatively, you may compile individual schedulers:'
@echo ''
@printf ' %s\n' $(c-sched-targets)
@echo ''
@echo 'For any scheduler build target, you may specify an alternative'
@echo 'build output path with the O= environment variable. For example:'
@echo ''
@echo ' O=/tmp/sched_ext make all'
@echo ''
@echo 'will compile all schedulers, and emit the build artifacts to'
@echo '/tmp/sched_ext/build.'
@echo ''
@echo ''
@echo 'Installing targets'
@echo '=================='
@echo ''
	@echo '  install - Compile and install all schedulers to /usr/local/bin.'
	@echo '            You may specify the DESTDIR= environment variable'
	@echo '            to indicate a prefix for /usr/local/bin. For example:'
	@echo ''
	@echo '                DESTDIR=/tmp/sched_ext make install'
	@echo ''
	@echo '            will build the schedulers in CWD/build, and'
	@echo '            install the schedulers to /tmp/sched_ext/usr/local/bin.'
@echo ''
@echo ''
@echo 'Cleaning targets'
@echo '================'
@echo ''
@echo ' clean - Remove all generated files'
all_targets: $(c-sched-targets)
.PHONY: all all_targets $(c-sched-targets) clean help
# delete failed targets
.DELETE_ON_ERROR:
# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets
.SECONDARY:
SCHED_EXT EXAMPLE SCHEDULERS
============================
# Introduction
This directory contains a number of example sched_ext schedulers. These
schedulers are meant to provide examples of different types of schedulers
that can be built using sched_ext, and illustrate how various features of
sched_ext can be used.
Some of the examples are performant, production-ready schedulers. That is, for
the correct workload and with the correct tuning, they may be deployed in a
production environment with acceptable or possibly even improved performance.
Others are just examples that, in practice, would not provide acceptable
performance (though they could be improved to get there).
This README will describe these example schedulers, including describing the
types of workloads or scenarios they're designed to accommodate, and whether or
not they're production ready. For more details on any of these schedulers,
please see the header comment in their .bpf.c file.
# Compiling the examples
There are a few toolchain dependencies for compiling the example schedulers.
## Toolchain dependencies
1. clang >= 16.0.0
The schedulers are BPF programs, and therefore must be compiled with clang. GCC
is actively working on adding a BPF backend as well, but it is still missing
some features, such as BTF type tags, that are necessary for using kptrs.
2. pahole >= 1.25
You may need pahole in order to generate BTF from DWARF.
3. rust >= 1.70.0
Rust schedulers use features present in the Rust toolchain >= 1.70.0. You
should be able to use the stable build from rustup, but if that doesn't
work, try using the rustup nightly build.
There are other requirements as well, such as make, but these are the main /
non-trivial ones.
## Compiling the kernel
In order to run a sched_ext scheduler, you'll have to run a kernel compiled
with the patches in this repository, and with a minimum set of necessary
Kconfig options:
```
CONFIG_BPF=y
CONFIG_SCHED_CLASS_EXT=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_DEBUG_INFO_BTF=y
```
It is also recommended that you include the following Kconfig options:
```
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_BPF_JIT_DEFAULT_ON=y
CONFIG_PAHOLE_HAS_SPLIT_BTF=y
CONFIG_PAHOLE_HAS_BTF_TAG=y
```
There is a `Kconfig` file in this directory whose contents you can append to
your local `.config` file, as long as there are no conflicts with any existing
options in the file.
## Getting a vmlinux.h file
You may notice that most of the example schedulers include a "vmlinux.h" file.
This is a large, auto-generated header file that contains all of the types
defined in some vmlinux binary that was compiled with
[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig
options specified above).
The header file is created using `bpftool`, by passing it a vmlinux binary
compiled with BTF as follows:
```bash
$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h
```
`bpftool` analyzes all of the BTF encodings in the binary, and produces a
header file that can be included by BPF programs to access those types. For
example, using vmlinux.h allows a scheduler to access fields defined directly
in vmlinux as follows:
```c
#include "vmlinux.h"
// vmlinux.h is also implicitly included by scx_common.bpf.h.
#include "scx_common.bpf.h"
/*
* vmlinux.h provides definitions for struct task_struct and
* struct scx_enable_args.
*/
void BPF_STRUCT_OPS(example_enable, struct task_struct *p,
struct scx_enable_args *args)
{
bpf_printk("Task %s enabled in example scheduler", p->comm);
}
// vmlinux.h provides the definition for struct sched_ext_ops.
SEC(".struct_ops.link")
struct sched_ext_ops example_ops = {
	.enable	= (void *)example_enable,
	.name	= "example",
};
```
The scheduler build system will generate this vmlinux.h file as part of the
scheduler build pipeline. It looks for a vmlinux file in the following
dependency order:
1. If the O= environment variable is defined, at `$O/vmlinux`
2. If the KBUILD_OUTPUT= environment variable is defined, at
   `$KBUILD_OUTPUT/vmlinux`
3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're
   compiling the schedulers)
4. `/sys/kernel/btf/vmlinux`
5. `/boot/vmlinux-$(uname -r)`
In other words, if you have compiled a kernel in your local repo, its vmlinux
file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of
the kernel you're currently running on. This means that if you're running on a
kernel with sched_ext support, you may not need to compile a local kernel at
all.
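If you have already generated a suitable header yourself, the `Makefile` in
this directory also honors a `VMLINUX_H` variable and copies that file instead
of regenerating it. A minimal sketch (the paths here are illustrative):

```bash
# Generate the header once from the running kernel's BTF (illustrative path).
$ bpftool btf dump file /sys/kernel/btf/vmlinux format c > /tmp/vmlinux.h

# Reuse the pre-generated header for the scheduler build.
$ VMLINUX_H=/tmp/vmlinux.h make -C tools/sched_ext
```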
### Aside on CO-RE
One of the cooler features of BPF is that it supports
[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run
Everywhere). This feature allows you to reference fields inside of structs with
types defined internal to the kernel, and not have to recompile if you load the
BPF program on a different kernel with the field at a different offset. In our
example above, we print out a task name with `p->comm`. CO-RE would perform
relocations for that access when the program is loaded to ensure that it's
referencing the correct offset for the currently running kernel.
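As a rough sketch of the same idea expressed with libbpf's CO-RE helpers
(illustrative only; the helper function below is made up and not part of this
tree):

```c
#include "vmlinux.h"
#include <bpf/bpf_core_read.h>

/*
 * Illustrative: read p->pid through a CO-RE relocated access. The field
 * offset is recorded as a relocation and patched at load time, so the same
 * BPF object keeps working even if task_struct's layout differs on the
 * running kernel.
 */
static inline int sketch_task_pid(const struct task_struct *p)
{
	return BPF_CORE_READ(p, pid);
}
```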
## Compiling the schedulers
Once you have your toolchain setup, and a vmlinux that can be used to generate
a full vmlinux.h file, you can compile the schedulers using `make`:
```bash
$ make -j$(nproc)
```
# Example schedulers
This directory contains the following example schedulers. These schedulers are
for testing and demonstrating different aspects of sched_ext. While some may be
useful in limited scenarios, they are not intended to be practical.
For more scheduler implementations, tools and documentation, visit
https://github.com/sched-ext/scx.
## scx_simple
A simple scheduler that provides an example of a minimal sched_ext scheduler.
scx_simple can be run in either global weighted vtime mode, or FIFO mode.
Though very simple, in limited scenarios, this scheduler can perform reasonably
well on single-socket systems with a unified L3 cache.
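Purely as a sketch of the shape such a policy takes (simplified and
illustrative, not the actual scx_simple source; see its .bpf.c file for the
real implementation), the FIFO flavor boils down to an enqueue callback that
pushes every runnable task onto the shared DSQ with the default slice, using
constants from include/linux/sched/ext.h:

```c
/*
 * Illustrative sketch only: queue every task on the global DSQ with the
 * default 20ms slice and let the DSQ's built-in FIFO ordering decide what
 * runs next.
 */
void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}
```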
## scx_qmap
Another simple, yet slightly more complex scheduler that provides an example of
a basic weighted FIFO queuing policy. It also provides examples of some common
useful BPF features, such as sleepable per-task storage allocation in the
`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to
enqueue tasks. It also illustrates how core-sched support could be implemented.
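For reference, the kind of BPF queue map such a policy builds on is declared
roughly as follows (a generic sketch with made-up names and sizes, not the
scx_qmap source):

```c
/* Illustrative FIFO of pids awaiting dispatch. */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queued_pids SEC(".maps");
```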
## scx_central
A "central" scheduler where scheduling decisions are made from a single CPU.
This scheduler illustrates how scheduling decisions can be dispatched from a
single CPU, allowing other cores to run with infinite slices, without timer
ticks, and without having to incur the overhead of making scheduling decisions.
The approach demonstrated by this scheduler may be useful for any workload that
benefits from minimizing scheduling overhead and timer ticks. An example of
where this could be particularly useful is running VMs, where running with
infinite slices and no timer ticks allows the VM to avoid unnecessary expensive
vmexits.
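A key mechanism behind this is sched_ext's ability to dispatch directly into
another CPU's local DSQ with an effectively infinite slice. An illustrative
sketch (not the scx_central source) using the SCX_DSQ_LOCAL_ON and
SCX_SLICE_INF constants defined in include/linux/sched/ext.h:

```c
/*
 * Illustrative only: from the central CPU, hand @p to a target CPU's local
 * DSQ with an infinite slice so that CPU can keep running it without timer
 * ticks until it is explicitly preempted or re-dispatched.
 */
static void sketch_dispatch_to_cpu(struct task_struct *p, s32 cpu)
{
	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0);
}
```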
## scx_flatcg
A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical
weight-based cgroup CPU control by flattening the cgroup hierarchy into a single
layer by compounding the active weight share at each level. The effect of this
is a much more performant CPU controller, which does not need to descend down
cgroup trees in order to properly compute a cgroup's share.
Similar to scx_simple, in limited scenarios, this scheduler can perform
reasonably well on single-socket systems with a unified L3 cache and show
significantly lowered hierarchical scheduling overhead.
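As a worked example with made-up numbers: if a parent cgroup holds half of the
active weight among its siblings, and one of its children holds 200 out of the
400 active weight at the next level, the flattened share applied to that
child's tasks is simply 0.5 × 0.5 = 0.25 of the CPU, computed once up front
instead of walking the two-level tree on every scheduling decision.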
# Troubleshooting
There are a number of common issues that you may run into when building the
schedulers. We'll go over some of the common ones here.
## Build Failures
### Old version of clang
```
error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole
_Static_assert(SCX_DSQ_FLAG_BUILTIN,
^~~~~~~~~~~~~~~~~~~~
1 error generated.
```
This means you built the kernel or the schedulers with an older version of
clang than what's supported (i.e. older than 16.0.0). To remediate this:
1. `which clang` to make sure you're using a sufficiently new version of clang.
2. `make fullclean` in the root path of the repository.
3. Rebuild the kernel, and then your example schedulers.
The schedulers are also cleaned if you invoke `make mrproper` in the root
directory of the tree.
### Stale kernel build / incomplete vmlinux.h file
As described above, you'll need a `vmlinux.h` file that was generated from a
vmlinux built with BTF and with sched_ext support enabled. If you don't have
one, you'll see errors such as the following, which indicate that a type being
referenced in a scheduler is unknown:
```
/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info'
const struct scx_exit_info *ei)
^
```
To resolve this, follow the steps above in
[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) to ensure your schedulers
are using a vmlinux.h file that includes the requisite types.
## Misc
### llvm: [OFF]
You may see the following output when building the schedulers:
```
Auto-detecting system features:
... clang-bpf-co-re: [ on ]
... llvm: [ OFF ]
... libcap: [ on ]
... libbfd: [ on ]
```
Seeing `llvm: [ OFF ]` here is not an issue; you can safely ignore it.
/*
* Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when
* compiling BPF files although its content doesn't play any role. The file in
* turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is
* defined. When compiling a BPF source, __x86_64__ isn't set and thus
* stubs-32.h is selected. However, the file is not there if the system doesn't
* have 32bit glibc devel package installed leading to a build failure.
*
* The problem is worked around by making this file available in the include
* search paths before the system one when building BPF.
*/
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2023 Tejun Heo <tj@kernel.org>
* Copyright (c) 2023 David Vernet <dvernet@meta.com>
*/
#ifndef __SCHED_EXT_COMMON_H
#define __SCHED_EXT_COMMON_H
#ifdef __KERNEL__
#error "Should not be included by BPF programs"
#endif
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
#define SCX_BUG(__fmt, ...) \
do { \
fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \
if (errno) \
fprintf(stderr, " (%s)\n", strerror(errno)); \
else \
fprintf(stderr, "\n"); \
fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \
fprintf(stderr, "\n"); \
\
exit(EXIT_FAILURE); \
} while (0)
#define SCX_BUG_ON(__cond, __fmt, ...) \
do { \
if (__cond) \
SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \
} while (0)
/**
* RESIZE_ARRAY - Convenience macro for resizing a BPF array
* @__skel: the skeleton containing the array
* @elfsec: the data section of the BPF program in which the array exists
* @arr: the name of the array
* @n: the desired array element count
*
* For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two
* operations. It resizes the map which corresponds to the custom data
* section that contains the target array. As a side effect, the BTF info for
* the array is adjusted so that the array length is sized to cover the new
* data section size. The second operation is reassigning the skeleton pointer
* for that custom data section so that it points to the newly memory mapped
* region.
*/
#define RESIZE_ARRAY(__skel, elfsec, arr, n) \
do { \
size_t __sz; \
bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \
sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \
(__skel)->elfsec##_##arr = \
bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \
} while (0)
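/* Example usage (see scx_central.c below): RESIZE_ARRAY(skel, data, cpu_gimme_task, nr_cpu_ids); */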
#include "user_exit_info.h"
#include "compat.h"
#endif /* __SCHED_EXT_COMMON_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 David Vernet <dvernet@meta.com>
*/
#ifndef __SCX_COMPAT_BPF_H
#define __SCX_COMPAT_BPF_H
#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \
({ \
__type __ret = 0; \
if (bpf_core_enum_value_exists(__type, __ent)) \
__ret = __ent; \
__ret; \
})
/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
*/
#define SCX_OPS_DEFINE(__name, ...) \
SEC(".struct_ops.link") \
struct sched_ext_ops __name = { \
__VA_ARGS__, \
};
#endif /* __SCX_COMPAT_BPF_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 David Vernet <dvernet@meta.com>
*/
#ifndef __SCX_COMPAT_H
#define __SCX_COMPAT_H
#include <bpf/btf.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
struct btf *__COMPAT_vmlinux_btf __attribute__((weak));
static inline void __COMPAT_load_vmlinux_btf(void)
{
if (!__COMPAT_vmlinux_btf) {
__COMPAT_vmlinux_btf = btf__load_vmlinux_btf();
SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()");
}
}
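/*
 * Look up enumerator @name in the enum (or enum64) type @type in vmlinux BTF.
 * If found, store its value in @v and return true.
 */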
static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v)
{
const struct btf_type *t;
const char *n;
s32 tid;
int i;
__COMPAT_load_vmlinux_btf();
tid = btf__find_by_name(__COMPAT_vmlinux_btf, type);
if (tid < 0)
return false;
t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
if (btf_is_enum(t)) {
struct btf_enum *e = btf_enum(t);
for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
SCX_BUG_ON(!n, "btf__name_by_offset()");
if (!strcmp(n, name)) {
*v = e[i].val;
return true;
}
}
} else if (btf_is_enum64(t)) {
struct btf_enum64 *e = btf_enum64(t);
for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
SCX_BUG_ON(!n, "btf__name_by_offset()");
if (!strcmp(n, name)) {
*v = btf_enum64_value(&e[i]);
return true;
}
}
}
return false;
}
#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \
({ \
u64 __val = 0; \
__COMPAT_read_enum(__type, __ent, &__val); \
__val; \
})
static inline bool __COMPAT_has_ksym(const char *ksym)
{
__COMPAT_load_vmlinux_btf();
return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
}
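/* Return true if struct @type in vmlinux BTF has a member named @field. */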
static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
{
const struct btf_type *t;
const struct btf_member *m;
const char *n;
s32 tid;
int i;
__COMPAT_load_vmlinux_btf();
tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT);
if (tid < 0)
return false;
t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
m = btf_members(t);
for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off);
SCX_BUG_ON(!n, "btf__name_by_offset()");
if (!strcmp(n, field))
return true;
}
return false;
}
#define SCX_OPS_SWITCH_PARTIAL \
__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
static inline long scx_hotplug_seq(void)
{
int fd;
char buf[32];
ssize_t len;
long val;
fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
if (fd < 0)
return -ENOENT;
len = read(fd, buf, sizeof(buf) - 1);
SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
buf[len] = 0;
close(fd);
val = strtoul(buf, NULL, 10);
SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);
return val;
}
/*
* struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
* is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
* and attach it, backward compatibility is automatically maintained where
* reasonable.
*
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
* the current minimum required kernel version.
*/
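/*
 * Typical user-space sequence (as used by scx_central.c below):
 *
 *	skel = SCX_OPS_OPEN(central_ops, scx_central);
 *	... set rodata, RESIZE_ARRAY(), etc. ...
 *	SCX_OPS_LOAD(skel, central_ops, scx_central, uei);
 *	link = SCX_OPS_ATTACH(skel, central_ops, scx_central);
 */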
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
\
SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \
"sched_ext_ops.dump() missing, kernel too old?"); \
\
__skel = __scx_name##__open(); \
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
__skel; \
})
#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
})
/*
* New versions of bpftool now emit additional link placeholders for BPF maps,
 * and set up the BPF skeleton in such a way that libbpf will auto-attach BPF
 * maps automatically, assuming libbpf is recent enough (v1.5+). Old libbpf will do
* nothing with those links and won't attempt to auto-attach maps.
*
* To maintain compatibility with older libbpf while avoiding trying to attach
* twice, disable the autoattach feature on newer libbpf.
*/
#if LIBBPF_MAJOR_VERSION > 1 || \
(LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5)
#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \
bpf_map__set_autoattach((__skel)->maps.__ops_name, false)
#else
#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0)
#endif
#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \
struct bpf_link *__link; \
__SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \
SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \
__link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \
SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \
__link; \
})
#endif /* __SCX_COMPAT_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Define struct user_exit_info which is shared between BPF and userspace parts
* to communicate exit status and other information.
*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#ifndef __USER_EXIT_INFO_H
#define __USER_EXIT_INFO_H
enum uei_sizes {
UEI_REASON_LEN = 128,
UEI_MSG_LEN = 1024,
UEI_DUMP_DFL_LEN = 32768,
};
struct user_exit_info {
int kind;
s64 exit_code;
char reason[UEI_REASON_LEN];
char msg[UEI_MSG_LEN];
};
#ifdef __bpf__
#include "vmlinux.h"
#include <bpf/bpf_core_read.h>
#define UEI_DEFINE(__name) \
char RESIZABLE_ARRAY(data, __name##_dump); \
const volatile u32 __name##_dump_len; \
struct user_exit_info __name SEC(".data")
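/*
 * Copy the exit reason, message and debug dump from the kernel's
 * scx_exit_info into the shared user_exit_info. ->kind is written last via
 * __sync_val_compare_and_swap() so userspace only observes a complete record.
 */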
#define UEI_RECORD(__uei_name, __ei) ({ \
bpf_probe_read_kernel_str(__uei_name.reason, \
sizeof(__uei_name.reason), (__ei)->reason); \
bpf_probe_read_kernel_str(__uei_name.msg, \
sizeof(__uei_name.msg), (__ei)->msg); \
bpf_probe_read_kernel_str(__uei_name##_dump, \
__uei_name##_dump_len, (__ei)->dump); \
if (bpf_core_field_exists((__ei)->exit_code)) \
__uei_name.exit_code = (__ei)->exit_code; \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
(__ei)->kind); \
})
#else /* !__bpf__ */
#include <stdio.h>
#include <stdbool.h>
/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
(__skel)->rodata->__uei_name##_dump_len = __len; \
RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \
})
#define UEI_EXITED(__skel, __uei_name) ({ \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \
})
#define UEI_REPORT(__skel, __uei_name) ({ \
struct user_exit_info *__uei = &(__skel)->data->__uei_name; \
char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
if (__uei_dump[0] != '\0') { \
fputs("\nDEBUG DUMP\n", stderr); \
fputs("================================================================================\n\n", stderr); \
fputs(__uei_dump, stderr); \
fputs("\n================================================================================\n\n", stderr); \
} \
fprintf(stderr, "EXIT: %s", __uei->reason); \
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
fputs("\n", stderr); \
__uei->exit_code; \
})
/*
* We can't import vmlinux.h while compiling user C code. Let's duplicate
* scx_exit_code definition.
*/
enum scx_exit_code {
/* Reasons */
SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
/* Actions */
SCX_ECODE_ACT_RESTART = 1LLU << 48,
};
enum uei_ecode_mask {
UEI_ECODE_USER_MASK = ((1LLU << 32) - 1),
UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32,
UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48,
};
/*
 * These macros interpret the ecode returned from UEI_REPORT().
*/
#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK)
#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK)
#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK)
#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
#endif /* __bpf__ */
#endif /* __USER_EXIT_INFO_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include "scx_central.bpf.skel.h"
const char help_fmt[] =
"A central FIFO sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-c CPU]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -c CPU Override the central CPU (default: 0)\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int dummy)
{
exit_req = 1;
}
int main(int argc, char **argv)
{
struct scx_central *skel;
struct bpf_link *link;
__u64 seq = 0, ecode;
__s32 opt;
cpu_set_t *cpuset;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
skel = SCX_OPS_OPEN(central_ops, scx_central);
skel->rodata->central_cpu = 0;
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'c':
skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
break;
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
/* Resize arrays so their element count is equal to cpu count. */
RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids);
RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids);
SCX_OPS_LOAD(skel, central_ops, scx_central, uei);
/*
* Affinitize the loading thread to the central CPU, as:
* - That's where the BPF timer is first invoked in the BPF program.
* - We probably don't want this user space component to take up a core
* from a task that would benefit from avoiding preemption on one of
* the tickless cores.
*
* Until BPF supports pinning the timer, it's not guaranteed that it
* will always be invoked on the central CPU. In practice, this
* suffices the majority of the time.
*/
cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
CPU_ZERO(cpuset);
CPU_SET(skel->rodata->central_cpu, cpuset);
	SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset),
"Failed to affinitize to central CPU %d (max %d)",
skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
CPU_FREE(cpuset);
link = SCX_OPS_ATTACH(skel, central_ops, scx_central);
if (!skel->data->timer_pinned)
printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n");
while (!exit_req && !UEI_EXITED(skel, uei)) {
printf("[SEQ %llu]\n", seq++);
printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n",
skel->bss->nr_total,
skel->bss->nr_locals,
skel->bss->nr_queued,
skel->bss->nr_lost_pids);
printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n",
skel->bss->nr_timers,
skel->bss->nr_dispatches,
skel->bss->nr_mismatches,
skel->bss->nr_retries);
printf("overflow:%10" PRIu64 "\n",
skel->bss->nr_overflows);
fflush(stdout);
sleep(1);
}
bpf_link__destroy(link);
ecode = UEI_REPORT(skel, uei);
scx_central__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0;
}
#ifndef __SCX_EXAMPLE_FLATCG_H
#define __SCX_EXAMPLE_FLATCG_H
enum {
FCG_HWEIGHT_ONE = 1LLU << 16,
};
enum fcg_stat_idx {
FCG_STAT_ACT,
FCG_STAT_DEACT,
FCG_STAT_LOCAL,
FCG_STAT_GLOBAL,
FCG_STAT_HWT_UPDATES,
FCG_STAT_HWT_CACHE,
FCG_STAT_HWT_SKIP,
FCG_STAT_HWT_RACE,
FCG_STAT_ENQ_SKIP,
FCG_STAT_ENQ_RACE,
FCG_STAT_CNS_KEEP,
FCG_STAT_CNS_EXPIRE,
FCG_STAT_CNS_EMPTY,
FCG_STAT_CNS_GONE,
FCG_STAT_PNC_NO_CGRP,
FCG_STAT_PNC_NEXT,
FCG_STAT_PNC_EMPTY,
FCG_STAT_PNC_GONE,
FCG_STAT_PNC_RACE,
FCG_STAT_PNC_FAIL,
FCG_STAT_BAD_REMOVAL,
FCG_NR_STATS,
};
struct fcg_cgrp_ctx {
u32 nr_active;
u32 nr_runnable;
u32 queued;
u32 weight;
u32 hweight;
u64 child_weight_sum;
u64 hweight_gen;
s64 cvtime_delta;
u64 tvtime_now;
};
#endif /* __SCX_EXAMPLE_FLATCG_H */
#!/usr/bin/env drgn
#
# Copyright (C) 2024 Tejun Heo <tj@kernel.org>
# Copyright (C) 2024 Meta Platforms, Inc. and affiliates.
desc = """
This is a drgn script to show the current sched_ext state.
For more info on drgn, visit https://github.com/osandov/drgn.
"""
import drgn
import sys
def err(s):
print(s, file=sys.stderr, flush=True)
sys.exit(1)
def read_int(name):
return int(prog[name].value_())
def read_atomic(name):
return prog[name].counter.value_()
def read_static_key(name):
return prog[name].key.enabled.counter.value_()
def ops_state_str(state):
return prog['scx_ops_enable_state_str'][state].string_().decode()
ops = prog['scx_ops']
enable_state = read_atomic("scx_ops_enable_state_var")
print(f'ops : {ops.name.string_().decode()}')
print(f'enabled : {read_static_key("__scx_ops_enabled")}')
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}')
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
*
!*.c
!*.h
!Makefile
!.gitignore
!config