Commit 88bf1858 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'sleepable uprobe support'

Delyan Kratunov says:

====================

This series implements support for sleepable uprobe programs.
Key work is in patches 2 and 3, the rest is plumbing and tests.

The main observation is that the only obstacle in the way of sleepable uprobe
programs is not the uprobe infrastructure, which already runs in a user-like
context, but the rcu usage around bpf_prog_array.

Details are in patch 2 but the tl;dr is that we chain trace_tasks and normal rcu
grace periods when releasing to array to accommodate users of either rcu type.
This introduces latency for non-sleepable users (kprobe, tp) but that's deemed
acceptable, given recent benchmarks by Andrii [1]. We're a couple of orders of
magnitude under the rate of bpf_prog_array churn that would raise flags (~1MM/s per Paul).

  [1]: https://lore.kernel.org/bpf/CAEf4BzbpjN6ca7D9KOTiFPOoBYkciYvTz0UJNp5c-_3ptm=Mrg@mail.gmail.com/

v3 -> v4:
 * Fix kdoc and inline issues
 * Rebase

v2 -> v3:
 * Inline uprobe_call_bpf into trace_uprobe.c, it's just a bpf_prog_run_array_sleepable call now.
 * Do not disable preemption for uprobe non-sleepable programs.
 * Add acks.

v1 -> v2:
 * Fix lockdep annotations in bpf_prog_run_array_sleepable
 * Chain rcu grace periods only for perf_event-attached programs. This limits
   the additional latency on the free path to use cases where we know it won't
   be a problem.
 * Add tests calling helpers only available in sleepable programs.
 * Remove kprobe.s support from libbpf.
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 3e6fe5ce cb3f4a4a
......@@ -5,6 +5,7 @@
#define _LINUX_BPF_H 1
#include <uapi/linux/bpf.h>
#include <uapi/linux/filter.h>
#include <linux/workqueue.h>
#include <linux/file.h>
......@@ -22,8 +23,10 @@
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/stddef.h>
#include <linux/bpfptr.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
......@@ -1084,6 +1087,40 @@ struct bpf_prog_aux {
};
};
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinding_requested:1, /* needs constant blinding */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_func_ip:1, /* Do we call get_func_ip() */
tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE];
struct bpf_prog_stats __percpu *stats;
int __percpu *active;
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
/* Instructions for interpreter */
union {
DECLARE_FLEX_ARRAY(struct sock_filter, insns);
DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
};
};
struct bpf_array_aux {
/* Programs with direct jumps into programs part of this array. */
struct list_head poke_progs;
......@@ -1336,6 +1373,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array *progs);
/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
int bpf_prog_array_length(struct bpf_prog_array *progs);
bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
......@@ -1427,6 +1466,55 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
return ret;
}
/* Notes on RCU design for bpf_prog_arrays containing sleepable programs:
*
* We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
* overall. As a result, we must use the bpf_prog_array_free_sleepable
* in order to use the tasks_trace rcu grace period.
*
* When a non-sleepable program is inside the array, we take the rcu read
* section and disable preemption for that program alone, so it can access
* rcu-protected dynamically sized maps.
*/
static __always_inline u32
bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
const void *ctx, bpf_prog_run_fn run_prog)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
u32 ret = 1;
might_fault();
rcu_read_lock_trace();
migrate_disable();
array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
if (unlikely(!array))
goto out;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
item = &array->items[0];
while ((prog = READ_ONCE(item->prog))) {
if (!prog->aux->sleepable)
rcu_read_lock();
run_ctx.bpf_cookie = item->bpf_cookie;
ret &= run_prog(prog, ctx);
item++;
if (!prog->aux->sleepable)
rcu_read_unlock();
}
bpf_reset_run_ctx(old_run_ctx);
out:
migrate_enable();
rcu_read_unlock_trace();
return ret;
}
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
extern struct mutex bpf_stats_enabled_mutex;
......
......@@ -559,40 +559,6 @@ struct bpf_prog_stats {
struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinding_requested:1, /* needs constant blinding */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_func_ip:1, /* Do we call get_func_ip() */
tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE];
struct bpf_prog_stats __percpu *stats;
int __percpu *active;
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
/* Instructions for interpreter */
union {
DECLARE_FLEX_ARRAY(struct sock_filter, insns);
DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
};
};
struct sk_filter {
refcount_t refcnt;
struct rcu_head rcu;
......
......@@ -2279,6 +2279,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
kfree_rcu(progs, rcu);
}
static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
struct bpf_prog_array *progs;
progs = container_of(rcu, struct bpf_prog_array, rcu);
kfree_rcu(progs, rcu);
}
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
if (!progs || progs == &bpf_empty_prog_array.hdr)
return;
call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
}
int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
......
......@@ -14829,8 +14829,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
prog->type != BPF_PROG_TYPE_LSM) {
verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) {
verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n");
return -EINVAL;
}
......
......@@ -10069,26 +10069,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
u64 bpf_cookie)
{
bool is_kprobe, is_tracepoint, is_syscall_tp;
bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
return -EINVAL;
if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
/* only uprobe programs are allowed to be sleepable */
return -EINVAL;
/* Kprobe override only works for kprobes, not uprobes. */
if (prog->kprobe_override &&
!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
if (is_tracepoint || is_syscall_tp) {
......
......@@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
event->prog = prog;
event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array);
bpf_prog_array_free_sleepable(old_array);
unlock:
mutex_unlock(&bpf_event_mutex);
......@@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array);
bpf_prog_array_free_sleepable(old_array);
}
bpf_prog_put(event->prog);
......
......@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/string.h>
#include <linux/rculist.h>
#include <linux/filter.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
......@@ -1346,9 +1347,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (bpf_prog_array_valid(call)) {
u32 ret;
preempt_disable();
ret = trace_call_bpf(call, regs);
preempt_enable();
ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run);
if (!ret)
return;
}
......
......@@ -9177,8 +9177,10 @@ static const struct bpf_sec_def section_defs[] = {
SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX),
SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe),
SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe),
SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe),
SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe),
SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt),
......@@ -11571,7 +11573,8 @@ static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf
break;
case 3:
case 4:
opts.retprobe = strcmp(probe_type, "uretprobe") == 0;
opts.retprobe = strcmp(probe_type, "uretprobe") == 0 ||
strcmp(probe_type, "uretprobe.s") == 0;
if (opts.retprobe && offset != 0) {
pr_warn("prog '%s': uretprobes do not support offset specification\n",
prog->name);
......
......@@ -17,6 +17,14 @@ static void trigger_func2(void)
asm volatile ("");
}
/* attach point for byname sleepable uprobe */
static void trigger_func3(void)
{
asm volatile ("");
}
static char test_data[] = "test_data";
void test_attach_probe(void)
{
DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts);
......@@ -49,9 +57,17 @@ void test_attach_probe(void)
if (!ASSERT_GE(ref_ctr_offset, 0, "ref_ctr_offset"))
return;
skel = test_attach_probe__open_and_load();
skel = test_attach_probe__open();
if (!ASSERT_OK_PTR(skel, "skel_open"))
return;
/* sleepable kprobe test case needs flags set before loading */
if (!ASSERT_OK(bpf_program__set_flags(skel->progs.handle_kprobe_sleepable,
BPF_F_SLEEPABLE), "kprobe_sleepable_flags"))
goto cleanup;
if (!ASSERT_OK(test_attach_probe__load(skel), "skel_load"))
goto cleanup;
if (!ASSERT_OK_PTR(skel->bss, "check_bss"))
goto cleanup;
......@@ -151,6 +167,30 @@ void test_attach_probe(void)
if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname2, "attach_uretprobe_byname2"))
goto cleanup;
/* sleepable kprobes should not attach successfully */
skel->links.handle_kprobe_sleepable = bpf_program__attach(skel->progs.handle_kprobe_sleepable);
if (!ASSERT_ERR_PTR(skel->links.handle_kprobe_sleepable, "attach_kprobe_sleepable"))
goto cleanup;
/* test sleepable uprobe and uretprobe variants */
skel->links.handle_uprobe_byname3_sleepable = bpf_program__attach(skel->progs.handle_uprobe_byname3_sleepable);
if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname3_sleepable, "attach_uprobe_byname3_sleepable"))
goto cleanup;
skel->links.handle_uprobe_byname3 = bpf_program__attach(skel->progs.handle_uprobe_byname3);
if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname3, "attach_uprobe_byname3"))
goto cleanup;
skel->links.handle_uretprobe_byname3_sleepable = bpf_program__attach(skel->progs.handle_uretprobe_byname3_sleepable);
if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname3_sleepable, "attach_uretprobe_byname3_sleepable"))
goto cleanup;
skel->links.handle_uretprobe_byname3 = bpf_program__attach(skel->progs.handle_uretprobe_byname3);
if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname3, "attach_uretprobe_byname3"))
goto cleanup;
skel->bss->user_ptr = test_data;
/* trigger & validate kprobe && kretprobe */
usleep(1);
......@@ -164,6 +204,9 @@ void test_attach_probe(void)
/* trigger & validate uprobe attached by name */
trigger_func2();
/* trigger & validate sleepable uprobe attached by name */
trigger_func3();
ASSERT_EQ(skel->bss->kprobe_res, 1, "check_kprobe_res");
ASSERT_EQ(skel->bss->kprobe2_res, 11, "check_kprobe_auto_res");
ASSERT_EQ(skel->bss->kretprobe_res, 2, "check_kretprobe_res");
......@@ -174,6 +217,10 @@ void test_attach_probe(void)
ASSERT_EQ(skel->bss->uretprobe_byname_res, 6, "check_uretprobe_byname_res");
ASSERT_EQ(skel->bss->uprobe_byname2_res, 7, "check_uprobe_byname2_res");
ASSERT_EQ(skel->bss->uretprobe_byname2_res, 8, "check_uretprobe_byname2_res");
ASSERT_EQ(skel->bss->uprobe_byname3_sleepable_res, 9, "check_uprobe_byname3_sleepable_res");
ASSERT_EQ(skel->bss->uprobe_byname3_res, 10, "check_uprobe_byname3_res");
ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 11, "check_uretprobe_byname3_sleepable_res");
ASSERT_EQ(skel->bss->uretprobe_byname3_res, 12, "check_uretprobe_byname3_res");
cleanup:
test_attach_probe__destroy(skel);
......
......@@ -5,6 +5,7 @@
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <stdbool.h>
#include "bpf_misc.h"
int kprobe_res = 0;
......@@ -17,6 +18,11 @@ int uprobe_byname_res = 0;
int uretprobe_byname_res = 0;
int uprobe_byname2_res = 0;
int uretprobe_byname2_res = 0;
int uprobe_byname3_sleepable_res = 0;
int uprobe_byname3_res = 0;
int uretprobe_byname3_sleepable_res = 0;
int uretprobe_byname3_res = 0;
void *user_ptr = 0;
SEC("kprobe")
int handle_kprobe(struct pt_regs *ctx)
......@@ -32,6 +38,17 @@ int BPF_KPROBE(handle_kprobe_auto)
return 0;
}
/**
* This program will be manually made sleepable on the userspace side
* and should thus be unattachable.
*/
SEC("kprobe/" SYS_PREFIX "sys_nanosleep")
int handle_kprobe_sleepable(struct pt_regs *ctx)
{
kprobe_res = 2;
return 0;
}
SEC("kretprobe")
int handle_kretprobe(struct pt_regs *ctx)
{
......@@ -93,4 +110,47 @@ int handle_uretprobe_byname2(struct pt_regs *ctx)
return 0;
}
static __always_inline bool verify_sleepable_user_copy(void)
{
char data[9];
bpf_copy_from_user(data, sizeof(data), user_ptr);
return bpf_strncmp(data, sizeof(data), "test_data") == 0;
}
SEC("uprobe.s//proc/self/exe:trigger_func3")
int handle_uprobe_byname3_sleepable(struct pt_regs *ctx)
{
if (verify_sleepable_user_copy())
uprobe_byname3_sleepable_res = 9;
return 0;
}
/**
* same target as the uprobe.s above to force sleepable and non-sleepable
* programs in the same bpf_prog_array
*/
SEC("uprobe//proc/self/exe:trigger_func3")
int handle_uprobe_byname3(struct pt_regs *ctx)
{
uprobe_byname3_res = 10;
return 0;
}
SEC("uretprobe.s//proc/self/exe:trigger_func3")
int handle_uretprobe_byname3_sleepable(struct pt_regs *ctx)
{
if (verify_sleepable_user_copy())
uretprobe_byname3_sleepable_res = 11;
return 0;
}
SEC("uretprobe//proc/self/exe:trigger_func3")
int handle_uretprobe_byname3(struct pt_regs *ctx)
{
uretprobe_byname3_res = 12;
return 0;
}
char _license[] SEC("license") = "GPL";
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment