Commit f8711655 authored by David S. Miller's avatar David S. Miller

Merge branch 'bpf-tracepoints'

Alexei Starovoitov says:

====================
allow bpf attach to tracepoints

Hi Steven, Peter,

v1->v2: addressed Peter's comments:
- fixed wording in patch 1, added ack
- refactored 2nd patch into 3:
2/10 remove unused __perf_addr macro which frees up
an argument in perf_trace_buf_submit
3/10 split perf_trace_buf_prepare into alloc and update parts, so that bpf
programs don't have to pay performance penalty for update of struct trace_entry
which is not going to be accessed by bpf
4/10 actual addition of bpf filter to perf tracepoint handler is now trivial
and bpf prog can be used as proper filter of tracepoints

v1 cover:
last time we discussed bpf+tracepoints it was a year ago [1] and the reason
we didn't proceed with that approach was that bpf would make arguments
arg1, arg2 to trace_xx(arg1, arg2) call to be exposed to bpf program
and that was considered unnecessary extension of abi. Back then I wanted
to avoid the cost of buffer alloc and field assign part in all
of the tracepoints, but looks like when optimized the cost is acceptable.
So this new apporach doesn't expose any new abi to bpf program.
The program is looking at tracepoint fields after they were copied
by perf_trace_xx() and described in /sys/kernel/debug/tracing/events/xxx/format
We made a tool [2] that takes arguments from /sys/.../format and works as:
$ tplist.py -v random:urandom_read
    int got_bits;
    int pool_left;
    int input_left;
Then these fields can be copy-pasted into bpf program like:
struct urandom_read {
    __u64 hidden_pad;
    int got_bits;
    int pool_left;
    int input_left;
};
and the program can use it:
SEC("tracepoint/random/urandom_read")
int bpf_prog(struct urandom_read *ctx)
{
    return ctx->pool_left > 0 ? 1 : 0;
}
This way the program can access tracepoint fields faster than
equivalent bpf+kprobe program, which is the main goal of these patches.

Patch 1-4 are simple changes in perf core side, please review.
I'd like to take the whole set via net-next tree, since the rest of
the patches might conflict with other bpf work going on in net-next
and we want to avoid cross-tree merge conflicts.
Alternatively we can put patches 1-4 into both tip and net-next.

Patch 9 is an example of access to tracepoint fields from bpf prog.
Patch 10 is a micro benchmark for bpf+kprobe vs bpf+tracepoint.

Note that for actual tracing tools the user doesn't need to
run tplist.py and copy-paste fields manually. The tools do it
automatically. Like argdist tool [3] can be used as:
$ argdist -H 't:block:block_rq_complete():u32:nr_sector'
where 'nr_sector' is name of tracepoint field taken from
/sys/kernel/debug/tracing/events/block/block_rq_complete/format
and appropriate bpf program is generated on the fly.

[1] http://thread.gmane.org/gmane.linux.kernel.api/8127/focus=8165
[2] https://github.com/iovisor/bcc/blob/master/tools/tplist.py
[3] https://github.com/iovisor/bcc/blob/master/tools/argdist.py
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents b33b0a1b e3edfdec
......@@ -131,6 +131,7 @@ struct bpf_prog_type_list {
struct bpf_prog_aux {
atomic_t refcnt;
u32 used_map_cnt;
u32 max_ctx_offset;
const struct bpf_verifier_ops *ops;
struct bpf_map **used_maps;
struct bpf_prog *prog;
......@@ -160,6 +161,7 @@ struct bpf_array {
#define MAX_TAIL_CALL_CNT 32
u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
void bpf_fd_array_map_clear(struct bpf_map *map);
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
......
......@@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
*/
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(*regs));
perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}
......@@ -1018,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void)
}
extern void perf_event_init(void);
extern void perf_tp_event(u64 addr, u64 count, void *record,
extern void perf_tp_event(u16 event_type, u64 count, void *record,
int entry_size, struct pt_regs *regs,
struct hlist_head *head, int rctx,
struct task_struct *task);
......
......@@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
int is_signed, int filter_type);
extern int trace_add_event_call(struct trace_event_call *call);
extern int trace_remove_event_call(struct trace_event_call *call);
extern int trace_event_get_offsets(struct trace_event_call *call);
#define is_signed_type(type) (((type)(-1)) < (type)1)
......@@ -605,15 +606,15 @@ extern void perf_trace_del(struct perf_event *event, int flags);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
extern void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs **regs, int *rctxp);
void perf_trace_buf_update(void *record, u16 type);
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
static inline void
perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
u64 count, struct pt_regs *regs, void *head,
struct task_struct *task)
{
perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
}
#endif
......
......@@ -20,9 +20,6 @@
#undef __get_bitmask
#define __get_bitmask(field) (char *)__get_dynamic_array(field)
#undef __perf_addr
#define __perf_addr(a) (__addr = (a))
#undef __perf_count
#define __perf_count(c) (__count = (c))
......@@ -37,8 +34,9 @@ perf_trace_##call(void *__data, proto) \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
struct trace_event_raw_##call *entry; \
struct bpf_prog *prog = event_call->prog; \
struct pt_regs *__regs; \
u64 __addr = 0, __count = 1; \
u64 __count = 1; \
struct task_struct *__task = NULL; \
struct hlist_head *head; \
int __entry_size; \
......@@ -48,7 +46,7 @@ perf_trace_##call(void *__data, proto) \
__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
\
head = this_cpu_ptr(event_call->perf_events); \
if (__builtin_constant_p(!__task) && !__task && \
if (!prog && __builtin_constant_p(!__task) && !__task && \
hlist_empty(head)) \
return; \
\
......@@ -56,8 +54,7 @@ perf_trace_##call(void *__data, proto) \
sizeof(u64)); \
__entry_size -= sizeof(u32); \
\
entry = perf_trace_buf_prepare(__entry_size, \
event_call->event.type, &__regs, &rctx); \
entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
if (!entry) \
return; \
\
......@@ -67,8 +64,16 @@ perf_trace_##call(void *__data, proto) \
\
{ assign; } \
\
perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
__count, __regs, head, __task); \
if (prog) { \
*(struct pt_regs **)entry = __regs; \
if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
perf_swevent_put_recursion_context(rctx); \
return; \
} \
} \
perf_trace_buf_submit(entry, __entry_size, rctx, \
event_call->event.type, __count, __regs, \
head, __task); \
}
/*
......
......@@ -652,9 +652,6 @@ static inline notrace int trace_event_get_offsets_##call( \
#undef TP_fast_assign
#define TP_fast_assign(args...) args
#undef __perf_addr
#define __perf_addr(a) (a)
#undef __perf_count
#define __perf_count(c) (c)
......
......@@ -92,6 +92,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_KPROBE,
BPF_PROG_TYPE_SCHED_CLS,
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
};
#define BPF_PSEUDO_MAP_FD 1
......
......@@ -116,7 +116,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
return ERR_PTR(err);
}
static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
{
struct pt_regs *regs = (struct pt_regs *) (long) r1;
struct bpf_map *map = (struct bpf_map *) (long) r2;
......
......@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
enum bpf_access_type t)
{
if (env->prog->aux->ops->is_valid_access &&
env->prog->aux->ops->is_valid_access(off, size, t))
env->prog->aux->ops->is_valid_access(off, size, t)) {
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
return 0;
}
verbose("invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
......
......@@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void)
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
inline void perf_swevent_put_recursion_context(int rctx)
void perf_swevent_put_recursion_context(int rctx)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
put_recursion_context(swhash->recursion, rctx);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
......@@ -6987,7 +6988,7 @@ static int perf_tp_event_match(struct perf_event *event,
return 1;
}
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
{
......@@ -6999,9 +7000,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
.data = record,
};
perf_sample_data_init(&data, addr, 0);
perf_sample_data_init(&data, 0, 0);
data.raw = &raw;
perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
......@@ -7104,6 +7107,7 @@ static void perf_event_free_filter(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
......@@ -7112,20 +7116,31 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (event->tp_event->prog)
return -EEXIST;
if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
/* bpf programs can only be attached to u/kprobes */
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
if (!is_kprobe && !is_tracepoint)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
prog = bpf_prog_get(prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
if (prog->type != BPF_PROG_TYPE_KPROBE) {
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
if (is_tracepoint) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
bpf_prog_put(prog);
return -EACCES;
}
}
event->tp_event->prog = prog;
return 0;
......
......@@ -268,7 +268,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg5_type = ARG_CONST_STACK_SIZE,
};
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
......@@ -295,12 +295,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
default:
return NULL;
}
}
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
default:
return NULL;
return tracing_func_proto(func_id);
}
}
......@@ -332,9 +340,82 @@ static struct bpf_prog_type_list kprobe_tl = {
.type = BPF_PROG_TYPE_KPROBE,
};
static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
{
/*
* r1 points to perf tracepoint buffer where first 8 bytes are hidden
* from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
* from there and call the same bpf_perf_event_output() helper
*/
u64 ctx = *(long *)r1;
return bpf_perf_event_output(ctx, r2, index, r4, size);
}
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.func = bpf_perf_event_output_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_STACK,
.arg5_type = ARG_CONST_STACK_SIZE,
};
static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
u64 ctx = *(long *)r1;
return bpf_get_stackid(ctx, r2, r3, r4, r5);
}
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.func = bpf_get_stackid_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
};
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
default:
return tracing_func_proto(func_id);
}
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
return true;
}
static const struct bpf_verifier_ops tracepoint_prog_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};
static struct bpf_prog_type_list tracepoint_tl = {
.ops = &tracepoint_prog_ops,
.type = BPF_PROG_TYPE_TRACEPOINT,
};
static int __init register_kprobe_prog_ops(void)
{
bpf_register_prog_type(&kprobe_tl);
bpf_register_prog_type(&tracepoint_tl);
return 0;
}
late_initcall(register_kprobe_prog_ops);
......@@ -260,13 +260,10 @@ void perf_trace_del(struct perf_event *p_event, int flags)
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs **regs, int *rctxp)
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
char *raw_data;
int pc;
int rctx;
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
......@@ -274,28 +271,32 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
"perf buffer not large enough"))
return NULL;
pc = preempt_count();
*rctxp = perf_swevent_get_recursion_context();
if (*rctxp < 0)
*rctxp = rctx = perf_swevent_get_recursion_context();
if (rctx < 0)
return NULL;
if (regs)
*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
*regs = this_cpu_ptr(&__perf_regs[rctx]);
raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
/* zero the dead bytes from align to not leak stack to user */
memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);
void perf_trace_buf_update(void *record, u16 type)
{
struct trace_entry *entry = record;
int pc = preempt_count();
unsigned long flags;
entry = (struct trace_entry *)raw_data;
local_save_flags(flags);
tracing_generic_entry_update(entry, flags, pc);
entry->type = type;
return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
NOKPROBE_SYMBOL(perf_trace_buf_prepare);
NOKPROBE_SYMBOL(perf_trace_buf_update);
#ifdef CONFIG_FUNCTION_TRACER
static void
......@@ -316,15 +317,16 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
memset(&regs, 0, sizeof(regs));
perf_fetch_caller_regs(&regs);
entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
if (!entry)
return;
entry->ip = ip;
entry->parent_ip = parent_ip;
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
1, &regs, head, NULL);
#undef ENTRY_SIZE
......
......@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
}
}
/*
* run-time version of trace_event_get_offsets_<call>() that returns the last
* accessible offset of trace fields excluding __dynamic_array bytes
*/
int trace_event_get_offsets(struct trace_event_call *call)
{
struct ftrace_event_field *tail;
struct list_head *head;
head = trace_get_fields(call);
/*
* head->next points to the last field with the largest offset,
* since it was added last by trace_define_field()
*/
tail = list_first_entry(head, struct ftrace_event_field, link);
return tail->offset + tail->size;
}
int trace_event_raw_init(struct trace_event_call *call)
{
int id;
......
......@@ -1149,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
entry = perf_trace_buf_alloc(size, NULL, &rctx);
if (!entry)
return;
entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
......@@ -1184,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
entry = perf_trace_buf_alloc(size, NULL, &rctx);
if (!entry)
return;
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
......
......@@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
sys_data->enter_event->event.type, NULL, &rctx);
rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
......@@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
sys_data->exit_event->event.type, NULL, &rctx);
rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
......
......@@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (hlist_empty(head))
goto out;
entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
entry = perf_trace_buf_alloc(size, NULL, &rctx);
if (!entry)
goto out;
......@@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
memset(data + len, 0, size - esize - len);
}
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
out:
preempt_enable();
}
......
......@@ -19,6 +19,7 @@ hostprogs-y += lathist
hostprogs-y += offwaketime
hostprogs-y += spintest
hostprogs-y += map_perf_test
hostprogs-y += test_overhead
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
......@@ -38,6 +39,7 @@ lathist-objs := bpf_load.o libbpf.o lathist_user.o
offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
spintest-objs := bpf_load.o libbpf.o spintest_user.o
map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
......@@ -56,6 +58,8 @@ always += lathist_kern.o
always += offwaketime_kern.o
always += spintest_kern.o
always += map_perf_test_kern.o
always += test_overhead_tp_kern.o
always += test_overhead_kprobe_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
......@@ -75,6 +79,7 @@ HOSTLOADLIBES_lathist += -lelf
HOSTLOADLIBES_offwaketime += -lelf
HOSTLOADLIBES_spintest += -lelf
HOSTLOADLIBES_map_perf_test += -lelf -lrt
HOSTLOADLIBES_test_overhead += -lelf -lrt
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
......
......@@ -49,6 +49,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
bool is_socket = strncmp(event, "socket", 6) == 0;
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
enum bpf_prog_type prog_type;
char buf[256];
int fd, efd, err, id;
......@@ -63,6 +64,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
} else if (is_kprobe || is_kretprobe) {
prog_type = BPF_PROG_TYPE_KPROBE;
} else if (is_tracepoint) {
prog_type = BPF_PROG_TYPE_TRACEPOINT;
} else {
printf("Unknown event '%s'\n", event);
return -1;
......@@ -111,12 +114,23 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
event, strerror(errno));
return -1;
}
}
strcpy(buf, DEBUGFS);
strcat(buf, "events/kprobes/");
strcat(buf, event);
strcat(buf, "/id");
} else if (is_tracepoint) {
event += 11;
if (*event == 0) {
printf("event name cannot be empty\n");
return -1;
}
strcpy(buf, DEBUGFS);
strcat(buf, "events/");
strcat(buf, event);
strcat(buf, "/id");
}
efd = open(buf, O_RDONLY, 0);
if (efd < 0) {
......@@ -304,6 +318,7 @@ int load_bpf_file(char *path)
if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "tracepoint/", 11) == 0 ||
memcmp(shname_prog, "socket", 6) == 0)
load_and_attach(shname_prog, insns, data_prog->d_size);
}
......@@ -320,6 +335,7 @@ int load_bpf_file(char *path)
if (memcmp(shname, "kprobe/", 7) == 0 ||
memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 ||
memcmp(shname, "socket", 6) == 0)
load_and_attach(shname, data->d_buf, data->d_size);
}
......
......@@ -73,7 +73,7 @@ int waker(struct pt_regs *ctx)
return 0;
}
static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta)
static inline int update_counts(void *ctx, u32 pid, u64 delta)
{
struct key_t key = {};
struct wokeby_t *woke;
......@@ -100,15 +100,33 @@ static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta)
return 0;
}
#if 1
/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
struct sched_switch_args {
unsigned long long pad;
char prev_comm[16];
int prev_pid;
int prev_prio;
long long prev_state;
char next_comm[16];
int next_pid;
int next_prio;
};
SEC("tracepoint/sched/sched_switch")
int oncpu(struct sched_switch_args *ctx)
{
/* record previous thread sleep time */
u32 pid = ctx->prev_pid;
#else
SEC("kprobe/finish_task_switch")
int oncpu(struct pt_regs *ctx)
{
struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
/* record previous thread sleep time */
u32 pid = _(p->pid);
#endif
u64 delta, ts, *tsp;
u32 pid;
/* record previous thread sleep time */
pid = _(p->pid);
ts = bpf_ktime_get_ns();
bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
......
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/version.h>
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
SEC("kprobe/__set_task_comm")
int prog(struct pt_regs *ctx)
{
struct signal_struct *signal;
struct task_struct *tsk;
char oldcomm[16] = {};
char newcomm[16] = {};
u16 oom_score_adj;
u32 pid;
tsk = (void *)PT_REGS_PARM1(ctx);
pid = _(tsk->pid);
bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
signal = _(tsk->signal);
oom_score_adj = _(signal->oom_score_adj);
return 0;
}
SEC("kprobe/urandom_read")
int prog2(struct pt_regs *ctx)
{
return 0;
}
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
struct task_rename {
__u64 pad;
__u32 pid;
char oldcomm[16];
char newcomm[16];
__u16 oom_score_adj;
};
SEC("tracepoint/task/task_rename")
int prog(struct task_rename *ctx)
{
return 0;
}
/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */
struct urandom_read {
__u64 pad;
int got_bits;
int pool_left;
int input_left;
};
SEC("tracepoint/random/urandom_read")
int prog2(struct urandom_read *ctx)
{
return 0;
}
char _license[] SEC("license") = "GPL";
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <asm/unistd.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <signal.h>
#include <linux/bpf.h>
#include <string.h>
#include <time.h>
#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
#define MAX_CNT 1000000
static __u64 time_get_ns(void)
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec * 1000000000ull + ts.tv_nsec;
}
static void test_task_rename(int cpu)
{
__u64 start_time;
char buf[] = "test\n";
int i, fd;
fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
if (fd < 0) {
printf("couldn't open /proc\n");
exit(1);
}
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
write(fd, buf, sizeof(buf));
printf("task_rename:%d: %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
close(fd);
}
static void test_urandom_read(int cpu)
{
__u64 start_time;
char buf[4];
int i, fd;
fd = open("/dev/urandom", O_RDONLY);
if (fd < 0) {
printf("couldn't open /dev/urandom\n");
exit(1);
}
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
read(fd, buf, sizeof(buf));
printf("urandom_read:%d: %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
close(fd);
}
static void loop(int cpu, int flags)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(cpu, &cpuset);
sched_setaffinity(0, sizeof(cpuset), &cpuset);
if (flags & 1)
test_task_rename(cpu);
if (flags & 2)
test_urandom_read(cpu);
}
static void run_perf_test(int tasks, int flags)
{
pid_t pid[tasks];
int i;
for (i = 0; i < tasks; i++) {
pid[i] = fork();
if (pid[i] == 0) {
loop(i, flags);
exit(0);
} else if (pid[i] == -1) {
printf("couldn't spawn #%d process\n", i);
exit(1);
}
}
for (i = 0; i < tasks; i++) {
int status;
assert(waitpid(pid[i], &status, 0) == pid[i]);
assert(status == 0);
}
}
static void unload_progs(void)
{
close(prog_fd[0]);
close(prog_fd[1]);
close(event_fd[0]);
close(event_fd[1]);
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
char filename[256];
int num_cpu = 8;
int test_flags = ~0;
setrlimit(RLIMIT_MEMLOCK, &r);
if (argc > 1)
test_flags = atoi(argv[1]) ? : test_flags;
if (argc > 2)
num_cpu = atoi(argv[2]) ? : num_cpu;
if (test_flags & 0x3) {
printf("BASE\n");
run_perf_test(num_cpu, test_flags);
}
if (test_flags & 0xC) {
snprintf(filename, sizeof(filename),
"%s_kprobe_kern.o", argv[0]);
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
}
printf("w/KPROBE\n");
run_perf_test(num_cpu, test_flags >> 2);
unload_progs();
}
if (test_flags & 0x30) {
snprintf(filename, sizeof(filename),
"%s_tp_kern.o", argv[0]);
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
}
printf("w/TRACEPOINT\n");
run_perf_test(num_cpu, test_flags >> 4);
unload_progs();
}
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment