Commit 70a66320 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'probes-v6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull probes updates from Masami Hiramatsu:

 - tracing/probes: Add new pseudo-types %pd and %pD support for dumping
   dentry name from 'struct dentry *' and file name from 'struct file *'

 - uprobes performance optimizations:
    - Speed up the BPF uprobe event by delaying the fetching of the
      uprobe event arguments that are not used in BPF
    - Avoid locking by speculatively checking whether uprobe event is
      valid
    - Reduce lock contention by using read/write_lock instead of
      spinlock for uprobe list operation. This improved BPF uprobe
      benchmark result 43% on average

 - rethook: Remove non-fatal warning messages when tracing stack from
   BPF and skip rcu_is_watching() validation in rethook if possible

 - objpool: Optimize objpool (which is used by kretprobes and fprobe as
   rethook backend storage) by inlining functions and avoid caching
   nr_cpu_ids because it is a const value

 - fprobe: Add entry/exit callbacks types (code cleanup)

 - kprobes: Check ftrace was killed in kprobes if it uses ftrace

* tag 'probes-v6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  kprobe/ftrace: bail out if ftrace was killed
  selftests/ftrace: Fix required features for VFS type test case
  objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids
  objpool: enable inlining objpool_push() and objpool_pop() operations
  rethook: honor CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING in rethook_try_get()
  ftrace: make extra rcu_is_watching() validation check optional
  uprobes: reduce contention on uprobes_tree access
  rethook: Remove warning messages printed for finding return address of a frame.
  fprobe: Add entry/exit callbacks types
  selftests/ftrace: add fprobe test cases for VFS type "%pd" and "%pD"
  selftests/ftrace: add kprobe test cases for VFS type "%pd" and "%pD"
  Documentation: tracing: add new type '%pd' and '%pD' for kprobe
  tracing/probes: support '%pD' type for print struct file's name
  tracing/probes: support '%pd' type for print struct dentry's name
  uprobes: add speculative lockless system-wide uprobe filter check
  uprobes: prepare uprobe args buffer lazily
  uprobes: encapsulate preparation of uprobe args buffer
parents e9d68251 1a7d0890
......@@ -58,8 +58,9 @@ Synopsis of kprobe_events
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr"
and bitfield are supported.
(x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
"string", "ustring", "symbol", "symstr" and bitfield are
supported.
(\*1) only for the probe on function entry (offs == 0). Note, this argument access
is best effort, because depending on the argument type, it may be passed on
......@@ -122,6 +123,9 @@ With 'symstr' type, you can filter the event with wildcard pattern of the
symbols, and you don't need to solve symbol name by yourself.
For $comm, the default type is "string"; any other type is invalid.
VFS layer common type(%pd/%pD) is a special type, which fetches dentry's or
file's name from struct dentry's address or struct file's address.
.. _user_mem_access:
User Memory Access
......
......@@ -12,6 +12,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe_ctlblk *kcb;
struct pt_regs *regs;
if (unlikely(kprobe_ftrace_disabled))
return;
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
......
......@@ -287,6 +287,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe *p;
struct kprobe_ctlblk *kcb;
if (unlikely(kprobe_ftrace_disabled))
return;
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
......
......@@ -206,6 +206,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe *p;
int bit;
if (unlikely(kprobe_ftrace_disabled))
return;
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
......
......@@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
struct pt_regs *regs;
int bit;
if (unlikely(kprobe_ftrace_disabled))
return;
bit = ftrace_test_recursion_trylock(nip, parent_nip);
if (bit < 0)
return;
......
......@@ -11,6 +11,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe_ctlblk *kcb;
int bit;
if (unlikely(kprobe_ftrace_disabled))
return;
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
......
......@@ -296,6 +296,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe *p;
int bit;
if (unlikely(kprobe_ftrace_disabled))
return;
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
......
......@@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe_ctlblk *kcb;
int bit;
if (unlikely(kprobe_ftrace_disabled))
return;
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
......
......@@ -7,6 +7,16 @@
#include <linux/ftrace.h>
#include <linux/rethook.h>
struct fprobe;
typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip,
unsigned long ret_ip, struct pt_regs *regs,
void *entry_data);
typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip,
unsigned long ret_ip, struct pt_regs *regs,
void *entry_data);
/**
* struct fprobe - ftrace based probe.
* @ops: The ftrace_ops.
......@@ -34,12 +44,8 @@ struct fprobe {
size_t entry_data_size;
int nr_maxactive;
int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
unsigned long ret_ip, struct pt_regs *regs,
void *entry_data);
void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
unsigned long ret_ip, struct pt_regs *regs,
void *entry_data);
fprobe_entry_cb entry_handler;
fprobe_exit_cb exit_handler;
};
/* This fprobe is soft-disabled. */
......
......@@ -378,11 +378,15 @@ static inline void wait_for_kprobe_optimizer(void) { }
extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs);
extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
/* Set when ftrace has been killed: kprobes on ftrace must be disabled for safety */
extern bool kprobe_ftrace_disabled __read_mostly;
extern void kprobe_ftrace_kill(void);
#else
static inline int arch_prepare_kprobe_ftrace(struct kprobe *p)
{
return -EINVAL;
}
static inline void kprobe_ftrace_kill(void) {}
#endif /* CONFIG_KPROBES_ON_FTRACE */
/* Get the kprobe at this addr (if any) - called with preemption disabled */
......@@ -495,6 +499,9 @@ static inline void kprobe_flush_task(struct task_struct *tk)
static inline void kprobe_free_init_mem(void)
{
}
static inline void kprobe_ftrace_kill(void)
{
}
static inline int disable_kprobe(struct kprobe *kp)
{
return -EOPNOTSUPP;
......
......@@ -5,6 +5,10 @@
#include <linux/types.h>
#include <linux/refcount.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/irqflags.h>
#include <linux/smp.h>
/*
* objpool: ring-array based lockless MPMC queue
......@@ -69,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
* struct objpool_head - object pooling metadata
* @obj_size: object size, aligned to sizeof(void *)
* @nr_objs: total objs (to be pre-allocated with objpool)
* @nr_cpus: local copy of nr_cpu_ids
* @nr_possible_cpus: cached value of num_possible_cpus()
* @capacity: max objs can be managed by one objpool_slot
* @gfp: gfp flags for kmalloc & vmalloc
* @ref: refcount of objpool
......@@ -81,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
struct objpool_head {
int obj_size;
int nr_objs;
int nr_cpus;
int nr_possible_cpus;
int capacity;
gfp_t gfp;
refcount_t ref;
......@@ -118,13 +122,94 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
gfp_t gfp, void *context, objpool_init_obj_cb objinit,
objpool_fini_cb release);
/* try to retrieve object from slot */
static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu)
{
struct objpool_slot *slot = pool->cpu_slots[cpu];
/* load head snapshot, other cpus may change it */
uint32_t head = smp_load_acquire(&slot->head);
while (head != READ_ONCE(slot->last)) {
void *obj;
/*
* data visibility of 'last' and 'head' could be out of
* order since memory updating of 'last' and 'head' are
* performed in push() and pop() independently
*
* before any retrieving attempts, pop() must guarantee
* 'last' is behind 'head', that is to say, there must
* be available objects in slot, which could be ensured
* by condition 'last != head && last - head <= nr_objs'
* that is equivalent to 'last - head - 1 < nr_objs' as
* 'last' and 'head' are both unsigned int32
*/
if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
head = READ_ONCE(slot->head);
continue;
}
/* obj must be retrieved before moving forward head */
obj = READ_ONCE(slot->entries[head & slot->mask]);
/* move head forward to mark it's consumption */
if (try_cmpxchg_release(&slot->head, &head, head + 1))
return obj;
}
return NULL;
}
/**
* objpool_pop() - allocate an object from objpool
* @pool: object pool
*
* return value: object ptr or NULL if failed
*/
void *objpool_pop(struct objpool_head *pool);
static inline void *objpool_pop(struct objpool_head *pool)
{
void *obj = NULL;
unsigned long flags;
int i, cpu;
/* disable local irq to avoid preemption & interruption */
raw_local_irq_save(flags);
cpu = raw_smp_processor_id();
for (i = 0; i < pool->nr_possible_cpus; i++) {
obj = __objpool_try_get_slot(pool, cpu);
if (obj)
break;
cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
}
raw_local_irq_restore(flags);
return obj;
}
/* adding object to slot, abort if the slot was already full */
static inline int
__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
{
struct objpool_slot *slot = pool->cpu_slots[cpu];
uint32_t head, tail;
/* loading tail and head as a local snapshot, tail first */
tail = READ_ONCE(slot->tail);
do {
head = READ_ONCE(slot->head);
/* fault caught: something must be wrong */
WARN_ON_ONCE(tail - head > pool->nr_objs);
} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
/* now the tail position is reserved for the given obj */
WRITE_ONCE(slot->entries[tail & slot->mask], obj);
/* update sequence to make this obj available for pop() */
smp_store_release(&slot->last, tail + 1);
return 0;
}
/**
* objpool_push() - reclaim the object and return back to objpool
......@@ -134,7 +219,19 @@ void *objpool_pop(struct objpool_head *pool);
* return: 0 or error code (it fails only when user tries to push
* the same object multiple times or wrong "objects" into objpool)
*/
int objpool_push(void *obj, struct objpool_head *pool);
static inline int objpool_push(void *obj, struct objpool_head *pool)
{
unsigned long flags;
int rc;
/* disable local irq to avoid preemption & interruption */
raw_local_irq_save(flags);
rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id());
raw_local_irq_restore(flags);
return rc;
}
/**
* objpool_drop() - discard the object and deref objpool
......
......@@ -135,7 +135,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
# define do_ftrace_record_recursion(ip, pip) do { } while (0)
#endif
#ifdef CONFIG_ARCH_WANTS_NO_INSTR
#ifdef CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING
# define trace_warn_on_no_rcu(ip) \
({ \
bool __ret = !rcu_is_watching(); \
......
......@@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
*/
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
......@@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe;
spin_lock(&uprobes_treelock);
read_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
spin_unlock(&uprobes_treelock);
read_unlock(&uprobes_treelock);
return uprobe;
}
......@@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
spin_lock(&uprobes_treelock);
write_lock(&uprobes_treelock);
u = __insert_uprobe(uprobe);
spin_unlock(&uprobes_treelock);
write_unlock(&uprobes_treelock);
return u;
}
......@@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe)
if (WARN_ON(!uprobe_is_active(uprobe)))
return;
spin_lock(&uprobes_treelock);
write_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
write_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
put_uprobe(uprobe);
}
......@@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode,
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
spin_lock(&uprobes_treelock);
read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
......@@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode,
get_uprobe(u);
}
}
spin_unlock(&uprobes_treelock);
read_unlock(&uprobes_treelock);
}
/* @vma contains reference counter, not the probed instruction. */
......@@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
spin_lock(&uprobes_treelock);
read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
spin_unlock(&uprobes_treelock);
read_unlock(&uprobes_treelock);
return !!n;
}
......
......@@ -1067,6 +1067,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
bool kprobe_ftrace_disabled;
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
......@@ -1135,6 +1136,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
void kprobe_ftrace_kill()
{
kprobe_ftrace_disabled = true;
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
......
......@@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE
This file can be reset, but the limit can not change in
size at runtime.
config FTRACE_VALIDATE_RCU_IS_WATCHING
bool "Validate RCU is on during ftrace execution"
depends on FUNCTION_TRACER
depends on ARCH_WANTS_NO_INSTR
help
All callbacks that attach to the function tracing have some sort of
protection against recursion. This option is only to verify that
ftrace (and other users of ftrace_test_recursion_trylock()) are not
called outside of RCU, as if they are, it can cause a race. But it
also has a noticeable overhead when enabled.
If unsure, say N
config RING_BUFFER_RECORD_RECURSION
bool "Record functions that recurse in the ring buffer"
depends on FTRACE_RECORD_RECURSION
......
......@@ -7894,6 +7894,7 @@ void ftrace_kill(void)
ftrace_disabled = 1;
ftrace_enabled = 0;
ftrace_trace_function = ftrace_stub;
kprobe_ftrace_kill();
}
/**
......
......@@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
if (unlikely(!handler))
return NULL;
#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
/*
* This expects the caller will set up a rethook on a function entry.
* When the function returns, the rethook will eventually be reclaimed
......@@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
*/
if (unlikely(!rcu_is_watching()))
return NULL;
#endif
return (struct rethook_node *)objpool_pop(&rh->pool);
}
......@@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame
if (WARN_ON_ONCE(!cur))
return 0;
if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
if (tsk != current && task_is_running(tsk))
return 0;
do {
......
......@@ -5540,7 +5540,7 @@ static const char readme_msg[] =
"\t kernel return probes support: $retval, $arg<N>, $comm\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
"\t symstr, <type>\\[<array-size>\\]\n"
"\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
......
......@@ -994,6 +994,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
char gbuf[MAX_EVENT_NAME_LEN];
char sbuf[KSYM_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
char *dbuf = NULL;
bool is_tracepoint = false;
struct tracepoint *tpoint = NULL;
struct traceprobe_parse_context ctx = {
......@@ -1104,6 +1105,10 @@ static int __trace_fprobe_create(int argc, const char *argv[])
argv = new_argv;
}
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
goto out;
/* setup a probe */
tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive,
argc, is_return);
......@@ -1154,6 +1159,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
trace_probe_log_clear();
kfree(new_argv);
kfree(symbol);
kfree(dbuf);
return ret;
parse_error:
......
......@@ -800,6 +800,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
char *dbuf = NULL;
struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
switch (argv[0][0]) {
......@@ -951,6 +952,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
argv = new_argv;
}
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
goto out;
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
argc, is_return);
......@@ -997,6 +1002,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_clear();
kfree(new_argv);
kfree(symbol);
kfree(dbuf);
return ret;
parse_error:
......
......@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "trace_probe: " fmt
#include <linux/bpf.h>
#include <linux/fs.h>
#include "trace_btf.h"
#include "trace_probe.h"
......@@ -1737,6 +1738,68 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
return ERR_PTR(ret);
}
/* @buf: *buf must be equal to NULL. Caller must to free *buf */
int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
{
int i, used, ret;
const int bufsize = MAX_DENTRY_ARGS_LEN;
char *tmpbuf = NULL;
if (*buf)
return -EINVAL;
used = 0;
for (i = 0; i < argc; i++) {
char *tmp;
char *equal;
size_t arg_len;
if (!glob_match("*:%p[dD]", argv[i]))
continue;
if (!tmpbuf) {
tmpbuf = kmalloc(bufsize, GFP_KERNEL);
if (!tmpbuf)
return -ENOMEM;
}
tmp = kstrdup(argv[i], GFP_KERNEL);
if (!tmp)
goto nomem;
equal = strchr(tmp, '=');
if (equal)
*equal = '\0';
arg_len = strlen(argv[i]);
tmp[arg_len - 4] = '\0';
if (argv[i][arg_len - 1] == 'd')
ret = snprintf(tmpbuf + used, bufsize - used,
"%s%s+0x0(+0x%zx(%s)):string",
equal ? tmp : "", equal ? "=" : "",
offsetof(struct dentry, d_name.name),
equal ? equal + 1 : tmp);
else
ret = snprintf(tmpbuf + used, bufsize - used,
"%s%s+0x0(+0x%zx(+0x%zx(%s))):string",
equal ? tmp : "", equal ? "=" : "",
offsetof(struct dentry, d_name.name),
offsetof(struct file, f_path.dentry),
equal ? equal + 1 : tmp);
kfree(tmp);
if (ret >= bufsize - used)
goto nomem;
argv[i] = tmpbuf + used;
used += ret + 1;
}
*buf = tmpbuf;
return 0;
nomem:
kfree(tmpbuf);
return -ENOMEM;
}
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
{
clear_btf_context(ctx);
......
......@@ -34,6 +34,7 @@
#define MAX_ARRAY_LEN 64
#define MAX_ARG_NAME_LEN 32
#define MAX_BTF_ARGS_LEN 128
#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
......@@ -428,6 +429,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char **traceprobe_expand_meta_args(int argc, const char *argv[],
int *new_argc, char *buf, int bufsize,
struct traceprobe_parse_context *ctx);
extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
......
......@@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = {
struct uprobe_cpu_buffer {
struct mutex mutex;
void *buf;
int dsize;
};
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
static int uprobe_buffer_refcnt;
......@@ -940,30 +941,56 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
{
if (!ucb)
return;
mutex_unlock(&ucb->mutex);
}
static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
struct pt_regs *regs,
struct uprobe_cpu_buffer **ucbp)
{
struct uprobe_cpu_buffer *ucb;
int dsize, esize;
if (*ucbp)
return *ucbp;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
dsize = __get_data_size(&tu->tp, regs, NULL);
ucb = uprobe_buffer_get();
ucb->dsize = tu->tp.size + dsize;
store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
*ucbp = ucb;
return ucb;
}
static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize,
struct uprobe_cpu_buffer **ucbp,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
struct uprobe_cpu_buffer *ucb;
void *data;
int size, esize;
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
WARN_ON(call != trace_file->event_call);
if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
ucb = prepare_uprobe_buffer(tu, regs, ucbp);
if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE))
return;
if (trace_trigger_soft_disabled(trace_file))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
size = esize + ucb->dsize;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
......@@ -977,14 +1004,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
memcpy(data, ucb->buf, tu->tp.size + dsize);
memcpy(data, ucb->buf, ucb->dsize);
trace_event_buffer_commit(&fbuffer);
}
/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
......@@ -993,7 +1020,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
__uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
__uprobe_trace_func(tu, 0, regs, ucbp, link->file);
rcu_read_unlock();
return 0;
......@@ -1001,13 +1028,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
__uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
__uprobe_trace_func(tu, func, regs, ucbp, link->file);
rcu_read_unlock();
}
......@@ -1199,9 +1226,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
{
struct perf_event *event;
if (filter->nr_systemwide)
return true;
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
if (event->hw.target->mm == mm)
return true;
......@@ -1326,6 +1350,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
tu = container_of(uc, struct trace_uprobe, consumer);
filter = tu->tp.event->filter;
/*
* speculative short-circuiting check to avoid unnecessarily taking
* filter->rwlock below, if the uprobe has system-wide consumer
*/
if (READ_ONCE(filter->nr_systemwide))
return true;
read_lock(&filter->rwlock);
ret = __uprobe_perf_filter(filter, mm);
read_unlock(&filter->rwlock);
......@@ -1335,10 +1366,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
struct uprobe_cpu_buffer **ucbp)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
struct uprobe_cpu_buffer *ucb;
struct hlist_head *head;
void *data;
int size, esize;
......@@ -1356,7 +1388,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
ucb = prepare_uprobe_buffer(tu, regs, ucbp);
size = esize + ucb->dsize;
size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return;
......@@ -1379,13 +1412,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
memcpy(data, ucb->buf, tu->tp.size + dsize);
if (size - esize > tu->tp.size + dsize) {
int len = tu->tp.size + dsize;
memcpy(data, ucb->buf, ucb->dsize);
memset(data + len, 0, size - esize - len);
}
if (size - esize > ucb->dsize)
memset(data + ucb->dsize, 0, size - esize - ucb->dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
......@@ -1395,21 +1425,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
struct uprobe_cpu_buffer **ucbp)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
__uprobe_perf_func(tu, 0, regs, ucb, dsize);
__uprobe_perf_func(tu, 0, regs, ucbp);
return 0;
}
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
struct uprobe_cpu_buffer **ucbp)
{
__uprobe_perf_func(tu, func, regs, ucb, dsize);
__uprobe_perf_func(tu, func, regs, ucbp);
}
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
......@@ -1474,11 +1504,9 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
struct uprobe_cpu_buffer *ucb;
int dsize, esize;
struct uprobe_cpu_buffer *ucb = NULL;
int ret = 0;
tu = container_of(con, struct trace_uprobe, consumer);
tu->nhit++;
......@@ -1490,18 +1518,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
dsize = __get_data_size(&tu->tp, regs, NULL);
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
ret |= uprobe_trace_func(tu, regs, ucb, dsize);
ret |= uprobe_trace_func(tu, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
ret |= uprobe_perf_func(tu, regs, ucb, dsize);
ret |= uprobe_perf_func(tu, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return ret;
......@@ -1512,8 +1534,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
struct uprobe_cpu_buffer *ucb;
int dsize, esize;
struct uprobe_cpu_buffer *ucb = NULL;
tu = container_of(con, struct trace_uprobe, consumer);
......@@ -1525,18 +1546,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
dsize = __get_data_size(&tu->tp, regs, NULL);
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
ucb = uprobe_buffer_get();
store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
uretprobe_trace_func(tu, func, regs, ucb, dsize);
uretprobe_trace_func(tu, func, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
uretprobe_perf_func(tu, func, regs, ucb, dsize);
uretprobe_perf_func(tu, func, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return 0;
......
......@@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
{
int i, cpu_count = 0;
for (i = 0; i < pool->nr_cpus; i++) {
for (i = 0; i < nr_cpu_ids; i++) {
struct objpool_slot *slot;
int nodes, size, rc;
......@@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
continue;
/* compute how many objects to be allocated with this slot */
nodes = nr_objs / num_possible_cpus();
if (cpu_count < (nr_objs % num_possible_cpus()))
nodes = nr_objs / pool->nr_possible_cpus;
if (cpu_count < (nr_objs % pool->nr_possible_cpus))
nodes++;
cpu_count++;
......@@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool)
if (!pool->cpu_slots)
return;
for (i = 0; i < pool->nr_cpus; i++)
for (i = 0; i < nr_cpu_ids; i++)
kvfree(pool->cpu_slots[i]);
kfree(pool->cpu_slots);
}
......@@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
/* initialize objpool pool */
memset(pool, 0, sizeof(struct objpool_head));
pool->nr_cpus = nr_cpu_ids;
pool->nr_possible_cpus = num_possible_cpus();
pool->obj_size = object_size;
pool->capacity = capacity;
pool->gfp = gfp & ~__GFP_ZERO;
pool->context = context;
pool->release = release;
slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
pool->cpu_slots = kzalloc(slot_size, pool->gfp);
if (!pool->cpu_slots)
return -ENOMEM;
......@@ -152,106 +152,6 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
}
EXPORT_SYMBOL_GPL(objpool_init);
/* adding object to slot, abort if the slot was already full */
static inline int
objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
{
struct objpool_slot *slot = pool->cpu_slots[cpu];
uint32_t head, tail;
/* loading tail and head as a local snapshot, tail first */
tail = READ_ONCE(slot->tail);
do {
head = READ_ONCE(slot->head);
/* fault caught: something must be wrong */
WARN_ON_ONCE(tail - head > pool->nr_objs);
} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
/* now the tail position is reserved for the given obj */
WRITE_ONCE(slot->entries[tail & slot->mask], obj);
/* update sequence to make this obj available for pop() */
smp_store_release(&slot->last, tail + 1);
return 0;
}
/* reclaim an object to object pool */
int objpool_push(void *obj, struct objpool_head *pool)
{
unsigned long flags;
int rc;
/* disable local irq to avoid preemption & interruption */
raw_local_irq_save(flags);
rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
raw_local_irq_restore(flags);
return rc;
}
EXPORT_SYMBOL_GPL(objpool_push);
/* try to retrieve object from slot */
static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
{
struct objpool_slot *slot = pool->cpu_slots[cpu];
/* load head snapshot, other cpus may change it */
uint32_t head = smp_load_acquire(&slot->head);
while (head != READ_ONCE(slot->last)) {
void *obj;
/*
* data visibility of 'last' and 'head' could be out of
* order since memory updating of 'last' and 'head' are
* performed in push() and pop() independently
*
* before any retrieving attempts, pop() must guarantee
* 'last' is behind 'head', that is to say, there must
* be available objects in slot, which could be ensured
* by condition 'last != head && last - head <= nr_objs'
* that is equivalent to 'last - head - 1 < nr_objs' as
* 'last' and 'head' are both unsigned int32
*/
if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
head = READ_ONCE(slot->head);
continue;
}
/* obj must be retrieved before moving forward head */
obj = READ_ONCE(slot->entries[head & slot->mask]);
/* move head forward to mark it's consumption */
if (try_cmpxchg_release(&slot->head, &head, head + 1))
return obj;
}
return NULL;
}
/* allocate an object from object pool */
void *objpool_pop(struct objpool_head *pool)
{
void *obj = NULL;
unsigned long flags;
int i, cpu;
/* disable local irq to avoid preemption & interruption */
raw_local_irq_save(flags);
cpu = raw_smp_processor_id();
for (i = 0; i < num_possible_cpus(); i++) {
obj = objpool_try_get_slot(pool, cpu);
if (obj)
break;
cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
}
raw_local_irq_restore(flags);
return obj;
}
EXPORT_SYMBOL_GPL(objpool_pop);
/* release whole objpool forcely */
void objpool_free(struct objpool_head *pool)
{
......
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# description: Fprobe event VFS type argument
# requires: dynamic_events "%pd/%pD":README "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README
: "Test argument %pd with name for fprobe"
echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events
echo 1 > events/fprobes/testprobe/enable
grep -q "1" events/fprobes/testprobe/enable
echo 0 > events/fprobes/testprobe/enable
grep "dput" trace | grep -q "enable"
echo "" > dynamic_events
echo "" > trace
: "Test argument %pd without name for fprobe"
echo 'f:testprobe dput $arg1:%pd' > dynamic_events
echo 1 > events/fprobes/testprobe/enable
grep -q "1" events/fprobes/testprobe/enable
echo 0 > events/fprobes/testprobe/enable
grep "dput" trace | grep -q "enable"
echo "" > dynamic_events
echo "" > trace
: "Test argument %pD with name for fprobe"
echo 'f:testprobe vfs_read name=$arg1:%pD' > dynamic_events
echo 1 > events/fprobes/testprobe/enable
grep -q "1" events/fprobes/testprobe/enable
echo 0 > events/fprobes/testprobe/enable
grep "vfs_read" trace | grep -q "enable"
echo "" > dynamic_events
echo "" > trace
: "Test argument %pD without name for fprobe"
echo 'f:testprobe vfs_read $arg1:%pD' > dynamic_events
echo 1 > events/fprobes/testprobe/enable
grep -q "1" events/fprobes/testprobe/enable
echo 0 > events/fprobes/testprobe/enable
grep "vfs_read" trace | grep -q "enable"
echo "" > dynamic_events
echo "" > trace
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# description: Kprobe event VFS type argument
# requires: kprobe_events "%pd/%pD":README
: "Test argument %pd with name"
echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events
echo 1 > events/kprobes/testprobe/enable
grep -q "1" events/kprobes/testprobe/enable
echo 0 > events/kprobes/testprobe/enable
grep "dput" trace | grep -q "enable"
echo "" > kprobe_events
echo "" > trace
: "Test argument %pd without name"
echo 'p:testprobe dput $arg1:%pd' > kprobe_events
echo 1 > events/kprobes/testprobe/enable
grep -q "1" events/kprobes/testprobe/enable
echo 0 > events/kprobes/testprobe/enable
grep "dput" trace | grep -q "enable"
echo "" > kprobe_events
echo "" > trace
: "Test argument %pD with name"
echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_events
echo 1 > events/kprobes/testprobe/enable
grep -q "1" events/kprobes/testprobe/enable
echo 0 > events/kprobes/testprobe/enable
grep "vfs_read" trace | grep -q "enable"
echo "" > kprobe_events
echo "" > trace
: "Test argument %pD without name"
echo 'p:testprobe vfs_read $arg1:%pD' > kprobe_events
echo 1 > events/kprobes/testprobe/enable
grep -q "1" events/kprobes/testprobe/enable
echo 0 > events/kprobes/testprobe/enable
grep "vfs_read" trace | grep -q "enable"
echo "" > kprobe_events
echo "" > trace
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment