Commit 266d6f69 authored by Mark Drayton's avatar Mark Drayton

Use BPF_PERF_OUTPUT for fileslower and stacksnoop

parent 40cf791c
......@@ -31,7 +31,9 @@
from __future__ import print_function
from bcc import BPF
import argparse
import ctypes as ct
import signal
import time
# arguments
examples = """examples:
......@@ -60,17 +62,31 @@ def signal_ignore(signal, frame):
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>
#define TRACE_READ 0
#define TRACE_WRITE 1
enum trace_mode {
MODE_READ,
MODE_WRITE
};
struct val_t {
u32 sz;
u64 ts;
char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
};
struct data_t {
enum trace_mode mode;
u32 pid;
u32 sz;
u64 delta_us;
char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
};
BPF_HASH(entryinfo, pid_t, struct val_t);
BPF_PERF_OUTPUT(events);
// store timestamp and size on entry
static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
......@@ -92,6 +108,7 @@ static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
val.sz = count;
val.ts = bpf_ktime_get_ns();
__builtin_memcpy(&val.name, de->d_iname, sizeof(val.name));
bpf_get_current_comm(&val.comm, sizeof(val.comm));
entryinfo.update(&pid, &val);
return 0;
......@@ -131,23 +148,26 @@ static int trace_rw_return(struct pt_regs *ctx, int type)
if (delta_us < MIN_US)
return 0;
if (type == TRACE_READ) {
bpf_trace_printk("R %d %d %s\\n", valp->sz, delta_us, valp->name);
} else {
bpf_trace_printk("W %d %d %s\\n", valp->sz, delta_us, valp->name);
}
struct data_t data = {};
data.mode = type;
data.pid = pid;
data.sz = valp->sz;
data.delta_us = delta_us;
bpf_probe_read(&data.name, sizeof(data.name), valp->name);
bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
events.perf_submit(ctx, &data, sizeof(data));
return 0;
}
int trace_read_return(struct pt_regs *ctx)
{
return trace_rw_return(ctx, TRACE_READ);
return trace_rw_return(ctx, MODE_READ);
}
int trace_write_return(struct pt_regs *ctx)
{
return trace_rw_return(ctx, TRACE_WRITE);
return trace_rw_return(ctx, MODE_WRITE);
}
"""
......@@ -171,26 +191,40 @@ b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return")
b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return")
TASK_COMM_LEN = 16 # linux/sched.h
DNAME_INLINE_LEN = 32 # linux/dcache.h
class Data(ct.Structure):
_fields_ = [
("mode", ct.c_int),
("pid", ct.c_uint),
("sz", ct.c_uint),
("delta_us", ct.c_ulonglong),
("name", ct.c_char * DNAME_INLINE_LEN),
("comm", ct.c_char * TASK_COMM_LEN),
]
mode_s = {
0: 'R',
1: 'W',
}
# header
print("Tracing sync read/writes slower than %d ms" % min_ms)
print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "PID", "D",
"BYTES", "LAT(ms)", "FILENAME"))
start_ts = 0
start_ts = time.time()
# format output
while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
args = msg.split(" ", 3)
(type_s, sz, delta_us_s) = (args[0], args[1], args[2])
try:
filename = args[3]
except:
filename = "?"
if start_ts == 0:
start_ts = ts
ms = float(int(delta_us_s, 10)) / 1000
def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents
ms = float(event.delta_us) / 1000
print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % (
ts - start_ts, task, pid, type_s, sz, ms, filename))
time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, event.name))
b["events"].open_perf_buffer(print_event)
while 1:
b.kprobe_poll()
......@@ -17,17 +17,30 @@ limitations under the License.
local program = [[
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct data_t {
u64 stack_id;
u32 pid;
char comm[TASK_COMM_LEN];
};
BPF_STACK_TRACE(stack_traces, 128)
BPF_PERF_OUTPUT(events);
void trace_stack(struct pt_regs *ctx) {
u32 pid = bpf_get_current_pid_tgid();
FILTER
int stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
if (stack_id >= 0)
bpf_trace_printk("stack_id=%d\n", stack_id);
struct data_t data = {};
data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID),
data.pid = pid;
bpf_get_current_comm(&data.comm, sizeof(data.comm));
events.perf_submit(ctx, &data, sizeof(data));
}
]]
local ffi = require("ffi")
return function(BPF, utils)
local parser = utils.argparse("stacksnoop",
"Trace and print kernel stack traces for a kernel function")
......@@ -41,11 +54,7 @@ return function(BPF, utils)
local filter = ""
if args.pid then
filter = [[
u32 pid;
pid = bpf_get_current_pid_tgid();
if (pid != %d) { return; }
]] % args.pid
filter = "if (pid != %d) { return; }" % args.pid
end
local text = program:gsub("FILTER", filter)
......@@ -53,40 +62,50 @@ return function(BPF, utils)
bpf:attach_kprobe{event=args.fn, fn_name="trace_stack"}
if BPF.num_open_kprobes() == 0 then
print("Function \"%s\" not found. Exiting." % args.fn)
print("Function \"%s\" not found. Exiting." % {args.fn})
return
end
if args.verbose then
print("%-18s %-12s %-6s %-3s %s" %
{"TIME(s)", "COMM", "PID", "CPU", "SYSCALL"})
{"TIME(s)", "COMM", "PID", "CPU", "FUNCTION"})
else
print("%-18s %s" % {"TIME(s)", "SYSCALL"})
print("%-18s %s" % {"TIME(s)", "FUNCTION"})
end
local stack_traces = bpf:get_table("stack_traces")
local pipe = bpf:pipe()
local start_ts = utils.posix.time_ns()
while true do
local task, pid, cpu, flags, ts, msg = pipe:trace_fields()
local stack_id = string.match(msg, "stack_id=(%d+)")
local function print_event(cpu, event)
local ts = (utils.posix.time_ns() - start_ts) / 1e9
if stack_id then
if args.verbose then
print("%-18.9f %-12.12s %-6d %-3d %s" % {ts, task, pid, cpu, args.fn})
else
print("%-18.9f %s" % {ts, args.fn})
end
if args.verbose then
print("%-18.9f %-12.12s %-6d %-3d %s" %
{ts, ffi.string(event.comm), event.pid, cpu, args.fn})
else
print("%-18.9f %s" % {ts, args.fn})
end
for addr in stack_traces:walk(tonumber(stack_id)) do
local sym, offset = ksym:resolve(addr)
if args.offset then
print("\t%-16p %s+0x%x" % {addr, sym, tonumber(offset)})
else
print("\t%-16p %s" % {addr, sym})
end
for addr in stack_traces:walk(tonumber(event.stack_id)) do
local sym, offset = ksym:resolve(addr)
if args.offset then
print("\t%-16p %s+0x%x" % {addr, sym, tonumber(offset)})
else
print("\t%-16p %s" % {addr, sym})
end
end
print()
end
local TASK_COMM_LEN = 16 -- linux/sched.h
bpf:get_table("events"):open_perf_buffer(print_event, [[
struct {
uint64_t stack_id;
uint32_t pid;
char comm[%d];
}
]] % {TASK_COMM_LEN})
bpf:kprobe_poll_loop()
end
......@@ -20,7 +20,8 @@
from __future__ import print_function
from bcc import BPF
import argparse
import re
import ctypes as ct
import time
# arguments
examples = """examples:
......@@ -50,20 +51,30 @@ debug = 0
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct data_t {
u64 stack_id;
u32 pid;
char comm[TASK_COMM_LEN];
};
BPF_STACK_TRACE(stack_traces, 128)
BPF_PERF_OUTPUT(events);
void trace_stack(struct pt_regs *ctx) {
u32 pid = bpf_get_current_pid_tgid();
FILTER
int stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
if (stack_id >= 0)
bpf_trace_printk("stack_id=%d\\n", stack_id);
struct data_t data = {};
data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID),
data.pid = pid;
bpf_get_current_comm(&data.comm, sizeof(data.comm));
events.perf_submit(ctx, &data, sizeof(data));
}
"""
if args.pid:
bpf_text = bpf_text.replace('FILTER',
('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
'if (pid != %s) { return; }') % (args.pid))
'if (pid != %s) { return; }' % args.pid)
else:
bpf_text = bpf_text.replace('FILTER', '')
if debug:
......@@ -72,33 +83,48 @@ if debug:
# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event=function, fn_name="trace_stack")
TASK_COMM_LEN = 16 # linux/sched.h
class Data(ct.Structure):
_fields_ = [
("stack_id", ct.c_ulonglong),
("pid", ct.c_uint),
("comm", ct.c_char * TASK_COMM_LEN),
]
matched = b.num_open_kprobes()
if matched == 0:
print("Function \"%s\" not found. Exiting." % function)
exit()
stack_traces = b.get_table("stack_traces")
msg_regexp = re.compile("stack_id=(\d+)")
start_ts = time.time()
# header
if verbose:
print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU", "SYSCALL"))
print("%-18s %-12s %-6s %-3s %s" %
("TIME(s)", "COMM", "PID", "CPU", "FUNCTION"))
else:
print("%-18s %s" % ("TIME(s)", "SYSCALL"))
print("%-18s %s" % ("TIME(s)", "FUNCTION"))
def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents
ts = time.time() - start_ts
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, event.comm, event.pid, cpu,
function))
else:
print("%-18.9f %s" % (ts, function))
for addr in stack_traces.walk(event.stack_id):
sym = b.ksymaddr(addr) if offset else b.ksym(addr)
print("\t%016x %s" % (addr, sym))
print()
# format output
b["events"].open_perf_buffer(print_event)
while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
m = msg_regexp.match(msg)
if m:
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, function))
else:
print("%-18.9f %s" % (ts, function))
stack_id = int(m.group(1))
for addr in stack_traces.walk(stack_id):
sym = b.ksymaddr(addr) if offset else b.ksym(addr)
print("\t%016x %s" % (addr, sym))
print()
b.kprobe_poll()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment