Commit a2478d86 authored by Brenden Blanco's avatar Brenden Blanco

Merge pull request #372 from mcaleavya/master

migrated biosnoop and bashreadline to use bpf_perf_event_output
parents d189d451 62c6152c
...@@ -10,6 +10,10 @@ entered command may fail: this is just showing what was entered. ...@@ -10,6 +10,10 @@ entered command may fail: this is just showing what was entered.
This program is also a basic example of eBPF/bcc and uprobes. This program is also a basic example of eBPF/bcc and uprobes.
This makes use of a Linux 4.5 feature (bpf_perf_event_output());
for kernels older than 4.5, see the version under tools/old,
which uses an older mechanism
Since this uses BPF, only the root user can use this tool. Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS .SH REQUIREMENTS
CONFIG_BPF and bcc. CONFIG_BPF and bcc.
......
...@@ -15,6 +15,10 @@ request, as well as a starting timestamp for calculating I/O latency. ...@@ -15,6 +15,10 @@ request, as well as a starting timestamp for calculating I/O latency.
This works by tracing various kernel blk_*() functions using dynamic tracing, This works by tracing various kernel blk_*() functions using dynamic tracing,
and will need updating to match any changes to these functions. and will need updating to match any changes to these functions.
This makes use of a Linux 4.5 feature (bpf_perf_event_output());
for kernels older than 4.5, see the version under tools/old,
which uses an older mechanism
Since this uses BPF, only the root user can use this tool. Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS .SH REQUIREMENTS
CONFIG_BPF and bcc. CONFIG_BPF and bcc.
......
...@@ -9,35 +9,55 @@ ...@@ -9,35 +9,55 @@
# Licensed under the Apache License, Version 2.0 (the "License") # Licensed under the Apache License, Version 2.0 (the "License")
# #
# 28-Jan-2016 Brendan Gregg Created this. # 28-Jan-2016 Brendan Gregg Created this.
# 12-Feb-2016 Allan McAleavy migrated to BPF_PERF_OUTPUT
from __future__ import print_function from __future__ import print_function
from bcc import BPF from bcc import BPF
from time import strftime from time import strftime
import ctypes as ct
# load BPF program # load BPF program
bpf_text = """ bpf_text = """
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
struct str_t {
u64 pid;
char str[80];
};
BPF_PERF_OUTPUT(events);
int printret(struct pt_regs *ctx) { int printret(struct pt_regs *ctx) {
struct str_t data = {};
u32 pid;
if (!ctx->ax) if (!ctx->ax)
return 0; return 0;
pid = bpf_get_current_pid_tgid();
char str[80] = {}; data.pid = pid;
bpf_probe_read(&str, sizeof(str), (void *)ctx->ax); bpf_probe_read(&data.str, sizeof(data.str), (void *)ctx->ax);
bpf_trace_printk("%s\\n", &str); events.perf_submit(ctx,&data,sizeof(data));
return 0; return 0;
}; };
""" """
STR_DATA = 80
class Data(ct.Structure):
_fields_ = [
("pid", ct.c_ulonglong),
("str", ct.c_char * STR_DATA)
]
b = BPF(text=bpf_text) b = BPF(text=bpf_text)
b.attach_uretprobe(name="/bin/bash", sym="readline", fn_name="printret") b.attach_uretprobe(name="/bin/bash", sym="readline", fn_name="printret")
# header # header
print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND")) print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND"))
# format output def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents
print("%-9s %-6d %s" % (strftime("%H:%M:%S"), event.pid, event.str))
b["events"].open_perf_buffer(print_event)
while 1: while 1:
try: b.kprobe_poll()
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
except ValueError:
continue
print("%-9s %-6d %s" % (strftime("%H:%M:%S"), pid, msg))
...@@ -11,9 +11,12 @@ ...@@ -11,9 +11,12 @@
# Licensed under the Apache License, Version 2.0 (the "License") # Licensed under the Apache License, Version 2.0 (the "License")
# #
# 16-Sep-2015 Brendan Gregg Created this. # 16-Sep-2015 Brendan Gregg Created this.
# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
from __future__ import print_function from __future__ import print_function
from bcc import BPF from bcc import BPF
import ctypes as ct
import re
# load BPF program # load BPF program
b = BPF(text=""" b = BPF(text="""
...@@ -25,8 +28,20 @@ struct val_t { ...@@ -25,8 +28,20 @@ struct val_t {
char name[TASK_COMM_LEN]; char name[TASK_COMM_LEN];
}; };
struct data_t {
u32 pid;
u64 rwflag;
u64 delta;
u64 sector;
u64 len;
u64 ts;
char disk_name[DISK_NAME_LEN];
char name[TASK_COMM_LEN];
};
BPF_HASH(start, struct request *); BPF_HASH(start, struct request *);
BPF_HASH(infobyreq, struct request *, struct val_t); BPF_HASH(infobyreq, struct request *, struct val_t);
BPF_PERF_OUTPUT(events);
// cache PID and comm by-req // cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req) int trace_pid_start(struct pt_regs *ctx, struct request *req)
...@@ -37,7 +52,6 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req) ...@@ -37,7 +52,6 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
val.pid = bpf_get_current_pid_tgid(); val.pid = bpf_get_current_pid_tgid();
infobyreq.update(&req, &val); infobyreq.update(&req, &val);
} }
return 0; return 0;
} }
...@@ -58,6 +72,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req) ...@@ -58,6 +72,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
u64 *tsp, delta; u64 *tsp, delta;
u32 *pidp = 0; u32 *pidp = 0;
struct val_t *valp; struct val_t *valp;
struct data_t data ={};
u64 ts;
// fetch timestamp and calculate delta // fetch timestamp and calculate delta
tsp = start.lookup(&req); tsp = start.lookup(&req);
...@@ -65,30 +81,29 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req) ...@@ -65,30 +81,29 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
// missed tracing issue // missed tracing issue
return 0; return 0;
} }
delta = bpf_ktime_get_ns() - *tsp; ts = bpf_ktime_get_ns();
data.delta = ts - *tsp;
data.ts = ts / 1000;
//
// Fetch and output issuing pid and comm.
// As bpf_trace_prink() is limited to a maximum of 1 string and 2
// integers, we'll use more than one to output the data.
//
valp = infobyreq.lookup(&req); valp = infobyreq.lookup(&req);
if (valp == 0) { if (valp == 0) {
bpf_trace_printk("0 0 ? %d\\n", req->__data_len); data.len = req->__data_len;
strcpy(data.name,"?");
} else { } else {
bpf_trace_printk("0 %d %s %d\\n", valp->pid, valp->name, data.pid = valp->pid;
req->__data_len); data.len = req->__data_len;
data.sector = req->__sector;
bpf_probe_read(&data.name, sizeof(data.name), valp->name);
bpf_probe_read(&data.disk_name, sizeof(data.disk_name),
req->rq_disk->disk_name);
} }
// output remaining details
if (req->cmd_flags & REQ_WRITE) { if (req->cmd_flags & REQ_WRITE) {
bpf_trace_printk("1 W %s %d %d ?\\n", req->rq_disk->disk_name, data.rwflag=1;
req->__sector, delta / 1000);
} else { } else {
bpf_trace_printk("1 R %s %d %d ?\\n", req->rq_disk->disk_name, data.rwflag=0;
req->__sector, delta / 1000);
} }
events.perf_submit(ctx,&data,sizeof(data));
start.delete(&req); start.delete(&req);
infobyreq.delete(&req); infobyreq.delete(&req);
...@@ -101,29 +116,61 @@ b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") ...@@ -101,29 +116,61 @@ b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion", b.attach_kprobe(event="blk_account_io_completion",
fn_name="trace_req_completion") fn_name="trace_req_completion")
TASK_COMM_LEN = 16 # linux/sched.h
DISK_NAME_LEN = 32 # linux/genhd.h
class Data(ct.Structure):
_fields_ = [
("pid", ct.c_ulonglong),
("rwflag", ct.c_ulonglong),
("delta", ct.c_ulonglong),
("sector", ct.c_ulonglong),
("len", ct.c_ulonglong),
("ts", ct.c_ulonglong),
("disk_name", ct.c_char * DISK_NAME_LEN),
("name", ct.c_char * TASK_COMM_LEN)
]
# header # header
print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID", print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
"DISK", "T", "SECTOR", "BYTES", "LAT(ms)")) "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
rwflg = ""
start_ts = 0 start_ts = 0
prev_ts = 0
delta = 0
# format output # process event
while 1: def print_event(cpu, data, size):
(task, pid, cpu, flags, ts, msg) = b.trace_fields() event = ct.cast(data, ct.POINTER(Data)).contents
args = msg.split(" ")
if start_ts == 0: val = -1
start_ts = ts global start_ts
global prev_ts
global delta
if event.rwflag == 1:
rwflg = "W"
if event.rwflag == 0:
rwflg = "R"
if not re.match('\?', event.name):
val = event.sector
if args[0] == "0": if start_ts == 0:
(real_pid, real_comm, bytes_s) = (args[1], args[2], args[3]) prev_ts = start_ts
continue
else:
(type_s, disk_s, sector_s, us_s) = (args[1], args[2], args[3],
args[4])
ms = float(int(us_s, 10)) / 1000 if start_ts == 1:
delta = float(delta) + (event.ts - prev_ts)
print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % ( print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
ts - start_ts, real_comm, real_pid, disk_s, type_s, sector_s, delta / 1000000, event.name, event.pid, event.disk_name, rwflg, val,
bytes_s, ms)) event.len, float(event.delta) / 1000000))
prev_ts = event.ts
start_ts = 1
# loop with callback to print_event
b["events"].open_perf_buffer(print_event)
while 1:
b.kprobe_poll()
#!/usr/bin/python
#
# bashreadline Print entered bash commands from all running shells.
# For Linux, uses BCC, eBPF. Embedded C.
#
# This works by tracing the readline() function using a uretprobe (uprobes).
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 28-Jan-2016 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
from time import strftime
# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
int printret(struct pt_regs *ctx) {
if (!ctx->ax)
return 0;
char str[80] = {};
bpf_probe_read(&str, sizeof(str), (void *)ctx->ax);
bpf_trace_printk("%s\\n", &str);
return 0;
};
"""
b = BPF(text=bpf_text)
b.attach_uretprobe(name="/bin/bash", sym="readline", fn_name="printret")
# header
print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND"))
# format output
while 1:
try:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
except ValueError:
continue
print("%-9s %-6d %s" % (strftime("%H:%M:%S"), pid, msg))
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# biosnoop Trace block device I/O and print details including issuing PID.
# For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 16-Sep-2015 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
# load BPF program
b = BPF(text="""
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
struct val_t {
u32 pid;
char name[TASK_COMM_LEN];
};
BPF_HASH(start, struct request *);
BPF_HASH(infobyreq, struct request *, struct val_t);
// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
struct val_t val = {};
if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
val.pid = bpf_get_current_pid_tgid();
infobyreq.update(&req, &val);
}
return 0;
}
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
u64 ts;
ts = bpf_ktime_get_ns();
start.update(&req, &ts);
return 0;
}
// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
u64 *tsp, delta;
u32 *pidp = 0;
struct val_t *valp;
// fetch timestamp and calculate delta
tsp = start.lookup(&req);
if (tsp == 0) {
// missed tracing issue
return 0;
}
delta = bpf_ktime_get_ns() - *tsp;
//
// Fetch and output issuing pid and comm.
// As bpf_trace_prink() is limited to a maximum of 1 string and 2
// integers, we'll use more than one to output the data.
//
valp = infobyreq.lookup(&req);
if (valp == 0) {
bpf_trace_printk("0 0 ? %d\\n", req->__data_len);
} else {
bpf_trace_printk("0 %d %s %d\\n", valp->pid, valp->name,
req->__data_len);
}
// output remaining details
if (req->cmd_flags & REQ_WRITE) {
bpf_trace_printk("1 W %s %d %d ?\\n", req->rq_disk->disk_name,
req->__sector, delta / 1000);
} else {
bpf_trace_printk("1 R %s %d %d ?\\n", req->rq_disk->disk_name,
req->__sector, delta / 1000);
}
start.delete(&req);
infobyreq.delete(&req);
return 0;
}
""", debug=0)
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
fn_name="trace_req_completion")
# header
print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
"DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
start_ts = 0
# format output
while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
args = msg.split(" ")
if start_ts == 0:
start_ts = ts
if args[0] == "0":
(real_pid, real_comm, bytes_s) = (args[1], args[2], args[3])
continue
else:
(type_s, disk_s, sector_s, us_s) = (args[1], args[2], args[3],
args[4])
ms = float(int(us_s, 10)) / 1000
print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
ts - start_ts, real_comm, real_pid, disk_s, type_s, sector_s,
bytes_s, ms))
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment