Commit 79b6f337 authored by Brenden Blanco's avatar Brenden Blanco

Merge pull request #222 from brendangregg/master

biosnoop and disk updates
parents f29cced9 a080b3e3
......@@ -38,7 +38,7 @@ static unsigned int log2l(unsigned long v)
return log2(v) + 1;
}
int kprobe__blk_start_request(struct pt_regs *ctx, struct request *req)
int kprobe__blk_account_io_completion(struct pt_regs *ctx, struct request *req)
{
int index = log2l(req->__data_len / 1024);
u64 *leaf = dist.lookup(&index);
......
......@@ -15,14 +15,14 @@
BPF_HASH(start, struct request *);
void kprobe__blk_start_request(struct pt_regs *ctx, struct request *req) {
void trace_start(struct pt_regs *ctx, struct request *req) {
// stash start timestamp by request ptr
u64 ts = bpf_ktime_get_ns();
start.update(&req, &ts);
}
void kprobe__blk_update_request(struct pt_regs *ctx, struct request *req) {
void trace_completion(struct pt_regs *ctx, struct request *req) {
u64 *tsp, delta;
tsp = start.lookup(&req);
......
......@@ -17,6 +17,9 @@ REQ_WRITE = 1 # from include/linux/blk_types.h
# load BPF program
b = BPF(src_file="disksnoop.c")
b.attach_kprobe(event="blk_start_request", fn_name="trace_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_start")
b.attach_kprobe(event="blk_account_io_completion", fn_name="trace_completion")
# header
print("%-18s %-2s %-7s %8s" % ("TIME(s)", "T", "BYTES", "LAT(ms)"))
......
.TH biosnoop 8 "2015-09-16" "USER COMMANDS"
.SH NAME
biosnoop \- Trace block device I/O and print details incl. issuing PID.
.SH SYNOPSIS
.B biosnoop
.SH DESCRIPTION
This tools traces block device I/O (disk I/O), and prints a one-line summary
for each I/O showing various details. These include the latency from the time of
issue to the device to its completion, and the PID and process name from when
the I/O was first created (which usually identifies the responsible process).
This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
request, as well as a starting timestamp for calculating I/O latency.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH EXAMPLES
.TP
Trace all block device I/O and print a summary line per I/O:
#
.B biosnoop
.SH FIELDS
.TP
TIME(s)
Time of the I/O, in seconds since the first I/O was seen.
.TP
COMM
Cached process name, if present. This usually (but isn't guaranteed) to identify
the responsible process for the I/O.
.TP
PID
Cached process ID, if present. This usually (but isn't guaranteed) to identify
the responsible process for the I/O.
.TP
DISK
Disk device name.
.TP
T
Type of I/O: R = read, W = write. This is a simplification.
.TP
SECTOR
Device sector for the I/O.
.TP
BYTES
Size of the I/O, in bytes.
.TP
LAT(ms)
Time for the I/O (latency) from the issue to the device, to its completion,
in milliseconds.
.SH OVERHEAD
Since block device I/O usually has a relatively low frequency (< 10,000/s),
the overhead for this tool is expected to be negligible. For high IOPS storage
systems, test and quantify before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
disksnoop(8), iostat(1)
#!/usr/bin/python
#
# biosnoop Trace block device I/O and print details including issuing PID.
# For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 16-Sep-2015 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
# load BPF program
b = BPF(text="""
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
struct val_t {
char name[TASK_COMM_LEN];
};
BPF_HASH(start, struct request *);
BPF_HASH(pidbyreq, struct request *, u32);
BPF_HASH(commbyreq, struct request *, struct val_t);
// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
u32 pid;
struct val_t val = {};
pid = bpf_get_current_pid_tgid();
pidbyreq.update(&req, &pid);
if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
commbyreq.update(&req, &val);
}
return 0;
}
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
u64 ts;
ts = bpf_ktime_get_ns();
start.update(&req, &ts);
return 0;
}
// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
u64 *tsp, delta;
u32 *pidp = 0;
struct val_t *valp;
// fetch timestamp and calculate delta
tsp = start.lookup(&req);
if (tsp == 0) {
// missed tracing issue
return 0;
}
delta = bpf_ktime_get_ns() - *tsp;
//
// Fetch and output issuing pid and comm.
// As bpf_trace_prink() is limited to a maximum of 1 string and 2
// integers, we'll use more than one to output the data.
//
valp = commbyreq.lookup(&req);
pidp = pidbyreq.lookup(&req);
if (pidp == 0 || valp == 0) {
bpf_trace_printk("0 0 ? %d\\n", req->__data_len);
} else {
bpf_trace_printk("0 %d %s %d\\n", *pidp, valp->name,
req->__data_len);
}
// output remaining details
if (req->cmd_flags & REQ_WRITE) {
bpf_trace_printk("1 W %s %d %d ?\\n", req->rq_disk->disk_name,
req->__sector, delta / 1000);
} else {
bpf_trace_printk("1 R %s %d %d ?\\n", req->rq_disk->disk_name,
req->__sector, delta / 1000);
}
start.delete(&req);
pidbyreq.delete(&req);
commbyreq.delete(&req);
return 0;
}
""")
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
fn_name="trace_req_completion")
# header
print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
"DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
start_ts = 0
# format output
while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
args = msg.split(" ")
if start_ts == 0:
start_ts = ts
if args[0] == "0":
(real_pid, real_comm, bytes_s) = (args[1], args[2], args[3])
continue
else:
(type_s, disk_s, sector_s, us_s) = (args[1], args[2], args[3],
args[4])
ms = float(int(us_s, 10)) / 1000
print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
ts - start_ts, real_comm, real_pid, disk_s, type_s, sector_s,
bytes_s, ms))
Demonstrations of biosnoop, the Linux eBPF/bcc version.
biosnoop traces block device I/O (disk I/O), and prints a line of output
per I/O. Example:
# ./biosnoop
TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
0.000004001 supervise 1950 xvda1 W 13092560 4096 0.74
0.000178002 supervise 1950 xvda1 W 13092432 4096 0.61
0.001469001 supervise 1956 xvda1 W 13092440 4096 1.24
0.001588002 supervise 1956 xvda1 W 13115128 4096 1.09
1.022346001 supervise 1950 xvda1 W 13115272 4096 0.98
1.022568002 supervise 1950 xvda1 W 13188496 4096 0.93
1.023534000 supervise 1956 xvda1 W 13188520 4096 0.79
1.023585003 supervise 1956 xvda1 W 13189512 4096 0.60
2.003920000 xfsaild/md0 456 xvdc W 62901512 8192 0.23
2.003931001 xfsaild/md0 456 xvdb W 62901513 512 0.25
2.004034001 xfsaild/md0 456 xvdb W 62901520 8192 0.35
2.004042000 xfsaild/md0 456 xvdb W 63542016 4096 0.36
2.004204001 kworker/0:3 26040 xvdb W 41950344 65536 0.34
2.044352002 supervise 1950 xvda1 W 13192672 4096 0.65
2.044574000 supervise 1950 xvda1 W 13189072 4096 0.58
This includes the PID and comm (process name) that were on-CPU at the time of
issue (which usually means the process responsible).
The latency of the disk I/O, measured from the issue to the device to its
completion, is included as the last column.
This example output is from what should be an idle system, however, the
following is visible in iostat:
$ iostat -x 1
[...]
avg-cpu: %user %nice %system %iowait %steal %idle
0.12 0.00 0.12 0.00 0.00 99.75
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
xvda 0.00 0.00 0.00 4.00 0.00 16.00 8.00 0.00 0.00 0.00 0.00 0.00 0.00
xvdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
xvdc 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
There are 4 write IOPS.
The output of biosnoop identifies the reason: multiple supervise processes are
issuing writes to the xvda1 disk. I can now drill down on supervise using other
tools to understand its file system workload.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment