Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
bcc
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
bcc
Commits
5be5194f
Commit
5be5194f
authored
Sep 16, 2015
by
Brendan Gregg
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
biosnoop for block device I/O
parent
f29cced9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
251 additions
and
0 deletions
+251
-0
man/man8/biosnoop.8
man/man8/biosnoop.8
+69
-0
tools/biosnoop
tools/biosnoop
+133
-0
tools/biosnoop_example.txt
tools/biosnoop_example.txt
+49
-0
No files found.
man/man8/biosnoop.8
0 → 100644
View file @
5be5194f
.TH biosnoop 8 "2015-09-16" "USER COMMANDS"
.SH NAME
biosnoop \- Trace block device I/O and print details incl. issuing PID.
.SH SYNOPSIS
.B biosnoop
.SH DESCRIPTION
This tools traces block device I/O (disk I/O), and prints a one-line summary
for each I/O showing various details. These include the latency from the time of
issue to the device to its completion, and the PID and process name from when
the I/O was first created (which usually identifies the responsible process).
This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
request, as well as a starting timestamp for calculating I/O latency.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH EXAMPLES
.TP
Trace all block device I/O and print a summary line per I/O:
#
.B biosnoop
.SH FIELDS
.TP
TIME(s)
Time of the I/O, in seconds since the first I/O was seen.
.TP
COMM
Cached process name, if present. This usually (but isn't guaranteed) to identify
the responsible process for the I/O.
.TP
PID
Cached process ID, if present. This usually (but isn't guaranteed) to identify
the responsible process for the I/O.
.TP
DISK
Disk device name.
.TP
T
Type of I/O: R = read, W = write. This is a simplification.
.TP
SECTOR
Device sector for the I/O.
.TP
BYTES
Size of the I/O, in bytes.
.TP
LAT(ms)
Time for the I/O (latency) from the issue to the device, to its completion,
in milliseconds.
.SH OVERHEAD
Since block device I/O usually has a relatively low frequency (< 10,000/s),
the overhead for this tool is expected to be negligible. For high IOPS storage
systems, test and quantify before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
disksnoop(8), iostat(1)
tools/biosnoop
0 → 100755
View file @
5be5194f
#!/usr/bin/python
#
# biosnoop Trace block device I/O and print details including issuing PID.
# For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 16-Sep-2015 Brendan Gregg Created this.
from
__future__
import
print_function
from
bcc
import
BPF
# load BPF program
b
=
BPF
(
text
=
"""
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
struct val_t {
char name[TASK_COMM_LEN];
};
BPF_HASH(start, struct request *);
BPF_HASH(pidbyreq, struct request *, u32);
BPF_HASH(commbyreq, struct request *, struct val_t);
// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
u32 pid;
struct val_t val = {};
pid = bpf_get_current_pid_tgid();
pidbyreq.update(&req, &pid);
if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
commbyreq.update(&req, &val);
}
return 0;
}
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
u64 ts;
ts = bpf_ktime_get_ns();
start.update(&req, &ts);
return 0;
}
// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
u64 *tsp, delta;
u32 *pidp = 0;
struct val_t *valp;
// fetch timestamp and calculate delta
tsp = start.lookup(&req);
if (tsp == 0) {
// missed tracing issue
return 0;
}
delta = bpf_ktime_get_ns() - *tsp;
//
// Fetch and output issuing pid and comm.
// As bpf_trace_prink() is limited to a maximum of 1 string and 2
// integers, we'll use more than one to output the data.
//
valp = commbyreq.lookup(&req);
pidp = pidbyreq.lookup(&req);
if (pidp == 0 || valp == 0) {
bpf_trace_printk("0 0 ? %d
\
\
n", req->__data_len);
} else {
bpf_trace_printk("0 %d %s %d
\
\
n", *pidp, valp->name,
req->__data_len);
}
// output remaining details
if (req->cmd_flags & REQ_WRITE) {
bpf_trace_printk("1 W %s %d %d ?
\
\
n", req->rq_disk->disk_name,
req->__sector, delta / 1000);
} else {
bpf_trace_printk("1 R %s %d %d ?
\
\
n", req->rq_disk->disk_name,
req->__sector, delta / 1000);
}
start.delete(&req);
pidbyreq.delete(&req);
commbyreq.delete(&req);
return 0;
}
"""
)
# blk_account_io_start is chosen to be as
b
.
attach_kprobe
(
event
=
"blk_account_io_start"
,
fn_name
=
"trace_pid_start"
)
b
.
attach_kprobe
(
event
=
"blk_start_request"
,
fn_name
=
"trace_req_start"
)
b
.
attach_kprobe
(
event
=
"blk_mq_start_request"
,
fn_name
=
"trace_req_start"
)
b
.
attach_kprobe
(
event
=
"blk_account_io_completion"
,
fn_name
=
"trace_req_completion"
)
# header
print
(
"%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s"
%
(
"TIME(s)"
,
"COMM"
,
"PID"
,
"DISK"
,
"T"
,
"SECTOR"
,
"BYTES"
,
"LAT(ms)"
))
start_ts
=
0
# format output
while
1
:
(
task
,
pid
,
cpu
,
flags
,
ts
,
msg
)
=
b
.
trace_fields
()
args
=
msg
.
split
(
" "
)
if
start_ts
==
0
:
start_ts
=
ts
if
args
[
0
]
==
"0"
:
(
real_pid
,
real_comm
,
bytes_s
)
=
(
args
[
1
],
args
[
2
],
args
[
3
])
continue
else
:
(
type_s
,
disk_s
,
sector_s
,
us_s
)
=
(
args
[
1
],
args
[
2
],
args
[
3
],
args
[
4
])
ms
=
float
(
int
(
us_s
,
10
))
/
1000
print
(
"%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f"
%
(
ts
-
start_ts
,
real_comm
,
real_pid
,
disk_s
,
type_s
,
sector_s
,
bytes_s
,
ms
))
tools/biosnoop_example.txt
0 → 100644
View file @
5be5194f
Demonstrations of biosnoop, the Linux eBPF/bcc version.
biosnoop traces block device I/O (disk I/O), and prints a line of output
per I/O. Example:
# ./biosnoop
TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms)
0.000004001 supervise 1950 xvda1 W 13092560 4096 0.74
0.000178002 supervise 1950 xvda1 W 13092432 4096 0.61
0.001469001 supervise 1956 xvda1 W 13092440 4096 1.24
0.001588002 supervise 1956 xvda1 W 13115128 4096 1.09
1.022346001 supervise 1950 xvda1 W 13115272 4096 0.98
1.022568002 supervise 1950 xvda1 W 13188496 4096 0.93
1.023534000 supervise 1956 xvda1 W 13188520 4096 0.79
1.023585003 supervise 1956 xvda1 W 13189512 4096 0.60
2.003920000 xfsaild/md0 456 xvdc W 62901512 8192 0.23
2.003931001 xfsaild/md0 456 xvdb W 62901513 512 0.25
2.004034001 xfsaild/md0 456 xvdb W 62901520 8192 0.35
2.004042000 xfsaild/md0 456 xvdb W 63542016 4096 0.36
2.004204001 kworker/0:3 26040 xvdb W 41950344 65536 0.34
2.044352002 supervise 1950 xvda1 W 13192672 4096 0.65
2.044574000 supervise 1950 xvda1 W 13189072 4096 0.58
This includes the PID and comm (process name) that were on-CPU at the time of
issue (which usually means the process responsible).
The latency of the disk I/O, measured from the issue to the device to its
completion, is included as the last column.
This example output is from what should be an idle system, however, the
following is visible in iostat:
$ iostat -x 1
[...]
avg-cpu: %user %nice %system %iowait %steal %idle
0.12 0.00 0.12 0.00 0.00 99.75
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
xvda 0.00 0.00 0.00 4.00 0.00 16.00 8.00 0.00 0.00 0.00 0.00 0.00 0.00
xvdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
xvdc 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
There are 4 write IOPS.
The output of biosnoop identifies the reason: multiple supervise processes are
issuing writes to the xvda1 disk. I can now drill down on supervise using other
tools to understand its file system workload.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment