Commit 15d1f3b4 authored by Brenden Blanco's avatar Brenden Blanco

Merge pull request #241 from brendangregg/master

biolatency, funclatency, and bpf_log2l usage
parents 7a4782a7 6583fea0
......@@ -64,8 +64,10 @@ Examples:
Tools:
- tools/[biolatency](tools/biolatency): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
- tools/[biosnoop](tools/biosnoop): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
- tools/[funccount](tools/funccount): Count kernel function calls. [Examples](tools/funccount_example.txt).
- tools/[funclatency](tools/funclatency): Time kernel functions and show their latency distribution. [Examples](tools/funclatency_example.txt).
- tools/[killsnoop](tools/killsnoop): Trace signals issued by the kill() syscall. [Examples](tools/killsnoop_example.txt).
- tools/[opensnoop](tools/opensnoop): Trace open() syscalls. [Examples](tools/opensnoop_example.txt).
- tools/[pidpersec](tools/pidpersec): Count new processes (via fork). [Examples](tools/pidpersec_example.txt).
......
......@@ -2,7 +2,6 @@
* bitehist.c Block I/O size histogram.
* For Linux, uses BCC, eBPF. See .py file.
*
* Based on eBPF sample tracex2 by Alexi Starovoitov.
* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
......@@ -16,31 +15,9 @@
BPF_TABLE("array", int, u64, dist, 64);
static unsigned int log2(unsigned int v)
{
unsigned int r;
unsigned int shift;
r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
shift = (v > 0xF) << 2; v >>= shift; r |= shift;
shift = (v > 0x3) << 1; v >>= shift; r |= shift;
r |= (v >> 1);
return r;
}
static unsigned int log2l(unsigned long v)
{
unsigned int hi = v >> 32;
if (hi)
return log2(hi) + 32 + 1;
else
return log2(v) + 1;
}
int kprobe__blk_account_io_completion(struct pt_regs *ctx, struct request *req)
{
int index = log2l(req->__data_len / 1024);
int index = bpf_log2l(req->__data_len / 1024);
u64 *leaf = dist.lookup(&index);
if (leaf) (*leaf)++;
......
......@@ -2,7 +2,6 @@
* vfsreadlat.c VFS read latency distribution.
* For Linux, uses BCC, eBPF. See .py file.
*
* Based on eBPF sample tracex2 by Alexi Starovoitov.
* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
......@@ -16,28 +15,6 @@
BPF_HASH(start, u32);
BPF_TABLE("array", int, u64, dist, 64);
static unsigned int log2(unsigned int v)
{
unsigned int r;
unsigned int shift;
r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
shift = (v > 0xF) << 2; v >>= shift; r |= shift;
shift = (v > 0x3) << 1; v >>= shift; r |= shift;
r |= (v >> 1);
return r;
}
static unsigned int log2l(unsigned long v)
{
unsigned int hi = v >> 32;
if (hi)
return log2(hi) + 32 + 1;
else
return log2(v) + 1;
}
int do_entry(struct pt_regs *ctx)
{
u32 pid;
......@@ -59,7 +36,7 @@ int do_return(struct pt_regs *ctx)
if (tsp != 0) {
delta = bpf_ktime_get_ns() - *tsp;
int index = log2l(delta / 1000);
int index = bpf_log2l(delta / 1000);
u64 *leaf = dist.lookup(&index);
if (leaf) (*leaf)++;
start.delete(&pid);
......
.TH biolatency 8 "2015-08-20" "USER COMMANDS"
.SH NAME
biolatency \- Summarize block device I/O latency as a histogram.
.SH SYNOPSIS
.B biolatency [\-h] [\-T] [\-Q] [\-m] [interval [count]]
.SH DESCRIPTION
biolatency traces block device I/O (disk I/O), and records the distribution
of I/O latency (time). This is printed as a histogram either on Ctrl-C, or
after a given interval in seconds.
The latency of the disk I/O is measured from the issue to the device to its
completion. A \-Q option can be used to include time queued in the kernel.
This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
for efficiency.
This works by tracing various kernel blk_*() functions using dynamic tracing,
and will need updating to match any changes to these functions.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
\-h
Print usage message.
.TP
\-T
Include timestamps on output.
.TP
\-m
Output histogram in milliseconds.
.TP
interval
Output interval, in seconds.
.TP
count
Number of outputs.
.SH EXAMPLES
.TP
Summarize block device I/O latency as a histogram:
#
.B biolatency
.TP
Print 1 second summaries, 10 times:
#
.B biolatency 1 10
.TP
Print 1 second summaries, using milliseconds as units for the histogram, and
include timestamps on output:
#
.B biolatency \-mT 1
.TP
Include OS queued time in I/O time:
#
.B biolatency \-Q
.SH FIELDS
.TP
usecs
Microsecond range
.TP
mecs
Millisecond range
.TP
count
How many I/O fell into this range
.TP
distribution
An ASCII bar chart to visualize the distribution (count column)
.SH OVERHEAD
This traces kernel functions and maintains in-kernel timestamps and a histgroam,
which are asynchronously copied to user-space. This method is very efficient,
and the overhead for most storage I/O rates (< 10k IOPS) should be negligible.
If you have a higher IOPS storage environment, test and quantify the overhead
before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
biosnoop(8)
.TH funclatency 8 "2015-08-18" "USER COMMANDS"
.SH NAME
funclatency \- Time kernel funcitons and print latency as a histogram.
.SH SYNOPSIS
.B funclatency [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-u] [\-m] [\-r] pattern
.SH DESCRIPTION
This tool traces kernel function calls and times their duration (latency), and
shows the latency distribution as a histogram. The time is measured from when
the function is called to when it returns, and is inclusive of both on-CPU
time and time spent blocked.
This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
for efficiency.
WARNING: This uses dynamic tracing of (what can be many) kernel functions, an
activity that has had issues on some kernel versions (risk of panics or
freezes). Test, and know what you are doing, before use.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
pattern
Function name or search pattern. Supports "*" wildcards. See EXAMPLES.
You can also use \-r for regular expressions.
\-h
Print usage message.
.TP
\-p PID
Trace this process ID only.
.TP
\-i INTERVAL
Print output every interval seconds.
.TP
\-T
Include timestamps on output.
.TP
\-u
Output histogram in microseconds.
.TP
\-m
Output histogram in milliseconds.
.TP
\-r
Use regular expressions for the search pattern.
.SH EXAMPLES
.TP
Time the do_sys_open() kernel function, and print the distribution as a histogram:
#
.B funclatency do_sys_open
.TP
Time vfs_read(), and print the histogram in units of microseconds:
#
.B funclatency \-u vfs_read
.TP
Time do_nanosleep(), and print the histogram in units of milliseconds:
#
.B funclatency \-m do_nanosleep
.TP
Time vfs_read(), and print output every 5 seconds, with timestamps:
#
.B funclatency \-mTi 5 vfs_read
.TP
Time vfs_read() for process ID 181 only:
#
.B funclatency \-p 181 vfs_read:
.TP
Time both vfs_fstat() and vfs_fstatat() calls, by use of a wildcard:
#
.B funclatency 'vfs_fstat*'
.SH FIELDS
.TP
necs
Nanosecond range
.TP
usecs
Microsecond range
.TP
mecs
Millisecond range
.TP
count
How many calls fell into this range
.TP
distribution
An ASCII bar chart to visualize the distribution (count column)
.SH OVERHEAD
This traces kernel functions and maintains in-kernel timestamps and a histgroam,
which are asynchronously copied to user-space. While this method is very
efficient, the rate of kernel functions can also be very high (>1M/sec), at
which point the overhead is expected to be measurable. Measure in a test
environment and understand overheads before use. You can also use funccount
to measure the rate of kernel functions over a short duration, to set some
expectations before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
funccount(8)
#!/usr/bin/python
#
# biolatency Summarize block device I/O latency as a histogram.
# For Linux, uses BCC, eBPF.
#
# USAGE: biolatency [-h] [-T] [-Q] [-m] [interval] [count]
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Sep-2015 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
# arguments
examples = """examples:
./biolatency # summarize block I/O latency as a histogram
./biolatency 1 10 # print 1 second summaries, 10 times
./biolatency -mT 1 # 1s summaries, milliseconds, and timestamps
./biolatency -Q # include OS queued time in I/O time
"""
parser = argparse.ArgumentParser(
description="Summarize block device I/O latency as a histogram",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("-Q", "--queued", action="store_true",
help="include OS queued time in I/O time")
parser.add_argument("-m", "--milliseconds", action="store_true",
help="millisecond histogram")
parser.add_argument("interval", nargs="?", default=99999999,
help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
help="number of outputs")
args = parser.parse_args()
countdown = int(args.count)
debug = 0
# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
BPF_TABLE(\"array\", int, u64, dist, 64);
BPF_HASH(start, struct request *);
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
u64 ts = bpf_ktime_get_ns();
start.update(&req, &ts);
return 0;
}
// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
u64 *tsp, delta;
// fetch timestamp and calculate delta
tsp = start.lookup(&req);
if (tsp == 0) {
return 0; // missed issue
}
delta = bpf_ktime_get_ns() - *tsp;
FACTOR
// store as histogram
int index = bpf_log2l(delta);
u64 *leaf = dist.lookup(&index);
if (leaf) (*leaf)++;
start.delete(&req);
return 0;
}
"""
if args.milliseconds:
bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
label = "msecs"
else:
bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;')
label = "usecs"
if debug:
print(bpf_text)
# load BPF program
b = BPF(text=bpf_text)
if args.queued:
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
else:
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
fn_name="trace_req_completion")
print("Tracing block device I/O... Hit Ctrl-C to end.")
# output
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
while (1):
try:
sleep(int(args.interval))
except KeyboardInterrupt:
exiting=1
print()
if args.timestamp:
print("%-8s\n" % strftime("%H:%M:%S"), end="")
dist.print_log2_hist(label)
dist.clear()
countdown -= 1
if exiting or countdown == 0:
exit()
Demonstrations of biolatency, the Linux eBPF/bcc version.
biolatency traces block device I/O (disk I/O), and records the distribution
of I/O latency (time), printing this as a histogram when Ctrl-C is hit.
For example:
# ./biolatency
Tracing block device I/O... Hit Ctrl-C to end.
^C
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 1 | |
128 -> 255 : 12 |******** |
256 -> 511 : 15 |********** |
512 -> 1023 : 43 |******************************* |
1024 -> 2047 : 52 |**************************************|
2048 -> 4095 : 47 |********************************** |
4096 -> 8191 : 52 |**************************************|
8192 -> 16383 : 36 |************************** |
16384 -> 32767 : 15 |********** |
32768 -> 65535 : 2 |* |
65536 -> 131071 : 2 |* |
The latency of the disk I/O is measured from the issue to the device to its
completion. A -Q option can be used to include time queued in the kernel.
This example output shows a large mode of latency from about 128 microseconds
to about 32767 microseconds (33 milliseconds). The bulk of the I/O was
between 1 and 8 ms, which is the expected block device latency for
rotational storage devices.
The highest latency seen while tracing was between 65 and 131 milliseconds:
the last row printed, for which there were 2 I/O.
For efficiency, biolatency uses an in-kernel eBPF map to store timestamps
with requests, and another in-kernel map to store the histogram (the "count")
column, which is copied to user-space only when output is printed. These
methods lower the perormance overhead when tracing is performed.
In the following example, the -m option is used to print a histogram using
milliseconds as the units (which eliminates the first several rows), -T to
print timestamps with the output, and to print 1 second summaries 5 times:
# ./biolatency -mT 1 5
Tracing block device I/O... Hit Ctrl-C to end.
06:20:16
msecs : count distribution
0 -> 1 : 36 |**************************************|
2 -> 3 : 1 |* |
4 -> 7 : 3 |*** |
8 -> 15 : 17 |***************** |
16 -> 31 : 33 |********************************** |
32 -> 63 : 7 |******* |
64 -> 127 : 6 |****** |
06:20:17
msecs : count distribution
0 -> 1 : 96 |************************************ |
2 -> 3 : 25 |********* |
4 -> 7 : 29 |*********** |
8 -> 15 : 62 |*********************** |
16 -> 31 : 100 |**************************************|
32 -> 63 : 62 |*********************** |
64 -> 127 : 18 |****** |
06:20:18
msecs : count distribution
0 -> 1 : 68 |************************* |
2 -> 3 : 76 |**************************** |
4 -> 7 : 20 |******* |
8 -> 15 : 48 |***************** |
16 -> 31 : 103 |**************************************|
32 -> 63 : 49 |****************** |
64 -> 127 : 17 |****** |
06:20:19
msecs : count distribution
0 -> 1 : 522 |*************************************+|
2 -> 3 : 225 |**************** |
4 -> 7 : 38 |** |
8 -> 15 : 8 | |
16 -> 31 : 1 | |
06:20:20
msecs : count distribution
0 -> 1 : 436 |**************************************|
2 -> 3 : 106 |********* |
4 -> 7 : 34 |** |
8 -> 15 : 19 |* |
16 -> 31 : 1 | |
How the I/O latency distribution changes over time can be seen.
The -Q option begins measuring I/O latency from when the request was first
queued in the kernel, and includes queuing latency:
# ./biolatency -Q
Tracing block device I/O... Hit Ctrl-C to end.
^C
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 3 |* |
256 -> 511 : 37 |************** |
512 -> 1023 : 30 |*********** |
1024 -> 2047 : 18 |******* |
2048 -> 4095 : 22 |******** |
4096 -> 8191 : 14 |***** |
8192 -> 16383 : 48 |******************* |
16384 -> 32767 : 96 |**************************************|
32768 -> 65535 : 31 |************ |
65536 -> 131071 : 26 |********** |
131072 -> 262143 : 12 |**** |
This better reflects the latency suffered by the application (if it is
synchronous I/O), whereas the default mode without kernel queueing better
reflects the performance of the device.
Note that the storage device (and storage device controller) usually have
queues of their own, which are always included in the latency, with or
without -Q.
USAGE message:
# ./biolatency -h
usage: biolatency [-h] [-T] [-Q] [-m] [interval] [count]
Summarize block device I/O latency as a histogram
positional arguments:
interval output interval, in seconds
count number of outputs
optional arguments:
-h, --help show this help message and exit
-T, --timestamp include timestamp on output
-Q, --queued include OS queued time in I/O time
-m, --milliseconds millisecond histogram
examples:
./biolatency # summarize block I/O latency as a histogram
./biolatency 1 10 # print 1 second summaries, 10 times
./biolatency -mT 1 # 1s summaries, milliseconds, and timestamps
./biolatency -Q # include OS queued time in I/O time
#!/usr/bin/python
#
# funclatency Time kernel funcitons and print latency as a histogram.
# For Linux, uses BCC, eBPF.
#
# USAGE: funclatency [-h] [-p PID] [-i INTERVAL] [-T] [-u] [-m] [-r] pattern
#
# Run "funclatency -h" for full usage.
#
# The pattern is a string with optional '*' wildcards, similar to file globbing.
# If you'd prefer to use regular expressions, use the -r option. Matching
# multiple functions is of limited use, since the output has one histogram for
# everything. Future versions should split the output histogram by the function.
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Sep-2015 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
import signal
# arguments
examples = """examples:
./funclatency do_sys_open # time the do_sys_open() kenel function
./funclatency -u vfs_read # time vfs_read(), in microseconds
./funclatency -m do_nanosleep # time do_nanosleep(), in milliseconds
./funclatency -mTi 5 vfs_read # output every 5 seconds, with timestamps
./funclatency -p 181 vfs_read # time process 181 only
./funclatency 'vfs_fstat*' # time both vfs_fstat() and vfs_fstatat()
"""
parser = argparse.ArgumentParser(
description="Time kernel funcitons and print latency as a histogram",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-p", "--pid",
help="trace this PID only")
parser.add_argument("-i", "--interval", default=99999999,
help="summary interval, seconds")
parser.add_argument("-T", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("-u", "--microseconds", action="store_true",
help="microsecond histogram")
parser.add_argument("-m", "--milliseconds", action="store_true",
help="millisecond histogram")
parser.add_argument("-r", "--regexp", action="store_true",
help="use regular expressions. Default is \"*\" wildcards only.")
parser.add_argument("pattern",
help="search expression for kernel functions")
args = parser.parse_args()
pattern = args.pattern
if not args.regexp:
pattern = pattern.replace('*', '.*')
pattern = '^' + pattern + '$'
debug = 0
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
BPF_TABLE(\"array\", int, u64, dist, 64);
BPF_HASH(start, u32);
int trace_func_entry(struct pt_regs *ctx)
{
u32 pid = bpf_get_current_pid_tgid();
u64 ts = bpf_ktime_get_ns();
FILTER
start.update(&pid, &ts);
return 0;
}
int trace_func_return(struct pt_regs *ctx)
{
u64 *tsp, delta;
u32 pid = bpf_get_current_pid_tgid();
// calculate delta time
tsp = start.lookup(&pid);
if (tsp == 0) {
return 0; // missed start
}
start.delete(&pid);
delta = bpf_ktime_get_ns() - *tsp;
FACTOR
// store as histogram
int index = bpf_log2l(delta);
u64 *leaf = dist.lookup(&index);
if (leaf) (*leaf)++;
return 0;
}
"""
if args.pid:
bpf_text = bpf_text.replace('FILTER',
'if (pid != %s) { return 0; }' % args.pid)
else:
bpf_text = bpf_text.replace('FILTER', '')
if args.milliseconds:
bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
label = "msecs"
elif args.microseconds:
bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;')
label = "usecs"
else:
bpf_text = bpf_text.replace('FACTOR', '')
label = "nsecs"
if debug:
print(bpf_text)
# signal handler
def signal_ignore(signal, frame):
print()
# load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry")
b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return")
# header
print("Tracing %s... Hit Ctrl-C to end." % args.pattern)
# output
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
while (1):
try:
sleep(int(args.interval))
except KeyboardInterrupt:
exiting=1
# as cleanup can take many seconds, trap Ctrl-C:
signal.signal(signal.SIGINT, signal_ignore)
print()
if args.timestamp:
print("%-8s\n" % strftime("%H:%M:%S"), end="")
dist.print_log2_hist(label)
dist.clear()
if exiting:
print("Detaching...")
exit()
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment