Commit 74347318 authored by Mark Drayton's avatar Mark Drayton

fileslower/filetop: use de->d_name.name, add filtering

parent 4ebb7cf1
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
.SH NAME .SH NAME
fileslower \- Trace slow synchronous file reads and writes. fileslower \- Trace slow synchronous file reads and writes.
.SH SYNOPSIS .SH SYNOPSIS
.B fileslower [\-h] [\-p PID] [min_ms] .B fileslower [\-h] [\-p PID] [-a] [min_ms]
.SH DESCRIPTION .SH DESCRIPTION
This script uses kernel dynamic tracing of synchronous reads and writes This script uses kernel dynamic tracing of synchronous reads and writes
at the VFS interface, to identify slow file reads and writes for any file at the VFS interface, to identify slow file reads and writes for any file
...@@ -30,6 +30,9 @@ CONFIG_BPF and bcc. ...@@ -30,6 +30,9 @@ CONFIG_BPF and bcc.
\-p PID \-p PID
Trace this PID only. Trace this PID only.
.TP .TP
\-a
Include non-regular file types in output (sockets, FIFOs, etc).
.TP
min_ms min_ms
Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms. Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
.SH EXAMPLES .SH EXAMPLES
......
...@@ -8,7 +8,8 @@ This is top for files. ...@@ -8,7 +8,8 @@ This is top for files.
This traces file reads and writes, and prints a per-file summary every This traces file reads and writes, and prints a per-file summary every
interval (by default, 1 second). The summary is sorted on the highest read interval (by default, 1 second). The summary is sorted on the highest read
throughput (Kbytes). throughput (Kbytes). By default only IO on regular files is shown. The -a
option will list all file types (sokets, FIFOs, etc).
This uses in-kernel eBPF maps to store per process summaries for efficiency. This uses in-kernel eBPF maps to store per process summaries for efficiency.
...@@ -29,6 +30,9 @@ Since this uses BPF, only the root user can use this tool. ...@@ -29,6 +30,9 @@ Since this uses BPF, only the root user can use this tool.
CONFIG_BPF and bcc. CONFIG_BPF and bcc.
.SH OPTIONS .SH OPTIONS
.TP .TP
\-a
Include non-regular file types (sockets, FIFOs, etc).
.TP
\-C \-C
Don't clear the screen. Don't clear the screen.
.TP .TP
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
# fileslower Trace slow synchronous file reads and writes. # fileslower Trace slow synchronous file reads and writes.
# For Linux, uses BCC, eBPF. # For Linux, uses BCC, eBPF.
# #
# USAGE: fileslower [-h] [-p PID] [min_ms] # USAGE: fileslower [-h] [-p PID] [-a] [min_ms]
# #
# This script uses kernel dynamic tracing of synchronous reads and writes # This script uses kernel dynamic tracing of synchronous reads and writes
# at the VFS interface, to identify slow file reads and writes for any file # at the VFS interface, to identify slow file reads and writes for any file
...@@ -32,7 +32,6 @@ from __future__ import print_function ...@@ -32,7 +32,6 @@ from __future__ import print_function
from bcc import BPF from bcc import BPF
import argparse import argparse
import ctypes as ct import ctypes as ct
import signal
import time import time
# arguments # arguments
...@@ -45,19 +44,17 @@ parser = argparse.ArgumentParser( ...@@ -45,19 +44,17 @@ parser = argparse.ArgumentParser(
description="Trace slow synchronous file reads and writes", description="Trace slow synchronous file reads and writes",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples) epilog=examples)
parser.add_argument("-p", "--pid", parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
help="trace this PID only") help="trace this PID only")
parser.add_argument("-a", "--all-files", action="store_true",
help="include non-regular file types (sockets, FIFOs, etc)")
parser.add_argument("min_ms", nargs="?", default='10', parser.add_argument("min_ms", nargs="?", default='10',
help="minimum I/O duration to trace, in ms (default 10)") help="minimum I/O duration to trace, in ms (default 10)")
args = parser.parse_args() args = parser.parse_args()
min_ms = int(args.min_ms) min_ms = int(args.min_ms)
pid = args.pid tgid = args.tgid
debug = 0 debug = 0
# signal handler
def signal_ignore(signal, frame):
print()
# define BPF program # define BPF program
bpf_text = """ bpf_text = """
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
...@@ -72,6 +69,8 @@ enum trace_mode { ...@@ -72,6 +69,8 @@ enum trace_mode {
struct val_t { struct val_t {
u32 sz; u32 sz;
u64 ts; u64 ts;
u32 name_len;
// de->d_name.name may point to de->d_iname so limit len accordingly
char name[DNAME_INLINE_LEN]; char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN]; char comm[TASK_COMM_LEN];
}; };
...@@ -81,6 +80,7 @@ struct data_t { ...@@ -81,6 +80,7 @@ struct data_t {
u32 pid; u32 pid;
u32 sz; u32 sz;
u64 delta_us; u64 delta_us;
u32 name_len;
char name[DNAME_INLINE_LEN]; char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN]; char comm[TASK_COMM_LEN];
}; };
...@@ -92,22 +92,25 @@ BPF_PERF_OUTPUT(events); ...@@ -92,22 +92,25 @@ BPF_PERF_OUTPUT(events);
static int trace_rw_entry(struct pt_regs *ctx, struct file *file, static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
char __user *buf, size_t count) char __user *buf, size_t count)
{ {
u32 pid; u32 tgid = bpf_get_current_pid_tgid() >> 32;
if (TGID_FILTER)
pid = bpf_get_current_pid_tgid();
if (FILTER)
return 0; return 0;
u32 pid = bpf_get_current_pid_tgid();
// skip I/O lacking a filename // skip I/O lacking a filename
struct dentry *de = file->f_path.dentry; struct dentry *de = file->f_path.dentry;
if (de->d_iname[0] == 0) int mode = file->f_inode->i_mode;
if (de->d_name.len == 0 || TYPE_FILTER)
return 0; return 0;
// store size and timestamp by pid // store size and timestamp by pid
struct val_t val = {}; struct val_t val = {};
val.sz = count; val.sz = count;
val.ts = bpf_ktime_get_ns(); val.ts = bpf_ktime_get_ns();
__builtin_memcpy(&val.name, de->d_iname, sizeof(val.name));
val.name_len = de->d_name.len;
bpf_probe_read(&val.name, sizeof(val.name), (void *)de->d_name.name);
bpf_get_current_comm(&val.comm, sizeof(val.comm)); bpf_get_current_comm(&val.comm, sizeof(val.comm));
entryinfo.update(&pid, &val); entryinfo.update(&pid, &val);
...@@ -153,6 +156,7 @@ static int trace_rw_return(struct pt_regs *ctx, int type) ...@@ -153,6 +156,7 @@ static int trace_rw_return(struct pt_regs *ctx, int type)
data.pid = pid; data.pid = pid;
data.sz = valp->sz; data.sz = valp->sz;
data.delta_us = delta_us; data.delta_us = delta_us;
data.name_len = valp->name_len;
bpf_probe_read(&data.name, sizeof(data.name), valp->name); bpf_probe_read(&data.name, sizeof(data.name), valp->name);
bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm); bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
events.perf_submit(ctx, &data, sizeof(data)); events.perf_submit(ctx, &data, sizeof(data));
...@@ -172,15 +176,20 @@ int trace_write_return(struct pt_regs *ctx) ...@@ -172,15 +176,20 @@ int trace_write_return(struct pt_regs *ctx)
""" """
bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000)) bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000))
if args.pid: if args.tgid:
bpf_text = bpf_text.replace('FILTER', 'pid != %s' % pid) bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid)
else: else:
bpf_text = bpf_text.replace('FILTER', '0') bpf_text = bpf_text.replace('TGID_FILTER', '0')
if args.all_files:
bpf_text = bpf_text.replace('TYPE_FILTER', '0')
else:
bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')
if debug: if debug:
print(bpf_text) print(bpf_text)
# initialize BPF # initialize BPF
b = BPF(text=bpf_text) b = BPF(text=bpf_text,)
# I'd rather trace these via new_sync_read/new_sync_write (which used to be # I'd rather trace these via new_sync_read/new_sync_write (which used to be
# do_sync_read/do_sync_write), but those became static. So trace these from # do_sync_read/do_sync_write), but those became static. So trace these from
...@@ -205,6 +214,7 @@ class Data(ct.Structure): ...@@ -205,6 +214,7 @@ class Data(ct.Structure):
("pid", ct.c_uint), ("pid", ct.c_uint),
("sz", ct.c_uint), ("sz", ct.c_uint),
("delta_us", ct.c_ulonglong), ("delta_us", ct.c_ulonglong),
("name_len", ct.c_uint),
("name", ct.c_char * DNAME_INLINE_LEN), ("name", ct.c_char * DNAME_INLINE_LEN),
("comm", ct.c_char * TASK_COMM_LEN), ("comm", ct.c_char * TASK_COMM_LEN),
] ]
...@@ -216,7 +226,7 @@ mode_s = { ...@@ -216,7 +226,7 @@ mode_s = {
# header # header
print("Tracing sync read/writes slower than %d ms" % min_ms) print("Tracing sync read/writes slower than %d ms" % min_ms)
print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "PID", "D", print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D",
"BYTES", "LAT(ms)", "FILENAME")) "BYTES", "LAT(ms)", "FILENAME"))
start_ts = time.time() start_ts = time.time()
...@@ -225,10 +235,13 @@ def print_event(cpu, data, size): ...@@ -225,10 +235,13 @@ def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents event = ct.cast(data, ct.POINTER(Data)).contents
ms = float(event.delta_us) / 1000 ms = float(event.delta_us) / 1000
name = event.name
if event.name_len > DNAME_INLINE_LEN:
name = name[:-3] + "..."
print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % ( print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % (
time.time() - start_ts, event.comm, event.pid, mode_s[event.mode], time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, event.name)) event.sz, ms, name))
b["events"].open_perf_buffer(print_event) b["events"].open_perf_buffer(print_event)
while 1: while 1:
......
...@@ -105,7 +105,7 @@ locks, run queue latency, etc. These can be explored using other commands. ...@@ -105,7 +105,7 @@ locks, run queue latency, etc. These can be explored using other commands.
USAGE message: USAGE message:
# ./fileslower -h # ./fileslower -h
usage: fileslower [-h] [-p PID] [min_ms] usage: fileslower.py [-h] [-p PID] [-a] [min_ms]
Trace slow synchronous file reads and writes Trace slow synchronous file reads and writes
...@@ -115,6 +115,7 @@ positional arguments: ...@@ -115,6 +115,7 @@ positional arguments:
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
-p PID, --pid PID trace this PID only -p PID, --pid PID trace this PID only
-a, --all-files include non-regular file types
examples: examples:
./fileslower # trace sync file I/O slower than 10 ms (default) ./fileslower # trace sync file I/O slower than 10 ms (default)
......
...@@ -32,11 +32,13 @@ parser = argparse.ArgumentParser( ...@@ -32,11 +32,13 @@ parser = argparse.ArgumentParser(
description="File reads and writes by process", description="File reads and writes by process",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples) epilog=examples)
parser.add_argument("-a", "--all-files", action="store_true",
help="include non-regular file types (sockets, FIFOs, etc)")
parser.add_argument("-C", "--noclear", action="store_true", parser.add_argument("-C", "--noclear", action="store_true",
help="don't clear the screen") help="don't clear the screen")
parser.add_argument("-r", "--maxrows", default=20, parser.add_argument("-r", "--maxrows", default=20,
help="maximum rows to print, default 20") help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid", parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
help="trace this PID only") help="trace this PID only")
parser.add_argument("interval", nargs="?", default=1, parser.add_argument("interval", nargs="?", default=1,
help="output interval, in seconds") help="output interval, in seconds")
...@@ -61,13 +63,13 @@ bpf_text = """ ...@@ -61,13 +63,13 @@ bpf_text = """
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#define MAX_FILE_LEN 32
// the key for the output summary // the key for the output summary
struct info_t { struct info_t {
u32 pid; u32 pid;
char name[TASK_COMM_LEN]; u32 name_len;
char file[MAX_FILE_LEN]; char comm[TASK_COMM_LEN];
// de->d_name.name may point to de->d_iname so limit len accordingly
char name[DNAME_INLINE_LEN];
char type; char type;
}; };
...@@ -84,22 +86,23 @@ BPF_HASH(counts, struct info_t, struct val_t); ...@@ -84,22 +86,23 @@ BPF_HASH(counts, struct info_t, struct val_t);
static int do_entry(struct pt_regs *ctx, struct file *file, static int do_entry(struct pt_regs *ctx, struct file *file,
char __user *buf, size_t count, int is_read) char __user *buf, size_t count, int is_read)
{ {
u32 pid; u32 tgid = bpf_get_current_pid_tgid() >> 32;
if (TGID_FILTER)
pid = bpf_get_current_pid_tgid();
if (FILTER)
return 0; return 0;
u32 pid = bpf_get_current_pid_tgid();
// skip I/O lacking a filename // skip I/O lacking a filename
struct dentry *de = file->f_path.dentry; struct dentry *de = file->f_path.dentry;
if (de->d_iname[0] == 0) int mode = file->f_inode->i_mode;
if (de->d_name.len == 0 || TYPE_FILTER)
return 0; return 0;
// store counts and sizes by pid & file // store counts and sizes by pid & file
struct info_t info = {.pid = pid}; struct info_t info = {.pid = pid};
bpf_get_current_comm(&info.name, sizeof(info.name)); bpf_get_current_comm(&info.comm, sizeof(info.comm));
__builtin_memcpy(&info.file, de->d_iname, sizeof(info.file)); info.name_len = de->d_name.len;
int mode = file->f_inode->i_mode; bpf_probe_read(&info.name, sizeof(info.name), (void *)de->d_name.name);
if (S_ISREG(mode)) { if (S_ISREG(mode)) {
info.type = 'R'; info.type = 'R';
} else if (S_ISSOCK(mode)) { } else if (S_ISSOCK(mode)) {
...@@ -134,17 +137,28 @@ int trace_write_entry(struct pt_regs *ctx, struct file *file, ...@@ -134,17 +137,28 @@ int trace_write_entry(struct pt_regs *ctx, struct file *file,
} }
""" """
if args.pid: if args.tgid:
bpf_text = bpf_text.replace('FILTER', 'pid != %s' % args.pid) bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
else:
bpf_text = bpf_text.replace('TGID_FILTER', '0')
if args.all_files:
bpf_text = bpf_text.replace('TYPE_FILTER', '0')
else: else:
bpf_text = bpf_text.replace('FILTER', '0') bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')
if debug: if debug:
print(bpf_text) print(bpf_text)
# initialize BPF # initialize BPF
b = BPF(text=bpf_text) b = BPF(text=bpf_text)
b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry")
b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") try:
b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
except:
# older kernels don't have __vfs_write so try vfs_write instead
b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry")
DNAME_INLINE_LEN = 32 # linux/dcache.h
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval) print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
...@@ -163,19 +177,22 @@ while 1: ...@@ -163,19 +177,22 @@ while 1:
print() print()
with open(loadavg) as stats: with open(loadavg) as stats:
print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read())) print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("PID", "COMM", print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("TID", "COMM",
"READS", "WRITES", "R_Kb", "W_Kb", "T", "FILE")) "READS", "WRITES", "R_Kb", "W_Kb", "T", "FILE"))
# by-PID output # by-TID output
counts = b.get_table("counts") counts = b.get_table("counts")
line = 0 line = 0
for k, v in reversed(sorted(counts.items(), for k, v in reversed(sorted(counts.items(),
key=lambda counts: counts[1].rbytes)): key=lambda counts: counts[1].rbytes)):
name = k.name
if k.name_len > DNAME_INLINE_LEN:
name = name[:-3] + "..."
# print line # print line
print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.name, print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid, k.comm,
v.reads, v.writes, v.rbytes / 1024, v.wbytes / 1024, k.type, v.reads, v.writes, v.rbytes / 1024, v.wbytes / 1024, k.type,
k.file)) name))
line += 1 line += 1
if line >= maxrows: if line >= maxrows:
......
...@@ -24,11 +24,9 @@ PID COMM READS WRITES R_Kb W_Kb T FILE ...@@ -24,11 +24,9 @@ PID COMM READS WRITES R_Kb W_Kb T FILE
26628 ld 18 0 72 0 R hibernate.o 26628 ld 18 0 72 0 R hibernate.o
26628 ld 16 0 64 0 R suspend.o 26628 ld 16 0 64 0 R suspend.o
26628 ld 16 0 64 0 R snapshot.o 26628 ld 16 0 64 0 R snapshot.o
26630 cat 1 0 64 0 O null
26628 ld 16 0 64 0 R qos.o 26628 ld 16 0 64 0 R qos.o
26628 ld 13 0 52 0 R main.o 26628 ld 13 0 52 0 R main.o
26628 ld 12 0 52 0 R swap.o 26628 ld 12 0 52 0 R swap.o
12421 sshd 3 0 48 0 O ptmx
[...] [...]
This shows various files read and written during a Linux kernel build. The This shows various files read and written during a Linux kernel build. The
...@@ -40,7 +38,8 @@ While not printed, the average read and write size can be calculated by ...@@ -40,7 +38,8 @@ While not printed, the average read and write size can be calculated by
dividing R_Kb by READS, and the same for writes. dividing R_Kb by READS, and the same for writes.
The "T" column indicates the type of the file: "R" for regular files, "S" for The "T" column indicates the type of the file: "R" for regular files, "S" for
sockets, and "O" for other (including pipes). sockets, and "O" for other (including pipes). By default only regular files are
shown; use the -a option to show all file types.
This script works by tracing the vfs_read() and vfs_write() functions using This script works by tracing the vfs_read() and vfs_write() functions using
kernel dynamic tracing, which instruments explicit read and write calls. If kernel dynamic tracing, which instruments explicit read and write calls. If
...@@ -65,7 +64,6 @@ Tracing... Output every 1 secs. Hit Ctrl-C to end ...@@ -65,7 +64,6 @@ Tracing... Output every 1 secs. Hit Ctrl-C to end
PID COMM READS WRITES R_Kb W_Kb T FILE PID COMM READS WRITES R_Kb W_Kb T FILE
32672 cksum 5006 0 320384 0 R data1 32672 cksum 5006 0 320384 0 R data1
12296 sshd 2 0 32 0 O ptmx
809 run 2 0 8 0 R nsswitch.conf 809 run 2 0 8 0 R nsswitch.conf
811 run 2 0 8 0 R nsswitch.conf 811 run 2 0 8 0 R nsswitch.conf
804 chown 2 0 8 0 R nsswitch.conf 804 chown 2 0 8 0 R nsswitch.conf
...@@ -89,13 +87,13 @@ PID COMM READS WRITES R_Kb W_Kb T FILE ...@@ -89,13 +87,13 @@ PID COMM READS WRITES R_Kb W_Kb T FILE
848 run 2 0 8 0 R nsswitch.conf 848 run 2 0 8 0 R nsswitch.conf
[...] [...]
This output shows a cksum command reading data1. Note that This output shows a cksum command reading data1.
An optional interval and optional count can also be added to the end of the An optional interval and optional count can also be added to the end of the
command line. For example, for 1 second interval, and 3 summaries in total: command line. For example, for 1 second interval, and 3 summaries in total:
# ./filetop -Cr 5 1 3 # ./filetop -Cr 5 -a 1 3
Tracing... Output every 1 secs. Hit Ctrl-C to end Tracing... Output every 1 secs. Hit Ctrl-C to end
08:08:20 loadavg: 0.30 0.42 0.31 3/282 5187 08:08:20 loadavg: 0.30 0.42 0.31 3/282 5187
...@@ -126,15 +124,15 @@ PID COMM READS WRITES R_Kb W_Kb T FILE ...@@ -126,15 +124,15 @@ PID COMM READS WRITES R_Kb W_Kb T FILE
5227 run 2 0 8 0 R nsswitch.conf 5227 run 2 0 8 0 R nsswitch.conf
Detaching... Detaching...
This example has caught heavy socket I/O from an sshd process, showing up as This example shows the -a option to include all file types. It caught heavy
non-regular file types (the "O" for other, and "S" for socket, in the type socket I/O from an sshd process, showing up as non-regular file types (the "O"
column: "T"). for other, and "S" for socket, in the type column: "T").
USAGE message: USAGE message:
# ./filetop -h # ./filetop -h
usage: filetop [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count] usage: filetop.py [-h] [-a] [-C] [-r MAXROWS] [-p PID] [interval] [count]
File reads and writes by process File reads and writes by process
...@@ -144,6 +142,7 @@ positional arguments: ...@@ -144,6 +142,7 @@ positional arguments:
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
-a, --all-files include non-regular file types (sockets, FIFOs, etc)
-C, --noclear don't clear the screen -C, --noclear don't clear the screen
-r MAXROWS, --maxrows MAXROWS -r MAXROWS, --maxrows MAXROWS
maximum rows to print, default 20 maximum rows to print, default 20
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment