Commit 62af0eac authored by yonghong-song's avatar yonghong-song Committed by GitHub

Merge pull request #1790 from brendangregg/master

add tcpdrop tool
parents b0098f2e 1a5090a5
......@@ -141,6 +141,7 @@ pair of .c and .py files, and some are directories of files.
- tools/[tcpaccept](tools/tcpaccept.py): Trace TCP passive connections (accept()). [Examples](tools/tcpaccept_example.txt).
- tools/[tcpconnect](tools/tcpconnect.py): Trace TCP active connections (connect()). [Examples](tools/tcpconnect_example.txt).
- tools/[tcpconnlat](tools/tcpconnlat.py): Trace TCP active connection latency (connect()). [Examples](tools/tcpconnlat_example.txt).
- tools/[tcpdrop](tools/tcpdrop.py): Trace kernel-based TCP packet drops with details. [Examples](tools/tcpdrop_example.txt).
- tools/[tcplife](tools/tcplife.py): Trace TCP sessions and summarize lifespan. [Examples](tools/tcplife_example.txt).
- tools/[tcpretrans](tools/tcpretrans.py): Trace TCP retransmits and TLPs. [Examples](tools/tcpretrans_example.txt).
- tools/[tcpstates](tools/tcpstates.py): Trace TCP session state changes with durations. [Examples](tools/tcpstates_example.txt).
......
.TH tcpdrop 8 "2018-05-30" "USER COMMANDS"
.SH NAME
tcpdrop \- Trace kernel-based TCP packet drops with details. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B tcpdrop [\-h] [\-T] [\-t] [\-w] [\-s] [\-p PID] [\-D PORTS] [\-L PORTS]
.SH DESCRIPTION
This tool traces TCP packets or segments that were dropped by the kernel, and
shows details from the IP and TCP headers, the socket state, and the
kernel stack trace. This is useful for debugging cases of high kernel drops,
which can cause timer-based retransmits and performance issues.
This tool works using dynamic tracing of the tcp_drop() kernel function,
which requires a recent kernel version.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
.TP
\-h
Print usage message.
.B tcpdrop
.SH FIELDS
.TP
TIME
Time of the drop, in HH:MM:SS format.
.TP
PID
Process ID that was on-CPU during the drop. This may be unrelated, as drops
can occur on the receive interrupt and be unrelated to the PID that was
interrupted.
.TP
IP
IP address family (4 or 6)
.TP
SADDR
Source IP address.
.TP
SPORT
Source TCP port.
.TP
DADDR
Destination IP address.
.TP
DPORT
Destionation TCP port.
.TP
STATE
TCP session state ("ESTABLISHED", etc).
.TP
FLAGS
TCP flags ("SYN", etc).
.SH OVERHEAD
This traces the kernel tcp_drop() function, which should be low frequency,
and therefore the overhead of this tool should be negligible.
As always, test and understand this tools overhead for your types of
workloads before production use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH SEE ALSO
tcplife(8), tcpaccept(8), tcpconnect(8), tcptop(8)
# Copyright 2018 Netflix, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# from include/net/tcp_states.h:
tcpstate = {}
tcpstate[1] = 'ESTABLISHED'
tcpstate[2] = 'SYN_SENT'
tcpstate[3] = 'SYN_RECV'
tcpstate[4] = 'FIN_WAIT1'
tcpstate[5] = 'FIN_WAIT2'
tcpstate[6] = 'TIME_WAIT'
tcpstate[7] = 'CLOSE'
tcpstate[8] = 'CLOSE_WAIT'
tcpstate[9] = 'LAST_ACK'
tcpstate[10] = 'LISTEN'
tcpstate[11] = 'CLOSING'
tcpstate[12] = 'NEW_SYN_RECV'
# from include/net/tcp.h:
TCPHDR_FIN = 0x01;
TCPHDR_SYN = 0x02;
TCPHDR_RST = 0x04;
TCPHDR_PSH = 0x08;
TCPHDR_ACK = 0x10;
TCPHDR_URG = 0x20;
TCPHDR_ECE = 0x40;
TCPHDR_CWR = 0x80;
def flags2str(flags):
arr = [];
if flags & TCPHDR_FIN:
arr.append("FIN");
if flags & TCPHDR_SYN:
arr.append("SYN");
if flags & TCPHDR_RST:
arr.append("RST");
if flags & TCPHDR_PSH:
arr.append("PSH");
if flags & TCPHDR_ACK:
arr.append("ACK");
if flags & TCPHDR_URG:
arr.append("URG");
if flags & TCPHDR_ECE:
arr.append("ECE");
if flags & TCPHDR_CWR:
arr.append("CWR");
return "|".join(arr);
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# tcpdrop Trace TCP kernel-dropped packets/segments.
# For Linux, uses BCC, eBPF. Embedded C.
#
# This provides information such as packet details, socket state, and kernel
# stack trace for packets/segments that were dropped via tcp_drop().
#
# USAGE: tcpdrop [-c] [-h] [-l]
#
# This uses dynamic tracing of kernel functions, and will need to be updated
# to match kernel changes.
#
# Copyright 2018 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 30-May-2018 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
import argparse
from time import strftime
from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack
import ctypes as ct
from time import sleep
from bcc import tcp
# arguments
examples = """examples:
./tcpdrop # trace kernel TCP drops
"""
parser = argparse.ArgumentParser(
description="Trace TCP drops by the kernel",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("--ebpf", action="store_true",
help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/ip.h>
#include <net/sock.h>
#include <bcc/proto.h>
BPF_STACK_TRACE(stack_traces, 1024);
// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
u32 pid;
u64 ip;
u64 saddr;
u64 daddr;
u16 sport;
u16 dport;
u8 state;
u8 tcpflags;
u32 stack_id;
};
BPF_PERF_OUTPUT(ipv4_events);
struct ipv6_data_t {
u32 pid;
u64 ip;
unsigned __int128 saddr;
unsigned __int128 daddr;
u16 sport;
u16 dport;
u8 state;
u8 tcpflags;
u32 stack_id;
};
BPF_PERF_OUTPUT(ipv6_events);
static struct tcphdr *skb_to_tcphdr(const struct sk_buff *skb)
{
// unstable API. verify logic in tcp_hdr() -> skb_transport_header().
return (struct tcphdr *)(skb->head + skb->transport_header);
}
static inline struct iphdr *skb_to_iphdr(const struct sk_buff *skb)
{
// unstable API. verify logic in ip_hdr() -> skb_network_header().
return (struct iphdr *)(skb->head + skb->network_header);
}
// from include/net/tcp.h:
#ifndef tcp_flag_byte
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
#endif
int trace_tcp_drop(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb)
{
if (sk == NULL)
return 0;
u32 pid = bpf_get_current_pid_tgid();
// pull in details from the packet headers and the sock struct
u16 family = sk->__sk_common.skc_family;
char state = sk->__sk_common.skc_state;
u16 sport = 0, dport = 0;
u8 tcpflags = 0;
struct tcphdr *tcp = skb_to_tcphdr(skb);
struct iphdr *ip = skb_to_iphdr(skb);
bpf_probe_read(&sport, sizeof(sport), &tcp->source);
bpf_probe_read(&dport, sizeof(dport), &tcp->dest);
bpf_probe_read(&tcpflags, sizeof(tcpflags), &tcp_flag_byte(tcp));
sport = ntohs(sport);
dport = ntohs(dport);
if (family == AF_INET) {
struct ipv4_data_t data4 = {.pid = pid, .ip = 4};
bpf_probe_read(&data4.saddr, sizeof(u32), &ip->saddr);
bpf_probe_read(&data4.daddr, sizeof(u32), &ip->daddr);
data4.dport = dport;
data4.sport = sport;
data4.state = state;
data4.tcpflags = tcpflags;
data4.stack_id = stack_traces.get_stackid(ctx, 0);
ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
} else if (family == AF_INET6) {
struct ipv6_data_t data6 = {.pid = pid, .ip = 6};
bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
data6.dport = dport;
data6.sport = sport;
data6.state = state;
data6.tcpflags = tcpflags;
data6.stack_id = stack_traces.get_stackid(ctx, 0);
ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
}
// else drop
return 0;
}
"""
if debug or args.ebpf:
print(bpf_text)
if args.ebpf:
exit()
# event data
class Data_ipv4(ct.Structure):
_fields_ = [
("pid", ct.c_ulong),
("ip", ct.c_ulonglong),
("saddr", ct.c_ulonglong),
("daddr", ct.c_ulonglong),
("sport", ct.c_ushort),
("dport", ct.c_ushort),
("state", ct.c_ubyte),
("tcpflags", ct.c_ubyte),
("stack_id", ct.c_ulong)
]
class Data_ipv6(ct.Structure):
_fields_ = [
("pid", ct.c_ulong),
("ip", ct.c_ulonglong),
("saddr", (ct.c_ulonglong * 2)),
("daddr", (ct.c_ulonglong * 2)),
("sport", ct.c_ushort),
("dport", ct.c_ushort),
("state", ct.c_ubyte),
("tcpflags", ct.c_ubyte),
("stack_id", ct.c_ulong)
]
# process event
def print_ipv4_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % (
strftime("%H:%M:%S"), event.pid, event.ip,
"%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.sport),
"%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport),
tcp.tcpstate[event.state], tcp.flags2str(event.tcpflags)))
for addr in stack_traces.walk(event.stack_id):
sym = b.ksym(addr, show_offset=True)
print("\t%s" % sym)
print("")
def print_ipv6_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % (
strftime("%H:%M:%S"), event.pid, event.ip,
"%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.sport),
"%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport),
tcp.tcpstate[event.state], tcp.flags2str(event.tcpflags)))
for addr in stack_traces.walk(event.stack_id):
sym = b.ksym(addr, show_offset=True)
print("\t%s" % sym)
print("")
# initialize BPF
b = BPF(text=bpf_text)
if b.get_kprobe_functions("tcp_drop"):
b.attach_kprobe(event="tcp_drop", fn_name="trace_tcp_drop")
else:
print("ERROR: tcp_drop() kernel function not found or traceable. "
"Older kernel versions not supported.")
exit()
stack_traces = b.get_table("stack_traces")
# header
print("%-8s %-6s %-2s %-20s > %-20s %s (%s)" % ("TIME", "PID", "IP",
"SADDR:SPORT", "DADDR:DPORT", "STATE", "FLAGS"))
# read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event)
b["ipv6_events"].open_perf_buffer(print_ipv6_event)
while 1:
b.perf_buffer_poll()
Demonstrations of tcpdrop, the Linux BPF/bcc version.
tcpdrop prints details of TCP packets or segments that were dropped by the
kernel, including the kernel stack trace that led to the drop:
# ./tcpdrop.py
TIME PID IP SADDR:SPORT > DADDR:DPORT STATE (FLAGS)
20:49:06 0 4 10.32.119.56:443 > 10.66.65.252:22912 CLOSE (ACK)
tcp_drop+0x1
tcp_v4_do_rcv+0x135
tcp_v4_rcv+0x9c7
ip_local_deliver_finish+0x62
ip_local_deliver+0x6f
ip_rcv_finish+0x129
ip_rcv+0x28f
__netif_receive_skb_core+0x432
__netif_receive_skb+0x18
netif_receive_skb_internal+0x37
napi_gro_receive+0xc5
ena_clean_rx_irq+0x3c3
ena_io_poll+0x33f
net_rx_action+0x140
__softirqentry_text_start+0xdf
irq_exit+0xb6
do_IRQ+0x82
ret_from_intr+0x0
native_safe_halt+0x6
default_idle+0x20
arch_cpu_idle+0x15
default_idle_call+0x23
do_idle+0x17f
cpu_startup_entry+0x73
rest_init+0xae
start_kernel+0x4dc
x86_64_start_reservations+0x24
x86_64_start_kernel+0x74
secondary_startup_64+0xa5
20:49:50 12431 4 127.0.0.1:8198 > 127.0.0.1:48280 CLOSE (RST|ACK)
tcp_drop+0x1
tcp_v4_do_rcv+0x135
__release_sock+0x88
release_sock+0x30
inet_stream_connect+0x47
SYSC_connect+0x9e
sys_connect+0xe
do_syscall_64+0x73
entry_SYSCALL_64_after_hwframe+0x3d
[...]
The last two columns show the state of the TCP session, and the TCP flags.
These two examples show packets arriving for a session in the closed state,
that were dropped by the kernel.
This tool is useful for debugging high rates of drops, which can cause the
remote end to do timer-based retransmits, hurting performance.
USAGE:
# ./tcpdrop.py -h
usage: tcpdrop.py [-h]
Trace TCP drops by the kernel
optional arguments:
-h, --help show this help message and exit
examples:
./tcpdrop # trace kernel TCP drops
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment