Commit cf378e25 authored by Brendan Gregg's avatar Brendan Gregg Committed by GitHub

Merge pull request #1131 from cherusk/retrans_sum

tcpretrans: extend for sum up
parents 4a2f2214 1e9467f8
.TH tcpaccept 8 "2016-02-14" "USER COMMANDS" .TH tcpaccept 8 "2016-02-14" "USER COMMANDS"
.SH NAME .SH NAME
tcpaccept \- Trace TCP retransmits and TLPs. Uses Linux eBPF/bcc. tcpaccept \- Trace or count TCP retransmits and TLPs. Uses Linux eBPF/bcc.
.SH SYNOPSIS .SH SYNOPSIS
.B tcpaccept [\-h] [\-l] .B tcpaccept [\-h] [\-l]
.SH DESCRIPTION .SH DESCRIPTION
...@@ -8,7 +8,9 @@ This traces TCP retransmits, showing address, port, and TCP state information, ...@@ -8,7 +8,9 @@ This traces TCP retransmits, showing address, port, and TCP state information,
and sometimes the PID (although usually not, since retransmits are usually and sometimes the PID (although usually not, since retransmits are usually
sent by the kernel on timeouts). To keep overhead very low, only sent by the kernel on timeouts). To keep overhead very low, only
the TCP retransmit functions are traced. This does not trace every packet the TCP retransmit functions are traced. This does not trace every packet
(like tcpdump(8) or a packet sniffer). (like tcpdump(8) or a packet sniffer). Optionally, it can count retransmits
over a user signalled interval to spot potentially dropping network paths the
flows are traversing.
This uses dynamic tracing of the kernel tcp_retransmit_skb() and This uses dynamic tracing of the kernel tcp_retransmit_skb() and
tcp_send_loss_probe() functions, and will need to be updated to tcp_send_loss_probe() functions, and will need to be updated to
...@@ -25,6 +27,9 @@ Print usage message. ...@@ -25,6 +27,9 @@ Print usage message.
\-l \-l
Include tail loss probe attempts (in some cases the kernel may not Include tail loss probe attempts (in some cases the kernel may not
complete the TLP send). complete the TLP send).
.TP
\-c
Count occurring retransmits per flow.
.SH EXAMPLES .SH EXAMPLES
.TP .TP
Trace TCP retransmits: Trace TCP retransmits:
...@@ -63,6 +68,9 @@ Remote port. ...@@ -63,6 +68,9 @@ Remote port.
.TP .TP
STATE STATE
TCP session state. TCP session state.
.TP
RETRANSMITS
Accumulated occurred retransmits since start.
.SH OVERHEAD .SH OVERHEAD
Should be negligible: TCP retransmit events should be low (<1000/s), and the Should be negligible: TCP retransmit events should be low (<1000/s), and the
low overhead this tool adds to each event should make the cost negligible. low overhead this tool adds to each event should make the cost negligible.
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# Licensed under the Apache License, Version 2.0 (the "License") # Licensed under the Apache License, Version 2.0 (the "License")
# #
# 14-Feb-2016 Brendan Gregg Created this. # 14-Feb-2016 Brendan Gregg Created this.
# 03-Nov-2017 Matthias Tafelmeier Extended this.
from __future__ import print_function from __future__ import print_function
from bcc import BPF from bcc import BPF
...@@ -21,6 +22,7 @@ from time import strftime ...@@ -21,6 +22,7 @@ from time import strftime
from socket import inet_ntop, AF_INET, AF_INET6 from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack from struct import pack
import ctypes as ct import ctypes as ct
from time import sleep
# arguments # arguments
examples = """examples: examples = """examples:
...@@ -33,6 +35,8 @@ parser = argparse.ArgumentParser( ...@@ -33,6 +35,8 @@ parser = argparse.ArgumentParser(
epilog=examples) epilog=examples)
parser.add_argument("-l", "--lossprobe", action="store_true", parser.add_argument("-l", "--lossprobe", action="store_true",
help="include tail loss probe attempts") help="include tail loss probe attempts")
parser.add_argument("-c", "--count", action="store_true",
help="count occurred retransmits per flow")
args = parser.parse_args() args = parser.parse_args()
# define BPF program # define BPF program
...@@ -70,6 +74,23 @@ struct ipv6_data_t { ...@@ -70,6 +74,23 @@ struct ipv6_data_t {
}; };
BPF_PERF_OUTPUT(ipv6_events); BPF_PERF_OUTPUT(ipv6_events);
// separate flow keys per address family
struct ipv4_flow_key_t {
u32 saddr;
u32 daddr;
u16 lport;
u16 dport;
};
BPF_HASH(ipv4_count, struct ipv4_flow_key_t);
struct ipv6_flow_key_t {
unsigned __int128 saddr;
unsigned __int128 daddr;
u16 lport;
u16 dport;
};
BPF_HASH(ipv6_count, struct ipv6_flow_key_t);
static int trace_event(struct pt_regs *ctx, struct sock *skp, int type) static int trace_event(struct pt_regs *ctx, struct sock *skp, int type)
{ {
if (skp == NULL) if (skp == NULL)
...@@ -83,26 +104,11 @@ static int trace_event(struct pt_regs *ctx, struct sock *skp, int type) ...@@ -83,26 +104,11 @@ static int trace_event(struct pt_regs *ctx, struct sock *skp, int type)
char state = skp->__sk_common.skc_state; char state = skp->__sk_common.skc_state;
if (family == AF_INET) { if (family == AF_INET) {
struct ipv4_data_t data4 = {.pid = pid, .ip = 4, .type = type}; IPV4_INIT
data4.saddr = skp->__sk_common.skc_rcv_saddr; IPV4_CORE
data4.daddr = skp->__sk_common.skc_daddr;
// lport is host order
data4.lport = lport;
data4.dport = ntohs(dport);
data4.state = state;
ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
} else if (family == AF_INET6) { } else if (family == AF_INET6) {
struct ipv6_data_t data6 = {.pid = pid, .ip = 6, .type = type}; IPV6_INIT
bpf_probe_read(&data6.saddr, sizeof(data6.saddr), IPV6_CORE
skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
// lport is host order
data6.lport = lport;
data6.dport = ntohs(dport);
data6.state = state;
ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
} }
// else drop // else drop
...@@ -122,6 +128,66 @@ int trace_tlp(struct pt_regs *ctx, struct sock *sk) ...@@ -122,6 +128,66 @@ int trace_tlp(struct pt_regs *ctx, struct sock *sk)
} }
""" """
struct_init = { 'ipv4':
{ 'count' :
"""
struct ipv4_flow_key_t flow_key = {};
flow_key.saddr = skp->__sk_common.skc_rcv_saddr;
flow_key.daddr = skp->__sk_common.skc_daddr;
// lport is host order
flow_key.lport = lport;
flow_key.dport = ntohs(dport);""",
'trace' :
"""
struct ipv4_data_t data4 = {.pid = pid, .ip = 4, .type = type};
data4.saddr = skp->__sk_common.skc_rcv_saddr;
data4.daddr = skp->__sk_common.skc_daddr;
// lport is host order
data4.lport = lport;
data4.dport = ntohs(dport);
data4.state = state; """
},
'ipv6':
{ 'count' :
"""
struct ipv6_flow_key_t flow_key = {};
bpf_probe_read(&flow_key.saddr, sizeof(flow_key.saddr),
skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read(&flow_key.daddr, sizeof(flow_key.daddr),
skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
// lport is host order
flow_key.lport = lport;
flow_key.dport = ntohs(dport);""",
'trace' : """
struct ipv6_data_t data6 = {.pid = pid, .ip = 6, .type = type};
bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
// lport is host order
data6.lport = lport;
data6.dport = ntohs(dport);
data6.state = state;"""
}
}
count_core_base = """
u64 zero = 0, *val;
val = COUNT_STRUCT.lookup_or_init(&flow_key, &zero);
(*val)++;
"""
if args.count:
bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['count'])
bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['count'])
bpf_text = bpf_text.replace("IPV4_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv4_count'))
bpf_text = bpf_text.replace("IPV6_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv6_count'))
else:
bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['trace'])
bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['trace'])
bpf_text = bpf_text.replace("IPV4_CORE", "ipv4_events.perf_submit(ctx, &data4, sizeof(data4));")
bpf_text = bpf_text.replace("IPV6_CORE", "ipv6_events.perf_submit(ctx, &data6, sizeof(data6));")
# event data # event data
class Data_ipv4(ct.Structure): class Data_ipv4(ct.Structure):
_fields_ = [ _fields_ = [
...@@ -186,18 +252,44 @@ def print_ipv6_event(cpu, data, size): ...@@ -186,18 +252,44 @@ def print_ipv6_event(cpu, data, size):
"%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport), "%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport),
tcpstate[event.state])) tcpstate[event.state]))
def depict_cnt(counts_tab, l3prot='ipv4'):
for k, v in sorted(counts_tab.items(), key=lambda counts: counts[1].value):
depict_key = ""
ep_fmt = "[%s]#%d"
if l3prot == 'ipv4':
depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET, pack('I', k.saddr)), k.lport),
ep_fmt % (inet_ntop(AF_INET, pack('I', k.daddr)), k.dport))
else:
depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET6, k.saddr), k.lport),
ep_fmt % (inet_ntop(AF_INET6, k.daddr), k.dport))
print ("%s %10d" % (depict_key, v.value))
# initialize BPF # initialize BPF
b = BPF(text=bpf_text) b = BPF(text=bpf_text)
b.attach_kprobe(event="tcp_retransmit_skb", fn_name="trace_retransmit") b.attach_kprobe(event="tcp_retransmit_skb", fn_name="trace_retransmit")
if args.lossprobe: if args.lossprobe:
b.attach_kprobe(event="tcp_send_loss_probe", fn_name="trace_tlp") b.attach_kprobe(event="tcp_send_loss_probe", fn_name="trace_tlp")
# header print("Tracing retransmits ... Hit Ctrl-C to end")
print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP", if args.count:
"LADDR:LPORT", "T", "RADDR:RPORT", "STATE")) try:
while 1:
sleep(99999999)
except BaseException:
pass
# header
print("\n%-25s %-25s %-10s" % (
"LADDR:LPORT", "RADDR:RPORT", "RETRANSMITS"))
depict_cnt(b.get_table("ipv4_count"))
depict_cnt(b.get_table("ipv6_count"), l3prot='ipv6')
# read events # read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event) else:
b["ipv6_events"].open_perf_buffer(print_ipv6_event) # header
while 1: print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP",
"LADDR:LPORT", "T", "RADDR:RPORT", "STATE"))
b["ipv4_events"].open_perf_buffer(print_ipv4_event)
b["ipv6_events"].open_perf_buffer(print_ipv6_event)
while 1:
b.kprobe_poll() b.kprobe_poll()
...@@ -44,6 +44,20 @@ TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE ...@@ -44,6 +44,20 @@ TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE
See the "L>" in the "T>" column. These are attempts: the kernel probably See the "L>" in the "T>" column. These are attempts: the kernel probably
sent a TLP, but in some cases it might not have been ultimately sent. sent a TLP, but in some cases it might not have been ultimately sent.
To spot heavily retransmitting flows quickly one can use the -c flag. It will
count occurring retransmits per flow.
# ./tcpretrans.py -c
Tracing retransmits ... Hit Ctrl-C to end
^C
LADDR:LPORT RADDR:RPORT RETRANSMITS
192.168.10.50:60366 <-> 172.217.21.194:443 700
192.168.10.50:666 <-> 172.213.11.195:443 345
192.168.10.50:366 <-> 172.212.22.194:443 211
[...]
This can ease to quickly isolate congested or otherwise awry network paths
responsible for clamping tcp performance.
USAGE message: USAGE message:
...@@ -55,6 +69,7 @@ Trace TCP retransmits ...@@ -55,6 +69,7 @@ Trace TCP retransmits
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
-l, --lossprobe include tail loss probe attempts -l, --lossprobe include tail loss probe attempts
-c, --count count occurred retransmits per flow
examples: examples:
./tcpretrans # trace TCP retransmits ./tcpretrans # trace TCP retransmits
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment