Commit 2f695553 authored by David S. Miller

Merge branch 'tcp-take-a-bit-more-care-of-backlog-stress'

Eric Dumazet says:

====================
tcp: take a bit more care of backlog stress

While working on the SACK compression issue Jean-Louis Dupond
reported, we found that his Linux box was suffering badly from
tail drops on the socket backlog queue.

First patch hints the compiler that sack flows are the norm.

Second patch changes non-sack code in preparation for the ack
compression.

Third patch fixes tcp_space() to take backlog into account.

Fourth patch attempts coalescing when a new packet must be
added to the backlog queue. Cooking bigger skbs helps keep the
backlog list shorter and speeds up its handling when the user
thread finally releases the socket lock.

v3: Neal/Yuchung feedback addressed:
    Do not aggregate if any skb has URG bit set.
    Do not aggregate if the skbs have different ECE/CWR bits.

v2: added feedback from Neal: tcp: take care of compressed acks in tcp_add_reno_sack()
    added: tcp: hint compiler about sack flows
    added: tcp: make tcp_space() aware of socket backlog
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents b0e3f1bd 4f693b55
diff --git a/include/net/tcp.h b/include/net/tcp.h
@@ -1124,7 +1124,7 @@ void tcp_rate_check_app_limited(struct sock *sk);
  */
 static inline int tcp_is_sack(const struct tcp_sock *tp)
 {
-	return tp->rx_opt.sack_ok;
+	return likely(tp->rx_opt.sack_ok);
 }
 
 static inline bool tcp_is_reno(const struct tcp_sock *tp)
@@ -1368,7 +1368,7 @@ static inline int tcp_win_from_space(const struct sock *sk, int space)
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
-	return tcp_win_from_space(sk, sk->sk_rcvbuf -
+	return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len -
 				  atomic_read(&sk->sk_rmem_alloc));
 }
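
For intuition about the tcp_space() change above, here is a rough userspace model. It is only a sketch: win_from_space() mimics tcp_win_from_space() for a positive tcp_adv_win_scale (default 1, i.e. half the space is reserved as overhead), and all buffer numbers are made up.

#include <stdio.h>

/* Simplified stand-in for tcp_win_from_space() */
static int win_from_space(int space, int adv_win_scale)
{
	return adv_win_scale <= 0 ? space >> -adv_win_scale
				  : space - (space >> adv_win_scale);
}

int main(void)
{
	int rcvbuf = 1 << 20;		/* hypothetical 1 MB receive buffer */
	int rmem_alloc = 300 << 10;	/* bytes already charged to the receive queue */
	int backlog_len = 200 << 10;	/* bytes parked on the socket backlog */

	/* Before the patch, backlog bytes were invisible to tcp_space() */
	printf("old tcp_space: %d\n", win_from_space(rcvbuf - rmem_alloc, 1));
	/* After the patch, backlog bytes shrink the advertised window too */
	printf("new tcp_space: %d\n",
	       win_from_space(rcvbuf - rmem_alloc - backlog_len, 1));
	return 0;
}

The point is that bytes sitting in the backlog now reduce the advertised receive window, so the sender is throttled before the backlog overflows and tail-drops.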
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
@@ -243,6 +243,7 @@ enum
 	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
 	LINUX_MIB_TCPRETRANSFAIL,		/* TCPRetransFail */
 	LINUX_MIB_TCPRCVCOALESCE,		/* TCPRcvCoalesce */
+	LINUX_MIB_TCPBACKLOGCOALESCE,		/* TCPBacklogCoalesce */
 	LINUX_MIB_TCPOFOQUEUE,			/* TCPOFOQueue */
 	LINUX_MIB_TCPOFODROP,			/* TCPOFODrop */
 	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
 	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
 	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+	SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
 	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
 	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
 	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
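
Counters registered in snmp4_net_list are exported in the TcpExt section of /proc/net/netstat (nstat reads the same file). As an illustration only, not part of the series, a tiny standalone reader for the new counter could look like this:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char names[8192], values[8192];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f)
		return 1;
	/* The file alternates a header line and a value line per section. */
	while (fgets(names, sizeof(names), f) &&
	       fgets(values, sizeof(values), f)) {
		char *np, *vp;
		char *n = strtok_r(names, " \n", &np);
		char *v = strtok_r(values, " \n", &vp);

		if (!n || !v || strcmp(n, "TcpExt:"))
			continue;
		/* Walk counter names and values in lockstep */
		while ((n = strtok_r(NULL, " \n", &np)) &&
		       (v = strtok_r(NULL, " \n", &vp))) {
			if (!strcmp(n, "TCPBacklogCoalesce"))
				printf("TCPBacklogCoalesce = %s\n", v);
		}
	}
	fclose(f);
	return 0;
}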
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
@@ -1865,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 /* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct sock *sk)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	u32 prior_sacked = tp->sacked_out;
+	if (num_dupack) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		u32 prior_sacked = tp->sacked_out;
+		s32 delivered;
 
-	tp->sacked_out++;
-	tcp_check_reno_reordering(sk, 0);
-	if (tp->sacked_out > prior_sacked)
-		tp->delivered++; /* Some out-of-order packet is delivered */
-	tcp_verify_left_out(tp);
+		tp->sacked_out += num_dupack;
+		tcp_check_reno_reordering(sk, 0);
+		delivered = tp->sacked_out - prior_sacked;
+		if (delivered > 0)
+			tp->delivered += delivered;
+		tcp_verify_left_out(tp);
+	}
 }
 
 /* Account for ACK, ACKing some data in Reno Recovery phase. */
@@ -2636,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
  * recovered or spurious. Otherwise retransmits more on partial ACKs.
  */
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
 			     int *rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2655,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 			return;
 
 		if (after(tp->snd_nxt, tp->high_seq)) {
-			if (flag & FLAG_DATA_SACKED || is_dupack)
+			if (flag & FLAG_DATA_SACKED || num_dupack)
 				tp->frto = 0; /* Step 3.a. loss was real */
 		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
 			tp->high_seq = tp->snd_nxt;
@@ -2681,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 		/* A Reno DUPACK means new data in F-RTO step 2.b above are
 		 * delivered. Lower inflight to clock out (re)tranmissions.
 		 */
-		if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
-			tcp_add_reno_sack(sk);
+		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+			tcp_add_reno_sack(sk, num_dupack);
 		else if (flag & FLAG_SND_UNA_ADVANCED)
 			tcp_reset_reno_sack(tp);
 	}
@@ -2759,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk)
  *	tcp_xmit_retransmit_queue().
  */
 static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
-				  bool is_dupack, int *ack_flag, int *rexmit)
+				  int num_dupack, int *ack_flag, int *rexmit)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fast_rexmit = 0, flag = *ack_flag;
-	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
-				     tcp_force_fast_retransmit(sk));
+	bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
+				      tcp_force_fast_retransmit(sk));
 
 	if (!tp->packets_out && tp->sacked_out)
 		tp->sacked_out = 0;
@@ -2812,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
 		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
-			if (tcp_is_reno(tp) && is_dupack)
-				tcp_add_reno_sack(sk);
+			if (tcp_is_reno(tp))
+				tcp_add_reno_sack(sk, num_dupack);
 		} else {
 			if (tcp_try_undo_partial(sk, prior_snd_una))
 				return;
@@ -2828,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		tcp_identify_packet_loss(sk, ack_flag);
 		break;
 	case TCP_CA_Loss:
-		tcp_process_loss(sk, flag, is_dupack, rexmit);
+		tcp_process_loss(sk, flag, num_dupack, rexmit);
 		tcp_identify_packet_loss(sk, ack_flag);
 		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
 		      (*ack_flag & FLAG_LOST_RETRANS)))
@@ -2839,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
 				tcp_reset_reno_sack(tp);
-			if (is_dupack)
-				tcp_add_reno_sack(sk);
+			tcp_add_reno_sack(sk, num_dupack);
 		}
 
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
@@ -3562,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	bool is_sack_reneg = tp->is_sack_reneg;
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
-	bool is_dupack = false;
+	int num_dupack = 0;
 	int prior_packets = tp->packets_out;
 	u32 delivered = tp->delivered;
 	u32 lost = tp->lost;
@@ -3673,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tcp_set_xmit_timer(sk);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
-		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+		if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
+			num_dupack = 1;
+			/* Consider if pure acks were aggregated in tcp_add_backlog() */
+			if (!(flag & FLAG_DATA))
+				num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+		}
+		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
 				      &rexmit);
 	}
@@ -3692,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK) {
-		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
 				      &rexmit);
 		tcp_newly_delivered(sk, delivered, flag);
 	}
@@ -3717,7 +3725,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (TCP_SKB_CB(skb)->sacked) {
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 						&sack_state);
-		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
 				      &rexmit);
 		tcp_newly_delivered(sk, delivered, flag);
 		tcp_xmit_recovery(sk, rexmit);
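
To see why tcp_ack() now derives num_dupack from gso_segs, here is a toy userspace model, not kernel code, with the flag logic only loosely mirrored: a pure duplicate ACK that tcp_add_backlog() built out of several original ACKs still counts as that many dupacks on a non-SACK (Reno) connection, so fast retransmit is not delayed by the coalescing.

#include <stdio.h>

struct toy_ack {
	int is_dup;		/* did not advance SND.UNA, no data/SACK/window update */
	int has_data;		/* rough equivalent of FLAG_DATA */
	unsigned int gso_segs;	/* how many ACKs were folded into this skb */
};

static int num_dupack(const struct toy_ack *a)
{
	if (!a->is_dup)
		return 0;
	if (a->has_data)
		return 1;
	/* mirrors max_t(u16, 1, skb_shinfo(skb)->gso_segs) */
	return a->gso_segs ? (int)a->gso_segs : 1;
}

int main(void)
{
	struct toy_ack coalesced = { .is_dup = 1, .has_data = 0, .gso_segs = 3 };

	/* Three duplicate ACKs merged into one skb still count as three */
	printf("num_dupack = %d\n", num_dupack(&coalesced));
	return 0;
}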
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-	/* Only socket owner can try to collapse/prune rx queues
-	 * to reduce memory overhead, so add a little headroom here.
-	 * Few sockets backlog are possibly concurrently non empty.
-	 */
-	limit += 64*1024;
+	struct skb_shared_info *shinfo;
+	const struct tcphdr *th;
+	struct tcphdr *thtail;
+	struct sk_buff *tail;
+	unsigned int hdrlen;
+	bool fragstolen;
+	u32 gso_segs;
+	int delta;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 	skb_dst_drop(skb);
 
+	if (unlikely(tcp_checksum_complete(skb))) {
+		bh_unlock_sock(sk);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		return true;
+	}
+
+	/* Attempt coalescing to last skb in backlog, even if we are
+	 * above the limits.
+	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+	 */
+	th = (const struct tcphdr *)skb->data;
+	hdrlen = th->doff * 4;
+	shinfo = skb_shinfo(skb);
+
+	if (!shinfo->gso_size)
+		shinfo->gso_size = skb->len - hdrlen;
+
+	if (!shinfo->gso_segs)
+		shinfo->gso_segs = 1;
+
+	tail = sk->sk_backlog.tail;
+	if (!tail)
+		goto no_coalesce;
+	thtail = (struct tcphdr *)tail->data;
+
+	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+	    ((TCP_SKB_CB(tail)->tcp_flags |
+	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+	    ((TCP_SKB_CB(tail)->tcp_flags ^
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+	    tail->decrypted != skb->decrypted ||
+#endif
+	    thtail->doff != th->doff ||
+	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+		goto no_coalesce;
+
+	__skb_pull(skb, hdrlen);
+	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+		thtail->window = th->window;
+
+		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			TCP_SKB_CB(tail)->has_rxtstamp = true;
+			tail->tstamp = skb->tstamp;
+			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		/* Not as strict as GRO. We only need to carry mss max value */
+		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+						 skb_shinfo(tail)->gso_size);
+
+		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+		sk->sk_backlog.len += delta;
+		__NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPBACKLOGCOALESCE);
+		kfree_skb_partial(skb, fragstolen);
+		return false;
+	}
+	__skb_push(skb, hdrlen);
+
+no_coalesce:
+	/* Only socket owner can try to collapse/prune rx queues
+	 * to reduce memory overhead, so add a little headroom here.
+	 * Few sockets backlog are possibly concurrently non empty.
+	 */
+	limit += 64*1024;
+
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
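
Finally, a toy model (again userspace, not the kernel code above) of the bookkeeping done when two backlog segments are merged: contiguous sequence ranges are joined, the newest window and the highest ack_seq are kept, TCP flags are OR'ed, and the segment count accumulates (capped at 0xFFFF, mirroring the 16-bit gso_segs field) so ACK processing still knows how many segments arrived.

#include <stdio.h>

struct toy_seg {
	unsigned int seq, end_seq, ack_seq;
	unsigned int window, gso_segs;
	unsigned char flags;
};

/* Merge skb into tail if their data is contiguous; returns 1 on success. */
static int coalesce(struct toy_seg *tail, const struct toy_seg *skb)
{
	if (tail->end_seq != skb->seq)
		return 0;
	tail->end_seq = skb->end_seq;
	tail->window = skb->window;		/* newest window wins */
	if ((int)(skb->ack_seq - tail->ack_seq) > 0)
		tail->ack_seq = skb->ack_seq;	/* highest ack wins */
	tail->flags |= skb->flags;
	tail->gso_segs = (tail->gso_segs + skb->gso_segs > 0xFFFF) ?
			 0xFFFF : tail->gso_segs + skb->gso_segs;
	return 1;
}

int main(void)
{
	struct toy_seg tail = { 1000, 2000, 5000, 512, 1, 0x10 };
	struct toy_seg next = { 2000, 3000, 6000, 500, 1, 0x18 };

	if (coalesce(&tail, &next))
		printf("tail now covers [%u,%u), ack %u, %u segs\n",
		       tail.seq, tail.end_seq, tail.ack_seq, tail.gso_segs);
	return 0;
}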