Commit 65f1df11 authored by Jakub Kicinski

Merge branch 'tcp-avoid-sending-too-small-packets'

Eric Dumazet says:

====================
tcp: avoid sending too small packets

tcp_sendmsg() cooks 'large' skbs, that are later split
if needed from tcp_write_xmit().

After a split, the leftover skb size is smaller than the optimal
size, and this causes a performance drop.

In this series, tcp_grow_skb() helper is added to shift
payload from the second skb in the write queue to the first
skb to always send optimal sized skbs.

This increases TSO efficiency, and decreases number of ACK
packets.
====================

Link: https://lore.kernel.org/r/20240418214600.1291486-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents f62a5e71 8ee602c6
...@@ -1502,18 +1502,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1502,18 +1502,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
} }
/* Initialize TSO segments for a packet. */ /* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{ {
int tso_segs;
if (skb->len <= mss_now) { if (skb->len <= mss_now) {
/* Avoid the costly divide in the normal /* Avoid the costly divide in the normal
* non-TSO case. * non-TSO case.
*/ */
tcp_skb_pcount_set(skb, 1);
TCP_SKB_CB(skb)->tcp_gso_size = 0; TCP_SKB_CB(skb)->tcp_gso_size = 0;
} else { tcp_skb_pcount_set(skb, 1);
tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); return 1;
TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
} }
TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
tso_segs = DIV_ROUND_UP(skb->len, mss_now);
tcp_skb_pcount_set(skb, tso_segs);
return tso_segs;
} }
/* Pcount in the middle of the write queue got changed, we need to do various /* Pcount in the middle of the write queue got changed, we need to do various
...@@ -2073,16 +2077,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, ...@@ -2073,16 +2077,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
/* Can at least one segment of SKB be sent right now, according to the /* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed. * congestion window rules? If so, return how many segments are allowed.
*/ */
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, static u32 tcp_cwnd_test(const struct tcp_sock *tp)
const struct sk_buff *skb)
{ {
u32 in_flight, cwnd, halfcwnd; u32 in_flight, cwnd, halfcwnd;
/* Don't be strict about the congestion window for the final FIN. */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
tcp_skb_pcount(skb) == 1)
return 1;
in_flight = tcp_packets_in_flight(tp); in_flight = tcp_packets_in_flight(tp);
cwnd = tcp_snd_cwnd(tp); cwnd = tcp_snd_cwnd(tp);
if (in_flight >= cwnd) if (in_flight >= cwnd)
...@@ -2103,10 +2101,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) ...@@ -2103,10 +2101,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{ {
int tso_segs = tcp_skb_pcount(skb); int tso_segs = tcp_skb_pcount(skb);
if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
tcp_set_skb_tso_segs(skb, mss_now); return tcp_set_skb_tso_segs(skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs; return tso_segs;
} }
...@@ -2686,6 +2683,36 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) ...@@ -2686,6 +2683,36 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
tcp_chrono_set(tp, TCP_CHRONO_BUSY); tcp_chrono_set(tp, TCP_CHRONO_BUSY);
} }
/* First skb in the write queue is smaller than ideal packet size.
 * Check if we can move payload from the second skb in the queue.
 *
 * @sk:     socket owning the write queue
 * @skb:    skb to grow (the one about to be sent)
 * @amount: upper bound on the number of payload bytes to pull from the
 *          skb that follows @skb; it is clamped to that skb's length.
 *          Note that min_t(u32, ...) casts @amount to u32, so a negative
 *          value would not act as "nothing to move"; the tcp_write_xmit()
 *          call site shown in this change only calls with amount > 0.
 *
 * Returns nothing. On any early return @skb and its successor are left
 * untouched by this function.
 *
 * NOTE(review): when the successor is fully consumed, only end_seq, eor
 * and tcp_flags are carried over to @skb here. Confirm whether any
 * timestamp / tx-flags state of the consumed skb must also be merged.
 */
static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
{
struct sk_buff *next_skb = skb->next;
unsigned int nlen;
/* No following skb to take payload from. */
if (tcp_skb_is_last(sk, skb))
return;
/* The two skbs are not allowed to be merged. */
if (!tcp_skb_can_collapse(skb, next_skb))
return;
/* Never try to move more than next_skb actually holds. */
nlen = min_t(u32, amount, next_skb->len);
/* Nothing to move, or skb_shift() reported that it moved nothing. */
if (!nlen || !skb_shift(skb, next_skb, nlen))
return;
/* The boundary between the two skbs advanced by nlen bytes of sequence space. */
TCP_SKB_CB(skb)->end_seq += nlen;
TCP_SKB_CB(next_skb)->seq += nlen;
/* next_skb has no payload left: fold its remaining state into skb and drop it. */
if (!next_skb->len) {
/* Take next_skb's end_seq as-is; presumably this covers sequence space
 * not backed by payload (e.g. a FIN flag) -- TODO confirm.
 */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
/* Remove the emptied skb from the write queue and release it. */
tcp_unlink_write_queue(next_skb, sk);
tcp_wmem_free_skb(sk, next_skb);
}
}
/* This routine writes packets to the network. It advances the /* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote * send_head. This happens as incoming acks open up the remote
* window for us. * window for us.
...@@ -2706,10 +2733,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2706,10 +2733,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
unsigned int tso_segs, sent_pkts; unsigned int tso_segs, sent_pkts;
int cwnd_quota; u32 cwnd_quota, max_segs;
int result; int result;
bool is_cwnd_limited = false, is_rwnd_limited = false; bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
sent_pkts = 0; sent_pkts = 0;
...@@ -2727,6 +2753,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2727,6 +2753,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
max_segs = tcp_tso_segs(sk, mss_now); max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) { while ((skb = tcp_send_head(sk))) {
unsigned int limit; unsigned int limit;
int missing_bytes;
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
...@@ -2740,10 +2767,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2740,10 +2767,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (tcp_pacing_check(sk)) if (tcp_pacing_check(sk))
break; break;
tso_segs = tcp_init_tso_segs(skb, mss_now); cwnd_quota = tcp_cwnd_test(tp);
BUG_ON(!tso_segs);
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) { if (!cwnd_quota) {
if (push_one == 2) if (push_one == 2)
/* Force out a loss probe pkt. */ /* Force out a loss probe pkt. */
...@@ -2751,6 +2775,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2751,6 +2775,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
else else
break; break;
} }
cwnd_quota = min(cwnd_quota, max_segs);
missing_bytes = cwnd_quota * mss_now - skb->len;
if (missing_bytes > 0)
tcp_grow_skb(sk, skb, missing_bytes);
tso_segs = tcp_set_skb_tso_segs(skb, mss_now);
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true; is_rwnd_limited = true;
...@@ -2772,9 +2802,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2772,9 +2802,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
limit = mss_now; limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp)) if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now, limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int, cwnd_quota,
cwnd_quota,
max_segs),
nonagle); nonagle);
if (skb->len > limit && if (skb->len > limit &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment