Commit b0e01253 authored by Eric Dumazet's avatar Eric Dumazet Committed by Jakub Kicinski

tcp: refine tcp_prune_ofo_queue() logic

After commits 36a6503f ("tcp: refine tcp_prune_ofo_queue()
to not drop all packets") and 72cd43ba
("tcp: free batches of packets in tcp_prune_ofo_queue()")
tcp_prune_ofo_queue() drops a fraction of ooo queue,
to make room for incoming packet.

However it makes no sense to drop packets that are
before the incoming packet, in sequence space.

In order to recover from packet losses faster,
it makes more sense to only drop ooo packets
which are after the incoming packet.

Tested:
packetdrill test:
   0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [3800], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 0>
  +.1 < . 1:1(0) ack 1 win 1024
   +0 accept(3, ..., ...) = 4

 +.01 < . 200:300(100) ack 1 win 1024
   +0 > . 1:1(0) ack 1 <nop,nop, sack 200:300>

 +.01 < . 400:500(100) ack 1 win 1024
   +0 > . 1:1(0) ack 1 <nop,nop, sack 400:500 200:300>

 +.01 < . 600:700(100) ack 1 win 1024
   +0 > . 1:1(0) ack 1 <nop,nop, sack 600:700 400:500 200:300>

 +.01 < . 800:900(100) ack 1 win 1024
   +0 > . 1:1(0) ack 1 <nop,nop, sack 800:900 600:700 400:500 200:300>

 +.01 < . 1000:1100(100) ack 1 win 1024
   +0 > . 1:1(0) ack 1 <nop,nop, sack 1000:1100 800:900 600:700 400:500>

 +.01 < . 1200:1300(100) ack 1 win 1024
   +0 > . 1:1(0) ack 1 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>

// this packet is dropped because we have no room left.
 +.01 < . 1400:1500(100) ack 1 win 1024

 +.01 < . 1:200(199) ack 1 win 1024
// Make sure kernel did not drop 200:300 sequence
   +0 > . 1:1(0) ack 300 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
// Make room, since our RCVBUF is very small
   +0 read(4, ..., 299) = 299

 +.01 < . 300:400(100) ack 1 win 1024
   +0 > . 1:1(0) ack 500 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>

 +.01 < . 500:600(100) ack 1 win 1024
   +0 > . 1:1(0) ack 700 <nop,nop, sack 1200:1300 1000:1100 800:900>

   +0 read(4, ..., 400) = 400

 +.01 < . 700:800(100) ack 1 win 1024
   +0 > . 1:1(0) ack 900 <nop,nop, sack 1200:1300 1000:1100>

 +.01 < . 900:1000(100) ack 1 win 1024
   +0 > . 1:1(0) ack 1100 <nop,nop, sack 1200:1300>

 +.01 < . 1100:1200(100) ack 1 win 1024
// This checks that 1200:1300 has not been removed from ooo queue
   +0 > . 1:1(0) ack 1300
Suggested-by: default avatarJakub Kicinski <kuba@kernel.org>
Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Acked-by: default avatarSoheil Hassas Yeganeh <soheil@google.com>
Acked-by: default avatarKuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20221101035234.3910189-1-edumazet@google.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent 44827016
......@@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
}
}
static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
unsigned int size)
......@@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk) < 0)
if (tcp_prune_queue(sk, skb) < 0)
return -1;
while (!sk_rmem_schedule(sk, skb, size)) {
if (!tcp_prune_ofo_queue(sk))
if (!tcp_prune_ofo_queue(sk, skb))
return -1;
}
}
......@@ -5329,6 +5329,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
* Clean the out-of-order queue to make room.
* We drop high sequences packets to :
* 1) Let a chance for holes to be filled.
* This means we do not drop packets from ooo queue if their sequence
* is before incoming packet sequence.
* 2) not add too big latencies if thousands of packets sit there.
* (But if application shrinks SO_RCVBUF, we could still end up
* freeing whole queue here)
......@@ -5336,24 +5338,31 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
*
* Return true if queue has shrunk.
*/
static bool tcp_prune_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node *node, *prev;
bool pruned = false;
int goal;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
goal = sk->sk_rcvbuf >> 3;
node = &tp->ooo_last_skb->rbnode;
do {
struct sk_buff *skb = rb_to_skb(node);
/* If incoming skb would land last in ofo queue, stop pruning. */
if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
break;
pruned = true;
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
goal -= rb_to_skb(node)->truesize;
tcp_drop_reason(sk, rb_to_skb(node),
SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
goal -= skb->truesize;
tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
tp->ooo_last_skb = rb_to_skb(prev);
if (!prev || goal <= 0) {
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
......@@ -5362,8 +5371,9 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
}
node = prev;
} while (node);
tp->ooo_last_skb = rb_to_skb(prev);
if (pruned) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
/* Reset SACK state. A conforming SACK implementation will
* do the same at a timeout based retransmit. When a connection
* is in a sad state like this, we care only about integrity
......@@ -5371,7 +5381,8 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
*/
if (tp->rx_opt.sack_ok)
tcp_sack_reset(&tp->rx_opt);
return true;
}
return pruned;
}
/* Reduce allocated memory if we can, trying to get
......@@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
* until the socket owning process reads some of the data
* to stabilize the situation.
*/
static int tcp_prune_queue(struct sock *sk)
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
......@@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
/* Collapsing did not help, destructive actions follow.
* This must not ever occur. */
tcp_prune_ofo_queue(sk);
tcp_prune_ofo_queue(sk, in_skb);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment