Commit 4666b6e2, authored by Yaogong Wang, committed by Greg Kroah-Hartman

tcp: use an RB tree for ooo receive queue

[ Upstream commit 9f5afeae ]

Over the years, TCP BDP has increased by several orders of magnitude,
and some people are considering going all the way to the 2 Gbytes limit.

Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000
MSS.

In the presence of packet losses (or reorders), TCP stores incoming packets
in an out-of-order queue, and the number of skbs sitting there waiting for
the missing packets to be received can be in the 10^5 range.

Most packets are appended to the tail of this queue, and when
packets can finally be transferred to receive queue, we scan the queue
from its head.

However, in presence of heavy losses, we might have to find an arbitrary
point in this queue, involving a linear scan for every incoming packet,
throwing away cpu caches.

This patch converts it to a RB tree, to get bounded latencies.

Yaogong wrote a preliminary patch about 2 years ago.
Eric did the rebase, added ofo_last_skb cache, polishing and tests.

Tested with network dropping between 1 and 10 % packets, with good
success (about 30 % increase of throughput in stress tests)

Next step would be to also use an RB tree for the write queue at sender
side ;)

Signed-off-by: Yaogong Wang <wygivan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent ec7055c6
...@@ -2273,6 +2273,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) ...@@ -2273,6 +2273,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
kfree_skb(skb); kfree_skb(skb);
} }
void skb_rbtree_purge(struct rb_root *root);
void *netdev_alloc_frag(unsigned int fragsz); void *netdev_alloc_frag(unsigned int fragsz);
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
...@@ -2807,6 +2809,12 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) ...@@ -2807,6 +2809,12 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
return __pskb_trim(skb, len); return __pskb_trim(skb, len);
} }
/* Helpers for rbtrees whose nodes are embedded in struct sk_buff through
 * the rbnode member.
 *
 * rb_to_skb() maps a struct rb_node pointer to the sk_buff that contains
 * it, via rb_entry_safe(). NOTE(review): rb_entry_safe() presumably yields
 * NULL for a NULL node, which would make the first/last/next/prev helpers
 * return NULL at the ends of the tree -- confirm against its definition.
 *
 * skb_rb_first()/skb_rb_last() take the tree root; skb_rb_next()/
 * skb_rb_prev() take an skb and step from its rbnode.
 */
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root) rb_to_skb(rb_last(root))
#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode))
#define skb_queue_walk(queue, skb) \ #define skb_queue_walk(queue, skb) \
for (skb = (queue)->next; \ for (skb = (queue)->next; \
skb != (struct sk_buff *)(queue); \ skb != (struct sk_buff *)(queue); \
......
...@@ -279,10 +279,9 @@ struct tcp_sock { ...@@ -279,10 +279,9 @@ struct tcp_sock {
struct sk_buff* lost_skb_hint; struct sk_buff* lost_skb_hint;
struct sk_buff *retransmit_skb_hint; struct sk_buff *retransmit_skb_hint;
/* OOO segments go in this list. Note that socket lock must be held, /* OOO segments go in this rbtree. Socket lock must be held. */
* as we do not use sk_buff_head lock. struct rb_root out_of_order_queue;
*/ struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */
struct sk_buff_head out_of_order_queue;
/* SACKs data, these 2 need to be together (see tcp_options_write) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */
struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
......
...@@ -649,7 +649,7 @@ static inline void tcp_fast_path_check(struct sock *sk) ...@@ -649,7 +649,7 @@ static inline void tcp_fast_path_check(struct sock *sk)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (skb_queue_empty(&tp->out_of_order_queue) && if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
tp->rcv_wnd && tp->rcv_wnd &&
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
!tp->urg_data) !tp->urg_data)
......
...@@ -2377,6 +2377,25 @@ void skb_queue_purge(struct sk_buff_head *list) ...@@ -2377,6 +2377,25 @@ void skb_queue_purge(struct sk_buff_head *list)
} }
EXPORT_SYMBOL(skb_queue_purge); EXPORT_SYMBOL(skb_queue_purge);
/**
 *	skb_rbtree_purge - empty a skb rbtree
 *	@root: root of the rbtree to empty
 *
 *	Delete all buffers on an &sk_buff rbtree. Every buffer linked in the
 *	tree through its rbnode member is handed to kfree_skb(), dropping one
 *	reference, and @root is then reset to RB_ROOT. Nodes are not erased
 *	individually; the tree is discarded as a whole.
 *
 *	This function does not take any lock. Synchronization should be
 *	handled by the caller (e.g., TCP out-of-order queue is protected by
 *	the socket lock).
 */
void skb_rbtree_purge(struct rb_root *root)
{
struct sk_buff *skb, *next;
/* The _safe iterator tracks the following entry in @next, so the
 * current skb can be released inside the loop body.
 */
rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
kfree_skb(skb);
/* All entries were released above; leave the caller with an empty tree
 * rather than a root that still points at freed nodes.
 */
*root = RB_ROOT;
}
/** /**
* skb_queue_head - queue a buffer at the list head * skb_queue_head - queue a buffer at the list head
* @list: list to use * @list: list to use
......
...@@ -382,7 +382,7 @@ void tcp_init_sock(struct sock *sk) ...@@ -382,7 +382,7 @@ void tcp_init_sock(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
__skb_queue_head_init(&tp->out_of_order_queue); tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk); tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp); tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsq_node);
...@@ -2240,7 +2240,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -2240,7 +2240,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk); tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue);
tcp_write_queue_purge(sk); tcp_write_queue_purge(sk);
__skb_queue_purge(&tp->out_of_order_queue); skb_rbtree_purge(&tp->out_of_order_queue);
inet->inet_dport = 0; inet->inet_dport = 0;
......
This diff is collapsed.
...@@ -1830,7 +1830,7 @@ void tcp_v4_destroy_sock(struct sock *sk) ...@@ -1830,7 +1830,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_write_queue_purge(sk); tcp_write_queue_purge(sk);
/* Cleans up our, hopefully empty, out_of_order_queue. */ /* Cleans up our, hopefully empty, out_of_order_queue. */
__skb_queue_purge(&tp->out_of_order_queue); skb_rbtree_purge(&tp->out_of_order_queue);
#ifdef CONFIG_TCP_MD5SIG #ifdef CONFIG_TCP_MD5SIG
/* Clean up the MD5 key list, if any */ /* Clean up the MD5 key list, if any */
......
...@@ -496,7 +496,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, ...@@ -496,7 +496,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_cwnd_cnt = 0; newtp->snd_cwnd_cnt = 0;
tcp_init_xmit_timers(newsk); tcp_init_xmit_timers(newsk);
__skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0; newtp->rx_opt.saw_tstamp = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment