Commit 3f4888ad authored by David S. Miller

Merge branch 'tcp-tsq-perf'

Eric Dumazet says:

====================
tcp: tsq: performance series

Under very high TX stress, the CPU handling NIC TX completions can spend
a considerable number of cycles handling TSQ (TCP Small Queues) logic.

This patch series avoids some atomic operations, but the most notable
patch is the 3rd one, which allows other CPUs processing ACK packets and
calling tcp_write_xmit() to grab TCP_TSQ_DEFERRED, so that
tcp_tasklet_func() can skip already-processed sockets.

This avoids many lock acquisitions and cache line accesses,
particularly under load.
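
For illustration only (not part of the patches), the handoff can be
modeled in a few lines of userspace C11; the bit names and helpers below
are invented for the sketch and use <stdatomic.h> rather than the
kernel's test_bit()/clear_bit() primitives:

  #include <stdatomic.h>
  #include <stdio.h>

  #define TSQ_QUEUED_BIT    (1UL << 0)  /* socket is on the tasklet list */
  #define TSQ_DEFERRED_BIT  (1UL << 1)  /* TX work still owed to the socket */

  static _Atomic unsigned long tsq_flags;

  /* ACK path (think tcp_write_xmit()): claim the deferred work, if any. */
  static int ack_path_claims_work(void)
  {
          unsigned long old = atomic_fetch_and(&tsq_flags, ~TSQ_DEFERRED_BIT);

          return !!(old & TSQ_DEFERRED_BIT);  /* nonzero: this CPU owns the work */
  }

  /* Tasklet (think tcp_tasklet_func()): dequeue the socket and skip it
   * if the ACK path already consumed the deferred bit.
   */
  static void tasklet_func(void)
  {
          unsigned long old = atomic_fetch_and(&tsq_flags,
                                  ~(TSQ_QUEUED_BIT | TSQ_DEFERRED_BIT));

          if (old & TSQ_DEFERRED_BIT)
                  printf("tasklet: deferred work still pending, handle it\n");
          else
                  printf("tasklet: ACK path already did the work, skip socket\n");
  }

  int main(void)
  {
          atomic_store(&tsq_flags, TSQ_QUEUED_BIT | TSQ_DEFERRED_BIT);
          if (ack_path_claims_work())
                  printf("ack path: sent pending data\n");
          tasklet_func();
          return 0;
  }

In this sketch only one of the two paths can observe TSQ_DEFERRED_BIT set
while clearing it, so the TX work is done exactly once and the tasklet
side needs no socket lock when the bit is already gone.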

In v2, I added:

- A tcp_small_queue_check() change to allow the 1st and 2nd packets
  in the write queue to be sent, even when TX completion of
  already acknowledged packets has not happened yet.
  This helps when TX completion coalescing parameters are set
  to insane values, and/or busy polling is used
  (see the write-queue sketch after this list).

- A reorganization of struct sock fields to
  lower false sharing and increase data locality
  (see the layout sketch after this list).

- Then I moved tsq_flags from tcp_sock to struct sock, also
  to reduce cache line misses during TX completions.

I measured an overall throughput gain of 22 % for heavy TCP use
over a single TX queue.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents f83e8303 7aa5470c
@@ -186,7 +186,6 @@ struct tcp_sock {
u32 tsoffset; /* timestamp offset */
struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
unsigned long tsq_flags;
/* Data for direct copy to user */
struct {
@@ -364,7 +363,7 @@ struct tcp_sock {
u32 *saved_syn;
};
enum tsq_flags {
enum tsq_enum {
TSQ_THROTTLED,
TSQ_QUEUED,
TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */
@@ -375,6 +374,15 @@ enum tsq_flags {
*/
};
enum tsq_flags {
TSQF_THROTTLED = (1UL << TSQ_THROTTLED),
TSQF_QUEUED = (1UL << TSQ_QUEUED),
TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED),
TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
};
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
return (struct tcp_sock *)sk;
......
@@ -343,6 +343,9 @@ struct sock {
#define sk_rxhash __sk_common.skc_rxhash
socket_lock_t sk_lock;
atomic_t sk_drops;
int sk_rcvlowat;
struct sk_buff_head sk_error_queue;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
struct sk_buff *tail;
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
int sk_forward_alloc;
__u32 sk_txhash;
int sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sk_napi_id;
unsigned int sk_ll_usec;
/* ===== mostly read cache line ===== */
unsigned int sk_napi_id;
#endif
atomic_t sk_drops;
int sk_rcvbuf;
struct sk_filter __rcu *sk_filter;
@@ -379,11 +381,30 @@ struct sock {
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
/* Note: 32bit hole on 64bit arches */
atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
int sk_sndbuf;
/* ===== cache line for TX ===== */
int sk_wmem_queued;
atomic_t sk_wmem_alloc;
unsigned long sk_tsq_flags;
struct sk_buff *sk_send_head;
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
long sk_sndtimeo;
struct timer_list sk_timer;
__u32 sk_priority;
__u32 sk_mark;
u32 sk_pacing_rate; /* bytes per second */
u32 sk_max_pacing_rate;
struct page_frag sk_frag;
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
int sk_gso_type;
unsigned int sk_gso_max_size;
gfp_t sk_allocation;
__u32 sk_txhash;
/*
* Because of non atomicity rules, all
@@ -414,42 +435,24 @@ struct sock {
#define SK_PROTOCOL_MAX U8_MAX
kmemcheck_bitfield_end(flags);
int sk_wmem_queued;
gfp_t sk_allocation;
u32 sk_pacing_rate; /* bytes per second */
u32 sk_max_pacing_rate;
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
int sk_gso_type;
unsigned int sk_gso_max_size;
u16 sk_gso_max_segs;
int sk_rcvlowat;
unsigned long sk_lingertime;
struct sk_buff_head sk_error_queue;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
__u32 sk_priority;
__u32 sk_mark;
kuid_t sk_uid;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
long sk_sndtimeo;
struct timer_list sk_timer;
ktime_t sk_stamp;
u16 sk_tsflags;
u8 sk_shutdown;
u32 sk_tskey;
struct socket *sk_socket;
void *sk_user_data;
struct page_frag sk_frag;
struct sk_buff *sk_send_head;
__s32 sk_peek_off;
int sk_write_pending;
#ifdef CONFIG_SECURITY
void *sk_security;
#endif
......
@@ -663,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
......
@@ -443,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
......
@@ -769,25 +769,26 @@ static void tcp_tasklet_func(unsigned long data)
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
bh_lock_sock(sk);
clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
if (!sk->sk_lock.owned &&
test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
tcp_tsq_handler(sk);
} else {
/* defer the work to tcp_release_cb() */
set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
}
bh_unlock_sock(sk);
}
clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
(1UL << TCP_WRITE_TIMER_DEFERRED) | \
(1UL << TCP_DELACK_TIMER_DEFERRED) | \
(1UL << TCP_MTU_REDUCED_DEFERRED))
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED | \
TCPF_DELACK_TIMER_DEFERRED | \
TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -797,18 +798,17 @@ static void tcp_tasklet_func(unsigned long data)
*/
void tcp_release_cb(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
flags = tp->tsq_flags;
flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
if (flags & (1UL << TCP_TSQ_DEFERRED))
if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
/* Here begins the tricky part :
@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);
if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nval, oval;
int wmem;
/* Keep one reference on sk_wmem_alloc.
@@ -877,15 +878,24 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
!test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
unsigned long flags;
for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
bool empty;
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval)
continue;
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
if (empty)
tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
@@ -1922,26 +1932,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
*/
static int tcp_mtu_probe(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
int len;
int probe_size;
int size_needed;
int copy;
int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
* not SACKing (the variable headers throw things off) */
if (!icsk->icsk_mtup.enabled ||
* not SACKing (the variable headers throw things off)
*/
if (likely(!icsk->icsk_mtup.enabled ||
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
tp->snd_cwnd < 11 ||
tp->rx_opt.num_sacks || tp->rx_opt.dsack)
tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,
@@ -2081,7 +2091,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
/* Always send the 1st or 2nd skb in write queue.
* No need to wait for TX completion to call us back,
* after softirq/tasklet schedule.
* This helps when TX completions are delayed too much.
*/
if (skb == sk->sk_write_queue.next ||
skb->prev == sk->sk_write_queue.next)
return false;
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
@@ -2222,6 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
@@ -3524,8 +3545,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
* We also avoid tcp_wfree() overhead (cache line miss accessing
* tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);
......
@@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data)
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data)
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
......
@@ -399,7 +399,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
&tp->tsq_flags))
&sk->sk_tsq_flags))
sock_hold(sk);
goto out;
}
......