Commit 9494dc0b authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'udp-small-changes-on-receive-path'

Eric Dumazet says:

====================
udp: small changes on receive path

This series is based on an observation I made in UDP receive path.

The sock_def_readable() costs are pretty high, especially when
epoll is used to generate EPOLLIN events.

First patch annotates races on sk->sk_rcvbuf reads.

Second patch replaces an atomic_add_return()
 with a less expensive atomic_add()

Third patch avoids calling sock_def_readable() when possible.

Fourth patch adds sk_wake_async_rcu() to get better inlining
and code generation.
====================

Link: https://lore.kernel.org/r/20240328144032.1864988-1-edumazet@google.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 46dc11be 1abe267f
...@@ -847,7 +847,7 @@ void af_alg_wmem_wakeup(struct sock *sk) ...@@ -847,7 +847,7 @@ void af_alg_wmem_wakeup(struct sock *sk)
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
EPOLLRDNORM | EPOLLRDNORM |
EPOLLRDBAND); EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock(); rcu_read_unlock();
} }
EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup);
...@@ -914,7 +914,7 @@ static void af_alg_data_wakeup(struct sock *sk) ...@@ -914,7 +914,7 @@ static void af_alg_data_wakeup(struct sock *sk)
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
EPOLLRDNORM | EPOLLRDNORM |
EPOLLRDBAND); EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -2513,6 +2513,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) ...@@ -2513,6 +2513,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band)
} }
} }
static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
{
if (unlikely(sock_flag(sk, SOCK_FASYNC)))
sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
}
/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
* need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak.
* Note: for send buffers, TCP works better if we can build two skbs at * Note: for send buffers, TCP works better if we can build two skbs at
......
...@@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk) ...@@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk)
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible(&wq->wait); wake_up_interruptible(&wq->wait);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
} }
rcu_read_unlock(); rcu_read_unlock();
......
...@@ -3338,7 +3338,7 @@ static void sock_def_error_report(struct sock *sk) ...@@ -3338,7 +3338,7 @@ static void sock_def_error_report(struct sock *sk)
wq = rcu_dereference(sk->sk_wq); wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, EPOLLERR); wake_up_interruptible_poll(&wq->wait, EPOLLERR);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
rcu_read_unlock(); rcu_read_unlock();
} }
...@@ -3353,7 +3353,7 @@ void sock_def_readable(struct sock *sk) ...@@ -3353,7 +3353,7 @@ void sock_def_readable(struct sock *sk)
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND); EPOLLRDNORM | EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock(); rcu_read_unlock();
} }
...@@ -3373,7 +3373,7 @@ static void sock_def_write_space(struct sock *sk) ...@@ -3373,7 +3373,7 @@ static void sock_def_write_space(struct sock *sk)
EPOLLWRNORM | EPOLLWRBAND); EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */ /* Should agree with poll, otherwise some programs break */
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -3398,7 +3398,7 @@ static void sock_def_write_space_wfree(struct sock *sk) ...@@ -3398,7 +3398,7 @@ static void sock_def_write_space_wfree(struct sock *sk)
EPOLLWRNORM | EPOLLWRBAND); EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */ /* Should agree with poll, otherwise some programs break */
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
} }
} }
......
...@@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk) ...@@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk)
wake_up_interruptible(&wq->wait); wake_up_interruptible(&wq->wait);
/* Should agree with poll, otherwise some programs break */ /* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk)) if (sock_writeable(sk))
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -1492,13 +1492,15 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1492,13 +1492,15 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
struct sk_buff_head *list = &sk->sk_receive_queue; struct sk_buff_head *list = &sk->sk_receive_queue;
int rmem, err = -ENOMEM; int rmem, err = -ENOMEM;
spinlock_t *busy = NULL; spinlock_t *busy = NULL;
int size; bool becomes_readable;
int size, rcvbuf;
/* try to avoid the costly atomic add/sub pair when the receive /* Immediately drop when the receive queue is full.
* queue is full; always allow at least a packet * Always allow at least one packet.
*/ */
rmem = atomic_read(&sk->sk_rmem_alloc); rmem = atomic_read(&sk->sk_rmem_alloc);
if (rmem > sk->sk_rcvbuf) rcvbuf = READ_ONCE(sk->sk_rcvbuf);
if (rmem > rcvbuf)
goto drop; goto drop;
/* Under mem pressure, it might be helpful to help udp_recvmsg() /* Under mem pressure, it might be helpful to help udp_recvmsg()
...@@ -1507,7 +1509,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1507,7 +1509,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
* - Less cache line misses at copyout() time * - Less cache line misses at copyout() time
* - Less work at consume_skb() (less alien page frag freeing) * - Less work at consume_skb() (less alien page frag freeing)
*/ */
if (rmem > (sk->sk_rcvbuf >> 1)) { if (rmem > (rcvbuf >> 1)) {
skb_condense(skb); skb_condense(skb);
busy = busylock_acquire(sk); busy = busylock_acquire(sk);
...@@ -1515,12 +1517,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1515,12 +1517,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
size = skb->truesize; size = skb->truesize;
udp_set_dev_scratch(skb); udp_set_dev_scratch(skb);
/* we drop only if the receive buf is full and the receive atomic_add(size, &sk->sk_rmem_alloc);
* queue contains some other skb
*/
rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
goto uncharge_drop;
spin_lock(&list->lock); spin_lock(&list->lock);
err = udp_rmem_schedule(sk, size); err = udp_rmem_schedule(sk, size);
...@@ -1536,12 +1533,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1536,12 +1533,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
*/ */
sock_skb_set_dropcount(sk, skb); sock_skb_set_dropcount(sk, skb);
becomes_readable = skb_queue_empty(list);
__skb_queue_tail(list, skb); __skb_queue_tail(list, skb);
spin_unlock(&list->lock); spin_unlock(&list->lock);
if (!sock_flag(sk, SOCK_DEAD)) if (!sock_flag(sk, SOCK_DEAD)) {
INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); if (becomes_readable ||
sk->sk_data_ready != sock_def_readable ||
READ_ONCE(sk->sk_peek_off) >= 0)
INDIRECT_CALL_1(sk->sk_data_ready,
sock_def_readable, sk);
else
sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
}
busylock_release(busy); busylock_release(busy);
return 0; return 0;
......
...@@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) ...@@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk)
wq = rcu_dereference(sk->sk_wq); wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible_all(&wq->wait); wake_up_interruptible_all(&wq->wait);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) ...@@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk)
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible(&wq->wait); wake_up_interruptible(&wq->wait);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
} }
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -9276,7 +9276,7 @@ void sctp_data_ready(struct sock *sk) ...@@ -9276,7 +9276,7 @@ void sctp_data_ready(struct sock *sk)
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
EPOLLRDNORM | EPOLLRDBAND); EPOLLRDNORM | EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) ...@@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk)
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND); EPOLLRDNORM | EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
if ((sk->sk_shutdown == SHUTDOWN_MASK) || if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
(sk->sk_state == SMC_CLOSED)) (sk->sk_state == SMC_CLOSED))
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP);
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk) ...@@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk)
if (skwq_has_sleeper(wq)) if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, wake_up_interruptible_sync_poll(&wq->wait,
EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
} }
rcu_read_unlock(); rcu_read_unlock();
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment