Commit 28b24f90 authored by Eric Dumazet, committed by David S. Miller

net: implement lockless SO_MAX_PACING_RATE

SO_MAX_PACING_RATE setsockopt() does not need to hold
the socket lock, because sk->sk_pacing_rate readers
can run fine if the value is changed by other threads,
after adding READ_ONCE() accessors.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 2a4319cf
...@@ -44,7 +44,7 @@ TRACE_EVENT(mptcp_subflow_get_send, ...@@ -44,7 +44,7 @@ TRACE_EVENT(mptcp_subflow_get_send,
ssk = mptcp_subflow_tcp_sock(subflow); ssk = mptcp_subflow_tcp_sock(subflow);
if (ssk && sk_fullsock(ssk)) { if (ssk && sk_fullsock(ssk)) {
__entry->snd_wnd = tcp_sk(ssk)->snd_wnd; __entry->snd_wnd = tcp_sk(ssk)->snd_wnd;
__entry->pace = ssk->sk_pacing_rate; __entry->pace = READ_ONCE(ssk->sk_pacing_rate);
} else { } else {
__entry->snd_wnd = 0; __entry->snd_wnd = 0;
__entry->pace = 0; __entry->pace = 0;
......
...@@ -1160,6 +1160,27 @@ int sk_setsockopt(struct sock *sk, int level, int optname, ...@@ -1160,6 +1160,27 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(sk->sk_busy_poll_budget, val); WRITE_ONCE(sk->sk_busy_poll_budget, val);
return 0; return 0;
#endif #endif
case SO_MAX_PACING_RATE:
{
unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
unsigned long pacing_rate;
if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
return -EFAULT;
}
if (ulval != ~0UL)
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
/* Pairs with READ_ONCE() from sk_getsockopt() */
WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
pacing_rate = READ_ONCE(sk->sk_pacing_rate);
if (ulval < pacing_rate)
WRITE_ONCE(sk->sk_pacing_rate, ulval);
return 0;
}
} }
sockopt_lock_sock(sk); sockopt_lock_sock(sk);
...@@ -1423,25 +1444,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname, ...@@ -1423,25 +1444,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
break; break;
case SO_MAX_PACING_RATE:
{
unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
ret = -EFAULT;
break;
}
if (ulval != ~0UL)
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
/* Pairs with READ_ONCE() from sk_getsockopt() */
WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
break;
}
case SO_INCOMING_CPU: case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val); reuseport_update_incoming_cpu(sk, val);
break; break;
......
...@@ -258,7 +258,7 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) ...@@ -258,7 +258,7 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
u64 rate = bw; u64 rate = bw;
rate = bbr_rate_bytes_per_sec(sk, rate, gain); rate = bbr_rate_bytes_per_sec(sk, rate, gain);
rate = min_t(u64, rate, sk->sk_max_pacing_rate); rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
return rate; return rate;
} }
...@@ -278,7 +278,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) ...@@ -278,7 +278,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
} }
bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
do_div(bw, rtt_us); do_div(bw, rtt_us);
sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); WRITE_ONCE(sk->sk_pacing_rate,
bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
} }
/* Pace using current bw estimate and a gain factor. */ /* Pace using current bw estimate and a gain factor. */
...@@ -290,14 +291,14 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) ...@@ -290,14 +291,14 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
bbr_init_pacing_rate_from_rtt(sk); bbr_init_pacing_rate_from_rtt(sk);
if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate))
sk->sk_pacing_rate = rate; WRITE_ONCE(sk->sk_pacing_rate, rate);
} }
/* override sysctl_tcp_min_tso_segs */ /* override sysctl_tcp_min_tso_segs */
__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
{ {
return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
} }
static u32 bbr_tso_segs_goal(struct sock *sk) static u32 bbr_tso_segs_goal(struct sock *sk)
...@@ -309,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) ...@@ -309,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
* driver provided sk_gso_max_size. * driver provided sk_gso_max_size.
*/ */
bytes = min_t(unsigned long, bytes = min_t(unsigned long,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
......
...@@ -927,8 +927,8 @@ static void tcp_update_pacing_rate(struct sock *sk) ...@@ -927,8 +927,8 @@ static void tcp_update_pacing_rate(struct sock *sk)
* without any lock. We want to make sure compiler wont store * without any lock. We want to make sure compiler wont store
* intermediate values in this location. * intermediate values in this location.
*/ */
WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, WRITE_ONCE(sk->sk_pacing_rate,
sk->sk_max_pacing_rate)); min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
} }
/* Calculate rto without backoff. This is the second half of Van Jacobson's /* Calculate rto without backoff. This is the second half of Van Jacobson's
......
...@@ -1201,7 +1201,7 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, ...@@ -1201,7 +1201,7 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (sk->sk_pacing_status != SK_PACING_NONE) { if (sk->sk_pacing_status != SK_PACING_NONE) {
unsigned long rate = sk->sk_pacing_rate; unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
/* Original sch_fq does not pace first 10 MSS /* Original sch_fq does not pace first 10 MSS
* Note that tp->data_segs_out overflows after 2^32 packets, * Note that tp->data_segs_out overflows after 2^32 packets,
...@@ -1973,7 +1973,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, ...@@ -1973,7 +1973,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
unsigned long bytes; unsigned long bytes;
u32 r; u32 r;
bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift); bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
...@@ -2553,7 +2553,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, ...@@ -2553,7 +2553,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long, limit = max_t(unsigned long,
2 * skb->truesize, 2 * skb->truesize,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift)); READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
if (sk->sk_pacing_status == SK_PACING_NONE) if (sk->sk_pacing_status == SK_PACING_NONE)
limit = min_t(unsigned long, limit, limit = min_t(unsigned long, limit,
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
...@@ -2561,7 +2561,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, ...@@ -2561,7 +2561,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
if (static_branch_unlikely(&tcp_tx_delay_enabled) && if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
tcp_sk(sk)->tcp_tx_delay) { tcp_sk(sk)->tcp_tx_delay) {
u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
tcp_sk(sk)->tcp_tx_delay;
/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
* approximate our needs assuming an ~100% skb->truesize overhead. * approximate our needs assuming an ~100% skb->truesize overhead.
......
...@@ -668,7 +668,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) ...@@ -668,7 +668,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
*/ */
if (!skb->tstamp) { if (!skb->tstamp) {
if (skb->sk) if (skb->sk)
rate = min(skb->sk->sk_pacing_rate, rate); rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
if (rate <= q->low_rate_threshold) { if (rate <= q->low_rate_threshold) {
f->credit = 0; f->credit = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment