Commit 01b2a995 authored by David S. Miller

Merge branch 'hash-rethink'

Akhmat Karakotov says:

====================
Make hash rethink configurable

As it was shown in the report by Alexander Azimov, hash rethink at the
client-side may lead to connection timeout toward stateful anycast
services. Tom Herbert created a patchset to address this issue by applying
hash rethink only after a negative routing event (3RTOs) [1]. This change
also affects server-side behavior, which we found undesirable. This
patchset changes the defaults to make them safe: hash rethink is disabled
at the client-side and enabled at the server-side, where it is performed
upon each RTO event or in case of duplicate acknowledgments.

This patchset provides two options to change default behaviour. The hash
rethink may be disabled at the server-side by the new sysctl option.
Changes in the sysctl option don't affect default behavior at the
client-side.

Hash rethink can also be enabled/disabled with a socket option or bpf
syscalls, which override both the default and sysctl settings. This socket
option is available on both client and server-side. This should provide
a mechanism to enable hash rethink inside an administrative domain, such as
a DC, where hash rethink at the client-side can be desirable.

[1] https://lore.kernel.org/netdev/20210809185314.38187-1-tom@herbertland.com/

v2:
	- Changed sysctl default to ENABLED in all patches. Reduced sysctl
	  and socket option size to u8. Fixed netns bug reported by kernel
	  test robot.

v3:
	- Fixed bug with bad u8 comparison. Moved sk_txrehash to use less
	  bytes in struct. Added WRITE_ONCE() in setsockopt and
	  READ_ONCE() in tcp_rtx_synack.

v4:
	- Rebase and add documentation for sysctl option.

v5:
	- Move sk_txrehash out of busy poll ifdef.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 678dfd52 cb6cd2ce
...@@ -365,6 +365,15 @@ new netns has been created. ...@@ -365,6 +365,15 @@ new netns has been created.
Default : 0 (for compatibility reasons) Default : 0 (for compatibility reasons)
txrehash
--------
Controls the default hash rethink behaviour on a listening socket when the
SO_TXREHASH option is set to SOCK_TXREHASH_DEFAULT (i.e. not overridden by
setsockopt).
If set to 1 (default), hash rethink is performed on the listening socket.
If set to 0, hash rethink is not performed.
2. /proc/sys/net/unix - Parameters for Unix domain sockets 2. /proc/sys/net/unix - Parameters for Unix domain sockets
---------------------------------------------------------- ----------------------------------------------------------
......
...@@ -133,6 +133,8 @@ ...@@ -133,6 +133,8 @@
#define SO_RESERVE_MEM 73 #define SO_RESERVE_MEM 73
#define SO_TXREHASH 74
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 #if __BITS_PER_LONG == 64
......
...@@ -144,6 +144,8 @@ ...@@ -144,6 +144,8 @@
#define SO_RESERVE_MEM 73 #define SO_RESERVE_MEM 73
#define SO_TXREHASH 74
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 #if __BITS_PER_LONG == 64
......
...@@ -125,6 +125,8 @@ ...@@ -125,6 +125,8 @@
#define SO_RESERVE_MEM 0x4047 #define SO_RESERVE_MEM 0x4047
#define SO_TXREHASH 0x4048
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 #if __BITS_PER_LONG == 64
......
...@@ -126,6 +126,8 @@ ...@@ -126,6 +126,8 @@
#define SO_RESERVE_MEM 0x0052 #define SO_RESERVE_MEM 0x0052
#define SO_TXREHASH 0x0053
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
......
...@@ -10,6 +10,7 @@ struct netns_core { ...@@ -10,6 +10,7 @@ struct netns_core {
struct ctl_table_header *sysctl_hdr; struct ctl_table_header *sysctl_hdr;
int sysctl_somaxconn; int sysctl_somaxconn;
u8 sysctl_txrehash;
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
struct prot_inuse __percpu *prot_inuse; struct prot_inuse __percpu *prot_inuse;
......
...@@ -316,6 +316,7 @@ struct sk_filter; ...@@ -316,6 +316,7 @@ struct sk_filter;
* @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_rcvtimeo: %SO_RCVTIMEO setting
* @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting
* @sk_txhash: computed flow hash for use on transmit * @sk_txhash: computed flow hash for use on transmit
* @sk_txrehash: enable TX hash rethink
* @sk_filter: socket filtering instructions * @sk_filter: socket filtering instructions
* @sk_timer: sock cleanup timer * @sk_timer: sock cleanup timer
* @sk_stamp: time stamp of last packet received * @sk_stamp: time stamp of last packet received
...@@ -491,6 +492,7 @@ struct sock { ...@@ -491,6 +492,7 @@ struct sock {
u32 sk_ack_backlog; u32 sk_ack_backlog;
u32 sk_max_ack_backlog; u32 sk_max_ack_backlog;
kuid_t sk_uid; kuid_t sk_uid;
u8 sk_txrehash;
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
u8 sk_prefer_busy_poll; u8 sk_prefer_busy_poll;
u16 sk_busy_poll_budget; u16 sk_busy_poll_budget;
...@@ -587,6 +589,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) ...@@ -587,6 +589,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
__tmp | SK_USER_DATA_NOCOPY); \ __tmp | SK_USER_DATA_NOCOPY); \
}) })
/*
 * sock_net - fetch the struct net pointer associated with a socket.
 * @sk: socket to query (not modified)
 *
 * Returns whatever read_pnet() yields for the sk->sk_net field.
 */
static inline
struct net *sock_net(const struct sock *sk)
{
return read_pnet(&sk->sk_net);
}
/*
 * sock_net_set - associate a struct net with a socket.
 * @sk:  socket whose sk_net field is updated
 * @net: value to store
 *
 * Stores @net into sk->sk_net through write_pnet(); returns nothing.
 */
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
write_pnet(&sk->sk_net, net);
}
/* /*
* SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
* or not whether his port will be reused by someone else. SK_FORCE_REUSE * or not whether his port will be reused by someone else. SK_FORCE_REUSE
...@@ -2054,7 +2068,7 @@ static inline void sk_set_txhash(struct sock *sk) ...@@ -2054,7 +2068,7 @@ static inline void sk_set_txhash(struct sock *sk)
static inline bool sk_rethink_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk)
{ {
if (sk->sk_txhash) { if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) {
sk_set_txhash(sk); sk_set_txhash(sk);
return true; return true;
} }
...@@ -2704,18 +2718,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) ...@@ -2704,18 +2718,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb); __kfree_skb(skb);
} }
static inline
struct net *sock_net(const struct sock *sk)
{
return read_pnet(&sk->sk_net);
}
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
write_pnet(&sk->sk_net, net);
}
static inline bool static inline bool
skb_sk_is_prefetched(struct sk_buff *skb) skb_sk_is_prefetched(struct sk_buff *skb)
{ {
......
...@@ -128,6 +128,8 @@ ...@@ -128,6 +128,8 @@
#define SO_RESERVE_MEM 73 #define SO_RESERVE_MEM 73
#define SO_TXREHASH 74
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
......
...@@ -31,4 +31,8 @@ struct __kernel_sockaddr_storage { ...@@ -31,4 +31,8 @@ struct __kernel_sockaddr_storage {
#define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK)
#define SOCK_TXREHASH_DEFAULT ((u8)-1)
#define SOCK_TXREHASH_DISABLED 0
#define SOCK_TXREHASH_ENABLED 1
#endif /* _UAPI_LINUX_SOCKET_H */ #endif /* _UAPI_LINUX_SOCKET_H */
...@@ -5091,6 +5091,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ...@@ -5091,6 +5091,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT: case SO_REUSEPORT:
sk->sk_reuseport = valbool; sk->sk_reuseport = valbool;
break; break;
case SO_TXREHASH:
if (val < -1 || val > 1) {
ret = -EINVAL;
break;
}
sk->sk_txrehash = (u8)val;
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
} }
...@@ -5269,6 +5276,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, ...@@ -5269,6 +5276,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT: case SO_REUSEPORT:
*((int *)optval) = sk->sk_reuseport; *((int *)optval) = sk->sk_reuseport;
break; break;
case SO_TXREHASH:
*((int *)optval) = sk->sk_txrehash;
break;
default: default:
goto err_clear; goto err_clear;
} }
......
...@@ -364,6 +364,8 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) ...@@ -364,6 +364,8 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
static int __net_init net_defaults_init_net(struct net *net) static int __net_init net_defaults_init_net(struct net *net)
{ {
net->core.sysctl_somaxconn = SOMAXCONN; net->core.sysctl_somaxconn = SOMAXCONN;
net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
return 0; return 0;
} }
......
...@@ -1447,6 +1447,15 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ...@@ -1447,6 +1447,15 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
break; break;
} }
case SO_TXREHASH:
if (val < -1 || val > 1) {
ret = -EINVAL;
break;
}
/* Paired with READ_ONCE() in tcp_rtx_synack() */
WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;
default: default:
ret = -ENOPROTOOPT; ret = -ENOPROTOOPT;
break; break;
...@@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, ...@@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_reserved_mem; v.val = sk->sk_reserved_mem;
break; break;
case SO_TXREHASH:
v.val = sk->sk_txrehash;
break;
default: default:
/* We implement the SO_SNDLOWAT etc to not be settable /* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7). * (1003.1g 7).
...@@ -3279,6 +3292,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) ...@@ -3279,6 +3292,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10); WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1; sk->sk_incoming_cpu = -1;
sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk); sk_rx_queue_clear(sk);
/* /*
......
...@@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = { ...@@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = {
.extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax .proc_handler = proc_dointvec_minmax
}, },
{
.procname = "txrehash",
.data = &init_net.core.sysctl_txrehash,
.maxlen = sizeof(u8),
.mode = 0644,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
.proc_handler = proc_dou8vec_minmax,
},
{ } { }
}; };
...@@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); ...@@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
static __net_init int sysctl_core_net_init(struct net *net) static __net_init int sysctl_core_net_init(struct net *net)
{ {
struct ctl_table *tbl; struct ctl_table *tbl, *tmp;
tbl = netns_core_table; tbl = netns_core_table;
if (!net_eq(net, &init_net)) { if (!net_eq(net, &init_net)) {
...@@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net) ...@@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net)
if (tbl == NULL) if (tbl == NULL)
goto err_dup; goto err_dup;
tbl[0].data = &net->core.sysctl_somaxconn; for (tmp = tbl; tmp->procname; tmp++)
tmp->data += (char *)net - (char *)&init_net;
/* Don't export any sysctls to unprivileged users */ /* Don't export any sysctls to unprivileged users */
if (net->user_ns != &init_user_ns) { if (net->user_ns != &init_user_ns) {
......
...@@ -1046,6 +1046,9 @@ int inet_csk_listen_start(struct sock *sk) ...@@ -1046,6 +1046,9 @@ int inet_csk_listen_start(struct sock *sk)
sk->sk_ack_backlog = 0; sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk); inet_csk_delack_init(sk);
if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
/* There is race window here: we announce ourselves listening, /* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port(). * but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only * It is OK, because this socket enters to hash table only
......
...@@ -4092,6 +4092,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) ...@@ -4092,6 +4092,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
struct flowi fl; struct flowi fl;
int res; int res;
/* Paired with WRITE_ONCE() in sock_setsockopt() */
if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
NULL); NULL);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment