Commit 37ba017d authored by Eric Dumazet, committed by David S. Miller

ipv4/tcp: do not use per netns ctl sockets

TCP ipv4 uses per-cpu/per-netns ctl sockets in order to send
RST and some ACK packets (on behalf of TIMEWAIT sockets).
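
For orientation, a minimal sketch of the storage change, condensed from the hunks below (the struct name netns_ipv4_excerpt is illustrative only, not the real type):

    /* Old scheme (removed): every netns owns a per-cpu array of pointers, and
     * tcp_sk_init() creates one kernel socket per possible cpu for each netns.
     */
    struct netns_ipv4_excerpt {
            struct sock * __percpu  *tcp_sk;   /* alloc_percpu(struct sock *) */
    };

    /* New scheme: one static per-cpu pointer, filled once at boot in
     * tcp_v4_init() and shared by every network namespace.
     */
    static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);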

This adds memory and cpu costs, which do not seem needed. Now that typical
servers have 256 or more cores, this adds a considerable tax on netns users.

These ctl sockets are used from BH context, never receive packets, and store
no persistent state other than the 'struct net' pointer needed to use IPv4
output functions.
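
In practice each sender borrows the current cpu's socket for the duration of one transmit. A condensed sketch of the pattern the hunks below adopt, assuming a hypothetical helper example_send_ctl_packet() (the elided transmit call stands in for the real RST/ACK reply path):

    static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

    static void example_send_ctl_packet(struct net *net)
    {
            struct sock *ctl_sk;

            local_bh_disable();
            ctl_sk = this_cpu_read(ipv4_tcp_sk); /* created once in tcp_v4_init() */
            sock_net_set(ctl_sk, net);           /* point it at the caller's netns */

            /* ... build and transmit the RST/ACK reply ... */

            sock_net_set(ctl_sk, &init_net);     /* give it back before re-enabling BH */
            local_bh_enable();
    }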

Note that I attempted a related change in the past, which had to be hot-fixed
in commit bdbbb852 ("ipv4: tcp: get rid of ugly unicast_sock").

This patch could very well surface old bugs in layers that do not take care
of sk->sk_kern_sock properly.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 6a17b961
include/net/netns/ipv4.h
@@ -73,7 +73,6 @@ struct netns_ipv4 {
 	struct sock		*mc_autojoin_sk;
 	struct inet_peer_base	*peers;
-	struct sock * __percpu	*tcp_sk;
 	struct fqdir		*fqdir;
 	u8 sysctl_icmp_echo_ignore_all;
net/ipv4/tcp_ipv4.c
@@ -91,6 +91,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 struct inet_hashinfo tcp_hashinfo;
 EXPORT_SYMBOL(tcp_hashinfo);
 
+static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+
 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
 {
 	return secure_tcp_seq(ip_hdr(skb)->daddr,
@@ -810,7 +812,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.tos = ip_hdr(skb)->tos;
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+	ctl_sk = this_cpu_read(ipv4_tcp_sk);
+	sock_net_set(ctl_sk, net);
 	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
@@ -825,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 			      transmit_time);
 
 	ctl_sk->sk_mark = 0;
+	sock_net_set(ctl_sk, &init_net);
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 	local_bh_enable();
@@ -908,7 +912,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.tos = tos;
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+	ctl_sk = this_cpu_read(ipv4_tcp_sk);
+	sock_net_set(ctl_sk, net);
 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -921,6 +926,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 			      transmit_time);
 
 	ctl_sk->sk_mark = 0;
+	sock_net_set(ctl_sk, &init_net);
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	local_bh_enable();
 }
@@ -3111,41 +3117,14 @@ EXPORT_SYMBOL(tcp_prot);
 static void __net_exit tcp_sk_exit(struct net *net)
 {
-	int cpu;
-
 	if (net->ipv4.tcp_congestion_control)
 		bpf_module_put(net->ipv4.tcp_congestion_control,
 			       net->ipv4.tcp_congestion_control->owner);
-
-	for_each_possible_cpu(cpu)
-		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
-	free_percpu(net->ipv4.tcp_sk);
 }
 
 static int __net_init tcp_sk_init(struct net *net)
 {
-	int res, cpu, cnt;
-
-	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
-	if (!net->ipv4.tcp_sk)
-		return -ENOMEM;
-
-	for_each_possible_cpu(cpu) {
-		struct sock *sk;
-
-		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
-					   IPPROTO_TCP, net);
-		if (res)
-			goto fail;
-		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-
-		/* Please enforce IP_DF and IPID==0 for RST and
-		 * ACK sent in SYN-RECV and TIME-WAIT state.
-		 */
-		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
-
-		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
-	}
+	int cnt;
 
 	net->ipv4.sysctl_tcp_ecn = 2;
 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3229,10 +3208,6 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.tcp_congestion_control = &tcp_reno;
 
 	return 0;
-fail:
-	tcp_sk_exit(net);
-
-	return res;
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
@@ -3324,6 +3299,24 @@ static void __init bpf_iter_register(void)
 void __init tcp_v4_init(void)
 {
+	int cpu, res;
+
+	for_each_possible_cpu(cpu) {
+		struct sock *sk;
+
+		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+					   IPPROTO_TCP, &init_net);
+		if (res)
+			panic("Failed to create the TCP control socket.\n");
+		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+		/* Please enforce IP_DF and IPID==0 for RST and
+		 * ACK sent in SYN-RECV and TIME-WAIT state.
+		 */
+		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+		per_cpu(ipv4_tcp_sk, cpu) = sk;
+	}
 	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");