Commit c21e1bf4 authored by Jakub Kicinski

Merge branch 'add-a-second-bind-table-hashed-by-port-and-address'

Joanne Koong says:

====================
Add a second bind table hashed by port and address

Currently, there is one bind hashtable (bhash) that hashes by port only.
This patchset adds a second bind table (bhash2) that hashes by port and
address.

The motivation for adding bhash2 is to expedite bind requests in situations
where the port has many sockets in its bhash table entry (eg a large number
of sockets bound to different addresses on the same port), which makes checking
bind conflicts costly especially given that we acquire the table entry spinlock
while doing so, which can cause softirq cpu lockups and can prevent new tcp
connections.

We ran into this problem at Meta where the traffic team binds a large number
of IPs to port 443 and the bind() call took a significant amount of time
which led to cpu softirq lockups, which caused packet drops and other failures
on the machine.

When experimentally testing this on a local server for ~24k sockets bound to
the port, the results seen were:

ipv4:
before - 0.002317 seconds
with bhash2 - 0.000020 seconds

ipv6:
before - 0.002431 seconds
with bhash2 - 0.000021 seconds

The additions to the initial bhash2 submission [0] are:
* Updating bhash2 in the cases where a socket's rcv saddr changes after it has
  been bound
* Adding locks for bhash2 hashbuckets

[0] https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/

v3: https://lore.kernel.org/netdev/20220722195406.1304948-2-joannelkoong@gmail.com/
v2: https://lore.kernel.org/netdev/20220712235310.1935121-1-joannelkoong@gmail.com/
v1: https://lore.kernel.org/netdev/20220623234242.2083895-2-joannelkoong@gmail.com/
====================

Link: https://lore.kernel.org/r/20220822181023.3979645-1-joannelkoong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 0bf73255 1be9ac87
......@@ -25,6 +25,7 @@
#undef INET_CSK_CLEAR_TIMERS
struct inet_bind_bucket;
struct inet_bind2_bucket;
struct tcp_congestion_ops;
/*
......@@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
*
* @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node
* @icsk_bind2_hash: Bind node in the bhash2 table
* @icsk_timeout: Timeout
* @icsk_retransmit_timer: Resend (no ack)
* @icsk_rto: Retransmit timeout
......@@ -83,6 +85,7 @@ struct inet_connection_sock {
struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
struct inet_bind2_bucket *icsk_bind2_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
......
......@@ -23,6 +23,7 @@
#include <net/inet_connection_sock.h>
#include <net/inet_sock.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
......@@ -90,7 +91,28 @@ struct inet_bind_bucket {
struct hlist_head owners;
};
static inline struct net *ib_net(struct inet_bind_bucket *ib)
/* Bucket type stored in the bhash2 bind table, which struct inet_hashinfo
 * describes as hashed by local port and receive address.
 */
struct inet_bind2_bucket {
/* Network namespace of this bucket; read through ib2_net() */
possible_net_t ib_net;
/* NOTE(review): presumably the L3 master device index - confirm */
int l3mdev;
/* Local port this bucket stands for */
unsigned short port;
/* Receive address; the IPv6 member exists only when CONFIG_IPV6 is enabled */
union {
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr v6_rcv_saddr;
#endif
__be32 rcv_saddr;
};
/* Node in the bhash2 inet_bind_hashbucket chain */
struct hlist_node node;
/* List of sockets hashed to this bucket */
struct hlist_head owners;
};
/* Return the network namespace stored in @ib's ib_net field. */
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
/* Return the network namespace stored in the bhash2 bucket @ib. */
static inline struct net *ib2_net(const struct inet_bind2_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
......@@ -133,7 +155,14 @@ struct inet_hashinfo {
* TCP hash as well as the others for fast bind/connect.
*/
struct kmem_cache *bind_bucket_cachep;
/* This bind table is hashed by local port */
struct inet_bind_hashbucket *bhash;
struct kmem_cache *bind2_bucket_cachep;
/* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4)
* or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used
* primarily for expediting bind conflict resolution.
*/
struct inet_bind_hashbucket *bhash2;
unsigned int bhash_size;
/* The 2nd listener table hashed by local port and address */
......@@ -182,14 +211,61 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind_bucket *tb);
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
const struct net *net, unsigned short port,
int l3mdev);
struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
unsigned short port, int l3mdev,
const struct sock *sk);
void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind2_bucket *tb);
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head,
const struct net *net,
unsigned short port, int l3mdev,
const struct sock *sk);
bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb,
const struct net *net, unsigned short port,
int l3mdev, const struct sock *sk);
/* Compute the bhash slot index for local port @lport in namespace @net.
 *
 * The port is offset by net_hash_mix(@net) and the sum is masked with
 * @bhash_size - 1 (the mask form assumes @bhash_size is a power of two -
 * TODO confirm against the table allocation).
 */
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
			       const u32 bhash_size)
{
	const u32 slot_mask = bhash_size - 1;
	const u32 mixed = lport + net_hash_mix(net);

	return mixed & slot_mask;
}
/* Look up the bhash2 hash bucket for socket @sk and local port @port.
 *
 * The hash is taken over (@net, receive address, @port): the IPv6 receive
 * address is used for AF_INET6 sockets when CONFIG_IPV6 is enabled, the
 * IPv4 receive address otherwise. The result indexes @hinfo->bhash2 after
 * masking with @hinfo->bhash_size - 1.
 */
static inline struct inet_bind_hashbucket *
inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk,
		      const struct net *net, unsigned short port)
{
	const u32 slot_mask = hinfo->bhash_size - 1;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return &hinfo->bhash2[ipv6_portaddr_hash(net,
							 &sk->sk_v6_rcv_saddr,
							 port) & slot_mask];
#endif
	return &hinfo->bhash2[ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port) &
			      slot_mask];
}
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port);
/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or
* sk_v6_rcv_saddr (ipv6) changes after it has been bound. The socket's
* rcv_saddr field should already have been updated when this is called.
*/
int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk);
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum);
struct inet_bind2_bucket *tb2, unsigned short port);
/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);
......
......@@ -348,6 +348,7 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
* @sk_bind2_node: bind node in the bhash2 table
*/
struct sock {
/*
......@@ -537,6 +538,7 @@ struct sock {
#endif
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct hlist_node sk_bind2_node;
};
enum sk_pacing {
......@@ -870,6 +872,16 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_add_head(&sk->sk_bind_node, list);
}
/* Unlink @sk's sk_bind2_node from the hlist it is currently on. */
static inline void __sk_del_bind2_node(struct sock *sk)
{
	struct hlist_node *entry = &sk->sk_bind2_node;

	__hlist_del(entry);
}
/* Insert @sk at the head of @list, linked through its sk_bind2_node. */
static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
{
	struct hlist_node *entry = &sk->sk_bind2_node;

	hlist_add_head(entry, list);
}
#define sk_for_each(__sk, list) \
hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
......@@ -887,6 +899,8 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
/* Iterate @__sk over the sockets linked into @list via sk_bind2_node. */
#define sk_for_each_bound_bhash2(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind2_node)
/**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
......
......@@ -45,10 +45,11 @@ static unsigned int dccp_v4_pernet_id __read_mostly;
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
__be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
......@@ -89,9 +90,29 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (inet_opt == NULL || !inet_opt->opt.srr)
daddr = fl4->daddr;
if (inet->inet_saddr == 0)
if (inet->inet_saddr == 0) {
if (inet_csk(sk)->icsk_bind2_hash) {
prev_addr_hashbucket =
inet_bhashfn_portaddr(&dccp_hashinfo, sk,
sock_net(sk),
inet->inet_num);
prev_sk_rcv_saddr = sk->sk_rcv_saddr;
}
inet->inet_saddr = fl4->saddr;
}
sk_rcv_saddr_set(sk, inet->inet_saddr);
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
inet->inet_saddr = 0;
sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
ip_rt_put(rt);
return err;
}
}
inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);
......
......@@ -934,8 +934,26 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}
if (saddr == NULL) {
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct in6_addr prev_v6_rcv_saddr;
if (icsk->icsk_bind2_hash) {
prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo,
sk, sock_net(sk),
inet->inet_num);
prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
}
saddr = &fl6.saddr;
sk->sk_v6_rcv_saddr = *saddr;
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
goto failure;
}
}
}
/* set the source address */
......
......@@ -1120,6 +1120,12 @@ static int __init dccp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind_bucket_cachep)
goto out_free_hashinfo2;
dccp_hashinfo.bind2_bucket_cachep =
kmem_cache_create("dccp_bind2_bucket",
sizeof(struct inet_bind2_bucket), 0,
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind2_bucket_cachep)
goto out_free_bind_bucket_cachep;
/*
* Size and allocate the main established and bind bucket
......@@ -1150,7 +1156,7 @@ static int __init dccp_init(void)
if (!dccp_hashinfo.ehash) {
DCCP_CRIT("Failed to allocate DCCP established hash table");
goto out_free_bind_bucket_cachep;
goto out_free_bind2_bucket_cachep;
}
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
......@@ -1176,14 +1182,24 @@ static int __init dccp_init(void)
goto out_free_dccp_locks;
}
dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *)
__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
if (!dccp_hashinfo.bhash2) {
DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
goto out_free_dccp_bhash;
}
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
spin_lock_init(&dccp_hashinfo.bhash2[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
}
rc = dccp_mib_init();
if (rc)
goto out_free_dccp_bhash;
goto out_free_dccp_bhash2;
rc = dccp_ackvec_init();
if (rc)
......@@ -1207,30 +1223,38 @@ static int __init dccp_init(void)
dccp_ackvec_exit();
out_free_dccp_mib:
dccp_mib_exit();
out_free_dccp_bhash2:
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind2_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_hashinfo2:
inet_hashinfo2_free_mod(&dccp_hashinfo);
out_fail:
dccp_hashinfo.bhash = NULL;
dccp_hashinfo.bhash2 = NULL;
dccp_hashinfo.ehash = NULL;
dccp_hashinfo.bind_bucket_cachep = NULL;
dccp_hashinfo.bind2_bucket_cachep = NULL;
return rc;
}
static void __exit dccp_fini(void)
{
int bhash_order = get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket));
ccid_cleanup_builtins();
dccp_mib_exit();
free_pages((unsigned long)dccp_hashinfo.bhash,
get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket)));
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
free_pages((unsigned long)dccp_hashinfo.ehash,
get_order((dccp_hashinfo.ehash_mask + 1) *
sizeof(struct inet_ehash_bucket)));
......
......@@ -1219,6 +1219,7 @@ EXPORT_SYMBOL(inet_unregister_protosw);
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_bind_hashbucket *prev_addr_hashbucket;
struct inet_sock *inet = inet_sk(sk);
__be32 old_saddr = inet->inet_saddr;
__be32 daddr = inet->inet_daddr;
......@@ -1226,6 +1227,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
struct rtable *rt;
__be32 new_saddr;
struct ip_options_rcu *inet_opt;
int err;
inet_opt = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
......@@ -1240,20 +1242,34 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (IS_ERR(rt))
return PTR_ERR(rt);
sk_setup_caps(sk, &rt->dst);
new_saddr = fl4->saddr;
if (new_saddr == old_saddr)
if (new_saddr == old_saddr) {
sk_setup_caps(sk, &rt->dst);
return 0;
}
prev_addr_hashbucket =
inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk,
sock_net(sk), inet->inet_num);
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
inet->inet_saddr = old_saddr;
inet->inet_rcv_saddr = old_saddr;
ip_rt_put(rt);
return err;
}
sk_setup_caps(sk, &rt->dst);
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
/*
* XXX The only one ugly spot where we need to
* XXX really change the sockets identity after
......
This diff is collapsed.
This diff is collapsed.
......@@ -4742,6 +4742,12 @@ void __init tcp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT,
NULL);
tcp_hashinfo.bind2_bucket_cachep =
kmem_cache_create("tcp_bind2_bucket",
sizeof(struct inet_bind2_bucket), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT,
NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
......@@ -4765,7 +4771,7 @@ void __init tcp_init(void)
panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
2 * sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_mask + 1,
17, /* one slot per 128 KB of memory */
0,
......@@ -4774,9 +4780,12 @@ void __init tcp_init(void)
0,
64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
}
......
......@@ -199,11 +199,12 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
__be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
......@@ -246,10 +247,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!inet_opt || !inet_opt->opt.srr)
daddr = fl4->daddr;
if (!inet->inet_saddr)
if (!inet->inet_saddr) {
if (inet_csk(sk)->icsk_bind2_hash) {
prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
sk, sock_net(sk),
inet->inet_num);
prev_sk_rcv_saddr = sk->sk_rcv_saddr;
}
inet->inet_saddr = fl4->saddr;
}
sk_rcv_saddr_set(sk, inet->inet_saddr);
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
inet->inet_saddr = 0;
sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
ip_rt_put(rt);
return err;
}
}
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
......
......@@ -287,8 +287,25 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}
if (!saddr) {
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct in6_addr prev_v6_rcv_saddr;
if (icsk->icsk_bind2_hash) {
prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
sk, sock_net(sk),
inet->inet_num);
prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
}
saddr = &fl6.saddr;
sk->sk_v6_rcv_saddr = *saddr;
if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
goto failure;
}
}
}
/* set the source address */
......
......@@ -39,4 +39,7 @@ toeplitz
tun
cmsg_sender
unix_connect
tap
\ No newline at end of file
tap
bind_bhash
sk_bind_sendto_listen
sk_connect_zero_addr
......@@ -43,6 +43,7 @@ TEST_PROGS += ndisc_unsolicited_na_test.sh
TEST_PROGS += arp_ndisc_untracked_subnets.sh
TEST_PROGS += stress_reuseport_listen.sh
# Append with "+=": a ":=" here would discard every script added above.
TEST_PROGS += l2_tos_ttl_inherit.sh
TEST_PROGS += bind_bhash.sh
TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
TEST_GEN_FILES = socket nettest
......@@ -64,6 +65,9 @@ TEST_GEN_FILES += cmsg_sender
TEST_GEN_FILES += stress_reuseport_listen
TEST_PROGS += test_vxlan_vnifiltering.sh
TEST_GEN_FILES += io_uring_zerocopy_tx
TEST_GEN_FILES += bind_bhash
TEST_GEN_PROGS += sk_bind_sendto_listen
TEST_GEN_PROGS += sk_connect_zero_addr
TEST_FILES := settings
......@@ -74,3 +78,4 @@ include bpf/Makefile
$(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
$(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
$(OUTPUT)/tcp_inq: LDLIBS += -lpthread
$(OUTPUT)/bind_bhash: LDLIBS += -lpthread
// SPDX-License-Identifier: GPL-2.0
/*
* This times how long it takes to bind to a port when the port already
* has multiple sockets in its bhash table.
*
* In the setup(), we populate the port's bhash table with
* MAX_THREADS * MAX_CONNECTIONS number of entries.
*/
#include <unistd.h>
#include <stdio.h>
#include <netdb.h>
#include <pthread.h>
#include <string.h>
#include <stdbool.h>
#define MAX_THREADS 600
#define MAX_CONNECTIONS 40
static const char *setup_addr_v6 = "::1";
static const char *setup_addr_v4 = "127.0.0.1";
static const char *setup_addr;
static const char *bind_addr;
static const char *port;
bool use_v6;
int ret;
static int fd_array[MAX_THREADS][MAX_CONNECTIONS];
/*
 * Create a SOCK_STREAM socket in the selected address family (AF_INET6 when
 * use_v6 is set, AF_INET otherwise), optionally enable one SOL_SOCKET
 * option on it, and bind it to @addr on the global @port.
 *
 * @opt:  SOL_SOCKET option name to set to 1 before bind(), or 0 for none.
 * @addr: address string resolved with getaddrinfo().
 *
 * Returns the bound socket fd on success. On failure a diagnostic is
 * printed, the socket is closed and a negative value is returned.
 */
static int bind_socket(int opt, const char *addr)
{
	struct addrinfo *res = NULL, hint = {};
	int domain = use_v6 ? AF_INET6 : AF_INET;
	int sock_fd, reuse = 1, err;

	sock_fd = socket(domain, SOCK_STREAM, 0);
	if (sock_fd < 0) {
		perror("socket fd err");
		return sock_fd;
	}

	hint.ai_family = domain;
	hint.ai_socktype = SOCK_STREAM;

	err = getaddrinfo(addr, port, &hint, &res);
	if (err) {
		/* getaddrinfo() reports errors via its return value, not errno */
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(err));
		goto cleanup;
	}

	if (opt) {
		err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse));
		if (err) {
			perror("setsockopt failed");
			goto cleanup;
		}
	}

	err = bind(sock_fd, res->ai_addr, res->ai_addrlen);
	if (err) {
		perror("failed to bind to port");
		goto cleanup;
	}

	/* The resolved address list is no longer needed once bound */
	freeaddrinfo(res);

	return sock_fd;

cleanup:
	if (res)
		freeaddrinfo(res);
	close(sock_fd);
	/* Always negative, so callers testing "< 0" see every failure */
	return -1;
}
/*
 * Thread body: bind MAX_CONNECTIONS sockets to setup_addr on the shared port
 * and store their fds in the int array passed via @arg.
 *
 * On the first bind failure the negative result is stored in the global
 * @ret and the thread exits; array slots not yet filled are left untouched.
 * Returns NULL when every socket was bound.
 */
static void *setup(void *arg)
{
	int *array = arg;
	int sock_fd, i;

	for (i = 0; i < MAX_CONNECTIONS; i++) {
		/*
		 * bind_socket() takes a single option name. OR-ing two option
		 * names together does not enable both, so request
		 * SO_REUSEPORT explicitly.
		 */
		sock_fd = bind_socket(SO_REUSEPORT, setup_addr);
		if (sock_fd < 0) {
			ret = sock_fd;
			pthread_exit(&ret);
		}
		array[i] = sock_fd;
	}

	return NULL;
}
int main(int argc, const char *argv[])
{
int listener_fd, sock_fd, i, j;
pthread_t tid[MAX_THREADS];
clock_t begin, end;
if (argc != 4) {
printf("Usage: listener <port> <ipv6 | ipv4> <bind-addr>\n");
return -1;
}
port = argv[1];
use_v6 = strcmp(argv[2], "ipv6") == 0;
bind_addr = argv[3];
setup_addr = use_v6 ? setup_addr_v6 : setup_addr_v4;
listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr);
if (listen(listener_fd, 100) < 0) {
perror("listen failed");
return -1;
}
/* Set up threads to populate the bhash table entry for the port */
for (i = 0; i < MAX_THREADS; i++)
pthread_create(&tid[i], NULL, setup, fd_array[i]);
for (i = 0; i < MAX_THREADS; i++)
pthread_join(tid[i], NULL);
if (ret)
goto done;
begin = clock();
/* Bind to the same port on a different address */
sock_fd = bind_socket(0, bind_addr);
if (sock_fd < 0)
goto done;
end = clock();
printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC);
/* clean up */
close(sock_fd);
done:
close(listener_fd);
for (i = 0; i < MAX_THREADS; i++) {
for (j = 0; i < MAX_THREADS; i++)
close(fd_array[i][j]);
}
return 0;
}
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
NR_FILES=32768
SAVED_NR_FILES=$(ulimit -n)
# default values
port=443
addr_v6="2001:0db8:0:f101::1"
addr_v4="10.8.8.8"
use_v6=true
addr=""
# Print the command-line synopsis followed by one line per option.
usage() {
	printf 'Usage: %s [-6 | -4] [-p port] [-a address]\n' "$0"
	printf '\t%s\n' \
		"6: use ipv6" \
		"4: use ipv4" \
		"port: Port number" \
		"address: ip address"
}
# Parse command-line options; -a and -p take an argument.
while getopts "ha:p:64" opt; do
	case ${opt} in
	h)
		usage
		exit 0
		;;
	a)
		addr=$OPTARG;;
	p)
		port=$OPTARG;;
	6)
		use_v6=true;;
	4)
		use_v6=false;;
	*)
		# Unknown option or missing argument: do not run with
		# partly parsed settings.
		usage
		exit 1
		;;
	esac
done
# Add the test address (IPv6 on eth0, IPv4 on lo) and raise the open-file
# limit to NR_FILES for this shell.
setup() {
	case "$use_v6" in
	true)
		ip addr add $addr_v6 nodad dev eth0
		;;
	*)
		ip addr add $addr_v4 dev lo
		;;
	esac
	ulimit -n $NR_FILES
}
# Remove the address added by setup() and restore the saved open-file limit.
cleanup() {
	case "$use_v6" in
	true)
		ip addr del $addr_v6 dev eth0
		;;
	*)
		ip addr del $addr_v4/32 dev lo
		;;
	esac
	ulimit -n $SAVED_NR_FILES
}
# An address given with -a overrides the default for both families.
if [[ "$addr" != "" ]]; then
	addr_v4=$addr;
	addr_v6=$addr;
fi
setup
if [[ "$use_v6" == true ]] ; then
	./bind_bhash $port "ipv6" $addr_v6
	rc=$?
else
	./bind_bhash $port "ipv4" $addr_v4
	rc=$?
fi
cleanup
# Report the test binary's status rather than cleanup's.
exit $rc
// SPDX-License-Identifier: GPL-2.0
#include <arpa/inet.h>
#include <error.h>
#include <errno.h>
#include <unistd.h>
/*
 * Bind two SO_REUSEADDR TCP sockets to [::]:20000 and issue a zero-length
 * MSG_FASTOPEN sendto() to that same address on each.
 *
 * The program treats the following as success: sendto() on fd1 returns 0,
 * sendto() on fd2 returns -1, and listen() on fd2 then succeeds. Any other
 * outcome is reported through error(1, ...), which terminates the process.
 */
int main(void)
{
	struct sockaddr_in6 bind_addr = {
		.sin6_family = AF_INET6,
		.sin6_port = htons(20000),
		.sin6_flowinfo = htonl(0),
		.sin6_addr = {},
		.sin6_scope_id = 0,
	};
	struct sockaddr *sa = (struct sockaddr *)&bind_addr;
	int fd1, fd2, one = 1;
	int rc = -1;

	inet_pton(AF_INET6, "::", &bind_addr.sin6_addr);

	/* First socket: bind, then connect to itself via fast open */
	fd1 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
	if (fd1 < 0) {
		error(1, errno, "socket fd1");
		return -1;
	}

	if (setsockopt(fd1, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) != 0) {
		error(1, errno, "setsockopt(SO_REUSEADDR) fd1");
		goto close_fd1;
	}

	if (bind(fd1, sa, sizeof(bind_addr)) != 0) {
		error(1, errno, "bind fd1");
		goto close_fd1;
	}

	if (sendto(fd1, NULL, 0, MSG_FASTOPEN, sa, sizeof(bind_addr)) != 0) {
		error(1, errno, "sendto fd1");
		goto close_fd1;
	}

	/* Second socket: same bind, but its sendto() is expected to fail */
	fd2 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
	if (fd2 < 0) {
		error(1, errno, "socket fd2");
		goto close_fd1;
	}

	if (setsockopt(fd2, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) != 0) {
		error(1, errno, "setsockopt(SO_REUSEADDR) fd2");
		goto close_fd2;
	}

	if (bind(fd2, sa, sizeof(bind_addr)) != 0) {
		error(1, errno, "bind fd2");
		goto close_fd2;
	}

	if (sendto(fd2, NULL, 0, MSG_FASTOPEN, sa, sizeof(bind_addr)) != -1) {
		error(1, errno, "sendto fd2");
		goto close_fd2;
	}

	if (listen(fd2, 0) != 0) {
		error(1, errno, "listen");
		goto close_fd2;
	}

	rc = 0;

close_fd2:
	close(fd2);
close_fd1:
	close(fd1);
	return rc;
}
// SPDX-License-Identifier: GPL-2.0
#include <arpa/inet.h>
#include <error.h>
#include <errno.h>
#include <unistd.h>
/*
 * Bind and listen on [::]:20000 with one TCP socket, then connect() a second
 * socket to that same all-zero address and port.
 *
 * Returns 0 when every step succeeds. Each failure is reported through
 * error(1, ...), which prints the errno text and terminates the process.
 */
int main(void)
{
	int fd1, fd2, one = 1;
	struct sockaddr_in6 bind_addr = {
		.sin6_family = AF_INET6,
		.sin6_port = htons(20000),
		.sin6_flowinfo = htonl(0),
		.sin6_addr = {},
		.sin6_scope_id = 0,
	};

	inet_pton(AF_INET6, "::", &bind_addr.sin6_addr);

	fd1 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
	if (fd1 < 0) {
		error(1, errno, "socket fd1");
		return -1;
	}

	if (setsockopt(fd1, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) {
		error(1, errno, "setsockopt(SO_REUSEADDR) fd1");
		goto out_err1;
	}

	if (bind(fd1, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) {
		error(1, errno, "bind fd1");
		goto out_err1;
	}

	if (listen(fd1, 0)) {
		error(1, errno, "listen");
		goto out_err1;
	}

	fd2 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP);
	if (fd2 < 0) {
		error(1, errno, "socket fd2");
		goto out_err1;
	}

	if (connect(fd2, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) {
		/* Name the call that actually failed: this is connect() */
		error(1, errno, "connect fd2");
		goto out_err2;
	}

	close(fd2);
	close(fd1);
	return 0;

out_err2:
	close(fd2);
out_err1:
	close(fd1);
	return -1;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment