Commit aa5334b1 authored by Jakub Kicinski

Merge branch 'add-a-bhash2-table-hashed-by-port-address'

Joanne Koong says:

====================
Add a bhash2 table hashed by port + address

This patchset proposes adding a bhash2 table that hashes by port and address.
The motivation behind bhash2 is to expedite bind requests in situations where
the port has many sockets in its bhash table entry, which makes checking for
bind conflicts costly, especially since we acquire the table entry's spinlock
while doing so. This can cause softirq CPU lockups and can prevent new TCP
connections from being established.

We ran into this problem at Meta, where the traffic team binds a large number
of IPs to port 443. There, the bind() calls took a significant amount of time,
which led to CPU softirq lockups that in turn caused packet drops and other
failures on the machine.
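
As a rough user-space illustration of the cost (this toy program is not part
of the patchset, and every name in it is invented): with a per-port chain, a
bind conflict check must walk every socket already bound to the port, so each
additional bind pays for all the previous ones. Hashing by port + address
instead gives each address its own short chain, which is what bhash2 does.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct owner { unsigned int addr; struct owner *next; };

int main(void)
{
        /* ~24k sockets bound to one port, as in the Meta case above */
        enum { NOWNERS = 24000 };
        struct owner *head = NULL, *o;
        volatile unsigned int conflicts = 0;
        clock_t begin, end;

        for (unsigned int i = 0; i < NOWNERS; i++) {
                o = malloc(sizeof(*o));
                o->addr = i;
                o->next = head;
                head = o;
        }

        begin = clock();
        /* a bind on a fresh address still walks every existing owner */
        for (o = head; o; o = o->next)
                if (o->addr == NOWNERS + 1)
                        conflicts++;
        end = clock();

        printf("one conflict scan over %u owners: %f s\n", (unsigned)NOWNERS,
               (double)(end - begin) / CLOCKS_PER_SEC);
        return 0;
}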

The patches are as follows:
1/2 - Adds a second bhash table (bhash2) hashed by port and address
2/2 - Adds a test for timing how long an additional bind request takes when
the bhash entry is populated

When testing this experimentally on a local server with ~24k sockets bound to
the port, the results were:

ipv4:
before - 0.002317 seconds
with bhash2 - 0.000018 seconds

ipv6:
before - 0.002431 seconds
with bhash2 - 0.000021 seconds
====================

Link: https://lore.kernel.org/r/20220520001834.2247810-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents eac67d83 538aaf9b
@@ -25,6 +25,7 @@
 #undef INET_CSK_CLEAR_TIMERS

 struct inet_bind_bucket;
+struct inet_bind2_bucket;
 struct tcp_congestion_ops;

 /*
@@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
  *
  * @icsk_accept_queue: FIFO of established children
  * @icsk_bind_hash: Bind node
+ * @icsk_bind2_hash: Bind node in the bhash2 table
  * @icsk_timeout: Timeout
  * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_rto: Retransmit timeout
@@ -83,6 +85,7 @@ struct inet_connection_sock {
        struct inet_sock icsk_inet;
        struct request_sock_queue icsk_accept_queue;
        struct inet_bind_bucket *icsk_bind_hash;
+       struct inet_bind2_bucket *icsk_bind2_hash;
        unsigned long icsk_timeout;
        struct timer_list icsk_retransmit_timer;
        struct timer_list icsk_delack_timer;
@@ -90,11 +90,32 @@ struct inet_bind_bucket {
        struct hlist_head owners;
 };

+struct inet_bind2_bucket {
+       possible_net_t ib_net;
+       int l3mdev;
+       unsigned short port;
+       union {
+#if IS_ENABLED(CONFIG_IPV6)
+               struct in6_addr v6_rcv_saddr;
+#endif
+               __be32 rcv_saddr;
+       };
+       /* Node in the inet2_bind_hashbucket chain */
+       struct hlist_node node;
+       /* List of sockets hashed to this bucket */
+       struct hlist_head owners;
+};
+
 static inline struct net *ib_net(struct inet_bind_bucket *ib)
 {
        return read_pnet(&ib->ib_net);
 }

+static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
+{
+       return read_pnet(&ib->ib_net);
+}
+
 #define inet_bind_bucket_for_each(tb, head) \
        hlist_for_each_entry(tb, head, node)
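
For intuition, here is a small user-space model of a (port, address)-keyed
bucket table in the shape of inet_bind2_bucket above. This is a sketch only:
the hash function, the IPv4-only key, and all names are illustrative, not the
kernel's.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS 256

struct bind2_bucket {
        uint16_t port;
        uint32_t addr;                  /* IPv4 only, for brevity */
        struct bind2_bucket *next;      /* chain within one hash slot */
};

static struct bind2_bucket *table[NBUCKETS];

/* Mix port and address into one slot index; analogous in spirit only,
 * the kernel's portaddr hash differs. */
static unsigned int portaddr_hash(uint16_t port, uint32_t addr)
{
        return (addr * 2654435761u ^ port) % NBUCKETS;
}

static struct bind2_bucket *find_or_create(uint16_t port, uint32_t addr)
{
        unsigned int slot = portaddr_hash(port, addr);
        struct bind2_bucket *b;

        for (b = table[slot]; b; b = b->next)
                if (b->port == port && b->addr == addr)
                        return b;       /* conflict check is per-address */

        b = calloc(1, sizeof(*b));
        b->port = port;
        b->addr = addr;
        b->next = table[slot];
        table[slot] = b;
        return b;
}

int main(void)
{
        /* two addresses on the same port land in (likely) different slots,
         * so a bind on one address never walks the other's sockets */
        find_or_create(443, 0x7f000001);        /* 127.0.0.1:443 */
        find_or_create(443, 0x7f000002);        /* 127.0.0.2:443 */
        printf("slots: %u vs %u\n",
               portaddr_hash(443, 0x7f000001),
               portaddr_hash(443, 0x7f000002));
        return 0;
}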
@@ -103,6 +124,15 @@ struct inet_bind_hashbucket {
        struct hlist_head chain;
 };

+/* This is synchronized using the inet_bind_hashbucket's spinlock.
+ * Instead of having separate spinlocks, the inet_bind2_hashbucket can share
+ * the inet_bind_hashbucket's given that in every case where the bhash2 table
+ * is useful, a lookup in the bhash table also occurs.
+ */
+struct inet_bind2_hashbucket {
+       struct hlist_head chain;
+};
+
 /* Sockets can be hashed in established or listening table.
  * We must use different 'nulls' end-of-chain value for all hash buckets :
  * A socket might transition from ESTABLISH to LISTEN state without
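
A loose user-space analogue of that locking rule (all names invented, and this
glosses over the kernel's separate hash functions for the two tables): the
port-keyed table owns the locks, and the port+address chains are only touched
while the corresponding port lock is held, so they need no locks of their own.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

#define NSLOTS 128

struct entry { uint32_t key; struct entry *next; };

static pthread_mutex_t port_lock[NSLOTS];       /* owned by port-keyed table */
static struct entry *by_port[NSLOTS];           /* "bhash": keyed by port */
static struct entry *by_portaddr[NSLOTS];       /* "bhash2": port + address */

/* every port+address update happens inside a port-keyed operation, so the
 * port's lock covers both insertions */
static void bind_one(uint16_t port, uint32_t addr)
{
        struct entry *e1 = calloc(1, sizeof(*e1));
        struct entry *e2 = calloc(1, sizeof(*e2));

        pthread_mutex_lock(&port_lock[port % NSLOTS]);
        e1->key = port;
        e1->next = by_port[port % NSLOTS];
        by_port[port % NSLOTS] = e1;

        e2->key = port ^ addr;                  /* toy port+addr key */
        e2->next = by_portaddr[e2->key % NSLOTS];
        by_portaddr[e2->key % NSLOTS] = e2;
        pthread_mutex_unlock(&port_lock[port % NSLOTS]);
}

int main(void)
{
        for (int i = 0; i < NSLOTS; i++)
                pthread_mutex_init(&port_lock[i], NULL);
        bind_one(443, 0x7f000001);
        return 0;
}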
@@ -134,6 +164,12 @@ struct inet_hashinfo {
         */
        struct kmem_cache *bind_bucket_cachep;
        struct inet_bind_hashbucket *bhash;
+       /* The 2nd binding table hashed by port and address.
+        * This is used primarily for expediting the resolution of bind
+        * conflicts.
+        */
+       struct kmem_cache *bind2_bucket_cachep;
+       struct inet_bind2_hashbucket *bhash2;
        unsigned int bhash_size;

        /* The 2nd listener table hashed by local port and address */
@@ -193,6 +229,36 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
                              struct inet_bind_bucket *tb);

+static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb,
+                                          struct net *net,
+                                          const unsigned short port,
+                                          int l3mdev)
+{
+       return net_eq(ib_net(tb), net) && tb->port == port &&
+              tb->l3mdev == l3mdev;
+}
+
+struct inet_bind2_bucket *
+inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
+                        struct inet_bind2_hashbucket *head,
+                        const unsigned short port, int l3mdev,
+                        const struct sock *sk);
+
+void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
+                              struct inet_bind2_bucket *tb);
+
+struct inet_bind2_bucket *
+inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
+                      const unsigned short port, int l3mdev,
+                      struct sock *sk,
+                      struct inet_bind2_hashbucket **head);
+
+bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
+                                      struct net *net,
+                                      const unsigned short port,
+                                      int l3mdev,
+                                      const struct sock *sk);
+
 static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
                               const u32 bhash_size)
 {
@@ -200,7 +266,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 }

 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
-                   const unsigned short snum);
+                   struct inet_bind2_bucket *tb2, const unsigned short snum);

 /* Caller must disable local BH processing. */
 int __inet_inherit_port(const struct sock *sk, struct sock *child);
@@ -348,6 +348,7 @@ struct sk_filter;
  *     @sk_txtime_report_errors: set report errors mode for SO_TXTIME
  *     @sk_txtime_unused: unused txtime flags
  *     @ns_tracker: tracker for netns reference
+ *     @sk_bind2_node: bind node in the bhash2 table
  */
 struct sock {
        /*
@@ -537,6 +538,7 @@ struct sock {
 #endif
        struct rcu_head sk_rcu;
        netns_tracker ns_tracker;
+       struct hlist_node sk_bind2_node;
 };

 enum sk_pacing {
@@ -817,6 +819,16 @@ static inline void sk_add_bind_node(struct sock *sk,
        hlist_add_head(&sk->sk_bind_node, list);
 }

+static inline void __sk_del_bind2_node(struct sock *sk)
+{
+       __hlist_del(&sk->sk_bind2_node);
+}
+
+static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
+{
+       hlist_add_head(&sk->sk_bind2_node, list);
+}
+
 #define sk_for_each(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_node)
 #define sk_for_each_rcu(__sk, list) \
@@ -834,6 +846,8 @@ static inline void sk_add_bind_node(struct sock *sk,
        hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_bind_node)
+#define sk_for_each_bound_bhash2(__sk, list) \
+       hlist_for_each_entry(__sk, list, sk_bind2_node)

 /**
  * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
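
The sk_for_each_bound_bhash2() macro is the usual intrusive-hlist iteration:
the node is embedded in the socket, and the macro recovers the containing
object from the node pointer. A self-contained user-space sketch of the same
pattern (all names here are invented):

#include <stddef.h>
#include <stdio.h>

struct hnode { struct hnode *next; };

struct sock_like {
        int port;
        struct hnode bind2_node;        /* embedded, like sk_bind2_node */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
        struct sock_like a = { .port = 443 }, b = { .port = 443 };
        struct hnode *head = &a.bind2_node, *pos;

        a.bind2_node.next = &b.bind2_node;
        b.bind2_node.next = NULL;

        /* analogous to sk_for_each_bound_bhash2(__sk, list) */
        for (pos = head; pos; pos = pos->next) {
                struct sock_like *sk =
                        container_of(pos, struct sock_like, bind2_node);
                printf("bound port %d\n", sk->port);
        }
        return 0;
}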
@@ -1120,6 +1120,12 @@ static int __init dccp_init(void)
                                  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
        if (!dccp_hashinfo.bind_bucket_cachep)
                goto out_free_hashinfo2;
+       dccp_hashinfo.bind2_bucket_cachep =
+               kmem_cache_create("dccp_bind2_bucket",
+                                 sizeof(struct inet_bind2_bucket), 0,
+                                 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
+       if (!dccp_hashinfo.bind2_bucket_cachep)
+               goto out_free_bind_bucket_cachep;

        /*
         * Size and allocate the main established and bind bucket
@@ -1150,7 +1156,7 @@ static int __init dccp_init(void)

        if (!dccp_hashinfo.ehash) {
                DCCP_CRIT("Failed to allocate DCCP established hash table");
-               goto out_free_bind_bucket_cachep;
+               goto out_free_bind2_bucket_cachep;
        }

        for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
@@ -1176,14 +1182,23 @@ static int __init dccp_init(void)
                goto out_free_dccp_locks;
        }

+       dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *)
+               __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
+
+       if (!dccp_hashinfo.bhash2) {
+               DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
+               goto out_free_dccp_bhash;
+       }
+
        for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
                spin_lock_init(&dccp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+               INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
        }

        rc = dccp_mib_init();
        if (rc)
-               goto out_free_dccp_bhash;
+               goto out_free_dccp_bhash2;

        rc = dccp_ackvec_init();
        if (rc)
@@ -1207,30 +1222,38 @@ static int __init dccp_init(void)
        dccp_ackvec_exit();
 out_free_dccp_mib:
        dccp_mib_exit();
+out_free_dccp_bhash2:
+       free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
 out_free_dccp_bhash:
        free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
 out_free_dccp_locks:
        inet_ehash_locks_free(&dccp_hashinfo);
 out_free_dccp_ehash:
        free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
+out_free_bind2_bucket_cachep:
+       kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
 out_free_bind_bucket_cachep:
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 out_free_hashinfo2:
        inet_hashinfo2_free_mod(&dccp_hashinfo);
 out_fail:
        dccp_hashinfo.bhash = NULL;
+       dccp_hashinfo.bhash2 = NULL;
        dccp_hashinfo.ehash = NULL;
        dccp_hashinfo.bind_bucket_cachep = NULL;
+       dccp_hashinfo.bind2_bucket_cachep = NULL;
        return rc;
 }
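
The new out_free_dccp_bhash2 and out_free_bind2_bucket_cachep labels slot into
the kernel's usual goto-unwind error handling: labels are ordered so that
jumping to one releases everything allocated before the failure point. A
minimal user-space sketch of the idiom (function and label names invented):

#include <stdlib.h>

static int setup_three(void **a, void **b, void **c)
{
        *a = malloc(64);
        if (!*a)
                goto out_fail;
        *b = malloc(64);
        if (!*b)
                goto out_free_a;
        *c = malloc(64);
        if (!*c)
                goto out_free_b;
        return 0;

out_free_b:     /* each new allocation adds a label above the older ones */
        free(*b);
out_free_a:
        free(*a);
out_fail:
        return -1;
}

int main(void)
{
        void *a, *b, *c;

        return setup_three(&a, &b, &c);
}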
 static void __exit dccp_fini(void)
 {
+       int bhash_order = get_order(dccp_hashinfo.bhash_size *
+                                   sizeof(struct inet_bind_hashbucket));
+
        ccid_cleanup_builtins();
        dccp_mib_exit();
-       free_pages((unsigned long)dccp_hashinfo.bhash,
-                  get_order(dccp_hashinfo.bhash_size *
-                            sizeof(struct inet_bind_hashbucket)));
+       free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
+       free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
        free_pages((unsigned long)dccp_hashinfo.ehash,
                   get_order((dccp_hashinfo.ehash_mask + 1) *
                             sizeof(struct inet_ehash_bucket)));
(The diff for one further file is collapsed in the original view.)
@@ -81,6 +81,41 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
        return tb;
 }

+struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
+                                                  struct net *net,
+                                                  struct inet_bind2_hashbucket *head,
+                                                  const unsigned short port,
+                                                  int l3mdev,
+                                                  const struct sock *sk)
+{
+       struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
+
+       if (tb) {
+               write_pnet(&tb->ib_net, net);
+               tb->l3mdev = l3mdev;
+               tb->port = port;
+#if IS_ENABLED(CONFIG_IPV6)
+               if (sk->sk_family == AF_INET6)
+                       tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+               else
+#endif
+                       tb->rcv_saddr = sk->sk_rcv_saddr;
+               INIT_HLIST_HEAD(&tb->owners);
+               hlist_add_head(&tb->node, &head->chain);
+       }
+
+       return tb;
+}
+
+static bool bind2_bucket_addr_match(struct inet_bind2_bucket *tb2, struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       if (sk->sk_family == AF_INET6)
+               return ipv6_addr_equal(&tb2->v6_rcv_saddr,
+                                      &sk->sk_v6_rcv_saddr);
+#endif
+       return tb2->rcv_saddr == sk->sk_rcv_saddr;
+}
+
 /*
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
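
bind2_bucket_addr_match() leans on the union declared in inet_bind2_bucket:
one storage area serves both address families, and the socket's family decides
which view to compare. A user-space sketch of that layout (names and helpers
here are invented):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

struct addr_key {
        union {
                struct in6_addr v6;
                uint32_t v4;    /* aliases the first 4 bytes of the union */
        };
};

static bool key_match(const struct addr_key *k, int family, const void *addr)
{
        if (family == AF_INET6)
                return memcmp(&k->v6, addr, sizeof(k->v6)) == 0;
        return k->v4 == *(const uint32_t *)addr;
}

int main(void)
{
        struct addr_key k = { .v4 = 0x7f000001 };
        uint32_t probe = 0x7f000001;

        return key_match(&k, AF_INET, &probe) ? 0 : 1;
}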
@@ -92,12 +127,25 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
        }
 }

+/* Caller must hold the lock for the corresponding hashbucket in the bhash table
+ * with local BH disabled
+ */
+void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
+{
+       if (hlist_empty(&tb->owners)) {
+               __hlist_del(&tb->node);
+               kmem_cache_free(cachep, tb);
+       }
+}
+
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
-                   const unsigned short snum)
+                   struct inet_bind2_bucket *tb2, const unsigned short snum)
 {
        inet_sk(sk)->inet_num = snum;
        sk_add_bind_node(sk, &tb->owners);
        inet_csk(sk)->icsk_bind_hash = tb;
+       sk_add_bind2_node(sk, &tb2->owners);
+       inet_csk(sk)->icsk_bind2_hash = tb2;
 }

 /*
@@ -109,6 +157,7 @@ static void __inet_put_port(struct sock *sk)
        const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
                        hashinfo->bhash_size);
        struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+       struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;

        spin_lock(&head->lock);
@@ -117,6 +166,13 @@ static void __inet_put_port(struct sock *sk)
        inet_csk(sk)->icsk_bind_hash = NULL;
        inet_sk(sk)->inet_num = 0;
        inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+
+       if (inet_csk(sk)->icsk_bind2_hash) {
+               tb2 = inet_csk(sk)->icsk_bind2_hash;
+               __sk_del_bind2_node(sk);
+               inet_csk(sk)->icsk_bind2_hash = NULL;
+               inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+       }
        spin_unlock(&head->lock);
 }
@@ -133,14 +189,19 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
        struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
        unsigned short port = inet_sk(child)->inet_num;
        const int bhash = inet_bhashfn(sock_net(sk), port,
                        table->bhash_size);
        struct inet_bind_hashbucket *head = &table->bhash[bhash];
+       struct inet_bind2_hashbucket *head_bhash2;
+       bool created_inet_bind_bucket = false;
+       struct net *net = sock_net(sk);
+       struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
        int l3mdev;

        spin_lock(&head->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
-       if (unlikely(!tb)) {
+       tb2 = inet_csk(sk)->icsk_bind2_hash;
+       if (unlikely(!tb || !tb2)) {
                spin_unlock(&head->lock);
                return -ENOENT;
        }
@@ -153,25 +214,45 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
                 * as that of the child socket. We have to look up or
                 * create a new bind bucket for the child here. */
                inet_bind_bucket_for_each(tb, &head->chain) {
-                       if (net_eq(ib_net(tb), sock_net(sk)) &&
-                           tb->l3mdev == l3mdev && tb->port == port)
+                       if (check_bind_bucket_match(tb, net, port, l3mdev))
                                break;
                }
                if (!tb) {
                        tb = inet_bind_bucket_create(table->bind_bucket_cachep,
-                                                    sock_net(sk), head, port,
-                                                    l3mdev);
+                                                    net, head, port, l3mdev);
                        if (!tb) {
                                spin_unlock(&head->lock);
                                return -ENOMEM;
                        }
+                       created_inet_bind_bucket = true;
                }
                inet_csk_update_fastreuse(tb, child);
+
+               goto bhash2_find;
+       } else if (!bind2_bucket_addr_match(tb2, child)) {
+               l3mdev = inet_sk_bound_l3mdev(sk);
+
+bhash2_find:
+               tb2 = inet_bind2_bucket_find(table, net, port, l3mdev, child,
+                                            &head_bhash2);
+               if (!tb2) {
+                       tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
+                                                      net, head_bhash2, port,
+                                                      l3mdev, child);
+                       if (!tb2)
+                               goto error;
+               }
        }
-       inet_bind_hash(child, tb, port);
+       inet_bind_hash(child, tb, tb2, port);
        spin_unlock(&head->lock);

        return 0;
+
+error:
+       if (created_inet_bind_bucket)
+               inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
+       spin_unlock(&head->lock);
+       return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
@@ -675,6 +756,76 @@ void inet_unhash(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(inet_unhash);

+static bool check_bind2_bucket_match(struct inet_bind2_bucket *tb,
+                                    struct net *net, unsigned short port,
+                                    int l3mdev, struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       if (sk->sk_family == AF_INET6)
+               return net_eq(ib2_net(tb), net) && tb->port == port &&
+                       tb->l3mdev == l3mdev &&
+                       ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
+       else
+#endif
+               return net_eq(ib2_net(tb), net) && tb->port == port &&
+                       tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr;
+}
+
+bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
+                                      struct net *net, const unsigned short port,
+                                      int l3mdev, const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       struct in6_addr nulladdr = {};
+
+       if (sk->sk_family == AF_INET6)
+               return net_eq(ib2_net(tb), net) && tb->port == port &&
+                       tb->l3mdev == l3mdev &&
+                       ipv6_addr_equal(&tb->v6_rcv_saddr, &nulladdr);
+       else
+#endif
+               return net_eq(ib2_net(tb), net) && tb->port == port &&
+                       tb->l3mdev == l3mdev && tb->rcv_saddr == 0;
+}
+static struct inet_bind2_hashbucket *
+inet_bhashfn_portaddr(struct inet_hashinfo *hinfo, const struct sock *sk,
+                     const struct net *net, unsigned short port)
+{
+       u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+       if (sk->sk_family == AF_INET6)
+               hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
+       else
+#endif
+               hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
+       return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
+}
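
Note the slot selection relies on bhash_size being a power of two, so that
hash & (bhash_size - 1) is an inexpensive hash % bhash_size. A quick
stand-alone check of that identity:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t size = 1u << 16;       /* any power of two */

        for (uint32_t hash = 0; hash < 200000; hash += 777)
                assert((hash & (size - 1)) == (hash % size));
        return 0;
}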
+/* This should only be called when the spinlock for the socket's corresponding
+ * bind_hashbucket is held
+ */
+struct inet_bind2_bucket *
+inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
+                      const unsigned short port, int l3mdev, struct sock *sk,
+                      struct inet_bind2_hashbucket **head)
+{
+       struct inet_bind2_bucket *bhash2 = NULL;
+       struct inet_bind2_hashbucket *h;
+
+       h = inet_bhashfn_portaddr(hinfo, sk, net, port);
+       inet_bind_bucket_for_each(bhash2, &h->chain) {
+               if (check_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
+                       break;
+       }
+
+       if (head)
+               *head = h;
+
+       return bhash2;
+}
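
inet_bind2_bucket_find() both returns the bucket and reports the searched
chain through the head output parameter, so on a miss the caller can create
straight into the right chain without rehashing. A generic user-space sketch
of that find-or-create shape (all names invented):

#include <stdlib.h>

struct bucket { int key; struct bucket *next; };

static struct bucket *chains[64];

/* find a bucket, and also tell the caller which chain was searched so a
 * miss can be followed by an insert into the same chain */
static struct bucket *find(int key, struct bucket ***headp)
{
        struct bucket **head = &chains[key % 64], *b;

        if (headp)
                *headp = head;
        for (b = *head; b; b = b->next)
                if (b->key == key)
                        return b;
        return NULL;
}

static struct bucket *find_or_create(int key)
{
        struct bucket **head, *b = find(key, &head);

        if (!b) {
                b = calloc(1, sizeof(*b));
                b->key = key;
                b->next = *head;
                *head = b;
        }
        return b;
}

int main(void)
{
        return find_or_create(42) == find_or_create(42) ? 0 : 1;
}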
 /* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
  * Note that we use 32bit integers (vs RFC 'short integers')
  * because 2^16 is not a multiple of num_ephemeral and this
@@ -695,10 +846,13 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 {
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_timewait_sock *tw = NULL;
+       struct inet_bind2_hashbucket *head2;
        struct inet_bind_hashbucket *head;
        int port = inet_sk(sk)->inet_num;
        struct net *net = sock_net(sk);
+       struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
+       bool tb_created = false;
        u32 remaining, offset;
        int ret, i, low, high;
        int l3mdev;
@@ -755,8 +909,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                 * the established check is already unique enough.
                 */
                inet_bind_bucket_for_each(tb, &head->chain) {
-                       if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
-                           tb->port == port) {
+                       if (check_bind_bucket_match(tb, net, port, l3mdev)) {
                                if (tb->fastreuse >= 0 ||
                                    tb->fastreuseport >= 0)
                                        goto next_port;
@@ -774,6 +927,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                        spin_unlock_bh(&head->lock);
                        return -ENOMEM;
                }
+               tb_created = true;
                tb->fastreuse = -1;
                tb->fastreuseport = -1;
                goto ok;
@@ -789,6 +943,17 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        return -EADDRNOTAVAIL;

 ok:
+       /* Find the corresponding tb2 bucket since we need to
+        * add the socket to the bhash2 table as well
+        */
+       tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2);
+       if (!tb2) {
+               tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
+                                              head2, port, l3mdev, sk);
+               if (!tb2)
+                       goto error;
+       }
+
        /* Here we want to add a little bit of randomness to the next source
         * port that will be chosen. We use a max() with a random here so that
         * on low contention the randomness is maximal and on high contention
@@ -798,7 +963,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);

        /* Head lock still held and bh's disabled */
-       inet_bind_hash(sk, tb, port);
+       inet_bind_hash(sk, tb, tb2, port);
        if (sk_unhashed(sk)) {
                inet_sk(sk)->inet_sport = htons(port);
                inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
@@ -810,6 +975,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                inet_twsk_deschedule_put(tw);
        local_bh_enable();
        return 0;
+
+error:
+       if (tb_created)
+               inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+       spin_unlock_bh(&head->lock);
+       return -ENOMEM;
 }

 /*
@@ -4604,6 +4604,12 @@ void __init tcp_init(void)
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                  SLAB_ACCOUNT,
                                  NULL);
+       tcp_hashinfo.bind2_bucket_cachep =
+               kmem_cache_create("tcp_bind2_bucket",
+                                 sizeof(struct inet_bind2_bucket), 0,
+                                 SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+                                 SLAB_ACCOUNT,
+                                 NULL);

        /* Size and allocate the main established and bind bucket
         * hash tables.
@@ -4626,8 +4632,9 @@ void __init tcp_init(void)
        if (inet_ehash_locks_alloc(&tcp_hashinfo))
                panic("TCP: failed to alloc ehash_locks");
        tcp_hashinfo.bhash =
-               alloc_large_system_hash("TCP bind",
-                                       sizeof(struct inet_bind_hashbucket),
+               alloc_large_system_hash("TCP bind bhash tables",
+                                       sizeof(struct inet_bind_hashbucket) +
+                                       sizeof(struct inet_bind2_hashbucket),
                                        tcp_hashinfo.ehash_mask + 1,
                                        17, /* one slot per 128 KB of memory */
                                        0,
@@ -4636,9 +4643,12 @@ void __init tcp_init(void)
                                        0,
                                        64 * 1024);
        tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
+       tcp_hashinfo.bhash2 =
+               (struct inet_bind2_hashbucket *)(tcp_hashinfo.bhash + tcp_hashinfo.bhash_size);
        for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
                spin_lock_init(&tcp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+               INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
        }
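
Rather than a second allocation, the tcp_init() hunk above sizes one
alloc_large_system_hash() call for both tables and points bhash2 just past
the end of the bhash array. A user-space sketch of that carving trick (struct
names invented; it assumes the second table's alignment is satisfied at the
boundary, as it is here):

#include <stdio.h>
#include <stdlib.h>

struct slot_a { void *chain; long lock; };      /* stand-in for bind_hashbucket */
struct slot_b { void *chain; };                 /* stand-in for bind2_hashbucket */

int main(void)
{
        size_t n = 1024;
        /* one block big enough for n of each table's slots */
        struct slot_a *a = malloc(n * (sizeof(struct slot_a) +
                                       sizeof(struct slot_b)));
        /* the second table starts where the first one ends */
        struct slot_b *b = (struct slot_b *)(a + n);

        printf("a=%p b=%p (offset %zu bytes)\n",
               (void *)a, (void *)b, (size_t)((char *)b - (char *)a));
        free(a);
        return 0;
}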
@@ -36,3 +36,4 @@ gro
 ioam6_parser
 toeplitz
 cmsg_sender
+bind_bhash_test
@@ -59,6 +59,7 @@ TEST_GEN_FILES += toeplitz
 TEST_GEN_FILES += cmsg_sender
 TEST_GEN_FILES += stress_reuseport_listen
 TEST_PROGS += test_vxlan_vnifiltering.sh
+TEST_GEN_FILES += bind_bhash_test

 TEST_FILES := settings

@@ -69,4 +70,5 @@ include bpf/Makefile

 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
+$(OUTPUT)/bind_bhash_test: LDLIBS += -lpthread
 $(OUTPUT)/tcp_inq: LDLIBS += -lpthread
// SPDX-License-Identifier: GPL-2.0
/*
 * This times how long it takes to bind to a port when the port already
 * has multiple sockets in its bhash table.
 *
 * In setup(), we populate the port's bhash table with
 * MAX_THREADS * MAX_CONNECTIONS entries.
 */

#include <unistd.h>
#include <stdio.h>
#include <netdb.h>
#include <pthread.h>
#include <time.h>               /* clock(), CLOCKS_PER_SEC */
#include <sys/socket.h>         /* socket(), setsockopt(), bind(), listen() */

#define MAX_THREADS 600
#define MAX_CONNECTIONS 40

static const char *bind_addr = "::1";
static const char *port;

static int fd_array[MAX_THREADS][MAX_CONNECTIONS];

static int bind_socket(int opt, const char *addr)
{
        struct addrinfo *res, hint = {};
        int sock_fd, reuse = 1, err;

        sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
        if (sock_fd < 0) {
                perror("socket fd err");
                return -1;
        }

        hint.ai_family = AF_INET6;
        hint.ai_socktype = SOCK_STREAM;

        err = getaddrinfo(addr, port, &hint, &res);
        if (err) {
                perror("getaddrinfo failed");
                return -1;
        }

        if (opt) {
                err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse));
                if (err) {
                        perror("setsockopt failed");
                        return -1;
                }
        }

        err = bind(sock_fd, res->ai_addr, res->ai_addrlen);
        freeaddrinfo(res);
        if (err) {
                perror("failed to bind to port");
                return -1;
        }

        return sock_fd;
}

static void *setup(void *arg)
{
        int sock_fd, i;
        int *array = (int *)arg;

        for (i = 0; i < MAX_CONNECTIONS; i++) {
                sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr);
                if (sock_fd < 0)
                        return NULL;
                array[i] = sock_fd;
        }

        return NULL;
}

int main(int argc, const char *argv[])
{
        int listener_fd, sock_fd, i, j;
        pthread_t tid[MAX_THREADS];
        clock_t begin, end;

        if (argc != 2) {
                printf("Usage: listener <port>\n");
                return -1;
        }

        port = argv[1];

        listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr);
        if (listener_fd < 0)
                return -1;
        if (listen(listener_fd, 100) < 0) {
                perror("listen failed");
                return -1;
        }

        /* Set up threads to populate the bhash table entry for the port */
        for (i = 0; i < MAX_THREADS; i++)
                pthread_create(&tid[i], NULL, setup, fd_array[i]);

        for (i = 0; i < MAX_THREADS; i++)
                pthread_join(tid[i], NULL);

        begin = clock();

        /* Bind to the same port on a different address */
        sock_fd = bind_socket(0, "2001:0db8:0:f101::1");

        end = clock();

        printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC);

        /* clean up */
        close(sock_fd);
        close(listener_fd);
        for (i = 0; i < MAX_THREADS; i++) {
                for (j = 0; j < MAX_CONNECTIONS; j++)
                        close(fd_array[i][j]);
        }

        return 0;
}
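
As a usage note: with the Makefile hunk above, the test should build via the
usual kselftest flow (for example, make -C tools/testing/selftests/net
bind_bhash_test) and link against pthreads. It takes the port to stress as its
only argument (./bind_bhash_test 443) and prints the time measured around the
final bind(), which is the number reported in the cover letter.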