Commit 593d1ebe authored by Joanne Koong, committed by Jakub Kicinski

Revert "net: Add a second bind table hashed by port and address"

This reverts:

commit d5a42de8 ("net: Add a second bind table hashed by port and address")
commit 538aaf9b ("selftests: Add test for timing a bind request to a port with a populated bhash entry")
Link: https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/

There are a few things that need to be fixed here:
* Updating bhash2 in cases where the socket's rcv saddr changes
* Adding bhash2 hashbucket locks

Links to syzbot reports:
https://lore.kernel.org/netdev/00000000000022208805e0df247a@google.com/
https://lore.kernel.org/netdev/0000000000003f33bc05dfaf44fe@google.com/

Fixes: d5a42de8 ("net: Add a second bind table hashed by port and address")
Reported-by: syzbot+015d756bbd1f8b5c8f09@syzkaller.appspotmail.com
Reported-by: syzbot+98fd2d1422063b0f8c44@syzkaller.appspotmail.com
Reported-by: syzbot+0a847a982613c6438fba@syzkaller.appspotmail.com
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20220615193213.2419568-1-joannelkoong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 219b51a6
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#undef INET_CSK_CLEAR_TIMERS #undef INET_CSK_CLEAR_TIMERS
struct inet_bind_bucket; struct inet_bind_bucket;
struct inet_bind2_bucket;
struct tcp_congestion_ops; struct tcp_congestion_ops;
/* /*
...@@ -58,7 +57,6 @@ struct inet_connection_sock_af_ops { ...@@ -58,7 +57,6 @@ struct inet_connection_sock_af_ops {
* *
* @icsk_accept_queue: FIFO of established children * @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node * @icsk_bind_hash: Bind node
* @icsk_bind2_hash: Bind node in the bhash2 table
* @icsk_timeout: Timeout * @icsk_timeout: Timeout
* @icsk_retransmit_timer: Resend (no ack) * @icsk_retransmit_timer: Resend (no ack)
* @icsk_rto: Retransmit timeout * @icsk_rto: Retransmit timeout
...@@ -85,7 +83,6 @@ struct inet_connection_sock { ...@@ -85,7 +83,6 @@ struct inet_connection_sock {
struct inet_sock icsk_inet; struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue; struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash; struct inet_bind_bucket *icsk_bind_hash;
struct inet_bind2_bucket *icsk_bind2_hash;
unsigned long icsk_timeout; unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer; struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer; struct timer_list icsk_delack_timer;
......
...@@ -90,32 +90,11 @@ struct inet_bind_bucket { ...@@ -90,32 +90,11 @@ struct inet_bind_bucket {
struct hlist_head owners; struct hlist_head owners;
}; };
struct inet_bind2_bucket {
possible_net_t ib_net;
int l3mdev;
unsigned short port;
union {
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr v6_rcv_saddr;
#endif
__be32 rcv_saddr;
};
/* Node in the inet2_bind_hashbucket chain */
struct hlist_node node;
/* List of sockets hashed to this bucket */
struct hlist_head owners;
};
static inline struct net *ib_net(struct inet_bind_bucket *ib) static inline struct net *ib_net(struct inet_bind_bucket *ib)
{ {
return read_pnet(&ib->ib_net); return read_pnet(&ib->ib_net);
} }
static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
#define inet_bind_bucket_for_each(tb, head) \ #define inet_bind_bucket_for_each(tb, head) \
hlist_for_each_entry(tb, head, node) hlist_for_each_entry(tb, head, node)
...@@ -124,15 +103,6 @@ struct inet_bind_hashbucket { ...@@ -124,15 +103,6 @@ struct inet_bind_hashbucket {
struct hlist_head chain; struct hlist_head chain;
}; };
/* This is synchronized using the inet_bind_hashbucket's spinlock.
* Instead of having separate spinlocks, the inet_bind2_hashbucket can share
* the inet_bind_hashbucket's given that in every case where the bhash2 table
* is useful, a lookup in the bhash table also occurs.
*/
struct inet_bind2_hashbucket {
struct hlist_head chain;
};
/* Sockets can be hashed in established or listening table. /* Sockets can be hashed in established or listening table.
* We must use different 'nulls' end-of-chain value for all hash buckets : * We must use different 'nulls' end-of-chain value for all hash buckets :
* A socket might transition from ESTABLISH to LISTEN state without * A socket might transition from ESTABLISH to LISTEN state without
...@@ -164,12 +134,6 @@ struct inet_hashinfo { ...@@ -164,12 +134,6 @@ struct inet_hashinfo {
*/ */
struct kmem_cache *bind_bucket_cachep; struct kmem_cache *bind_bucket_cachep;
struct inet_bind_hashbucket *bhash; struct inet_bind_hashbucket *bhash;
/* The 2nd binding table hashed by port and address.
* This is used primarily for expediting the resolution of bind
* conflicts.
*/
struct kmem_cache *bind2_bucket_cachep;
struct inet_bind2_hashbucket *bhash2;
unsigned int bhash_size; unsigned int bhash_size;
/* The 2nd listener table hashed by local port and address */ /* The 2nd listener table hashed by local port and address */
...@@ -229,36 +193,6 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, ...@@ -229,36 +193,6 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
void inet_bind_bucket_destroy(struct kmem_cache *cachep, void inet_bind_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind_bucket *tb); struct inet_bind_bucket *tb);
static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb,
struct net *net,
const unsigned short port,
int l3mdev)
{
return net_eq(ib_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev;
}
struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind2_hashbucket *head,
const unsigned short port, int l3mdev,
const struct sock *sk);
void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind2_bucket *tb);
struct inet_bind2_bucket *
inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
const unsigned short port, int l3mdev,
struct sock *sk,
struct inet_bind2_hashbucket **head);
bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
struct net *net,
const unsigned short port,
int l3mdev,
const struct sock *sk);
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
const u32 bhash_size) const u32 bhash_size)
{ {
...@@ -266,7 +200,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, ...@@ -266,7 +200,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
} }
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, const unsigned short snum); const unsigned short snum);
/* Caller must disable local BH processing. */ /* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child); int __inet_inherit_port(const struct sock *sk, struct sock *child);
......
...@@ -348,7 +348,6 @@ struct sk_filter; ...@@ -348,7 +348,6 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags * @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference * @ns_tracker: tracker for netns reference
* @sk_bind2_node: bind node in the bhash2 table
*/ */
struct sock { struct sock {
/* /*
...@@ -538,7 +537,6 @@ struct sock { ...@@ -538,7 +537,6 @@ struct sock {
#endif #endif
struct rcu_head sk_rcu; struct rcu_head sk_rcu;
netns_tracker ns_tracker; netns_tracker ns_tracker;
struct hlist_node sk_bind2_node;
}; };
enum sk_pacing { enum sk_pacing {
...@@ -819,16 +817,6 @@ static inline void sk_add_bind_node(struct sock *sk, ...@@ -819,16 +817,6 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_add_head(&sk->sk_bind_node, list); hlist_add_head(&sk->sk_bind_node, list);
} }
static inline void __sk_del_bind2_node(struct sock *sk)
{
__hlist_del(&sk->sk_bind2_node);
}
static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
{
hlist_add_head(&sk->sk_bind2_node, list);
}
#define sk_for_each(__sk, list) \ #define sk_for_each(__sk, list) \
hlist_for_each_entry(__sk, list, sk_node) hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \ #define sk_for_each_rcu(__sk, list) \
...@@ -846,8 +834,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) ...@@ -846,8 +834,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
hlist_for_each_entry_safe(__sk, tmp, list, sk_node) hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \ #define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node) hlist_for_each_entry(__sk, list, sk_bind_node)
#define sk_for_each_bound_bhash2(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind2_node)
/** /**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
......
...@@ -1120,12 +1120,6 @@ static int __init dccp_init(void) ...@@ -1120,12 +1120,6 @@ static int __init dccp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind_bucket_cachep) if (!dccp_hashinfo.bind_bucket_cachep)
goto out_free_hashinfo2; goto out_free_hashinfo2;
dccp_hashinfo.bind2_bucket_cachep =
kmem_cache_create("dccp_bind2_bucket",
sizeof(struct inet_bind2_bucket), 0,
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind2_bucket_cachep)
goto out_free_bind_bucket_cachep;
/* /*
* Size and allocate the main established and bind bucket * Size and allocate the main established and bind bucket
...@@ -1156,7 +1150,7 @@ static int __init dccp_init(void) ...@@ -1156,7 +1150,7 @@ static int __init dccp_init(void)
if (!dccp_hashinfo.ehash) { if (!dccp_hashinfo.ehash) {
DCCP_CRIT("Failed to allocate DCCP established hash table"); DCCP_CRIT("Failed to allocate DCCP established hash table");
goto out_free_bind2_bucket_cachep; goto out_free_bind_bucket_cachep;
} }
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
...@@ -1182,23 +1176,14 @@ static int __init dccp_init(void) ...@@ -1182,23 +1176,14 @@ static int __init dccp_init(void)
goto out_free_dccp_locks; goto out_free_dccp_locks;
} }
dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *)
__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
if (!dccp_hashinfo.bhash2) {
DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
goto out_free_dccp_bhash;
}
for (i = 0; i < dccp_hashinfo.bhash_size; i++) { for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
spin_lock_init(&dccp_hashinfo.bhash[i].lock); spin_lock_init(&dccp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
} }
rc = dccp_mib_init(); rc = dccp_mib_init();
if (rc) if (rc)
goto out_free_dccp_bhash2; goto out_free_dccp_bhash;
rc = dccp_ackvec_init(); rc = dccp_ackvec_init();
if (rc) if (rc)
...@@ -1222,38 +1207,30 @@ static int __init dccp_init(void) ...@@ -1222,38 +1207,30 @@ static int __init dccp_init(void)
dccp_ackvec_exit(); dccp_ackvec_exit();
out_free_dccp_mib: out_free_dccp_mib:
dccp_mib_exit(); dccp_mib_exit();
out_free_dccp_bhash2:
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
out_free_dccp_bhash: out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks: out_free_dccp_locks:
inet_ehash_locks_free(&dccp_hashinfo); inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash: out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind2_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
out_free_bind_bucket_cachep: out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_hashinfo2: out_free_hashinfo2:
inet_hashinfo2_free_mod(&dccp_hashinfo); inet_hashinfo2_free_mod(&dccp_hashinfo);
out_fail: out_fail:
dccp_hashinfo.bhash = NULL; dccp_hashinfo.bhash = NULL;
dccp_hashinfo.bhash2 = NULL;
dccp_hashinfo.ehash = NULL; dccp_hashinfo.ehash = NULL;
dccp_hashinfo.bind_bucket_cachep = NULL; dccp_hashinfo.bind_bucket_cachep = NULL;
dccp_hashinfo.bind2_bucket_cachep = NULL;
return rc; return rc;
} }
static void __exit dccp_fini(void) static void __exit dccp_fini(void)
{ {
int bhash_order = get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket));
ccid_cleanup_builtins(); ccid_cleanup_builtins();
dccp_mib_exit(); dccp_mib_exit();
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); free_pages((unsigned long)dccp_hashinfo.bhash,
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket)));
free_pages((unsigned long)dccp_hashinfo.ehash, free_pages((unsigned long)dccp_hashinfo.ehash,
get_order((dccp_hashinfo.ehash_mask + 1) * get_order((dccp_hashinfo.ehash_mask + 1) *
sizeof(struct inet_ehash_bucket))); sizeof(struct inet_ehash_bucket)));
......
This diff is collapsed.
...@@ -81,41 +81,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, ...@@ -81,41 +81,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
return tb; return tb;
} }
struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind2_hashbucket *head,
const unsigned short port,
int l3mdev,
const struct sock *sk)
{
struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb) {
write_pnet(&tb->ib_net, net);
tb->l3mdev = l3mdev;
tb->port = port;
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
else
#endif
tb->rcv_saddr = sk->sk_rcv_saddr;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
return tb;
}
static bool bind2_bucket_addr_match(struct inet_bind2_bucket *tb2, struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return ipv6_addr_equal(&tb2->v6_rcv_saddr,
&sk->sk_v6_rcv_saddr);
#endif
return tb2->rcv_saddr == sk->sk_rcv_saddr;
}
/* /*
* Caller must hold hashbucket lock for this tb with local BH disabled * Caller must hold hashbucket lock for this tb with local BH disabled
*/ */
...@@ -127,25 +92,12 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket ...@@ -127,25 +92,12 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
} }
} }
/* Caller must hold the lock for the corresponding hashbucket in the bhash table
* with local BH disabled
*/
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
kmem_cache_free(cachep, tb);
}
}
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, const unsigned short snum) const unsigned short snum)
{ {
inet_sk(sk)->inet_num = snum; inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners); sk_add_bind_node(sk, &tb->owners);
inet_csk(sk)->icsk_bind_hash = tb; inet_csk(sk)->icsk_bind_hash = tb;
sk_add_bind2_node(sk, &tb2->owners);
inet_csk(sk)->icsk_bind2_hash = tb2;
} }
/* /*
...@@ -157,7 +109,6 @@ static void __inet_put_port(struct sock *sk) ...@@ -157,7 +109,6 @@ static void __inet_put_port(struct sock *sk)
const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
hashinfo->bhash_size); hashinfo->bhash_size);
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb; struct inet_bind_bucket *tb;
spin_lock(&head->lock); spin_lock(&head->lock);
...@@ -166,13 +117,6 @@ static void __inet_put_port(struct sock *sk) ...@@ -166,13 +117,6 @@ static void __inet_put_port(struct sock *sk)
inet_csk(sk)->icsk_bind_hash = NULL; inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0; inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
if (inet_csk(sk)->icsk_bind2_hash) {
tb2 = inet_csk(sk)->icsk_bind2_hash;
__sk_del_bind2_node(sk);
inet_csk(sk)->icsk_bind2_hash = NULL;
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
}
spin_unlock(&head->lock); spin_unlock(&head->lock);
} }
...@@ -189,19 +133,14 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) ...@@ -189,19 +133,14 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
unsigned short port = inet_sk(child)->inet_num; unsigned short port = inet_sk(child)->inet_num;
const int bhash = inet_bhashfn(sock_net(sk), port, const int bhash = inet_bhashfn(sock_net(sk), port,
table->bhash_size); table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind2_hashbucket *head_bhash2;
bool created_inet_bind_bucket = false;
struct net *net = sock_net(sk);
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb; struct inet_bind_bucket *tb;
int l3mdev; int l3mdev;
spin_lock(&head->lock); spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash; tb = inet_csk(sk)->icsk_bind_hash;
tb2 = inet_csk(sk)->icsk_bind2_hash; if (unlikely(!tb)) {
if (unlikely(!tb || !tb2)) {
spin_unlock(&head->lock); spin_unlock(&head->lock);
return -ENOENT; return -ENOENT;
} }
...@@ -214,45 +153,25 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) ...@@ -214,45 +153,25 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
* as that of the child socket. We have to look up or * as that of the child socket. We have to look up or
* create a new bind bucket for the child here. */ * create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) { inet_bind_bucket_for_each(tb, &head->chain) {
if (check_bind_bucket_match(tb, net, port, l3mdev)) if (net_eq(ib_net(tb), sock_net(sk)) &&
tb->l3mdev == l3mdev && tb->port == port)
break; break;
} }
if (!tb) { if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep, tb = inet_bind_bucket_create(table->bind_bucket_cachep,
net, head, port, l3mdev); sock_net(sk), head, port,
l3mdev);
if (!tb) { if (!tb) {
spin_unlock(&head->lock); spin_unlock(&head->lock);
return -ENOMEM; return -ENOMEM;
} }
created_inet_bind_bucket = true;
} }
inet_csk_update_fastreuse(tb, child); inet_csk_update_fastreuse(tb, child);
goto bhash2_find;
} else if (!bind2_bucket_addr_match(tb2, child)) {
l3mdev = inet_sk_bound_l3mdev(sk);
bhash2_find:
tb2 = inet_bind2_bucket_find(table, net, port, l3mdev, child,
&head_bhash2);
if (!tb2) {
tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
net, head_bhash2, port,
l3mdev, child);
if (!tb2)
goto error;
}
} }
inet_bind_hash(child, tb, tb2, port); inet_bind_hash(child, tb, port);
spin_unlock(&head->lock); spin_unlock(&head->lock);
return 0; return 0;
error:
if (created_inet_bind_bucket)
inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
spin_unlock(&head->lock);
return -ENOMEM;
} }
EXPORT_SYMBOL_GPL(__inet_inherit_port); EXPORT_SYMBOL_GPL(__inet_inherit_port);
...@@ -756,76 +675,6 @@ void inet_unhash(struct sock *sk) ...@@ -756,76 +675,6 @@ void inet_unhash(struct sock *sk)
} }
EXPORT_SYMBOL_GPL(inet_unhash); EXPORT_SYMBOL_GPL(inet_unhash);
static bool check_bind2_bucket_match(struct inet_bind2_bucket *tb,
struct net *net, unsigned short port,
int l3mdev, struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev &&
ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
else
#endif
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr;
}
bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
struct net *net, const unsigned short port,
int l3mdev, const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr nulladdr = {};
if (sk->sk_family == AF_INET6)
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev &&
ipv6_addr_equal(&tb->v6_rcv_saddr, &nulladdr);
else
#endif
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev && tb->rcv_saddr == 0;
}
static struct inet_bind2_hashbucket *
inet_bhashfn_portaddr(struct inet_hashinfo *hinfo, const struct sock *sk,
const struct net *net, unsigned short port)
{
u32 hash;
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
else
#endif
hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}
/* This should only be called when the spinlock for the socket's corresponding
* bind_hashbucket is held
*/
struct inet_bind2_bucket *
inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
const unsigned short port, int l3mdev, struct sock *sk,
struct inet_bind2_hashbucket **head)
{
struct inet_bind2_bucket *bhash2 = NULL;
struct inet_bind2_hashbucket *h;
h = inet_bhashfn_portaddr(hinfo, sk, net, port);
inet_bind_bucket_for_each(bhash2, &h->chain) {
if (check_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
break;
}
if (head)
*head = h;
return bhash2;
}
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
* Note that we use 32bit integers (vs RFC 'short integers') * Note that we use 32bit integers (vs RFC 'short integers')
* because 2^16 is not a multiple of num_ephemeral and this * because 2^16 is not a multiple of num_ephemeral and this
...@@ -846,13 +695,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -846,13 +695,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
{ {
struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_timewait_sock *tw = NULL; struct inet_timewait_sock *tw = NULL;
struct inet_bind2_hashbucket *head2;
struct inet_bind_hashbucket *head; struct inet_bind_hashbucket *head;
int port = inet_sk(sk)->inet_num; int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb; struct inet_bind_bucket *tb;
bool tb_created = false;
u32 remaining, offset; u32 remaining, offset;
int ret, i, low, high; int ret, i, low, high;
int l3mdev; int l3mdev;
...@@ -909,7 +755,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -909,7 +755,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* the established check is already unique enough. * the established check is already unique enough.
*/ */
inet_bind_bucket_for_each(tb, &head->chain) { inet_bind_bucket_for_each(tb, &head->chain) {
if (check_bind_bucket_match(tb, net, port, l3mdev)) { if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
if (tb->fastreuse >= 0 || if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0) tb->fastreuseport >= 0)
goto next_port; goto next_port;
...@@ -927,7 +774,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -927,7 +774,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
spin_unlock_bh(&head->lock); spin_unlock_bh(&head->lock);
return -ENOMEM; return -ENOMEM;
} }
tb_created = true;
tb->fastreuse = -1; tb->fastreuse = -1;
tb->fastreuseport = -1; tb->fastreuseport = -1;
goto ok; goto ok;
...@@ -943,17 +789,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -943,17 +789,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return -EADDRNOTAVAIL; return -EADDRNOTAVAIL;
ok: ok:
/* Find the corresponding tb2 bucket since we need to
* add the socket to the bhash2 table as well
*/
tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2);
if (!tb2) {
tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
head2, port, l3mdev, sk);
if (!tb2)
goto error;
}
/* Here we want to add a little bit of randomness to the next source /* Here we want to add a little bit of randomness to the next source
* port that will be chosen. We use a max() with a random here so that * port that will be chosen. We use a max() with a random here so that
* on low contention the randomness is maximal and on high contention * on low contention the randomness is maximal and on high contention
...@@ -963,7 +798,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -963,7 +798,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
/* Head lock still held and bh's disabled */ /* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port); inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) { if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port); inet_sk(sk)->inet_sport = htons(port);
inet_ehash_nolisten(sk, (struct sock *)tw, NULL); inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
...@@ -975,12 +810,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -975,12 +810,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_twsk_deschedule_put(tw); inet_twsk_deschedule_put(tw);
local_bh_enable(); local_bh_enable();
return 0; return 0;
error:
if (tb_created)
inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
spin_unlock_bh(&head->lock);
return -ENOMEM;
} }
/* /*
......
...@@ -4604,12 +4604,6 @@ void __init tcp_init(void) ...@@ -4604,12 +4604,6 @@ void __init tcp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT, SLAB_ACCOUNT,
NULL); NULL);
tcp_hashinfo.bind2_bucket_cachep =
kmem_cache_create("tcp_bind2_bucket",
sizeof(struct inet_bind2_bucket), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT,
NULL);
/* Size and allocate the main established and bind bucket /* Size and allocate the main established and bind bucket
* hash tables. * hash tables.
...@@ -4632,9 +4626,8 @@ void __init tcp_init(void) ...@@ -4632,9 +4626,8 @@ void __init tcp_init(void)
if (inet_ehash_locks_alloc(&tcp_hashinfo)) if (inet_ehash_locks_alloc(&tcp_hashinfo))
panic("TCP: failed to alloc ehash_locks"); panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash = tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind bhash tables", alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket) + sizeof(struct inet_bind_hashbucket),
sizeof(struct inet_bind2_hashbucket),
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.ehash_mask + 1,
17, /* one slot per 128 KB of memory */ 17, /* one slot per 128 KB of memory */
0, 0,
...@@ -4643,12 +4636,9 @@ void __init tcp_init(void) ...@@ -4643,12 +4636,9 @@ void __init tcp_init(void)
0, 0,
64 * 1024); 64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
tcp_hashinfo.bhash2 =
(struct inet_bind2_hashbucket *)(tcp_hashinfo.bhash + tcp_hashinfo.bhash_size);
for (i = 0; i < tcp_hashinfo.bhash_size; i++) { for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock); spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
} }
......
...@@ -37,4 +37,3 @@ gro ...@@ -37,4 +37,3 @@ gro
ioam6_parser ioam6_parser
toeplitz toeplitz
cmsg_sender cmsg_sender
bind_bhash_test
...@@ -59,7 +59,6 @@ TEST_GEN_FILES += toeplitz ...@@ -59,7 +59,6 @@ TEST_GEN_FILES += toeplitz
TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += cmsg_sender
TEST_GEN_FILES += stress_reuseport_listen TEST_GEN_FILES += stress_reuseport_listen
TEST_PROGS += test_vxlan_vnifiltering.sh TEST_PROGS += test_vxlan_vnifiltering.sh
TEST_GEN_FILES += bind_bhash_test
TEST_FILES := settings TEST_FILES := settings
...@@ -70,5 +69,4 @@ include bpf/Makefile ...@@ -70,5 +69,4 @@ include bpf/Makefile
$(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
$(OUTPUT)/tcp_mmap: LDLIBS += -lpthread $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
$(OUTPUT)/bind_bhash_test: LDLIBS += -lpthread
$(OUTPUT)/tcp_inq: LDLIBS += -lpthread $(OUTPUT)/tcp_inq: LDLIBS += -lpthread
// SPDX-License-Identifier: GPL-2.0
/*
* This times how long it takes to bind to a port when the port already
* has multiple sockets in its bhash table.
*
* In the setup(), we populate the port's bhash table with
* MAX_THREADS * MAX_CONNECTIONS number of entries.
*/
#include <unistd.h>
#include <stdio.h>
#include <netdb.h>
#include <pthread.h>
#define MAX_THREADS 600
#define MAX_CONNECTIONS 40
static const char *bind_addr = "::1";
static const char *port;
static int fd_array[MAX_THREADS][MAX_CONNECTIONS];
/*
 * Create an IPv6 TCP socket and bind it to @addr on the port named by the
 * file-level 'port' string.
 *
 * @opt:  a single SOL_SOCKET option name to switch on (value 1) before
 *        binding, or 0 to set no option.  This is passed straight to
 *        setsockopt() as the option name, so it must not be an OR of
 *        several option names.
 * @addr: textual address handed to getaddrinfo().
 *
 * Returns the bound socket descriptor, or -1 on failure.  On failure the
 * socket and the address list are released, so nothing is leaked.
 */
static int bind_socket(int opt, const char *addr)
{
	struct addrinfo *res = NULL, hint = { 0 };
	int sock_fd, reuse = 1, err;

	sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
	if (sock_fd < 0) {
		perror("socket fd err");
		return -1;
	}

	hint.ai_family = AF_INET6;
	hint.ai_socktype = SOCK_STREAM;

	err = getaddrinfo(addr, port, &hint, &res);
	if (err) {
		/* getaddrinfo() returns its own error code; errno is not
		 * meaningful here, so perror() would print the wrong reason.
		 */
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(err));
		goto err_close;
	}

	if (opt) {
		err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse));
		if (err) {
			perror("setsockopt failed");
			goto err_free;
		}
	}

	err = bind(sock_fd, res->ai_addr, res->ai_addrlen);
	if (err) {
		perror("failed to bind to port");
		goto err_free;
	}

	/* The resolved address list is no longer needed once bound. */
	freeaddrinfo(res);
	return sock_fd;

err_free:
	freeaddrinfo(res);
err_close:
	close(sock_fd);
	return -1;
}
/*
 * Thread body: open MAX_CONNECTIONS sockets bound to bind_addr and store
 * their descriptors in the int array passed via @arg (one row of fd_array).
 *
 * If a bind fails, the slots that were not filled are set to -1 so that a
 * later cleanup pass cannot mistake an unused slot for descriptor 0.
 *
 * Always returns NULL.
 */
static void *setup(void *arg)
{
	int *array = (int *)arg;
	int sock_fd, i;

	for (i = 0; i < MAX_CONNECTIONS; i++) {
		/* bind_socket() forwards its first argument to setsockopt()
		 * as one option name.  OR-ing SO_REUSEADDR and SO_REUSEPORT
		 * only happens to equal SO_REUSEPORT with the generic Linux
		 * values (2 | 15 == 15); request SO_REUSEPORT explicitly so
		 * the option name is valid whatever the numeric values are.
		 */
		sock_fd = bind_socket(SO_REUSEPORT, bind_addr);
		if (sock_fd < 0)
			break;

		array[i] = sock_fd;
	}

	/* Mark whatever was not opened as unused. */
	for (; i < MAX_CONNECTIONS; i++)
		array[i] = -1;

	return NULL;
}
int main(int argc, const char *argv[])
{
int listener_fd, sock_fd, i, j;
pthread_t tid[MAX_THREADS];
clock_t begin, end;
if (argc != 2) {
printf("Usage: listener <port>\n");
return -1;
}
port = argv[1];
listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr);
if (listen(listener_fd, 100) < 0) {
perror("listen failed");
return -1;
}
/* Set up threads to populate the bhash table entry for the port */
for (i = 0; i < MAX_THREADS; i++)
pthread_create(&tid[i], NULL, setup, fd_array[i]);
for (i = 0; i < MAX_THREADS; i++)
pthread_join(tid[i], NULL);
begin = clock();
/* Bind to the same port on a different address */
sock_fd = bind_socket(0, "2001:0db8:0:f101::1");
end = clock();
printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC);
/* clean up */
close(sock_fd);
close(listener_fd);
for (i = 0; i < MAX_THREADS; i++) {
for (j = 0; i < MAX_THREADS; i++)
close(fd_array[i][j]);
}
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment