Commit 593d1ebe authored by Joanne Koong, committed by Jakub Kicinski

Revert "net: Add a second bind table hashed by port and address"

This reverts:

commit d5a42de8 ("net: Add a second bind table hashed by port and address")
commit 538aaf9b ("selftests: Add test for timing a bind request to a port with a populated bhash entry")
Link: https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/

There are a few things that need to be fixed here:
* Updating bhash2 in cases where the socket's rcv saddr changes
* Adding bhash2 hashbucket locks

Links to syzbot reports:
https://lore.kernel.org/netdev/00000000000022208805e0df247a@google.com/
https://lore.kernel.org/netdev/0000000000003f33bc05dfaf44fe@google.com/

Fixes: d5a42de8 ("net: Add a second bind table hashed by port and address")
Reported-by: syzbot+015d756bbd1f8b5c8f09@syzkaller.appspotmail.com
Reported-by: syzbot+98fd2d1422063b0f8c44@syzkaller.appspotmail.com
Reported-by: syzbot+0a847a982613c6438fba@syzkaller.appspotmail.com
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20220615193213.2419568-1-joannelkoong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 219b51a6
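Editor's note: the two gaps listed in the commit message are the core of the revert. As a rough illustration of what they mean, the sketch below models a second bind table keyed by (port, address) in plain userspace C: every bucket carries its own lock, and an entry has to be unhashed and re-hashed whenever the socket's receive address changes. This is only a minimal sketch under those assumptions; all names here (bind2_entry, bind2_rehash, and so on) are hypothetical and none of this is the reverted kernel code.

#include <netinet/in.h>
#include <pthread.h>
#include <stdint.h>

#define NBUCKETS 256

struct bind2_entry {
	uint16_t port;
	struct in6_addr addr;            /* IPv4 addresses stored as v4-mapped */
	struct bind2_entry *next;
};

struct bind2_bucket {
	pthread_mutex_t lock;            /* per-bucket lock */
	struct bind2_entry *head;
};

static struct bind2_bucket table[NBUCKETS];

static void bind2_table_init(void)
{
	int i;

	for (i = 0; i < NBUCKETS; i++)
		pthread_mutex_init(&table[i].lock, NULL);
}

static unsigned int hash_portaddr(uint16_t port, const struct in6_addr *addr)
{
	unsigned int h = port;
	int i;

	for (i = 0; i < 16; i++)
		h = h * 31 + addr->s6_addr[i];
	return h % NBUCKETS;
}

/* Move an entry to its new bucket when the bound address changes. */
static void bind2_rehash(struct bind2_entry *e, const struct in6_addr *new_addr)
{
	struct bind2_bucket *old = &table[hash_portaddr(e->port, &e->addr)];
	struct bind2_bucket *new_bucket;
	struct bind2_entry **pp;

	pthread_mutex_lock(&old->lock);
	for (pp = &old->head; *pp; pp = &(*pp)->next) {
		if (*pp == e) {
			*pp = e->next;
			break;
		}
	}
	pthread_mutex_unlock(&old->lock);

	e->addr = *new_addr;
	new_bucket = &table[hash_portaddr(e->port, &e->addr)];
	pthread_mutex_lock(&new_bucket->lock);
	e->next = new_bucket->head;
	new_bucket->head = e;
	pthread_mutex_unlock(&new_bucket->lock);
}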
......@@ -25,7 +25,6 @@
#undef INET_CSK_CLEAR_TIMERS
struct inet_bind_bucket;
struct inet_bind2_bucket;
struct tcp_congestion_ops;
/*
......@@ -58,7 +57,6 @@ struct inet_connection_sock_af_ops {
*
* @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node
* @icsk_bind2_hash: Bind node in the bhash2 table
* @icsk_timeout: Timeout
* @icsk_retransmit_timer: Resend (no ack)
* @icsk_rto: Retransmit timeout
......@@ -85,7 +83,6 @@ struct inet_connection_sock {
struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
struct inet_bind2_bucket *icsk_bind2_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
......
......@@ -90,32 +90,11 @@ struct inet_bind_bucket {
struct hlist_head owners;
};
struct inet_bind2_bucket {
possible_net_t ib_net;
int l3mdev;
unsigned short port;
union {
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr v6_rcv_saddr;
#endif
__be32 rcv_saddr;
};
/* Node in the inet2_bind_hashbucket chain */
struct hlist_node node;
/* List of sockets hashed to this bucket */
struct hlist_head owners;
};
static inline struct net *ib_net(struct inet_bind_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
#define inet_bind_bucket_for_each(tb, head) \
hlist_for_each_entry(tb, head, node)
......@@ -124,15 +103,6 @@ struct inet_bind_hashbucket {
struct hlist_head chain;
};
/* This is synchronized using the inet_bind_hashbucket's spinlock.
* Instead of having separate spinlocks, the inet_bind2_hashbucket can share
* the inet_bind_hashbucket's given that in every case where the bhash2 table
* is useful, a lookup in the bhash table also occurs.
*/
struct inet_bind2_hashbucket {
struct hlist_head chain;
};
/* Sockets can be hashed in established or listening table.
* We must use different 'nulls' end-of-chain value for all hash buckets :
* A socket might transition from ESTABLISH to LISTEN state without
......@@ -164,12 +134,6 @@ struct inet_hashinfo {
*/
struct kmem_cache *bind_bucket_cachep;
struct inet_bind_hashbucket *bhash;
/* The 2nd binding table hashed by port and address.
* This is used primarily for expediting the resolution of bind
* conflicts.
*/
struct kmem_cache *bind2_bucket_cachep;
struct inet_bind2_hashbucket *bhash2;
unsigned int bhash_size;
/* The 2nd listener table hashed by local port and address */
......@@ -229,36 +193,6 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind_bucket *tb);
static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb,
struct net *net,
const unsigned short port,
int l3mdev)
{
return net_eq(ib_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev;
}
struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind2_hashbucket *head,
const unsigned short port, int l3mdev,
const struct sock *sk);
void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind2_bucket *tb);
struct inet_bind2_bucket *
inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
const unsigned short port, int l3mdev,
struct sock *sk,
struct inet_bind2_hashbucket **head);
bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
struct net *net,
const unsigned short port,
int l3mdev,
const struct sock *sk);
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
const u32 bhash_size)
{
......@@ -266,7 +200,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
}
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, const unsigned short snum);
const unsigned short snum);
/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);
......
......@@ -348,7 +348,6 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
* @sk_bind2_node: bind node in the bhash2 table
*/
struct sock {
/*
......@@ -538,7 +537,6 @@ struct sock {
#endif
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct hlist_node sk_bind2_node;
};
enum sk_pacing {
......@@ -819,16 +817,6 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_add_head(&sk->sk_bind_node, list);
}
static inline void __sk_del_bind2_node(struct sock *sk)
{
__hlist_del(&sk->sk_bind2_node);
}
static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
{
hlist_add_head(&sk->sk_bind2_node, list);
}
#define sk_for_each(__sk, list) \
hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
......@@ -846,8 +834,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
#define sk_for_each_bound_bhash2(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind2_node)
/**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
......
......@@ -1120,12 +1120,6 @@ static int __init dccp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind_bucket_cachep)
goto out_free_hashinfo2;
dccp_hashinfo.bind2_bucket_cachep =
kmem_cache_create("dccp_bind2_bucket",
sizeof(struct inet_bind2_bucket), 0,
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind2_bucket_cachep)
goto out_free_bind_bucket_cachep;
/*
* Size and allocate the main established and bind bucket
......@@ -1156,7 +1150,7 @@ static int __init dccp_init(void)
if (!dccp_hashinfo.ehash) {
DCCP_CRIT("Failed to allocate DCCP established hash table");
goto out_free_bind2_bucket_cachep;
goto out_free_bind_bucket_cachep;
}
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
......@@ -1182,23 +1176,14 @@ static int __init dccp_init(void)
goto out_free_dccp_locks;
}
dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *)
__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
if (!dccp_hashinfo.bhash2) {
DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
goto out_free_dccp_bhash;
}
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
}
rc = dccp_mib_init();
if (rc)
goto out_free_dccp_bhash2;
goto out_free_dccp_bhash;
rc = dccp_ackvec_init();
if (rc)
......@@ -1222,38 +1207,30 @@ static int __init dccp_init(void)
dccp_ackvec_exit();
out_free_dccp_mib:
dccp_mib_exit();
out_free_dccp_bhash2:
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind2_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_hashinfo2:
inet_hashinfo2_free_mod(&dccp_hashinfo);
out_fail:
dccp_hashinfo.bhash = NULL;
dccp_hashinfo.bhash2 = NULL;
dccp_hashinfo.ehash = NULL;
dccp_hashinfo.bind_bucket_cachep = NULL;
dccp_hashinfo.bind2_bucket_cachep = NULL;
return rc;
}
static void __exit dccp_fini(void)
{
int bhash_order = get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket));
ccid_cleanup_builtins();
dccp_mib_exit();
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
free_pages((unsigned long)dccp_hashinfo.bhash,
get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket)));
free_pages((unsigned long)dccp_hashinfo.ehash,
get_order((dccp_hashinfo.ehash_mask + 1) *
sizeof(struct inet_ehash_bucket)));
......
......@@ -117,32 +117,6 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
static bool use_bhash2_on_bind(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
int addr_type;
if (sk->sk_family == AF_INET6) {
addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
return addr_type != IPV6_ADDR_ANY &&
addr_type != IPV6_ADDR_MAPPED;
}
#endif
return sk->sk_rcv_saddr != htonl(INADDR_ANY);
}
static u32 get_bhash2_nulladdr_hash(const struct sock *sk, struct net *net,
int port)
{
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr nulladdr = {};
if (sk->sk_family == AF_INET6)
return ipv6_portaddr_hash(net, &nulladdr, port);
#endif
return ipv4_portaddr_hash(net, 0, port);
}
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
......@@ -156,71 +130,16 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
static bool bind_conflict_exist(const struct sock *sk, struct sock *sk2,
kuid_t sk_uid, bool relax,
bool reuseport_cb_ok, bool reuseport_ok)
{
int bound_dev_if2;
if (sk == sk2)
return false;
bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
if (!sk->sk_bound_dev_if || !bound_dev_if2 ||
sk->sk_bound_dev_if == bound_dev_if2) {
if (sk->sk_reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
if (!relax || (!reuseport_ok && sk->sk_reuseport &&
sk2->sk_reuseport && reuseport_cb_ok &&
(sk2->sk_state == TCP_TIME_WAIT ||
uid_eq(sk_uid, sock_i_uid(sk2)))))
return true;
} else if (!reuseport_ok || !sk->sk_reuseport ||
!sk2->sk_reuseport || !reuseport_cb_ok ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(sk_uid, sock_i_uid(sk2)))) {
return true;
}
}
return false;
}
static bool check_bhash2_conflict(const struct sock *sk,
struct inet_bind2_bucket *tb2, kuid_t sk_uid,
bool relax, bool reuseport_cb_ok,
bool reuseport_ok)
{
struct sock *sk2;
sk_for_each_bound_bhash2(sk2, &tb2->owners) {
if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
continue;
if (bind_conflict_exist(sk, sk2, sk_uid, relax,
reuseport_cb_ok, reuseport_ok))
return true;
}
return false;
}
/* This should be called only when the corresponding inet_bind_bucket spinlock
* is held
*/
static int inet_csk_bind_conflict(const struct sock *sk, int port,
struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, /* may be null */
static int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb,
bool relax, bool reuseport_ok)
{
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
kuid_t uid = sock_i_uid((struct sock *)sk);
struct sock_reuseport *reuseport_cb;
struct inet_bind2_hashbucket *head2;
bool reuseport_cb_ok;
struct sock *sk2;
struct net *net;
int l3mdev;
u32 hash;
bool reuseport_cb_ok;
bool reuse = sk->sk_reuse;
bool reuseport = !!sk->sk_reuseport;
struct sock_reuseport *reuseport_cb;
kuid_t uid = sock_i_uid((struct sock *)sk);
rcu_read_lock();
reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
......@@ -231,42 +150,40 @@ static int inet_csk_bind_conflict(const struct sock *sk, int port,
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners and tb2->owners list belong
* to the same net
* in tb->owners list belong to the same net - the
* one this bucket belongs to.
*/
if (!use_bhash2_on_bind(sk)) {
sk_for_each_bound(sk2, &tb->owners)
if (bind_conflict_exist(sk, sk2, uid, relax,
reuseport_cb_ok, reuseport_ok) &&
inet_rcv_saddr_equal(sk, sk2, true))
return true;
sk_for_each_bound(sk2, &tb->owners) {
int bound_dev_if2;
return false;
if (sk == sk2)
continue;
bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
if ((!sk->sk_bound_dev_if ||
!bound_dev_if2 ||
sk->sk_bound_dev_if == bound_dev_if2)) {
if (reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
if ((!relax ||
(!reuseport_ok &&
reuseport && sk2->sk_reuseport &&
reuseport_cb_ok &&
(sk2->sk_state == TCP_TIME_WAIT ||
uid_eq(uid, sock_i_uid(sk2))))) &&
inet_rcv_saddr_equal(sk, sk2, true))
break;
} else if (!reuseport_ok ||
!reuseport || !sk2->sk_reuseport ||
!reuseport_cb_ok ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
}
}
if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok,
reuseport_ok))
return true;
net = sock_net(sk);
/* check there's no conflict with an existing IPV6_ADDR_ANY (if ipv6) or
* INADDR_ANY (if ipv4) socket.
*/
hash = get_bhash2_nulladdr_hash(sk, net, port);
head2 = &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
l3mdev = inet_sk_bound_l3mdev(sk);
inet_bind_bucket_for_each(tb2, &head2->chain)
if (check_bind2_bucket_match_nulladdr(tb2, net, port, l3mdev, sk))
break;
if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok,
reuseport_ok))
return true;
return false;
return sk2 != NULL;
}
/*
......@@ -274,20 +191,16 @@ static int inet_csk_bind_conflict(const struct sock *sk, int port,
* inet_bind_hashbucket lock held.
*/
static struct inet_bind_hashbucket *
inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret,
struct inet_bind2_bucket **tb2_ret,
struct inet_bind2_hashbucket **head2_ret, int *port_ret)
inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
{
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
struct inet_bind2_hashbucket *head2;
int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
bool relax = false;
int i, low, high, attempt_half;
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
u32 remaining, offset;
bool relax = false;
int port = 0;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
......@@ -326,12 +239,10 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret,
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk,
&head2);
inet_bind_bucket_for_each(tb, &head->chain)
if (check_bind_bucket_match(tb, net, port, l3mdev)) {
if (!inet_csk_bind_conflict(sk, port, tb, tb2,
relax, false))
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, relax, false))
goto success;
goto next_port;
}
......@@ -361,8 +272,6 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret,
success:
*port_ret = port;
*tb_ret = tb;
*tb2_ret = tb2;
*head2_ret = head2;
return head;
}
......@@ -458,81 +367,54 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
bool bhash_created = false, bhash2_created = false;
struct inet_bind2_bucket *tb2 = NULL;
struct inet_bind2_hashbucket *head2;
struct inet_bind_bucket *tb = NULL;
int ret = 1, port = snum;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int ret = 1, port = snum;
bool found_port = false;
struct inet_bind_bucket *tb = NULL;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) {
head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port);
head = inet_csk_find_open_port(sk, &tb, &port);
if (!head)
return ret;
if (tb && tb2)
goto success;
found_port = true;
} else {
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (check_bind_bucket_match(tb, net, port, l3mdev))
break;
tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk,
&head2);
}
if (!tb) {
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net,
head, port, l3mdev);
if (!tb)
goto fail_unlock;
bhash_created = true;
}
if (!tb2) {
tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
net, head2, port, l3mdev, sk);
if (!tb2)
goto fail_unlock;
bhash2_created = true;
goto tb_not_found;
goto success;
}
/* If we had to find an open port, we already checked for conflicts */
if (!found_port && !hlist_empty(&tb->owners)) {
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port, l3mdev);
if (!tb)
goto fail_unlock;
tb_found:
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
if ((tb->fastreuse > 0 && reuse) ||
sk_reuseport_match(tb, sk))
goto success;
if (inet_csk_bind_conflict(sk, port, tb, tb2, true, true))
if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
}
success:
inet_csk_update_fastreuse(tb, sk);
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, tb2, port);
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2);
ret = 0;
fail_unlock:
if (ret) {
if (bhash_created)
inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
if (bhash2_created)
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
tb2);
}
spin_unlock_bh(&head->lock);
return ret;
}
......@@ -1079,7 +961,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk_set_state(newsk, TCP_SYN_RECV);
newicsk->icsk_bind_hash = NULL;
newicsk->icsk_bind2_hash = NULL;
inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
......
......@@ -81,41 +81,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
return tb;
}
struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind2_hashbucket *head,
const unsigned short port,
int l3mdev,
const struct sock *sk)
{
struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb) {
write_pnet(&tb->ib_net, net);
tb->l3mdev = l3mdev;
tb->port = port;
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
else
#endif
tb->rcv_saddr = sk->sk_rcv_saddr;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
return tb;
}
static bool bind2_bucket_addr_match(struct inet_bind2_bucket *tb2, struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return ipv6_addr_equal(&tb2->v6_rcv_saddr,
&sk->sk_v6_rcv_saddr);
#endif
return tb2->rcv_saddr == sk->sk_rcv_saddr;
}
/*
* Caller must hold hashbucket lock for this tb with local BH disabled
*/
......@@ -127,25 +92,12 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
}
}
/* Caller must hold the lock for the corresponding hashbucket in the bhash table
* with local BH disabled
*/
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
kmem_cache_free(cachep, tb);
}
}
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, const unsigned short snum)
const unsigned short snum)
{
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
inet_csk(sk)->icsk_bind_hash = tb;
sk_add_bind2_node(sk, &tb2->owners);
inet_csk(sk)->icsk_bind2_hash = tb2;
}
/*
......@@ -157,7 +109,6 @@ static void __inet_put_port(struct sock *sk)
const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
hashinfo->bhash_size);
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
spin_lock(&head->lock);
......@@ -166,13 +117,6 @@ static void __inet_put_port(struct sock *sk)
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
if (inet_csk(sk)->icsk_bind2_hash) {
tb2 = inet_csk(sk)->icsk_bind2_hash;
__sk_del_bind2_node(sk);
inet_csk(sk)->icsk_bind2_hash = NULL;
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
}
spin_unlock(&head->lock);
}
......@@ -189,19 +133,14 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
unsigned short port = inet_sk(child)->inet_num;
const int bhash = inet_bhashfn(sock_net(sk), port,
table->bhash_size);
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind2_hashbucket *head_bhash2;
bool created_inet_bind_bucket = false;
struct net *net = sock_net(sk);
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
int l3mdev;
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
tb2 = inet_csk(sk)->icsk_bind2_hash;
if (unlikely(!tb || !tb2)) {
if (unlikely(!tb)) {
spin_unlock(&head->lock);
return -ENOENT;
}
......@@ -214,45 +153,25 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
* as that of the child socket. We have to look up or
* create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) {
if (check_bind_bucket_match(tb, net, port, l3mdev))
if (net_eq(ib_net(tb), sock_net(sk)) &&
tb->l3mdev == l3mdev && tb->port == port)
break;
}
if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
net, head, port, l3mdev);
sock_net(sk), head, port,
l3mdev);
if (!tb) {
spin_unlock(&head->lock);
return -ENOMEM;
}
created_inet_bind_bucket = true;
}
inet_csk_update_fastreuse(tb, child);
goto bhash2_find;
} else if (!bind2_bucket_addr_match(tb2, child)) {
l3mdev = inet_sk_bound_l3mdev(sk);
bhash2_find:
tb2 = inet_bind2_bucket_find(table, net, port, l3mdev, child,
&head_bhash2);
if (!tb2) {
tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
net, head_bhash2, port,
l3mdev, child);
if (!tb2)
goto error;
}
}
inet_bind_hash(child, tb, tb2, port);
inet_bind_hash(child, tb, port);
spin_unlock(&head->lock);
return 0;
error:
if (created_inet_bind_bucket)
inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
spin_unlock(&head->lock);
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
......@@ -756,76 +675,6 @@ void inet_unhash(struct sock *sk)
}
EXPORT_SYMBOL_GPL(inet_unhash);
static bool check_bind2_bucket_match(struct inet_bind2_bucket *tb,
struct net *net, unsigned short port,
int l3mdev, struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev &&
ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
else
#endif
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr;
}
bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
struct net *net, const unsigned short port,
int l3mdev, const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr nulladdr = {};
if (sk->sk_family == AF_INET6)
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev &&
ipv6_addr_equal(&tb->v6_rcv_saddr, &nulladdr);
else
#endif
return net_eq(ib2_net(tb), net) && tb->port == port &&
tb->l3mdev == l3mdev && tb->rcv_saddr == 0;
}
static struct inet_bind2_hashbucket *
inet_bhashfn_portaddr(struct inet_hashinfo *hinfo, const struct sock *sk,
const struct net *net, unsigned short port)
{
u32 hash;
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
else
#endif
hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}
/* This should only be called when the spinlock for the socket's corresponding
* bind_hashbucket is held
*/
struct inet_bind2_bucket *
inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
const unsigned short port, int l3mdev, struct sock *sk,
struct inet_bind2_hashbucket **head)
{
struct inet_bind2_bucket *bhash2 = NULL;
struct inet_bind2_hashbucket *h;
h = inet_bhashfn_portaddr(hinfo, sk, net, port);
inet_bind_bucket_for_each(bhash2, &h->chain) {
if (check_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
break;
}
if (head)
*head = h;
return bhash2;
}
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
* Note that we use 32bit integers (vs RFC 'short integers')
* because 2^16 is not a multiple of num_ephemeral and this
......@@ -846,13 +695,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_timewait_sock *tw = NULL;
struct inet_bind2_hashbucket *head2;
struct inet_bind_hashbucket *head;
int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk);
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
bool tb_created = false;
u32 remaining, offset;
int ret, i, low, high;
int l3mdev;
......@@ -909,7 +755,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* the established check is already unique enough.
*/
inet_bind_bucket_for_each(tb, &head->chain) {
if (check_bind_bucket_match(tb, net, port, l3mdev)) {
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port;
......@@ -927,7 +774,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
spin_unlock_bh(&head->lock);
return -ENOMEM;
}
tb_created = true;
tb->fastreuse = -1;
tb->fastreuseport = -1;
goto ok;
......@@ -943,17 +789,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return -EADDRNOTAVAIL;
ok:
/* Find the corresponding tb2 bucket since we need to
* add the socket to the bhash2 table as well
*/
tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2);
if (!tb2) {
tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
head2, port, l3mdev, sk);
if (!tb2)
goto error;
}
/* Here we want to add a little bit of randomness to the next source
* port that will be chosen. We use a max() with a random here so that
* on low contention the randomness is maximal and on high contention
......@@ -963,7 +798,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
......@@ -975,12 +810,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_twsk_deschedule_put(tw);
local_bh_enable();
return 0;
error:
if (tb_created)
inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
spin_unlock_bh(&head->lock);
return -ENOMEM;
}
/*
......
......@@ -4604,12 +4604,6 @@ void __init tcp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT,
NULL);
tcp_hashinfo.bind2_bucket_cachep =
kmem_cache_create("tcp_bind2_bucket",
sizeof(struct inet_bind2_bucket), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT,
NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
......@@ -4632,9 +4626,8 @@ void __init tcp_init(void)
if (inet_ehash_locks_alloc(&tcp_hashinfo))
panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind bhash tables",
sizeof(struct inet_bind_hashbucket) +
sizeof(struct inet_bind2_hashbucket),
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_mask + 1,
17, /* one slot per 128 KB of memory */
0,
......@@ -4643,12 +4636,9 @@ void __init tcp_init(void)
0,
64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
tcp_hashinfo.bhash2 =
(struct inet_bind2_hashbucket *)(tcp_hashinfo.bhash + tcp_hashinfo.bhash_size);
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
}
......
......@@ -37,4 +37,3 @@ gro
ioam6_parser
toeplitz
cmsg_sender
bind_bhash_test
......@@ -59,7 +59,6 @@ TEST_GEN_FILES += toeplitz
TEST_GEN_FILES += cmsg_sender
TEST_GEN_FILES += stress_reuseport_listen
TEST_PROGS += test_vxlan_vnifiltering.sh
TEST_GEN_FILES += bind_bhash_test
TEST_FILES := settings
......@@ -70,5 +69,4 @@ include bpf/Makefile
$(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
$(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
$(OUTPUT)/bind_bhash_test: LDLIBS += -lpthread
$(OUTPUT)/tcp_inq: LDLIBS += -lpthread
// SPDX-License-Identifier: GPL-2.0
/*
* This times how long it takes to bind to a port when the port already
* has multiple sockets in its bhash table.
*
* In the setup(), we populate the port's bhash table with
* MAX_THREADS * MAX_CONNECTIONS number of entries.
*/
#include <unistd.h>
#include <stdio.h>
#include <netdb.h>
#include <pthread.h>
#define MAX_THREADS 600
#define MAX_CONNECTIONS 40
static const char *bind_addr = "::1";
static const char *port;
static int fd_array[MAX_THREADS][MAX_CONNECTIONS];
static int bind_socket(int opt, const char *addr)
{
struct addrinfo *res, hint = {};
int sock_fd, reuse = 1, err;
sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
if (sock_fd < 0) {
perror("socket fd err");
return -1;
}
hint.ai_family = AF_INET6;
hint.ai_socktype = SOCK_STREAM;
err = getaddrinfo(addr, port, &hint, &res);
if (err) {
perror("getaddrinfo failed");
return -1;
}
if (opt) {
err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse));
if (err) {
perror("setsockopt failed");
return -1;
}
}
err = bind(sock_fd, res->ai_addr, res->ai_addrlen);
if (err) {
perror("failed to bind to port");
return -1;
}
return sock_fd;
}
static void *setup(void *arg)
{
int sock_fd, i;
int *array = (int *)arg;
for (i = 0; i < MAX_CONNECTIONS; i++) {
sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr);
if (sock_fd < 0)
return NULL;
array[i] = sock_fd;
}
return NULL;
}
int main(int argc, const char *argv[])
{
int listener_fd, sock_fd, i, j;
pthread_t tid[MAX_THREADS];
clock_t begin, end;
if (argc != 2) {
printf("Usage: listener <port>\n");
return -1;
}
port = argv[1];
listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr);
if (listen(listener_fd, 100) < 0) {
perror("listen failed");
return -1;
}
/* Set up threads to populate the bhash table entry for the port */
for (i = 0; i < MAX_THREADS; i++)
pthread_create(&tid[i], NULL, setup, fd_array[i]);
for (i = 0; i < MAX_THREADS; i++)
pthread_join(tid[i], NULL);
begin = clock();
/* Bind to the same port on a different address */
sock_fd = bind_socket(0, "2001:0db8:0:f101::1");
end = clock();
printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC);
/* clean up */
close(sock_fd);
close(listener_fd);
for (i = 0; i < MAX_THREADS; i++) {
for (j = 0; j < MAX_CONNECTIONS; j++)
close(fd_array[i][j]);
}
return 0;
}
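Editor's note on the timing in the removed test above: clock() reports CPU time consumed by the calling process, not elapsed wall-clock time, so clock_gettime(CLOCK_MONOTONIC) is the more conventional choice for a latency measurement. A minimal, hypothetical sketch of the same measurement with a monotonic clock (not part of the reverted selftest):

#include <time.h>

/* Hypothetical helper: wall-clock seconds between two timespecs. */
static double tspec_diff(const struct timespec *start, const struct timespec *end)
{
	return (end->tv_sec - start->tv_sec) +
	       (end->tv_nsec - start->tv_nsec) / 1e9;
}

/*
 * Usage in main() above (hypothetical replacement for the clock() calls):
 *
 *   struct timespec t0, t1;
 *
 *   clock_gettime(CLOCK_MONOTONIC, &t0);
 *   sock_fd = bind_socket(0, "2001:0db8:0:f101::1");
 *   clock_gettime(CLOCK_MONOTONIC, &t1);
 *   printf("time spent = %f\n", tspec_diff(&t0, &t1));
 */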