Commit 0798311c authored by David S. Miller's avatar David S. Miller

Merge branch 'tcp-bhash2-fixes'

Kuniyuki Iwashima says:

===================
tcp: Fix bhash2 and TIME_WAIT regression.

We forgot to add twsk to bhash2.  Therefore TIME_WAIT sockets cannot
prevent bind() to the same local address and port.

Changes:
  v1:
    * Patch 1:
      * Add tw_bind2_node in inet_timewait_sock instead of
        moving sk_bind2_node from struct sock to struct
        sock_common.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 40cab44b 2c042e8e
...@@ -108,6 +108,10 @@ struct inet_bind2_bucket { ...@@ -108,6 +108,10 @@ struct inet_bind2_bucket {
struct hlist_node node; struct hlist_node node;
/* List of sockets hashed to this bucket */ /* List of sockets hashed to this bucket */
struct hlist_head owners; struct hlist_head owners;
/* bhash has twsk in owners, but bhash2 has twsk in
* deathrow not to add a member in struct sock_common.
*/
struct hlist_head deathrow;
}; };
static inline struct net *ib_net(const struct inet_bind_bucket *ib) static inline struct net *ib_net(const struct inet_bind_bucket *ib)
......
...@@ -73,9 +73,14 @@ struct inet_timewait_sock { ...@@ -73,9 +73,14 @@ struct inet_timewait_sock {
u32 tw_priority; u32 tw_priority;
struct timer_list tw_timer; struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb; struct inet_bind_bucket *tw_tb;
struct inet_bind2_bucket *tw_tb2;
struct hlist_node tw_bind2_node;
}; };
#define tw_tclass tw_tos #define tw_tclass tw_tos
/* Walk the TIME_WAIT sockets linked into a bhash2 bucket's deathrow
 * list via their dedicated tw_bind2_node member.
 */
#define twsk_for_each_bound_bhash2(__tw, list) \
	hlist_for_each_entry(__tw, list, tw_bind2_node)
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{ {
return (struct inet_timewait_sock *)sk; return (struct inet_timewait_sock *)sk;
......
...@@ -173,22 +173,40 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, ...@@ -173,22 +173,40 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
return false; return false;
} }
/* Test one candidate socket (a full socket, or a timewait socket cast
 * to struct sock) for a bind conflict against @sk.
 */
static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
				   kuid_t sk_uid, bool relax,
				   bool reuseport_cb_ok, bool reuseport_ok)
{
	/* A plain AF_INET socket can never conflict with an IPv6-only one. */
	if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
		return false;

	return inet_bind_conflict(sk, sk2, sk_uid, relax,
				  reuseport_cb_ok, reuseport_ok);
}
/* Scan a bhash2 bucket for any socket that conflicts with @sk's bind
 * request.  Full sockets live on tb2->owners; TIME_WAIT sockets live on
 * the separate tb2->deathrow list, so both lists must be walked.
 */
static bool inet_bhash2_conflict(const struct sock *sk,
				 const struct inet_bind2_bucket *tb2,
				 kuid_t sk_uid,
				 bool relax, bool reuseport_cb_ok,
				 bool reuseport_ok)
{
	struct inet_timewait_sock *tw2;
	struct sock *sk2;

	sk_for_each_bound_bhash2(sk2, &tb2->owners) {
		if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
					   reuseport_cb_ok, reuseport_ok))
			return true;
	}

	/* TIME_WAIT sockets must also block a conflicting bind(). */
	twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) {
		sk2 = (struct sock *)tw2;

		if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
					   reuseport_cb_ok, reuseport_ok))
			return true;
	}

	return false;
}
......
...@@ -116,6 +116,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, ...@@ -116,6 +116,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
#endif #endif
tb->rcv_saddr = sk->sk_rcv_saddr; tb->rcv_saddr = sk->sk_rcv_saddr;
INIT_HLIST_HEAD(&tb->owners); INIT_HLIST_HEAD(&tb->owners);
INIT_HLIST_HEAD(&tb->deathrow);
hlist_add_head(&tb->node, &head->chain); hlist_add_head(&tb->node, &head->chain);
} }
...@@ -137,7 +138,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, ...@@ -137,7 +138,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
/* Caller must hold hashbucket lock for this tb with local BH disabled */ /* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{ {
if (hlist_empty(&tb->owners)) { if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
__hlist_del(&tb->node); __hlist_del(&tb->node);
kmem_cache_free(cachep, tb); kmem_cache_free(cachep, tb);
} }
...@@ -1103,15 +1104,16 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -1103,15 +1104,16 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
/* Head lock still held and bh's disabled */ /* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port); inet_bind_hash(sk, tb, tb2, port);
spin_unlock(&head2->lock);
if (sk_unhashed(sk)) { if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port); inet_sk(sk)->inet_sport = htons(port);
inet_ehash_nolisten(sk, (struct sock *)tw, NULL); inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
} }
if (tw) if (tw)
inet_twsk_bind_unhash(tw, hinfo); inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head2->lock);
spin_unlock(&head->lock); spin_unlock(&head->lock);
if (tw) if (tw)
inet_twsk_deschedule_put(tw); inet_twsk_deschedule_put(tw);
local_bh_enable(); local_bh_enable();
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo) struct inet_hashinfo *hashinfo)
{ {
struct inet_bind2_bucket *tb2 = tw->tw_tb2;
struct inet_bind_bucket *tb = tw->tw_tb; struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb) if (!tb)
...@@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, ...@@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
__hlist_del(&tw->tw_bind_node); __hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL; tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
__hlist_del(&tw->tw_bind2_node);
tw->tw_tb2 = NULL;
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
__sock_put((struct sock *)tw); __sock_put((struct sock *)tw);
} }
...@@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) ...@@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
{ {
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
struct inet_bind_hashbucket *bhead; struct inet_bind_hashbucket *bhead, *bhead2;
spin_lock(lock); spin_lock(lock);
sk_nulls_del_node_init_rcu((struct sock *)tw); sk_nulls_del_node_init_rcu((struct sock *)tw);
...@@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) ...@@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
/* Disassociate with bind bucket. */ /* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
hashinfo->bhash_size)]; hashinfo->bhash_size)];
bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
twsk_net(tw), tw->tw_num);
spin_lock(&bhead->lock); spin_lock(&bhead->lock);
spin_lock(&bhead2->lock);
inet_twsk_bind_unhash(tw, hashinfo); inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock); spin_unlock(&bhead->lock);
refcount_dec(&tw->tw_dr->tw_refcount); refcount_dec(&tw->tw_dr->tw_refcount);
...@@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, ...@@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
hlist_add_head(&tw->tw_bind_node, list); hlist_add_head(&tw->tw_bind_node, list);
} }
/* Link @tw into a bhash2 bucket's deathrow list using its dedicated
 * tw_bind2_node member (timewait sockets are kept off the bucket's
 * owners list of full sockets).
 */
static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
				     struct hlist_head *list)
{
	hlist_add_head(&tw->tw_bind2_node, list);
}
/* /*
* Enter the time wait state. This is called with locally disabled BH. * Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it * Essentially we whip up a timewait bucket, copy the relevant info into it
...@@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, ...@@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
struct inet_bind_hashbucket *bhead; struct inet_bind_hashbucket *bhead, *bhead2;
/* Step 1: Put TW into bind hash. Original socket stays there too. /* Step 1: Put TW into bind hash. Original socket stays there too.
Note, that any socket with inet->num != 0 MUST be bound in Note, that any socket with inet->num != 0 MUST be bound in
binding cache, even if it is closed. binding cache, even if it is closed.
*/ */
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)]; hashinfo->bhash_size)];
bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
spin_lock(&bhead->lock); spin_lock(&bhead->lock);
spin_lock(&bhead2->lock);
tw->tw_tb = icsk->icsk_bind_hash; tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash); WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
tw->tw_tb2 = icsk->icsk_bind2_hash;
WARN_ON(!icsk->icsk_bind2_hash);
inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow);
spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock); spin_unlock(&bhead->lock);
spin_lock(lock); spin_lock(lock);
......
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
bind_bhash bind_bhash
bind_timewait
csum csum
cmsg_sender cmsg_sender
diag_uid diag_uid
......
// SPDX-License-Identifier: GPL-2.0
/* Copyright Amazon.com Inc. or its affiliates. */
#include <sys/socket.h>
#include <netinet/in.h>
#include "../kselftest_harness.h"
/* Per-test state: the local address to bind, updated by the helper with
 * the kernel-assigned port that the TIME_WAIT socket will occupy.
 */
FIXTURE(bind_timewait)
{
	struct sockaddr_in addr;
	socklen_t addrlen;
};
/* Parameterizes the test over the local address to bind to. */
FIXTURE_VARIANT(bind_timewait)
{
	__u32 addr_const;
};
/* Variant: bind to the loopback address, 127.0.0.1. */
FIXTURE_VARIANT_ADD(bind_timewait, localhost)
{
	.addr_const = INADDR_LOOPBACK
};
/* Variant: bind to the wildcard address, 0.0.0.0. */
FIXTURE_VARIANT_ADD(bind_timewait, addrany)
{
	.addr_const = INADDR_ANY
};
/* Prepare the bind target: variant-selected address, port 0 so the
 * kernel assigns an ephemeral port on the first bind().
 */
FIXTURE_SETUP(bind_timewait)
{
	self->addrlen = sizeof(self->addr);
	self->addr.sin_family = AF_INET;
	self->addr.sin_addr.s_addr = htonl(variant->addr_const);
	self->addr.sin_port = 0;
}
/* Nothing to clean up: every socket is closed by the helper or test. */
FIXTURE_TEARDOWN(bind_timewait)
{
}
/* Run a short-lived TCP connection against self->addr and tear it down
 * so that the accepted (server-side) socket, which is closed first,
 * is left in TIME_WAIT on the listener's address and port.  Also
 * records the kernel-assigned port back into self->addr.
 */
void create_timewait_socket(struct __test_metadata *_metadata,
			    FIXTURE_DATA(bind_timewait) *self)
{
	struct sockaddr_in peer_addr;
	socklen_t peer_addrlen;
	int listen_fd, conn_fd, accepted_fd, err;

	listen_fd = socket(AF_INET, SOCK_STREAM, 0);
	ASSERT_GT(listen_fd, 0);

	err = bind(listen_fd, (struct sockaddr *)&self->addr, self->addrlen);
	ASSERT_EQ(err, 0);

	err = listen(listen_fd, 1);
	ASSERT_EQ(err, 0);

	/* Learn which ephemeral port the kernel picked. */
	err = getsockname(listen_fd, (struct sockaddr *)&self->addr,
			  &self->addrlen);
	ASSERT_EQ(err, 0);

	conn_fd = socket(AF_INET, SOCK_STREAM, 0);
	ASSERT_GT(conn_fd, 0);

	err = connect(conn_fd, (struct sockaddr *)&self->addr, self->addrlen);
	ASSERT_EQ(err, 0);

	peer_addrlen = sizeof(peer_addr);
	accepted_fd = accept(listen_fd, (struct sockaddr *)&peer_addr,
			     &peer_addrlen);
	ASSERT_GT(accepted_fd, 0);

	/* Close the accepted side first so it enters TIME_WAIT. */
	close(accepted_fd);
	close(conn_fd);
	close(listen_fd);
}
/* With a TIME_WAIT socket occupying the address, a fresh bind() to the
 * same address and port must fail with EADDRINUSE.
 */
TEST_F(bind_timewait, 1)
{
	int sk, err;

	create_timewait_socket(_metadata, self);

	sk = socket(AF_INET, SOCK_STREAM, 0);
	ASSERT_GT(sk, 0);

	err = bind(sk, (struct sockaddr *)&self->addr, self->addrlen);
	ASSERT_EQ(err, -1);
	ASSERT_EQ(errno, EADDRINUSE);

	close(sk);
}
TEST_HARNESS_MAIN
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment