Commit 261501d9 authored by David S. Miller's avatar David S. Miller

Merge branch 'sctp-add-support-for-sk_reuseport'

Xin Long says:

====================
sctp: add support for sk_reuseport

sctp sk_reuseport allows multiple socks to listen on the same port and
addresses, as long as these socks have the same uid. This works pretty
much as TCP/UDP does, the only difference is that sctp is multi-homing
and all the bind_addrs in these socks will have to completely matched,
otherwise listen() will return err.

The below is when 5 sockets are listening on 172.16.254.254:6400 on a
server, 26 sockets on a client connect to 172.16.254.254:6400 and each
may be processed by a different socket on the server which is selected
by hash(lport, pport, paddr) in reuseport_select_sock():

 # ss --sctp -nn
   State      Recv-Q Send-Q        Local Address:Port     Peer Address:Port
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.3:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.4:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400   172.16.253.253:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.5:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.1:1234
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 30beabb3 6ba84574
......@@ -152,7 +152,7 @@ int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc,
*/
int sctp_rcv(struct sk_buff *skb);
int sctp_v4_err(struct sk_buff *skb, u32 info);
void sctp_hash_endpoint(struct sctp_endpoint *);
int sctp_hash_endpoint(struct sctp_endpoint *ep);
void sctp_unhash_endpoint(struct sctp_endpoint *);
struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
struct sctphdr *, struct sctp_association **,
......
......@@ -96,7 +96,9 @@ struct sctp_stream;
struct sctp_bind_bucket {
unsigned short port;
unsigned short fastreuse;
signed char fastreuse;
signed char fastreuseport;
kuid_t fastuid;
struct hlist_node node;
struct hlist_head owner;
struct net *net;
......@@ -1190,6 +1192,8 @@ int sctp_bind_addr_conflict(struct sctp_bind_addr *, const union sctp_addr *,
struct sctp_sock *, struct sctp_sock *);
int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
const union sctp_addr *addr);
int sctp_bind_addrs_check(struct sctp_sock *sp,
struct sctp_sock *sp2, int cnt2);
union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
const union sctp_addr *addrs,
int addrcnt,
......
......@@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);
void reuseport_detach_sock(struct sock *sk)
{
......
......@@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
return match;
}
int sctp_bind_addrs_check(struct sctp_sock *sp,
struct sctp_sock *sp2, int cnt2)
{
struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr;
struct sctp_bind_addr *bp = &sp->ep->base.bind_addr;
struct sctp_sockaddr_entry *laddr, *laddr2;
bool exist = false;
int cnt = 0;
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
list_for_each_entry_rcu(laddr2, &bp2->address_list, list) {
if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) &&
laddr->valid && laddr2->valid) {
exist = true;
goto next;
}
}
cnt = 0;
break;
next:
cnt++;
}
rcu_read_unlock();
return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1);
}
/* Does the address 'addr' conflict with any addresses in
* the bp.
*/
......
......@@ -57,6 +57,7 @@
#include <net/sctp/checksum.h>
#include <net/net_namespace.h>
#include <linux/rhashtable.h>
#include <net/sock_reuseport.h>
/* Forward declarations for internal helpers. */
static int sctp_rcv_ootb(struct sk_buff *);
......@@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
const union sctp_addr *paddr,
const union sctp_addr *laddr,
struct sctp_transport **transportp);
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
const union sctp_addr *laddr);
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
struct net *net, struct sk_buff *skb,
const union sctp_addr *laddr,
const union sctp_addr *daddr);
static struct sctp_association *__sctp_lookup_association(
struct net *net,
const union sctp_addr *local,
......@@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);
if (!asoc)
ep = __sctp_rcv_lookup_endpoint(net, &dest);
ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src);
/* Retrieve the common input handling substructure. */
rcvr = asoc ? &asoc->base : &ep->base;
......@@ -721,43 +724,87 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
}
/* Insert endpoint into the hash table. */
static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
{
struct net *net = sock_net(ep->base.sk);
struct sctp_ep_common *epb;
struct sock *sk = ep->base.sk;
struct net *net = sock_net(sk);
struct sctp_hashbucket *head;
struct sctp_ep_common *epb;
epb = &ep->base;
epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
head = &sctp_ep_hashtable[epb->hashent];
if (sk->sk_reuseport) {
bool any = sctp_is_ep_boundall(sk);
struct sctp_ep_common *epb2;
struct list_head *list;
int cnt = 0, err = 1;
list_for_each(list, &ep->base.bind_addr.address_list)
cnt++;
sctp_for_each_hentry(epb2, &head->chain) {
struct sock *sk2 = epb2->sk;
if (!net_eq(sock_net(sk2), net) || sk2 == sk ||
!uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) ||
!sk2->sk_reuseport)
continue;
err = sctp_bind_addrs_check(sctp_sk(sk2),
sctp_sk(sk), cnt);
if (!err) {
err = reuseport_add_sock(sk, sk2, any);
if (err)
return err;
break;
} else if (err < 0) {
return err;
}
}
if (err) {
err = reuseport_alloc(sk, any);
if (err)
return err;
}
}
write_lock(&head->lock);
hlist_add_head(&epb->node, &head->chain);
write_unlock(&head->lock);
return 0;
}
/* Add an endpoint to the hash. Local BH-safe. */
void sctp_hash_endpoint(struct sctp_endpoint *ep)
int sctp_hash_endpoint(struct sctp_endpoint *ep)
{
int err;
local_bh_disable();
__sctp_hash_endpoint(ep);
err = __sctp_hash_endpoint(ep);
local_bh_enable();
return err;
}
/* Remove endpoint from the hash table. */
static void __sctp_unhash_endpoint(struct sctp_endpoint *ep)
{
struct net *net = sock_net(ep->base.sk);
struct sock *sk = ep->base.sk;
struct sctp_hashbucket *head;
struct sctp_ep_common *epb;
epb = &ep->base;
epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
epb->hashent = sctp_ep_hashfn(sock_net(sk), epb->bind_addr.port);
head = &sctp_ep_hashtable[epb->hashent];
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
write_lock(&head->lock);
hlist_del_init(&epb->node);
write_unlock(&head->lock);
......@@ -771,16 +818,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
local_bh_enable();
}
static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
const union sctp_addr *paddr, __u32 seed)
{
__u32 addr;
if (paddr->sa.sa_family == AF_INET6)
addr = jhash(&paddr->v6.sin6_addr, 16, seed);
else
addr = (__force __u32)paddr->v4.sin_addr.s_addr;
return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
(__force __u32)lport, net_hash_mix(net), seed);
}
/* Look up an endpoint. */
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
const union sctp_addr *laddr)
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
struct net *net, struct sk_buff *skb,
const union sctp_addr *laddr,
const union sctp_addr *paddr)
{
struct sctp_hashbucket *head;
struct sctp_ep_common *epb;
struct sctp_endpoint *ep;
struct sock *sk;
__be16 lport;
int hash;
hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
lport = laddr->v4.sin_port;
hash = sctp_ep_hashfn(net, ntohs(lport));
head = &sctp_ep_hashtable[hash];
read_lock(&head->lock);
sctp_for_each_hentry(epb, &head->chain) {
......@@ -792,6 +858,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
ep = sctp_sk(net->sctp.ctl_sock)->ep;
hit:
sk = ep->base.sk;
if (sk->sk_reuseport) {
__u32 phash = sctp_hashfn(net, lport, paddr, 0);
sk = reuseport_select_sock(sk, phash, skb,
sizeof(struct sctphdr));
if (sk)
ep = sctp_sk(sk)->ep;
}
sctp_endpoint_hold(ep);
read_unlock(&head->lock);
return ep;
......@@ -830,35 +905,17 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
{
const struct sctp_transport *t = data;
const union sctp_addr *paddr = &t->ipaddr;
const struct net *net = sock_net(t->asoc->base.sk);
__be16 lport = htons(t->asoc->base.bind_addr.port);
__u32 addr;
if (paddr->sa.sa_family == AF_INET6)
addr = jhash(&paddr->v6.sin6_addr, 16, seed);
else
addr = (__force __u32)paddr->v4.sin_addr.s_addr;
return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
(__force __u32)lport, net_hash_mix(net), seed);
return sctp_hashfn(sock_net(t->asoc->base.sk),
htons(t->asoc->base.bind_addr.port),
&t->ipaddr, seed);
}
static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed)
{
const struct sctp_hash_cmp_arg *x = data;
const union sctp_addr *paddr = x->paddr;
const struct net *net = x->net;
__be16 lport = x->lport;
__u32 addr;
if (paddr->sa.sa_family == AF_INET6)
addr = jhash(&paddr->v6.sin6_addr, 16, seed);
else
addr = (__force __u32)paddr->v4.sin_addr.s_addr;
return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
(__force __u32)lport, net_hash_mix(net), seed);
return sctp_hashfn(x->net, x->lport, x->paddr, seed);
}
static const struct rhashtable_params sctp_hash_params = {
......
......@@ -7644,8 +7644,10 @@ static struct sctp_bind_bucket *sctp_bucket_create(
static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
{
bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
struct sctp_sock *sp = sctp_sk(sk);
bool reuse = (sk->sk_reuse || sp->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
kuid_t uid = sock_i_uid(sk);
struct sctp_bind_bucket *pp;
unsigned short snum;
int ret;
......@@ -7721,7 +7723,10 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
pr_debug("%s: found a possible match\n", __func__);
if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
if ((pp->fastreuse && reuse &&
sk->sk_state != SCTP_SS_LISTENING) ||
(pp->fastreuseport && sk->sk_reuseport &&
uid_eq(pp->fastuid, uid)))
goto success;
/* Run through the list of sockets bound to the port
......@@ -7735,16 +7740,18 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
* in an endpoint.
*/
sk_for_each_bound(sk2, &pp->owner) {
struct sctp_endpoint *ep2;
ep2 = sctp_sk(sk2)->ep;
struct sctp_sock *sp2 = sctp_sk(sk2);
struct sctp_endpoint *ep2 = sp2->ep;
if (sk == sk2 ||
(reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
sk2->sk_state != SCTP_SS_LISTENING))
(reuse && (sk2->sk_reuse || sp2->reuse) &&
sk2->sk_state != SCTP_SS_LISTENING) ||
(sk->sk_reuseport && sk2->sk_reuseport &&
uid_eq(uid, sock_i_uid(sk2))))
continue;
if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr,
sctp_sk(sk2), sctp_sk(sk))) {
if (sctp_bind_addr_conflict(&ep2->base.bind_addr,
addr, sp2, sp)) {
ret = (long)sk2;
goto fail_unlock;
}
......@@ -7767,19 +7774,32 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
pp->fastreuse = 1;
else
pp->fastreuse = 0;
} else if (pp->fastreuse &&
if (sk->sk_reuseport) {
pp->fastreuseport = 1;
pp->fastuid = uid;
} else {
pp->fastreuseport = 0;
}
} else {
if (pp->fastreuse &&
(!reuse || sk->sk_state == SCTP_SS_LISTENING))
pp->fastreuse = 0;
if (pp->fastreuseport &&
(!sk->sk_reuseport || !uid_eq(pp->fastuid, uid)))
pp->fastreuseport = 0;
}
/* We are set, so fill up all the data in the hash table
* entry, tie the socket list information with the rest of the
* sockets FIXME: Blurry, NPI (ipg).
*/
success:
if (!sctp_sk(sk)->bind_hash) {
if (!sp->bind_hash) {
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &pp->owner);
sctp_sk(sk)->bind_hash = pp;
sp->bind_hash = pp;
}
ret = 0;
......@@ -7852,8 +7872,7 @@ static int sctp_listen_start(struct sock *sk, int backlog)
}
sk->sk_max_ack_backlog = backlog;
sctp_hash_endpoint(ep);
return 0;
return sctp_hash_endpoint(ep);
}
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment