Commit e18a4459 authored by David S. Miller

Merge branch 'udp_hash'

David Held says:

====================
udp: Fix multicast performance issues.

Fix a performance problem when listening on many multicast sockets
bound to different addresses but the same port. Instead of always
using hash1 (keyed on port alone), fall back to a hash2 (address and
port) lookup when the hash1 chain is long. Patch 1 is a general
cleanup and simplification that also makes the main implementation
in Patch 2 simpler.

Eric's recent change 63c6f81c avoided this issue in early demux;
this series makes the fix apply to regular delivery as well.

v1->v2
 - updated hash collision detection

v2->v3
 - avoid flushing under lock unnecessarily at ARRAY_SIZE boundary
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3e1c0f0b 2dc41cff
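For readers unfamiliar with the UDP hash tables: hash1 buckets sockets by local port alone, so many multicast sockets sharing one port collapse into a single long chain, while hash2 buckets by (local address, port). Below is a small standalone sketch of the fallback decision, not kernel code; the toy hash functions, MASK, and HASH1_FALLBACK_LEN are invented for illustration (the patch itself switches whenever hslot->count > 10).

```c
#include <stdint.h>
#include <stdio.h>

#define MASK 0xffu              /* toy table of 256 slots */
#define HASH1_FALLBACK_LEN 10   /* same threshold the patch uses */

/* hash1: keyed on the port alone, so all sockets on one port collide. */
static unsigned int hash1(uint16_t port)
{
	return port & MASK;
}

/* hash2: keyed on (address, port), so chains stay short. */
static unsigned int hash2(uint32_t addr, uint16_t port)
{
	return (addr * 2654435761u + port) & MASK;
}

int main(void)
{
	uint16_t port = 5000;
	uint32_t group = 0xe0000001;        /* 224.0.0.1 */
	unsigned int hash1_chain_len = 500; /* pretend 500 sockets share the port */

	/* The patch's decision: scan hash1 only while its chain is short. */
	unsigned int slot = (hash1_chain_len > HASH1_FALLBACK_LEN) ?
			    hash2(group, port) : hash1(port);
	printf("scan slot %u\n", slot);
	return 0;
}
```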
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
+/**
+ * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @offset:	offset of hlist_node within the struct.
+ *
+ */
+#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)	       \
+	for (pos = (head)->first;					       \
+	     (!is_a_nulls(pos)) &&					       \
+	      ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});	       \
+	     pos = pos->next)
+
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
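The macro added above generalizes sk_nulls_for_each(): it recovers the containing struct sock from a list node embedded at a runtime-chosen offset, which is what lets one loop walk either the hash1 chain (linked through sk_nulls_node) or the hash2 chain (linked through __sk_common.skc_portaddr_node). Here is a standalone illustration of the same container-from-member pattern; the types (sock_like, main_node, alt_node) and the simplified non-nulls list are invented, and like the kernel macro it relies on the GCC/Clang statement-expression extension.

```c
#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };

struct sock_like {
	int id;
	struct node main_node;  /* stand-in for sk_nulls_node (hash1 chain) */
	struct node alt_node;   /* stand-in for skc_portaddr_node (hash2 chain) */
};

/* Same shape as sk_nulls_for_each_entry_offset, minus the nulls marker. */
#define for_each_entry_offset(tpos, pos, head, offset)                  \
	for (pos = (head);                                               \
	     pos && ({ tpos = (void *)((char *)pos - (offset)); 1; });   \
	     pos = pos->next)

int main(void)
{
	struct sock_like a = { .id = 1 }, b = { .id = 2 };
	struct node *pos;
	struct sock_like *sk;

	/* Chain a -> b through the alternate (hash2-style) node. */
	a.alt_node.next = &b.alt_node;
	b.alt_node.next = NULL;

	/* The offset picks which embedded node the loop follows. */
	size_t offset = offsetof(struct sock_like, alt_node);

	for_each_entry_offset(sk, pos, &a.alt_node, offset)
		printf("sk->id = %d\n", sk->id);
	return 0;
}
```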
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
-					     __be16 loc_port, __be32 loc_addr,
-					     __be16 rmt_port, __be32 rmt_addr,
-					     int dif)
-{
-	struct hlist_nulls_node *node;
-	unsigned short hnum = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum))
-			goto found;
-	}
-	sk = NULL;
-found:
-	return sk;
-}
-
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition. If err < 0 then the socket should
@@ -1639,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -1667,41 +1649,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable)
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
-	unsigned int i, count = 0;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = skb->dev->ifindex;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+			    udp_table.mask;
+		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		stack[count++] = sk;
-		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-				       daddr, uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+		if (__udp_is_mcast_sock(net, sk,
+					uh->dest, daddr,
+					uh->source, saddr,
+					dif, hnum)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
+			stack[count++] = sk;
+			sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing chain lock, we must take a reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	/*
 	 * do the slow work with no lock held
 	 */
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
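A note on the goto start_lookup loop above: it runs at most twice. Under hash2, a socket bound to a specific local address lives in the daddr:port bucket while one bound to INADDR_ANY lives in the *:port bucket, so multicast delivery has to scan both; the hash2 != hash2_any test skips the second pass when both keys happen to land in the same bucket. A toy model of that control flow, with hash2, MASK, and scan_slot invented as stand-ins:

```c
#include <stdint.h>
#include <stdio.h>

#define MASK 0xffu

static unsigned int hash2(uint32_t addr, uint16_t port)
{
	return (addr * 2654435761u + port) & MASK;
}

static void scan_slot(unsigned int slot)  /* stands in for the locked chain walk */
{
	printf("scanning hash2 slot %u\n", slot);
}

static void mcast_deliver(uint32_t daddr, uint16_t port)
{
	unsigned int hash2_any = hash2(0 /* INADDR_ANY */, port);
	unsigned int h = hash2(daddr, port);

	for (;;) {
		scan_slot(h);         /* matches are stacked during this walk */
		if (h == hash2_any)
			break;        /* both keys hit one bucket: single pass */
		h = hash2_any;        /* second pass: the wildcard *:port slot */
	}
}

int main(void)
{
	mcast_deliver(0xe0000001, 5000);  /* 224.0.0.1:5000 */
	return 0;
}
```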
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -702,43 +702,26 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return -1;
 }
 
-static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
-				      __be16 loc_port, const struct in6_addr *loc_addr,
-				      __be16 rmt_port, const struct in6_addr *rmt_addr,
-				      int dif)
+static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+				   __be16 loc_port, const struct in6_addr *loc_addr,
+				   __be16 rmt_port, const struct in6_addr *rmt_addr,
+				   int dif, unsigned short hnum)
 {
-	struct hlist_nulls_node *node;
-	unsigned short num = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (!net_eq(sock_net(sk), net))
-			continue;
-
-		if (udp_sk(sk)->udp_port_hash == num &&
-		    sk->sk_family == PF_INET6) {
-			if (inet->inet_dport) {
-				if (inet->inet_dport != rmt_port)
-					continue;
-			}
-			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
-			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
-				continue;
-
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
-				continue;
-
-			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-				if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
-					continue;
-			}
-			if (!inet6_mc_check(sk, loc_addr, rmt_addr))
-				continue;
-			return sk;
-		}
-	}
-	return NULL;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (!net_eq(sock_net(sk), net))
+		return false;
+
+	if (udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6 ||
+	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
+	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+	     !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+		return false;
+	if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+		return false;
+	return true;
 }
 
 static void flush_stack(struct sock **stack, unsigned int count,
@@ -762,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -787,43 +771,51 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
 	const struct udphdr *uh = udp_hdr(skb);
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
-	unsigned int i, count = 0;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = inet6_iif(skb);
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+			    udp_table.mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = inet6_iif(skb);
-	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		/* If zero checksum and no_check is not on for
-		 * the socket then skip it.
-		 */
-		if (uh->check || udp_sk(sk)->no_check6_rx)
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+		if (__udp_v6_is_mcast_sock(net, sk,
+					   uh->dest, daddr,
+					   uh->source, saddr,
+					   dif, hnum) &&
+		    /* If zero checksum and no_check is not on for
+		     * the socket then skip it.
+		     */
+		    (uh->check || udp_sk(sk)->no_check6_rx)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
 			stack[count++] = sk;
-
-		sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
-				       uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+			sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing the lock, we must take reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
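Both delivery paths also rework socket reference counting. Previously a second pass over the stack took references just before unlocking and dropped them after the final flush; now sock_hold() is taken the moment a match is stacked, and flush_stack() itself drops the reference with sock_put(). That keeps sockets stacked in the first hash2 pass pinned while the lock is released and retaken for the second pass. A minimal standalone model of that lifecycle; sock_like and the stubbed hold/put are invented:

```c
#include <stdio.h>

/* Hypothetical stand-in for struct sock and its refcount. */
struct sock_like { int id; int refs; };

static void sock_hold(struct sock_like *sk) { sk->refs++; }
static void sock_put(struct sock_like *sk)  { sk->refs--; }

/* As in the patch, the flush itself releases the per-socket reference. */
static void flush_stack_sketch(struct sock_like **stack, unsigned int count)
{
	for (unsigned int i = 0; i < count; i++) {
		printf("deliver clone to sk %d\n", stack[i]->id);
		sock_put(stack[i]);  /* drop the ref taken when stacked */
	}
}

int main(void)
{
	struct sock_like a = { 1, 0 }, b = { 2, 0 };
	struct sock_like *stack[2];
	unsigned int count = 0;

	/* Under the slot lock: pin each match as it is stacked, so it
	 * survives the unlock before (and between) flushes. */
	sock_hold(&a); stack[count++] = &a;
	sock_hold(&b); stack[count++] = &b;

	/* After unlock: the flush drops every reference it consumes. */
	flush_stack_sketch(stack, count);
	printf("refs after flush: %d %d\n", a.refs, b.refs);
	return 0;
}
```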