Commit e20cf8d3 authored by Paolo Abeni, committed by David S. Miller

udp: implement GRO for plain UDP sockets.

This is the RX counterpart of commit bec1f6f6 ("udp: generate gso
with UDP_SEGMENT"). When UDP_GRO is enabled on a socket, that socket
also becomes eligible for GRO in the rx path: UDP segments directed to
it are assembled into a larger GSO_UDP_L4 packet.

The core UDP GRO support is enabled with setsockopt(UDP_GRO).

Initial benchmark numbers:

Before:
udp rx:   1079 MB/s   769065 calls/s

After:
udp rx:   1466 MB/s    24877 calls/s

This change introduces a side effect with respect to UDP tunnels:
after a UDP tunnel is created, the kernel now performs a socket lookup
for every ingress UDP packet, whereas before the lookup happened only if
the ingress packet carried a valid internal header csum.

rfc v2 -> rfc v3:
 - fixed typos in macro name and comments
 - really enforce UDP_GRO_CNT_MAX, instead of UDP_GRO_CNT_MAX + 1
 - acquire socket lock in UDP_GRO setsockopt

rfc v1 -> rfc v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 60fb9567
...@@ -50,11 +50,12 @@ struct udp_sock { ...@@ -50,11 +50,12 @@ struct udp_sock {
__u8 encap_type; /* Is this an Encapsulation socket? */ __u8 encap_type; /* Is this an Encapsulation socket? */
unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
encap_enabled:1; /* This socket enabled encap encap_enabled:1, /* This socket enabled encap
* processing; UDP tunnels and * processing; UDP tunnels and
* different encapsulation layer set * different encapsulation layer set
* this * this
*/ */
gro_enabled:1; /* Can accept GRO packets */
/* /*
* Following member retains the information to create a UDP header * Following member retains the information to create a UDP header
* when the socket is uncorked. * when the socket is uncorked.
......
...@@ -33,6 +33,7 @@ struct udphdr { ...@@ -33,6 +33,7 @@ struct udphdr {
#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */ #define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
#define UDP_SEGMENT 103 /* Set GSO segmentation size */ #define UDP_SEGMENT 103 /* Set GSO segmentation size */
#define UDP_GRO 104 /* This socket can receive UDP GRO packets */
/* UDP encapsulation types */ /* UDP encapsulation types */
#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
......
...@@ -2473,6 +2473,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, ...@@ -2473,6 +2473,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->gso_size = val; up->gso_size = val;
break; break;
case UDP_GRO:
lock_sock(sk);
if (valbool)
udp_tunnel_encap_enable(sk->sk_socket);
up->gro_enabled = valbool;
release_sock(sk);
break;
/* /*
* UDP-Lite's partial checksum coverage (RFC 3828). * UDP-Lite's partial checksum coverage (RFC 3828).
*/ */
......
...@@ -343,6 +343,54 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, ...@@ -343,6 +343,54 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
return segs; return segs;
} }
/* Upper bound on the number of segments merged into one GRO packet */
#define UDP_GRO_CNT_MAX 64

/*
 * udp_gro_receive_segment - try to merge a plain UDP datagram into a held
 * GRO packet of the same flow.
 * @head: list of packets currently held for aggregation
 * @skb:  incoming datagram; its UDP header is pulled here
 *
 * Returns the held packet that must now be completed, or NULL when nothing
 * has to be flushed (the datagram was merged, or no matching flow exists).
 * When the datagram cannot be aggregated at all, its flush flag is set and
 * NULL is returned.
 */
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
					       struct sk_buff *skb)
{
	struct udphdr *uh = udp_hdr(skb);
	struct sk_buff *pp = NULL;
	struct udphdr *uh2;
	struct sk_buff *p;
	unsigned int ulen;

	/* requires non zero csum, for symmetry with GSO */
	if (!uh->check) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	/* Refuse padded or malformed datagrams: the UDP length field must
	 * cover more than the bare header and must match exactly what is
	 * left in the GRO view of the skb, otherwise the aggregated packet
	 * could not be split back into the original segments.
	 */
	ulen = ntohs(uh->len);
	if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	/* pull encapsulating udp header */
	skb_gro_pull(skb, sizeof(struct udphdr));
	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));

	list_for_each_entry(p, head, list) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		uh2 = udp_hdr(p);

		/* Match ports only, as csum is always non zero.
		 * Source and destination port are compared with one 32-bit load.
		 */
		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		/* Terminate the flow on len mismatch or if it grow "too much".
		 * Under small packet flood GRO count could elsewhere grow a lot
		 * leading to excessive truesize values.
		 * On len mismatch merge the first packet shorter than the
		 * held segment size, otherwise complete the GRO packet:
		 *  - a longer datagram is never merged, it only flushes p;
		 *  - a failed merge (non-zero skb_gro_receive()) flushes p;
		 *  - a shorter datagram is merged as last segment, then p
		 *    is flushed;
		 *  - reaching UDP_GRO_CNT_MAX segments flushes p.
		 */
		if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) ||
		    ulen != ntohs(uh2->len) ||
		    NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
			pp = p;

		return pp;
	}

	/* mismatch, but we never need to flush */
	return NULL;
}
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup) struct udphdr *uh, udp_lookup_t lookup)
{ {
...@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, ...@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
int flush = 1; int flush = 1;
struct sock *sk; struct sock *sk;
rcu_read_lock();
sk = (*lookup)(skb, uh->source, uh->dest);
if (!sk)
goto out_unlock;
if (udp_sk(sk)->gro_enabled) {
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
rcu_read_unlock();
return pp;
}
if (NAPI_GRO_CB(skb)->encap_mark || if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL && (skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 && NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid)) !NAPI_GRO_CB(skb)->csum_valid) ||
goto out; !udp_sk(sk)->gro_receive)
goto out_unlock;
/* mark that this skb passed once through the tunnel gro layer */ /* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1; NAPI_GRO_CB(skb)->encap_mark = 1;
rcu_read_lock();
sk = (*lookup)(skb, uh->source, uh->dest);
if (sk && udp_sk(sk)->gro_receive)
goto unflush;
goto out_unlock;
unflush:
flush = 0; flush = 0;
list_for_each_entry(p, head, list) { list_for_each_entry(p, head, list) {
...@@ -394,7 +446,6 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, ...@@ -394,7 +446,6 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
out_unlock: out_unlock:
rcu_read_unlock(); rcu_read_unlock();
out:
skb_gro_flush_final(skb, pp, flush); skb_gro_flush_final(skb, pp, flush);
return pp; return pp;
} }
...@@ -427,6 +478,19 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head, ...@@ -427,6 +478,19 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
return NULL; return NULL;
} }
/*
 * udp_gro_complete_segment - finalize a GRO packet built for a plain UDP
 * socket.
 * @skb: the aggregated packet
 *
 * Tags the packet as SKB_GSO_UDP_L4, records how many segments were merged
 * and sets it up as CHECKSUM_PARTIAL against the UDP header's check field.
 * Always returns 0.
 */
static int udp_gro_complete_segment(struct sk_buff *skb)
{
	struct udphdr *udph = udp_hdr(skb);

	/* GSO metadata: one segment per packet counted during aggregation */
	skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

	/* checksum state: start at the UDP header, result stored in ->check */
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum_offset = offsetof(struct udphdr, check);
	skb->csum_start = (unsigned char *)udph - skb->head;

	return 0;
}
int udp_gro_complete(struct sk_buff *skb, int nhoff, int udp_gro_complete(struct sk_buff *skb, int nhoff,
udp_lookup_t lookup) udp_lookup_t lookup)
{ {
...@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, ...@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen; uh->len = newlen;
/* Set encapsulation before calling into inner gro_complete() functions
* to make them set up the inner offsets.
*/
skb->encapsulation = 1;
rcu_read_lock(); rcu_read_lock();
sk = (*lookup)(skb, uh->source, uh->dest); sk = (*lookup)(skb, uh->source, uh->dest);
if (sk && udp_sk(sk)->gro_complete) if (sk && udp_sk(sk)->gro_enabled) {
err = udp_gro_complete_segment(skb);
} else if (sk && udp_sk(sk)->gro_complete) {
skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
: SKB_GSO_UDP_TUNNEL;
/* Set encapsulation before calling into inner gro_complete()
* functions to make them set up the inner offsets.
*/
skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb, err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr)); nhoff + sizeof(struct udphdr));
}
rcu_read_unlock(); rcu_read_unlock();
if (skb->remcsum_offload) if (skb->remcsum_offload)
...@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) ...@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb); const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
if (uh->check) { if (uh->check)
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0); iph->daddr, 0);
} else {
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
} }
......
...@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) ...@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
const struct ipv6hdr *ipv6h = ipv6_hdr(skb); const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
if (uh->check) { if (uh->check)
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
&ipv6h->daddr, 0); &ipv6h->daddr, 0);
} else {
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb); return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment