Commit df82e9c6 authored by David S. Miller's avatar David S. Miller

Merge branch 'udp-gro-L4'

Paolo Abeni says:

====================
udp: GRO L4 improvements

This series improves the UDP L4 - either 'forward' or 'frag_list' -
co-existence with UDP tunnel GRO, allowing the first to take place
correctly even for encapsulated UDP traffic.

The first for patches are mostly bugfixes, addressing some GRO
edge-cases when both tunnels and L4 are present, enabled and in use.

The next 3 patches avoid unneeded segmentation when UDP GRO
traffic traverses in the receive path UDP tunnels.

Finally, some self-tests are included, covering the relevant
GRO scenarios.

Even if most patches are actually bugfixes, this series is
targeting net-next, as overall it makes available a new feature.

v2 -> v3:
 - no code changes, more verbose commit messages and comment in
   patch 1/8

v1 -> v2:
 - restrict post segmentation csum fixup to the only the relevant pkts
 - use individual 'accept_gso_type' fields instead of whole gso bitmask
   (Willem)
 - use only ipv6 addesses from test range in self-tests (Willem)
 - hopefully clarified most individual patches commit messages
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents dc5fa207 a062260a
......@@ -218,6 +218,7 @@ static struct socket *bareudp_create_sock(struct net *net, __be16 port)
if (err < 0)
return ERR_PTR(err);
udp_allow_gso(sock->sk);
return sock;
}
......
......@@ -461,6 +461,7 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
if (err < 0)
return ERR_PTR(err);
udp_allow_gso(sock->sk);
return sock;
}
......
......@@ -3484,6 +3484,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
if (err < 0)
return ERR_PTR(err);
udp_allow_gso(sock->sk);
return sock;
}
......
......@@ -51,7 +51,9 @@ struct udp_sock {
* different encapsulation layer set
* this
*/
gro_enabled:1; /* Can accept GRO packets */
gro_enabled:1, /* Request GRO aggregation */
accept_udp_l4:1,
accept_udp_fraglist:1;
/*
* Following member retains the information to create a UDP header
* when the socket is uncorked.
......@@ -131,8 +133,22 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
{
return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) &&
skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4;
if (!skb_is_gso(skb))
return false;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
return true;
if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
return true;
return false;
}
static inline void udp_allow_gso(struct sock *sk)
{
udp_sk(sk)->accept_udp_l4 = 1;
udp_sk(sk)->accept_udp_fraglist = 1;
}
#define udp_portaddr_for_each_entry(__sk, list) \
......
......@@ -515,6 +515,29 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
return segs;
}
static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
{
/* UDP-lite can't land here - no GRO */
WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov);
/* UDP packets generated with UDP_SEGMENT and traversing:
*
* UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx)
*
* can reach an UDP socket with CHECKSUM_NONE, because
* __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE.
* SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will
* have a valid checksum, as the GRO engine validates the UDP csum
* before the aggregation and nobody strips such info in between.
* Instead of adding another check in the tunnel fastpath, we can force
* a valid csum after the segmentation.
* Additionally fixup the UDP CB.
*/
UDP_SKB_CB(skb)->cscov = skb->len;
if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid)
skb->csum_valid = 1;
}
#ifdef CONFIG_BPF_SYSCALL
struct sk_psock;
struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
......
......@@ -2178,6 +2178,8 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
segs = udp_rcv_segment(sk, skb, true);
skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
udp_post_segment_fix_csum(skb);
ret = udp_queue_rcv_one_skb(sk, skb);
if (ret > 0)
ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
......@@ -2664,9 +2666,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_GRO:
lock_sock(sk);
/* when enabling GRO, accept the related GSO packet type */
if (valbool)
udp_tunnel_encap_enable(sk->sk_socket);
up->gro_enabled = valbool;
up->accept_udp_l4 = valbool;
release_sock(sk);
break;
......
......@@ -515,21 +515,24 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb);
int flush = 1;
/* we can do L4 aggregation only if the packet can't land in a tunnel
* otherwise we could corrupt the inner stream
*/
NAPI_GRO_CB(skb)->is_flist = 0;
if (!sk || !udp_sk(sk)->gro_receive) {
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
(sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
(sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
return pp;
}
if (!sk || NAPI_GRO_CB(skb)->encap_mark ||
if (NAPI_GRO_CB(skb)->encap_mark ||
(uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid) ||
!udp_sk(sk)->gro_receive)
!NAPI_GRO_CB(skb)->csum_valid))
goto out;
/* mark that this skb passed once through the tunnel gro layer */
......@@ -639,6 +642,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
: SKB_GSO_UDP_TUNNEL;
/* clear the encap mark, so that inner frag_list gro_complete
* can take place
*/
NAPI_GRO_CB(skb)->encap_mark = 0;
/* Set encapsulation before calling into inner gro_complete()
* functions to make them set up the inner offsets.
*/
......@@ -662,7 +670,8 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
if (NAPI_GRO_CB(skb)->is_flist) {
/* do fraglist only if there is no outer UDP encap (or we already processed it) */
if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
uh->len = htons(skb->len - nhoff);
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
......
......@@ -749,6 +749,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
udp_post_segment_fix_csum(skb);
ret = udpv6_queue_rcv_one_skb(sk, skb);
if (ret > 0)
ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
......
......@@ -163,7 +163,8 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
if (NAPI_GRO_CB(skb)->is_flist) {
/* do fraglist only if there is no outer UDP encap (or we already processed it) */
if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
uh->len = htons(skb->len - nhoff);
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
......
......@@ -23,6 +23,7 @@ TEST_PROGS += drop_monitor_tests.sh
TEST_PROGS += vrf_route_leaking.sh
TEST_PROGS += bareudp.sh
TEST_PROGS += unicast_extensions.sh
TEST_PROGS += udpgro_fwd.sh
TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket nettest
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
......
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
readonly BASE="ns-$(mktemp -u XXXXXX)"
readonly SRC=2
readonly DST=1
readonly DST_NAT=100
readonly NS_SRC=$BASE$SRC
readonly NS_DST=$BASE$DST
# "baremetal" network used for raw UDP traffic
readonly BM_NET_V4=192.168.1.
readonly BM_NET_V6=2001:db8::
# "overlay" network used for UDP over UDP tunnel traffic
readonly OL_NET_V4=172.16.1.
readonly OL_NET_V6=2001:db8:1::
readonly NPROCS=`nproc`
cleanup() {
local ns
local -r jobs="$(jobs -p)"
[ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
for ns in $NS_SRC $NS_DST; do
ip netns del $ns 2>/dev/null
done
}
trap cleanup EXIT
create_ns() {
local net
local ns
for ns in $NS_SRC $NS_DST; do
ip netns add $ns
ip -n $ns link set dev lo up
done
ip link add name veth$SRC type veth peer name veth$DST
for ns in $SRC $DST; do
ip link set dev veth$ns netns $BASE$ns
ip -n $BASE$ns link set dev veth$ns up
ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24
ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad
done
ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null
}
create_vxlan_endpoint() {
local -r netns=$1
local -r bm_dev=$2
local -r bm_rem_addr=$3
local -r vxlan_dev=$4
local -r vxlan_id=$5
local -r vxlan_port=4789
ip -n $netns link set dev $bm_dev up
ip -n $netns link add dev $vxlan_dev type vxlan id $vxlan_id \
dstport $vxlan_port remote $bm_rem_addr
ip -n $netns link set dev $vxlan_dev up
}
create_vxlan_pair() {
local ns
create_ns
for ns in $SRC $DST; do
# note that 3 - $SRC == $DST and 3 - $DST == $SRC
create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V4$((3 - $ns)) vxlan$ns 4
ip -n $BASE$ns addr add dev vxlan$ns $OL_NET_V4$ns/24
done
for ns in $SRC $DST; do
create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V6$((3 - $ns)) vxlan6$ns 6
ip -n $BASE$ns addr add dev vxlan6$ns $OL_NET_V6$ns/24 nodad
done
}
is_ipv6() {
if [[ $1 =~ .*:.* ]]; then
return 0
fi
return 1
}
run_test() {
local -r msg=$1
local -r dst=$2
local -r pkts=$3
local -r vxpkts=$4
local bind=$5
local rx_args=""
local rx_family="-4"
local family=-4
local filter=IpInReceives
local ipt=iptables
printf "%-40s" "$msg"
if is_ipv6 $dst; then
# rx program does not support '-6' and implies ipv6 usage by default
rx_family=""
family=-6
filter=Ip6InReceives
ipt=ip6tables
fi
rx_args="$rx_family"
[ -n "$bind" ] && rx_args="$rx_args -b $bind"
# send a single GSO packet, segmented in 10 UDP frames.
# Always expect 10 UDP frames on RX side as rx socket does
# not enable GRO
ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 4789
ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 8000
ip netns exec $NS_DST ./udpgso_bench_rx -C 1000 -R 10 -n 10 -l 1300 $rx_args &
local spid=$!
sleep 0.1
ip netns exec $NS_SRC ./udpgso_bench_tx $family -M 1 -s 13000 -S 1300 -D $dst
local retc=$?
wait $spid
local rets=$?
if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
echo " fail client exit code $retc, server $rets"
ret=1
return
fi
local rcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 8000' | \
sed -e 's/\[//' -e 's/:.*//'`
if [ $rcv != $pkts ]; then
echo " fail - received $rvs packets, expected $pkts"
ret=1
return
fi
local vxrcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 4789' | \
sed -e 's/\[//' -e 's/:.*//'`
# upper net can generate a little noise, allow some tolerance
if [ $vxrcv -lt $vxpkts -o $vxrcv -gt $((vxpkts + 3)) ]; then
echo " fail - received $vxrcv vxlan packets, expected $vxpkts"
ret=1
return
fi
echo " ok"
}
run_bench() {
local -r msg=$1
local -r dst=$2
local family=-4
printf "%-40s" "$msg"
if [ $NPROCS -lt 2 ]; then
echo " skip - needed 2 CPUs found $NPROCS"
return
fi
is_ipv6 $dst && family=-6
# bind the sender and the receiver to different CPUs to try
# get reproducible results
ip netns exec $NS_DST bash -c "echo 2 > /sys/class/net/veth$DST/queues/rx-0/rps_cpus"
ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 1000 -R 10 &
local spid=$!
sleep 0.1
ip netns exec $NS_SRC taskset 0x1 ./udpgso_bench_tx $family -l 3 -S 1300 -D $dst
local retc=$?
wait $spid
local rets=$?
if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
echo " fail client exit code $retc, server $rets"
ret=1
return
fi
}
for family in 4 6; do
BM_NET=$BM_NET_V4
OL_NET=$OL_NET_V4
IPT=iptables
SUFFIX=24
VXDEV=vxlan
if [ $family = 6 ]; then
BM_NET=$BM_NET_V6
OL_NET=$OL_NET_V6
SUFFIX="64 nodad"
VXDEV=vxlan6
IPT=ip6tables
fi
echo "IPv$family"
create_ns
run_test "No GRO" $BM_NET$DST 10 0
cleanup
create_ns
ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
run_test "GRO frag list" $BM_NET$DST 1 0
cleanup
# UDP GRO fwd skips aggregation when find an udp socket with the GRO option
# if there is an UDP tunnel in the running system, such lookup happen
# take place.
# use NAT to circumvent GRO FWD check
create_ns
ip -n $NS_DST addr add dev veth$DST $BM_NET$DST_NAT/$SUFFIX
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $BM_NET$DST_NAT \
-j DNAT --to-destination $BM_NET$DST
run_test "GRO fwd" $BM_NET$DST_NAT 1 0 $BM_NET$DST
cleanup
create_ns
run_bench "UDP fwd perf" $BM_NET$DST
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
run_bench "UDP GRO fwd perf" $BM_NET$DST
cleanup
create_vxlan_pair
ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1
cleanup
# use NAT to circumvent GRO FWD check
create_vxlan_pair
ip -n $NS_DST addr add dev $VXDEV$DST $OL_NET$DST_NAT/$SUFFIX
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $OL_NET$DST_NAT \
-j DNAT --to-destination $OL_NET$DST
# load arp cache before running the test to reduce the amount of
# stray traffic on top of the UDP tunnel
ip netns exec $NS_SRC ping -q -c 1 $OL_NET$DST_NAT >/dev/null
run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST
cleanup
create_vxlan_pair
run_bench "UDP tunnel fwd perf" $OL_NET$DST
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST
cleanup
done
exit $ret
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment