Commit 94c59aab authored by Daniel Borkmann

Merge branch 'bpf-l2-encap'

Alan Maguire says:

====================
Extend bpf_skb_adjust_room growth to mark inner MAC header so
that L2 encapsulation can be used for tc tunnels.

Patch #1 extends the existing test_tc_tunnel to support UDP
encapsulation; later we want to be able to test MPLS over UDP
and MPLS over GRE encapsulation.

Patch #2 adds the BPF_F_ADJ_ROOM_ENCAP_L2(len) macro, which
allows specification of the inner MAC header length.  Other
approaches were explored before settling on this one.
Specifically, I tried automatically computing the inner MAC
length from the specified flags (so the inner MAC length for
GRE/IPv4 encap would be, for example, the len_diff passed to
bpf_skb_adjust_room minus the GRE and IPv4 header lengths).
The problem with this is that we don't know for sure what form
of GRE/UDP header we have: is it a full GRE header, a FOU UDP
header, or a generic UDP encap header?  My fear was that we'd
end up with an explosion of flags.  The other approach tried
was to support inner L2 header marking as a separate room
adjustment, i.e. adjust for L3/L4 encap first, then call
bpf_skb_adjust_room again for the L2 encap.  This can be made
to work, but because it imposes an order on operations it felt
a bit clunky.
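
For illustration, a minimal usage sketch (not part of the series: the
section name, program name and header sizes below are made up, and
outer header construction via bpf_skb_store_bytes() is elided; see
test_tc_tunnel.c for a complete version) of a tc program pushing
MPLS over GRE with the new flag:

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

SEC("encap_example")
int encap_mpls_over_gre(struct __sk_buff *skb)
{
	const int gre_len = 4;		/* basic GRE header, no options */
	const int mpls_len = 4;		/* one MPLS label is the inner "MAC" */
	int olen = sizeof(struct iphdr) + gre_len + mpls_len;
	__u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO |
		      BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
		      BPF_F_ADJ_ROOM_ENCAP_L4_GRE |
		      BPF_F_ADJ_ROOM_ENCAP_L2(mpls_len);

	/* grow room between MAC and network headers; the kernel marks the
	 * inner MAC header mpls_len bytes before the inner network header,
	 * which keeps GSO working for the L2-encapsulated packet.
	 */
	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
		return TC_ACT_SHOT;

	/* ... build and store the outer IPv4 + GRE + MPLS headers with
	 * bpf_skb_store_bytes() ...
	 */
	return TC_ACT_OK;
}

char __license[] SEC("license") = "GPL";

Folding the inner L2 length into the flags argument this way avoids
both the flag explosion and the two-call ordering discussed above.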

Patch #3 syncs tools/ bpf.h.

Patch #4 extends the tests again to support MPLSoverGRE,
MPLSoverUDP, and transparent Ethernet bridging (TEB), where
the inner L2 header is an Ethernet header.  Testing of BPF
encap against kernel tunnels is done for cases where
configuration of such tunnels is possible (MPLSoverGRE[6],
MPLSoverUDP, gre[6]tap), and skipped otherwise.  Testing of
BPF encap/decap is always carried out.

Changes since v2:
 - updated tools/testing/selftests/bpf/config with FOU/MPLS CONFIG
   options (patches 1, 4)
 - reduced noise in patch 1 by avoiding unnecessary movement of code
 - eliminated inner_mac variable in bpf_skb_net_grow (patch 2)

Changes since v1:
 - fixed formatting of commit references.
 - BPF_F_ADJ_ROOM_FIXED_GSO flag enabled on all variants (patch 1)
 - fixed fou6 options for UDP encap; the checksum errors observed were
   due to the fou6 tunnel not being set up with the correct ipproto
   options (41 -6).  Zero checksums work fine (patch 1)
 - added definitions for mask and shift used in setting L2 length
   (patch 2)
 - allow udp encap with fixed GSO (patch 2)
 - changed "elen" to "l2_len" to be more descriptive (patch 4)
====================
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents c695865c 3ec61df8
@@ -1523,6 +1523,10 @@ union bpf_attr {
  *		* **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
  *		  Use with ENCAP_L3 flags to further specify the tunnel type.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L2(len)**:
+ *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
+ *		  type; **len** is the length of the inner MAC header.
+ *
  *		A call to this helper is susceptible to change the underlying
  *		packet buffer. Therefore, at load time, all checks on pointers
  *		previously done by the verifier are invalidated and must be
@@ -2664,10 +2668,16 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_adjust_room flags. */
 #define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)
 
+#define BPF_ADJ_ROOM_ENCAP_L2_MASK	0xff
+#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT	56
+
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)
+#define BPF_F_ADJ_ROOM_ENCAP_L2(len)	(((__u64)len & \
+					  BPF_ADJ_ROOM_ENCAP_L2_MASK) \
+					 << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
...
@@ -2969,11 +2969,14 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
 #define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO | \
 					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
 					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
-					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+					 BPF_F_ADJ_ROOM_ENCAP_L2( \
+					  BPF_ADJ_ROOM_ENCAP_L2_MASK))
 
 static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 			    u64 flags)
 {
+	u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
 	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
 	u16 mac_len = 0, inner_net = 0, inner_trans = 0;
 	unsigned int gso_type = SKB_GSO_DODGY;
@@ -3008,6 +3011,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 		mac_len = skb->network_header - skb->mac_header;
 		inner_net = skb->network_header;
+		if (inner_mac_len > len_diff)
+			return -EINVAL;
 		inner_trans = skb->transport_header;
 	}
@@ -3016,8 +3021,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 		return ret;
 
 	if (encap) {
-		/* inner mac == inner_net on l3 encap */
-		skb->inner_mac_header = inner_net;
+		skb->inner_mac_header = inner_net - inner_mac_len;
 		skb->inner_network_header = inner_net;
 		skb->inner_transport_header = inner_trans;
 		skb_set_inner_protocol(skb, skb->protocol);
@@ -3031,7 +3035,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 			gso_type |= SKB_GSO_GRE;
 		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
 			gso_type |= SKB_GSO_IPXIP6;
-		else
+		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
 			gso_type |= SKB_GSO_IPXIP4;
 
 		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
...
@@ -1523,6 +1523,10 @@ union bpf_attr {
  *		* **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
  *		  Use with ENCAP_L3 flags to further specify the tunnel type.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L2(len)**:
+ *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
+ *		  type; **len** is the length of the inner MAC header.
+ *
  *		A call to this helper is susceptible to change the underlying
  *		packet buffer. Therefore, at load time, all checks on pointers
  *		previously done by the verifier are invalidated and must be
@@ -2664,10 +2668,16 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_adjust_room flags. */
 #define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)
 
+#define BPF_ADJ_ROOM_ENCAP_L2_MASK	0xff
+#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT	56
+
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)
+#define BPF_F_ADJ_ROOM_ENCAP_L2(len)	(((__u64)len & \
+					  BPF_ADJ_ROOM_ENCAP_L2_MASK) \
+					 << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
...
@@ -25,3 +25,11 @@ CONFIG_XDP_SOCKETS=y
 CONFIG_FTRACE_SYSCALLS=y
 CONFIG_IPV6_TUNNEL=y
 CONFIG_IPV6_GRE=y
+CONFIG_NET_FOU=m
+CONFIG_NET_FOU_IP_TUNNELS=y
+CONFIG_IPV6_FOU=m
+CONFIG_IPV6_FOU_TUNNEL=m
+CONFIG_MPLS=y
+CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
+CONFIG_MPLS_IPTUNNEL=m
@@ -11,7 +11,9 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/mpls.h>
 #include <linux/tcp.h>
+#include <linux/udp.h>
 #include <linux/pkt_cls.h>
 #include <linux/types.h>
@@ -20,16 +22,36 @@
 static const int cfg_port = 8000;
 
-struct grev4hdr {
-	struct iphdr ip;
+static const int cfg_udp_src = 20000;
+
+#define UDP_PORT		5555
+#define MPLS_OVER_UDP_PORT	6635
+#define ETH_OVER_UDP_PORT	7777
+
+/* MPLS label 1000 with S bit (last label) set and ttl of 255. */
+static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
+						     MPLS_LS_S_MASK | 0xff);
+
+struct gre_hdr {
 	__be16 flags;
 	__be16 protocol;
 } __attribute__((packed));
 
-struct grev6hdr {
+union l4hdr {
+	struct udphdr udp;
+	struct gre_hdr gre;
+};
+
+struct v4hdr {
+	struct iphdr ip;
+	union l4hdr l4hdr;
+	__u8 pad[16];			/* enough space for L2 header */
+} __attribute__((packed));
+
+struct v6hdr {
 	struct ipv6hdr ip;
-	__be16 flags;
-	__be16 protocol;
+	union l4hdr l4hdr;
+	__u8 pad[16];			/* enough space for L2 header */
 } __attribute__((packed));
 
 static __always_inline void set_ipv4_csum(struct iphdr *iph)
@@ -47,13 +69,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
 	iph->check = ~((csum & 0xffff) + (csum >> 16));
 }
 
-static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre)
+static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
+				      __u16 l2_proto)
 {
-	struct grev4hdr h_outer;
+	__u16 udp_dst = UDP_PORT;
 	struct iphdr iph_inner;
+	struct v4hdr h_outer;
 	struct tcphdr tcph;
+	int olen, l2_len;
 	__u64 flags;
-	int olen;
 
 	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
 			       sizeof(iph_inner)) < 0)
@@ -70,14 +94,59 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre)
 	if (tcph.dest != __bpf_constant_htons(cfg_port))
 		return TC_ACT_OK;
 
+	olen = sizeof(h_outer.ip);
+	l2_len = 0;
+
 	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4;
-	if (with_gre) {
+
+	switch (l2_proto) {
+	case ETH_P_MPLS_UC:
+		l2_len = sizeof(mpls_label);
+		udp_dst = MPLS_OVER_UDP_PORT;
+		break;
+	case ETH_P_TEB:
+		l2_len = ETH_HLEN;
+		udp_dst = ETH_OVER_UDP_PORT;
+		break;
+	}
+	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
+
+	switch (encap_proto) {
+	case IPPROTO_GRE:
 		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
-		olen = sizeof(h_outer);
-	} else {
-		olen = sizeof(h_outer.ip);
+		olen += sizeof(h_outer.l4hdr.gre);
+		h_outer.l4hdr.gre.protocol = bpf_htons(l2_proto);
+		h_outer.l4hdr.gre.flags = 0;
+		break;
+	case IPPROTO_UDP:
+		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP;
+		olen += sizeof(h_outer.l4hdr.udp);
+		h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
+		h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
+		h_outer.l4hdr.udp.check = 0;
+		h_outer.l4hdr.udp.len = bpf_htons(bpf_ntohs(iph_inner.tot_len) +
+						  sizeof(h_outer.l4hdr.udp) +
+						  l2_len);
+		break;
+	case IPPROTO_IPIP:
+		break;
+	default:
+		return TC_ACT_OK;
 	}
 
+	/* add L2 encap (if specified) */
+	switch (l2_proto) {
+	case ETH_P_MPLS_UC:
+		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+		break;
+	case ETH_P_TEB:
+		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
+				       ETH_HLEN))
+			return TC_ACT_SHOT;
+		break;
+	}
+	olen += l2_len;
+
 	/* add room between mac and network header */
 	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
 		return TC_ACT_SHOT;
@@ -85,16 +154,10 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre)
 	/* prepare new outer network header */
 	h_outer.ip = iph_inner;
 	h_outer.ip.tot_len = bpf_htons(olen +
-				       bpf_htons(h_outer.ip.tot_len));
-	if (with_gre) {
-		h_outer.ip.protocol = IPPROTO_GRE;
-		h_outer.protocol = bpf_htons(ETH_P_IP);
-		h_outer.flags = 0;
-	} else {
-		h_outer.ip.protocol = IPPROTO_IPIP;
-	}
+				       bpf_ntohs(h_outer.ip.tot_len));
+	h_outer.ip.protocol = encap_proto;
 
-	set_ipv4_csum((void *)&h_outer.ip);
+	set_ipv4_csum(&h_outer.ip);
 
 	/* store new outer network header */
 	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
@@ -104,13 +167,16 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre)
 	return TC_ACT_OK;
 }
 
-static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre)
+static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
+				      __u16 l2_proto)
 {
+	__u16 udp_dst = UDP_PORT;
 	struct ipv6hdr iph_inner;
-	struct grev6hdr h_outer;
+	struct v6hdr h_outer;
 	struct tcphdr tcph;
+	int olen, l2_len;
+	__u16 tot_len;
 	__u64 flags;
-	int olen;
 
 	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
 			       sizeof(iph_inner)) < 0)
@@ -124,14 +190,58 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre)
 	if (tcph.dest != __bpf_constant_htons(cfg_port))
 		return TC_ACT_OK;
 
+	olen = sizeof(h_outer.ip);
+	l2_len = 0;
+
 	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
-	if (with_gre) {
+
+	switch (l2_proto) {
+	case ETH_P_MPLS_UC:
+		l2_len = sizeof(mpls_label);
+		udp_dst = MPLS_OVER_UDP_PORT;
+		break;
+	case ETH_P_TEB:
+		l2_len = ETH_HLEN;
+		udp_dst = ETH_OVER_UDP_PORT;
+		break;
+	}
+	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
+
+	switch (encap_proto) {
+	case IPPROTO_GRE:
 		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
-		olen = sizeof(h_outer);
-	} else {
-		olen = sizeof(h_outer.ip);
+		olen += sizeof(h_outer.l4hdr.gre);
+		h_outer.l4hdr.gre.protocol = bpf_htons(l2_proto);
+		h_outer.l4hdr.gre.flags = 0;
+		break;
+	case IPPROTO_UDP:
+		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP;
+		olen += sizeof(h_outer.l4hdr.udp);
+		h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
+		h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
+		tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
+			  sizeof(h_outer.l4hdr.udp);
+		h_outer.l4hdr.udp.check = 0;
+		h_outer.l4hdr.udp.len = bpf_htons(tot_len);
+		break;
+	case IPPROTO_IPV6:
+		break;
+	default:
+		return TC_ACT_OK;
 	}
 
+	/* add L2 encap (if specified) */
+	switch (l2_proto) {
+	case ETH_P_MPLS_UC:
+		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+		break;
+	case ETH_P_TEB:
+		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
+				       ETH_HLEN))
+			return TC_ACT_SHOT;
+		break;
+	}
+	olen += l2_len;
+
 	/* add room between mac and network header */
 	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
@@ -141,13 +251,8 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre)
 	h_outer.ip = iph_inner;
 	h_outer.ip.payload_len = bpf_htons(olen +
 					   bpf_ntohs(h_outer.ip.payload_len));
-	if (with_gre) {
-		h_outer.ip.nexthdr = IPPROTO_GRE;
-		h_outer.protocol = bpf_htons(ETH_P_IPV6);
-		h_outer.flags = 0;
-	} else {
-		h_outer.ip.nexthdr = IPPROTO_IPV6;
-	}
+
+	h_outer.ip.nexthdr = encap_proto;
 
 	/* store new outer network header */
 	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
@@ -157,54 +262,168 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre)
 	return TC_ACT_OK;
 }
 
-SEC("encap_ipip")
-int __encap_ipip(struct __sk_buff *skb)
+SEC("encap_ipip_none")
+int __encap_ipip_none(struct __sk_buff *skb)
 {
 	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
-		return encap_ipv4(skb, false);
+		return encap_ipv4(skb, IPPROTO_IPIP, ETH_P_IP);
 	else
 		return TC_ACT_OK;
 }
 
-SEC("encap_gre")
-int __encap_gre(struct __sk_buff *skb)
+SEC("encap_gre_none")
+int __encap_gre_none(struct __sk_buff *skb)
 {
 	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
-		return encap_ipv4(skb, true);
+		return encap_ipv4(skb, IPPROTO_GRE, ETH_P_IP);
 	else
 		return TC_ACT_OK;
 }
 
-SEC("encap_ip6tnl")
-int __encap_ip6tnl(struct __sk_buff *skb)
+SEC("encap_gre_mpls")
+int __encap_gre_mpls(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+		return encap_ipv4(skb, IPPROTO_GRE, ETH_P_MPLS_UC);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_gre_eth")
+int __encap_gre_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+		return encap_ipv4(skb, IPPROTO_GRE, ETH_P_TEB);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_udp_none")
+int __encap_udp_none(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+		return encap_ipv4(skb, IPPROTO_UDP, ETH_P_IP);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_udp_mpls")
+int __encap_udp_mpls(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+		return encap_ipv4(skb, IPPROTO_UDP, ETH_P_MPLS_UC);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_udp_eth")
+int __encap_udp_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+		return encap_ipv4(skb, IPPROTO_UDP, ETH_P_TEB);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_ip6tnl_none")
+int __encap_ip6tnl_none(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+		return encap_ipv6(skb, IPPROTO_IPV6, ETH_P_IPV6);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_ip6gre_none")
+int __encap_ip6gre_none(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+		return encap_ipv6(skb, IPPROTO_GRE, ETH_P_IPV6);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_ip6gre_mpls")
+int __encap_ip6gre_mpls(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+		return encap_ipv6(skb, IPPROTO_GRE, ETH_P_MPLS_UC);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_ip6gre_eth")
+int __encap_ip6gre_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+		return encap_ipv6(skb, IPPROTO_GRE, ETH_P_TEB);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_ip6udp_none")
+int __encap_ip6udp_none(struct __sk_buff *skb)
 {
 	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
-		return encap_ipv6(skb, false);
+		return encap_ipv6(skb, IPPROTO_UDP, ETH_P_IPV6);
 	else
 		return TC_ACT_OK;
 }
 
-SEC("encap_ip6gre")
-int __encap_ip6gre(struct __sk_buff *skb)
+SEC("encap_ip6udp_mpls")
+int __encap_ip6udp_mpls(struct __sk_buff *skb)
 {
 	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
-		return encap_ipv6(skb, true);
+		return encap_ipv6(skb, IPPROTO_UDP, ETH_P_MPLS_UC);
+	else
+		return TC_ACT_OK;
+}
+
+SEC("encap_ip6udp_eth")
+int __encap_ip6udp_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+		return encap_ipv6(skb, IPPROTO_UDP, ETH_P_TEB);
 	else
 		return TC_ACT_OK;
 }
 
 static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
 {
-	char buf[sizeof(struct grev6hdr)];
-	int olen;
+	char buf[sizeof(struct v6hdr)];
+	struct gre_hdr greh;
+	struct udphdr udph;
+	int olen = len;
 
 	switch (proto) {
 	case IPPROTO_IPIP:
 	case IPPROTO_IPV6:
-		olen = len;
 		break;
 	case IPPROTO_GRE:
-		olen = len + 4 /* gre hdr */;
+		olen += sizeof(struct gre_hdr);
+		if (bpf_skb_load_bytes(skb, off + len, &greh, sizeof(greh)) < 0)
+			return TC_ACT_OK;
+		switch (bpf_ntohs(greh.protocol)) {
+		case ETH_P_MPLS_UC:
+			olen += sizeof(mpls_label);
+			break;
+		case ETH_P_TEB:
+			olen += ETH_HLEN;
+			break;
+		}
+		break;
+	case IPPROTO_UDP:
+		olen += sizeof(struct udphdr);
+		if (bpf_skb_load_bytes(skb, off + len, &udph, sizeof(udph)) < 0)
+			return TC_ACT_OK;
+		switch (bpf_ntohs(udph.dest)) {
+		case MPLS_OVER_UDP_PORT:
+			olen += sizeof(mpls_label);
+			break;
+		case ETH_OVER_UDP_PORT:
+			olen += ETH_HLEN;
+			break;
+		}
 		break;
 	default:
 		return TC_ACT_OK;
...
@@ -15,6 +15,12 @@ readonly ns2_v4=192.168.1.2
 readonly ns1_v6=fd::1
 readonly ns2_v6=fd::2
 
+# Must match port used by bpf program
+readonly udpport=5555
+
+# MPLSoverUDP
+readonly mplsudpport=6635
+readonly mplsproto=137
+
 readonly infile="$(mktemp)"
 readonly outfile="$(mktemp)"
@@ -38,8 +44,8 @@ setup() {
 	# clamp route to reserve room for tunnel headers
 	ip -netns "${ns1}" -4 route flush table main
 	ip -netns "${ns1}" -6 route flush table main
-	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1476 dev veth1
-	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1456 dev veth1
+	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1
+	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1
 
 	sleep 1
@@ -86,30 +92,44 @@ set -e
 # no arguments: automated test, run all
 if [[ "$#" -eq "0" ]]; then
 	echo "ipip"
-	$0 ipv4 ipip 100
+	$0 ipv4 ipip none 100
 
 	echo "ip6ip6"
-	$0 ipv6 ip6tnl 100
-
-	echo "ip gre"
-	$0 ipv4 gre 100
-
-	echo "ip6 gre"
-	$0 ipv6 ip6gre 100
-
-	echo "ip gre gso"
-	$0 ipv4 gre 2000
-
-	echo "ip6 gre gso"
-	$0 ipv6 ip6gre 2000
+	$0 ipv6 ip6tnl none 100
+
+	for mac in none mpls eth ; do
+		echo "ip gre $mac"
+		$0 ipv4 gre $mac 100
+
+		echo "ip6 gre $mac"
+		$0 ipv6 ip6gre $mac 100
+
+		echo "ip gre $mac gso"
+		$0 ipv4 gre $mac 2000
+
+		echo "ip6 gre $mac gso"
+		$0 ipv6 ip6gre $mac 2000
+
+		echo "ip udp $mac"
+		$0 ipv4 udp $mac 100
+
+		echo "ip6 udp $mac"
+		$0 ipv6 ip6udp $mac 100
+
+		echo "ip udp $mac gso"
+		$0 ipv4 udp $mac 2000
+
+		echo "ip6 udp $mac gso"
+		$0 ipv6 ip6udp $mac 2000
+	done
 
 	echo "OK. All tests passed"
 	exit 0
 fi
 
-if [[ "$#" -ne "3" ]]; then
+if [[ "$#" -ne "4" ]]; then
 	echo "Usage: $0"
-	echo " or: $0 <ipv4|ipv6> <tuntype> <data_len>"
+	echo " or: $0 <ipv4|ipv6> <tuntype> <none|mpls|eth> <data_len>"
 	exit 1
 fi
@@ -117,12 +137,24 @@ case "$1" in
 "ipv4")
 	readonly addr1="${ns1_v4}"
 	readonly addr2="${ns2_v4}"
-	readonly netcat_opt=-4
+	readonly ipproto=4
+	readonly netcat_opt=-${ipproto}
+	readonly foumod=fou
+	readonly foutype=ipip
+	readonly fouproto=4
+	readonly fouproto_mpls=${mplsproto}
+	readonly gretaptype=gretap
 	;;
 "ipv6")
 	readonly addr1="${ns1_v6}"
 	readonly addr2="${ns2_v6}"
-	readonly netcat_opt=-6
+	readonly ipproto=6
+	readonly netcat_opt=-${ipproto}
+	readonly foumod=fou6
+	readonly foutype=ip6tnl
+	readonly fouproto="41 -6"
+	readonly fouproto_mpls="${mplsproto} -6"
+	readonly gretaptype=ip6gretap
 	;;
 *)
 	echo "unknown arg: $1"
@@ -131,9 +163,10 @@ case "$1" in
 esac
 
 readonly tuntype=$2
-readonly datalen=$3
+readonly mac=$3
+readonly datalen=$4
 
-echo "encap ${addr1} to ${addr2}, type ${tuntype}, len ${datalen}"
+echo "encap ${addr1} to ${addr2}, type ${tuntype}, mac ${mac} len ${datalen}"
 
 trap cleanup EXIT
@@ -150,16 +183,63 @@ verify_data
 ip netns exec "${ns1}" tc qdisc add dev veth1 clsact
 ip netns exec "${ns1}" tc filter add dev veth1 egress \
 	bpf direct-action object-file ./test_tc_tunnel.o \
-	section "encap_${tuntype}"
+	section "encap_${tuntype}_${mac}"
 echo "test bpf encap without decap (expect failure)"
 server_listen
 ! client_connect
 
+if [[ "$tuntype" =~ "udp" ]]; then
+	# Set up fou tunnel.
+	ttype="${foutype}"
+	targs="encap fou encap-sport auto encap-dport $udpport"
+	# fou may be a module; allow this to fail.
+	modprobe "${foumod}" ||true
+	if [[ "$mac" == "mpls" ]]; then
+		dport=${mplsudpport}
+		dproto=${fouproto_mpls}
+		tmode="mode any ttl 255"
+	else
+		dport=${udpport}
+		dproto=${fouproto}
+	fi
+	ip netns exec "${ns2}" ip fou add port $dport ipproto ${dproto}
+	targs="encap fou encap-sport auto encap-dport $dport"
+elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
+	ttype=$gretaptype
+else
+	ttype=$tuntype
+	targs=""
+fi
+
 # serverside, insert decap module
 # server is still running
 # client can connect again
-ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \
-	remote "${addr1}" local "${addr2}" ${tmode}
+ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \
+	remote "${addr1}" local "${addr2}" $targs
+
+expect_tun_fail=0
+
+if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
+	# No support for MPLS IPv6 fou tunnel; expect failure.
+	expect_tun_fail=1
+elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
+	# No support for TEB fou tunnel; expect failure.
+	expect_tun_fail=1
+elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
+	# Share ethernet address between tunnel/veth2 so L2 decap works.
+	ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
+		  awk '/ether/ { print $2 }')
+	ip netns exec "${ns2}" ip link set testtun0 address $ethaddr
+elif [[ "$mac" == "mpls" ]]; then
+	modprobe mpls_iptunnel ||true
+	modprobe mpls_gso ||true
+	ip netns exec "${ns2}" sysctl -qw net.mpls.platform_labels=65536
+	ip netns exec "${ns2}" ip -f mpls route add 1000 dev lo
+	ip netns exec "${ns2}" ip link set lo up
+	ip netns exec "${ns2}" sysctl -qw net.mpls.conf.testtun0.input=1
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.lo.rp_filter=0
+fi
+
 # Because packets are decapped by the tunnel they arrive on testtun0 from
 # the IP stack perspective. Ensure reverse path filtering is disabled
 # otherwise we drop the TCP SYN as arriving on testtun0 instead of the
@@ -169,16 +249,22 @@ ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
 # selected as the max of the "all" and device-specific values.
 ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0
 ip netns exec "${ns2}" ip link set dev testtun0 up
-echo "test bpf encap with tunnel device decap"
-client_connect
-verify_data
+if [[ "$expect_tun_fail" == 1 ]]; then
+	# This tunnel mode is not supported, so we expect failure.
+	echo "test bpf encap with tunnel device decap (expect failure)"
+	! client_connect
+else
+	echo "test bpf encap with tunnel device decap"
+	client_connect
+	verify_data
+	server_listen
+fi
 
 # serverside, use BPF for decap
 ip netns exec "${ns2}" ip link del dev testtun0
 ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
 ip netns exec "${ns2}" tc filter add dev veth2 ingress \
 	bpf direct-action object-file ./test_tc_tunnel.o section decap
-server_listen
 echo "test bpf encap with bpf decap"
 client_connect
 verify_data
...