Commit 87486b23 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'lwt_encap_ip'

Peter Oskolkov says:

====================
This patchset implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

V2 changes: added flowi-based route lookup, IPv6 encapping, and
   encapping on ingress.

V3 changes: incorporated David Ahern's suggestions:
   - added l3mdev check/oif (patch 2)
   - sync bpf.h from include/uapi into tools/include/uapi
   - selftest tweaks

V4 changes: moved route lookup/dst change from bpf_push_ip_encap
   to when BPF_LWT_REROUTE is handled, as suggested by David Ahern.

V5 changes: added a check in lwt_xmit that skb->protocol stays the
   same if the skb is to be passed back to the stack (ret == BPF_OK).
   Again, suggested by David Ahern.

V6 changes: abandoned.

V7 changes: added handling of GSO packets (patch 3 in the patchset added),
   as suggested by BPF maintainers.

V8 changes:
   - fixed build errors when LWT or IPV6 are not enabled;
   - whitelisted TCP GSO instead of blacklisting SCTP and UDP GSO, as
     suggested by Willem de Bruijn;
   - added validation that pushed length cover needed headers when GRE/UDP
     encap is detected, as suggested by Willem de Bruijn;
   - a couple of minor/stylistic tweaks/fixed typos.

V9 changes:
   - fixed a kbuild test robot compiler warning;
   - added ipv6_route_input to ipv6_stub (patch 4 in the patchset
     added), and IPv6 routing functions are now invoked via ipv6_stub,
     as suggested by David Ahern.

V10 changes:
   - removed unnecessary IS_ENABLED and pr_warn_once from patch 5.

V11 changes: fixed a potential dst leak in patch 5, as suggested by
    David Ahern.
====================
Reviewed-by: default avatarDavid Ahern <dsahern@gmail.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents dd27c2e3 0fde56e4
......@@ -248,6 +248,7 @@ struct ipv6_stub {
const struct in6_addr *addr);
int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
struct dst_entry **dst, struct flowi6 *fl6);
int (*ipv6_route_input)(struct sk_buff *skb);
struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
......
......@@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int lwtunnel_input(struct sk_buff *skb);
int lwtunnel_xmit(struct sk_buff *skb);
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
bool ingress);
static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
......
......@@ -2016,6 +2016,19 @@ union bpf_attr {
* Only works if *skb* contains an IPv6 packet. Insert a
* Segment Routing Header (**struct ipv6_sr_hdr**) inside
* the IPv6 header.
* **BPF_LWT_ENCAP_IP**
* IP encapsulation (GRE/GUE/IPIP/etc). The outer header
* must be IPv4 or IPv6, followed by zero or more
* additional headers, up to LWT_BPF_MAX_HEADROOM total
* bytes in all prepended headers. Please note that
* if skb_is_gso(skb) is true, no more than two headers
* can be prepended, and the inner header, if present,
* should be either GRE or UDP/GUE.
*
* BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
* type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
* by bpf programs of types BPF_PROG_TYPE_LWT_IN and
* BPF_PROG_TYPE_LWT_XMIT.
*
* A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers
......@@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off {
/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
enum bpf_lwt_encap_mode {
BPF_LWT_ENCAP_SEG6,
BPF_LWT_ENCAP_SEG6_INLINE
BPF_LWT_ENCAP_SEG6_INLINE,
BPF_LWT_ENCAP_IP,
};
#define __bpf_md_ptr(type, name) \
......@@ -2606,7 +2620,15 @@ enum bpf_ret_code {
BPF_DROP = 2,
/* 3-6 reserved */
BPF_REDIRECT = 7,
/* >127 are reserved for prog type specific return codes */
/* >127 are reserved for prog type specific return codes.
*
* BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
* BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
* changed and should be routed based on its new L3 header.
* (This is an L3 redirect, as opposed to L2 redirect
* represented by BPF_REDIRECT above).
*/
BPF_LWT_REROUTE = 128,
};
struct bpf_sock {
......
......@@ -73,6 +73,7 @@
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
......@@ -4815,7 +4816,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
}
#endif /* CONFIG_IPV6_SEG6_BPF */
BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
bool ingress)
{
return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
}
#endif
BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
u32, len)
{
switch (type) {
......@@ -4823,14 +4832,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
case BPF_LWT_ENCAP_SEG6:
case BPF_LWT_ENCAP_SEG6_INLINE:
return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
case BPF_LWT_ENCAP_IP:
return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
#endif
default:
return -EINVAL;
}
}
BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
void *, hdr, u32, len)
{
switch (type) {
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
case BPF_LWT_ENCAP_IP:
return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
#endif
default:
return -EINVAL;
}
}
static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
.func = bpf_lwt_push_encap,
static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
.func = bpf_lwt_in_push_encap,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM,
.arg4_type = ARG_CONST_SIZE
};
static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
.func = bpf_lwt_xmit_push_encap,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
......@@ -5417,7 +5453,8 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_lwt_seg6_adjust_srh ||
func == bpf_lwt_seg6_action ||
#endif
func == bpf_lwt_push_encap)
func == bpf_lwt_in_push_encap ||
func == bpf_lwt_xmit_push_encap)
return true;
return false;
......@@ -5815,7 +5852,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_lwt_push_encap:
return &bpf_lwt_push_encap_proto;
return &bpf_lwt_in_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
......@@ -5851,6 +5888,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_l4_csum_replace_proto;
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
case BPF_FUNC_lwt_push_encap:
return &bpf_lwt_xmit_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
......
......@@ -16,6 +16,8 @@
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
......@@ -55,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
switch (ret) {
case BPF_OK:
case BPF_LWT_REROUTE:
break;
case BPF_REDIRECT:
......@@ -87,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
return ret;
}
static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
int err = -EINVAL;
if (skb->protocol == htons(ETH_P_IP)) {
struct iphdr *iph = ip_hdr(skb);
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb_dst(skb)->dev);
} else if (skb->protocol == htons(ETH_P_IPV6)) {
err = ipv6_stub->ipv6_route_input(skb);
} else {
err = -EAFNOSUPPORT;
}
if (err)
goto err;
return dst_input(skb);
err:
kfree_skb(skb);
return err;
}
static int bpf_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
......@@ -98,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)
ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
if (ret < 0)
return ret;
if (ret == BPF_LWT_REROUTE)
return bpf_lwt_input_reroute(skb);
}
if (unlikely(!dst->lwtstate->orig_input)) {
pr_warn_once("orig_input not set on dst for prog %s\n",
bpf->out.name);
kfree_skb(skb);
return -EINVAL;
}
......@@ -147,6 +174,91 @@ static int xmit_check_hhlen(struct sk_buff *skb)
return 0;
}
static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
int oif = l3mdev ? l3mdev->ifindex : 0;
struct dst_entry *dst = NULL;
struct sock *sk;
struct net *net;
bool ipv4;
int err;
if (skb->protocol == htons(ETH_P_IP))
ipv4 = true;
else if (skb->protocol == htons(ETH_P_IPV6))
ipv4 = false;
else
return -EAFNOSUPPORT;
sk = sk_to_full_sk(skb->sk);
if (sk) {
if (sk->sk_bound_dev_if)
oif = sk->sk_bound_dev_if;
net = sock_net(sk);
} else {
net = dev_net(skb_dst(skb)->dev);
}
if (ipv4) {
struct iphdr *iph = ip_hdr(skb);
struct flowi4 fl4 = {};
struct rtable *rt;
fl4.flowi4_oif = oif;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_uid = sock_net_uid(net, sk);
fl4.flowi4_tos = RT_TOS(iph->tos);
fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
fl4.flowi4_proto = iph->protocol;
fl4.daddr = iph->daddr;
fl4.saddr = iph->saddr;
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
return -EINVAL;
dst = &rt->dst;
} else {
struct ipv6hdr *iph6 = ipv6_hdr(skb);
struct flowi6 fl6 = {};
fl6.flowi6_oif = oif;
fl6.flowi6_mark = skb->mark;
fl6.flowi6_uid = sock_net_uid(net, sk);
fl6.flowlabel = ip6_flowinfo(iph6);
fl6.flowi6_proto = iph6->nexthdr;
fl6.daddr = iph6->daddr;
fl6.saddr = iph6->saddr;
err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
if (err || IS_ERR(dst))
return -EINVAL;
}
if (unlikely(dst->error)) {
dst_release(dst);
return -EINVAL;
}
/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
* was done for the previous dst, so we are doing it here again, in
* case the new dst needs much more space. The call below is a noop
* if there is enough header space in skb.
*/
err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
if (unlikely(err))
return err;
skb_dst_drop(skb);
skb_dst_set(skb, dst);
err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
if (unlikely(err))
return err;
/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
return LWTUNNEL_XMIT_DONE;
}
static int bpf_xmit(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
......@@ -154,11 +266,20 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
__be16 proto = skb->protocol;
int ret;
ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
switch (ret) {
case BPF_OK:
/* If the header changed, e.g. via bpf_lwt_push_encap,
* BPF_LWT_REROUTE below should have been used if the
* protocol was also changed.
*/
if (skb->protocol != proto) {
kfree_skb(skb);
return -EINVAL;
}
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
......@@ -169,6 +290,8 @@ static int bpf_xmit(struct sk_buff *skb)
return LWTUNNEL_XMIT_CONTINUE;
case BPF_REDIRECT:
return LWTUNNEL_XMIT_DONE;
case BPF_LWT_REROUTE:
return bpf_lwt_xmit_reroute(skb);
default:
return ret;
}
......@@ -390,6 +513,133 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
.owner = THIS_MODULE,
};
static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
int encap_len)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
gso_type |= SKB_GSO_DODGY;
shinfo->gso_type |= gso_type;
skb_decrease_gso_size(shinfo, encap_len);
shinfo->gso_segs = 0;
return 0;
}
static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
int next_hdr_offset;
void *next_hdr;
__u8 protocol;
/* SCTP and UDP_L4 gso need more nuanced handling than what
* handle_gso_type() does above: skb_decrease_gso_size() is not enough.
* So at the moment only TCP GSO packets are let through.
*/
if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
return -ENOTSUPP;
if (ipv4) {
protocol = ip_hdr(skb)->protocol;
next_hdr_offset = sizeof(struct iphdr);
next_hdr = skb_network_header(skb) + next_hdr_offset;
} else {
protocol = ipv6_hdr(skb)->nexthdr;
next_hdr_offset = sizeof(struct ipv6hdr);
next_hdr = skb_network_header(skb) + next_hdr_offset;
}
switch (protocol) {
case IPPROTO_GRE:
next_hdr_offset += sizeof(struct gre_base_hdr);
if (next_hdr_offset > encap_len)
return -EINVAL;
if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
encap_len);
return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
case IPPROTO_UDP:
next_hdr_offset += sizeof(struct udphdr);
if (next_hdr_offset > encap_len)
return -EINVAL;
if (((struct udphdr *)next_hdr)->check)
return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
encap_len);
return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
case IPPROTO_IP:
case IPPROTO_IPV6:
if (ipv4)
return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
else
return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
default:
return -EPROTONOSUPPORT;
}
}
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
struct iphdr *iph;
bool ipv4;
int err;
if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
return -EINVAL;
/* validate protocol and length */
iph = (struct iphdr *)hdr;
if (iph->version == 4) {
ipv4 = true;
if (unlikely(len < iph->ihl * 4))
return -EINVAL;
} else if (iph->version == 6) {
ipv4 = false;
if (unlikely(len < sizeof(struct ipv6hdr)))
return -EINVAL;
} else {
return -EINVAL;
}
if (ingress)
err = skb_cow_head(skb, len + skb->mac_len);
else
err = skb_cow_head(skb,
len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
if (unlikely(err))
return err;
/* push the encap headers and fix pointers */
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
skb_push(skb, len);
if (ingress)
skb_postpush_rcsum(skb, iph, len);
skb_reset_network_header(skb);
memcpy(skb_network_header(skb), hdr, len);
bpf_compute_data_pointers(skb);
skb_clear_hash(skb);
if (ipv4) {
skb->protocol = htons(ETH_P_IP);
iph = ip_hdr(skb);
if (!iph->check)
iph->check = ip_fast_csum((unsigned char *)iph,
iph->ihl);
} else {
skb->protocol = htons(ETH_P_IPV6);
}
if (skb_is_gso(skb))
return handle_gso_encap(skb, ipv4, len);
return 0;
}
static int __init bpf_lwt_init(void)
{
return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
......
......@@ -134,6 +134,11 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
return -EAFNOSUPPORT;
}
static int eafnosupport_ipv6_route_input(struct sk_buff *skb)
{
return -EAFNOSUPPORT;
}
static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
{
return NULL;
......@@ -170,6 +175,7 @@ eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
.ipv6_route_input = eafnosupport_ipv6_route_input,
.fib6_get_table = eafnosupport_fib6_get_table,
.fib6_table_lookup = eafnosupport_fib6_table_lookup,
.fib6_lookup = eafnosupport_fib6_lookup,
......
......@@ -900,10 +900,17 @@ static struct pernet_operations inet6_net_ops = {
.exit = inet6_net_exit,
};
static int ipv6_route_input(struct sk_buff *skb)
{
ip6_route_input(skb);
return skb_dst(skb)->error;
}
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
.ipv6_dst_lookup = ip6_dst_lookup,
.ipv6_route_input = ipv6_route_input,
.fib6_get_table = fib6_get_table,
.fib6_table_lookup = fib6_table_lookup,
.fib6_lookup = fib6_lookup,
......
......@@ -2016,6 +2016,19 @@ union bpf_attr {
* Only works if *skb* contains an IPv6 packet. Insert a
* Segment Routing Header (**struct ipv6_sr_hdr**) inside
* the IPv6 header.
* **BPF_LWT_ENCAP_IP**
* IP encapsulation (GRE/GUE/IPIP/etc). The outer header
* must be IPv4 or IPv6, followed by zero or more
* additional headers, up to LWT_BPF_MAX_HEADROOM total
* bytes in all prepended headers. Please note that
* if skb_is_gso(skb) is true, no more than two headers
* can be prepended, and the inner header, if present,
* should be either GRE or UDP/GUE.
*
* BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
* type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
* by bpf programs of types BPF_PROG_TYPE_LWT_IN and
* BPF_PROG_TYPE_LWT_XMIT.
*
* A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers
......@@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off {
/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
enum bpf_lwt_encap_mode {
BPF_LWT_ENCAP_SEG6,
BPF_LWT_ENCAP_SEG6_INLINE
BPF_LWT_ENCAP_SEG6_INLINE,
BPF_LWT_ENCAP_IP,
};
#define __bpf_md_ptr(type, name) \
......@@ -2606,7 +2620,15 @@ enum bpf_ret_code {
BPF_DROP = 2,
/* 3-6 reserved */
BPF_REDIRECT = 7,
/* >127 are reserved for prog type specific return codes */
/* >127 are reserved for prog type specific return codes.
*
* BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
* BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
* changed and should be routed based on its new L3 header.
* (This is an L3 redirect, as opposed to L2 redirect
* represented by BPF_REDIRECT above).
*/
BPF_LWT_REROUTE = 128,
};
struct bpf_sock {
......
......@@ -50,7 +50,8 @@ TEST_PROGS := test_kmod.sh \
test_lirc_mode2.sh \
test_skb_cgroup_id.sh \
test_flow_dissector.sh \
test_xdp_vlan.sh
test_xdp_vlan.sh \
test_lwt_ip_encap.sh
TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \
......
// SPDX-License-Identifier: GPL-2.0
#include <stddef.h>
#include <string.h>
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
struct grehdr {
__be16 flags;
__be16 protocol;
};
SEC("encap_gre")
int bpf_lwt_encap_gre(struct __sk_buff *skb)
{
struct encap_hdr {
struct iphdr iph;
struct grehdr greh;
} hdr;
int err;
memset(&hdr, 0, sizeof(struct encap_hdr));
hdr.iph.ihl = 5;
hdr.iph.version = 4;
hdr.iph.ttl = 0x40;
hdr.iph.protocol = 47; /* IPPROTO_GRE */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
hdr.iph.saddr = 0x640110ac; /* 172.16.1.100 */
hdr.iph.daddr = 0x641010ac; /* 172.16.16.100 */
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
hdr.iph.saddr = 0xac100164; /* 172.16.1.100 */
hdr.iph.daddr = 0xac101064; /* 172.16.16.100 */
#else
#error "Fix your compiler's __BYTE_ORDER__?!"
#endif
hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr));
hdr.greh.protocol = skb->protocol;
err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
sizeof(struct encap_hdr));
if (err)
return BPF_DROP;
return BPF_LWT_REROUTE;
}
SEC("encap_gre6")
int bpf_lwt_encap_gre6(struct __sk_buff *skb)
{
struct encap_hdr {
struct ipv6hdr ip6hdr;
struct grehdr greh;
} hdr;
int err;
memset(&hdr, 0, sizeof(struct encap_hdr));
hdr.ip6hdr.version = 6;
hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr));
hdr.ip6hdr.nexthdr = 47; /* IPPROTO_GRE */
hdr.ip6hdr.hop_limit = 0x40;
/* fb01::1 */
hdr.ip6hdr.saddr.s6_addr[0] = 0xfb;
hdr.ip6hdr.saddr.s6_addr[1] = 1;
hdr.ip6hdr.saddr.s6_addr[15] = 1;
/* fb10::1 */
hdr.ip6hdr.daddr.s6_addr[0] = 0xfb;
hdr.ip6hdr.daddr.s6_addr[1] = 0x10;
hdr.ip6hdr.daddr.s6_addr[15] = 1;
hdr.greh.protocol = skb->protocol;
err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
sizeof(struct encap_hdr));
if (err)
return BPF_DROP;
return BPF_LWT_REROUTE;
}
char _license[] SEC("license") = "GPL";
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Setup/topology:
#
# NS1 NS2 NS3
# veth1 <---> veth2 veth3 <---> veth4 (the top route)
# veth5 <---> veth6 veth7 <---> veth8 (the bottom route)
#
# each vethN gets IPv[4|6]_N address
#
# IPv*_SRC = IPv*_1
# IPv*_DST = IPv*_4
#
# all tests test pings from IPv*_SRC to IPv*_DST
#
# by default, routes are configured to allow packets to go
# IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route)
#
# a GRE device is installed in NS3 with IPv*_GRE, and
# NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8
# (the bottom route)
#
# Tests:
#
# 1. routes NS2->IPv*_DST are brought down, so the only way a ping
# from IP*_SRC to IP*_DST can work is via IPv*_GRE
#
# 2a. in an egress test, a bpf LWT_XMIT program is installed on veth1
# that encaps the packets with an IP/GRE header to route to IPv*_GRE
#
# ping: SRC->[encap at veth1:egress]->GRE:decap->DST
# ping replies go DST->SRC directly
#
# 2b. in an ingress test, a bpf LWT_IN program is installed on veth2
# that encaps the packets with an IP/GRE header to route to IPv*_GRE
#
# ping: SRC->[encap at veth2:ingress]->GRE:decap->DST
# ping replies go DST->SRC directly
set -e # exit on error
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root"
echo "FAIL"
exit 1
fi
readonly NS1="ns1-$(mktemp -u XXXXXX)"
readonly NS2="ns2-$(mktemp -u XXXXXX)"
readonly NS3="ns3-$(mktemp -u XXXXXX)"
readonly IPv4_1="172.16.1.100"
readonly IPv4_2="172.16.2.100"
readonly IPv4_3="172.16.3.100"
readonly IPv4_4="172.16.4.100"
readonly IPv4_5="172.16.5.100"
readonly IPv4_6="172.16.6.100"
readonly IPv4_7="172.16.7.100"
readonly IPv4_8="172.16.8.100"
readonly IPv4_GRE="172.16.16.100"
readonly IPv4_SRC=$IPv4_1
readonly IPv4_DST=$IPv4_4
readonly IPv6_1="fb01::1"
readonly IPv6_2="fb02::1"
readonly IPv6_3="fb03::1"
readonly IPv6_4="fb04::1"
readonly IPv6_5="fb05::1"
readonly IPv6_6="fb06::1"
readonly IPv6_7="fb07::1"
readonly IPv6_8="fb08::1"
readonly IPv6_GRE="fb10::1"
readonly IPv6_SRC=$IPv6_1
readonly IPv6_DST=$IPv6_4
setup() {
set -e # exit on error
# create devices and namespaces
ip netns add "${NS1}"
ip netns add "${NS2}"
ip netns add "${NS3}"
ip link add veth1 type veth peer name veth2
ip link add veth3 type veth peer name veth4
ip link add veth5 type veth peer name veth6
ip link add veth7 type veth peer name veth8
ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1
ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1
ip link set veth1 netns ${NS1}
ip link set veth2 netns ${NS2}
ip link set veth3 netns ${NS2}
ip link set veth4 netns ${NS3}
ip link set veth5 netns ${NS1}
ip link set veth6 netns ${NS2}
ip link set veth7 netns ${NS2}
ip link set veth8 netns ${NS3}
# configure addesses: the top route (1-2-3-4)
ip -netns ${NS1} addr add ${IPv4_1}/24 dev veth1
ip -netns ${NS2} addr add ${IPv4_2}/24 dev veth2
ip -netns ${NS2} addr add ${IPv4_3}/24 dev veth3
ip -netns ${NS3} addr add ${IPv4_4}/24 dev veth4
ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1
ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2
ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3
ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4
# configure addresses: the bottom route (5-6-7-8)
ip -netns ${NS1} addr add ${IPv4_5}/24 dev veth5
ip -netns ${NS2} addr add ${IPv4_6}/24 dev veth6
ip -netns ${NS2} addr add ${IPv4_7}/24 dev veth7
ip -netns ${NS3} addr add ${IPv4_8}/24 dev veth8
ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5
ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6
ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7
ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8
ip -netns ${NS1} link set dev veth1 up
ip -netns ${NS2} link set dev veth2 up
ip -netns ${NS2} link set dev veth3 up
ip -netns ${NS3} link set dev veth4 up
ip -netns ${NS1} link set dev veth5 up
ip -netns ${NS2} link set dev veth6 up
ip -netns ${NS2} link set dev veth7 up
ip -netns ${NS3} link set dev veth8 up
# configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default;
# the bottom route to specific bottom addresses
# NS1
# top route
ip -netns ${NS1} route add ${IPv4_2}/32 dev veth1
ip -netns ${NS1} route add default dev veth1 via ${IPv4_2} # go top by default
ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1
ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2} # go top by default
# bottom route
ip -netns ${NS1} route add ${IPv4_6}/32 dev veth5
ip -netns ${NS1} route add ${IPv4_7}/32 dev veth5 via ${IPv4_6}
ip -netns ${NS1} route add ${IPv4_8}/32 dev veth5 via ${IPv4_6}
ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5
ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6}
ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6}
# NS2
# top route
ip -netns ${NS2} route add ${IPv4_1}/32 dev veth2
ip -netns ${NS2} route add ${IPv4_4}/32 dev veth3
ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2
ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3
# bottom route
ip -netns ${NS2} route add ${IPv4_5}/32 dev veth6
ip -netns ${NS2} route add ${IPv4_8}/32 dev veth7
ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6
ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7
# NS3
# top route
ip -netns ${NS3} route add ${IPv4_3}/32 dev veth4
ip -netns ${NS3} route add ${IPv4_1}/32 dev veth4 via ${IPv4_3}
ip -netns ${NS3} route add ${IPv4_2}/32 dev veth4 via ${IPv4_3}
ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4
ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3}
ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3}
# bottom route
ip -netns ${NS3} route add ${IPv4_7}/32 dev veth8
ip -netns ${NS3} route add ${IPv4_5}/32 dev veth8 via ${IPv4_7}
ip -netns ${NS3} route add ${IPv4_6}/32 dev veth8 via ${IPv4_7}
ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8
ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7}
ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7}
# configure IPv4 GRE device in NS3, and a route to it via the "bottom" route
ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255
ip -netns ${NS3} link set gre_dev up
ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev
ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6}
ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8}
# configure IPv6 GRE device in NS3, and a route to it via the "bottom" route
ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255
ip -netns ${NS3} link set gre6_dev up
ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev
ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6}
ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8}
# rp_filter gets confused by what these tests are doing, so disable it
ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0
ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0
ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0
}
cleanup() {
ip netns del ${NS1} 2> /dev/null
ip netns del ${NS2} 2> /dev/null
ip netns del ${NS3} 2> /dev/null
}
trap cleanup EXIT
test_ping() {
local readonly PROTO=$1
local readonly EXPECTED=$2
local RET=0
set +e
if [ "${PROTO}" == "IPv4" ] ; then
ip netns exec ${NS1} ping -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null
RET=$?
elif [ "${PROTO}" == "IPv6" ] ; then
ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null
RET=$?
else
echo "test_ping: unknown PROTO: ${PROTO}"
exit 1
fi
set -e
if [ "0" != "${RET}" ]; then
RET=1
fi
if [ "${EXPECTED}" != "${RET}" ] ; then
echo "FAIL: test_ping: ${RET}"
exit 1
fi
}
test_egress() {
local readonly ENCAP=$1
echo "starting egress ${ENCAP} encap test"
setup
# need to wait a bit for IPv6 to autoconf, otherwise
# ping6 sometimes fails with "unable to bind to address"
# by default, pings work
test_ping IPv4 0
test_ping IPv6 0
# remove NS2->DST routes, ping fails
ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3
ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
test_ping IPv4 1
test_ping IPv6 1
# install replacement routes (LWT/eBPF), pings succeed
if [ "${ENCAP}" == "IPv4" ] ; then
ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
elif [ "${ENCAP}" == "IPv6" ] ; then
ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
else
echo "FAIL: unknown encap ${ENCAP}"
fi
test_ping IPv4 0
test_ping IPv6 0
cleanup
echo "PASS"
}
test_ingress() {
local readonly ENCAP=$1
echo "starting ingress ${ENCAP} encap test"
setup
# need to wait a bit for IPv6 to autoconf, otherwise
# ping6 sometimes fails with "unable to bind to address"
# by default, pings work
test_ping IPv4 0
test_ping IPv6 0
# remove NS2->DST routes, pings fail
ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3
ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
test_ping IPv4 1
test_ping IPv6 1
# install replacement routes (LWT/eBPF), pings succeed
if [ "${ENCAP}" == "IPv4" ] ; then
ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
elif [ "${ENCAP}" == "IPv6" ] ; then
ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
else
echo "FAIL: unknown encap ${ENCAP}"
fi
test_ping IPv4 0
test_ping IPv6 0
cleanup
echo "PASS"
}
test_egress IPv4
test_egress IPv6
test_ingress IPv4
test_ingress IPv6
echo "all tests passed"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment