Commit 60afdf06 authored by Daniel Borkmann

Merge branch 'bpf-veth-xdp-support'

Toshiaki Makita says:

====================
This patch set introduces driver XDP for veth.
Basically this is used in conjunction with the redirect action of
another XDP program.

  NIC -----------> veth===veth
 (XDP) (redirect)        (XDP)

In this case xdp_frame can be forwarded to the peer veth without
modification, so we can expect far better performance than generic XDP.
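
For reference, here is a minimal sketch of the kind of redirecting XDP
program assumed on the NIC side (assuming libbpf's bpf_helpers.h for
SEC(); the veth ifindex is a hypothetical placeholder which a real
program would look up):

  /* Sketch only: redirect every frame to a veth device. */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("xdp")
  int xdp_redirect_to_veth(struct xdp_md *ctx)
  {
          return bpf_redirect(5 /* hypothetical veth ifindex */, 0);
  }

  char _license[] SEC("license") = "GPL";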

Envisioned use-cases
--------------------

* Container managed XDP program
The container host redirects frames to containers via the XDP redirect
action, and privileged containers can deploy their own XDP programs.

* XDP program cascading
Two or more XDP programs can be called for each packet by redirecting
xdp frames to veth.

* Internal interface for an XDP bridge
When using XDP redirection to create a virtual bridge, veth can be used
to create an internal interface for the bridge.

Implementation
--------------

This changeset makes use of NAPI to implement ndo_xdp_xmit and
XDP_TX/REDIRECT. This is mainly because XDP heavily relies on NAPI
context; a rough sketch of the resulting receive path follows the
patch list below.
 - patch 1: Export a function needed for veth XDP.
 - patch 2-3: Basic implementation of veth XDP.
 - patch 4-6: Add ndo_xdp_xmit.
 - patch 7-9: Add XDP_TX and XDP_REDIRECT.
 - patch 10: Performance optimization for multi-queue env.
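
As a rough sketch of the receive path (not the actual veth code; the
veth_rq layout and field names here are assumptions), the peer device
produces xdp_frames into a per-queue ring and a NAPI poll handler
consumes them:

  static int veth_poll_sketch(struct napi_struct *napi, int budget)
  {
          struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi);
          int done = 0;
          void *ptr;

          /* NAPI poll is serialized per queue, so the lockless
           * __ptr_ring_consume() is safe here.
           */
          while (done < budget &&
                 (ptr = __ptr_ring_consume(&rq->xdp_ring))) {
                  struct xdp_frame *frame = ptr;

                  /* Run the attached XDP program on the frame; on
                   * XDP_PASS, build an skb from the frame's page and
                   * hand it to napi_gro_receive().
                   */
                  done++;
          }

          if (done < budget)
                  napi_complete_done(napi, done);

          return done;
  }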

Tests and performance numbers
-----------------------------

Tested with a simple XDP program which only redirects packets between
the NIC and veth. I used an i40e 25G NIC (XXV710) as the physical NIC.
The server has 20 Xeon Silver cores running at 2.20 GHz.

  pktgen --(wire)--> XXV710 (i40e) <--(XDP redirect)--> veth===veth (XDP)

The rightmost veth loads XDP progs and just does DROP or TX. The number
of packets is measured in the XDP progs. The leftmost pktgen sends
packets at 37.1 Mpps (almost 25G wire speed).

veth XDP action    Flows    Mpps
================================
DROP                   1    10.6
DROP                   2    21.2
DROP                 100    36.0
TX                     1     5.0
TX                     2    10.0
TX                   100    31.0

I also measured netperf TCP_STREAM, but performance was not as good
due to the lack of tx/rx checksum offload, TSO, etc.

  netperf <--(wire)--> XXV710 (i40e) <--(XDP redirect)--> veth===veth (XDP PASS)

Direction         Flows   Gbps
==============================
external->veth        1   20.8
external->veth        2   23.5
external->veth      100   23.6
veth->external        1    9.0
veth->external        2   17.8
veth->external      100   22.9

I also tested doing ifup/ifdown and loading/unloading an XDP program
repeatedly while processing XDP packets, in order to check that
enabling/disabling NAPI works as expected, and found no problems.

v8:
- Don't use xdp_frame pointer address to calculate skb->head, headroom,
  and xdp_buff.data_hard_start.

v7:
- Introduce xdp_scrub_frame() to clear kernel pointers in xdp_frame and
  use it instead of memset().

v6:
- Check skb->len only if reallocation is needed.
- Add __GFP_NOWARN to alloc_page() since it can be triggered by external
  events.
- Fix sparse warning around EXPORT_SYMBOL.

v5:
- Fix broken SOBs.

v4:
- Don't adjust MTU automatically.
- Skip peer IFF_UP check on .ndo_xdp_xmit() because it is unnecessary.
  Add comments to explain that.
- Use redirect_info instead of xdp_mem_info for storing no_direct flag
  to avoid per packet copy cost.

v3:
- Drop skb bulk xmit patch since it makes little performance
  difference. The hotspot in TCP skb xmit at this point is checksum
  computation in skb_segment and packet copy on XDP_REDIRECT due to
  cloned/nonlinear skb.
- Fix race on closing device.
- Add extack messages in ndo_bpf.

v2:
- Squash NAPI patch with "Add driver XDP" patch.
- Remove conversion from xdp_frame to skb when NAPI is not enabled.
- Introduce per-queue XDP ring (patch 8).
- Introduce bulk skb xmit when XDP is enabled on the peer (patch 9).
====================
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents c4c20217 638264dc
@@ -537,6 +537,20 @@ struct sk_msg_buff {
 	struct list_head list;
 };
 
+struct bpf_redirect_info {
+	u32 ifindex;
+	u32 flags;
+	struct bpf_map *map;
+	struct bpf_map *map_to_flush;
+	unsigned long map_owner;
+	u32 kern_flags;
+};
+
+DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+
+/* flags for bpf_redirect_info kern_flags */
+#define BPF_RI_F_RF_NO_DIRECT	BIT(0)	/* no napi_direct on return_frame */
+
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
  * lwt, ...). Subsystems allowing direct data access must (!)
@@ -765,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
				       const struct bpf_insn *patch, u32 len);
 
+static inline bool xdp_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
+}
+
+static inline void xdp_set_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
+}
+
+static inline void xdp_clear_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
+}
+
 static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
				 unsigned int pktlen)
 {
...
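
A hypothetical usage sketch for the helpers added above (not verbatim
from this series): a driver that frees redirected xdp_frames outside
the page_pool owner's NAPI context brackets that section, so that
__xdp_return() takes the safe, non-direct recycling path even for the
_rx_napi return variant:

  static void drain_frames_sketch(struct xdp_frame **frames, int n)
  {
          int i;

          xdp_set_return_frame_no_direct();
          for (i = 0; i < n; i++)
                  /* napi_direct is masked off in __xdp_return() */
                  xdp_return_frame_rx_napi(frames[i]);
          xdp_clear_return_frame_no_direct();
  }
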
@@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 }
 
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
+void skb_headers_offset_update(struct sk_buff *skb, int off);
 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
...
@@ -84,6 +84,13 @@ struct xdp_frame {
 	struct net_device *dev_rx; /* used by cpumap */
 };
 
+/* Clear kernel pointers in xdp_frame */
+static inline void xdp_scrub_frame(struct xdp_frame *frame)
+{
+	frame->data = NULL;
+	frame->dev_rx = NULL;
+}
+
 /* Convert xdp_buff to xdp_frame */
 static inline
 struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
...
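
A hypothetical usage sketch for xdp_scrub_frame() (variable names are
illustrative): before the buffer backing an xdp_frame is reused, e.g.
as an skb head, clear the kernel pointers stored inside it:

  /* the xdp_frame itself lives at the start of the hard_start area */
  void *hard_start = frame->data - frame->headroom - sizeof(*frame);
  unsigned int len = frame->len;

  xdp_scrub_frame(frame); /* frame->data and frame->dev_rx become NULL */
  /* ... build_skb(hard_start, truesize) and consume len bytes ... */
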
@@ -2082,19 +2082,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-struct redirect_info {
-	u32 ifindex;
-	u32 flags;
-	struct bpf_map *map;
-	struct bpf_map *map_to_flush;
-	unsigned long map_owner;
-};
-
-static DEFINE_PER_CPU(struct redirect_info, redirect_info);
+DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
 
 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return TC_ACT_SHOT;
@@ -2107,7 +2100,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
 
 int skb_do_redirect(struct sk_buff *skb)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct net_device *dev;
 
 	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
@@ -3200,7 +3193,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 void xdp_do_flush_map(void)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct bpf_map *map = ri->map_to_flush;
 
 	ri->map_to_flush = NULL;
@@ -3245,7 +3238,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
			       struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
 	u32 index = ri->ifindex;
@@ -3285,7 +3278,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
		    struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct net_device *fwd;
 	u32 index = ri->ifindex;
 	int err;
@@ -3317,7 +3310,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
				       struct xdp_buff *xdp,
				       struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
 	u32 index = ri->ifindex;
@@ -3368,7 +3361,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
			    struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	u32 index = ri->ifindex;
 	struct net_device *fwd;
 	int err = 0;
@@ -3399,7 +3392,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
 
 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
 	if (unlikely(flags))
 		return XDP_ABORTED;
@@ -3423,7 +3416,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
 BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags,
	   unsigned long, map_owner)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
 	if (unlikely(flags))
 		return XDP_ABORTED;
...
@@ -1291,7 +1291,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(skb_clone);
 
-static void skb_headers_offset_update(struct sk_buff *skb, int off)
+void skb_headers_offset_update(struct sk_buff *skb, int off)
 {
 	/* Only adjust this if it actually is csum_start rather than csum */
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
@@ -1305,6 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
 	skb->inner_network_header += off;
 	skb->inner_mac_header += off;
 }
+EXPORT_SYMBOL(skb_headers_offset_update);
 
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
 {
...
@@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (xa)
+		if (xa) {
+			napi_direct &= !xdp_return_frame_no_direct();
 			page_pool_put_page(xa->page_pool, page, napi_direct);
-		else
+		} else {
 			put_page(page);
+		}
 		rcu_read_unlock();
 		break;
 	case MEM_TYPE_PAGE_SHARED:
...