Commit 735fc405 authored by Jesper Dangaard Brouer, committed by Alexei Starovoitov

xdp: change ndo_xdp_xmit API to support bulking

This patch changes the API for ndo_xdp_xmit to support bulking
xdp_frames.

When the kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge
slowdown. Most of the slowdown is caused by the DMA API's indirect
function calls, but the net_device->ndo_xdp_xmit() call also contributes.

Benchmarking this patch with CONFIG_RETPOLINE, using xdp_redirect_map
with a single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed the
following improvements:
 for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
 for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps

With the frames available as a bulk inside the driver's ndo_xdp_xmit
call, further optimizations become possible, such as bulk DMA-mapping
for TX.

Testing without CONFIG_RETPOLINE shows the same performance for
physical NIC drivers.

The virtual NIC driver tun sees a huge performance boost, as it can
avoid per-frame producer locking and instead amortize the locking cost
over the bulk.
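
To illustrate the driver-side pattern this patch introduces, here is a
condensed, hypothetical sketch (not part of the patch; lock_tx_queue()
and queue_one_frame() are placeholders for driver-specific details such
as tun's ptr_ring producer lock and pointer encoding):

static int example_xdp_xmit(struct net_device *dev, int n,
                            struct xdp_frame **frames)
{
        int i, drops = 0;

        /* Take the TX producer lock once for the whole bulk,
         * instead of once per frame.
         */
        lock_tx_queue(dev);                             /* placeholder */
        for (i = 0; i < n; i++) {
                if (queue_one_frame(dev, frames[i])) {  /* placeholder */
                        /* The driver frees frames it could not queue */
                        xdp_return_frame_rx_napi(frames[i]);
                        drops++;
                }
        }
        unlock_tx_queue(dev);                           /* placeholder */

        return n - drops;       /* frames accepted by the driver */
}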

V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>
V4: Isolated ndo, driver changes and callers.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parent 389ab7f0
@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * @dev: netdev
  * @xdp: XDP buffer
  *
- * Returns Zero if sent, else an error code
+ * Returns number of frames successfully sent. Frames that fail are
+ * free'ed via XDP return API.
+ *
+ * For error cases, a negative errno code is returned and no-frames
+ * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
         struct i40e_netdev_priv *np = netdev_priv(dev);
         unsigned int queue_index = smp_processor_id();
         struct i40e_vsi *vsi = np->vsi;
-        int err;
+        int drops = 0;
+        int i;
 
         if (test_bit(__I40E_VSI_DOWN, vsi->state))
                 return -ENETDOWN;
@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
         if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
                 return -ENXIO;
 
-        err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
-        if (err != I40E_XDP_TX)
-                return -ENOSPC;
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdpf = frames[i];
+                int err;
+
+                err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+                if (err != I40E_XDP_TX) {
+                        xdp_return_frame_rx_napi(xdpf);
+                        drops++;
+                }
+        }
 
-        return 0;
+        return n - drops;
 }
 
 /**
...
@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
...
@@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
         }
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int ixgbe_xdp_xmit(struct net_device *dev, int n,
+                          struct xdp_frame **frames)
 {
         struct ixgbe_adapter *adapter = netdev_priv(dev);
         struct ixgbe_ring *ring;
-        int err;
+        int drops = 0;
+        int i;
 
         if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
                 return -ENETDOWN;
@@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
         if (unlikely(!ring))
                 return -ENXIO;
 
-        err = ixgbe_xmit_xdp_ring(adapter, xdpf);
-        if (err != IXGBE_XDP_TX)
-                return -ENOSPC;
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdpf = frames[i];
+                int err;
+
+                err = ixgbe_xmit_xdp_ring(adapter, xdpf);
+                if (err != IXGBE_XDP_TX) {
+                        xdp_return_frame_rx_napi(xdpf);
+                        drops++;
+                }
+        }
 
-        return 0;
+        return n - drops;
 }
 
 static void ixgbe_xdp_flush(struct net_device *dev)
...
@@ -70,6 +70,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <net/xdp.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
         .ndo_get_stats64        = tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
         struct tun_struct *tun = netdev_priv(dev);
         struct tun_file *tfile;
         u32 numqueues;
-        int ret = 0;
+        int drops = 0;
+        int cnt = n;
+        int i;
 
         rcu_read_lock();
 
         numqueues = READ_ONCE(tun->numqueues);
         if (!numqueues) {
-                ret = -ENOSPC;
-                goto out;
+                rcu_read_unlock();
+                return -ENXIO; /* Caller will free/return all frames */
         }
 
         tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
                                             numqueues]);
-        /* Encode the XDP flag into lowest bit for consumer to differ
-         * XDP buffer from sk_buff.
-         */
-        if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
-                this_cpu_inc(tun->pcpu_stats->tx_dropped);
-                ret = -ENOSPC;
+
+        spin_lock(&tfile->tx_ring.producer_lock);
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdp = frames[i];
+                /* Encode the XDP flag into lowest bit for consumer to differ
+                 * XDP buffer from sk_buff.
+                 */
+                void *frame = tun_xdp_to_ptr(xdp);
+
+                if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
+                        this_cpu_inc(tun->pcpu_stats->tx_dropped);
+                        xdp_return_frame_rx_napi(xdp);
+                        drops++;
+                }
         }
+        spin_unlock(&tfile->tx_ring.producer_lock);
 
-out:
         rcu_read_unlock();
-        return ret;
+        return cnt - drops;
 }
 
 static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
         if (unlikely(!frame))
                 return -EOVERFLOW;
 
-        return tun_xdp_xmit(dev, frame);
+        return tun_xdp_xmit(dev, 1, &frame);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
...
@@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
         virtqueue_kick(sq->vq);
 }
 
-static int __virtnet_xdp_xmit(struct virtnet_info *vi,
-                              struct xdp_frame *xdpf)
+static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
+                                  struct send_queue *sq,
+                                  struct xdp_frame *xdpf)
 {
         struct virtio_net_hdr_mrg_rxbuf *hdr;
-        struct xdp_frame *xdpf_sent;
-        struct send_queue *sq;
-        unsigned int len;
-        unsigned int qp;
         int err;
 
-        qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-        sq = &vi->sq[qp];
-
-        /* Free up any pending old buffers before queueing new ones. */
-        while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
-                xdp_return_frame(xdpf_sent);
-
         /* virtqueue want to use data area in-front of packet */
         if (unlikely(xdpf->metasize > 0))
                 return -EOPNOTSUPP;
@@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
         return 0;
 }
 
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
+                                 struct xdp_frame *xdpf)
+{
+        struct xdp_frame *xdpf_sent;
+        struct send_queue *sq;
+        unsigned int len;
+        unsigned int qp;
+
+        qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+        sq = &vi->sq[qp];
+
+        /* Free up any pending old buffers before queueing new ones. */
+        while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+                xdp_return_frame(xdpf_sent);
+
+        return __virtnet_xdp_xmit_one(vi, sq, xdpf);
+}
+
+static int virtnet_xdp_xmit(struct net_device *dev,
+                            int n, struct xdp_frame **frames)
 {
         struct virtnet_info *vi = netdev_priv(dev);
         struct receive_queue *rq = vi->rq;
+        struct xdp_frame *xdpf_sent;
         struct bpf_prog *xdp_prog;
+        struct send_queue *sq;
+        unsigned int len;
+        unsigned int qp;
+        int drops = 0;
+        int err;
+        int i;
+
+        qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+        sq = &vi->sq[qp];
 
         /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
          * indicate XDP resources have been successfully allocated.
@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
         if (!xdp_prog)
                 return -ENXIO;
 
-        return __virtnet_xdp_xmit(vi, xdpf);
+        /* Free up any pending old buffers before queueing new ones. */
+        while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+                xdp_return_frame(xdpf_sent);
+
+        for (i = 0; i < n; i++) {
+                struct xdp_frame *xdpf = frames[i];
+
+                err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
+                if (err) {
+                        xdp_return_frame_rx_napi(xdpf);
+                        drops++;
+                }
+        }
+        return n - drops;
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
                         xdpf = convert_to_xdp_frame(&xdp);
                         if (unlikely(!xdpf))
                                 goto err_xdp;
-                        err = __virtnet_xdp_xmit(vi, xdpf);
+                        err = __virtnet_xdp_tx_xmit(vi, xdpf);
                         if (unlikely(err)) {
                                 trace_xdp_exception(vi->dev, xdp_prog, act);
                                 goto err_xdp;
@@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                         xdpf = convert_to_xdp_frame(&xdp);
                         if (unlikely(!xdpf))
                                 goto err_xdp;
-                        err = __virtnet_xdp_xmit(vi, xdpf);
+                        err = __virtnet_xdp_tx_xmit(vi, xdpf);
                         if (unlikely(err)) {
                                 trace_xdp_exception(vi->dev, xdp_prog, act);
                                 if (unlikely(xdp_page != page))
...
@@ -1185,9 +1185,13 @@ struct dev_ifalias {
  *      This function is used to set or query state related to XDP on the
  *      netdevice and manage BPF offload. See definition of
  *      enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
- *      This function is used to submit a XDP packet for transmit on a
- *      netdevice.
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ *      This function is used to submit @n XDP packets for transmit on a
+ *      netdevice. Returns number of frames successfully transmitted, frames
+ *      that got dropped are freed/returned via xdp_return_frame().
+ *      Returns negative number, means general error invoking ndo, meaning
+ *      no frames were xmit'ed and core-caller will free all frames.
+ *      TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *      This function is used to inform the driver to flush a particular
  *      xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1375,8 +1379,8 @@ struct net_device_ops {
                                                    int needed_headroom);
         int                     (*ndo_bpf)(struct net_device *dev,
                                            struct netdev_bpf *bpf);
-        int                     (*ndo_xdp_xmit)(struct net_device *dev,
-                                                struct xdp_frame *xdp);
+        int                     (*ndo_xdp_xmit)(struct net_device *dev, int n,
+                                                struct xdp_frame **xdp);
         void                    (*ndo_xdp_flush)(struct net_device *dev);
 };
 
...
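
For reference, a condensed sketch of how a caller is expected to use the
contract documented above (a hypothetical helper for illustration only;
the devmap change below is the real in-tree user):

/* Hypothetical caller of the bulk ndo_xdp_xmit API. */
static int example_flush_bulk(struct net_device *dev,
                              struct xdp_frame **frames, int count)
{
        int i, drops, sent;

        sent = dev->netdev_ops->ndo_xdp_xmit(dev, count, frames);
        if (sent < 0) {
                /* General error: the driver consumed no frames, so the
                 * caller must free every frame itself.
                 */
                for (i = 0; i < count; i++)
                        xdp_return_frame_rx_napi(frames[i]);
                sent = 0;
        }
        /* Frames the driver failed to send were already freed by it */
        drops = count - sent;

        return drops;
}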
@@ -232,24 +232,31 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
                 prefetch(xdpf);
         }
 
-        for (i = 0; i < bq->count; i++) {
-                struct xdp_frame *xdpf = bq->q[i];
-                int err;
-
-                err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-                if (err) {
-                        drops++;
-                        xdp_return_frame_rx_napi(xdpf);
-                } else {
-                        sent++;
-                }
+        sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+        if (sent < 0) {
+                sent = 0;
+                goto error;
         }
+        drops = bq->count - sent;
+out:
         bq->count = 0;
 
         trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
                               sent, drops, bq->dev_rx, dev);
         bq->dev_rx = NULL;
         return 0;
+error:
+        /* If ndo_xdp_xmit fails with an errno, no frames have been
+         * xmit'ed and it's our responsibility to them free all.
+         */
+        for (i = 0; i < bq->count; i++) {
+                struct xdp_frame *xdpf = bq->q[i];
+
+                /* RX path under NAPI protection, can return frames faster */
+                xdp_return_frame_rx_napi(xdpf);
+                drops++;
+        }
+        goto out;
 }
 
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
...
@@ -3039,7 +3039,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
                         u32 index)
 {
         struct xdp_frame *xdpf;
-        int err;
+        int sent;
 
         if (!dev->netdev_ops->ndo_xdp_xmit) {
                 return -EOPNOTSUPP;
@@ -3049,9 +3049,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
         if (unlikely(!xdpf))
                 return -EOVERFLOW;
 
-        err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-        if (err)
-                return err;
+        sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+        if (sent <= 0)
+                return sent;
         dev->netdev_ops->ndo_xdp_flush(dev);
         return 0;
 }
...