Commit fcb905d8 authored by Martin KaFai Lau's avatar Martin KaFai Lau

Merge branch 'bpf_redirect_peer fixes'

Daniel Borkmann says:

====================
This fixes bpf_redirect_peer stats accounting for veth and netkit,
and adds tstats in the first place for the latter. Utilise indirect
call wrapper for bpf_redirect_peer, and improve test coverage of the
latter also for netkit devices. Details in the patches, thanks!

The series was targeted at bpf originally, and is done here as well,
so it can trigger BPF CI. Jakub, if you think directly going via net
is better since the majority of the diff touches net anyway, that is
fine, too.

Thanks!

v2 -> v3:
  - Add kdoc for pcpu_stat_type (Simon)
  - Reject invalid type value in netdev_do_alloc_pcpu_stats (Simon)
  - Add Reviewed-by tags from list
v1 -> v2:
  - Move stats allocation/freeing into net core (Jakub)
  - As prepwork for the above, move vrf's dstats over into the core
  - Add a check into stats alloc to enforce tstats upon
    implementing ndo_get_peer_dev
  - Add Acked-by tags from list

Daniel Borkmann (6):
  net, vrf: Move dstats structure to core
  net: Move {l,t,d}stats allocation to core and convert veth & vrf
  netkit: Add tstats per-CPU traffic counters
  bpf, netkit: Add indirect call wrapper for fetching peer dev
  selftests/bpf: De-veth-ize the tc_redirect test case
  selftests/bpf: Add netkit to tc_redirect selftest
====================
Signed-off-by: default avatarMartin KaFai Lau <martin.lau@kernel.org>
parents 76df934c adfeae2d
......@@ -7,6 +7,7 @@
#include <linux/filter.h>
#include <linux/netfilter_netdev.h>
#include <linux/bpf_mprog.h>
#include <linux/indirect_call_wrapper.h>
#include <net/netkit.h>
#include <net/dst.h>
......@@ -68,6 +69,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
const struct bpf_mprog_entry *entry;
struct net_device *peer;
int len = skb->len;
rcu_read_lock();
peer = rcu_dereference(nk->peer);
......@@ -85,15 +87,22 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
case NETKIT_PASS:
skb->protocol = eth_type_trans(skb, skb->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
__netif_rx(skb);
if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) {
dev_sw_netstats_tx_add(dev, 1, len);
dev_sw_netstats_rx_add(peer, len);
} else {
goto drop_stats;
}
break;
case NETKIT_REDIRECT:
dev_sw_netstats_tx_add(dev, 1, len);
skb_do_redirect(skb);
break;
case NETKIT_DROP:
default:
drop:
kfree_skb(skb);
drop_stats:
dev_core_stats_tx_dropped_inc(dev);
ret_dev = NET_XMIT_DROP;
break;
......@@ -169,11 +178,18 @@ static void netkit_set_headroom(struct net_device *dev, int headroom)
rcu_read_unlock();
}
static struct net_device *netkit_peer_dev(struct net_device *dev)
INDIRECT_CALLABLE_SCOPE struct net_device *netkit_peer_dev(struct net_device *dev)
{
return rcu_dereference(netkit_priv(dev)->peer);
}
static void netkit_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *stats)
{
dev_fetch_sw_netstats(stats, dev->tstats);
stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
}
static void netkit_uninit(struct net_device *dev);
static const struct net_device_ops netkit_netdev_ops = {
......@@ -184,6 +200,7 @@ static const struct net_device_ops netkit_netdev_ops = {
.ndo_set_rx_headroom = netkit_set_headroom,
.ndo_get_iflink = netkit_get_iflink,
.ndo_get_peer_dev = netkit_peer_dev,
.ndo_get_stats64 = netkit_get_stats,
.ndo_uninit = netkit_uninit,
.ndo_features_check = passthru_features_check,
};
......@@ -218,6 +235,7 @@ static void netkit_setup(struct net_device *dev)
ether_setup(dev);
dev->max_mtu = ETH_MAX_MTU;
dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->flags |= IFF_NOARP;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
......
......@@ -373,7 +373,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
skb_tx_timestamp(skb);
if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
if (!use_napi)
dev_lstats_add(dev, length);
dev_sw_netstats_tx_add(dev, 1, length);
else
__veth_xdp_flush(rq);
} else {
......@@ -387,14 +387,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
return ret;
}
static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
{
struct veth_priv *priv = netdev_priv(dev);
dev_lstats_read(dev, packets, bytes);
return atomic64_read(&priv->dropped);
}
static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
......@@ -432,24 +424,24 @@ static void veth_get_stats64(struct net_device *dev,
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
struct veth_stats rx;
u64 packets, bytes;
tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
tot->tx_bytes = bytes;
tot->tx_packets = packets;
tot->tx_dropped = atomic64_read(&priv->dropped);
dev_fetch_sw_netstats(tot, dev->tstats);
veth_stats_rx(&rx, dev);
tot->tx_dropped += rx.xdp_tx_err;
tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
tot->rx_bytes = rx.xdp_bytes;
tot->rx_packets = rx.xdp_packets;
tot->rx_bytes += rx.xdp_bytes;
tot->rx_packets += rx.xdp_packets;
rcu_read_lock();
peer = rcu_dereference(priv->peer);
if (peer) {
veth_stats_tx(peer, &packets, &bytes);
tot->rx_bytes += bytes;
tot->rx_packets += packets;
struct rtnl_link_stats64 tot_peer = {};
dev_fetch_sw_netstats(&tot_peer, peer->tstats);
tot->rx_bytes += tot_peer.tx_bytes;
tot->rx_packets += tot_peer.tx_packets;
veth_stats_rx(&rx, peer);
tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
......@@ -1506,25 +1498,12 @@ static void veth_free_queues(struct net_device *dev)
static int veth_dev_init(struct net_device *dev)
{
int err;
dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
if (!dev->lstats)
return -ENOMEM;
err = veth_alloc_queues(dev);
if (err) {
free_percpu(dev->lstats);
return err;
}
return 0;
return veth_alloc_queues(dev);
}
static void veth_dev_free(struct net_device *dev)
{
veth_free_queues(dev);
free_percpu(dev->lstats);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
......@@ -1796,6 +1775,7 @@ static void veth_setup(struct net_device *dev)
NETIF_F_HW_VLAN_STAG_RX);
dev->needs_free_netdev = true;
dev->priv_destructor = veth_dev_free;
dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->max_mtu = ETH_MAX_MTU;
dev->hw_features = VETH_FEATURES;
......
......@@ -121,22 +121,12 @@ struct net_vrf {
int ifindex;
};
struct pcpu_dstats {
u64 tx_pkts;
u64 tx_bytes;
u64 tx_drps;
u64 rx_pkts;
u64 rx_bytes;
u64 rx_drps;
struct u64_stats_sync syncp;
};
static void vrf_rx_stats(struct net_device *dev, int len)
{
struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
u64_stats_update_begin(&dstats->syncp);
dstats->rx_pkts++;
dstats->rx_packets++;
dstats->rx_bytes += len;
u64_stats_update_end(&dstats->syncp);
}
......@@ -161,10 +151,10 @@ static void vrf_get_stats64(struct net_device *dev,
do {
start = u64_stats_fetch_begin(&dstats->syncp);
tbytes = dstats->tx_bytes;
tpkts = dstats->tx_pkts;
tdrops = dstats->tx_drps;
tpkts = dstats->tx_packets;
tdrops = dstats->tx_drops;
rbytes = dstats->rx_bytes;
rpkts = dstats->rx_pkts;
rpkts = dstats->rx_packets;
} while (u64_stats_fetch_retry(&dstats->syncp, start));
stats->tx_bytes += tbytes;
stats->tx_packets += tpkts;
......@@ -421,7 +411,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
vrf_rx_stats(dev, len);
else
this_cpu_inc(dev->dstats->rx_drps);
this_cpu_inc(dev->dstats->rx_drops);
return NETDEV_TX_OK;
}
......@@ -616,11 +606,11 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
u64_stats_update_begin(&dstats->syncp);
dstats->tx_pkts++;
dstats->tx_packets++;
dstats->tx_bytes += len;
u64_stats_update_end(&dstats->syncp);
} else {
this_cpu_inc(dev->dstats->tx_drps);
this_cpu_inc(dev->dstats->tx_drops);
}
return ret;
......@@ -1174,22 +1164,15 @@ static void vrf_dev_uninit(struct net_device *dev)
vrf_rtable_release(dev, vrf);
vrf_rt6_release(dev, vrf);
free_percpu(dev->dstats);
dev->dstats = NULL;
}
static int vrf_dev_init(struct net_device *dev)
{
struct net_vrf *vrf = netdev_priv(dev);
dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
if (!dev->dstats)
goto out_nomem;
/* create the default dst which points back to us */
if (vrf_rtable_create(dev) != 0)
goto out_stats;
goto out_nomem;
if (vrf_rt6_create(dev) != 0)
goto out_rth;
......@@ -1203,9 +1186,6 @@ static int vrf_dev_init(struct net_device *dev)
out_rth:
vrf_rtable_release(dev, vrf);
out_stats:
free_percpu(dev->dstats);
dev->dstats = NULL;
out_nomem:
return -ENOMEM;
}
......@@ -1704,6 +1684,8 @@ static void vrf_setup(struct net_device *dev)
dev->min_mtu = IPV6_MIN_MTU;
dev->max_mtu = IP6_MAX_MTU;
dev->mtu = dev->max_mtu;
dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
}
static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
......
......@@ -1797,6 +1797,13 @@ enum netdev_ml_priv_type {
ML_PRIV_CAN,
};
enum netdev_stat_type {
NETDEV_PCPU_STAT_NONE,
NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */
NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */
NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */
};
/**
* struct net_device - The DEVICE structure.
*
......@@ -1991,10 +1998,14 @@ enum netdev_ml_priv_type {
*
* @ml_priv: Mid-layer private
* @ml_priv_type: Mid-layer private type
* @lstats: Loopback statistics
* @tstats: Tunnel statistics
* @dstats: Dummy statistics
* @vstats: Virtual ethernet statistics
*
* @pcpu_stat_type: Type of device statistics which the core should
* allocate/free: none, lstats, tstats, dstats. none
* means the driver is handling statistics allocation/
* freeing internally.
* @lstats: Loopback statistics: packets, bytes
* @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes
* @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes
*
* @garp_port: GARP
* @mrp_port: MRP
......@@ -2354,6 +2365,7 @@ struct net_device {
void *ml_priv;
enum netdev_ml_priv_type ml_priv_type;
enum netdev_stat_type pcpu_stat_type:8;
union {
struct pcpu_lstats __percpu *lstats;
struct pcpu_sw_netstats __percpu *tstats;
......@@ -2755,6 +2767,16 @@ struct pcpu_sw_netstats {
struct u64_stats_sync syncp;
} __aligned(4 * sizeof(u64));
struct pcpu_dstats {
u64 rx_packets;
u64 rx_bytes;
u64 rx_drops;
u64 tx_packets;
u64 tx_bytes;
u64 tx_drops;
struct u64_stats_sync syncp;
} __aligned(8 * sizeof(u64));
struct pcpu_lstats {
u64_stats_t packets;
u64_stats_t bytes;
......
......@@ -10,6 +10,7 @@ int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev));
#else
static inline int netkit_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog)
......@@ -34,5 +35,10 @@ static inline int netkit_prog_query(const union bpf_attr *attr,
{
return -EINVAL;
}
static inline struct net_device *netkit_peer_dev(struct net_device *dev)
{
return NULL;
}
#endif /* CONFIG_NETKIT */
#endif /* __NET_NETKIT_H */
......@@ -10051,6 +10051,54 @@ void netif_tx_stop_all_queues(struct net_device *dev)
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
{
void __percpu *v;
/* Drivers implementing ndo_get_peer_dev must support tstat
* accounting, so that skb_do_redirect() can bump the dev's
* RX stats upon network namespace switch.
*/
if (dev->netdev_ops->ndo_get_peer_dev &&
dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
return -EOPNOTSUPP;
switch (dev->pcpu_stat_type) {
case NETDEV_PCPU_STAT_NONE:
return 0;
case NETDEV_PCPU_STAT_LSTATS:
v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
break;
case NETDEV_PCPU_STAT_TSTATS:
v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
break;
case NETDEV_PCPU_STAT_DSTATS:
v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
break;
default:
return -EINVAL;
}
return v ? 0 : -ENOMEM;
}
static void netdev_do_free_pcpu_stats(struct net_device *dev)
{
switch (dev->pcpu_stat_type) {
case NETDEV_PCPU_STAT_NONE:
return;
case NETDEV_PCPU_STAT_LSTATS:
free_percpu(dev->lstats);
break;
case NETDEV_PCPU_STAT_TSTATS:
free_percpu(dev->tstats);
break;
case NETDEV_PCPU_STAT_DSTATS:
free_percpu(dev->dstats);
break;
}
}
/**
* register_netdevice() - register a network device
* @dev: device to register
......@@ -10111,9 +10159,13 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
}
ret = netdev_do_alloc_pcpu_stats(dev);
if (ret)
goto err_uninit;
ret = dev_index_reserve(net, dev->ifindex);
if (ret < 0)
goto err_uninit;
goto err_free_pcpu;
dev->ifindex = ret;
/* Transfer changeable features to wanted_features and enable
......@@ -10219,6 +10271,8 @@ int register_netdevice(struct net_device *dev)
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release:
dev_index_release(net, dev->ifindex);
err_free_pcpu:
netdev_do_free_pcpu_stats(dev);
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
......@@ -10471,6 +10525,7 @@ void netdev_run_todo(void)
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
netdev_do_free_pcpu_stats(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
......
......@@ -81,6 +81,7 @@
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
#include "dev.h"
......@@ -2468,6 +2469,16 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (likely(ops->ndo_get_peer_dev))
return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
netkit_peer_dev, dev);
return NULL;
}
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
......@@ -2481,17 +2492,15 @@ int skb_do_redirect(struct sk_buff *skb)
if (unlikely(!dev))
goto out_drop;
if (flags & BPF_F_PEER) {
const struct net_device_ops *ops = dev->netdev_ops;
if (unlikely(!ops->ndo_get_peer_dev ||
!skb_at_tc_ingress(skb)))
if (unlikely(!skb_at_tc_ingress(skb)))
goto out_drop;
dev = ops->ndo_get_peer_dev(dev);
dev = skb_get_peer_dev(dev);
if (unlikely(!dev ||
!(dev->flags & IFF_UP) ||
net_eq(net, dev_net(dev))))
goto out_drop;
skb->dev = dev;
dev_sw_netstats_rx_add(dev, skb->len);
return -EAGAIN;
}
return flags & BPF_F_NEIGH ?
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment