Commit c3f1010b authored by David S. Miller

Merge branch 'vrf-pktinfo'

David Ahern says:

====================
net: vrf: Fixup PKTINFO to return enslaved device index

Applications such as OSPF and BFD need the original ingress device, not
the VRF device; the latter can be derived from the former. To that end,
move the packet intercept from an rx handler invoked by
__netif_receive_skb_core to the IPv4 and IPv6 receive processing.

IPv6 already saves the skb_iif to the control buffer in ipv6_rcv. Since
skb->dev has not been switched at that point, the cb holds the enslaved
device. Make the same happen for IPv4 by adding an iif field to
inet_skb_parm and setting it from skb_iif in the IPv4 input path after
the skb control buffer is cleared, similar to IPv6. From there the
pktinfo code can pull it straight from the cb via the PKTINFO_SKB_CB
cast. (An illustrative userspace sketch of the resulting PKTINFO usage
follows below.)
====================
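For context only (not part of the patches themselves): a minimal userspace sketch of how a routing daemon would read the ingress ifindex that this series exposes through IP_PKTINFO. The VRF name "vrf-blue" and port 50000 are illustrative placeholders, and error handling is largely omitted.

/* Illustrative sketch only -- not part of this series. "vrf-blue" and
 * port 50000 are placeholders; most error handling is omitted.
 */
#include <stdio.h>
#include <net/if.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/uio.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int on = 1;
        struct sockaddr_in sin = {
                .sin_family = AF_INET,
                .sin_addr.s_addr = htonl(INADDR_ANY),
                .sin_port = htons(50000),
        };
        char buf[2048], cbuf[256], ifname[IF_NAMESIZE];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg;

        /* bind the socket to the VRF so its table is used for lookups */
        setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "vrf-blue", sizeof("vrf-blue"));
        /* request a struct in_pktinfo cmsg with every datagram */
        setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
        bind(fd, (struct sockaddr *)&sin, sizeof(sin));

        if (recvmsg(fd, &msg, 0) < 0)
                return 1;

        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
                if (cmsg->cmsg_level == IPPROTO_IP &&
                    cmsg->cmsg_type == IP_PKTINFO) {
                        struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(cmsg);

                        /* with this series the index is the enslaved ingress
                         * device, not the VRF master
                         */
                        printf("ingress ifindex %d (%s)\n", pi->ipi_ifindex,
                               if_indextoname(pi->ipi_ifindex, ifname));
                }
        }
        return 0;
}

With the series applied, ipi_ifindex reports the enslaved ingress port rather than the VRF master; the VRF itself can still be derived from the port's master link.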
Signed-off-by: David S. Miller <davem@davemloft.net>
parents ca4aa976 0b922b7a
@@ -42,9 +42,6 @@
 #define DRV_NAME        "vrf"
 #define DRV_VERSION     "1.0"
 
-#define vrf_master_get_rcu(dev) \
-        ((struct net_device *)rcu_dereference(dev->rx_handler_data))
-
 struct net_vrf {
         struct rtable           *rth;
         struct rt6_info         *rt6;
@@ -60,90 +57,12 @@ struct pcpu_dstats {
         struct u64_stats_sync   syncp;
 };
 
-/* neighbor handling is done with actual device; do not want
- * to flip skb->dev for those ndisc packets. This really fails
- * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
- * a start.
- */
-#if IS_ENABLED(CONFIG_IPV6)
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-        const struct ipv6hdr *ipv6h;
-        struct ipv6hdr _ipv6h;
-        bool rc = true;
-
-        ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
-        if (!ipv6h)
-                goto out;
-
-        if (ipv6h->nexthdr == NEXTHDR_ICMP) {
-                const struct icmp6hdr *icmph;
-                struct icmp6hdr _icmph;
-
-                icmph = skb_header_pointer(skb, sizeof(_ipv6h),
-                                           sizeof(_icmph), &_icmph);
-                if (!icmph)
-                        goto out;
-
-                switch (icmph->icmp6_type) {
-                case NDISC_ROUTER_SOLICITATION:
-                case NDISC_ROUTER_ADVERTISEMENT:
-                case NDISC_NEIGHBOUR_SOLICITATION:
-                case NDISC_NEIGHBOUR_ADVERTISEMENT:
-                case NDISC_REDIRECT:
-                        rc = false;
-                        break;
-                }
-        }
-
-out:
-        return rc;
-}
-#else
-static bool check_ipv6_frame(const struct sk_buff *skb)
-{
-        return false;
-}
-#endif
-
-static bool is_ip_rx_frame(struct sk_buff *skb)
-{
-        switch (skb->protocol) {
-        case htons(ETH_P_IP):
-                return true;
-        case htons(ETH_P_IPV6):
-                return check_ipv6_frame(skb);
-        }
-        return false;
-}
-
 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
 {
         vrf_dev->stats.tx_errors++;
         kfree_skb(skb);
 }
 
-/* note: already called with rcu_read_lock */
-static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
-{
-        struct sk_buff *skb = *pskb;
-
-        if (is_ip_rx_frame(skb)) {
-                struct net_device *dev = vrf_master_get_rcu(skb->dev);
-                struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
-
-                u64_stats_update_begin(&dstats->syncp);
-                dstats->rx_pkts++;
-                dstats->rx_bytes += skb->len;
-                u64_stats_update_end(&dstats->syncp);
-
-                skb->dev = dev;
-
-                return RX_HANDLER_ANOTHER;
-        }
-        return RX_HANDLER_PASS;
-}
-
 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
                                                  struct rtnl_link_stats64 *stats)
 {
@@ -506,28 +425,14 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
 {
         int ret;
 
-        /* register the packet handler for slave ports */
-        ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev);
-        if (ret) {
-                netdev_err(port_dev,
-                           "Device %s failed to register rx_handler\n",
-                           port_dev->name);
-                goto out_fail;
-        }
-
         ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
         if (ret < 0)
-                goto out_unregister;
+                return ret;
 
         port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
         cycle_netdev(port_dev);
 
         return 0;
-
-out_unregister:
-        netdev_rx_handler_unregister(port_dev);
-out_fail:
-        return ret;
 }
 
 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -544,8 +449,6 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
         netdev_upper_dev_unlink(port_dev, dev);
         port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
 
-        netdev_rx_handler_unregister(port_dev);
-
         cycle_netdev(port_dev);
 
         return 0;
@@ -669,6 +572,95 @@ static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
         return rc;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+/* neighbor handling is done with actual device; do not want
+ * to flip skb->dev for those ndisc packets. This really fails
+ * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
+ * a start.
+ */
+static bool ipv6_ndisc_frame(const struct sk_buff *skb)
+{
+        const struct ipv6hdr *iph = ipv6_hdr(skb);
+        bool rc = false;
+
+        if (iph->nexthdr == NEXTHDR_ICMP) {
+                const struct icmp6hdr *icmph;
+                struct icmp6hdr _icmph;
+
+                icmph = skb_header_pointer(skb, sizeof(*iph),
+                                           sizeof(_icmph), &_icmph);
+                if (!icmph)
+                        goto out;
+
+                switch (icmph->icmp6_type) {
+                case NDISC_ROUTER_SOLICITATION:
+                case NDISC_ROUTER_ADVERTISEMENT:
+                case NDISC_NEIGHBOUR_SOLICITATION:
+                case NDISC_NEIGHBOUR_ADVERTISEMENT:
+                case NDISC_REDIRECT:
+                        rc = true;
+                        break;
+                }
+        }
+
+out:
+        return rc;
+}
+
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+                                   struct sk_buff *skb)
+{
+        /* if packet is NDISC keep the ingress interface */
+        if (!ipv6_ndisc_frame(skb)) {
+                skb->dev = vrf_dev;
+                skb->skb_iif = vrf_dev->ifindex;
+
+                skb_push(skb, skb->mac_len);
+                dev_queue_xmit_nit(skb, vrf_dev);
+                skb_pull(skb, skb->mac_len);
+
+                IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
+        }
+
+        return skb;
+}
+
+#else
+static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
+                                   struct sk_buff *skb)
+{
+        return skb;
+}
+#endif
+
+static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
+                                  struct sk_buff *skb)
+{
+        skb->dev = vrf_dev;
+        skb->skb_iif = vrf_dev->ifindex;
+
+        skb_push(skb, skb->mac_len);
+        dev_queue_xmit_nit(skb, vrf_dev);
+        skb_pull(skb, skb->mac_len);
+
+        return skb;
+}
+
+/* called with rcu lock held */
+static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
+                                  struct sk_buff *skb,
+                                  u16 proto)
+{
+        switch (proto) {
+        case AF_INET:
+                return vrf_ip_rcv(vrf_dev, skb);
+        case AF_INET6:
+                return vrf_ip6_rcv(vrf_dev, skb);
+        }
+
+        return skb;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
                                          const struct flowi6 *fl6)
@@ -690,6 +682,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = {
         .l3mdev_fib_table       = vrf_fib_table,
         .l3mdev_get_rtable      = vrf_get_rtable,
         .l3mdev_get_saddr       = vrf_get_saddr,
+        .l3mdev_l3_rcv          = vrf_l3_rcv,
 #if IS_ENABLED(CONFIG_IPV6)
         .l3mdev_get_rt6_dst     = vrf_get_rt6_dst,
 #endif
...
@@ -118,14 +118,29 @@ struct inet6_skb_parm {
 #define IP6SKB_ROUTERALERT      8
 #define IP6SKB_FRAGMENTED       16
 #define IP6SKB_HOPBYHOP         32
+#define IP6SKB_L3SLAVE          64
 };
 
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+        return flags & IP6SKB_L3SLAVE;
+}
+#else
+static inline bool skb_l3mdev_slave(__u16 flags)
+{
+        return false;
+}
+#endif
+
 #define IP6CB(skb)      ((struct inet6_skb_parm*)((skb)->cb))
 #define IP6CBMTU(skb)   ((struct ip6_mtuinfo *)((skb)->cb))
 
 static inline int inet6_iif(const struct sk_buff *skb)
 {
-        return IP6CB(skb)->iif;
+        bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
+
+        return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
 }
 
 struct tcp6_request_sock {
...
@@ -3258,6 +3258,8 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
                         const struct sk_buff *skb);
 
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+
 extern int      netdev_budget;
 
 /* Called by rtnetlink.c:rtnl_unlock() */
...
@@ -36,6 +36,7 @@
 struct sock;
 
 struct inet_skb_parm {
+        int                     iif;
         struct ip_options       opt;            /* Compiled IP options          */
         unsigned char           flags;
...
@@ -25,6 +25,8 @@
 struct l3mdev_ops {
         u32             (*l3mdev_fib_table)(const struct net_device *dev);
+        struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
+                                          struct sk_buff *skb, u16 proto);
 
         /* IPv4 ops */
         struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev,
@@ -134,6 +136,34 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4);
 
 struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6);
 
+static inline
+struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
+{
+        struct net_device *master = NULL;
+
+        if (netif_is_l3_slave(skb->dev))
+                master = netdev_master_upper_dev_get_rcu(skb->dev);
+        else if (netif_is_l3_master(skb->dev))
+                master = skb->dev;
+
+        if (master && master->l3mdev_ops->l3mdev_l3_rcv)
+                skb = master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
+
+        return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+        return l3mdev_l3_rcv(skb, AF_INET);
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+        return l3mdev_l3_rcv(skb, AF_INET6);
+}
+
 #else
 
 static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
@@ -194,6 +224,18 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6)
 {
         return NULL;
 }
+
+static inline
+struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
+{
+        return skb;
+}
+
+static inline
+struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
+{
+        return skb;
+}
 #endif
 
 #endif /* _NET_L3MDEV_H_ */
@@ -786,7 +786,9 @@ struct tcp_skb_cb {
  */
 static inline int tcp_v6_iif(const struct sk_buff *skb)
 {
-        return TCP_SKB_CB(skb)->header.h6.iif;
+        bool l3_slave = skb_l3mdev_slave(TCP_SKB_CB(skb)->header.h6.flags);
+
+        return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
 }
 #endif
...
@@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
  *      taps currently in use.
  */
-static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
         struct packet_type *ptype;
         struct sk_buff *skb2 = NULL;
@@ -1907,6 +1907,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
         rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 
 /**
  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
...
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
         const struct iphdr *iph = ip_hdr(skb);
         struct rtable *rt;
 
+        /* if ingress device is enslaved to an L3 master device pass the
+         * skb to its handler for processing
+         */
+        skb = l3mdev_ip_rcv(skb);
+        if (!skb)
+                return NET_RX_SUCCESS;
+
         if (net->ipv4.sysctl_ip_early_demux &&
             !skb_dst(skb) &&
             !skb->sk &&
@@ -471,6 +478,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 
         /* Remove any debris in the socket control block */
         memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+        IPCB(skb)->iif = skb->skb_iif;
 
         /* Must drop socket now because of tproxy. */
         skb_orphan(skb);
...
@@ -1193,7 +1193,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
                        ipv6_sk_rxinfo(sk);
 
         if (prepare && skb_rtable(skb)) {
-                pktinfo->ipi_ifindex = inet_iif(skb);
+                /* skb->cb is overloaded: prior to this point it is IP{6}CB
+                 * which has interface index (iif) as the first member of the
+                 * underlying inet{6}_skb_parm struct. This code then overlays
+                 * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
+                 * element so the iif is picked up from the prior IPCB
+                 */
                 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
         } else {
                 pktinfo->ipi_ifindex = 0;
...
@@ -49,6 +49,13 @@
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+        /* if ingress device is enslaved to an L3 master device pass the
+         * skb to its handler for processing
+         */
+        skb = l3mdev_ip6_rcv(skb);
+        if (!skb)
+                return NET_RX_SUCCESS;
+
         if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
                 const struct inet6_protocol *ipprot;
...