Commit c73a91b8 authored by David S. Miller

Merge branch 'ovs-gre'

Pravin B Shelar says:

====================
GRE: Use flow based tunneling for OVS GRE vport.

Following patches make use of the new GRE tunnel meta data
collection feature. This allows us to directly use netdev
based GRE tunnel implementation. While doing so I have
removed GRE demux API which were targeted for OVS. Most
of GRE protocol code is now consolidated in ip_gre module.

v4-v5:
Fixed Kconfig dependency for vport-gre module.

v3-v4:
Added interface to ip-gre device to enable meta data collection.
While doing this I split second patch into two patches.

v2-v3:
Add API to create GRE flow based device.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents fb811395 9f57c67c
...@@ -4,6 +4,12 @@ ...@@ -4,6 +4,12 @@
#include <linux/skbuff.h> #include <linux/skbuff.h>
#include <net/ip_tunnels.h> #include <net/ip_tunnels.h>
/* Fixed 4-byte GRE base header (RFC 2784): flag bits followed by the
 * EtherType of the encapsulated payload.  Optional checksum/key/sequence
 * words, when present, follow this structure on the wire.
 */
struct gre_base_hdr {
	__be16 flags;
	__be16 protocol;
};

/* Each optional GRE header field occupies one 4-byte word. */
#define GRE_HEADER_SECTION 4
#define GREPROTO_CISCO 0 #define GREPROTO_CISCO 0
#define GREPROTO_PPTP 1 #define GREPROTO_PPTP 1
#define GREPROTO_MAX 2 #define GREPROTO_MAX 2
...@@ -14,91 +20,9 @@ struct gre_protocol { ...@@ -14,91 +20,9 @@ struct gre_protocol {
void (*err_handler)(struct sk_buff *skb, u32 info); void (*err_handler)(struct sk_buff *skb, u32 info);
}; };
struct gre_base_hdr {
__be16 flags;
__be16 protocol;
};
#define GRE_HEADER_SECTION 4
int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_add_protocol(const struct gre_protocol *proto, u8 version);
int gre_del_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version);
struct gre_cisco_protocol { struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi); u8 name_assign_type);
int (*err_handler)(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi);
u8 priority;
};
int gre_cisco_register(struct gre_cisco_protocol *proto);
int gre_cisco_unregister(struct gre_cisco_protocol *proto);
void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
int hdr_len);
static inline struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
bool csum)
{
return iptunnel_handle_offloads(skb, csum,
csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
static inline int ip_gre_calc_hlen(__be16 o_flags)
{
int addend = 4;
if (o_flags&TUNNEL_CSUM)
addend += 4;
if (o_flags&TUNNEL_KEY)
addend += 4;
if (o_flags&TUNNEL_SEQ)
addend += 4;
return addend;
}
static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
{
__be16 tflags = 0;
if (flags & GRE_CSUM)
tflags |= TUNNEL_CSUM;
if (flags & GRE_ROUTING)
tflags |= TUNNEL_ROUTING;
if (flags & GRE_KEY)
tflags |= TUNNEL_KEY;
if (flags & GRE_SEQ)
tflags |= TUNNEL_SEQ;
if (flags & GRE_STRICT)
tflags |= TUNNEL_STRICT;
if (flags & GRE_REC)
tflags |= TUNNEL_REC;
if (flags & GRE_VERSION)
tflags |= TUNNEL_VERSION;
return tflags;
}
static inline __be16 tnl_flags_to_gre_flags(__be16 tflags)
{
__be16 flags = 0;
if (tflags & TUNNEL_CSUM)
flags |= GRE_CSUM;
if (tflags & TUNNEL_ROUTING)
flags |= GRE_ROUTING;
if (tflags & TUNNEL_KEY)
flags |= GRE_KEY;
if (tflags & TUNNEL_SEQ)
flags |= GRE_SEQ;
if (tflags & TUNNEL_STRICT)
flags |= GRE_STRICT;
if (tflags & TUNNEL_REC)
flags |= GRE_REC;
if (tflags & TUNNEL_VERSION)
flags |= GRE_VERSION;
return flags;
}
#endif #endif
...@@ -82,6 +82,8 @@ struct ip_tunnel_dst { ...@@ -82,6 +82,8 @@ struct ip_tunnel_dst {
__be32 saddr; __be32 saddr;
}; };
struct metadata_dst;
struct ip_tunnel { struct ip_tunnel {
struct ip_tunnel __rcu *next; struct ip_tunnel __rcu *next;
struct hlist_node hash_node; struct hlist_node hash_node;
...@@ -115,6 +117,7 @@ struct ip_tunnel { ...@@ -115,6 +117,7 @@ struct ip_tunnel {
unsigned int prl_count; /* # of entries in PRL */ unsigned int prl_count; /* # of entries in PRL */
int ip_tnl_net_id; int ip_tnl_net_id;
struct gro_cells gro_cells; struct gro_cells gro_cells;
bool collect_md;
}; };
#define TUNNEL_CSUM __cpu_to_be16(0x01) #define TUNNEL_CSUM __cpu_to_be16(0x01)
...@@ -149,6 +152,7 @@ struct tnl_ptk_info { ...@@ -149,6 +152,7 @@ struct tnl_ptk_info {
struct ip_tunnel_net { struct ip_tunnel_net {
struct net_device *fb_tunnel_dev; struct net_device *fb_tunnel_dev;
struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct hlist_head tunnels[IP_TNL_HASH_SIZE];
struct ip_tunnel __rcu *collect_md_tun;
}; };
struct ip_tunnel_encap_ops { struct ip_tunnel_encap_ops {
...@@ -235,7 +239,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, ...@@ -235,7 +239,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
__be32 key); __be32 key);
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, bool log_ecn_error); const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_parm *p); struct ip_tunnel_parm *p);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
......
...@@ -112,6 +112,7 @@ enum { ...@@ -112,6 +112,7 @@ enum {
IFLA_GRE_ENCAP_FLAGS, IFLA_GRE_ENCAP_FLAGS,
IFLA_GRE_ENCAP_SPORT, IFLA_GRE_ENCAP_SPORT,
IFLA_GRE_ENCAP_DPORT, IFLA_GRE_ENCAP_DPORT,
IFLA_GRE_COLLECT_METADATA,
__IFLA_GRE_MAX, __IFLA_GRE_MAX,
}; };
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#include <net/xfrm.h> #include <net/xfrm.h>
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
int gre_add_protocol(const struct gre_protocol *proto, u8 version) int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{ {
...@@ -61,197 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) ...@@ -61,197 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
} }
EXPORT_SYMBOL_GPL(gre_del_protocol); EXPORT_SYMBOL_GPL(gre_del_protocol);
void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
int hdr_len)
{
struct gre_base_hdr *greh;
skb_push(skb, hdr_len);
skb_reset_transport_header(skb);
greh = (struct gre_base_hdr *)skb->data;
greh->flags = tnl_flags_to_gre_flags(tpi->flags);
greh->protocol = tpi->proto;
if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
if (tpi->flags&TUNNEL_SEQ) {
*ptr = tpi->seq;
ptr--;
}
if (tpi->flags&TUNNEL_KEY) {
*ptr = tpi->key;
ptr--;
}
if (tpi->flags&TUNNEL_CSUM &&
!(skb_shinfo(skb)->gso_type &
(SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
*ptr = 0;
*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
skb->len, 0));
}
}
}
EXPORT_SYMBOL_GPL(gre_build_header);
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
const struct gre_base_hdr *greh;
__be32 *options;
int hdr_len;
if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
return -EINVAL;
greh = (struct gre_base_hdr *)skb_transport_header(skb);
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
tpi->flags = gre_flags_to_tnl_flags(greh->flags);
hdr_len = ip_gre_calc_hlen(tpi->flags);
if (!pskb_may_pull(skb, hdr_len))
return -EINVAL;
greh = (struct gre_base_hdr *)skb_transport_header(skb);
tpi->proto = greh->protocol;
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
if (skb_checksum_simple_validate(skb)) {
*csum_err = true;
return -EINVAL;
}
skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
null_compute_pseudo);
options++;
}
if (greh->flags & GRE_KEY) {
tpi->key = *options;
options++;
} else
tpi->key = 0;
if (unlikely(greh->flags & GRE_SEQ)) {
tpi->seq = *options;
options++;
} else
tpi->seq = 0;
/* WCCP version 1 and 2 protocol decoding.
* - Change protocol to IP
* - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
*/
if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
tpi->proto = htons(ETH_P_IP);
if ((*(u8 *)options & 0xF0) != 0x40) {
hdr_len += 4;
if (!pskb_may_pull(skb, hdr_len))
return -EINVAL;
}
}
return iptunnel_pull_header(skb, hdr_len, tpi->proto);
}
static int gre_cisco_rcv(struct sk_buff *skb)
{
struct tnl_ptk_info tpi;
int i;
bool csum_err = false;
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
/* Looped back packet, drop it! */
if (rt_is_output_route(skb_rtable(skb)))
goto drop;
}
#endif
if (parse_gre_header(skb, &tpi, &csum_err) < 0)
goto drop;
rcu_read_lock();
for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
struct gre_cisco_protocol *proto;
int ret;
proto = rcu_dereference(gre_cisco_proto_list[i]);
if (!proto)
continue;
ret = proto->handler(skb, &tpi);
if (ret == PACKET_RCVD) {
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
kfree_skb(skb);
return 0;
}
static void gre_cisco_err(struct sk_buff *skb, u32 info)
{
/* All the routers (except for Linux) return only
* 8 bytes of packet payload. It means, that precise relaying of
* ICMP in the real Internet is absolutely infeasible.
*
* Moreover, Cisco "wise men" put GRE key to the third word
* in GRE header. It makes impossible maintaining even soft
* state for keyed
* GRE tunnels with enabled checksum. Tell them "thank you".
*
* Well, I wonder, rfc1812 was written by Cisco employee,
* what the hell these idiots break standards established
* by themselves???
*/
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
bool csum_err = false;
int i;
if (parse_gre_header(skb, &tpi, &csum_err)) {
if (!csum_err) /* ignore csum errors. */
return;
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
skb->dev->ifindex, 0, IPPROTO_GRE, 0);
return;
}
if (type == ICMP_REDIRECT) {
ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
IPPROTO_GRE, 0);
return;
}
rcu_read_lock();
for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
struct gre_cisco_protocol *proto;
proto = rcu_dereference(gre_cisco_proto_list[i]);
if (!proto)
continue;
if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
goto out;
}
out:
rcu_read_unlock();
}
static int gre_rcv(struct sk_buff *skb) static int gre_rcv(struct sk_buff *skb)
{ {
const struct gre_protocol *proto; const struct gre_protocol *proto;
...@@ -302,60 +110,19 @@ static const struct net_protocol net_gre_protocol = { ...@@ -302,60 +110,19 @@ static const struct net_protocol net_gre_protocol = {
.netns_ok = 1, .netns_ok = 1,
}; };
static const struct gre_protocol ipgre_protocol = {
.handler = gre_cisco_rcv,
.err_handler = gre_cisco_err,
};
int gre_cisco_register(struct gre_cisco_protocol *newp)
{
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
&gre_cisco_proto_list[newp->priority];
return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(gre_cisco_register);
int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
{
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
&gre_cisco_proto_list[del_proto->priority];
int ret;
ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
if (ret)
return ret;
synchronize_net();
return 0;
}
EXPORT_SYMBOL_GPL(gre_cisco_unregister);
static int __init gre_init(void) static int __init gre_init(void)
{ {
pr_info("GRE over IPv4 demultiplexor driver\n"); pr_info("GRE over IPv4 demultiplexor driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n"); pr_err("can't add protocol\n");
goto err; return -EAGAIN;
}
if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
pr_info("%s: can't add ipgre handler\n", __func__);
goto err_gre;
} }
return 0; return 0;
err_gre:
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
err:
return -EAGAIN;
} }
static void __exit gre_exit(void) static void __exit gre_exit(void)
{ {
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
} }
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include <linux/udp.h> #include <linux/udp.h>
#include <linux/if_arp.h> #include <linux/if_arp.h>
#include <linux/mroute.h> #include <linux/mroute.h>
#include <linux/if_vlan.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/in6.h> #include <linux/in6.h>
#include <linux/inetdevice.h> #include <linux/inetdevice.h>
...@@ -47,6 +48,7 @@ ...@@ -47,6 +48,7 @@
#include <net/netns/generic.h> #include <net/netns/generic.h>
#include <net/rtnetlink.h> #include <net/rtnetlink.h>
#include <net/gre.h> #include <net/gre.h>
#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h> #include <net/ipv6.h>
...@@ -121,7 +123,126 @@ static int ipgre_tunnel_init(struct net_device *dev); ...@@ -121,7 +123,126 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly; static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly; static int gre_tap_net_id __read_mostly;
static int ipgre_err(struct sk_buff *skb, u32 info, static int ip_gre_calc_hlen(__be16 o_flags)
{
int addend = 4;
if (o_flags & TUNNEL_CSUM)
addend += 4;
if (o_flags & TUNNEL_KEY)
addend += 4;
if (o_flags & TUNNEL_SEQ)
addend += 4;
return addend;
}
/* Translate on-the-wire GRE_* flag bits into the kernel's internal
 * TUNNEL_* flag representation.
 */
static __be16 gre_flags_to_tnl_flags(__be16 flags)
{
	__be16 tflags = 0;

	tflags |= (flags & GRE_CSUM) ? TUNNEL_CSUM : 0;
	tflags |= (flags & GRE_ROUTING) ? TUNNEL_ROUTING : 0;
	tflags |= (flags & GRE_KEY) ? TUNNEL_KEY : 0;
	tflags |= (flags & GRE_SEQ) ? TUNNEL_SEQ : 0;
	tflags |= (flags & GRE_STRICT) ? TUNNEL_STRICT : 0;
	tflags |= (flags & GRE_REC) ? TUNNEL_REC : 0;
	tflags |= (flags & GRE_VERSION) ? TUNNEL_VERSION : 0;

	return tflags;
}
/* Translate internal TUNNEL_* flag bits back into the on-the-wire GRE_*
 * representation (inverse of gre_flags_to_tnl_flags()).
 */
static __be16 tnl_flags_to_gre_flags(__be16 tflags)
{
	__be16 flags = 0;

	flags |= (tflags & TUNNEL_CSUM) ? GRE_CSUM : 0;
	flags |= (tflags & TUNNEL_ROUTING) ? GRE_ROUTING : 0;
	flags |= (tflags & TUNNEL_KEY) ? GRE_KEY : 0;
	flags |= (tflags & TUNNEL_SEQ) ? GRE_SEQ : 0;
	flags |= (tflags & TUNNEL_STRICT) ? GRE_STRICT : 0;
	flags |= (tflags & TUNNEL_REC) ? GRE_REC : 0;
	flags |= (tflags & TUNNEL_VERSION) ? GRE_VERSION : 0;

	return flags;
}
/* parse_gre_header - validate and strip the GRE header from an skb
 *
 * Rejects versioned or source-routed GRE, converts the wire GRE_* flag
 * bits into TUNNEL_* bits, verifies the optional checksum and extracts
 * the optional key and sequence words into @tpi.  On success the GRE
 * header is pulled so skb->data points at the inner payload.
 *
 * Returns 0 on success or -EINVAL on any malformed header; *@csum_err
 * is set to true when the failure was specifically a bad checksum.
 */
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
			    bool *csum_err)
{
	const struct gre_base_hdr *greh;
	__be32 *options;
	int hdr_len;

	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
		return -EINVAL;

	greh = (struct gre_base_hdr *)skb_transport_header(skb);
	/* Only GRE version 0 without routing information is supported. */
	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
		return -EINVAL;

	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
	hdr_len = ip_gre_calc_hlen(tpi->flags);

	if (!pskb_may_pull(skb, hdr_len))
		return -EINVAL;

	/* Re-read the header pointer: pskb_may_pull() may have
	 * reallocated the skb head.
	 */
	greh = (struct gre_base_hdr *)skb_transport_header(skb);
	tpi->proto = greh->protocol;

	/* Optional words (csum, key, seq) start right after the base
	 * header, in that on-wire order.
	 */
	options = (__be32 *)(greh + 1);
	if (greh->flags & GRE_CSUM) {
		if (skb_checksum_simple_validate(skb)) {
			*csum_err = true;
			return -EINVAL;
		}

		skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
					 null_compute_pseudo);
		options++;
	}

	if (greh->flags & GRE_KEY) {
		tpi->key = *options;
		options++;
	} else {
		tpi->key = 0;
	}
	if (unlikely(greh->flags & GRE_SEQ)) {
		tpi->seq = *options;
		options++;
	} else {
		tpi->seq = 0;
	}
	/* WCCP version 1 and 2 protocol decoding.
	 * - Change protocol to IP
	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
	 */
	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
		tpi->proto = htons(ETH_P_IP);
		/* WCCPv2 carries an extra redirect word before the inner
		 * IP header (which would otherwise start with 0x4x).
		 */
		if ((*(u8 *)options & 0xF0) != 0x40) {
			hdr_len += 4;
			if (!pskb_may_pull(skb, hdr_len))
				return -EINVAL;
		}
	}

	return iptunnel_pull_header(skb, hdr_len, tpi->proto);
}
static void ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi) const struct tnl_ptk_info *tpi)
{ {
...@@ -148,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info, ...@@ -148,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
switch (type) { switch (type) {
default: default:
case ICMP_PARAMETERPROB: case ICMP_PARAMETERPROB:
return PACKET_RCVD; return;
case ICMP_DEST_UNREACH: case ICMP_DEST_UNREACH:
switch (code) { switch (code) {
case ICMP_SR_FAILED: case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH: case ICMP_PORT_UNREACH:
/* Impossible event. */ /* Impossible event. */
return PACKET_RCVD; return;
default: default:
/* All others are translated to HOST_UNREACH. /* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH, rfc2003 contains "deep thoughts" about NET_UNREACH,
...@@ -164,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info, ...@@ -164,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
break; break;
} }
break; break;
case ICMP_TIME_EXCEEDED: case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL) if (code != ICMP_EXC_TTL)
return PACKET_RCVD; return;
break; break;
case ICMP_REDIRECT: case ICMP_REDIRECT:
...@@ -183,26 +305,85 @@ static int ipgre_err(struct sk_buff *skb, u32 info, ...@@ -183,26 +305,85 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
iph->daddr, iph->saddr, tpi->key); iph->daddr, iph->saddr, tpi->key);
if (!t) if (!t)
return PACKET_REJECT; return;
if (t->parms.iph.daddr == 0 || if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr)) ipv4_is_multicast(t->parms.iph.daddr))
return PACKET_RCVD; return;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
return PACKET_RCVD; return;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++; t->err_count++;
else else
t->err_count = 1; t->err_count = 1;
t->err_time = jiffies; t->err_time = jiffies;
return PACKET_RCVD; }
/* gre_err - ICMP error handler for IPPROTO_GRE
 *
 * Re-parses the GRE header of the packet embedded in the ICMP error,
 * handles PMTU updates and redirects directly, and hands every other
 * error type to ipgre_err() for per-tunnel accounting.
 */
static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means, that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put GRE key to the third word
	 * in GRE header. It makes impossible maintaining even soft
	 * state for keyed
	 * GRE tunnels with enabled checksum. Tell them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by Cisco employee,
	 * what the hell these idiots break standards established
	 * by themselves???
	 */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;
	bool csum_err = false;

	/* NOTE(review): on parse failure we bail out only when it was NOT
	 * a checksum error; checksum failures deliberately fall through
	 * and are still processed below ("ignore csum errors").
	 */
	if (parse_gre_header(skb, &tpi, &csum_err)) {
		if (!csum_err) /* ignore csum errors. */
			return;
	}

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
			      IPPROTO_GRE, 0);
		return;
	}

	ipgre_err(skb, info, &tpi);
}
/* Widen a 32-bit GRE key into a 64-bit tunnel id, keeping the key in the
 * least-significant 32 bits of the big-endian 64-bit value regardless of
 * host byte order.
 */
static __be64 key_to_tunnel_id(__be32 key)
{
#ifdef __BIG_ENDIAN
	return (__force __be64)((__force u32)key);
#else
	return (__force __be64)((__force u64)key << 32);
#endif
}
/* Returns the least-significant 32 bits of a __be64. */
static __be32 tunnel_id_to_key(__be64 x)
{
#ifdef __BIG_ENDIAN
return (__force __be32)x;
#else
return (__force __be32)((__force u64)x >> 32);
#endif
} }
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{ {
struct net *net = dev_net(skb->dev); struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
struct ip_tunnel_net *itn; struct ip_tunnel_net *itn;
const struct iphdr *iph; const struct iphdr *iph;
struct ip_tunnel *tunnel; struct ip_tunnel *tunnel;
...@@ -218,40 +399,194 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) ...@@ -218,40 +399,194 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
if (tunnel) { if (tunnel) {
skb_pop_mac_header(skb); skb_pop_mac_header(skb);
ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); if (tunnel->collect_md) {
struct ip_tunnel_info *info;
tun_dst = metadata_dst_alloc(0, GFP_ATOMIC);
if (!tun_dst)
return PACKET_REJECT;
info = &tun_dst->u.tun_info;
info->key.ipv4_src = iph->saddr;
info->key.ipv4_dst = iph->daddr;
info->key.ipv4_tos = iph->tos;
info->key.ipv4_ttl = iph->ttl;
info->mode = IP_TUNNEL_INFO_RX;
info->key.tun_flags = tpi->flags &
(TUNNEL_CSUM | TUNNEL_KEY);
info->key.tun_id = key_to_tunnel_id(tpi->key);
info->key.tp_src = 0;
info->key.tp_dst = 0;
}
ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD; return PACKET_RCVD;
} }
return PACKET_REJECT; return PACKET_REJECT;
} }
/* gre_rcv - IPPROTO_GRE receive handler for ip_gre
 *
 * Drops looped-back multicast packets, parses/strips the GRE header and
 * dispatches the payload to ipgre_rcv().  If no tunnel claims the packet
 * an ICMP port-unreachable is sent back.  Always consumes the skb.
 */
static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	if (parse_gre_header(skb, &tpi, &csum_err) < 0)
		goto drop;

	if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
		return 0;

	/* No tunnel matched: tell the sender this port is unreachable. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}
/* build_header - push a GRE header of @hdr_len bytes onto @skb
 *
 * Writes the base header, then fills the optional words.  On the wire
 * the optional order after the base header is csum, key, seq; they are
 * written back-to-front starting from the last word of the header.
 */
static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
			 __be16 proto, __be32 key, __be32 seq)
{
	struct gre_base_hdr *greh;

	skb_push(skb, hdr_len);
	skb_reset_transport_header(skb);
	greh = (struct gre_base_hdr *)skb->data;
	greh->flags = tnl_flags_to_gre_flags(flags);
	greh->protocol = proto;

	if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
		/* Start at the last optional word and walk backwards. */
		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);

		if (flags & TUNNEL_SEQ) {
			*ptr = seq;
			ptr--;
		}
		if (flags & TUNNEL_KEY) {
			*ptr = key;
			ptr--;
		}
		/* For GSO GRE packets the checksum is filled in later
		 * during segmentation, so only compute it here for
		 * non-GSO traffic.
		 */
		if (flags & TUNNEL_CSUM &&
		    !(skb_shinfo(skb)->gso_type &
		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
			*ptr = 0;
			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
								 skb->len, 0));
		}
	}
}
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, const struct iphdr *tnl_params,
__be16 proto) __be16 proto)
{ {
struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel *tunnel = netdev_priv(dev);
struct tnl_ptk_info tpi;
tpi.flags = tunnel->parms.o_flags;
tpi.proto = proto;
tpi.key = tunnel->parms.o_key;
if (tunnel->parms.o_flags & TUNNEL_SEQ) if (tunnel->parms.o_flags & TUNNEL_SEQ)
tunnel->o_seqno++; tunnel->o_seqno++;
tpi.seq = htonl(tunnel->o_seqno);
/* Push GRE header. */ /* Push GRE header. */
gre_build_header(skb, &tpi, tunnel->tun_hlen); build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
skb_set_inner_protocol(skb, tpi.proto);
skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
} }
/* Prepare the skb's GSO/checksum offload state for GRE encapsulation.
 * Returns the (possibly reallocated) skb, or an ERR_PTR on failure.
 */
static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
					   bool csum)
{
	return iptunnel_handle_offloads(skb, csum,
					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
/* gre_fb_xmit - transmit path for flow-based (collect_md) GRE devices
 *
 * Routes and encapsulates the packet using the per-skb tunnel metadata
 * (addresses, tos, ttl, key, flags) instead of the device's own tunnel
 * parameters.  Consumes the skb on all paths; failures are accounted in
 * dev->stats.tx_dropped.
 */
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel_info *tun_info;
	struct net *net = dev_net(dev);
	const struct ip_tunnel_key *key;
	struct flowi4 fl;
	struct rtable *rt;
	int min_headroom;
	int tunnel_hlen;
	__be16 df, flags;
	int err;

	/* Only packets explicitly marked for metadata transmit are valid
	 * on this path.
	 */
	tun_info = skb_tunnel_info(skb, AF_INET);
	if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX))
		goto err_free_skb;

	/* Build the route lookup key from the per-packet metadata. */
	key = &tun_info->key;
	memset(&fl, 0, sizeof(fl));
	fl.daddr = key->ipv4_dst;
	fl.saddr = key->ipv4_src;
	fl.flowi4_tos = RT_TOS(key->ipv4_tos);
	fl.flowi4_mark = skb->mark;
	fl.flowi4_proto = IPPROTO_GRE;

	rt = ip_route_output_key(net, &fl);
	if (IS_ERR(rt))
		goto err_free_skb;

	tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);

	/* Ensure headroom for link layer + outer IP + GRE headers,
	 * expanding the skb head if necessary.
	 */
	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ tunnel_hlen + sizeof(struct iphdr);
	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) +
						16);
		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
				       0, GFP_ATOMIC);
		if (unlikely(err))
			goto err_free_rt;
	}

	/* Push Tunnel header. */
	skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
	if (IS_ERR(skb)) {
		/* NOTE(review): assumes the offload helper freed the skb
		 * on error, so kfree_skb(NULL) below is a no-op — confirm.
		 */
		skb = NULL;
		goto err_free_rt;
	}

	/* Only checksum and key are meaningful on the flow-based path. */
	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
	build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
		     tunnel_id_to_key(tun_info->key.tun_id), 0);

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
	err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
			    key->ipv4_dst, IPPROTO_GRE,
			    key->ipv4_tos, key->ipv4_ttl, df, false);
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}
static netdev_tx_t ipgre_xmit(struct sk_buff *skb, static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct net_device *dev) struct net_device *dev)
{ {
struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params; const struct iphdr *tnl_params;
if (tunnel->collect_md) {
gre_fb_xmit(skb, dev);
return NETDEV_TX_OK;
}
if (dev->header_ops) { if (dev->header_ops) {
/* Need space for new headers */ /* Need space for new headers */
if (skb_cow_head(skb, dev->needed_headroom - if (skb_cow_head(skb, dev->needed_headroom -
...@@ -277,7 +612,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, ...@@ -277,7 +612,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
goto out; goto out;
__gre_xmit(skb, dev, tnl_params, skb->protocol); __gre_xmit(skb, dev, tnl_params, skb->protocol);
return NETDEV_TX_OK; return NETDEV_TX_OK;
free_skb: free_skb:
...@@ -292,6 +626,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, ...@@ -292,6 +626,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
{ {
struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel *tunnel = netdev_priv(dev);
if (tunnel->collect_md) {
gre_fb_xmit(skb, dev);
return NETDEV_TX_OK;
}
skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb)) if (IS_ERR(skb))
goto out; goto out;
...@@ -300,7 +639,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, ...@@ -300,7 +639,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
goto free_skb; goto free_skb;
__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
return NETDEV_TX_OK; return NETDEV_TX_OK;
free_skb: free_skb:
...@@ -530,10 +868,9 @@ static int ipgre_tunnel_init(struct net_device *dev) ...@@ -530,10 +868,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
return ip_tunnel_init(dev); return ip_tunnel_init(dev);
} }
static struct gre_cisco_protocol ipgre_protocol = { static const struct gre_protocol ipgre_protocol = {
.handler = ipgre_rcv, .handler = gre_rcv,
.err_handler = ipgre_err, .err_handler = gre_err,
.priority = 0,
}; };
static int __net_init ipgre_init_net(struct net *net) static int __net_init ipgre_init_net(struct net *net)
...@@ -596,7 +933,9 @@ static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) ...@@ -596,7 +933,9 @@ static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
return ipgre_tunnel_validate(tb, data); return ipgre_tunnel_validate(tb, data);
} }
static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], static void ipgre_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
struct ip_tunnel_parm *parms) struct ip_tunnel_parm *parms)
{ {
memset(parms, 0, sizeof(*parms)); memset(parms, 0, sizeof(*parms));
...@@ -635,6 +974,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], ...@@ -635,6 +974,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF); parms->iph.frag_off = htons(IP_DF);
if (data[IFLA_GRE_COLLECT_METADATA]) {
struct ip_tunnel *t = netdev_priv(dev);
t->collect_md = true;
}
} }
/* This function returns true when ENCAP attributes are present in the nl msg */ /* This function returns true when ENCAP attributes are present in the nl msg */
...@@ -712,7 +1057,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, ...@@ -712,7 +1057,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
return err; return err;
} }
ipgre_netlink_parms(data, tb, &p); ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_newlink(dev, tb, &p); return ip_tunnel_newlink(dev, tb, &p);
} }
...@@ -730,7 +1075,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], ...@@ -730,7 +1075,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
return err; return err;
} }
ipgre_netlink_parms(data, tb, &p); ipgre_netlink_parms(dev, data, tb, &p);
return ip_tunnel_changelink(dev, tb, &p); return ip_tunnel_changelink(dev, tb, &p);
} }
...@@ -765,6 +1110,8 @@ static size_t ipgre_get_size(const struct net_device *dev) ...@@ -765,6 +1110,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(2) + nla_total_size(2) +
/* IFLA_GRE_ENCAP_DPORT */ /* IFLA_GRE_ENCAP_DPORT */
nla_total_size(2) + nla_total_size(2) +
/* IFLA_GRE_COLLECT_METADATA */
nla_total_size(0) +
0; 0;
} }
...@@ -796,6 +1143,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) ...@@ -796,6 +1143,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
t->encap.flags)) t->encap.flags))
goto nla_put_failure; goto nla_put_failure;
if (t->collect_md) {
if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
goto nla_put_failure;
}
return 0; return 0;
nla_put_failure: nla_put_failure:
...@@ -817,6 +1169,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { ...@@ -817,6 +1169,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
[IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
[IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG },
}; };
static struct rtnl_link_ops ipgre_link_ops __read_mostly = { static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
...@@ -849,9 +1202,38 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { ...@@ -849,9 +1202,38 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net, .get_link_net = ip_tunnel_get_link_net,
}; };
/* gretap_fb_dev_create - create a flow-based (metadata collecting)
 * gretap device
 * @net: namespace to create the device in
 * @name: device name (may contain a "%d" template)
 * @name_assign_type: NET_NAME_* origin of @name
 *
 * Creates a gretap netdev with collect_md set, so all tunnel parameters
 * come from per-packet metadata rather than netlink configuration.
 * Returns the device or an ERR_PTR.
 *
 * NOTE(review): presumably the caller must hold RTNL for
 * rtnl_create_link()/ipgre_newlink() — confirm at call sites.
 */
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	struct ip_tunnel *t;
	int err;

	/* Empty attribute table: no explicit tunnel parameters. */
	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb);
	if (IS_ERR(dev))
		return dev;

	/* Configure flow based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL);
	if (err < 0)
		goto out;
	return dev;
out:
	free_netdev(dev);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
static int __net_init ipgre_tap_init_net(struct net *net) static int __net_init ipgre_tap_init_net(struct net *net)
{ {
return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
} }
static void __net_exit ipgre_tap_exit_net(struct net *net) static void __net_exit ipgre_tap_exit_net(struct net *net)
...@@ -881,7 +1263,7 @@ static int __init ipgre_init(void) ...@@ -881,7 +1263,7 @@ static int __init ipgre_init(void)
if (err < 0) if (err < 0)
goto pnet_tap_faied; goto pnet_tap_faied;
err = gre_cisco_register(&ipgre_protocol); err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) { if (err < 0) {
pr_info("%s: can't add protocol\n", __func__); pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed; goto add_proto_failed;
...@@ -900,7 +1282,7 @@ static int __init ipgre_init(void) ...@@ -900,7 +1282,7 @@ static int __init ipgre_init(void)
tap_ops_failed: tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops); rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed: rtnl_link_failed:
gre_cisco_unregister(&ipgre_protocol); gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed: add_proto_failed:
unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_faied: pnet_tap_faied:
...@@ -912,7 +1294,7 @@ static void __exit ipgre_fini(void) ...@@ -912,7 +1294,7 @@ static void __exit ipgre_fini(void)
{ {
rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops); rtnl_link_unregister(&ipgre_link_ops);
gre_cisco_unregister(&ipgre_protocol); gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops); unregister_pernet_device(&ipgre_net_ops);
} }
......
...@@ -230,10 +230,13 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, ...@@ -230,10 +230,13 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
if (cand) if (cand)
return cand; return cand;
t = rcu_dereference(itn->collect_md_tun);
if (t)
return t;
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
return netdev_priv(itn->fb_tunnel_dev); return netdev_priv(itn->fb_tunnel_dev);
return NULL; return NULL;
} }
EXPORT_SYMBOL_GPL(ip_tunnel_lookup); EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
...@@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) ...@@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{ {
struct hlist_head *head = ip_bucket(itn, &t->parms); struct hlist_head *head = ip_bucket(itn, &t->parms);
if (t->collect_md)
rcu_assign_pointer(itn->collect_md_tun, t);
hlist_add_head_rcu(&t->hash_node, head); hlist_add_head_rcu(&t->hash_node, head);
} }
static void ip_tunnel_del(struct ip_tunnel *t) static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{ {
if (t->collect_md)
rcu_assign_pointer(itn->collect_md_tun, NULL);
hlist_del_init_rcu(&t->hash_node); hlist_del_init_rcu(&t->hash_node);
} }
...@@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, ...@@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
} }
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, bool log_ecn_error) const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error)
{ {
struct pcpu_sw_netstats *tstats; struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb); const struct iphdr *iph = ip_hdr(skb);
...@@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, ...@@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
skb->dev = tunnel->dev; skb->dev = tunnel->dev;
} }
if (tun_dst)
skb_dst_set(skb, (struct dst_entry *)tun_dst);
gro_cells_receive(&tunnel->gro_cells, skb); gro_cells_receive(&tunnel->gro_cells, skb);
return 0; return 0;
...@@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, ...@@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel_parm *p, struct ip_tunnel_parm *p,
bool set_mtu) bool set_mtu)
{ {
ip_tunnel_del(t); ip_tunnel_del(itn, t);
t->parms.iph.saddr = p->iph.saddr; t->parms.iph.saddr = p->iph.saddr;
t->parms.iph.daddr = p->iph.daddr; t->parms.iph.daddr = p->iph.daddr;
t->parms.i_key = p->i_key; t->parms.i_key = p->i_key;
...@@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) ...@@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
if (itn->fb_tunnel_dev != dev) { if (itn->fb_tunnel_dev != dev) {
ip_tunnel_del(netdev_priv(dev)); ip_tunnel_del(itn, netdev_priv(dev));
unregister_netdevice_queue(dev, head); unregister_netdevice_queue(dev, head);
} }
} }
...@@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], ...@@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
nt = netdev_priv(dev); nt = netdev_priv(dev);
itn = net_generic(net, nt->ip_tnl_net_id); itn = net_generic(net, nt->ip_tnl_net_id);
if (nt->collect_md) {
if (rtnl_dereference(itn->collect_md_tun))
return -EEXIST;
} else {
if (ip_tunnel_find(itn, p, dev->type)) if (ip_tunnel_find(itn, p, dev->type))
return -EEXIST; return -EEXIST;
}
nt->net = net; nt->net = net;
nt->parms = *p; nt->parms = *p;
...@@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], ...@@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
dev->mtu = mtu; dev->mtu = mtu;
ip_tunnel_add(itn, nt); ip_tunnel_add(itn, nt);
out: out:
return err; return err;
} }
...@@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev) ...@@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev)
iph->version = 4; iph->version = 4;
iph->ihl = 5; iph->ihl = 5;
if (tunnel->collect_md) {
dev->features |= NETIF_F_NETNS_LOCAL;
netif_keep_dst(dev);
}
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(ip_tunnel_init); EXPORT_SYMBOL_GPL(ip_tunnel_init);
...@@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev) ...@@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev)
itn = net_generic(net, tunnel->ip_tnl_net_id); itn = net_generic(net, tunnel->ip_tnl_net_id);
/* fb_tunnel_dev will be unregisted in net-exit call. */ /* fb_tunnel_dev will be unregisted in net-exit call. */
if (itn->fb_tunnel_dev != dev) if (itn->fb_tunnel_dev != dev)
ip_tunnel_del(netdev_priv(dev)); ip_tunnel_del(itn, netdev_priv(dev));
ip_tunnel_dst_reset_all(tunnel); ip_tunnel_dst_reset_all(tunnel);
} }
......
...@@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb) ...@@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb)
goto drop; goto drop;
if (iptunnel_pull_header(skb, 0, tpi.proto)) if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop; goto drop;
return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
} }
return -1; return -1;
......
...@@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb) ...@@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb)
goto drop; goto drop;
if (iptunnel_pull_header(skb, 0, tpi.proto)) if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop; goto drop;
return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
} }
return 1; return 1;
......
...@@ -34,7 +34,7 @@ config OPENVSWITCH ...@@ -34,7 +34,7 @@ config OPENVSWITCH
config OPENVSWITCH_GRE config OPENVSWITCH_GRE
tristate "Open vSwitch GRE tunneling support" tristate "Open vSwitch GRE tunneling support"
depends on OPENVSWITCH depends on OPENVSWITCH
depends on NET_IPGRE_DEMUX depends on NET_IPGRE
default OPENVSWITCH default OPENVSWITCH
---help--- ---help---
If you say Y here, then the Open vSwitch will be able create GRE If you say Y here, then the Open vSwitch will be able create GRE
......
...@@ -45,235 +45,43 @@ ...@@ -45,235 +45,43 @@
#include "datapath.h" #include "datapath.h"
#include "vport.h" #include "vport.h"
#include "vport-netdev.h"
static struct vport_ops ovs_gre_vport_ops; static struct vport_ops ovs_gre_vport_ops;
/* Returns the least-significant 32 bits of a __be64. */ static struct vport *gre_tnl_create(const struct vport_parms *parms)
static __be32 be64_get_low32(__be64 x)
{ {
#ifdef __BIG_ENDIAN struct net *net = ovs_dp_get_net(parms->dp);
return (__force __be32)x; struct net_device *dev;
#else
return (__force __be32)((__force u64)x >> 32);
#endif
}
static __be16 filter_tnl_flags(__be16 flags)
{
return flags & (TUNNEL_CSUM | TUNNEL_KEY);
}
static struct sk_buff *__build_header(struct sk_buff *skb,
int tunnel_hlen)
{
struct tnl_ptk_info tpi;
const struct ip_tunnel_key *tun_key;
tun_key = &OVS_CB(skb)->egress_tun_info->key;
skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
if (IS_ERR(skb))
return skb;
tpi.flags = filter_tnl_flags(tun_key->tun_flags);
tpi.proto = htons(ETH_P_TEB);
tpi.key = be64_get_low32(tun_key->tun_id);
tpi.seq = 0;
gre_build_header(skb, &tpi, tunnel_hlen);
return skb;
}
static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
{
#ifdef __BIG_ENDIAN
return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
#else
return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
#endif
}
/* Called with rcu_read_lock and BH disabled. */
static int gre_rcv(struct sk_buff *skb,
const struct tnl_ptk_info *tpi)
{
struct ip_tunnel_info tun_info;
struct ovs_net *ovs_net;
struct vport *vport;
__be64 key;
ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
vport = rcu_dereference(ovs_net->vport_net.gre_vport);
if (unlikely(!vport))
return PACKET_REJECT;
key = key_to_tunnel_id(tpi->key, tpi->seq);
ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
filter_tnl_flags(tpi->flags), NULL, 0);
ovs_vport_receive(vport, skb, &tun_info);
return PACKET_RCVD;
}
/* Called with rcu_read_lock and BH disabled. */
static int gre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
{
struct ovs_net *ovs_net;
struct vport *vport; struct vport *vport;
ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms);
vport = rcu_dereference(ovs_net->vport_net.gre_vport); if (IS_ERR(vport))
return vport;
if (unlikely(!vport))
return PACKET_REJECT;
else
return PACKET_RCVD;
}
static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
{
struct net *net = ovs_dp_get_net(vport->dp);
const struct ip_tunnel_key *tun_key;
struct flowi4 fl;
struct rtable *rt;
int min_headroom;
int tunnel_hlen;
__be16 df;
int err;
if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
err = -EINVAL;
goto err_free_skb;
}
tun_key = &OVS_CB(skb)->egress_tun_info->key;
rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
goto err_free_skb;
}
tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags);
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ tunnel_hlen + sizeof(struct iphdr)
+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
int head_delta = SKB_DATA_ALIGN(min_headroom -
skb_headroom(skb) +
16);
err = pskb_expand_head(skb, max_t(int, head_delta, 0),
0, GFP_ATOMIC);
if (unlikely(err))
goto err_free_rt;
}
skb = vlan_hwaccel_push_inside(skb);
if (unlikely(!skb)) {
err = -ENOMEM;
goto err_free_rt;
}
/* Push Tunnel header. */ rtnl_lock();
skb = __build_header(skb, tunnel_hlen); dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER);
if (IS_ERR(skb)) { if (IS_ERR(dev)) {
err = PTR_ERR(skb); rtnl_unlock();
skb = NULL; ovs_vport_free(vport);
goto err_free_rt; return ERR_CAST(dev);
} }
df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? dev_change_flags(dev, dev->flags | IFF_UP);
htons(IP_DF) : 0; rtnl_unlock();
skb->ignore_df = 1; return vport;
return iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
tun_key->ipv4_dst, IPPROTO_GRE,
tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false);
err_free_rt:
ip_rt_put(rt);
err_free_skb:
kfree_skb(skb);
return err;
}
static struct gre_cisco_protocol gre_protocol = {
.handler = gre_rcv,
.err_handler = gre_err,
.priority = 1,
};
static int gre_ports;
static int gre_init(void)
{
int err;
gre_ports++;
if (gre_ports > 1)
return 0;
err = gre_cisco_register(&gre_protocol);
if (err)
pr_warn("cannot register gre protocol handler\n");
return err;
}
static void gre_exit(void)
{
gre_ports--;
if (gre_ports > 0)
return;
gre_cisco_unregister(&gre_protocol);
}
static const char *gre_get_name(const struct vport *vport)
{
return vport_priv(vport);
} }
static struct vport *gre_create(const struct vport_parms *parms) static struct vport *gre_create(const struct vport_parms *parms)
{ {
struct net *net = ovs_dp_get_net(parms->dp);
struct ovs_net *ovs_net;
struct vport *vport; struct vport *vport;
int err;
err = gre_init();
if (err)
return ERR_PTR(err);
ovs_net = net_generic(net, ovs_net_id);
if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
vport = ERR_PTR(-EEXIST);
goto error;
}
vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); vport = gre_tnl_create(parms);
if (IS_ERR(vport)) if (IS_ERR(vport))
goto error;
strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
return vport;
error:
gre_exit();
return vport; return vport;
}
static void gre_tnl_destroy(struct vport *vport)
{
struct net *net = ovs_dp_get_net(vport->dp);
struct ovs_net *ovs_net;
ovs_net = net_generic(net, ovs_net_id);
RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); return ovs_netdev_link(vport, parms->name);
ovs_vport_deferred_free(vport);
gre_exit();
} }
static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
...@@ -288,10 +96,9 @@ static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, ...@@ -288,10 +96,9 @@ static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
static struct vport_ops ovs_gre_vport_ops = { static struct vport_ops ovs_gre_vport_ops = {
.type = OVS_VPORT_TYPE_GRE, .type = OVS_VPORT_TYPE_GRE,
.create = gre_create, .create = gre_create,
.destroy = gre_tnl_destroy, .send = ovs_netdev_send,
.get_name = gre_get_name,
.send = gre_tnl_send,
.get_egress_tun_info = gre_get_egress_tun_info, .get_egress_tun_info = gre_get_egress_tun_info,
.destroy = ovs_netdev_tunnel_destroy,
.owner = THIS_MODULE, .owner = THIS_MODULE,
}; };
......
...@@ -147,7 +147,7 @@ static struct vport *netdev_create(const struct vport_parms *parms) ...@@ -147,7 +147,7 @@ static struct vport *netdev_create(const struct vport_parms *parms)
return ovs_netdev_link(vport, parms->name); return ovs_netdev_link(vport, parms->name);
} }
void ovs_vport_free_rcu(struct rcu_head *rcu) static void vport_netdev_free(struct rcu_head *rcu)
{ {
struct vport *vport = container_of(rcu, struct vport, rcu); struct vport *vport = container_of(rcu, struct vport, rcu);
...@@ -155,7 +155,6 @@ void ovs_vport_free_rcu(struct rcu_head *rcu) ...@@ -155,7 +155,6 @@ void ovs_vport_free_rcu(struct rcu_head *rcu)
dev_put(vport->dev); dev_put(vport->dev);
ovs_vport_free(vport); ovs_vport_free(vport);
} }
EXPORT_SYMBOL_GPL(ovs_vport_free_rcu);
void ovs_netdev_detach_dev(struct vport *vport) void ovs_netdev_detach_dev(struct vport *vport)
{ {
...@@ -175,9 +174,25 @@ static void netdev_destroy(struct vport *vport) ...@@ -175,9 +174,25 @@ static void netdev_destroy(struct vport *vport)
ovs_netdev_detach_dev(vport); ovs_netdev_detach_dev(vport);
rtnl_unlock(); rtnl_unlock();
call_rcu(&vport->rcu, ovs_vport_free_rcu); call_rcu(&vport->rcu, vport_netdev_free);
} }
void ovs_netdev_tunnel_destroy(struct vport *vport)
{
rtnl_lock();
if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
ovs_netdev_detach_dev(vport);
/* Early release so we can unregister the device */
dev_put(vport->dev);
rtnl_delete_link(vport->dev);
vport->dev = NULL;
rtnl_unlock();
call_rcu(&vport->rcu, vport_netdev_free);
}
EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
static unsigned int packet_length(const struct sk_buff *skb) static unsigned int packet_length(const struct sk_buff *skb)
{ {
unsigned int length = skb->len - ETH_HLEN; unsigned int length = skb->len - ETH_HLEN;
......
...@@ -29,9 +29,9 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); ...@@ -29,9 +29,9 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev);
struct vport *ovs_netdev_link(struct vport *vport, const char *name); struct vport *ovs_netdev_link(struct vport *vport, const char *name);
int ovs_netdev_send(struct vport *vport, struct sk_buff *skb); int ovs_netdev_send(struct vport *vport, struct sk_buff *skb);
void ovs_netdev_detach_dev(struct vport *); void ovs_netdev_detach_dev(struct vport *);
void ovs_vport_free_rcu(struct rcu_head *);
int __init ovs_netdev_init(void); int __init ovs_netdev_init(void);
void ovs_netdev_exit(void); void ovs_netdev_exit(void);
void ovs_netdev_tunnel_destroy(struct vport *vport);
#endif /* vport_netdev.h */ #endif /* vport_netdev.h */
...@@ -146,21 +146,6 @@ static struct vport *vxlan_create(const struct vport_parms *parms) ...@@ -146,21 +146,6 @@ static struct vport *vxlan_create(const struct vport_parms *parms)
return ovs_netdev_link(vport, parms->name); return ovs_netdev_link(vport, parms->name);
} }
static void vxlan_destroy(struct vport *vport)
{
rtnl_lock();
if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
ovs_netdev_detach_dev(vport);
/* Early release so we can unregister the device */
dev_put(vport->dev);
rtnl_delete_link(vport->dev);
vport->dev = NULL;
rtnl_unlock();
call_rcu(&vport->rcu, ovs_vport_free_rcu);
}
static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
struct ip_tunnel_info *egress_tun_info) struct ip_tunnel_info *egress_tun_info)
{ {
...@@ -183,7 +168,7 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, ...@@ -183,7 +168,7 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
static struct vport_ops ovs_vxlan_netdev_vport_ops = { static struct vport_ops ovs_vxlan_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_VXLAN, .type = OVS_VPORT_TYPE_VXLAN,
.create = vxlan_create, .create = vxlan_create,
.destroy = vxlan_destroy, .destroy = ovs_netdev_tunnel_destroy,
.get_options = vxlan_get_options, .get_options = vxlan_get_options,
.send = ovs_netdev_send, .send = ovs_netdev_send,
.get_egress_tun_info = vxlan_get_egress_tun_info, .get_egress_tun_info = vxlan_get_egress_tun_info,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment