Commit 33c6ce4a authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'net-move-more-duplicate-code-of-ovs-and-tc-conntrack-into-nf_conntrack_ovs'

Xin Long says:

====================
net: move more duplicate code of ovs and tc conntrack into nf_conntrack_ovs

We've moved some duplicate code into nf_nat_ovs in:

  "net: eliminate the duplicate code in the ct nat functions of ovs and tc"

This patchset addresses more code duplication in the conntrack of ovs
and tc then creates nf_conntrack_ovs for them, and four functions will
be extracted and moved into it:

  nf_ct_handle_fragments()
  nf_ct_skb_network_trim()
  nf_ct_helper()
  nf_ct_add_helper()
====================

Link: https://lore.kernel.org/r/cover.1675810210.git.lucien.xin@gmail.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 025a785f 0785407e
...@@ -362,6 +362,10 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) ...@@ -362,6 +362,10 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
return net_generic(net, nf_conntrack_net_id); return net_generic(net, nf_conntrack_net_id);
} }
int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
u16 zone, u8 family, u8 *proto, u16 *mru);
#define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count)
#define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
#define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v))
......
...@@ -189,6 +189,9 @@ config NF_CONNTRACK_LABELS ...@@ -189,6 +189,9 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It can be used with xtables connlabel to connection tracking entries. It can be used with xtables connlabel
match and the nftables ct expression. match and the nftables ct expression.
config NF_CONNTRACK_OVS
bool
config NF_CT_PROTO_DCCP config NF_CT_PROTO_DCCP
bool 'DCCP protocol connection tracking support' bool 'DCCP protocol connection tracking support'
depends on NETFILTER_ADVANCED depends on NETFILTER_ADVANCED
......
...@@ -11,6 +11,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o ...@@ -11,6 +11,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
......
...@@ -242,104 +242,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, ...@@ -242,104 +242,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
} }
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
/* 'skb' should already be pulled to nh_ofs. */
int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo, u16 proto)
{
const struct nf_conntrack_helper *helper;
const struct nf_conn_help *help;
unsigned int protoff;
int err;
if (ctinfo == IP_CT_RELATED_REPLY)
return NF_ACCEPT;
help = nfct_help(ct);
if (!help)
return NF_ACCEPT;
helper = rcu_dereference(help->helper);
if (!helper)
return NF_ACCEPT;
if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
helper->tuple.src.l3num != proto)
return NF_ACCEPT;
switch (proto) {
case NFPROTO_IPV4:
protoff = ip_hdrlen(skb);
proto = ip_hdr(skb)->protocol;
break;
case NFPROTO_IPV6: {
u8 nexthdr = ipv6_hdr(skb)->nexthdr;
__be16 frag_off;
int ofs;
ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
&frag_off);
if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
pr_debug("proto header not found\n");
return NF_ACCEPT;
}
protoff = ofs;
proto = nexthdr;
break;
}
default:
WARN_ONCE(1, "helper invoked on non-IP family!");
return NF_DROP;
}
if (helper->tuple.dst.protonum != proto)
return NF_ACCEPT;
err = helper->help(skb, protoff, ct, ctinfo);
if (err != NF_ACCEPT)
return err;
/* Adjust seqs after helper. This is needed due to some helpers (e.g.,
* FTP with NAT) adusting the TCP payload size when mangling IP
* addresses and/or port numbers in the text-based control connection.
*/
if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
!nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
return NF_DROP;
return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_ct_helper);
int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
u8 proto, bool nat, struct nf_conntrack_helper **hp)
{
struct nf_conntrack_helper *helper;
struct nf_conn_help *help;
int ret = 0;
helper = nf_conntrack_helper_try_module_get(name, family, proto);
if (!helper)
return -EINVAL;
help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
if (!help) {
nf_conntrack_helper_put(helper);
return -ENOMEM;
}
#if IS_ENABLED(CONFIG_NF_NAT)
if (nat) {
ret = nf_nat_helper_try_module_get(name, family, proto);
if (ret) {
nf_conntrack_helper_put(helper);
return ret;
}
}
#endif
rcu_assign_pointer(help->helper, helper);
*hp = helper;
return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_add_helper);
/* appropriate ct lock protecting must be taken by caller */ /* appropriate ct lock protecting must be taken by caller */
static int unhelp(struct nf_conn *ct, void *me) static int unhelp(struct nf_conn *ct, void *me)
{ {
......
// SPDX-License-Identifier: GPL-2.0-only
/* Support ct functions for openvswitch and used by OVS and TC conntrack. */
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/ipv6_frag.h>
#include <net/ip.h>
/* 'skb' should already be pulled to nh_ofs. */
int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo, u16 proto)
{
const struct nf_conntrack_helper *helper;
const struct nf_conn_help *help;
unsigned int protoff;
int err;
if (ctinfo == IP_CT_RELATED_REPLY)
return NF_ACCEPT;
help = nfct_help(ct);
if (!help)
return NF_ACCEPT;
helper = rcu_dereference(help->helper);
if (!helper)
return NF_ACCEPT;
if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
helper->tuple.src.l3num != proto)
return NF_ACCEPT;
switch (proto) {
case NFPROTO_IPV4:
protoff = ip_hdrlen(skb);
proto = ip_hdr(skb)->protocol;
break;
case NFPROTO_IPV6: {
u8 nexthdr = ipv6_hdr(skb)->nexthdr;
__be16 frag_off;
int ofs;
ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
&frag_off);
if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
pr_debug("proto header not found\n");
return NF_ACCEPT;
}
protoff = ofs;
proto = nexthdr;
break;
}
default:
WARN_ONCE(1, "helper invoked on non-IP family!");
return NF_DROP;
}
if (helper->tuple.dst.protonum != proto)
return NF_ACCEPT;
err = helper->help(skb, protoff, ct, ctinfo);
if (err != NF_ACCEPT)
return err;
/* Adjust seqs after helper. This is needed due to some helpers (e.g.,
* FTP with NAT) adusting the TCP payload size when mangling IP
* addresses and/or port numbers in the text-based control connection.
*/
if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
!nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
return NF_DROP;
return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_ct_helper);
int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
u8 proto, bool nat, struct nf_conntrack_helper **hp)
{
struct nf_conntrack_helper *helper;
struct nf_conn_help *help;
int ret = 0;
helper = nf_conntrack_helper_try_module_get(name, family, proto);
if (!helper)
return -EINVAL;
help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
if (!help) {
nf_conntrack_helper_put(helper);
return -ENOMEM;
}
#if IS_ENABLED(CONFIG_NF_NAT)
if (nat) {
ret = nf_nat_helper_try_module_get(name, family, proto);
if (ret) {
nf_conntrack_helper_put(helper);
return ret;
}
}
#endif
rcu_assign_pointer(help->helper, helper);
*hp = helper;
return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_add_helper);
/* Trim the skb to the length specified by the IP/IPv6 header,
* removing any trailing lower-layer padding. This prepares the skb
* for higher-layer processing that assumes skb->len excludes padding
* (such as nf_ip_checksum). The caller needs to pull the skb to the
* network header, and ensure ip_hdr/ipv6_hdr points to valid data.
*/
int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
{
unsigned int len;
switch (family) {
case NFPROTO_IPV4:
len = skb_ip_totlen(skb);
break;
case NFPROTO_IPV6:
len = sizeof(struct ipv6hdr)
+ ntohs(ipv6_hdr(skb)->payload_len);
break;
default:
len = skb->len;
}
return pskb_trim_rcsum(skb, len);
}
EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim);
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
* value if 'skb' is freed.
*/
int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
u16 zone, u8 family, u8 *proto, u16 *mru)
{
int err;
if (family == NFPROTO_IPV4) {
enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
local_bh_disable();
err = ip_defrag(net, skb, user);
local_bh_enable();
if (err)
return err;
*mru = IPCB(skb)->frag_max_size;
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
} else if (family == NFPROTO_IPV6) {
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
if (err) {
if (err != -EINPROGRESS)
kfree_skb(skb);
return err;
}
*proto = ipv6_hdr(skb)->nexthdr;
*mru = IP6CB(skb)->frag_max_size;
#endif
} else {
kfree_skb(skb);
return -EPFNOSUPPORT;
}
skb_clear_hash(skb);
skb->ignore_df = 1;
return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_handle_fragments);
...@@ -15,6 +15,7 @@ config OPENVSWITCH ...@@ -15,6 +15,7 @@ config OPENVSWITCH
select NET_MPLS_GSO select NET_MPLS_GSO
select DST_CACHE select DST_CACHE
select NET_NSH select NET_NSH
select NF_CONNTRACK_OVS if NF_CONNTRACK
select NF_NAT_OVS if NF_NAT select NF_NAT_OVS if NF_NAT
help help
Open vSwitch is a multilayer Ethernet switch targeted at virtualized Open vSwitch is a multilayer Ethernet switch targeted at virtualized
......
...@@ -435,52 +435,21 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, ...@@ -435,52 +435,21 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
return 0; return 0;
} }
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
* value if 'skb' is freed. u16 zone, int family, struct sk_buff *skb)
*/
static int handle_fragments(struct net *net, struct sw_flow_key *key,
u16 zone, struct sk_buff *skb)
{ {
struct ovs_skb_cb ovs_cb = *OVS_CB(skb); struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
int err; int err;
if (key->eth.type == htons(ETH_P_IP)) { err = nf_ct_handle_fragments(net, skb, zone, family, &key->ip.proto, &ovs_cb.mru);
enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; if (err)
return err;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
err = ip_defrag(net, skb, user);
if (err)
return err;
ovs_cb.mru = IPCB(skb)->frag_max_size;
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
} else if (key->eth.type == htons(ETH_P_IPV6)) {
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
if (err) {
if (err != -EINPROGRESS)
kfree_skb(skb);
return err;
}
key->ip.proto = ipv6_hdr(skb)->nexthdr;
ovs_cb.mru = IP6CB(skb)->frag_max_size;
#endif
} else {
kfree_skb(skb);
return -EPFNOSUPPORT;
}
/* The key extracted from the fragment that completed this datagram /* The key extracted from the fragment that completed this datagram
* likely didn't have an L4 header, so regenerate it. * likely didn't have an L4 header, so regenerate it.
*/ */
ovs_flow_key_update_l3l4(skb, key); ovs_flow_key_update_l3l4(skb, key);
key->ip.frag = OVS_FRAG_TYPE_NONE; key->ip.frag = OVS_FRAG_TYPE_NONE;
skb_clear_hash(skb);
skb->ignore_df = 1;
*OVS_CB(skb) = ovs_cb; *OVS_CB(skb) = ovs_cb;
return 0; return 0;
...@@ -1091,36 +1060,6 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, ...@@ -1091,36 +1060,6 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
return 0; return 0;
} }
/* Trim the skb to the length specified by the IP/IPv6 header,
* removing any trailing lower-layer padding. This prepares the skb
* for higher-layer processing that assumes skb->len excludes padding
* (such as nf_ip_checksum). The caller needs to pull the skb to the
* network header, and ensure ip_hdr/ipv6_hdr points to valid data.
*/
static int ovs_skb_network_trim(struct sk_buff *skb)
{
unsigned int len;
int err;
switch (skb->protocol) {
case htons(ETH_P_IP):
len = skb_ip_totlen(skb);
break;
case htons(ETH_P_IPV6):
len = sizeof(struct ipv6hdr)
+ ntohs(ipv6_hdr(skb)->payload_len);
break;
default:
len = skb->len;
}
err = pskb_trim_rcsum(skb, len);
if (err)
kfree_skb(skb);
return err;
}
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
* value if 'skb' is freed. * value if 'skb' is freed.
*/ */
...@@ -1135,12 +1074,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, ...@@ -1135,12 +1074,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
nh_ofs = skb_network_offset(skb); nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs); skb_pull_rcsum(skb, nh_ofs);
err = ovs_skb_network_trim(skb); err = nf_ct_skb_network_trim(skb, info->family);
if (err) if (err) {
kfree_skb(skb);
return err; return err;
}
if (key->ip.frag != OVS_FRAG_TYPE_NONE) { if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
err = handle_fragments(net, key, info->zone.id, skb); err = ovs_ct_handle_fragments(net, key, info->zone.id,
info->family, skb);
if (err) if (err)
return err; return err;
} }
......
...@@ -984,6 +984,7 @@ config NET_ACT_TUNNEL_KEY ...@@ -984,6 +984,7 @@ config NET_ACT_TUNNEL_KEY
config NET_ACT_CT config NET_ACT_CT
tristate "connection tracking tc action" tristate "connection tracking tc action"
depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE
select NF_CONNTRACK_OVS
select NF_NAT_OVS if NF_NAT select NF_NAT_OVS if NF_NAT
help help
Say Y here to allow sending the packets to conntrack module. Say Y here to allow sending the packets to conntrack module.
......
...@@ -726,31 +726,6 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, ...@@ -726,31 +726,6 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
return false; return false;
} }
/* Trim the skb to the length specified by the IP/IPv6 header,
* removing any trailing lower-layer padding. This prepares the skb
* for higher-layer processing that assumes skb->len excludes padding
* (such as nf_ip_checksum). The caller needs to pull the skb to the
* network header, and ensure ip_hdr/ipv6_hdr points to valid data.
*/
static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
{
unsigned int len;
switch (family) {
case NFPROTO_IPV4:
len = skb_ip_totlen(skb);
break;
case NFPROTO_IPV6:
len = sizeof(struct ipv6hdr)
+ ntohs(ipv6_hdr(skb)->payload_len);
break;
default:
len = skb->len;
}
return pskb_trim_rcsum(skb, len);
}
static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
{ {
u8 family = NFPROTO_UNSPEC; u8 family = NFPROTO_UNSPEC;
...@@ -810,6 +785,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, ...@@ -810,6 +785,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
struct nf_conn *ct; struct nf_conn *ct;
int err = 0; int err = 0;
bool frag; bool frag;
u8 proto;
u16 mru; u16 mru;
/* Previously seen (loopback)? Ignore. */ /* Previously seen (loopback)? Ignore. */
...@@ -825,50 +801,14 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, ...@@ -825,50 +801,14 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
return err; return err;
skb_get(skb); skb_get(skb);
mru = tc_skb_cb(skb)->mru; err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
if (err)
if (family == NFPROTO_IPV4) { return err;
enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
local_bh_disable();
err = ip_defrag(net, skb, user);
local_bh_enable();
if (err && err != -EINPROGRESS)
return err;
if (!err) {
*defrag = true;
mru = IPCB(skb)->frag_max_size;
}
} else { /* NFPROTO_IPV6 */
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
if (err && err != -EINPROGRESS)
goto out_free;
if (!err) {
*defrag = true;
mru = IP6CB(skb)->frag_max_size;
}
#else
err = -EOPNOTSUPP;
goto out_free;
#endif
}
if (err != -EINPROGRESS) *defrag = true;
tc_skb_cb(skb)->mru = mru; tc_skb_cb(skb)->mru = mru;
skb_clear_hash(skb);
skb->ignore_df = 1;
return err;
out_free: return 0;
kfree_skb(skb);
return err;
} }
static void tcf_ct_params_free(struct tcf_ct_params *params) static void tcf_ct_params_free(struct tcf_ct_params *params)
...@@ -1011,7 +951,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, ...@@ -1011,7 +951,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
if (err) if (err)
goto drop; goto drop;
err = tcf_ct_skb_network_trim(skb, family); err = nf_ct_skb_network_trim(skb, family);
if (err) if (err)
goto drop; goto drop;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment