Merge branch 'net-move-more-duplicate-code-of-ovs-and-tc-conntrack-into-nf_conntrack_ovs'

Xin Long says:

====================
net: move more duplicate code of ovs and tc conntrack into nf_conntrack_ovs

We've moved some duplicate code into nf_nat_ovs in:

  "net: eliminate the duplicate code in the ct nat functions of ovs and tc"

This patchset addresses more code duplication in the conntrack of ovs
and tc, then creates nf_conntrack_ovs for them. Four functions will be
extracted and moved into it:

  nf_ct_handle_fragments()
  nf_ct_skb_network_trim()
  nf_ct_helper()
  nf_ct_add_helper()
====================

Link: https://lore.kernel.org/r/cover.1675810210.git.lucien.xin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Merge branch 'net-move-more-duplicate-code-of-ovs-and-tc-conntrack-into-nf_conntrack_ovs'
Xin Long says:

====================
net: move more duplicate code of ovs and tc conntrack into nf_conntrack_ovs

We've moved some duplicate code into nf_nat_ovs in:

  "net: eliminate the duplicate code in the ct nat functions of ovs and tc"

This patchset addresses more code duplication in the conntrack of ovs
and tc, then creates nf_conntrack_ovs for them. Four functions will be
extracted and moved into it:

  nf_ct_handle_fragments()
  nf_ct_skb_network_trim()
  nf_ct_helper()
  nf_ct_add_helper()
====================

Link: https://lore.kernel.org/r/cover.1675810210.git.lucien.xin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
33c6ce4a · Jakub Kicinski · 025a785f · 0785407e · 33c6ce4a · 33c6ce4a
Commit 33c6ce4a authored Feb 10, 2023 by Jakub Kicinski
9 changed files
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -362,6 +362,10 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
 	return net_generic(net, nf_conntrack_net_id);
 }
+int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
+int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+			   u16 zone, u8 family, u8 *proto, u16 *mru);
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v))

--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -189,6 +189,9 @@ config NF_CONNTRACK_LABELS
 	  to connection tracking entries.  It can be used with xtables connlabel
 	  match and the nftables ct expression.
+config NF_CONNTRACK_OVS
+	bool
 config NF_CT_PROTO_DCCP
 	bool 'DCCP protocol connection tracking support'
 	depends on NETFILTER_ADVANCED

--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -11,6 +11,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o

--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -242,104 +242,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 }
 EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
-/* 'skb' should already be pulled to nh_ofs. */
-int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
-		 enum ip_conntrack_info ctinfo, u16 proto)
-{
-	const struct nf_conntrack_helper *helper;
-	const struct nf_conn_help *help;
-	unsigned int protoff;
-	int err;
-	if (ctinfo == IP_CT_RELATED_REPLY)
-		return NF_ACCEPT;
-	help = nfct_help(ct);
-	if (!help)
-		return NF_ACCEPT;
-	helper = rcu_dereference(help->helper);
-	if (!helper)
-		return NF_ACCEPT;
-	if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
-	    helper->tuple.src.l3num != proto)
-		return NF_ACCEPT;
-	switch (proto) {
-	case NFPROTO_IPV4:
-		protoff = ip_hdrlen(skb);
-		proto = ip_hdr(skb)->protocol;
-		break;
-	case NFPROTO_IPV6: {
-		u8 nexthdr = ipv6_hdr(skb)->nexthdr;
-		__be16 frag_off;
-		int ofs;
-		ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
-				       &frag_off);
-		if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
-			pr_debug("proto header not found\n");
-			return NF_ACCEPT;
-		}
-		protoff = ofs;
-		proto = nexthdr;
-		break;
-	}
-	default:
-		WARN_ONCE(1, "helper invoked on non-IP family!");
-		return NF_DROP;
-	}
-	if (helper->tuple.dst.protonum != proto)
-		return NF_ACCEPT;
-	err = helper->help(skb, protoff, ct, ctinfo);
-	if (err != NF_ACCEPT)
-		return err;
-	/* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
-	 * FTP with NAT) adusting the TCP payload size when mangling IP
-	 * addresses and/or port numbers in the text-based control connection.
-	 */
-	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
-	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
-		return NF_DROP;
-	return NF_ACCEPT;
-}
-EXPORT_SYMBOL_GPL(nf_ct_helper);
-int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
-		     u8 proto, bool nat, struct nf_conntrack_helper **hp)
-{
-	struct nf_conntrack_helper *helper;
-	struct nf_conn_help *help;
-	int ret = 0;
-	helper = nf_conntrack_helper_try_module_get(name, family, proto);
-	if (!helper)
-		return -EINVAL;
-	help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
-	if (!help) {
-		nf_conntrack_helper_put(helper);
-		return -ENOMEM;
-	}
-#if IS_ENABLED(CONFIG_NF_NAT)
-	if (nat) {
-		ret = nf_nat_helper_try_module_get(name, family, proto);
-		if (ret) {
-			nf_conntrack_helper_put(helper);
-			return ret;
-		}
-	}
-#endif
-	rcu_assign_pointer(help->helper, helper);
-	*hp = helper;
-	return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_add_helper);
 /* appropriate ct lock protecting must be taken by caller */
 static int unhelp(struct nf_conn *ct, void *me)
 {

--- a/net/netfilter/nf_conntrack_ovs.c
+++ b/net/netfilter/nf_conntrack_ovs.c
+// SPDX-License-Identifier: GPL-2.0-only
+/* Support ct functions for openvswitch and used by OVS and TC conntrack. */
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
+#include <net/ip.h>
+/* Run the conntrack helper attached to 'ct' (if any) on 'skb'.
+ *
+ * 'skb' should already be pulled to nh_ofs.
+ * 'proto' is the L3 family (NFPROTO_IPV4 or NFPROTO_IPV6) on entry; the
+ * variable is reused below to hold the L4 protocol number read from the
+ * packet. The helper is only invoked when its tuple.src.l3num is
+ * NFPROTO_UNSPEC or equals the family, and its tuple.dst.protonum equals
+ * the packet's L4 protocol.
+ *
+ * Returns NF_ACCEPT when there is nothing to do (IP_CT_RELATED_REPLY, no
+ * helper, no match, IPv6 protocol header not found) or when processing
+ * succeeded; the helper's own verdict when that is not NF_ACCEPT; NF_DROP
+ * for a non-IP family or when nf_ct_seq_adjust() fails.
+ */
+int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo, u16 proto)
+{
+	const struct nf_conntrack_helper *helper;
+	const struct nf_conn_help *help;
+	unsigned int protoff;
+	int err;
+	if (ctinfo == IP_CT_RELATED_REPLY)
+		return NF_ACCEPT;
+	help = nfct_help(ct);
+	if (!help)
+		return NF_ACCEPT;
+	helper = rcu_dereference(help->helper);
+	if (!helper)
+		return NF_ACCEPT;
+	if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
+	    helper->tuple.src.l3num != proto)
+		return NF_ACCEPT;
+	switch (proto) {
+	case NFPROTO_IPV4:
+		protoff = ip_hdrlen(skb);
+		proto = ip_hdr(skb)->protocol;
+		break;
+	case NFPROTO_IPV6: {
+		u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+		__be16 frag_off;
+		int ofs;
+		ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+				       &frag_off);
+		if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
+			pr_debug("proto header not found\n");
+			return NF_ACCEPT;
+		}
+		protoff = ofs;
+		proto = nexthdr;
+		break;
+	}
+	default:
+		WARN_ONCE(1, "helper invoked on non-IP family!");
+		return NF_DROP;
+	}
+	if (helper->tuple.dst.protonum != proto)
+		return NF_ACCEPT;
+	err = helper->help(skb, protoff, ct, ctinfo);
+	if (err != NF_ACCEPT)
+		return err;
+	/* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
+	 * FTP with NAT) adjusting the TCP payload size when mangling IP
+	 * addresses and/or port numbers in the text-based control connection.
+	 */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+		return NF_DROP;
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper);
+int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
+		     u8 proto, bool nat, struct nf_conntrack_helper **hp)
+{	/* Look up the conntrack helper for name/family/proto, add a helper
+	 * extension to 'ct', store the helper in that extension and hand it
+	 * back through '*hp'. When 'nat' is set and CONFIG_NF_NAT is
+	 * enabled, nf_nat_helper_try_module_get() is called as well.
+	 *
+	 * Returns 0 on success, -EINVAL if the helper lookup returns NULL,
+	 * -ENOMEM if nf_ct_helper_ext_add() returns NULL, or the error from
+	 * nf_nat_helper_try_module_get(). '*hp' is only written on success;
+	 * on the failure paths after the lookup the helper is released with
+	 * nf_conntrack_helper_put().
+	 */
+	struct nf_conntrack_helper *helper;
+	struct nf_conn_help *help;
+	int ret = 0;
+	helper = nf_conntrack_helper_try_module_get(name, family, proto);
+	if (!helper)
+		return -EINVAL;
+	help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+	if (!help) {
+		nf_conntrack_helper_put(helper);
+		return -ENOMEM;
+	}
+#if IS_ENABLED(CONFIG_NF_NAT)
+	if (nat) {
+		ret = nf_nat_helper_try_module_get(name, family, proto);
+		if (ret) {
+			nf_conntrack_helper_put(helper);
+			return ret;
+		}
+	}
+#endif
+	rcu_assign_pointer(help->helper, helper);
+	*hp = helper;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_add_helper);
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ *
+ * 'family' selects which header supplies the length: NFPROTO_IPV4 uses
+ * skb_ip_totlen(), NFPROTO_IPV6 uses the fixed IPv6 header size plus the
+ * header's payload_len. For any other family skb->len itself is used as
+ * the trim length.
+ *
+ * Returns the result of pskb_trim_rcsum(). The skb is not freed here on
+ * failure; that is left to the caller.
+ */
+int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+	unsigned int len;
+	switch (family) {
+	case NFPROTO_IPV4:
+		len = skb_ip_totlen(skb);
+		break;
+	case NFPROTO_IPV6:
+		len = sizeof(struct ipv6hdr)
+			+ ntohs(ipv6_hdr(skb)->payload_len);
+		break;
+	default:
+		len = skb->len;
+	}
+	return pskb_trim_rcsum(skb, len);
+}
+EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim);
+/* Reassemble IP fragments before conntrack processing; shared by the OVS
+ * and TC conntrack code.
+ *
+ * 'zone' is added to IP_DEFRAG_CONNTRACK_IN (IPv4) or
+ * IP6_DEFRAG_CONNTRACK_IN (IPv6) to form the defrag user. The skb control
+ * block (IPCB/IP6CB) is zeroed before defragmentation. The IPv4 path calls
+ * ip_defrag() with bottom halves disabled; the IPv6 path is only compiled
+ * in when CONFIG_NF_DEFRAG_IPV6 is enabled.
+ *
+ * On success '*mru' is set to the frag_max_size recorded in the control
+ * block, '*proto' is set to the IPv6 nexthdr of the resulting packet
+ * ('*proto' is left untouched for IPv4), the skb hash is cleared and
+ * skb->ignore_df is set. Neither out-parameter is written on failure.
+ *
+ * Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+ * value if 'skb' is freed. An unsupported 'family' frees the skb and
+ * returns -EPFNOSUPPORT.
+ */
+int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+			   u16 zone, u8 family, u8 *proto, u16 *mru)
+{
+	int err;
+	if (family == NFPROTO_IPV4) {
+		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+		local_bh_disable();
+		err = ip_defrag(net, skb, user);
+		local_bh_enable();
+		if (err)
+			return err;
+		*mru = IPCB(skb)->frag_max_size;
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	} else if (family == NFPROTO_IPV6) {
+		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+		err = nf_ct_frag6_gather(net, skb, user);
+		if (err) {
+			if (err != -EINPROGRESS)
+				kfree_skb(skb);
+			return err;
+		}
+		*proto = ipv6_hdr(skb)->nexthdr;
+		*mru = IP6CB(skb)->frag_max_size;
+#endif
+	} else {
+		kfree_skb(skb);
+		return -EPFNOSUPPORT;
+	}
+	skb_clear_hash(skb);
+	skb->ignore_df = 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_handle_fragments);
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -15,6 +15,7 @@ config OPENVSWITCH
 	select NET_MPLS_GSO
 	select DST_CACHE
 	select NET_NSH
+	select NF_CONNTRACK_OVS if NF_CONNTRACK
 	select NF_NAT_OVS if NF_NAT
 	help
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized

--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -435,52 +435,21 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
 	return 0;
 }
-/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
- * value if 'skb' is freed.
+				   u16 zone, int family, struct sk_buff *skb)
- */
-static int handle_fragments(struct net *net, struct sw_flow_key *key,
-			    u16 zone, struct sk_buff *skb)
 {
 	struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
 	int err;
-	if (key->eth.type == htons(ETH_P_IP)) {
+	err = nf_ct_handle_fragments(net, skb, zone, family, &key->ip.proto, &ovs_cb.mru);
-		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+	if (err)
+		return err;
-		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-		err = ip_defrag(net, skb, user);
-		if (err)
-			return err;
-		ovs_cb.mru = IPCB(skb)->frag_max_size;
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
-	} else if (key->eth.type == htons(ETH_P_IPV6)) {
-		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
-		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
-		err = nf_ct_frag6_gather(net, skb, user);
-		if (err) {
-			if (err != -EINPROGRESS)
-				kfree_skb(skb);
-			return err;
-		}
-		key->ip.proto = ipv6_hdr(skb)->nexthdr;
-		ovs_cb.mru = IP6CB(skb)->frag_max_size;
-#endif
-	} else {
-		kfree_skb(skb);
-		return -EPFNOSUPPORT;
-	}
 	/* The key extracted from the fragment that completed this datagram
 	 * likely didn't have an L4 header, so regenerate it.
 	 */
 	ovs_flow_key_update_l3l4(skb, key);
 	key->ip.frag = OVS_FRAG_TYPE_NONE;
-	skb_clear_hash(skb);
-	skb->ignore_df = 1;
 	*OVS_CB(skb) = ovs_cb;
 	return 0;
@@ -1091,36 +1060,6 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 	return 0;
 }
-/* Trim the skb to the length specified by the IP/IPv6 header,
- * removing any trailing lower-layer padding. This prepares the skb
- * for higher-layer processing that assumes skb->len excludes padding
- * (such as nf_ip_checksum). The caller needs to pull the skb to the
- * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
- */
-static int ovs_skb_network_trim(struct sk_buff *skb)
-{
-	unsigned int len;
-	int err;
-	switch (skb->protocol) {
-	case htons(ETH_P_IP):
-		len = skb_ip_totlen(skb);
-		break;
-	case htons(ETH_P_IPV6):
-		len = sizeof(struct ipv6hdr)
-			+ ntohs(ipv6_hdr(skb)->payload_len);
-		break;
-	default:
-		len = skb->len;
-	}
-	err = pskb_trim_rcsum(skb, len);
-	if (err)
-		kfree_skb(skb);
-	return err;
-}
 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
 * value if 'skb' is freed.
 */
@@ -1135,12 +1074,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 	nh_ofs = skb_network_offset(skb);
 	skb_pull_rcsum(skb, nh_ofs);
-	err = ovs_skb_network_trim(skb);
+	err = nf_ct_skb_network_trim(skb, info->family);
-	if (err)
+	if (err) {
+		kfree_skb(skb);
 		return err;
+	}
 	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
-		err = handle_fragments(net, key, info->zone.id, skb);
+		err = ovs_ct_handle_fragments(net, key, info->zone.id,
+					      info->family, skb);
 		if (err)
 			return err;
 	}

--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -984,6 +984,7 @@ config NET_ACT_TUNNEL_KEY
 config NET_ACT_CT
 	tristate "connection tracking tc action"
 	depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE
+	select NF_CONNTRACK_OVS
 	select NF_NAT_OVS if NF_NAT
 	help
 	  Say Y here to allow sending the packets to conntrack module.

--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -726,31 +726,6 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
 	return false;
 }
-/* Trim the skb to the length specified by the IP/IPv6 header,
- * removing any trailing lower-layer padding. This prepares the skb
- * for higher-layer processing that assumes skb->len excludes padding
- * (such as nf_ip_checksum). The caller needs to pull the skb to the
- * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
- */
-static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
-{
-	unsigned int len;
-	switch (family) {
-	case NFPROTO_IPV4:
-		len = skb_ip_totlen(skb);
-		break;
-	case NFPROTO_IPV6:
-		len = sizeof(struct ipv6hdr)
-			+ ntohs(ipv6_hdr(skb)->payload_len);
-		break;
-	default:
-		len = skb->len;
-	}
-	return pskb_trim_rcsum(skb, len);
-}
 static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
 {
 	u8 family = NFPROTO_UNSPEC;
@@ -810,6 +785,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 	struct nf_conn *ct;
 	int err = 0;
 	bool frag;
+	u8 proto;
 	u16 mru;
 	/* Previously seen (loopback)? Ignore. */
@@ -825,50 +801,14 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 		return err;
 	skb_get(skb);
-	mru = tc_skb_cb(skb)->mru;
+	err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
+	if (err)
-	if (family == NFPROTO_IPV4) {
+		return err;
-		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
-		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-		local_bh_disable();
-		err = ip_defrag(net, skb, user);
-		local_bh_enable();
-		if (err && err != -EINPROGRESS)
-			return err;
-		if (!err) {
-			*defrag = true;
-			mru = IPCB(skb)->frag_max_size;
-		}
-	} else { /* NFPROTO_IPV6 */
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
-		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
-		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
-		err = nf_ct_frag6_gather(net, skb, user);
-		if (err && err != -EINPROGRESS)
-			goto out_free;
-		if (!err) {
-			*defrag = true;
-			mru = IP6CB(skb)->frag_max_size;
-		}
-#else
-		err = -EOPNOTSUPP;
-		goto out_free;
-#endif
-	}
-	if (err != -EINPROGRESS)
+	*defrag = true;
-		tc_skb_cb(skb)->mru = mru;
+	tc_skb_cb(skb)->mru = mru;
-	skb_clear_hash(skb);
-	skb->ignore_df = 1;
-	return err;
-out_free:
+	return 0;
-	kfree_skb(skb);
-	return err;
 }
 static void tcf_ct_params_free(struct tcf_ct_params *params)
@@ -1011,7 +951,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	if (err)
 		goto drop;
-	err = tcf_ct_skb_network_trim(skb, family);
+	err = nf_ct_skb_network_trim(skb, family);
 	if (err)
 		goto drop;