Commit b611b776 authored by Paolo Abeni

Merge tag 'nf-24-02-29' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

Patch #1 restores NFPROTO_INET with nft_compat, from Ignat Korchagin.

Patch #2 fixes an issue with bridge netfilter and broadcast/multicast
packets.

There is a day 0 bug in br_netfilter when used with connection tracking.

Conntrack assumes that an nf_conn structure that is not yet added to
hash table ("unconfirmed"), is only visible by the current cpu that is
processing the sk_buff.

For bridge this isn't true, sk_buff can get cloned in between, and
clones can be processed in parallel on different cpu.

This patch disables NAT and conntrack helpers for multicast packets.

Patch #3 adds a selftest to cover for the br_netfilter bug.

netfilter pull request 24-02-29

* tag 'nf-24-02-29' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf:
  selftests: netfilter: add bridge conntrack + multicast test case
  netfilter: bridge: confirm multicast packets before passing them up the stack
  netfilter: nf_tables: allow NFPROTO_INET in nft_(match/target)_validate()
====================

Link: https://lore.kernel.org/r/20240229000135.8780-1-pablo@netfilter.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents 51dd4ee0 6523cf51
......@@ -474,6 +474,7 @@ struct nf_ct_hook {
const struct sk_buff *);
void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
void (*set_closing)(struct nf_conntrack *nfct);
int (*confirm)(struct sk_buff *skb);
};
extern const struct nf_ct_hook __rcu *nf_ct_hook;
......
......@@ -43,6 +43,10 @@
#include <linux/sysctl.h>
#endif
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif
static unsigned int brnf_net_id __read_mostly;
struct brnf_net {
......@@ -553,6 +557,90 @@ static unsigned int br_nf_pre_routing(void *priv,
return NF_STOLEN;
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* conntrack's nf_confirm logic cannot handle cloned skbs referencing
 * the same nf_conn entry, which will happen for multicast (broadcast)
 * frames on bridges.
 *
 * Example:
 *      macvlan0
 *      br0
 *  ethX  ethY
 *
 * ethX (or Y) receives multicast or broadcast packet containing
 * an IP packet, not yet in conntrack table.
 *
 * 1. skb passes through bridge and fake-ip (br_netfilter) prerouting.
 *    -> skb->_nfct now references an unconfirmed entry
 * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
 *    interface.
 * 3. skb gets passed up the stack.
 * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
 *    and schedules a work queue to send them out on the lower devices.
 *
 *    The clone skb->_nfct is not a copy, it is the same entry as the
 *    original skb.  The macvlan rx handler then returns RX_HANDLER_PASS.
 * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
 *
 * The macvlan broadcast worker and normal confirm path will race.
 *
 * This race will not happen if step 2 already confirmed a clone. In that
 * case later steps perform skb_clone() with skb->_nfct already confirmed (in
 * hash table).  This works fine.
 *
 * But such confirmation won't happen when eb/ip/nftables rules dropped the
 * packets before they reached the nf_confirm step in postrouting.
 *
 * Work around this problem by explicit confirmation of the entry at
 * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
 * entry.
 *
 * Returns NF_ACCEPT when there is nothing to confirm (no conntrack attached,
 * PACKET_HOST skb, entry already confirmed, or no conntrack hook registered),
 * otherwise the verdict returned by the conntrack confirm hook.
 */
static unsigned int br_nf_local_in(void *priv,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	struct nf_conntrack *nfct = skb_nfct(skb);
	const struct nf_ct_hook *ct_hook;
	struct nf_conn *ct;
	int ret;

	/* Only non-PACKET_HOST (broadcast/multicast) skbs are handled here. */
	if (!nfct || skb->pkt_type == PACKET_HOST)
		return NF_ACCEPT;

	ct = container_of(nfct, struct nf_conn, ct_general);
	if (likely(nf_ct_is_confirmed(ct)))
		return NF_ACCEPT;

	/* An unconfirmed entry is expected to be private to this skb. */
	WARN_ON_ONCE(skb_shared(skb));
	WARN_ON_ONCE(refcount_read(&nfct->use) != 1);

	/* We can't call nf_confirm here, it would create a dependency
	 * on nf_conntrack module.
	 */
	ct_hook = rcu_dereference(nf_ct_hook);
	if (!ct_hook) {
		/* No conntrack hook registered: detach the entry and drop
		 * our reference instead of passing it up unconfirmed.
		 */
		skb->_nfct = 0ul;
		nf_conntrack_put(nfct);
		return NF_ACCEPT;
	}

	/* Strip the encapsulation header around the confirm call; it is
	 * pushed back below unless the skb was stolen.
	 */
	nf_bridge_pull_encap_header(skb);
	ret = ct_hook->confirm(skb);
	switch (ret & NF_VERDICT_MASK) {
	case NF_STOLEN:
		return NF_STOLEN;
	default:
		nf_bridge_push_encap_header(skb);
		break;
	}

	/* Do not touch nfct/ct past this point: confirm() may have released
	 * or replaced the entry attached to the skb (e.g. when it lost an
	 * insertion race), so the pointer sampled before the call can be
	 * stale and a "must be confirmed now" check on it is not reliable.
	 */
	return ret;
}
#endif
/* PF_BRIDGE/FORWARD *************************************************/
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
......@@ -964,6 +1052,14 @@ static const struct nf_hook_ops br_nf_ops[] = {
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF,
},
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
{
.hook = br_nf_local_in,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_LAST,
},
#endif
{
.hook = br_nf_forward,
.pf = NFPROTO_BRIDGE,
......
......@@ -291,6 +291,30 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
return nf_conntrack_in(skb, &bridge_state);
}
static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
if (skb->pkt_type == PACKET_HOST)
return NF_ACCEPT;
/* nf_conntrack_confirm() cannot handle concurrent clones,
* this happens for broad/multicast frames with e.g. macvlan on top
* of the bridge device.
*/
ct = nf_ct_get(skb, &ctinfo);
if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
return NF_ACCEPT;
/* let inet prerouting call conntrack again */
skb->_nfct = 0;
nf_ct_put(ct);
return NF_ACCEPT;
}
static void nf_ct_bridge_frag_save(struct sk_buff *skb,
struct nf_bridge_frag_data *data)
{
......@@ -385,6 +409,12 @@ static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = nf_ct_bridge_in,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = nf_ct_bridge_post,
.pf = NFPROTO_BRIDGE,
......
......@@ -2756,6 +2756,7 @@ static const struct nf_ct_hook nf_conntrack_hook = {
.get_tuple_skb = nf_conntrack_get_tuple_skb,
.attach = nf_conntrack_attach,
.set_closing = nf_conntrack_set_closing,
.confirm = __nf_conntrack_confirm,
};
void nf_conntrack_init_end(void)
......
......@@ -359,10 +359,20 @@ static int nft_target_validate(const struct nft_ctx *ctx,
if (ctx->family != NFPROTO_IPV4 &&
ctx->family != NFPROTO_IPV6 &&
ctx->family != NFPROTO_INET &&
ctx->family != NFPROTO_BRIDGE &&
ctx->family != NFPROTO_ARP)
return -EOPNOTSUPP;
ret = nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_FORWARD) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_POST_ROUTING));
if (ret)
return ret;
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
......@@ -610,10 +620,20 @@ static int nft_match_validate(const struct nft_ctx *ctx,
if (ctx->family != NFPROTO_IPV4 &&
ctx->family != NFPROTO_IPV6 &&
ctx->family != NFPROTO_INET &&
ctx->family != NFPROTO_BRIDGE &&
ctx->family != NFPROTO_ARP)
return -EOPNOTSUPP;
ret = nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_FORWARD) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_POST_ROUTING));
if (ret)
return ret;
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
......
......@@ -7,7 +7,8 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \
conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \
conntrack_sctp_collision.sh xt_string.sh
conntrack_sctp_collision.sh xt_string.sh \
bridge_netfilter.sh
HOSTPKG_CONFIG := pkg-config
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Test bridge netfilter + conntrack, a combination that doesn't really work,
# with multicast/broadcast packets racing for hash table insertion.
# eth0 br0 eth0
# setup is: ns1 <->,ns0 <-> ns3
# ns2 <-' `'-> ns4
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
ret=0

# Random suffix so the namespace names do not collide with existing ones.
sfx=$(mktemp -u "XXXXXXXX")
ns0="ns0-$sfx"
ns1="ns1-$sfx"
ns2="ns2-$sfx"
ns3="ns3-$sfx"
ns4="ns4-$sfx"

# The ruleset used by this test is loaded with nft, so probe for that tool
# (ebtables is never invoked by this script).
if ! nft --version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nft tool"
	exit $ksft_skip
fi

if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

# Create ns0 (bridge host) and ns1..ns4 (endpoints).
for i in $(seq 0 4); do
	eval ip netns add \$ns$i
done
# Delete all five test namespaces; installed below as the EXIT handler so it
# runs on every exit path of the script.
cleanup() {
	local netns

	for netns in "$ns0" "$ns1" "$ns2" "$ns3" "$ns4"; do
		ip netns del "$netns"
	done
}

trap cleanup EXIT
# do_ping NETNS ADDR
# Send a single echo request to ADDR from inside NETNS. On failure, report
# it, dump the ns0 ruleset and mark the whole run as failed (ret=1).
do_ping()
{
	fromns="$1"
	dstip="$2"

	if ! ip netns exec "$fromns" ping -c 1 -q "$dstip" > /dev/null; then
		echo "ERROR: ping from $fromns to $dstip"
		ip netns exec "${ns0}" nft list ruleset
		ret=1
	fi
}
# bcast_ping NETNS ADDR
# Fire 1000 single-packet broadcast pings at ADDR from inside NETNS.
# A failed ping is reported (with the ns0 ruleset) but does not change ret.
bcast_ping()
{
	fromns="$1"
	dstip="$2"

	for i in $(seq 1 1000); do
		if ! ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then
			echo "ERROR: ping -b from $fromns to $dstip"
			ip netns exec "${ns0}" nft list ruleset
		fi
	done
}
# Four veth pairs: vethN lives in ns0, its peer is eth0 inside nsN.
# Failure to create the first pair means veth is unavailable: skip.
if ! ip link add veth1 netns "${ns0}" type veth peer name eth0 netns "${ns1}"; then
	echo "SKIP: Can't create veth device"
	exit $ksft_skip
fi
ip link add veth2 netns "${ns0}" type veth peer name eth0 netns "$ns2"
ip link add veth3 netns "${ns0}" type veth peer name eth0 netns "$ns3"
ip link add veth4 netns "${ns0}" type veth peer name eth0 netns "$ns4"

ip -net "${ns0}" link set lo up

for i in 1 2 3 4; do
	ip -net "${ns0}" link set "veth$i" up
done

# Bridge with the br_netfilter call-outs enabled.
if ! ip -net "${ns0}" link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1; then
	echo "SKIP: Can't create bridge br0"
	exit $ksft_skip
fi

# veth1, veth2 and veth3 become bridge ports.
for i in 1 2 3; do
	ip -net "${ns0}" link set "veth$i" master br0
done

# macvlan0: a macvlan stacked on top of the bridge device itself.
MACVLAN_ADDR=ba:f3:13:37:42:23
ip -net "${ns0}" link add link br0 name macvlan0 type macvlan mode private
ip -net "${ns0}" link set macvlan0 address "${MACVLAN_ADDR}"
ip -net "${ns0}" link set macvlan0 up
ip -net "${ns0}" addr add 10.23.0.1/24 dev macvlan0

# macvlan4: a macvlan stacked on top of veth4.
MACVLAN_ADDR=ba:f3:13:37:42:24
ip -net "${ns0}" link add link veth4 name macvlan4 type macvlan mode vepa
ip -net "${ns0}" link set macvlan4 address "${MACVLAN_ADDR}"
ip -net "${ns0}" link set macvlan4 up

# veth4 itself is not a bridge port; only the macvlan on top of it is.
ip -net "${ns0}" link set macvlan4 master br0

ip -net "${ns0}" link set br0 up
ip -net "${ns0}" addr add 10.0.0.1/24 dev br0
# Without the bridge netfilter sysctl there is nothing to test. Stop here
# with the kselftest SKIP code, like every other SKIP path in this script,
# instead of running the remaining steps; the EXIT trap removes the
# namespaces.
if ! ip netns exec "${ns0}" sysctl -q net.bridge.bridge-nf-call-iptables=1; then
	echo "SKIP: bridge netfilter not available"
	exit $ksft_skip
fi
# for testing: let ns0 answer broadcast echo requests, so the ping -b
# probes sent by the other namespaces get a reply.
ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0
# enable conntrack in ns0 (via the "ct state" rule) and drop broadcast
# icmp packets in the bridge forward hook, to prevent them from getting
# confirmed in the postrouting hook before the cloned skb is passed up
# the stack.
ip netns exec ${ns0} nft -f - <<EOF
table ip filter {
chain input {
type filter hook input priority 1; policy accept
iifname br0 counter
ct state new accept
}
}
table bridge filter {
chain forward {
type filter hook forward priority 0; policy accept
meta pkttype broadcast ip protocol icmp counter drop
}
}
EOF
# Bring up lo and eth0 in every endpoint namespace.
for i in 1 2 3 4; do
	eval ip -net \$ns$i link set lo up
	eval ip -net \$ns$i link set eth0 up
done

# ns1 and ns2 get 10.0.0.11 and 10.0.0.12, the subnet of ns0:br0.
for i in 1 2; do
	eval ip -net \$ns$i addr add 10.0.0.1$i/24 dev eth0
done

# ns3 and ns4 share the 10.23.0.0/24 subnet of ns0:macvlan0. ns4 is not
# directly attached to the bridge: veth4 is not a bridge port, only the
# macvlan interface on top of it is.
ip -net "${ns3}" addr add 10.23.0.13/24 dev eth0
ip -net "${ns4}" addr add 10.23.0.14/24 dev eth0
# Unicast sanity check first: any failure here sets ret=1.
do_ping "${ns1}" 10.0.0.12
do_ping "${ns3}" 10.23.0.1
do_ping "${ns4}" 10.23.0.1

if [ "$ret" -eq 0 ]; then
	echo "PASS: netns connectivity: ns1 can reach ns2, ns3 and ns4 can reach ns0"
fi

# Broadcast floods that exercise the unconfirmed-conntrack clone path.
bcast_ping "${ns1}" 10.0.0.255

# This should deliver broadcast to macvlan0, which is on top of ns0:br0.
bcast_ping "${ns3}" 10.23.0.255

# Same, this time via veth4:macvlan4.
bcast_ping "${ns4}" 10.23.0.255
# The broadcast pings are judged by kernel state, not by their replies:
# a non-zero taint value is reported as a failure.
read t < /proc/sys/kernel/tainted
if [ "$t" -ne 0 ]; then
	echo ERROR: kernel is tainted
	ret=1
else
	echo PASS: kernel not tainted
fi

exit $ret
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment