Commit 3e684fcf authored by Kirill Smelkov's avatar Kirill Smelkov

Kernel part of "mesh multicast" problem solved

Packets delivery in IPv4/IPv6 multicast is handled together by
multicast-routing daemon and by kernel. The MR daemon installs routes
into kernel, and the kernel actually routes the traffic.

We want the overall solution to obey the following properties:

	every member of the group

	* can send a packet to the group via single uniform send call.
	* That packet will be delivered to all members of the group -
	  both on the same and on other machines, exactly once to each recipient.

This suggests sending/reception to be organized via one virtual
interface and then forwarded outside/into-inside as needed.

This patch adds topo-ABCDloop, which creates a 4-node network with a
cycle. It then manually installs static multicast routes in such a way
that loops are avoided and a packet sent from a particular sender is
delivered to all other members of the network exactly once.

The sending/reception is organized via mcast-rx/mcast-tx veth pair, or
via loopback if the kernel has the following fixes

	linux@25bcc760
	linux@43eadf90

It was tested to work ok via

	# on A
	# ./tmcast.py txrx6 A
	joining ff1e::1 @ [7]mcast-rx
	tx: A.1 ...
	rx: A.1
	tx: A.2 ...
	rx: A.2
	tx: A.3 ...
	rx: A.3

	# on B
	# ./tmcast.py rx6
	joining ff1e::1 @ [7]mcast-rx
	rx: A.1
	rx: A.2
	rx: A.3

	# on C
	# ./tmcast.py rx6
	joining ff1e::1 @ [7]mcast-rx
	rx: A.1
	rx: A.2
	rx: A.3

	# on D
	# ./tmcast.py rx6
	joining ff1e::1 @ [7]mcast-rx
	rx: A.1
	rx: A.2
	rx: A.3

and similarly with B, C or D being the sender, and all of that with IPv4 as well.

Please see inside for details.

See also the following patch for more in-depth debugging of the approach used.
parent 66a19431
......@@ -5,8 +5,8 @@ PID=$$
# `xnewhost <name>` creates new virtual host.
xnewhost() {
local X=$1
# lo
xunshare $X -- ip link set lo up
# lo with multicast, so that smcrouted handles it as well
xunshare $X -- ip link set lo up multicast on
# private /var/run so that e.g. smcrouted can be started
xnsenter $X -- mount -t tmpfs none /var/run
......
......@@ -40,13 +40,20 @@ def mjoin_tx(group, port, ttl=100):
else:
sk.setsockopt(IPPROTO_IP, IP_MULTICAST_TTL, ttl)
sk.connect((group, port))
# so that what we send is received locally as well
# NOTE do _not_ connect. If connected, Linux may permanently associate destination with wrong route.
# -> use sk.sendto() instead of sk.send() in txloop.
#sk.connect((group, port))
# turn off automatic loopback.
# We _do_ want to see what we send looping back to ourselves.
# We _do_ receive it back because we configure default multicast route to go to lo.
# When IP_MULTICAST_LOOP=1 the kernel also has a dedicated path to send
# packets back regardless of what the route is. Turn that latter path off
# to avoid receiving what we send twice.
if ip6:
sk.setsockopt(IPPROTO_IPV6, IPV6_MULTICAST_LOOP, 1)
sk.setsockopt(IPPROTO_IPV6, IPV6_MULTICAST_LOOP, 0)
else:
sk.setsockopt(IPPROTO_IP, IP_MULTICAST_LOOP, 1)
sk.setsockopt(IPPROTO_IP, IP_MULTICAST_LOOP, 0)
return sk
......@@ -55,26 +62,45 @@ def mjoin_rx(group, port):
sk, ip6 = _udpsockfor(group)
sk.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
sk.bind((group, port)) # works for reception even from multiple ifaces
# join the group on all interfaces
for ifidx, ifname in net.if_nameindex():
print("joining %s @ [%d]%s" % (group, ifidx, ifname))
bifidx = pack("@i", ifidx)
if ip6:
mreq = inet_pton(AF_INET6, group) + \
bifidx
sk.setsockopt(IPPROTO_IPV6, IPV6_JOIN_GROUP, mreq) # = IPV6_ADD_MEMBERSHIP
else:
mreq = inet_pton(AF_INET, group) + \
inet_pton(AF_INET, '0.0.0.0') + \
bifidx
sk.setsockopt(IPPROTO_IP, IP_ADD_MEMBERSHIP, mreq)
sk.bind((group, port)) # works for reception even if we would want to rx from multiple ifaces
# join the group on mcast-rx (preferably, if it exists), or loopback.
#
# mcast-rx is used instead of loopback for receiving multicast, because
# with IPv6 Linux rejects forwarding multicast traffic to lo.
# See https://lab.nexedi.com/kirr/linux/commit/25bcc76020c5 for details.
#
# NOTE we used to join the group on all interfaces, but this results in
# multiple delivery of the same packet, e.g. if local node forwards that
# packet from its external interface A to B.
mcast_ifv = ("mcast-rx", "lo") # XXX change lo to interface corresponding to daddr=group route?
ifidx = None
for ifname in mcast_ifv:
try:
ifidx = net.if_nametoindex(ifname)
except OSError:
continue
break
if ifidx is None:
raise RuntimeError("mjoin_rx: cannot find suitable interface; tried: %s" % (mcast_ifv,))
print("joining %s @ [%d]%s" % (group, ifidx, ifname))
bifidx = pack("@i", ifidx)
if ip6:
mreq = inet_pton(AF_INET6, group) + \
bifidx
sk.setsockopt(IPPROTO_IPV6, IPV6_JOIN_GROUP, mreq) # = IPV6_ADD_MEMBERSHIP
else:
mreq = inet_pton(AF_INET, group) + \
inet_pton(AF_INET, '0.0.0.0') + \
bifidx
sk.setsockopt(IPPROTO_IP, IP_ADD_MEMBERSHIP, mreq)
return sk
def txloop(ctx, sk, pkt):
def txloop(ctx, sk, daddr, pkt):
i = 0
while 1:
if ctx.err():
......@@ -82,7 +108,7 @@ def txloop(ctx, sk, pkt):
i += 1
pkt_ = pkt + (b'.%d' % i)
print("tx: %s ..." % u(pkt_))
sk.send(pkt_)
sk.sendto(pkt_, daddr)
time.sleep(1)
def rxloop(ctx, sk):
......@@ -131,7 +157,7 @@ def main():
wg = sync.WorkGroup(ctx)
if "tx" in action:
sktx = mjoin_tx(*G)
wg.go(txloop, sktx, b(sys.argv[2]))
wg.go(txloop, sktx, G, b(sys.argv[2]))
if "rx" in action:
skrx = mjoin_rx(*G)
wg.go(rxloop, skrx)
......
#!/bin/bash -e
# topo ABCDloop creates the following network topology:
#
# (1.0.0.1 / 1::1)
#
# 1
#
# A
# / \
# / \
# / \
# (2.0.0.2 / 2::2) 2 B C 3 (3.0.0.3 / 3::3)
# \ /
# \ /
# \ /
# D
#
# 4
#
# (4.0.0.4 / 4::4)
#
#
# should be run under unshare -mrun .
# Based on https://github.com/troglobit/smcroute/tree/master/test
# whether to use mcast-rx/mcast-tx interfaces instead of lo
# see
#
# https://lab.nexedi.com/kirr/linux/commit/25bcc76020c5 and
# https://lab.nexedi.com/kirr/linux/commit/43eadf90a7a6
#
# for why lo does not work out of the box.
use_mcast_rxtx_instead_of_lo=y

# Derive the names of the receive-side ($mrx) and transmit-side ($mtx)
# interfaces that the rest of the script refers to: plain loopback for both
# when the switch above is "n", the dedicated mcast-rx/mcast-tx pair for any
# other value.
case $use_mcast_rxtx_instead_of_lo in
    n)
        mrx=lo
        mtx=lo
        ;;
    *)
        mrx=mcast-rx
        mtx=mcast-tx
        ;;
esac
# prefixes for all handled multicast groups
mcast4=224.0.0.0/4
mcast6=ff1e::/16
. lib.sh
xnewhost A
xnewhost B
xnewhost C
xnewhost D
xlink A B
xlink A C
xlink B D
xlink C D
# setup mcast-rx/mcast-tx interfaces if needed
#
# usage: xnode_xlo <host>
#
# When $mrx and $mtx name different interfaces, create them inside <host> as
# the two ends of one veth pair and bring both ends up. When they are the same
# interface (lo used for both directions) nothing is created.
xnode_xlo() {
    # keep X local, like lib.sh does, so the caller's variables are not clobbered
    local X=${1:?xnode_xlo: missing host name}

    # mrx <-> mtx veth pair
    if [ "$mrx" != "$mtx" ]; then
        xnsenter "$X" -- ip link add "$mrx" type veth peer name "$mtx"
        xnsenter "$X" -- ip link set "$mrx" up
        xnsenter "$X" -- ip link set "$mtx" up

        # programs from local host will be sending multicast via mtx (having default route for multicast addresses)
        # those packets will be delivered back to local delivery via mrx <- mtx
        # tell IPv4 routing not to reject them
        # NOTE IPv6 does not need this
        #
        # the sysctl key follows $mrx rather than a hard-coded interface name,
        # so it stays correct if the receive-side interface is ever renamed.
        xnsenter "$X" -- sysctl "net.ipv4.conf.$mrx.accept_local=1"
    fi
}
xnode_xlo A
xnode_xlo B
xnode_xlo C
xnode_xlo D
# setup IPv4/IPv6 "node-level" addresses
#
# usage: xnodeaddr <host> <i>
#
# Inside <host>, assigns <i>.0.0.<i>/24 (IPv4) and <i>::<i>/16 (IPv6) to the
# $mtx interface with global scope.
xnodeaddr() {
    # locals (as in lib.sh) + fail early with a clear message instead of
    # handing ip(8) a malformed address such as ".0.0./24"
    local X=${1:?xnodeaddr: missing host name}
    local i=${2:?xnodeaddr: missing node number}

    xnsenter "$X" -- ip addr add "${i}.0.0.${i}/24" dev "$mtx" scope global
    xnsenter "$X" -- ip addr add "${i}::${i}/16" dev "$mtx" scope global
}
xnodeaddr A 1
xnodeaddr B 2
xnodeaddr C 3
xnodeaddr D 4
# prepare routes for incoming/outgoing multicast forwarding:
# - from-external multicast traffic goes to lo
# - from-us multicast traffic is sent to the world via lo too
# sleep a bit so that IPv6 address is activated for real on $mtx
# if we don't delay - vvv `ip -6 route ... dev $mtx src ...` fails with "Error: Invalid source address".
#
# XXX isn't it a kernel bug that we have to delay?
# XXX and why delay 3s? (with 1s it always fails, with 2s it still fails sometimes)
sleep 3
xnode_mrxtx() {
    # usage: xnode_mrxtx <host> <i>
    #
    # Inside <host>, install explicit IPv4 and IPv6 multicast routes for
    # $mcast4 / $mcast6 via $mtx, with <i>.0.0.<i> / <i>::<i> as the preferred
    # source address.
    #
    # NOTE the comments below talk about "lo"; the routes are actually
    # installed on $mtx, which is lo only when the veth pair is not in use.
    local X=${1:?xnode_mrxtx: missing host name}
    local i=${2:?xnode_mrxtx: missing node number}

    # add explicit rule to route global multicast traffic to lo instead of relying on ad-hoc hack inside Linux kernel:
    # https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/ipv4/route.c?id=v5.18-rc5-28-ga7391ad35724#n2747
    # (see "Apparently, routing tables are wrong. Assume, that the destination is on link." there)
    # (present starting from very long ago: Linux 2.1.68 year=1997)
    #
    # As we are establishing route for multicast, also explicitly specify preferred
    # source address with which to send from-us multicast traffic with.
    #
    # For table local pref ... - see IPv6 case vvv for the rationale. Probably for
    # IPv4 this is not strictly required, but we still do it just in case.
    xnsenter "$X" -- ip r add multicast "$mcast4" oif "$mtx" dev "$mtx" scope global \
        table local pref high metric 1 \
        src "${i}.0.0.${i}"

    # for IPv6 similar explicit mcast route -> lo is required, because IPv6 routing does not have in-kernel ad-hoc that IPv4 has.
    # NOTE Linux, unless patched, turns such lo-routes into rejects, so it won't work without patch to the kernel:
    # https://lab.nexedi.com/kirr/linux/commit/25bcc76020c5
    #
    # Put this route into "local" table with highest possible preference to force
    # the kernel to use it even if a neighbour node advertises via MLD its multicast capabilities.
    # Upon receiving MLD messages the kernel creates `pref medium` routes in `table
    # local`, so if we don't adjust "priority" of our route, it will become ignored.
    #
    # Also explicitly specify source address.
    xnsenter "$X" -- ip r add multicast "$mcast6" oif "$mtx" dev "$mtx" scope global \
        table local pref high metric 1 \
        src "${i}::${i}"
}
xnode_mrxtx A 1
xnode_mrxtx B 2
xnode_mrxtx C 3
xnode_mrxtx D 4
# add IPv4 "link-local" addresses (just a convenience; IPv6 sets them up automatically)
#
# Each link X<->Y gets its own /24 whose first octet is formed from the two
# node numbers (A=1, B=2, C=3, D=4): 12.0.0.0/24 for A-B, 13.0.0.0/24 for A-C,
# 24.0.0.0/24 for B-D and 34.0.0.0/24 for C-D. The two ends of a link use host
# parts .1 and .2.
# NOTE(review): the x-y interface names are presumably created by xlink from
# lib.sh - confirm there.
xnsenter A -- ip addr add 12.0.0.1/24 dev a-b
xnsenter B -- ip addr add 12.0.0.2/24 dev b-a
xnsenter A -- ip addr add 13.0.0.1/24 dev a-c
xnsenter C -- ip addr add 13.0.0.2/24 dev c-a
xnsenter B -- ip addr add 24.0.0.2/24 dev b-d
xnsenter D -- ip addr add 24.0.0.1/24 dev d-b
xnsenter C -- ip addr add 34.0.0.2/24 dev c-d
xnsenter D -- ip addr add 34.0.0.1/24 dev d-c
# run smcrouted on every host
#
# Each daemon is started as a background job of this script (trailing `&`), so
# that the final `wait` at the end of the script blocks on them.
# NOTE(review): `-n` is presumably smcrouted's "stay in foreground" switch -
# confirm against the smcrouted manual.
for X in A B C D; do
xnsenter $X -- smcrouted -n &
done
# presumably gives the daemons time to come up before the smcroutectl calls
# that follow - TODO confirm that 1s is always enough.
sleep 1
# configure multicast routing on every host
# multicast originating from a host is always forwarded to all external
# interfaces on that host.
#
# NOTE when we are sending via mtx, the kernel does not notify mrouted about
# mtx - only about packets on mrx - so we hook into mrx instead of directly
# into mtx.
xnode_mr_sendout() {
    # usage: xnode_mr_sendout <host> <i> <oif>...
    #
    # Inside <host>, add smcroute rules so that multicast ($mcast4 / $mcast6)
    # seen on $mrx with source <i>.0.0.<i> / <i>::<i> is forwarded to every
    # listed outgoing interface.
    local X=${1:?xnode_mr_sendout: missing host name}
    local i=${2:?xnode_mr_sendout: missing node number}
    shift 2
    # keep the outgoing interfaces as an array rather than flattening them
    # into one string that has to be re-split on use
    local -a oifs=("$@")

    xnsenter "$X" -- smcroutectl add "$mrx" "${i}.0.0.${i}" "$mcast4" "${oifs[@]}"
    xnsenter "$X" -- smcroutectl add "$mrx" "${i}::${i}" "$mcast6" "${oifs[@]}"
}
xnode_mr_sendout A 1 a-b a-c
xnode_mr_sendout B 2 b-a b-d
xnode_mr_sendout C 3 c-a c-d
xnode_mr_sendout D 4 d-b d-c
# then for every root (host) organize forwarding of its traffic further in the
# graph in RPF-style with breaking loops. For example multicast originating from
# A, in addition to A->B and A->C, is also forwarded B->D, but neither C->D nor
# D->C. As a result, for every root node we have forwarding rules that cover
# delivery to the whole network without cycles and only once.
xnode_mr_fwd() {
    # usage: xnode_mr_fwd <host> <iif> <ifrom> <oif>...
    #
    # Inside <host>, add smcroute rules so that multicast ($mcast4 / $mcast6)
    # arriving on interface <iif> with source <ifrom>.0.0.<ifrom> /
    # <ifrom>::<ifrom> is forwarded to every listed outgoing interface.
    local X=${1:?xnode_mr_fwd: missing host name}
    local iif=${2:?xnode_mr_fwd: missing incoming interface}
    local ifrom=${3:?xnode_mr_fwd: missing source node number}
    shift 3
    # keep the outgoing interfaces as an array rather than flattening them
    # into one string that has to be re-split on use
    local -a oifs=("$@")

    xnsenter "$X" -- smcroutectl add "$iif" "${ifrom}.0.0.${ifrom}" "$mcast4" "${oifs[@]}"
    xnsenter "$X" -- smcroutectl add "$iif" "${ifrom}::${ifrom}" "$mcast6" "${oifs[@]}"
}
xnode_mr_fwd B b-a 1 b-d # A: B->D completes A->B and A->C
xnode_mr_fwd A a-b 2 a-c # B: A->C completes B->A and B->D
xnode_mr_fwd A a-c 3 a-b # C: A->B completes C->A and C->D
xnode_mr_fwd B b-d 4 b-a # D: B->A completes D->B and D->C
# multicast reaching a host from outside is forwarded to lo interface for
# delivery to programs running on that host.
#
# to set this up correctly let's take current forward routing table:
#
# A: A->B A->C B->D
# B: B->A B->D A->C
# C: C->A C->D A->B
# D: D->B D->C B->A
#
# and for every ->X create corresponding reverse reaching rules. For example
# for ->B packets can go into B from A (A->B), from C (A->B) and from D (D->B).
#
# (if we setup this in straightforward way as "every outside address on every
# outside port go to lo", then it will create duplicates on receive on a node
# that forwards multicast traffic originating from another node)
xnode_mr_incoming() {
    # usage: xnode_mr_incoming <host> <iif> <ifrom>
    #
    # Inside <host>, add smcroute rules so that multicast ($mcast4 / $mcast6)
    # arriving on interface <iif> with source <ifrom>.0.0.<ifrom> /
    # <ifrom>::<ifrom> is forwarded to $mtx.
    local X=${1:?xnode_mr_incoming: missing host name}
    local iif=${2:?xnode_mr_incoming: missing incoming interface}
    local ifrom=${3:?xnode_mr_incoming: missing source node number}

    xnsenter "$X" -- smcroutectl add "$iif" "${ifrom}.0.0.${ifrom}" "$mcast4" "$mtx"
    xnsenter "$X" -- smcroutectl add "$iif" "${ifrom}::${ifrom}" "$mcast6" "$mtx"
}
# ->A
xnode_mr_incoming A a-b 2 # B: B->A
xnode_mr_incoming A a-c 3 # C: C->A
xnode_mr_incoming A a-b 4 # D: B->A
# ->B
xnode_mr_incoming B b-a 1 # A: A->B
xnode_mr_incoming B b-a 3 # C: A->B
xnode_mr_incoming B b-d 4 # D: D->B
# ->C
xnode_mr_incoming C c-a 1 # A: A->C
xnode_mr_incoming C c-a 2 # B: A->C
xnode_mr_incoming C c-d 4 # D: D->C
# ->D
xnode_mr_incoming D d-b 1 # A: B->D
xnode_mr_incoming D d-b 2 # B: B->D
xnode_mr_incoming D d-c 3 # C: C->D
# the topology is fully configured - tell the user it can be used now
echo -e "\n\n\nREADY"
# with no arguments `wait` blocks until all background jobs of this shell have
# exited - here the smcrouted instances started above with `&`.
wait
# NOTE(review): this line is only reached after `wait` returns, i.e. after the
# background daemons are already gone; if the script is interrupted instead,
# cleanup presumably relies on the enclosing `unshare` being torn down - confirm.
killall smcrouted
......@@ -47,14 +47,15 @@ if [ $cycle = y ]; then
xnsenter C -- ip addr add 30.0.0.2/24 dev c-b
fi
# multicast routing
xnsenter B -- ip route add 224.0.0.0/4 dev b-a
xnsenter C -- ip route add 224.0.0.0/4 dev c-a
xnsenter A -- ip route add 224.0.0.0/4 dev a-b # NOTE on A 224.0.0.0/4 is routed
xnsenter A -- ip route append 224.0.0.0/4 dev a-c # to _both_ a-b and a-c
if [ $cycle = y ]; then
xnsenter B -- ip route append 224.0.0.0/4 dev b-c
xnsenter C -- ip route append 224.0.0.0/4 dev c-b
fi
# XXX does not really work well - use smcroute instead - see topo-ABCDloop.sh
# # multicast routing
# xnsenter B -- ip route add 224.0.0.0/4 dev b-a
# xnsenter C -- ip route add 224.0.0.0/4 dev c-a
#
# xnsenter A -- ip route add 224.0.0.0/4 dev a-b # NOTE on A 224.0.0.0/4 is routed
# xnsenter A -- ip route append 224.0.0.0/4 dev a-c # to _both_ a-b and a-c
#
# if [ $cycle = y ]; then
# xnsenter B -- ip route append 224.0.0.0/4 dev b-c
# xnsenter C -- ip route append 224.0.0.0/4 dev c-b
# fi
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment