Commit 4ed03af2 authored by Yonghong Song's avatar Yonghong Song

add a bridge-router-bridge test with router implemented as a namespace

Signed-off-by: default avatarYonghong Song <yhs@plumgrid.com>
parent 0b680a2c
......@@ -76,6 +76,8 @@ static u64 (*bpf_ktime_get_ns)(void) =
(void *) BPF_FUNC_ktime_get_ns;
static int (*bpf_trace_printk)(const char *fmt, u64 fmt_size, ...) =
(void *) BPF_FUNC_trace_printk;
static u64 (*bpf_clone_redirect)(void *ctx, u64 ifindex, u64 flags) =
(void *)BPF_FUNC_clone_redirect;
static void bpf_tail_call_(u64 map_fd, void *ctx, int index) {
((void (*)(void *, u64, int))BPF_FUNC_tail_call)(ctx, map_fd, index);
}
......
......@@ -16,3 +16,5 @@ add_test(NAME py_test_trace2 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_trace2 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace2.py)
add_test(NAME py_test_trace3_c WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_trace3_c sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace3.py test_trace3.c)
add_test(NAME py_test_brb WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_brb_c sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_brb.py test_brb.c)
// Copyright (c) PLUMgrid, Inc.
// Licensed under the Apache License, Version 2.0 (the "License")
#include <bcc/proto.h>
/* compiler workaround */
#define _htonl __builtin_bswap32
#define _htons __builtin_bswap16
// meta data passed between bpf programs
typedef struct bpf_metadata {
u32 prog_id;
u32 rx_port_id;
} bpf_metadata_t;
typedef struct bpf_dest {
u32 prog_id;
u32 port_id;
} bpf_dest_t;
typedef struct eth_addr {
u8 addr[6];
} eth_addr_t;
// Program table definitions for tail calls
BPF_TABLE("prog", u32, u32, jump, 16);
// physical endpoint manager (pem) tables which connects to boeht bridge 1 and bridge 2
// <port_id, bpf_dest>
BPF_TABLE("array", u32, bpf_dest_t, pem_dest, 256);
// <port_id, ifindex>
BPF_TABLE("array", u32, u32, pem_port, 256);
// <ifindex, port_id>
BPF_TABLE("hash", u32, u32, pem_ifindex, 256);
// <0, tx2vm_pkts>
BPF_TABLE("array", u32, u32, pem_stats, 1);
// bridge 1 (br1) tables
// <port_id, bpf_dest>
BPF_TABLE("array", u32, bpf_dest_t, br1_dest, 256);
// <eth_addr, port_id>
BPF_TABLE("hash", eth_addr_t, u32, br1_mac, 256);
// <0, rtr_ifindex>
BPF_TABLE("array", u32, u32, br1_rtr, 1);
// <mac, ifindex>
BPF_TABLE("hash", eth_addr_t, u32, br1_mac_ifindex, 1);
// bridge 2 (br2) tables
// <port_id, bpf_dest>
BPF_TABLE("array", u32, bpf_dest_t, br2_dest, 256);
// <eth_addr, port_id>
BPF_TABLE("hash", eth_addr_t, u32, br2_mac, 256);
// <0, rtr_ifindex>
BPF_TABLE("array", u32, u32, br2_rtr, 1);
// <mac, ifindex>
BPF_TABLE("hash", eth_addr_t, u32, br2_mac_ifindex, 1);
BPF_EXPORT(pem)
int pem(struct __sk_buff *skb) {
bpf_metadata_t meta = {};
u32 ifindex;
u32 *tx_port_id_p;
u32 tx_port_id;
u32 rx_port;
u32 *ifindex_p;
bpf_dest_t *dest_p;
// pem does not look at packet data
if (skb->tc_index == 0) {
skb->tc_index = 1;
skb->cb[0] = skb->cb[1] = 0;
meta.prog_id = meta.rx_port_id = 0;
} else {
meta.prog_id = skb->cb[0];
meta.rx_port_id = skb->cb[1];
}
if (!meta.prog_id) {
/* from external */
ifindex = skb->ingress_ifindex;
tx_port_id_p = pem_ifindex.lookup(&ifindex);
if (tx_port_id_p) {
tx_port_id = *tx_port_id_p;
dest_p = pem_dest.lookup(&tx_port_id);
if (dest_p) {
skb->cb[0] = dest_p->prog_id;
skb->cb[1] = dest_p->port_id;
jump.call(skb, dest_p->prog_id);
}
}
} else {
/* from internal */
rx_port = meta.rx_port_id;
ifindex_p = pem_port.lookup(&rx_port);
if (ifindex_p) {
#if 1
/* accumulate stats, may hurt performance slightly */
u32 index = 0;
u32 *value = pem_stats.lookup(&index);
if (value)
lock_xadd(value, 1);
#endif
bpf_clone_redirect(skb, *ifindex_p, 0);
}
}
return 0;
}
static int br_common(struct __sk_buff *skb, int which_br) __attribute__((always_inline));
static int br_common(struct __sk_buff *skb, int which_br) {
bpf_metadata_t meta = {};
u16 proto;
u16 arpop;
eth_addr_t dmac;
u8 *mac_p;
u32 dip;
u32 *tx_port_id_p;
u32 tx_port_id;
bpf_dest_t *dest_p;
u32 index, *rtrif_p;
if (skb->tc_index == 0) {
skb->tc_index = 1;
skb->cb[0] = skb->cb[1] = 0;
meta.prog_id = meta.rx_port_id = 0;
} else {
meta.prog_id = skb->cb[0];
meta.rx_port_id = skb->cb[1];
}
BEGIN(ethernet);
PROTO(ethernet) {
// ethernet->dst seems not working, so tentatively use the primitive C API.
*(__u32 *)&dmac.addr[0] = _htonl(load_word(skb, 0));
*(__u16 *)&dmac.addr[4] = _htons(load_half(skb, 4));
if (meta.prog_id != 0) {
/* send to the router */
if (dmac.addr[0] & dmac.addr[1] & dmac.addr[2] & dmac.addr[3] & dmac.addr[4] & dmac.addr[5] == 0xff) {
index = 0;
if (which_br == 1)
rtrif_p = br1_rtr.lookup(&index);
else
rtrif_p = br2_rtr.lookup(&index);
if (rtrif_p)
bpf_clone_redirect(skb, *rtrif_p, 0);
} else {
/* the dmac address should match the router's */
if (which_br == 1)
rtrif_p = br1_mac_ifindex.lookup(&dmac);
else
rtrif_p = br2_mac_ifindex.lookup(&dmac);
if (rtrif_p)
bpf_clone_redirect(skb, *rtrif_p, 0);
}
return 0;
}
switch (ethernet->type) {
case 0x0800: goto ip;
case 0x0806: goto arp;
case 0x8100: goto dot1q;
}
goto EOP;
}
PROTO(dot1q) {
switch(dot1q->type) {
case 0x0806: goto arp;
case 0x0800: goto ip;
}
goto EOP;
}
PROTO(arp) {
/* mac learning */
// arpop = load_half(skb, 20);
arpop = arp->oper;
if (arpop == 2) {
index = 0;
if (which_br == 1)
rtrif_p = br1_rtr.lookup(&index);
else
rtrif_p = br2_rtr.lookup(&index);
if (rtrif_p) {
__u32 ifindex = *rtrif_p;
eth_addr_t smac;
*(__u16 *)&smac.addr[0] = _htons(load_half(skb, 6));
*(__u16 *)&smac.addr[2] = _htons(load_half(skb, 8));
*(__u16 *)&smac.addr[4] = _htons(load_half(skb, 10));
if (which_br == 1)
br1_mac_ifindex.update(&smac, &ifindex);
else
br2_mac_ifindex.update(&smac, &ifindex);
}
}
goto xmit;
}
PROTO(ip) {
goto xmit;
}
xmit:
if (which_br == 1)
tx_port_id_p = br1_mac.lookup(&dmac);
else
tx_port_id_p = br2_mac.lookup(&dmac);
if (tx_port_id_p) {
tx_port_id = *tx_port_id_p;
if (which_br == 1)
dest_p = br1_dest.lookup(&tx_port_id);
else
dest_p = br2_dest.lookup(&tx_port_id);
if (dest_p) {
skb->cb[0] = dest_p->prog_id;
skb->cb[1] = dest_p->port_id;
jump.call(skb, dest_p->prog_id);
}
}
EOP:
return 0;
}
BPF_EXPORT(br1)
int br1(struct __sk_buff *skb) {
return br_common(skb, 1);
}
BPF_EXPORT(br2)
int br2(struct __sk_buff *skb) {
return br_common(skb, 2);
}
#!/usr/bin/env python
# Copyright (c) PLUMgrid, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
# This program implements a topology likes below:
# pem: physical endpoint manager, implemented as a bpf program
#
# vm1 <--------+ +----> bridge1 <----+
# V V V
# pem router
# ^ ^ ^
# vm2 <--------+ +----> bridge2 <----+
#
# The vm1, vm2 and router are implemented as namespaces.
# The bridge is implemented with limited functionality in bpf program.
#
# vm1 and vm2 are in different subnet. For vm1 to communicate to vm2,
# the packet will have to travel from vm1 to pem, bridge1, router, bridge2, pem, and
# then come to vm2.
#
# When this test is run with verbose mode (ctest -R <test_name> -V),
# the following printout is observed on my local box:
#
# ......
# 8: ARPING 100.1.1.254 from 100.1.1.1 eth0
# 8: Unicast reply from 100.1.1.254 [E6:5F:05:95:4B:41] 0.532ms
# 8: Sent 1 probes (1 broadcast(s))
# 8: Received 1 response(s)
# 8: ARPING 200.1.1.254 from 200.1.1.1 eth0
# 8: Unicast reply from 200.1.1.254 [46:99:94:FB:6D:23] 0.522ms
# 8: Sent 1 probes (1 broadcast(s))
# 8: Received 1 response(s)
# 8: PING 200.1.1.1 (200.1.1.1) 56(84) bytes of data.
# 8: 64 bytes from 200.1.1.1: icmp_req=1 ttl=63 time=0.066 ms
# 8: 64 bytes from 200.1.1.1: icmp_req=2 ttl=63 time=0.024 ms
# 8: 64 bytes from 200.1.1.1: icmp_req=3 ttl=63 time=0.052 ms
# 8: 64 bytes from 200.1.1.1: icmp_req=4 ttl=63 time=0.050 ms
# 8: 64 bytes from 200.1.1.1: icmp_req=5 ttl=63 time=0.052 ms
# 8:
# 8: --- 200.1.1.1 ping statistics ---
# 8: 5 packets transmitted, 5 received, 0% packet loss, time 3999ms
# 8: rtt min/avg/max/mdev = 0.024/0.048/0.066/0.016 ms
# 8: [ ID] Interval Transfer Bandwidth
# 8: [ 5] 0.0- 1.0 sec 4.35 GBytes 37.4 Gbits/sec
# 8: Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
# 8: MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 200.1.1.1 (200.1.1.1) port 0 AF_INET : demo
# 8: Recv Send Send
# 8: Socket Socket Message Elapsed
# 8: Size Size Size Time Throughput
# 8: bytes bytes bytes secs. 10^6bits/sec
# 8:
# 8: 87380 16384 65160 10.00 45045.58
# 8: MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 200.1.1.1 (200.1.1.1) port 0 AF_INET : demo : first burst 0
# 8: Local /Remote
# 8: Socket Size Request Resp. Elapsed Trans.
# 8: Send Recv Size Size Time Rate
# 8: bytes Bytes bytes bytes secs. per sec
# 8:
# 8: 16384 87380 1 1 10.00 50563.99
# 8: 16384 87380
# 8: .
# 8: ----------------------------------------------------------------------
# 8: Ran 1 test in 32.956s
# 8:
# 8: OK
from ctypes import c_ubyte, c_ushort, c_uint, c_ulonglong, Structure
from netaddr import IPAddress
from bpf import BPF
from pyroute2 import IPRoute
from socket import socket, AF_INET, SOCK_DGRAM
import sys
from time import sleep
from unittest import main, TestCase
import subprocess
import commands
arg1 = sys.argv.pop(1)
class Bpf_Dest(Structure):
_fields_ = [("prog_id", c_uint),
("port_id", c_uint)]
class Eth_Addr(Structure):
_fields_ = [("addr0", c_ubyte),
("addr1", c_ubyte),
("addr2", c_ubyte),
("addr3", c_ubyte),
("addr4", c_ubyte),
("addr5", c_ubyte)]
class TestBPFSocket(TestCase):
def setup_vm_ns(self, ns, veth_in, veth_out):
subprocess.call(["ip", "link", "add", veth_in, "type", "veth", "peer", "name", veth_out])
subprocess.call(["ip", "netns", "add", ns])
subprocess.call(["ip", "link", "set", veth_in, "netns", ns])
subprocess.call(["ip", "netns", "exec", ns, "ip", "link", "set", veth_in, "name", "eth0"])
subprocess.call(["ip", "link", "set", veth_out, "up"])
def config_vm_ns(self, ns, ip_addr, net_mask, ip_gw):
subprocess.call(["ip", "netns", "exec", ns, "ip", "addr", "add", ip_addr + "/24", "dev", "eth0"])
subprocess.call(["ip", "netns", "exec", ns, "ip", "link", "set", "eth0", "up"])
subprocess.call(["ip", "netns", "exec", ns, "route", "add", "-net", net_mask + "/24", "gw", ip_gw])
def setup_router_ns(self, ns, veth1_in, veth1_out, veth2_in, veth2_out):
subprocess.call(["ip", "netns", "add", ns])
subprocess.call(["ip", "link", "add", veth1_in, "type", "veth", "peer", "name", veth1_out])
subprocess.call(["ip", "link", "set", veth1_in, "netns", ns])
subprocess.call(["ip", "netns", "exec", ns, "ip", "link", "set", veth1_in, "name", "eth0"])
subprocess.call(["ip", "link", "add", veth2_in, "type", "veth", "peer", "name", veth2_out])
subprocess.call(["ip", "link", "set", veth2_in, "netns", ns])
subprocess.call(["ip", "netns", "exec", ns, "ip", "link", "set", veth2_in, "name", "eth1"])
subprocess.call(["ip", "link", "set", veth1_out, "up"])
subprocess.call(["ip", "link", "set", veth2_out, "up"])
def config_router_ns(self, ns, ip_eth0, ip_eth1):
subprocess.call(["ip", "netns", "exec", ns, "ip", "addr", "add", ip_eth0 + "/24", "dev", "eth0"])
subprocess.call(["ip", "netns", "exec", ns, "ip", "link", "set", "eth0", "up"])
subprocess.call(["ip", "netns", "exec", ns, "ip", "addr", "add", ip_eth1 + "/24", "dev", "eth1"])
subprocess.call(["ip", "netns", "exec", ns, "ip", "link", "set", "eth1", "up"])
def set_default_const(self):
self.ns1 = "ns1"
self.ns1_eth_in = "v1"
self.ns1_eth_out = "v2"
self.ns2 = "ns2"
self.ns2_eth_in = "v3"
self.ns2_eth_out = "v4"
self.ns_router = "ns_router"
self.nsrtr_eth0_in = "v10"
self.nsrtr_eth0_out = "v11"
self.nsrtr_eth1_in = "v12"
self.nsrtr_eth1_out = "v13"
self.vm1_ip = "100.1.1.1"
self.vm2_ip = "200.1.1.1"
self.vm1_rtr_ip = "100.1.1.254"
self.vm2_rtr_ip = "200.1.1.254"
self.vm1_rtr_mask = "100.1.1.0"
self.vm2_rtr_mask = "200.1.1.0"
def get_table(self, b):
self.jump = b.get_table("jump", c_uint, c_uint)
self.pem_dest = b.get_table("pem_dest", c_uint, Bpf_Dest)
self.pem_port = b.get_table("pem_port", c_uint, c_uint)
self.pem_ifindex = b.get_table("pem_ifindex", c_uint, c_uint)
self.pem_stats = b.get_table("pem_stats", c_uint, c_uint)
self.br1_dest = b.get_table("br1_dest", c_uint, Bpf_Dest)
self.br1_mac = b.get_table("br1_mac", Eth_Addr, c_uint)
self.br1_rtr = b.get_table("br1_rtr", c_uint, c_uint)
self.br1_mac_ifindex = b.get_table("br1_mac_ifindex", Eth_Addr, c_uint)
self.br2_dest = b.get_table("br2_dest", c_uint, Bpf_Dest)
self.br2_mac = b.get_table("br2_mac", Eth_Addr, c_uint)
self.br2_rtr = b.get_table("br2_rtr", c_uint, c_uint)
self.br2_mac_ifindex = b.get_table("br2_mac_ifindex", Eth_Addr, c_uint)
def connect_ports(self, prog_id_pem, prog_id_br, curr_pem_pid, curr_br_pid,
ip, br_dest_map, br_mac_map,
ns_eth_out, vm_mac, vm_ip):
val = Bpf_Dest(prog_id_br, curr_br_pid)
self.pem_dest.update(c_uint(curr_pem_pid), val)
val = Bpf_Dest(prog_id_pem, curr_pem_pid)
br_dest_map.update(c_uint(curr_br_pid), val)
ifindex = ip.link_lookup(ifname=ns_eth_out)[0]
self.pem_port.update(c_uint(curr_pem_pid), c_uint(ifindex))
self.pem_ifindex.update(c_uint(ifindex), c_uint(curr_pem_pid))
mac1 = vm_mac.split(':')
mac_addr = Eth_Addr(int(mac1[0], 16), int(mac1[1], 16), int(mac1[2], 16),
int(mac1[3], 16), int(mac1[4], 16), int(mac1[5], 16))
br_mac_map.update(mac_addr, c_uint(curr_br_pid))
def attach_filter(self, ip, ifname, fd, name):
ifindex = ip.link_lookup(ifname=ifname)[0]
ip.tc("add", "ingress", ifindex, "ffff:")
ip.tc("add-filter", "bpf", ifindex, ":1", fd=fd, name=name,
parent="ffff:", action="drop", classid=1)
def config_maps(self):
b = BPF(src_file=arg1, debug=0)
pem_fn = b.load_func("pem", BPF.SCHED_CLS)
br1_fn = b.load_func("br1", BPF.SCHED_CLS)
br2_fn = b.load_func("br2", BPF.SCHED_CLS)
ip = IPRoute()
# program id
prog_id_pem = 1
prog_id_br1 = 2
prog_id_br2 = 3
# initial port id and table pointers
curr_pem_pid = 0
curr_br1_pid = 0
curr_br2_pid = 0
self.get_table(b)
# configure jump table
self.jump.update(c_uint(prog_id_pem), c_uint(pem_fn.fd))
self.jump.update(c_uint(prog_id_br1), c_uint(br1_fn.fd))
self.jump.update(c_uint(prog_id_br2), c_uint(br2_fn.fd))
# connect pem and br1
curr_pem_pid = curr_pem_pid + 1
curr_br1_pid = curr_br1_pid + 1
self.connect_ports(prog_id_pem, prog_id_br1, curr_pem_pid, curr_br1_pid,
ip, self.br1_dest, self.br1_mac,
self.ns1_eth_out, self.vm1_mac, self.vm1_ip)
# connect pem and br2
curr_pem_pid = curr_pem_pid + 1
curr_br2_pid = curr_br2_pid + 1
self.connect_ports(prog_id_pem, prog_id_br2, curr_pem_pid, curr_br2_pid,
ip, self.br2_dest, self.br2_mac,
self.ns2_eth_out, self.vm2_mac, self.vm2_ip)
# connect <br1, rtr> and <br2, rtr>
ifindex = ip.link_lookup(ifname=self.nsrtr_eth0_out)[0]
self.br1_rtr.update(c_uint(0), c_uint(ifindex))
ifindex = ip.link_lookup(ifname=self.nsrtr_eth1_out)[0]
self.br2_rtr.update(c_uint(0), c_uint(ifindex))
# tc filter setup with bpf programs attached
self.attach_filter(ip, self.ns1_eth_out, pem_fn.fd, pem_fn.name)
self.attach_filter(ip, self.ns2_eth_out, pem_fn.fd, pem_fn.name)
self.attach_filter(ip, self.nsrtr_eth0_out, br1_fn.fd, br1_fn.name)
self.attach_filter(ip, self.nsrtr_eth1_out, br2_fn.fd, br2_fn.name)
def setUp(self):
# set up the environment
self.set_default_const()
self.setup_vm_ns(self.ns1, self.ns1_eth_in, self.ns1_eth_out)
self.setup_vm_ns(self.ns2, self.ns2_eth_in, self.ns2_eth_out)
self.config_vm_ns(self.ns1, self.vm1_ip, self.vm2_rtr_mask, self.vm1_rtr_ip)
self.config_vm_ns(self.ns2, self.vm2_ip, self.vm1_rtr_mask, self.vm2_rtr_ip)
self.setup_router_ns(self.ns_router, self.nsrtr_eth0_in, self.nsrtr_eth0_out,
self.nsrtr_eth1_in, self.nsrtr_eth1_out)
self.config_router_ns(self.ns_router, self.vm1_rtr_ip, self.vm2_rtr_ip)
# get vm mac address
self.vm1_mac = commands.getoutput('ip netns exec ' + self.ns1 + ' cat /sys/class/net/eth0/address')
self.vm2_mac = commands.getoutput('ip netns exec ' + self.ns2 + ' cat /sys/class/net/eth0/address')
# load the program and configure maps
self.config_maps()
def test_brb(self):
# our bridge is not smart enough, so send arping for router learning to prevent router
# from sending out arp request
subprocess.call(["ip", "netns", "exec", self.ns1, "arping", "-w", "1", "-c", "1", "-I", "eth0",
self.vm1_rtr_ip])
subprocess.call(["ip", "netns", "exec", self.ns2, "arping", "-w", "1", "-c", "1", "-I", "eth0",
self.vm2_rtr_ip])
# ping
subprocess.call(["ip", "netns", "exec", self.ns1, "ping", self.vm2_ip, "-c", "5"])
# minimum one arp reply, 5 icmp reply
self.assertGreater(self.pem_stats.lookup(c_uint(0)).value, 5)
# iperf, run server on the background
subprocess.Popen(["ip", "netns", "exec", self.ns2, "iperf", "-s", "-xSCD"])
sleep(1)
subprocess.call(["ip", "netns", "exec", self.ns1, "iperf", "-c", self.vm2_ip, "-t", "1", "-xSC"])
subprocess.call(["ip", "netns", "exec", self.ns2, "killall", "iperf"])
# netperf, run server on the background
subprocess.Popen(["ip", "netns", "exec", self.ns2, "netserver"])
sleep(1)
subprocess.call(["ip", "netns", "exec", self.ns1, "netperf", "-H", self.vm2_ip, "--", "-m", "65160"])
subprocess.call(["ip", "netns", "exec", self.ns1, "netperf", "-H", self.vm2_ip, "-t", "TCP_RR"])
subprocess.call(["ip", "netns", "exec", self.ns2, "killall", "netserver"])
# cleanup, tear down the veths and namespaces
subprocess.call(["ip", "netns", "del", self.ns1])
subprocess.call(["ip", "netns", "del", self.ns2])
subprocess.call(["ip", "netns", "del", self.ns_router])
if __name__ == "__main__":
main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment