Commit 105bc130 authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2018-09-25

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Allow for RX stack hardening by implementing the kernel's flow
   dissector in BPF. Idea was originally presented at netconf 2017 [0].
   Quote from merge commit:

     [...] Because of the rigorous checks of the BPF verifier, this
     provides significant security guarantees. In particular, the BPF
     flow dissector cannot get inside of an infinite loop, as with
     CVE-2013-4348, because BPF programs are guaranteed to terminate.
     It cannot read outside of packet bounds, because all memory accesses
     are checked. Also, with BPF the administrator can decide which
     protocols to support, reducing potential attack surface. Rarely
     encountered protocols can be excluded from dissection and the
     program can be updated without kernel recompile or reboot if a
     bug is discovered. [...]

   Also, a sample flow dissector has been implemented in BPF as part
   of this work, from Petar and Willem.

   [0] http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf

2) Add support for bpftool to list currently active attachment
   points of BPF networking programs providing a quick overview
   similar to bpftool's perf subcommand, from Yonghong.

3) Fix a verifier pruning instability bug where a union member
   from the register state was not cleared properly leading to
   branches not being pruned despite them being valid candidates,
   from Alexei.

4) Various smaller fast-path optimizations in XDP's map redirect
   code, from Jesper.

5) Enable to recognize BPF_MAP_TYPE_REUSEPORT_SOCKARRAY maps
   in bpftool, from Roman.

6) Remove a duplicate check in libbpf that probes for function
   storage, from Taeung.

7) Fix an issue in test_progs by avoiding checking errno since
   on success its value should not be checked, from Mauricio.

8) Fix unused variable warning in bpf_getsockopt() helper when
   CONFIG_INET is not configured, from Anders.

9) Fix a compilation failure in the BPF sample code's use of
   bpf_flow_keys, from Prashant.

10) Minor cleanups in BPF code, from Yue and Zhong.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3475372f d0e13a14
...@@ -212,6 +212,7 @@ enum bpf_reg_type { ...@@ -212,6 +212,7 @@ enum bpf_reg_type {
PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET_META, /* skb->data - meta_len */
PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET, /* reg points to skb->data */
PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_PACKET_END, /* skb->data + headlen */
PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
}; };
/* The information passed from prog-specific *_is_valid_access /* The information passed from prog-specific *_is_valid_access
......
...@@ -16,6 +16,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) ...@@ -16,6 +16,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
#endif #endif
#ifdef CONFIG_BPF_EVENTS #ifdef CONFIG_BPF_EVENTS
BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
......
...@@ -243,6 +243,8 @@ struct scatterlist; ...@@ -243,6 +243,8 @@ struct scatterlist;
struct pipe_inode_info; struct pipe_inode_info;
struct iov_iter; struct iov_iter;
struct napi_struct; struct napi_struct;
struct bpf_prog;
union bpf_attr;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack { struct nf_conntrack {
...@@ -1192,6 +1194,24 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, ...@@ -1192,6 +1194,24 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key, const struct flow_dissector_key *key,
unsigned int key_count); unsigned int key_count);
#ifdef CONFIG_NET
int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog);
int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
#else
static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog)
{
return -EOPNOTSUPP;
}
static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
{
return -EOPNOTSUPP;
}
#endif
bool __skb_flow_dissect(const struct sk_buff *skb, bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector *flow_dissector, struct flow_dissector *flow_dissector,
void *target_container, void *target_container,
......
...@@ -43,6 +43,7 @@ struct ctl_table_header; ...@@ -43,6 +43,7 @@ struct ctl_table_header;
struct net_generic; struct net_generic;
struct uevent_sock; struct uevent_sock;
struct netns_ipvs; struct netns_ipvs;
struct bpf_prog;
#define NETDEV_HASHBITS 8 #define NETDEV_HASHBITS 8
...@@ -145,6 +146,8 @@ struct net { ...@@ -145,6 +146,8 @@ struct net {
#endif #endif
struct net_generic __rcu *gen; struct net_generic __rcu *gen;
struct bpf_prog __rcu *flow_dissector_prog;
/* Note : following structs are cache line aligned */ /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM #ifdef CONFIG_XFRM
struct netns_xfrm xfrm; struct netns_xfrm xfrm;
......
...@@ -19,6 +19,7 @@ struct Qdisc_ops; ...@@ -19,6 +19,7 @@ struct Qdisc_ops;
struct qdisc_walker; struct qdisc_walker;
struct tcf_walker; struct tcf_walker;
struct module; struct module;
struct bpf_flow_keys;
typedef int tc_setup_cb_t(enum tc_setup_type type, typedef int tc_setup_cb_t(enum tc_setup_type type,
void *type_data, void *cb_priv); void *type_data, void *cb_priv);
...@@ -321,9 +322,14 @@ struct tcf_proto { ...@@ -321,9 +322,14 @@ struct tcf_proto {
}; };
struct qdisc_skb_cb { struct qdisc_skb_cb {
unsigned int pkt_len; union {
u16 slave_dev_queue_mapping; struct {
u16 tc_classid; unsigned int pkt_len;
u16 slave_dev_queue_mapping;
u16 tc_classid;
};
struct bpf_flow_keys *flow_keys;
};
#define QDISC_CB_PRIV_LEN 20 #define QDISC_CB_PRIV_LEN 20
unsigned char data[QDISC_CB_PRIV_LEN]; unsigned char data[QDISC_CB_PRIV_LEN];
}; };
......
...@@ -152,6 +152,7 @@ enum bpf_prog_type { ...@@ -152,6 +152,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_LIRC_MODE2,
BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_SK_REUSEPORT,
BPF_PROG_TYPE_FLOW_DISSECTOR,
}; };
enum bpf_attach_type { enum bpf_attach_type {
...@@ -172,6 +173,7 @@ enum bpf_attach_type { ...@@ -172,6 +173,7 @@ enum bpf_attach_type {
BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP4_SENDMSG,
BPF_CGROUP_UDP6_SENDMSG, BPF_CGROUP_UDP6_SENDMSG,
BPF_LIRC_MODE2, BPF_LIRC_MODE2,
BPF_FLOW_DISSECTOR,
__MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
}; };
...@@ -2333,6 +2335,7 @@ struct __sk_buff { ...@@ -2333,6 +2335,7 @@ struct __sk_buff {
/* ... here. */ /* ... here. */
__u32 data_meta; __u32 data_meta;
struct bpf_flow_keys *flow_keys;
}; };
struct bpf_tunnel_key { struct bpf_tunnel_key {
...@@ -2778,4 +2781,27 @@ enum bpf_task_fd_type { ...@@ -2778,4 +2781,27 @@ enum bpf_task_fd_type {
BPF_FD_TYPE_URETPROBE, /* filename + offset */ BPF_FD_TYPE_URETPROBE, /* filename + offset */
}; };
/* Dissection results exposed to BPF flow dissector programs via
 * __sk_buff->flow_keys.  Offsets appear to be relative to skb->data —
 * TODO confirm against __skb_flow_bpf_to_target.
 */
struct bpf_flow_keys {
__u16 nhoff; /* network header offset */
__u16 thoff; /* transport header offset */
__u16 addr_proto; /* ETH_P_* of valid addrs */
__u8 is_frag; /* packet is an IP fragment */
__u8 is_first_frag; /* first fragment of a fragmented packet */
__u8 is_encap; /* encapsulation was detected */
__u8 ip_proto; /* L4 protocol (IPPROTO_*) */
__be16 n_proto; /* L3 protocol (ETH_P_*), network order */
__be16 sport; /* L4 source port, network order */
__be16 dport; /* L4 destination port, network order */
union {
/* Valid member selected by addr_proto (ETH_P_IP vs ETH_P_IPV6) */
struct {
__be32 ipv4_src;
__be32 ipv4_dst;
};
struct {
__u32 ipv6_src[4]; /* in6_addr; network order */
__u32 ipv6_dst[4]; /* in6_addr; network order */
};
};
};
#endif /* _UAPI__LINUX_BPF_H__ */ #endif /* _UAPI__LINUX_BPF_H__ */
...@@ -553,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map) ...@@ -553,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map)
fd_array_map_delete_elem(map, &i); fd_array_map_delete_elem(map, &i);
} }
/* Seq-file show callback for one prog_array slot: prints the index key
 * followed by the BTF-rendered id of the program stored there.  Empty
 * slots produce no output.
 */
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
void **elem, *ptr;
u32 prog_id;
/* RCU protects the prog pointer against concurrent slot updates */
rcu_read_lock();
elem = array_map_lookup_elem(map, key);
if (elem) {
/* READ_ONCE: slot may be replaced concurrently by updaters */
ptr = READ_ONCE(*elem);
if (ptr) {
seq_printf(m, "%u: ", *(u32 *)key);
/* map the prog pointer to its user-visible id */
prog_id = prog_fd_array_sys_lookup_elem(ptr);
btf_type_seq_show(map->btf, map->btf_value_type_id,
&prog_id, m);
seq_puts(m, "\n");
}
}
rcu_read_unlock();
}
const struct bpf_map_ops prog_array_map_ops = { const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check, .map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc, .map_alloc = array_map_alloc,
...@@ -564,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = { ...@@ -564,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = bpf_fd_array_map_clear, .map_release_uref = bpf_fd_array_map_clear,
.map_check_btf = map_check_no_btf, .map_seq_show_elem = prog_array_map_seq_show_elem,
}; };
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
......
...@@ -612,8 +612,7 @@ static int free_sg(struct sock *sk, int start, ...@@ -612,8 +612,7 @@ static int free_sg(struct sock *sk, int start,
if (i == MAX_SKB_FRAGS) if (i == MAX_SKB_FRAGS)
i = 0; i = 0;
} }
if (md->skb) consume_skb(md->skb);
consume_skb(md->skb);
return free; return free;
} }
...@@ -995,8 +994,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -995,8 +994,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (!sg->length && md->sg_start == md->sg_end) { if (!sg->length && md->sg_start == md->sg_end) {
list_del(&md->list); list_del(&md->list);
if (md->skb) consume_skb(md->skb);
consume_skb(md->skb);
kfree(md); kfree(md);
} }
} }
......
...@@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) ...@@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_LIRC_MODE2: case BPF_LIRC_MODE2:
ptype = BPF_PROG_TYPE_LIRC_MODE2; ptype = BPF_PROG_TYPE_LIRC_MODE2;
break; break;
case BPF_FLOW_DISSECTOR:
ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
break;
default: default:
return -EINVAL; return -EINVAL;
} }
...@@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) ...@@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_LIRC_MODE2:
ret = lirc_prog_attach(attr, prog); ret = lirc_prog_attach(attr, prog);
break; break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
break;
default: default:
ret = cgroup_bpf_prog_attach(attr, ptype, prog); ret = cgroup_bpf_prog_attach(attr, ptype, prog);
} }
...@@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) ...@@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
case BPF_LIRC_MODE2: case BPF_LIRC_MODE2:
return lirc_prog_detach(attr); return lirc_prog_detach(attr);
case BPF_FLOW_DISSECTOR:
return skb_flow_dissector_bpf_prog_detach(attr);
default: default:
return -EINVAL; return -EINVAL;
} }
......
...@@ -261,6 +261,7 @@ static const char * const reg_type_str[] = { ...@@ -261,6 +261,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET] = "pkt",
[PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end", [PTR_TO_PACKET_END] = "pkt_end",
[PTR_TO_FLOW_KEYS] = "flow_keys",
}; };
static char slot_type_char[] = { static char slot_type_char[] = {
...@@ -570,7 +571,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg); ...@@ -570,7 +571,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg);
*/ */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{ {
reg->id = 0; /* Clear id, off, and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
reg->var_off = tnum_const(imm); reg->var_off = tnum_const(imm);
reg->smin_value = (s64)imm; reg->smin_value = (s64)imm;
reg->smax_value = (s64)imm; reg->smax_value = (s64)imm;
...@@ -589,7 +592,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) ...@@ -589,7 +592,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
static void __mark_reg_const_zero(struct bpf_reg_state *reg) static void __mark_reg_const_zero(struct bpf_reg_state *reg)
{ {
__mark_reg_known(reg, 0); __mark_reg_known(reg, 0);
reg->off = 0;
reg->type = SCALAR_VALUE; reg->type = SCALAR_VALUE;
} }
...@@ -700,9 +702,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) ...@@ -700,9 +702,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
/* Mark a register as having a completely unknown (scalar) value. */ /* Mark a register as having a completely unknown (scalar) value. */
static void __mark_reg_unknown(struct bpf_reg_state *reg) static void __mark_reg_unknown(struct bpf_reg_state *reg)
{ {
/*
* Clear type, id, off, and union(map_ptr, range) and
* padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE; reg->type = SCALAR_VALUE;
reg->id = 0;
reg->off = 0;
reg->var_off = tnum_unknown; reg->var_off = tnum_unknown;
reg->frameno = 0; reg->frameno = 0;
__mark_reg_unbounded(reg); __mark_reg_unbounded(reg);
...@@ -961,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) ...@@ -961,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_PACKET: case PTR_TO_PACKET:
case PTR_TO_PACKET_META: case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END: case PTR_TO_PACKET_END:
case PTR_TO_FLOW_KEYS:
case CONST_PTR_TO_MAP: case CONST_PTR_TO_MAP:
return true; return true;
default: default:
...@@ -1234,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, ...@@ -1234,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_XMIT:
case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
if (meta) if (meta)
return meta->pkt_access; return meta->pkt_access;
...@@ -1317,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, ...@@ -1317,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
return -EACCES; return -EACCES;
} }
/* Validate a PTR_TO_FLOW_KEYS access: the [off, off + size) window must
 * lie entirely within struct bpf_flow_keys.  Returns 0 if so, -EACCES
 * (with a verifier log line) otherwise.
 */
static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
				  int size)
{
	/* Success path first; the u64 sum cannot wrap for non-negative
	 * off/size, so a single comparison bounds the access.
	 */
	if (off >= 0 && size >= 0 &&
	    (u64)off + size <= sizeof(struct bpf_flow_keys))
		return 0;

	verbose(env, "invalid access to flow keys off=%d size=%d\n",
		off, size);
	return -EACCES;
}
static bool __is_pointer_value(bool allow_ptr_leaks, static bool __is_pointer_value(bool allow_ptr_leaks,
const struct bpf_reg_state *reg) const struct bpf_reg_state *reg)
{ {
...@@ -1418,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, ...@@ -1418,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
* right in front, treat it the very same way. * right in front, treat it the very same way.
*/ */
return check_pkt_ptr_alignment(env, reg, off, size, strict); return check_pkt_ptr_alignment(env, reg, off, size, strict);
case PTR_TO_FLOW_KEYS:
pointer_desc = "flow keys ";
break;
case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE:
pointer_desc = "value "; pointer_desc = "value ";
break; break;
...@@ -1640,9 +1662,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn ...@@ -1640,9 +1662,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
else else
mark_reg_known_zero(env, regs, mark_reg_known_zero(env, regs,
value_regno); value_regno);
regs[value_regno].id = 0;
regs[value_regno].off = 0;
regs[value_regno].range = 0;
regs[value_regno].type = reg_type; regs[value_regno].type = reg_type;
} }
...@@ -1691,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn ...@@ -1691,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_packet_access(env, regno, off, size, false); err = check_packet_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0) if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno); mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_FLOW_KEYS) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into flow keys\n",
value_regno);
return -EACCES;
}
err = check_flow_keys_access(env, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else { } else {
verbose(env, "R%d invalid mem access '%s'\n", regno, verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str[reg->type]); reg_type_str[reg->type]);
...@@ -1838,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, ...@@ -1838,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
case PTR_TO_PACKET_META: case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size, return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed); zero_size_allowed);
case PTR_TO_FLOW_KEYS:
return check_flow_keys_access(env, reg->off, access_size);
case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE:
return check_map_access(env, regno, reg->off, access_size, return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed); zero_size_allowed);
...@@ -2495,7 +2527,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn ...@@ -2495,7 +2527,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */ /* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0); mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access() /* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access * can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem() * to map element returned from bpf_map_lookup_elem()
...@@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, ...@@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_CTX: case PTR_TO_CTX:
case CONST_PTR_TO_MAP: case CONST_PTR_TO_MAP:
case PTR_TO_PACKET_END: case PTR_TO_PACKET_END:
case PTR_TO_FLOW_KEYS:
/* Only valid matches are exact, which memcmp() above /* Only valid matches are exact, which memcmp() above
* would have accepted * would have accepted
*/ */
......
...@@ -3176,6 +3176,32 @@ static int __bpf_tx_xdp(struct net_device *dev, ...@@ -3176,6 +3176,32 @@ static int __bpf_tx_xdp(struct net_device *dev,
return 0; return 0;
} }
/* Out-of-line slow path of xdp_do_redirect(): redirect to a plain
 * ifindex (no map involved).  Kept noinline so the common map-based
 * fast path stays small.  Returns 0 on success, negative errno on
 * failure; emits the matching redirect tracepoint either way.
 */
static noinline int
xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
{
struct net_device *fwd;
u32 index = ri->ifindex;
int err;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
/* Consume the redirect target so it is not reused for the next packet */
ri->ifindex = 0;
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
if (unlikely(err))
goto err;
_trace_xdp_redirect(dev, xdp_prog, index);
return 0;
err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
return err;
}
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_map *map, struct bpf_map *map,
struct xdp_buff *xdp, struct xdp_buff *xdp,
...@@ -3188,7 +3214,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, ...@@ -3188,7 +3214,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_dtab_netdev *dst = fwd; struct bpf_dtab_netdev *dst = fwd;
err = dev_map_enqueue(dst, xdp, dev_rx); err = dev_map_enqueue(dst, xdp, dev_rx);
if (err) if (unlikely(err))
return err; return err;
__dev_map_insert_ctx(map, index); __dev_map_insert_ctx(map, index);
break; break;
...@@ -3197,7 +3223,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, ...@@ -3197,7 +3223,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_cpu_map_entry *rcpu = fwd; struct bpf_cpu_map_entry *rcpu = fwd;
err = cpu_map_enqueue(rcpu, xdp, dev_rx); err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (err) if (unlikely(err))
return err; return err;
__cpu_map_insert_ctx(map, index); __cpu_map_insert_ctx(map, index);
break; break;
...@@ -3238,7 +3264,7 @@ void xdp_do_flush_map(void) ...@@ -3238,7 +3264,7 @@ void xdp_do_flush_map(void)
} }
EXPORT_SYMBOL_GPL(xdp_do_flush_map); EXPORT_SYMBOL_GPL(xdp_do_flush_map);
static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{ {
switch (map->map_type) { switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP:
...@@ -3270,9 +3296,9 @@ void bpf_clear_redirect_map(struct bpf_map *map) ...@@ -3270,9 +3296,9 @@ void bpf_clear_redirect_map(struct bpf_map *map)
} }
static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_map *map) struct bpf_prog *xdp_prog, struct bpf_map *map,
struct bpf_redirect_info *ri)
{ {
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
u32 index = ri->ifindex; u32 index = ri->ifindex;
void *fwd = NULL; void *fwd = NULL;
int err; int err;
...@@ -3281,11 +3307,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ...@@ -3281,11 +3307,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
WRITE_ONCE(ri->map, NULL); WRITE_ONCE(ri->map, NULL);
fwd = __xdp_map_lookup_elem(map, index); fwd = __xdp_map_lookup_elem(map, index);
if (!fwd) { if (unlikely(!fwd)) {
err = -EINVAL; err = -EINVAL;
goto err; goto err;
} }
if (ri->map_to_flush && ri->map_to_flush != map) if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
xdp_do_flush_map(); xdp_do_flush_map();
err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
...@@ -3305,29 +3331,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, ...@@ -3305,29 +3331,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
{ {
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map); struct bpf_map *map = READ_ONCE(ri->map);
struct net_device *fwd;
u32 index = ri->ifindex;
int err;
if (map) if (likely(map))
return xdp_do_redirect_map(dev, xdp, xdp_prog, map); return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
fwd = dev_get_by_index_rcu(dev_net(dev), index); return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
ri->ifindex = 0;
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
if (unlikely(err))
goto err;
_trace_xdp_redirect(dev, xdp_prog, index);
return 0;
err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
return err;
} }
EXPORT_SYMBOL_GPL(xdp_do_redirect); EXPORT_SYMBOL_GPL(xdp_do_redirect);
...@@ -4044,14 +4052,15 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { ...@@ -4044,14 +4052,15 @@ static const struct bpf_func_proto bpf_setsockopt_proto = {
BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen) int, level, int, optname, char *, optval, int, optlen)
{ {
struct inet_connection_sock *icsk;
struct sock *sk = bpf_sock->sk; struct sock *sk = bpf_sock->sk;
struct tcp_sock *tp;
if (!sk_fullsock(sk)) if (!sk_fullsock(sk))
goto err_clear; goto err_clear;
#ifdef CONFIG_INET #ifdef CONFIG_INET
if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
struct inet_connection_sock *icsk;
struct tcp_sock *tp;
switch (optname) { switch (optname) {
case TCP_CONGESTION: case TCP_CONGESTION:
icsk = inet_csk(sk); icsk = inet_csk(sk);
...@@ -5115,6 +5124,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ...@@ -5115,6 +5124,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
} }
} }
/* Helper lookup for BPF_PROG_TYPE_FLOW_DISSECTOR programs: only
 * bpf_skb_load_bytes() on top of the always-available base helpers.
 */
static const struct bpf_func_proto *
flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	if (func_id == BPF_FUNC_skb_load_bytes)
		return &bpf_skb_load_bytes_proto;

	return bpf_base_func_proto(func_id);
}
static const struct bpf_func_proto * static const struct bpf_func_proto *
lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{ {
...@@ -5233,6 +5253,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type ...@@ -5233,6 +5253,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (size != size_default) if (size != size_default)
return false; return false;
break; break;
case bpf_ctx_range(struct __sk_buff, flow_keys):
if (size != sizeof(struct bpf_flow_keys *))
return false;
break;
default: default:
/* Only narrow read access allowed for now. */ /* Only narrow read access allowed for now. */
if (type == BPF_WRITE) { if (type == BPF_WRITE) {
...@@ -5258,6 +5282,7 @@ static bool sk_filter_is_valid_access(int off, int size, ...@@ -5258,6 +5282,7 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, data_end):
case bpf_ctx_range(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false; return false;
} }
...@@ -5283,6 +5308,7 @@ static bool lwt_is_valid_access(int off, int size, ...@@ -5283,6 +5308,7 @@ static bool lwt_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, flow_keys):
return false; return false;
} }
...@@ -5493,6 +5519,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, ...@@ -5493,6 +5519,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END; info->reg_type = PTR_TO_PACKET_END;
break; break;
case bpf_ctx_range(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false; return false;
} }
...@@ -5694,6 +5721,7 @@ static bool sk_skb_is_valid_access(int off, int size, ...@@ -5694,6 +5721,7 @@ static bool sk_skb_is_valid_access(int off, int size,
switch (off) { switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, flow_keys):
return false; return false;
} }
...@@ -5753,6 +5781,39 @@ static bool sk_msg_is_valid_access(int off, int size, ...@@ -5753,6 +5781,39 @@ static bool sk_msg_is_valid_access(int off, int size,
return true; return true;
} }
/* Context-access check for flow dissector programs.  Writes are
 * permitted only to the cb[] scratch area; reads of data/data_end/
 * flow_keys yield typed pointers, while tc_classid, data_meta and the
 * socket fields (family..local_port) are rejected outright.  Everything
 * else falls through to the generic __sk_buff checks.
 */
static bool flow_dissector_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (type == BPF_WRITE) {
switch (off) {
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
}
}
switch (off) {
case bpf_ctx_range(struct __sk_buff, data):
info->reg_type = PTR_TO_PACKET;
break;
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
case bpf_ctx_range(struct __sk_buff, flow_keys):
info->reg_type = PTR_TO_FLOW_KEYS;
break;
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
/* Common size/alignment validation shared by all skb program types */
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
static u32 bpf_convert_ctx_access(enum bpf_access_type type, static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si, const struct bpf_insn *si,
struct bpf_insn *insn_buf, struct bpf_insn *insn_buf,
...@@ -6047,6 +6108,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, ...@@ -6047,6 +6108,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct sock_common, bpf_target_off(struct sock_common,
skc_num, 2, target_size)); skc_num, 2, target_size));
break; break;
case offsetof(struct __sk_buff, flow_keys):
off = si->off;
off -= offsetof(struct __sk_buff, flow_keys);
off += offsetof(struct sk_buff, cb);
off += offsetof(struct qdisc_skb_cb, flow_keys);
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
si->src_reg, off);
break;
} }
return insn - insn_buf; return insn - insn_buf;
...@@ -7010,6 +7080,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { ...@@ -7010,6 +7080,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
const struct bpf_prog_ops sk_msg_prog_ops = { const struct bpf_prog_ops sk_msg_prog_ops = {
}; };
const struct bpf_verifier_ops flow_dissector_verifier_ops = {
.get_func_proto = flow_dissector_func_proto,
.is_valid_access = flow_dissector_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
};
const struct bpf_prog_ops flow_dissector_prog_ops = {
};
int sk_detach_filter(struct sock *sk) int sk_detach_filter(struct sock *sk)
{ {
int ret = -ENOENT; int ret = -ENOENT;
......
...@@ -25,6 +25,9 @@ ...@@ -25,6 +25,9 @@
#include <net/flow_dissector.h> #include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h> #include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h> #include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
static DEFINE_MUTEX(flow_dissector_mutex);
static void dissector_set_key(struct flow_dissector *flow_dissector, static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id) enum flow_dissector_key_id key_id)
...@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, ...@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
} }
EXPORT_SYMBOL(skb_flow_dissector_init); EXPORT_SYMBOL(skb_flow_dissector_init);
/* Attach @prog as the flow dissector program of the caller's network
 * namespace.  Only one program may be attached per netns at a time.
 *
 * Returns 0 on success, -EEXIST if a program is already attached.
 * @attr is currently unused.
 */
int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
				       struct bpf_prog *prog)
{
	struct net *net = current->nsproxy->net_ns;
	int err = 0;

	mutex_lock(&flow_dissector_mutex);
	if (rcu_dereference_protected(net->flow_dissector_prog,
				      lockdep_is_held(&flow_dissector_mutex)))
		/* Only one BPF program can be attached at a time */
		err = -EEXIST;
	else
		rcu_assign_pointer(net->flow_dissector_prog, prog);
	mutex_unlock(&flow_dissector_mutex);

	return err;
}
/* Detach the flow dissector program of the caller's network namespace.
 *
 * Returns 0 on success, -ENOENT if no program is attached.
 * @attr is currently unused.
 */
int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
{
	struct bpf_prog *attached;
	struct net *net;

	net = current->nsproxy->net_ns;
	mutex_lock(&flow_dissector_mutex);
	attached = rcu_dereference_protected(net->flow_dissector_prog,
					     lockdep_is_held(&flow_dissector_mutex));
	if (!attached) {
		mutex_unlock(&flow_dissector_mutex);
		return -ENOENT;
	}
	/* Unpublish the program before dropping our reference so that new
	 * RCU readers cannot fetch a prog whose last reference may already
	 * be on its way out.  (The original order — put first, clear the
	 * pointer second — left a window where the published pointer
	 * referred to a prog we no longer held a reference on.)
	 */
	RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
	bpf_prog_put(attached);
	mutex_unlock(&flow_dissector_mutex);
	return 0;
}
/** /**
* skb_flow_get_be16 - extract be16 entity * skb_flow_get_be16 - extract be16 entity
* @skb: sk_buff to extract from * @skb: sk_buff to extract from
...@@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs) ...@@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
} }
/* Copy the flow keys filled in by a BPF flow dissector program into the
 * caller-provided flow_dissector/target_container layout.  Only keys the
 * dissector declares interest in (dissector_uses_key) are populated.
 */
static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
				     struct flow_dissector *flow_dissector,
				     void *target_container)
{
	struct flow_dissector_key_control *ctrl;
	struct flow_dissector_key_basic *basic;
	struct flow_dissector_key_addrs *addrs;
	struct flow_dissector_key_ports *ports;

	ctrl = skb_flow_dissector_target(flow_dissector,
					 FLOW_DISSECTOR_KEY_CONTROL,
					 target_container);
	ctrl->thoff = flow_keys->thoff;
	if (flow_keys->is_frag)
		ctrl->flags |= FLOW_DIS_IS_FRAGMENT;
	if (flow_keys->is_first_frag)
		ctrl->flags |= FLOW_DIS_FIRST_FRAG;
	if (flow_keys->is_encap)
		ctrl->flags |= FLOW_DIS_ENCAPSULATION;

	basic = skb_flow_dissector_target(flow_dissector,
					  FLOW_DISSECTOR_KEY_BASIC,
					  target_container);
	basic->n_proto = flow_keys->n_proto;
	basic->ip_proto = flow_keys->ip_proto;

	switch (flow_keys->addr_proto) {
	case ETH_P_IP:
		if (!dissector_uses_key(flow_dissector,
					FLOW_DISSECTOR_KEY_IPV4_ADDRS))
			break;
		addrs = skb_flow_dissector_target(flow_dissector,
						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
						  target_container);
		addrs->v4addrs.src = flow_keys->ipv4_src;
		addrs->v4addrs.dst = flow_keys->ipv4_dst;
		ctrl->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		break;
	case ETH_P_IPV6:
		if (!dissector_uses_key(flow_dissector,
					FLOW_DISSECTOR_KEY_IPV6_ADDRS))
			break;
		addrs = skb_flow_dissector_target(flow_dissector,
						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
						  target_container);
		/* ipv6_src and ipv6_dst are laid out back to back in
		 * struct bpf_flow_keys, matching v6addrs.
		 */
		memcpy(&addrs->v6addrs, &flow_keys->ipv6_src,
		       sizeof(addrs->v6addrs));
		ctrl->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		break;
	}

	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
		ports = skb_flow_dissector_target(flow_dissector,
						  FLOW_DISSECTOR_KEY_PORTS,
						  target_container);
		ports->src = flow_keys->sport;
		ports->dst = flow_keys->dport;
	}
}
/** /**
* __skb_flow_dissect - extract the flow_keys struct and return it * __skb_flow_dissect - extract the flow_keys struct and return it
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
...@@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, ...@@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_vlan *key_vlan;
enum flow_dissect_ret fdret; enum flow_dissect_ret fdret;
enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
struct bpf_prog *attached = NULL;
int num_hdrs = 0; int num_hdrs = 0;
u8 ip_proto = 0; u8 ip_proto = 0;
bool ret; bool ret;
...@@ -658,6 +754,50 @@ bool __skb_flow_dissect(const struct sk_buff *skb, ...@@ -658,6 +754,50 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
FLOW_DISSECTOR_KEY_BASIC, FLOW_DISSECTOR_KEY_BASIC,
target_container); target_container);
rcu_read_lock();
if (skb) {
if (skb->dev)
attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog);
else if (skb->sk)
attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog);
else
WARN_ON_ONCE(1);
}
if (attached) {
/* Note that even though the const qualifier is discarded
* throughout the execution of the BPF program, all changes(the
* control block) are reverted after the BPF program returns.
* Therefore, __skb_flow_dissect does not alter the skb.
*/
struct bpf_flow_keys flow_keys = {};
struct bpf_skb_data_end cb_saved;
struct bpf_skb_data_end *cb;
u32 result;
cb = (struct bpf_skb_data_end *)skb->cb;
/* Save Control Block */
memcpy(&cb_saved, cb, sizeof(cb_saved));
memset(cb, 0, sizeof(cb_saved));
/* Pass parameters to the BPF program */
cb->qdisc_cb.flow_keys = &flow_keys;
flow_keys.nhoff = nhoff;
bpf_compute_data_pointers((struct sk_buff *)skb);
result = BPF_PROG_RUN(attached, skb);
/* Restore state */
memcpy(cb, &cb_saved, sizeof(cb_saved));
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
key_control->thoff = min_t(u16, key_control->thoff, skb->len);
rcu_read_unlock();
return result == BPF_OK;
}
rcu_read_unlock();
if (dissector_uses_key(flow_dissector, if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) { FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb); struct ethhdr *eth = eth_hdr(skb);
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include <linux/netlink.h> #include <linux/netlink.h>
#include <linux/rtnetlink.h> #include <linux/rtnetlink.h>
#include <linux/types.h> #include <linux/types.h>
#include <sys/types.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/syscall.h> #include <sys/syscall.h>
#include <sys/ioctl.h> #include <sys/ioctl.h>
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
*/ */
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h>
#include <unistd.h> #include <unistd.h>
#include <errno.h> #include <errno.h>
#include <signal.h> #include <signal.h>
......
...@@ -14,7 +14,7 @@ struct vlan_hdr { ...@@ -14,7 +14,7 @@ struct vlan_hdr {
__be16 h_vlan_encapsulated_proto; __be16 h_vlan_encapsulated_proto;
}; };
struct bpf_flow_keys { struct flow_key_record {
__be32 src; __be32 src;
__be32 dst; __be32 dst;
union { union {
...@@ -59,7 +59,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) ...@@ -59,7 +59,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
} }
static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
struct bpf_flow_keys *flow) struct flow_key_record *flow)
{ {
__u64 verlen; __u64 verlen;
...@@ -83,7 +83,7 @@ static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto ...@@ -83,7 +83,7 @@ static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto
} }
static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
struct bpf_flow_keys *flow) struct flow_key_record *flow)
{ {
*ip_proto = load_byte(skb, *ip_proto = load_byte(skb,
nhoff + offsetof(struct ipv6hdr, nexthdr)); nhoff + offsetof(struct ipv6hdr, nexthdr));
...@@ -96,7 +96,8 @@ static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_pro ...@@ -96,7 +96,8 @@ static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_pro
return nhoff; return nhoff;
} }
static inline bool flow_dissector(struct __sk_buff *skb, struct bpf_flow_keys *flow) static inline bool flow_dissector(struct __sk_buff *skb,
struct flow_key_record *flow)
{ {
__u64 nhoff = ETH_HLEN; __u64 nhoff = ETH_HLEN;
__u64 ip_proto; __u64 ip_proto;
...@@ -198,7 +199,7 @@ struct bpf_map_def SEC("maps") hash_map = { ...@@ -198,7 +199,7 @@ struct bpf_map_def SEC("maps") hash_map = {
SEC("socket2") SEC("socket2")
int bpf_prog2(struct __sk_buff *skb) int bpf_prog2(struct __sk_buff *skb)
{ {
struct bpf_flow_keys flow = {}; struct flow_key_record flow = {};
struct pair *value; struct pair *value;
u32 key; u32 key;
......
...@@ -61,7 +61,7 @@ struct vlan_hdr { ...@@ -61,7 +61,7 @@ struct vlan_hdr {
__be16 h_vlan_encapsulated_proto; __be16 h_vlan_encapsulated_proto;
}; };
struct bpf_flow_keys { struct flow_key_record {
__be32 src; __be32 src;
__be32 dst; __be32 dst;
union { union {
...@@ -88,7 +88,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) ...@@ -88,7 +88,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
} }
struct globals { struct globals {
struct bpf_flow_keys flow; struct flow_key_record flow;
}; };
struct bpf_map_def SEC("maps") percpu_map = { struct bpf_map_def SEC("maps") percpu_map = {
...@@ -114,14 +114,14 @@ struct pair { ...@@ -114,14 +114,14 @@ struct pair {
struct bpf_map_def SEC("maps") hash_map = { struct bpf_map_def SEC("maps") hash_map = {
.type = BPF_MAP_TYPE_HASH, .type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct bpf_flow_keys), .key_size = sizeof(struct flow_key_record),
.value_size = sizeof(struct pair), .value_size = sizeof(struct pair),
.max_entries = 1024, .max_entries = 1024,
}; };
static void update_stats(struct __sk_buff *skb, struct globals *g) static void update_stats(struct __sk_buff *skb, struct globals *g)
{ {
struct bpf_flow_keys key = g->flow; struct flow_key_record key = g->flow;
struct pair *value; struct pair *value;
value = bpf_map_lookup_elem(&hash_map, &key); value = bpf_map_lookup_elem(&hash_map, &key);
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#define PARSE_IP_PROG_FD (prog_fd[0]) #define PARSE_IP_PROG_FD (prog_fd[0])
#define PROG_ARRAY_FD (map_fd[0]) #define PROG_ARRAY_FD (map_fd[0])
struct bpf_flow_keys { struct flow_key_record {
__be32 src; __be32 src;
__be32 dst; __be32 dst;
union { union {
...@@ -64,7 +64,7 @@ int main(int argc, char **argv) ...@@ -64,7 +64,7 @@ int main(int argc, char **argv)
(void) f; (void) f;
for (i = 0; i < 5; i++) { for (i = 0; i < 5; i++) {
struct bpf_flow_keys key = {}, next_key; struct flow_key_record key = {}, next_key;
struct pair value; struct pair value;
sleep(1); sleep(1);
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
#include <unistd.h> #include <unistd.h>
#include <bpf/bpf.h> #include <bpf/bpf.h>
#include "bpf_load.h" #include "bpf_load.h"
#include <linux/bpf.h>
#include "cgroup_helpers.h" #include "cgroup_helpers.h"
#define CGROUP_PATH "/my-cgroup" #define CGROUP_PATH "/my-cgroup"
......
================
bpftool-net
================
-------------------------------------------------------------------------------
tool for inspection of netdev/tc related bpf prog attachments
-------------------------------------------------------------------------------
:Manual section: 8
SYNOPSIS
========
**bpftool** [*OPTIONS*] **net** *COMMAND*
*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
*COMMANDS* :=
{ **show** | **list** } [ **dev** name ] | **help**
NET COMMANDS
============
| **bpftool** **net { show | list } [ dev name ]**
| **bpftool** **net help**
DESCRIPTION
===========
**bpftool net { show | list } [ dev name ]**
List bpf program attachments in the kernel networking subsystem.
Currently, only device driver xdp attachments and tc filter
classification/action attachments are implemented, i.e., for
program types **BPF_PROG_TYPE_SCHED_CLS**,
**BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**.
For programs attached to a particular cgroup, e.g.,
**BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
**BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
users can use **bpftool cgroup** to dump cgroup attachments.
For sk_{filter, skb, msg, reuseport} and lwt/seg6
bpf programs, users should consult other tools, e.g., iproute2.
The current output starts with all xdp program attachments, followed by
all tc class/qdisc bpf program attachments. Both xdp programs and
tc programs are ordered by ifindex number. If multiple bpf
programs are attached to the same networking device through **tc filter**,
the order will be: first, all bpf programs attached to tc classes; then
all bpf programs attached to non-clsact qdiscs; and finally all
bpf programs attached to root and clsact qdiscs.
**bpftool net help**
Print short help message.
OPTIONS
=======
-h, --help
Print short generic help message (similar to **bpftool help**).
-v, --version
Print version number (similar to **bpftool version**).
-j, --json
Generate JSON output. For commands that cannot produce JSON, this
option has no effect.
-p, --pretty
Generate human-readable JSON output. Implies **-j**.
EXAMPLES
========
| **# bpftool net**
::
xdp:
eth0(2) driver id 198
tc:
eth0(2) htb name prefix_matcher.o:[cls_prefix_matcher_htb] id 111727 act []
eth0(2) clsact/ingress fbflow_icmp id 130246 act []
eth0(2) clsact/egress prefix_matcher.o:[cls_prefix_matcher_clsact] id 111726
eth0(2) clsact/egress cls_fg_dscp id 108619 act []
eth0(2) clsact/egress fbflow_egress id 130245
|
| **# bpftool -jp net**
::
[{
"xdp": [{
"devname": "eth0",
"ifindex": 2,
"mode": "driver",
"id": 198
}
],
"tc": [{
"devname": "eth0",
"ifindex": 2,
"kind": "htb",
"name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
"id": 111727,
"act": []
},{
"devname": "eth0",
"ifindex": 2,
"kind": "clsact/ingress",
"name": "fbflow_icmp",
"id": 130246,
"act": []
},{
"devname": "eth0",
"ifindex": 2,
"kind": "clsact/egress",
"name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
"id": 111726
},{
"devname": "eth0",
"ifindex": 2,
"kind": "clsact/egress",
"name": "cls_fg_dscp",
"id": 108619,
"act": []
},{
"devname": "eth0",
"ifindex": 2,
"kind": "clsact/egress",
"name": "fbflow_egress",
"id": 130245
}
]
}
]
SEE ALSO
========
**bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
...@@ -16,7 +16,7 @@ SYNOPSIS ...@@ -16,7 +16,7 @@ SYNOPSIS
**bpftool** **version** **bpftool** **version**
*OBJECT* := { **map** | **program** | **cgroup** | **perf** } *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** }
*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
| { **-j** | **--json** } [{ **-p** | **--pretty** }] } | { **-j** | **--json** } [{ **-p** | **--pretty** }] }
...@@ -32,6 +32,8 @@ SYNOPSIS ...@@ -32,6 +32,8 @@ SYNOPSIS
*PERF-COMMANDS* := { **show** | **list** | **help** } *PERF-COMMANDS* := { **show** | **list** | **help** }
*NET-COMMANDS* := { **show** | **list** | **help** }
DESCRIPTION DESCRIPTION
=========== ===========
*bpftool* allows for inspection and simple modification of BPF objects *bpftool* allows for inspection and simple modification of BPF objects
...@@ -58,4 +60,4 @@ OPTIONS ...@@ -58,4 +60,4 @@ OPTIONS
SEE ALSO SEE ALSO
======== ========
**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
**bpftool-perf**\ (8) **bpftool-perf**\ (8), **bpftool-net**\ (8)
...@@ -494,10 +494,10 @@ _bpftool() ...@@ -494,10 +494,10 @@ _bpftool()
_filedir _filedir
return 0 return 0
;; ;;
tree) tree)
_filedir _filedir
return 0 return 0
;; ;;
attach|detach) attach|detach)
local ATTACH_TYPES='ingress egress sock_create sock_ops \ local ATTACH_TYPES='ingress egress sock_create sock_ops \
device bind4 bind6 post_bind4 post_bind6 connect4 \ device bind4 bind6 post_bind4 post_bind6 connect4 \
...@@ -552,6 +552,15 @@ _bpftool() ...@@ -552,6 +552,15 @@ _bpftool()
;; ;;
esac esac
;; ;;
net)
case $command in
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'help \
show list' -- "$cur" ) )
;;
esac
;;
esac esac
} && } &&
complete -F _bpftool bpftool complete -F _bpftool bpftool
......
...@@ -85,7 +85,7 @@ static int do_help(int argc, char **argv) ...@@ -85,7 +85,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n" " %s batch file FILE\n"
" %s version\n" " %s version\n"
"\n" "\n"
" OBJECT := { prog | map | cgroup | perf }\n" " OBJECT := { prog | map | cgroup | perf | net }\n"
" " HELP_SPEC_OPTIONS "\n" " " HELP_SPEC_OPTIONS "\n"
"", "",
bin_name, bin_name, bin_name); bin_name, bin_name, bin_name);
...@@ -215,6 +215,7 @@ static const struct cmd cmds[] = { ...@@ -215,6 +215,7 @@ static const struct cmd cmds[] = {
{ "map", do_map }, { "map", do_map },
{ "cgroup", do_cgroup }, { "cgroup", do_cgroup },
{ "perf", do_perf }, { "perf", do_perf },
{ "net", do_net },
{ "version", do_version }, { "version", do_version },
{ 0 } { 0 }
}; };
......
...@@ -136,6 +136,7 @@ int do_map(int argc, char **arg); ...@@ -136,6 +136,7 @@ int do_map(int argc, char **arg);
int do_event_pipe(int argc, char **argv); int do_event_pipe(int argc, char **argv);
int do_cgroup(int argc, char **arg); int do_cgroup(int argc, char **arg);
int do_perf(int argc, char **arg); int do_perf(int argc, char **arg);
int do_net(int argc, char **arg);
int prog_parse_fd(int *argc, char ***argv); int prog_parse_fd(int *argc, char ***argv);
int map_parse_fd(int *argc, char ***argv); int map_parse_fd(int *argc, char ***argv);
...@@ -165,4 +166,11 @@ struct btf_dumper { ...@@ -165,4 +166,11 @@ struct btf_dumper {
*/ */
int btf_dumper_type(const struct btf_dumper *d, __u32 type_id, int btf_dumper_type(const struct btf_dumper *d, __u32 type_id,
const void *data); const void *data);
struct nlattr;
struct ifinfomsg;
struct tcmsg;
int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb);
int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind,
const char *devname, int ifindex);
#endif #endif
...@@ -71,6 +71,7 @@ static const char * const map_type_name[] = { ...@@ -71,6 +71,7 @@ static const char * const map_type_name[] = {
[BPF_MAP_TYPE_XSKMAP] = "xskmap", [BPF_MAP_TYPE_XSKMAP] = "xskmap",
[BPF_MAP_TYPE_SOCKHASH] = "sockhash", [BPF_MAP_TYPE_SOCKHASH] = "sockhash",
[BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage",
[BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray",
}; };
static bool map_is_per_cpu(__u32 type) static bool map_is_per_cpu(__u32 type)
...@@ -673,12 +674,6 @@ static int do_dump(int argc, char **argv) ...@@ -673,12 +674,6 @@ static int do_dump(int argc, char **argv)
if (fd < 0) if (fd < 0)
return -1; return -1;
if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) {
p_err("Dumping maps of maps and program maps not supported");
close(fd);
return -1;
}
key = malloc(info.key_size); key = malloc(info.key_size);
value = alloc_value(&info); value = alloc_value(&info);
if (!key || !value) { if (!key || !value) {
...@@ -732,7 +727,9 @@ static int do_dump(int argc, char **argv) ...@@ -732,7 +727,9 @@ static int do_dump(int argc, char **argv)
} else { } else {
print_entry_plain(&info, key, value); print_entry_plain(&info, key, value);
} }
} else { num_elems++;
} else if (!map_is_map_of_maps(info.type) &&
!map_is_map_of_progs(info.type)) {
if (json_output) { if (json_output) {
jsonw_name(json_wtr, "key"); jsonw_name(json_wtr, "key");
print_hex_data_json(key, info.key_size); print_hex_data_json(key, info.key_size);
...@@ -749,7 +746,6 @@ static int do_dump(int argc, char **argv) ...@@ -749,7 +746,6 @@ static int do_dump(int argc, char **argv)
} }
prev_key = key; prev_key = key;
num_elems++;
} }
if (json_output) if (json_output)
......
// SPDX-License-Identifier: GPL-2.0+
// Copyright (C) 2018 Facebook
#define _GNU_SOURCE
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libbpf.h>
#include <net/if.h>
#include <linux/if.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_bpf.h>
#include <sys/socket.h>
#include <bpf.h>
#include <nlattr.h>
#include "main.h"
#include "netlink_dumper.h"
/* One (devname, ifindex) pair collected from an RTM_GETLINK dump. */
struct ip_devname_ifindex {
	char devname[64];
	int ifindex;
};

/* Cookie for dump_link_nlmsg(): growable device array plus an optional
 * single-device filter (filter_idx > 0 restricts output to that ifindex).
 */
struct bpf_netdev_t {
	struct ip_devname_ifindex *devices;
	int used_len;	/* entries filled in */
	int array_len;	/* entries allocated */
	int filter_idx;
};

/* One tc (kind, handle) pair found while walking classes and qdiscs. */
struct tc_kind_handle {
	char kind[64];
	int handle;
};

/* Cookie for dump_class_qdisc_nlmsg(): collected (kind, handle) pairs;
 * is_qdisc enables the qdisc-specific filtering (skip clsact, handle 0).
 */
struct bpf_tcinfo_t {
	struct tc_kind_handle *handle_array;
	int used_len;	/* entries filled in */
	int array_len;	/* entries allocated */
	bool is_qdisc;
};

/* Cookie for dump_filter_nlmsg(): identifies which device/kind the
 * currently dumped filters belong to.
 */
struct bpf_filter_t {
	const char *kind;
	const char *devname;
	int ifindex;
};
/* Per-link callback for the RTM_GETLINK dump: remember the device in the
 * cookie's array (unless filtered out) and print any XDP attachment.
 * Returns 0 on success or a negative errno.
 */
static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
{
	struct bpf_netdev_t *netinfo = cookie;
	struct ifinfomsg *ifinfo = msg;

	if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index)
		return 0;

	if (netinfo->used_len == netinfo->array_len) {
		struct ip_devname_ifindex *tmp;

		/* Grow in chunks of 16.  Keep the realloc result in a
		 * temporary: assigning it straight to ->devices would leak
		 * the old array on failure and leave the caller's pointer
		 * NULL.
		 */
		tmp = realloc(netinfo->devices,
			      (netinfo->array_len + 16) *
			      sizeof(struct ip_devname_ifindex));
		if (!tmp)
			return -ENOMEM;
		netinfo->devices = tmp;
		netinfo->array_len += 16;
	}
	netinfo->devices[netinfo->used_len].ifindex = ifinfo->ifi_index;
	snprintf(netinfo->devices[netinfo->used_len].devname,
		 sizeof(netinfo->devices[netinfo->used_len].devname),
		 "%s",
		 tb[IFLA_IFNAME] ? nla_getattr_str(tb[IFLA_IFNAME]) : "");
	netinfo->used_len++;

	return do_xdp_dump(ifinfo, tb);
}
/* Per-class/qdisc callback: collect (kind, handle) pairs into the cookie
 * so their filters can be dumped later.  In qdisc mode, clsact qdiscs and
 * handle 0 are skipped (they are covered by the fixed root/clsact pass).
 * Returns 0 on success or a negative errno.
 */
static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
{
	struct bpf_tcinfo_t *tcinfo = cookie;
	struct tcmsg *info = msg;

	if (tcinfo->is_qdisc) {
		/* skip clsact qdisc */
		if (tb[TCA_KIND] &&
		    strcmp(nla_data(tb[TCA_KIND]), "clsact") == 0)
			return 0;
		if (info->tcm_handle == 0)
			return 0;
	}

	if (tcinfo->used_len == tcinfo->array_len) {
		struct tc_kind_handle *tmp;

		/* Grow in chunks of 16.  Use a temporary so the old array
		 * is not leaked when realloc fails.
		 */
		tmp = realloc(tcinfo->handle_array,
			      (tcinfo->array_len + 16) *
			      sizeof(struct tc_kind_handle));
		if (!tmp)
			return -ENOMEM;
		tcinfo->handle_array = tmp;
		tcinfo->array_len += 16;
	}
	tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle;
	snprintf(tcinfo->handle_array[tcinfo->used_len].kind,
		 sizeof(tcinfo->handle_array[tcinfo->used_len].kind),
		 "%s",
		 tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown");
	tcinfo->used_len++;

	return 0;
}
/* Per-filter callback: forward the message to do_filter_dump() together
 * with the device/kind context carried in the cookie.
 */
static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb)
{
	const struct bpf_filter_t *filter_info = cookie;
	struct tcmsg *info = msg;

	return do_filter_dump(info, tb, filter_info->kind,
			      filter_info->devname, filter_info->ifindex);
}
/* Dump all bpf tc filters attached under one device: first the filters
 * hanging off tc classes and non-clsact qdiscs, then the fixed root,
 * clsact/ingress and clsact/egress handles.
 * Returns 0 on success or a negative libbpf error code (the original
 * returned 0 unconditionally, silently hiding netlink failures from
 * the caller).
 */
static int show_dev_tc_bpf(int sock, unsigned int nl_pid,
			   struct ip_devname_ifindex *dev)
{
	static const struct {
		unsigned int handle;
		const char *kind;
	} fixed_handles[] = {
		{ TC_H_ROOT, "root" },
		{ TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS), "clsact/ingress" },
		{ TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS), "clsact/egress" },
	};
	struct bpf_filter_t filter_info;
	struct bpf_tcinfo_t tcinfo;
	unsigned int i;
	int ret;

	tcinfo.handle_array = NULL;
	tcinfo.used_len = 0;
	tcinfo.array_len = 0;

	/* collect class handles first, then (non-clsact) qdisc handles */
	tcinfo.is_qdisc = false;
	ret = nl_get_class(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
			   &tcinfo);
	if (ret)
		goto out;

	tcinfo.is_qdisc = true;
	ret = nl_get_qdisc(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
			   &tcinfo);
	if (ret)
		goto out;

	filter_info.devname = dev->devname;
	filter_info.ifindex = dev->ifindex;
	for (i = 0; i < (unsigned int)tcinfo.used_len; i++) {
		filter_info.kind = tcinfo.handle_array[i].kind;
		ret = nl_get_filter(sock, nl_pid, dev->ifindex,
				    tcinfo.handle_array[i].handle,
				    dump_filter_nlmsg,
				    &filter_info);
		if (ret)
			goto out;
	}

	/* root, ingress and egress handle */
	for (i = 0; i < sizeof(fixed_handles) / sizeof(fixed_handles[0]); i++) {
		filter_info.kind = fixed_handles[i].kind;
		ret = nl_get_filter(sock, nl_pid, dev->ifindex,
				    fixed_handles[i].handle,
				    dump_filter_nlmsg, &filter_info);
		if (ret)
			goto out;
	}

out:
	free(tcinfo.handle_array);
	return ret;
}
/* "bpftool net { show | list } [dev <devname>]": dump xdp attachments for
 * all devices (or one, when "dev" is given), then tc bpf filter
 * attachments per collected device.  Returns 0 on success, nonzero on
 * usage or netlink error.
 */
static int do_show(int argc, char **argv)
{
	int i, sock, ret, filter_idx = -1;
	struct bpf_netdev_t dev_array;
	unsigned int nl_pid;
	char err_buf[256];

	/* optional "dev <name>" argument pair; anything else is a usage error */
	if (argc == 2) {
		if (strcmp(argv[0], "dev") != 0)
			usage();
		filter_idx = if_nametoindex(argv[1]);
		if (filter_idx == 0) {
			fprintf(stderr, "invalid dev name %s\n", argv[1]);
			return -1;
		}
	} else if (argc != 0) {
		usage();
	}

	sock = bpf_netlink_open(&nl_pid);
	if (sock < 0) {
		fprintf(stderr, "failed to open netlink sock\n");
		return -1;
	}

	dev_array.devices = NULL;
	dev_array.used_len = 0;
	dev_array.array_len = 0;
	dev_array.filter_idx = filter_idx;

	if (json_output)
		jsonw_start_array(json_wtr);
	/* The NET_* macros below open/close JSON scopes in strict pairs;
	 * the link dump also fills dev_array for the tc pass.
	 */
	NET_START_OBJECT;
	NET_START_ARRAY("xdp", "%s:\n");
	ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array);
	NET_END_ARRAY("\n");

	if (!ret) {
		NET_START_ARRAY("tc", "%s:\n");
		for (i = 0; i < dev_array.used_len; i++) {
			ret = show_dev_tc_bpf(sock, nl_pid,
					      &dev_array.devices[i]);
			if (ret)
				break;
		}
		NET_END_ARRAY("\n");
	}
	NET_END_OBJECT;
	if (json_output)
		jsonw_end_array(json_wtr);

	if (ret) {
		if (json_output)
			jsonw_null(json_wtr);
		libbpf_strerror(ret, err_buf, sizeof(err_buf));
		fprintf(stderr, "Error: %s\n", err_buf);
	}
	free(dev_array.devices);
	close(sock);
	return ret;
}
/* Print usage for "bpftool net".  Emits a JSON null instead when -j is
 * active so the output stream stays valid JSON.
 */
static int do_help(int argc, char **argv)
{
	if (json_output) {
		jsonw_null(json_wtr);
		return 0;
	}

	fprintf(stderr,
		"Usage: %s %s { show | list } [dev <devname>]\n"
		" %s %s help\n"
		"Note: Only xdp and tc attachments are supported now.\n"
		" For progs attached to cgroups, use \"bpftool cgroup\"\n"
		" to dump program attachments. For program types\n"
		" sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n"
		" consult iproute2.\n",
		bin_name, argv[-2], bin_name, argv[-2]);
	/* argv[-2] is the object name ("net") left in place by cmd_select */

	return 0;
}
/* Subcommand dispatch table for "bpftool net"; "list" is an alias of
 * "show".
 */
static const struct cmd cmds[] = {
	{ "show",	do_show },
	{ "list",	do_show },
	{ "help",	do_help },
	{ 0 }
};
/* Entry point for the "net" object, called from bpftool's top-level
 * command table; dispatches to the cmds[] table above.
 */
int do_net(int argc, char **argv)
{
	return cmd_select(cmds, argc, argv, do_help);
}
// SPDX-License-Identifier: GPL-2.0+
// Copyright (C) 2018 Facebook
#include <stdlib.h>
#include <string.h>
#include <libbpf.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_bpf.h>
#include <nlattr.h>
#include "main.h"
#include "netlink_dumper.h"
/* Print one XDP program id from attribute slot @attr, labelled with
 * @mode ("generic"/"driver"/"offload").  When @new_json_object is set
 * (multi-attachment case) the entry gets its own JSON object.
 * Silently does nothing if the attribute is absent.
 *
 * Note: NET_START_OBJECT/NET_DUMP_* expand to brace blocks, which is why
 * they legally follow `if` without a semicolon here.
 */
static void xdp_dump_prog_id(struct nlattr **tb, int attr,
			     const char *mode,
			     bool new_json_object)
{
	if (!tb[attr])
		return;

	if (new_json_object)
		NET_START_OBJECT
	NET_DUMP_STR("mode", " %s", mode);
	NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[attr]))
	if (new_json_object)
		NET_END_OBJECT
}
/* Print the XDP attachment state carried in one IFLA_XDP nest.  Devices
 * with no attachment are skipped.  Returns 0 on success (including the
 * nothing-attached case), -1 on a netlink parse error.
 */
static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex,
			   const char *name)
{
	struct nlattr *tb[IFLA_XDP_MAX + 1];
	unsigned char mode;

	if (nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0)
		return -1;

	if (!tb[IFLA_XDP_ATTACHED])
		return 0;

	mode = nla_getattr_u8(tb[IFLA_XDP_ATTACHED]);
	if (mode == XDP_ATTACHED_NONE)
		return 0;

	NET_START_OBJECT;
	if (name)
		NET_DUMP_STR("devname", "%s", name);
	NET_DUMP_UINT("ifindex", "(%d)", ifindex);

	switch (mode) {
	case XDP_ATTACHED_MULTI:
		/* one prog id per attach point, as a JSON array */
		if (json_output) {
			jsonw_name(json_wtr, "multi_attachments");
			jsonw_start_array(json_wtr);
		}
		xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic", true);
		xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "driver", true);
		xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload", true);
		if (json_output)
			jsonw_end_array(json_wtr);
		break;
	case XDP_ATTACHED_DRV:
		xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "driver", false);
		break;
	case XDP_ATTACHED_SKB:
		xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "generic", false);
		break;
	case XDP_ATTACHED_HW:
		xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "offload", false);
		break;
	}

	NET_END_OBJECT_FINAL;
	return 0;
}
/* Dump the XDP attachment of one link message, if it carries an
 * IFLA_XDP nest at all.
 */
int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb)
{
	struct nlattr *xdp = tb[IFLA_XDP];

	if (!xdp)
		return 0;

	return do_xdp_dump_one(xdp, ifinfo->ifi_index,
			       nla_getattr_str(tb[IFLA_IFNAME]));
}
/* Print name and id of a single act_bpf action nest.  Returns 0 on
 * success, -LIBBPF_ERRNO__NLPARSE if the nest cannot be parsed or lacks
 * the mandatory parameter block.
 */
static int do_bpf_dump_one_act(struct nlattr *attr)
{
	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];

	/* tb[] is only valid after a successful parse; short-circuit keeps
	 * the PARMS check safe.
	 */
	if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0 ||
	    !tb[TCA_ACT_BPF_PARMS])
		return -LIBBPF_ERRNO__NLPARSE;

	NET_START_OBJECT_NESTED2;
	if (tb[TCA_ACT_BPF_NAME])
		NET_DUMP_STR("name", "%s",
			     nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
	if (tb[TCA_ACT_BPF_ID])
		NET_DUMP_UINT("id", " id %u",
			      nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
	NET_END_OBJECT_NESTED;
	return 0;
}
/* Dump one tc action nest, but only if it is a bpf action; everything
 * else is silently skipped.  Returns 0 or a negative libbpf error.
 */
static int do_dump_one_act(struct nlattr *attr)
{
	struct nlattr *tb[TCA_ACT_MAX + 1];

	if (!attr)
		return 0;

	if (nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0)
		return -LIBBPF_ERRNO__NLPARSE;

	if (!tb[TCA_ACT_KIND] ||
	    strcmp(nla_data(tb[TCA_ACT_KIND]), "bpf") != 0)
		return 0;

	return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]);
}
/* Dump the bpf actions of a filter as an "act" array, stopping at the
 * first action that fails to dump.  Returns the last dump result.
 */
static int do_bpf_act_dump(struct nlattr *attr)
{
	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
	int prio, err = 0;

	if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0)
		return -LIBBPF_ERRNO__NLPARSE;

	NET_START_ARRAY("act", " %s [");
	for (prio = 0; prio <= TCA_ACT_MAX_PRIO; prio++) {
		err = do_dump_one_act(tb[prio]);
		if (err)
			break;
	}
	NET_END_ARRAY("] ");

	return err;
}
/* Print name, id and actions of one cls_bpf filter options nest.
 * Returns 0 on success or a negative libbpf error.
 */
static int do_bpf_filter_dump(struct nlattr *attr)
{
	struct nlattr *tb[TCA_BPF_MAX + 1];

	if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0)
		return -LIBBPF_ERRNO__NLPARSE;

	if (tb[TCA_BPF_NAME])
		NET_DUMP_STR("name", " %s", nla_getattr_str(tb[TCA_BPF_NAME]));
	if (tb[TCA_BPF_ID])
		NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[TCA_BPF_ID]));
	if (tb[TCA_BPF_ACT])
		return do_bpf_act_dump(tb[TCA_BPF_ACT]);

	return 0;
}
/* Dump one tc filter if (and only if) it is a bpf classifier.
 * @kind/@devname/@ifindex label the output; @info is currently unused.
 * Returns 0 on success or a negative libbpf error.
 */
int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind,
		   const char *devname, int ifindex)
{
	int ret = 0;

	/* TCA_KIND may be absent from the message; check the attribute
	 * before strcmp() so we never hand nla_data() a NULL attribute.
	 */
	if (tb[TCA_OPTIONS] && tb[TCA_KIND] &&
	    strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) {
		NET_START_OBJECT;
		if (devname[0] != '\0')
			NET_DUMP_STR("devname", "%s", devname);
		NET_DUMP_UINT("ifindex", "(%u)", ifindex);
		NET_DUMP_STR("kind", " %s", kind);
		ret = do_bpf_filter_dump(tb[TCA_OPTIONS]);
		NET_END_OBJECT_FINAL;
	}

	return ret;
}
// SPDX-License-Identifier: GPL-2.0+
// Copyright (C) 2018 Facebook
#ifndef _NETLINK_DUMPER_H_
#define _NETLINK_DUMPER_H_
#define NET_START_OBJECT \
{ \
if (json_output) \
jsonw_start_object(json_wtr); \
}
#define NET_START_OBJECT_NESTED(name) \
{ \
if (json_output) { \
jsonw_name(json_wtr, name); \
jsonw_start_object(json_wtr); \
} else { \
fprintf(stderr, "%s {", name); \
} \
}
#define NET_START_OBJECT_NESTED2 \
{ \
if (json_output) \
jsonw_start_object(json_wtr); \
else \
fprintf(stderr, "{"); \
}
#define NET_END_OBJECT_NESTED \
{ \
if (json_output) \
jsonw_end_object(json_wtr); \
else \
fprintf(stderr, "}"); \
}
#define NET_END_OBJECT \
{ \
if (json_output) \
jsonw_end_object(json_wtr); \
}
#define NET_END_OBJECT_FINAL \
{ \
if (json_output) \
jsonw_end_object(json_wtr); \
else \
fprintf(stderr, "\n"); \
}
#define NET_START_ARRAY(name, fmt_str) \
{ \
if (json_output) { \
jsonw_name(json_wtr, name); \
jsonw_start_array(json_wtr); \
} else { \
fprintf(stderr, fmt_str, name); \
} \
}
#define NET_END_ARRAY(endstr) \
{ \
if (json_output) \
jsonw_end_array(json_wtr); \
else \
fprintf(stderr, "%s", endstr); \
}
#define NET_DUMP_UINT(name, fmt_str, val) \
{ \
if (json_output) \
jsonw_uint_field(json_wtr, name, val); \
else \
fprintf(stderr, fmt_str, val); \
}
#define NET_DUMP_STR(name, fmt_str, str) \
{ \
if (json_output) \
jsonw_string_field(json_wtr, name, str);\
else \
fprintf(stderr, fmt_str, str); \
}
#define NET_DUMP_STR_ONLY(str) \
{ \
if (json_output) \
jsonw_string(json_wtr, str); \
else \
fprintf(stderr, "%s ", str); \
}
#endif
...@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = { ...@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint",
[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
[BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2",
[BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
}; };
static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
......
...@@ -152,6 +152,7 @@ enum bpf_prog_type { ...@@ -152,6 +152,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_LIRC_MODE2,
BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_SK_REUSEPORT,
BPF_PROG_TYPE_FLOW_DISSECTOR,
}; };
enum bpf_attach_type { enum bpf_attach_type {
...@@ -172,6 +173,7 @@ enum bpf_attach_type { ...@@ -172,6 +173,7 @@ enum bpf_attach_type {
BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP4_SENDMSG,
BPF_CGROUP_UDP6_SENDMSG, BPF_CGROUP_UDP6_SENDMSG,
BPF_LIRC_MODE2, BPF_LIRC_MODE2,
BPF_FLOW_DISSECTOR,
__MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
}; };
...@@ -2333,6 +2335,7 @@ struct __sk_buff { ...@@ -2333,6 +2335,7 @@ struct __sk_buff {
/* ... here. */ /* ... here. */
__u32 data_meta; __u32 data_meta;
struct bpf_flow_keys *flow_keys;
}; };
struct bpf_tunnel_key { struct bpf_tunnel_key {
...@@ -2778,4 +2781,27 @@ enum bpf_task_fd_type { ...@@ -2778,4 +2781,27 @@ enum bpf_task_fd_type {
BPF_FD_TYPE_URETPROBE, /* filename + offset */ BPF_FD_TYPE_URETPROBE, /* filename + offset */
}; };
struct bpf_flow_keys {
__u16 nhoff;
__u16 thoff;
__u16 addr_proto; /* ETH_P_* of valid addrs */
__u8 is_frag;
__u8 is_first_frag;
__u8 is_encap;
__u8 ip_proto;
__be16 n_proto;
__be16 sport;
__be16 dport;
union {
struct {
__be32 ipv4_src;
__be32 ipv4_dst;
};
struct {
__u32 ipv6_src[4]; /* in6_addr; network order */
__u32 ipv6_dst[4]; /* in6_addr; network order */
};
};
};
#endif /* _UAPI__LINUX_BPF_H__ */ #endif /* _UAPI__LINUX_BPF_H__ */
libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o
...@@ -28,16 +28,8 @@ ...@@ -28,16 +28,8 @@
#include <linux/bpf.h> #include <linux/bpf.h>
#include "bpf.h" #include "bpf.h"
#include "libbpf.h" #include "libbpf.h"
#include "nlattr.h"
#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <sys/socket.h>
#include <errno.h> #include <errno.h>
#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif
/* /*
* When building perf, unistd.h is overridden. __NR_bpf is * When building perf, unistd.h is overridden. __NR_bpf is
* required to be defined explicitly. * required to be defined explicitly.
...@@ -499,127 +491,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) ...@@ -499,127 +491,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
} }
int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
{
struct sockaddr_nl sa;
int sock, seq = 0, len, ret = -1;
char buf[4096];
struct nlattr *nla, *nla_xdp;
struct {
struct nlmsghdr nh;
struct ifinfomsg ifinfo;
char attrbuf[64];
} req;
struct nlmsghdr *nh;
struct nlmsgerr *err;
socklen_t addrlen;
int one = 1;
memset(&sa, 0, sizeof(sa));
sa.nl_family = AF_NETLINK;
sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
if (sock < 0) {
return -errno;
}
if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
&one, sizeof(one)) < 0) {
fprintf(stderr, "Netlink error reporting not supported\n");
}
if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
ret = -errno;
goto cleanup;
}
addrlen = sizeof(sa);
if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
ret = -errno;
goto cleanup;
}
if (addrlen != sizeof(sa)) {
ret = -LIBBPF_ERRNO__INTERNAL;
goto cleanup;
}
memset(&req, 0, sizeof(req));
req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
req.nh.nlmsg_type = RTM_SETLINK;
req.nh.nlmsg_pid = 0;
req.nh.nlmsg_seq = ++seq;
req.ifinfo.ifi_family = AF_UNSPEC;
req.ifinfo.ifi_index = ifindex;
/* started nested attribute for XDP */
nla = (struct nlattr *)(((char *)&req)
+ NLMSG_ALIGN(req.nh.nlmsg_len));
nla->nla_type = NLA_F_NESTED | IFLA_XDP;
nla->nla_len = NLA_HDRLEN;
/* add XDP fd */
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
nla_xdp->nla_type = IFLA_XDP_FD;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
nla->nla_len += nla_xdp->nla_len;
/* if user passed in any flags, add those too */
if (flags) {
nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
nla_xdp->nla_type = IFLA_XDP_FLAGS;
nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
nla->nla_len += nla_xdp->nla_len;
}
req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
ret = -errno;
goto cleanup;
}
len = recv(sock, buf, sizeof(buf), 0);
if (len < 0) {
ret = -errno;
goto cleanup;
}
for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
nh = NLMSG_NEXT(nh, len)) {
if (nh->nlmsg_pid != sa.nl_pid) {
ret = -LIBBPF_ERRNO__WRNGPID;
goto cleanup;
}
if (nh->nlmsg_seq != seq) {
ret = -LIBBPF_ERRNO__INVSEQ;
goto cleanup;
}
switch (nh->nlmsg_type) {
case NLMSG_ERROR:
err = (struct nlmsgerr *)NLMSG_DATA(nh);
if (!err->error)
continue;
ret = err->error;
nla_dump_errormsg(nh);
goto cleanup;
case NLMSG_DONE:
break;
default:
break;
}
}
ret = 0;
cleanup:
close(sock);
return ret;
}
int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
bool do_log) bool do_log)
{ {
......
...@@ -1502,6 +1502,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) ...@@ -1502,6 +1502,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_SK_REUSEPORT:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
return false; return false;
case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_UNSPEC:
case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_KPROBE:
...@@ -2121,6 +2122,7 @@ static const struct { ...@@ -2121,6 +2122,7 @@ static const struct {
BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB), BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB),
BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG), BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG),
BPF_PROG_SEC("lirc_mode2", BPF_PROG_TYPE_LIRC_MODE2), BPF_PROG_SEC("lirc_mode2", BPF_PROG_TYPE_LIRC_MODE2),
BPF_PROG_SEC("flow_dissector", BPF_PROG_TYPE_FLOW_DISSECTOR),
BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND), BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND),
BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND), BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND),
BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT), BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
...@@ -2336,7 +2338,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, ...@@ -2336,7 +2338,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
bpf_program__set_expected_attach_type(prog, bpf_program__set_expected_attach_type(prog,
expected_attach_type); expected_attach_type);
if (!bpf_program__is_function_storage(prog, obj) && !first_prog) if (!first_prog)
first_prog = prog; first_prog = prog;
} }
......
...@@ -46,6 +46,7 @@ enum libbpf_errno { ...@@ -46,6 +46,7 @@ enum libbpf_errno {
LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */
LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */ LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */
LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */ LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */
LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */
__LIBBPF_ERRNO__END, __LIBBPF_ERRNO__END,
}; };
...@@ -297,4 +298,19 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size, ...@@ -297,4 +298,19 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size,
unsigned long page_size, unsigned long page_size,
void **buf, size_t *buf_len, void **buf, size_t *buf_len,
bpf_perf_event_print_t fn, void *priv); bpf_perf_event_print_t fn, void *priv);
struct nlmsghdr;
struct nlattr;
typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t,
void *cookie);
int bpf_netlink_open(unsigned int *nl_pid);
int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
void *cookie);
int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
dump_nlmsg_t dump_class_nlmsg, void *cookie);
int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
dump_nlmsg_t dump_qdisc_nlmsg, void *cookie);
int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
dump_nlmsg_t dump_filter_nlmsg, void *cookie);
#endif #endif
...@@ -42,6 +42,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = { ...@@ -42,6 +42,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
[ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type",
[ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message",
[ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence",
[ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing",
}; };
int libbpf_strerror(int err, char *buf, size_t size) int libbpf_strerror(int err, char *buf, size_t size)
......
// SPDX-License-Identifier: LGPL-2.1
/* Copyright (c) 2018 Facebook */
#include <stdlib.h>
#include <memory.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <linux/bpf.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>

#include "bpf.h"
#include "libbpf.h"
#include "nlattr.h"
#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif
/* Open and bind a NETLINK_ROUTE socket, reporting the kernel-assigned
 * netlink port id through *nl_pid (callers use it to validate replies).
 * Returns the socket fd on success, a negative errno or libbpf error
 * code on failure.
 */
int bpf_netlink_open(__u32 *nl_pid)
{
	struct sockaddr_nl sa;
	socklen_t addrlen;
	int one = 1, ret;
	int sock;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;

	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (sock < 0)
		return -errno;

	/* Best effort: extended ACKs improve error reporting but are not
	 * required, so failure here is only warned about, not fatal.
	 */
	if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
		       &one, sizeof(one)) < 0) {
		fprintf(stderr, "Netlink error reporting not supported\n");
	}

	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		ret = -errno;
		goto cleanup;
	}

	/* Read the address back to learn the port id the kernel picked. */
	addrlen = sizeof(sa);
	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
		ret = -errno;
		goto cleanup;
	}

	if (addrlen != sizeof(sa)) {
		ret = -LIBBPF_ERRNO__INTERNAL;
		goto cleanup;
	}

	*nl_pid = sa.nl_pid;
	return sock;

cleanup:
	close(sock);
	return ret;
}
/* Receive and walk netlink replies for request 'seq' until the dump or
 * ACK completes. Each message's pid/seq is checked to reject replies
 * meant for another socket or request. NLM_F_MULTI replies trigger
 * another recv() round until NLMSG_DONE arrives. When _fn is non-NULL
 * it is invoked on every data message, forwarding the caller's callback
 * fn and its cookie. Returns 0 on success or a negative error code.
 */
static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
			    __dump_nlmsg_t _fn, dump_nlmsg_t fn,
			    void *cookie)
{
	bool multipart = true;
	struct nlmsgerr *err;
	struct nlmsghdr *nh;
	char buf[4096];
	int len, ret;

	while (multipart) {
		multipart = false;
		len = recv(sock, buf, sizeof(buf), 0);
		if (len < 0) {
			ret = -errno;
			goto done;
		}

		if (len == 0)
			break;

		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
		     nh = NLMSG_NEXT(nh, len)) {
			if (nh->nlmsg_pid != nl_pid) {
				ret = -LIBBPF_ERRNO__WRNGPID;
				goto done;
			}
			if (nh->nlmsg_seq != seq) {
				ret = -LIBBPF_ERRNO__INVSEQ;
				goto done;
			}
			/* A multipart reply needs another recv() round. */
			if (nh->nlmsg_flags & NLM_F_MULTI)
				multipart = true;
			switch (nh->nlmsg_type) {
			case NLMSG_ERROR:
				err = (struct nlmsgerr *)NLMSG_DATA(nh);
				if (!err->error)
					continue;	/* error == 0 is an ACK */
				ret = err->error;
				nla_dump_errormsg(nh);
				goto done;
			case NLMSG_DONE:
				return 0;
			default:
				break;
			}
			if (_fn) {
				ret = _fn(nh, fn, cookie);
				if (ret)
					return ret;
			}
		}
	}
	ret = 0;
done:
	return ret;
}
/* Attach the XDP program referred to by fd to device ifindex by sending
 * an RTM_SETLINK request carrying a nested IFLA_XDP attribute
 * (IFLA_XDP_FD plus optional IFLA_XDP_FLAGS), then wait for the ACK.
 * Returns 0 on success or a negative error code.
 */
int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
{
	int sock, seq = 0, ret;
	struct nlattr *nla, *nla_xdp;
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg ifinfo;
		char attrbuf[64];	/* room for the hand-built attributes */
	} req;
	__u32 nl_pid;

	sock = bpf_netlink_open(&nl_pid);
	if (sock < 0)
		return sock;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_type = RTM_SETLINK;
	req.nh.nlmsg_pid = 0;
	req.nh.nlmsg_seq = ++seq;
	req.ifinfo.ifi_family = AF_UNSPEC;
	req.ifinfo.ifi_index = ifindex;

	/* started nested attribute for XDP */
	nla = (struct nlattr *)(((char *)&req)
				+ NLMSG_ALIGN(req.nh.nlmsg_len));
	nla->nla_type = NLA_F_NESTED | IFLA_XDP;
	nla->nla_len = NLA_HDRLEN;

	/* add XDP fd */
	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
	nla_xdp->nla_type = IFLA_XDP_FD;
	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
	nla->nla_len += nla_xdp->nla_len;

	/* if user passed in any flags, add those too */
	if (flags) {
		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
		nla_xdp->nla_type = IFLA_XDP_FLAGS;
		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
		nla->nla_len += nla_xdp->nla_len;
	}
	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
		ret = -errno;
		goto cleanup;
	}
	/* No data expected back; just wait for the kernel's ACK/error. */
	ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);

cleanup:
	close(sock);
	return ret;
}
/* Decode one RTM_*LINK message: build the IFLA_* attribute table and
 * hand it, together with the ifinfomsg, to the caller's callback.
 */
static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg,
			     void *cookie)
{
	struct ifinfomsg *ifi = NLMSG_DATA(nlh);
	struct nlattr *attrs[IFLA_MAX + 1];
	struct nlattr *first;
	int rem = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));

	first = (struct nlattr *)((void *)ifi + NLMSG_ALIGN(sizeof(*ifi)));
	if (nla_parse(attrs, IFLA_MAX, first, rem, NULL) != 0)
		return -LIBBPF_ERRNO__NLPARSE;

	return dump_link_nlmsg(cookie, ifi, attrs);
}
/* Dump all network links: send an RTM_GETLINK dump request and feed each
 * reply through __dump_link_nlmsg() to the caller's callback.
 */
int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
		void *cookie)
{
	int seq = time(NULL);
	struct {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
	} req = {
		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
		.nlh.nlmsg_type = RTM_GETLINK,
		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
		.nlh.nlmsg_seq = seq,
		.ifm.ifi_family = AF_PACKET,
	};

	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
		return -errno;

	return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg,
				dump_link_nlmsg, cookie);
}
/* Decode one RTM_*TCLASS message: parse its TCA_* attributes and pass
 * the tcmsg plus attribute table to the caller's callback.
 */
static int __dump_class_nlmsg(struct nlmsghdr *nlh,
			      dump_nlmsg_t dump_class_nlmsg, void *cookie)
{
	struct tcmsg *t = NLMSG_DATA(nlh);
	struct nlattr *attrs[TCA_MAX + 1];
	struct nlattr *first;
	int rem = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));

	first = (struct nlattr *)((void *)t + NLMSG_ALIGN(sizeof(*t)));
	if (nla_parse(attrs, TCA_MAX, first, rem, NULL) != 0)
		return -LIBBPF_ERRNO__NLPARSE;

	return dump_class_nlmsg(cookie, t, attrs);
}
/* Dump the traffic-control classes of one interface via RTM_GETTCLASS. */
int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
		 dump_nlmsg_t dump_class_nlmsg, void *cookie)
{
	int seq = time(NULL);
	struct {
		struct nlmsghdr nlh;
		struct tcmsg t;
	} req = {
		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.nlh.nlmsg_type = RTM_GETTCLASS,
		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
		.nlh.nlmsg_seq = seq,
		.t.tcm_family = AF_UNSPEC,
		.t.tcm_ifindex = ifindex,
	};

	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
		return -errno;

	return bpf_netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg,
				dump_class_nlmsg, cookie);
}
/* Decode one RTM_*QDISC message: parse its TCA_* attributes and pass
 * the tcmsg plus attribute table to the caller's callback.
 */
static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh,
			      dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
{
	struct tcmsg *t = NLMSG_DATA(nlh);
	struct nlattr *attrs[TCA_MAX + 1];
	struct nlattr *first;
	int rem = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));

	first = (struct nlattr *)((void *)t + NLMSG_ALIGN(sizeof(*t)));
	if (nla_parse(attrs, TCA_MAX, first, rem, NULL) != 0)
		return -LIBBPF_ERRNO__NLPARSE;

	return dump_qdisc_nlmsg(cookie, t, attrs);
}
/* Dump the queueing disciplines of one interface via RTM_GETQDISC. */
int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
		 dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
{
	int seq = time(NULL);
	struct {
		struct nlmsghdr nlh;
		struct tcmsg t;
	} req = {
		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.nlh.nlmsg_type = RTM_GETQDISC,
		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
		.nlh.nlmsg_seq = seq,
		.t.tcm_family = AF_UNSPEC,
		.t.tcm_ifindex = ifindex,
	};

	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
		return -errno;

	return bpf_netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg,
				dump_qdisc_nlmsg, cookie);
}
/* Decode one RTM_*TFILTER message: parse its TCA_* attributes and pass
 * the tcmsg plus attribute table to the caller's callback.
 */
static int __dump_filter_nlmsg(struct nlmsghdr *nlh,
			       dump_nlmsg_t dump_filter_nlmsg, void *cookie)
{
	struct tcmsg *t = NLMSG_DATA(nlh);
	struct nlattr *attrs[TCA_MAX + 1];
	struct nlattr *first;
	int rem = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));

	first = (struct nlattr *)((void *)t + NLMSG_ALIGN(sizeof(*t)));
	if (nla_parse(attrs, TCA_MAX, first, rem, NULL) != 0)
		return -LIBBPF_ERRNO__NLPARSE;

	return dump_filter_nlmsg(cookie, t, attrs);
}
/* Dump the tc filters attached under 'handle' (tcm_parent) on one
 * interface via RTM_GETTFILTER.
 */
int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
		  dump_nlmsg_t dump_filter_nlmsg, void *cookie)
{
	int seq = time(NULL);
	struct {
		struct nlmsghdr nlh;
		struct tcmsg t;
	} req = {
		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.nlh.nlmsg_type = RTM_GETTFILTER,
		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
		.nlh.nlmsg_seq = seq,
		.t.tcm_family = AF_UNSPEC,
		.t.tcm_ifindex = ifindex,
		.t.tcm_parent = handle,
	};

	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
		return -errno;

	return bpf_netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg,
				dump_filter_nlmsg, cookie);
}
...@@ -26,11 +26,6 @@ static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = { ...@@ -26,11 +26,6 @@ static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = {
[NLA_FLAG] = 0, [NLA_FLAG] = 0,
}; };
static int nla_len(const struct nlattr *nla)
{
return nla->nla_len - NLA_HDRLEN;
}
static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{ {
int totlen = NLA_ALIGN(nla->nla_len); int totlen = NLA_ALIGN(nla->nla_len);
...@@ -46,11 +41,6 @@ static int nla_ok(const struct nlattr *nla, int remaining) ...@@ -46,11 +41,6 @@ static int nla_ok(const struct nlattr *nla, int remaining)
nla->nla_len <= remaining; nla->nla_len <= remaining;
} }
static void *nla_data(const struct nlattr *nla)
{
return (char *) nla + NLA_HDRLEN;
}
static int nla_type(const struct nlattr *nla) static int nla_type(const struct nlattr *nla)
{ {
return nla->nla_type & NLA_TYPE_MASK; return nla->nla_type & NLA_TYPE_MASK;
...@@ -114,8 +104,8 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) ...@@ -114,8 +104,8 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh)
* @see nla_validate * @see nla_validate
* @return 0 on success or a negative error code. * @return 0 on success or a negative error code.
*/ */
static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
struct nla_policy *policy) struct nla_policy *policy)
{ {
struct nlattr *nla; struct nlattr *nla;
int rem, err; int rem, err;
...@@ -146,6 +136,25 @@ static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int ...@@ -146,6 +136,25 @@ static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int
return err; return err;
} }
/**
* Create attribute index based on nested attribute
* @arg tb Index array to be filled (maxtype+1 elements).
* @arg maxtype Maximum attribute type expected and accepted.
* @arg nla Nested Attribute.
* @arg policy Attribute validation policy.
*
* Feeds the stream of attributes nested into the specified attribute
* to nla_parse().
*
* @see nla_parse
* @return 0 on success or a negative error code.
*/
int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
struct nla_policy *policy)
{
return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy);
}
/* dump netlink extended ack error message */ /* dump netlink extended ack error message */
int nla_dump_errormsg(struct nlmsghdr *nlh) int nla_dump_errormsg(struct nlmsghdr *nlh)
{ {
......
...@@ -67,6 +67,44 @@ struct nla_policy { ...@@ -67,6 +67,44 @@ struct nla_policy {
nla_ok(pos, rem); \ nla_ok(pos, rem); \
pos = nla_next(pos, &(rem))) pos = nla_next(pos, &(rem)))
/**
* nla_data - head of payload
* @nla: netlink attribute
*/
static inline void *nla_data(const struct nlattr *nla)
{
return (char *) nla + NLA_HDRLEN;
}
static inline uint8_t nla_getattr_u8(const struct nlattr *nla)
{
return *(uint8_t *)nla_data(nla);
}
static inline uint32_t nla_getattr_u32(const struct nlattr *nla)
{
return *(uint32_t *)nla_data(nla);
}
static inline const char *nla_getattr_str(const struct nlattr *nla)
{
return (const char *)nla_data(nla);
}
/**
* nla_len - length of payload
* @nla: netlink attribute
*/
static inline int nla_len(const struct nlattr *nla)
{
return nla->nla_len - NLA_HDRLEN;
}
int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
struct nla_policy *policy);
int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
struct nla_policy *policy);
int nla_dump_errormsg(struct nlmsghdr *nlh); int nla_dump_errormsg(struct nlmsghdr *nlh);
#endif /* __NLATTR_H */ #endif /* __NLATTR_H */
...@@ -19,3 +19,9 @@ test_btf ...@@ -19,3 +19,9 @@ test_btf
test_sockmap test_sockmap
test_lirc_mode2_user test_lirc_mode2_user
get_cgroup_id_user get_cgroup_id_user
test_skb_cgroup_id_user
test_socket_cookie
test_cgroup_storage
test_select_reuseport
test_flow_dissector
flow_dissector_load
...@@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test ...@@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
test_skb_cgroup_id_kern.o test_skb_cgroup_id_kern.o bpf_flow.o
# Order correspond to 'make run_tests' order # Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \ TEST_PROGS := test_kmod.sh \
...@@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \ ...@@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \
test_tunnel.sh \ test_tunnel.sh \
test_lwt_seg6local.sh \ test_lwt_seg6local.sh \
test_lirc_mode2.sh \ test_lirc_mode2.sh \
test_skb_cgroup_id.sh test_skb_cgroup_id.sh \
test_flow_dissector.sh
# Compile but not part of 'make run_tests' # Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector
include ../lib.mk include ../lib.mk
......
// SPDX-License-Identifier: GPL-2.0
#include <limits.h>
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_packet.h>
#include <sys/socket.h>
#include <linux/if_tunnel.h>
#include <linux/mpls.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
/* Not referenced in this file; presumably consumed by the loader. */
int _version SEC("version") = 1;

/* Declare a tail-called sub-program placed in its own ELF section. */
#define PROG(F) SEC(#F) int bpf_func_##F

/* These are the identifiers of the BPF programs that will be used in tail
 * calls. Name is limited to 16 characters, with the terminating character and
 * bpf_func_ above, we have only 6 to work with, anything after will be cropped.
 */
enum {
	IP,
	IPV6,
	IPV6OP,	/* Destination/Hop-by-Hop Options IPv6 Extension header */
	IPV6FR,	/* Fragmentation IPv6 Extension Header */
	MPLS,
	VLAN,
};

/* IPv4 fragmentation bits of iphdr->frag_off (compared via bpf_htons). */
#define IP_MF		0x2000
#define IP_OFFSET	0x1FFF

/* IPv6 fragment-header bits of frag_hdr->frag_off. */
#define IP6_MF		0x0001
#define IP6_OFFSET	0xFFF8

/* 802.1Q/802.1AD tag following the Ethernet header. */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* Minimal GRE header: flags/version word and the inner protocol. */
struct gre_hdr {
	__be16 flags;
	__be16 proto;
};

/* IPv6 fragmentation extension header layout. */
struct frag_hdr {
	__u8 nexthdr;
	__u8 reserved;
	__be16 frag_off;
	__be32 identification;
};

/* Program array holding the tail-call targets, indexed by the enum above. */
struct bpf_map_def SEC("maps") jmp_table = {
	.type = BPF_MAP_TYPE_PROG_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = 8
};
/* Return a pointer to hdr_size bytes of packet data at the current parse
 * offset (flow_keys->nhoff). Fast path: a direct pointer into the linear
 * data when it is within bounds; otherwise fall back to copying the
 * bytes into the caller-provided buffer with bpf_skb_load_bytes().
 * Returns NULL if the offset would overflow or the bytes are unreadable.
 */
static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
							 __u16 hdr_size,
							 void *buffer)
{
	void *data_end = (void *)(long)skb->data_end;
	void *data = (void *)(long)skb->data;
	__u16 nhoff = skb->flow_keys->nhoff;
	__u8 *hdr;

	/* Verifies this variable offset does not overflow */
	if (nhoff > (USHRT_MAX - hdr_size))
		return NULL;

	hdr = data + nhoff;
	if (hdr + hdr_size <= data_end)
		return hdr;

	if (bpf_skb_load_bytes(skb, nhoff, buffer, hdr_size))
		return NULL;

	return buffer;
}
/* Dispatches on ETHERTYPE: records the protocol in flow_keys->n_proto
 * and tail-calls into the matching sub-program. bpf_tail_call() does not
 * return on success, so reaching the end of this function means the tail
 * call failed or the protocol is unsupported — either way, BPF_DROP.
 */
static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
{
	struct bpf_flow_keys *keys = skb->flow_keys;

	keys->n_proto = proto;
	switch (proto) {
	case bpf_htons(ETH_P_IP):
		bpf_tail_call(skb, &jmp_table, IP);
		break;
	case bpf_htons(ETH_P_IPV6):
		bpf_tail_call(skb, &jmp_table, IPV6);
		break;
	case bpf_htons(ETH_P_MPLS_MC):
	case bpf_htons(ETH_P_MPLS_UC):
		bpf_tail_call(skb, &jmp_table, MPLS);
		break;
	case bpf_htons(ETH_P_8021Q):
	case bpf_htons(ETH_P_8021AD):
		bpf_tail_call(skb, &jmp_table, VLAN);
		break;
	default:
		/* Protocol not supported */
		return BPF_DROP;
	}

	return BPF_DROP;
}
SEC("dissect")
/* Entry point: start dissection from the outermost EtherType, which is
 * the VLAN-encapsulated protocol when a VLAN tag has been accelerated
 * out of the packet by the driver.
 */
int _dissect(struct __sk_buff *skb)
{
	__be16 proto = skb->vlan_present ? skb->vlan_proto : skb->protocol;

	return parse_eth_proto(skb, proto);
}
/* Parses on IPPROTO_*: extracts transport ports for TCP/UDP, steps into
 * IPIP/GRE encapsulations (marking is_encap), and accepts ICMP as-is.
 * Anything else is dropped.
 */
static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
{
	struct bpf_flow_keys *keys = skb->flow_keys;
	void *data_end = (void *)(long)skb->data_end;
	struct icmphdr *icmp, _icmp;
	struct gre_hdr *gre, _gre;
	struct ethhdr *eth, _eth;
	struct tcphdr *tcp, _tcp;
	struct udphdr *udp, _udp;

	keys->ip_proto = proto;
	switch (proto) {
	case IPPROTO_ICMP:
		/* ICMP carries no ports; just validate the header. */
		icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
		if (!icmp)
			return BPF_DROP;
		return BPF_OK;
	case IPPROTO_IPIP:
		keys->is_encap = true;
		return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
	case IPPROTO_IPV6:
		keys->is_encap = true;
		return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6));
	case IPPROTO_GRE:
		gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre);
		if (!gre)
			return BPF_DROP;

		if (bpf_htons(gre->flags & GRE_VERSION))
			/* Only inspect standard GRE packets with version 0 */
			return BPF_OK;

		keys->nhoff += sizeof(*gre); /* Step over GRE Flags and Proto */
		if (GRE_IS_CSUM(gre->flags))
			keys->nhoff += 4; /* Step over chksum and Padding */
		if (GRE_IS_KEY(gre->flags))
			keys->nhoff += 4; /* Step over key */
		if (GRE_IS_SEQ(gre->flags))
			keys->nhoff += 4; /* Step over sequence number */

		keys->is_encap = true;

		if (gre->proto == bpf_htons(ETH_P_TEB)) {
			/* Transparent Ethernet Bridging: an inner Ethernet
			 * frame follows; step over it and recurse on its
			 * EtherType.
			 */
			eth = bpf_flow_dissect_get_header(skb, sizeof(*eth),
							  &_eth);
			if (!eth)
				return BPF_DROP;

			keys->nhoff += sizeof(*eth);

			return parse_eth_proto(skb, eth->h_proto);
		} else {
			return parse_eth_proto(skb, gre->proto);
		}
	case IPPROTO_TCP:
		tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp), &_tcp);
		if (!tcp)
			return BPF_DROP;

		/* doff is in 32-bit words; minimum legal TCP header is 5. */
		if (tcp->doff < 5)
			return BPF_DROP;

		if ((__u8 *)tcp + (tcp->doff << 2) > data_end)
			return BPF_DROP;

		keys->thoff = keys->nhoff;
		keys->sport = tcp->source;
		keys->dport = tcp->dest;
		return BPF_OK;
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		udp = bpf_flow_dissect_get_header(skb, sizeof(*udp), &_udp);
		if (!udp)
			return BPF_DROP;

		keys->thoff = keys->nhoff;
		keys->sport = udp->source;
		keys->dport = udp->dest;
		return BPF_OK;
	default:
		return BPF_DROP;
	}

	return BPF_DROP;
}
/* Dispatch on an IPv6 next-header value: extension headers that need
 * their own parse step go through tail calls; everything else is handed
 * to parse_ip_proto(). Falling past a bpf_tail_call() means it failed,
 * hence the trailing BPF_DROP.
 */
static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
{
	struct bpf_flow_keys *keys = skb->flow_keys;

	keys->ip_proto = nexthdr;
	switch (nexthdr) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
		bpf_tail_call(skb, &jmp_table, IPV6OP);
		break;
	case IPPROTO_FRAGMENT:
		bpf_tail_call(skb, &jmp_table, IPV6FR);
		break;
	default:
		return parse_ip_proto(skb, nexthdr);
	}

	return BPF_DROP;
}
/* IPv4: record the addresses, step over the header (including options),
 * handle fragmentation, then hand off to the transport-layer parser.
 */
PROG(IP)(struct __sk_buff *skb)
{
	void *data_end = (void *)(long)skb->data_end;
	struct bpf_flow_keys *keys = skb->flow_keys;
	void *data = (void *)(long)skb->data;
	struct iphdr *iph, _iph;
	bool done = false;

	iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph);
	if (!iph)
		return BPF_DROP;

	/* IP header cannot be smaller than 20 bytes */
	if (iph->ihl < 5)
		return BPF_DROP;

	keys->addr_proto = ETH_P_IP;
	keys->ipv4_src = iph->saddr;
	keys->ipv4_dst = iph->daddr;

	/* ihl is in 32-bit words, so this also skips IPv4 options. */
	keys->nhoff += iph->ihl << 2;
	if (data + keys->nhoff > data_end)
		return BPF_DROP;

	if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
		keys->is_frag = true;
		if (iph->frag_off & bpf_htons(IP_OFFSET))
			/* From second fragment on, packets do not have headers
			 * we can parse.
			 */
			done = true;
		else
			keys->is_first_frag = true;
	}

	if (done)
		return BPF_OK;

	return parse_ip_proto(skb, iph->protocol);
}
/* IPv6: record both addresses and continue with the next header. */
PROG(IPV6)(struct __sk_buff *skb)
{
	struct bpf_flow_keys *keys = skb->flow_keys;
	struct ipv6hdr *ip6h, _ip6h;

	ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
	if (!ip6h)
		return BPF_DROP;

	keys->addr_proto = ETH_P_IPV6;
	/* saddr/daddr are adjacent in ipv6hdr, as are ipv6_src/ipv6_dst
	 * in bpf_flow_keys, so a single 2x-sized memcpy covers both.
	 */
	memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));

	keys->nhoff += sizeof(struct ipv6hdr);

	return parse_ipv6_proto(skb, ip6h->nexthdr);
}
/* Destination/Hop-by-Hop Options extension header: skip over it and
 * continue dissection with its nexthdr value.
 */
PROG(IPV6OP)(struct __sk_buff *skb)
{
	struct ipv6_opt_hdr *ip6h, _ip6h;

	ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
	if (!ip6h)
		return BPF_DROP;

	/* hlen is in 8-octets and does not include the first 8 bytes
	 * of the header
	 */
	skb->flow_keys->nhoff += (1 + ip6h->hdrlen) << 3;
	return parse_ipv6_proto(skb, ip6h->nexthdr);
}
/* IPv6 fragment extension header: mark fragmentation state (first
 * fragment has offset 0), step over the header, and keep parsing with
 * its nexthdr value.
 */
PROG(IPV6FR)(struct __sk_buff *skb)
{
	struct bpf_flow_keys *keys = skb->flow_keys;
	struct frag_hdr *fragh, _fragh;

	fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh), &_fragh);
	if (!fragh)
		return BPF_DROP;

	keys->nhoff += sizeof(*fragh);
	keys->is_frag = true;
	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET)))
		keys->is_first_frag = true;

	return parse_ipv6_proto(skb, fragh->nexthdr);
}
/* MPLS is not dissected further; just validate that one label header is
 * readable at the current offset.
 */
PROG(MPLS)(struct __sk_buff *skb)
{
	struct mpls_label _mpls;

	if (!bpf_flow_dissect_get_header(skb, sizeof(struct mpls_label),
					 &_mpls))
		return BPF_DROP;

	return BPF_OK;
}
/* VLAN: the EtherType that routed us here sits just before nhoff; peek
 * back at it to distinguish single- from double- (802.1AD) tagging.
 * Triple tagging is rejected.
 */
PROG(VLAN)(struct __sk_buff *skb)
{
	struct bpf_flow_keys *keys = skb->flow_keys;
	struct vlan_hdr *vlan, _vlan;
	__be16 proto;

	/* Peek back to see if single or double-tagging */
	if (bpf_skb_load_bytes(skb, keys->nhoff - sizeof(proto), &proto,
			       sizeof(proto)))
		return BPF_DROP;

	/* Account for double-tagging */
	if (proto == bpf_htons(ETH_P_8021AD)) {
		vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
		if (!vlan)
			return BPF_DROP;

		/* The outer 802.1AD tag must encapsulate an 802.1Q tag. */
		if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
			return BPF_DROP;

		keys->nhoff += sizeof(*vlan);
	}

	vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
	if (!vlan)
		return BPF_DROP;

	keys->nhoff += sizeof(*vlan);
	/* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
	if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
	    vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
		return BPF_DROP;

	return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto);
}
char __license[] SEC("license") = "GPL";
...@@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m ...@@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m
CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA256=m
CONFIG_VXLAN=y CONFIG_VXLAN=y
CONFIG_GENEVE=y CONFIG_GENEVE=y
CONFIG_NET_CLS_FLOWER=m
// SPDX-License-Identifier: GPL-2.0
#include <error.h>
#include <errno.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
/* BPF filesystem path the loaded object gets pinned under. */
const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector";
/* Name of the BPF_MAP_TYPE_PROG_ARRAY holding the tail-call programs. */
const char *cfg_map_name = "jmp_table";
bool cfg_attach = true;		/* NOTE(review): never read in the visible code — confirm use */
char *cfg_section_name;		/* ELF section of the entry-point program (-s) */
char *cfg_path_name;		/* path to the BPF object file (-p) */
/* Load the BPF object at cfg_path_name, fill the cfg_map_name prog array
 * with every program except the cfg_section_name entry point, attach the
 * entry point as the flow dissector, and pin the object under
 * cfg_pin_path. Exits via error(1, ...) on any failure.
 */
static void load_and_attach_program(void)
{
	struct bpf_program *prog, *main_prog;
	struct bpf_map *prog_array;
	int i, fd, prog_fd, ret;
	struct bpf_object *obj;
	int prog_array_fd;

	ret = bpf_prog_load(cfg_path_name, BPF_PROG_TYPE_FLOW_DISSECTOR, &obj,
			    &prog_fd);
	if (ret)
		error(1, 0, "bpf_prog_load %s", cfg_path_name);

	main_prog = bpf_object__find_program_by_title(obj, cfg_section_name);
	if (!main_prog)
		error(1, 0, "bpf_object__find_program_by_title %s",
		      cfg_section_name);

	prog_fd = bpf_program__fd(main_prog);
	if (prog_fd < 0)
		error(1, 0, "bpf_program__fd");

	prog_array = bpf_object__find_map_by_name(obj, cfg_map_name);
	if (!prog_array)
		error(1, 0, "bpf_object__find_map_by_name %s", cfg_map_name);

	prog_array_fd = bpf_map__fd(prog_array);
	if (prog_array_fd < 0)
		error(1, 0, "bpf_map__fd %s", cfg_map_name);

	/* Tail-call targets are indexed in object order, skipping the
	 * entry point itself.
	 */
	i = 0;
	bpf_object__for_each_program(prog, obj) {
		fd = bpf_program__fd(prog);
		if (fd < 0)
			error(1, 0, "bpf_program__fd");

		if (fd != prog_fd) {
			printf("%d: %s\n", i, bpf_program__title(prog, false));
			bpf_map_update_elem(prog_array_fd, &i, &fd, BPF_ANY);
			++i;
		}
	}

	ret = bpf_prog_attach(prog_fd, 0 /* Ignore */, BPF_FLOW_DISSECTOR, 0);
	if (ret)
		error(1, 0, "bpf_prog_attach %s", cfg_path_name);

	ret = bpf_object__pin(obj, cfg_pin_path);
	if (ret)
		error(1, 0, "bpf_object__pin %s", cfg_pin_path);
}
/* Detach the flow-dissector program from the current netns and remove
 * its bpffs pin directory. Exits via error(3) on failure.
 *
 * Fixes: use a bounded snprintf (sprintf could overflow command[] if the
 * pin path grows) and never pass a non-literal string as the format
 * argument of error(3) — a '%' in the path would be interpreted as a
 * conversion specifier.
 */
static void detach_program(void)
{
	char command[64];
	int ret;

	ret = bpf_prog_detach(0, BPF_FLOW_DISSECTOR);
	if (ret)
		error(1, 0, "bpf_prog_detach");

	/* To unpin, it is necessary and sufficient to just remove this dir */
	ret = snprintf(command, sizeof(command), "rm -r %s", cfg_pin_path);
	if (ret < 0 || (size_t)ret >= sizeof(command))
		error(1, 0, "pin path too long");

	ret = system(command);
	if (ret)
		error(1, errno, "%s", command);
}
/* Parse the command line into the cfg_* globals:
 *   -a  attach (default)      -d  detach
 *   -p  path to BPF object    -s  entry section name
 * Attach and detach are mutually exclusive; attaching requires both a
 * path and a section. Exits via error(3) on invalid combinations.
 */
static void parse_opts(int argc, char **argv)
{
	bool attach = false;
	bool detach = false;
	int opt;

	while ((opt = getopt(argc, argv, "adp:s:")) != -1) {
		switch (opt) {
		case 'a':
			if (detach)
				error(1, 0, "attach/detach are exclusive");
			attach = true;
			break;
		case 'd':
			if (attach)
				error(1, 0, "attach/detach are exclusive");
			detach = true;
			break;
		case 'p':
			if (cfg_path_name)
				error(1, 0, "only one prog name can be given");
			cfg_path_name = optarg;
			break;
		case 's':
			if (cfg_section_name)
				error(1, 0, "only one section can be given");
			cfg_section_name = optarg;
			break;
		}
	}

	/* cfg_attach defaults to true; only -d flips it */
	cfg_attach = !detach;
	if (!cfg_attach)
		return;

	if (!cfg_path_name)
		error(1, 0, "must provide a path to the BPF program");
	if (!cfg_section_name)
		error(1, 0, "must provide a section name");
}
/* Entry point: parse flags, then either attach or detach the dissector. */
int main(int argc, char **argv)
{
	parse_opts(argc, argv);

	if (!cfg_attach) {
		detach_program();
		return 0;
	}

	load_and_attach_program();
	return 0;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Inject packets with all sorts of encapsulation into the kernel.
*
* IPv4/IPv6 outer layer 3
* GRE/GUE/BARE outer layer 4, where bare is IPIP/SIT/IPv4-in-IPv6/..
* IPv4/IPv6 inner layer 3
*/
#define _GNU_SOURCE
#include <stddef.h>
#include <arpa/inet.h>
#include <asm/byteorder.h>
#include <error.h>
#include <errno.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ipv6.h>
#include <netinet/ip.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <poll.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#define CFG_PORT_INNER 8000
/* Add some protocol definitions that do not exist in userspace */
/* Minimal GRE header: no optional fields, just the (zero) flags halfword
 * and the EtherType of the encapsulated payload.
 */
struct grehdr {
	uint16_t unused;
	uint16_t protocol;	/* EtherType of inner packet, network order */
} __attribute__((packed));

/* Generic UDP Encapsulation (GUE) header. Only proto_ctype is used by
 * this test; the bitfield layout mirrors the kernel's struct guehdr and
 * depends on the byteorder macros from <asm/byteorder.h>.
 */
struct guehdr {
	union {
		struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
			__u8	hlen:5,
				control:1,
				version:2;
#elif defined (__BIG_ENDIAN_BITFIELD)
			__u8	version:2,
				control:1,
				hlen:5;
#else
#error  "Please fix <asm/byteorder.h>"
#endif
			__u8	proto_ctype;	/* inner IP protocol number */
			__be16	flags;
		};
		__be32	word;	/* whole first word, for bulk access */
	};
};
static uint8_t cfg_dsfield_inner;
static uint8_t cfg_dsfield_outer;
static uint8_t cfg_encap_proto;
static bool cfg_expect_failure = false;
static int cfg_l3_extra = AF_UNSPEC; /* optional SIT prefix */
static int cfg_l3_inner = AF_UNSPEC;
static int cfg_l3_outer = AF_UNSPEC;
static int cfg_num_pkt = 10;
static int cfg_num_secs = 0;
static char cfg_payload_char = 'a';
static int cfg_payload_len = 100;
static int cfg_port_gue = 6080;
static bool cfg_only_rx;
static bool cfg_only_tx;
static int cfg_src_port = 9;
static char buf[ETH_DATA_LEN];
#define INIT_ADDR4(name, addr4, port) \
static struct sockaddr_in name = { \
.sin_family = AF_INET, \
.sin_port = __constant_htons(port), \
.sin_addr.s_addr = __constant_htonl(addr4), \
};
#define INIT_ADDR6(name, addr6, port) \
static struct sockaddr_in6 name = { \
.sin6_family = AF_INET6, \
.sin6_port = __constant_htons(port), \
.sin6_addr = addr6, \
};
INIT_ADDR4(in_daddr4, INADDR_LOOPBACK, CFG_PORT_INNER)
INIT_ADDR4(in_saddr4, INADDR_LOOPBACK + 2, 0)
INIT_ADDR4(out_daddr4, INADDR_LOOPBACK, 0)
INIT_ADDR4(out_saddr4, INADDR_LOOPBACK + 1, 0)
INIT_ADDR4(extra_daddr4, INADDR_LOOPBACK, 0)
INIT_ADDR4(extra_saddr4, INADDR_LOOPBACK + 1, 0)
INIT_ADDR6(in_daddr6, IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER)
INIT_ADDR6(in_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
INIT_ADDR6(out_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
INIT_ADDR6(out_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
INIT_ADDR6(extra_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
INIT_ADDR6(extra_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
/* Current wall-clock time in milliseconds (used only for relative
 * measurements and timeouts).
 *
 * Cast tv_sec to unsigned long before scaling: on ILP32 targets a
 * signed `tv.tv_sec * 1000` overflows for any modern epoch value,
 * which is undefined behavior. Unsigned arithmetic wraps cleanly and
 * keeps the relative deltas this caller needs correct.
 */
static unsigned long util_gettime(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return ((unsigned long) tv.tv_sec * 1000) +
	       ((unsigned long) tv.tv_usec / 1000);
}
/* Print "<msg>: <presentation address>" to stderr for an IPv4 or IPv6
 * sockaddr. Exits via error(3) on any other address family or if the
 * binary-to-text conversion fails.
 */
static void util_printaddr(const char *msg, struct sockaddr *addr)
{
	unsigned long off = 0;
	char nbuf[INET6_ADDRSTRLEN];

	/* locate the raw address bytes inside the family-specific sockaddr */
	switch (addr->sa_family) {
	case PF_INET:
		off = __builtin_offsetof(struct sockaddr_in, sin_addr);
		break;
	case PF_INET6:
		off = __builtin_offsetof(struct sockaddr_in6, sin6_addr);
		break;
	default:
		error(1, 0, "printaddr: unsupported family %u\n",
		      addr->sa_family);
	}

	if (!inet_ntop(addr->sa_family, ((void *) addr) + off, nbuf,
		       sizeof(nbuf)))
		error(1, errno, "inet_ntop");

	fprintf(stderr, "%s: %s\n", msg, nbuf);
}
/* Sum num_u16 16-bit words starting at start. Returns the raw,
 * unfolded ones'-complement accumulator (caller folds the carries).
 */
static unsigned long add_csum_hword(const uint16_t *start, int num_u16)
{
	const uint16_t *end = start + num_u16;
	unsigned long acc = 0;

	while (start < end)
		acc += *start++;

	return acc;
}
/* Internet checksum (RFC 1071 style): add num_u16 halfwords starting at
 * start to the seed `sum` (e.g. a pseudo-header sum), fold the carries
 * back into 16 bits and return the ones' complement.
 */
static uint16_t build_ip_csum(const uint16_t *start, int num_u16,
			      unsigned long sum)
{
	int i;

	/* summing inlined here instead of calling the halfword helper */
	for (i = 0; i < num_u16; i++)
		sum += start[i];

	/* fold carries until the sum fits in 16 bits */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return ~sum;
}
/* Write a 20-byte (no options) IPv4 header at `header`.
 *
 * proto:       L4 protocol number
 * src, dst:    addresses, network byte order
 * payload_len: number of bytes following this header
 * tos:         value for the TOS/DS field
 *
 * Fix: explicitly zero frag_off and check before computing the header
 * checksum. The checksum covers all ihl words (ihl << 1 halfwords),
 * including those two fields, so the old code was only correct when the
 * caller handed in pre-zeroed memory (the static, zero-initialized buf).
 */
static void build_ipv4_header(void *header, uint8_t proto,
			      uint32_t src, uint32_t dst,
			      int payload_len, uint8_t tos)
{
	struct iphdr *iph = header;

	iph->ihl = 5;
	iph->version = 4;
	iph->tos = tos;
	iph->ttl = 8;
	iph->tot_len = htons(sizeof(*iph) + payload_len);
	iph->id = htons(1337);
	iph->frag_off = 0;	/* do not rely on pre-zeroed memory */
	iph->protocol = proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->check = 0;		/* checksum is computed over a zero field */
	iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0);
}
/* Overwrite the 8-bit traffic-class (DS) field of an IPv6 header,
 * preserving the version nibble and the top nibble of the flow label,
 * which share the first halfword with it.
 */
static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield)
{
	uint16_t *first_hword = (uint16_t *) ip6h;
	uint16_t host = ntohs(*first_hword);

	host = (host & 0xF00F) | (((uint16_t) dsfield) << 4);
	*first_hword = htons(host);
}
/* Write a 40-byte IPv6 header at `header`.
 *
 * proto:       next-header value
 * src, dst:    sockaddrs whose sin6_addr fields are copied verbatim
 * payload_len: bytes following this header
 * dsfield:     traffic-class value
 */
static void build_ipv6_header(void *header, uint8_t proto,
			      struct sockaddr_in6 *src,
			      struct sockaddr_in6 *dst,
			      int payload_len, uint8_t dsfield)
{
	struct ipv6hdr *ip6h = header;

	ip6h->version = 6;
	ip6h->payload_len = htons(payload_len);
	ip6h->nexthdr = proto;
	ip6h->hop_limit = 8;
	/* must run after version is set: it rewrites the shared first
	 * halfword while preserving the version nibble
	 */
	ipv6_set_dsfield(ip6h, dsfield);

	memcpy(&ip6h->saddr, &src->sin6_addr, sizeof(ip6h->saddr));
	memcpy(&ip6h->daddr, &dst->sin6_addr, sizeof(ip6h->daddr));
}
/* UDP checksum over an IPv4 pseudo-header plus num_words halfwords of
 * the datagram starting at udph.
 *
 * Note the addressing trick: num_u16 = sizeof(iph->saddr) = 4, and
 * summing 4 halfwords starting at &iph->saddr covers BOTH saddr and
 * daddr (2 halfwords each), i.e. the address portion of the
 * pseudo-header in a single pass.
 */
static uint16_t build_udp_v4_csum(const struct iphdr *iph,
				  const struct udphdr *udph,
				  int num_words)
{
	unsigned long pseudo_sum;
	int num_u16 = sizeof(iph->saddr);	/* halfwords: twice byte len */

	pseudo_sum = add_csum_hword((void *) &iph->saddr, num_u16);
	pseudo_sum += htons(IPPROTO_UDP);	/* zero + protocol bytes */
	pseudo_sum += udph->len;		/* already network order */

	return build_ip_csum((void *) udph, num_words, pseudo_sum);
}
/* UDP checksum over an IPv6 pseudo-header plus num_words halfwords of
 * the datagram starting at udph.
 *
 * Same trick as the IPv4 variant: num_u16 = sizeof(ip6h->saddr) = 16
 * halfwords starting at &ip6h->saddr covers both the 16-byte saddr and
 * the 16-byte daddr in one pass.
 */
static uint16_t build_udp_v6_csum(const struct ipv6hdr *ip6h,
				  const struct udphdr *udph,
				  int num_words)
{
	unsigned long pseudo_sum;
	int num_u16 = sizeof(ip6h->saddr);	/* halfwords: twice byte len */

	pseudo_sum = add_csum_hword((void *) &ip6h->saddr, num_u16);
	pseudo_sum += htons(ip6h->nexthdr);	/* upper-layer protocol */
	pseudo_sum += ip6h->payload_len;	/* already network order */

	return build_ip_csum((void *) udph, num_words, pseudo_sum);
}
/* Fill in a UDP header (source from cfg_src_port, given dport, length,
 * checksum) at `header`, which must sit directly behind an already-built
 * IPv4 or IPv6 header — the pseudo-header sum reads the preceding IP
 * addresses.
 *
 * Fix: round the halfword count UP ((len + 1) >> 1) instead of
 * truncating. For an odd-length datagram the final byte must be
 * checksummed as if zero-padded (RFC 768); build_packet() guarantees a
 * zero pad byte right after the payload, so reading one byte past len is
 * safe. The old `len >> 1` silently excluded the last byte of odd-length
 * payloads, producing a bad checksum. Behavior for even lengths (the
 * default cfg_payload_len = 100) is unchanged.
 */
static void build_udp_header(void *header, int payload_len,
			     uint16_t dport, int family)
{
	struct udphdr *udph = header;
	int len = sizeof(*udph) + payload_len;

	udph->source = htons(cfg_src_port);
	udph->dest = htons(dport);
	udph->len = htons(len);
	udph->check = 0;	/* checksum field must be zero while summing */

	if (family == AF_INET)
		udph->check = build_udp_v4_csum(header - sizeof(struct iphdr),
						udph, (len + 1) >> 1);
	else
		udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr),
						udph, (len + 1) >> 1);
}
/* Write a GUE header: only the inner protocol type is set; the
 * version/control/hlen bits and flags are left untouched (the packet
 * buffer is expected to be zeroed by the caller).
 */
static void build_gue_header(void *header, uint8_t proto)
{
	((struct guehdr *) header)->proto_ctype = proto;
}
/* Write a minimal GRE header: no flags, just the EtherType of the
 * encapsulated packet in network byte order.
 */
static void build_gre_header(void *header, uint16_t proto)
{
	((struct grehdr *) header)->protocol = htons(proto);
}
/* Size in bytes of the (option-free) L3 header for the given family:
 * 20 for AF_INET, 40 otherwise (AF_INET6).
 */
static int l3_length(int family)
{
	return family == AF_INET ? sizeof(struct iphdr)
				 : sizeof(struct ipv6hdr);
}
/* Assemble the test packet into the global `buf`, laid out as:
 *
 *   [extra IP] [outer IP] [GRE | UDP+GUE] [inner IP] [UDP] [payload]
 *
 * where the extra, outer and encap layers are each optional depending on
 * the cfg_* settings. Returns the total packet length in bytes; exits
 * via error(3) if the packet would not fit in buf.
 */
static int build_packet(void)
{
	int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0;
	int el3_len = 0;

	if (cfg_l3_extra)
		el3_len = l3_length(cfg_l3_extra);

	/* calculate header offsets */
	if (cfg_encap_proto) {
		ol3_len = l3_length(cfg_l3_outer);

		if (cfg_encap_proto == IPPROTO_GRE)
			ol4_len = sizeof(struct grehdr);
		else if (cfg_encap_proto == IPPROTO_UDP)
			ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr);
	}

	il3_len = l3_length(cfg_l3_inner);
	il4_len = sizeof(struct udphdr);

	/* >= (not >) so the trailing UDP pad byte below also fits */
	if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >=
	    sizeof(buf))
		error(1, 0, "packet too large\n");

	/*
	 * Fill packet from inside out, to calculate correct checksums.
	 * But create ip before udp headers, as udp uses ip for pseudo-sum.
	 */
	memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len,
	       cfg_payload_char, cfg_payload_len);

	/* add zero byte for udp csum padding */
	buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0;

	/* inner L3 header */
	switch (cfg_l3_inner) {
	case PF_INET:
		build_ipv4_header(buf + el3_len + ol3_len + ol4_len,
				  IPPROTO_UDP,
				  in_saddr4.sin_addr.s_addr,
				  in_daddr4.sin_addr.s_addr,
				  il4_len + cfg_payload_len,
				  cfg_dsfield_inner);
		break;
	case PF_INET6:
		build_ipv6_header(buf + el3_len + ol3_len + ol4_len,
				  IPPROTO_UDP,
				  &in_saddr6, &in_daddr6,
				  il4_len + cfg_payload_len,
				  cfg_dsfield_inner);
		break;
	}

	/* inner UDP header (checksummed against the inner IP header) */
	build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len,
			 cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner);

	if (!cfg_encap_proto)
		return il3_len + il4_len + cfg_payload_len;

	/* outer L3 header carrying the encapsulation */
	switch (cfg_l3_outer) {
	case PF_INET:
		build_ipv4_header(buf + el3_len, cfg_encap_proto,
				  out_saddr4.sin_addr.s_addr,
				  out_daddr4.sin_addr.s_addr,
				  ol4_len + il3_len + il4_len + cfg_payload_len,
				  cfg_dsfield_outer);
		break;
	case PF_INET6:
		build_ipv6_header(buf + el3_len, cfg_encap_proto,
				  &out_saddr6, &out_daddr6,
				  ol4_len + il3_len + il4_len + cfg_payload_len,
				  cfg_dsfield_outer);
		break;
	}

	/* outer L4 encapsulation header(s) */
	switch (cfg_encap_proto) {
	case IPPROTO_UDP:
		/* GUE header sits at the end of the outer-L4 slot, i.e.
		 * right after the outer UDP header
		 */
		build_gue_header(buf + el3_len + ol3_len + ol4_len -
				 sizeof(struct guehdr),
				 cfg_l3_inner == PF_INET ? IPPROTO_IPIP
							 : IPPROTO_IPV6);
		build_udp_header(buf + el3_len + ol3_len,
				 sizeof(struct guehdr) + il3_len + il4_len +
				 cfg_payload_len,
				 cfg_port_gue, cfg_l3_outer);
		break;
	case IPPROTO_GRE:
		build_gre_header(buf + el3_len + ol3_len,
				 cfg_l3_inner == PF_INET ? ETH_P_IP
							 : ETH_P_IPV6);
		break;
	}

	/* optional extra (e.g. SIT prefix) L3 header in front of it all;
	 * its protocol field reflects the outer family it encapsulates
	 */
	switch (cfg_l3_extra) {
	case PF_INET:
		build_ipv4_header(buf,
				  cfg_l3_outer == PF_INET ? IPPROTO_IPIP
							  : IPPROTO_IPV6,
				  extra_saddr4.sin_addr.s_addr,
				  extra_daddr4.sin_addr.s_addr,
				  ol3_len + ol4_len + il3_len + il4_len +
				  cfg_payload_len, 0);
		break;
	case PF_INET6:
		build_ipv6_header(buf,
				  cfg_l3_outer == PF_INET ? IPPROTO_IPIP
							  : IPPROTO_IPV6,
				  &extra_saddr6, &extra_daddr6,
				  ol3_len + ol4_len + il3_len + il4_len +
				  cfg_payload_len, 0);
		break;
	}

	return el3_len + ol3_len + ol4_len + il3_len + il4_len +
	       cfg_payload_len;
}
/* sender transmits encapsulated over RAW or unencap'd over UDP */
/* Create the transmit socket: a raw socket "connected" to the
 * destination of the outermost layer present. Family preference is
 * extra > outer > inner, because the outermost header determines where
 * the kernel must route the frame. Exits via error(3) on failure;
 * returns the connected fd.
 */
static int setup_tx(void)
{
	int family, fd, ret;

	if (cfg_l3_extra)
		family = cfg_l3_extra;
	else if (cfg_l3_outer)
		family = cfg_l3_outer;
	else
		family = cfg_l3_inner;

	/* IPPROTO_RAW: we supply complete IP headers ourselves */
	fd = socket(family, SOCK_RAW, IPPROTO_RAW);
	if (fd == -1)
		error(1, errno, "socket tx");

	if (cfg_l3_extra) {
		if (cfg_l3_extra == PF_INET)
			ret = connect(fd, (void *) &extra_daddr4,
				      sizeof(extra_daddr4));
		else
			ret = connect(fd, (void *) &extra_daddr6,
				      sizeof(extra_daddr6));
		if (ret)
			error(1, errno, "connect tx");
	} else if (cfg_l3_outer) {
		/* connect to destination if not encapsulated */
		if (cfg_l3_outer == PF_INET)
			ret = connect(fd, (void *) &out_daddr4,
				      sizeof(out_daddr4));
		else
			ret = connect(fd, (void *) &out_daddr6,
				      sizeof(out_daddr6));
		if (ret)
			error(1, errno, "connect tx");
	} else {
		/* otherwise using loopback */
		if (cfg_l3_inner == PF_INET)
			ret = connect(fd, (void *) &in_daddr4,
				      sizeof(in_daddr4));
		else
			ret = connect(fd, (void *) &in_daddr6,
				      sizeof(in_daddr6));
		if (ret)
			error(1, errno, "connect tx");
	}

	return fd;
}
/* receiver reads unencapsulated UDP */
/* Create the receive socket: a plain UDP socket bound to the inner
 * destination address and port, where the payload should surface after
 * the kernel has stripped any encapsulation. Exits via error(3) on
 * failure; returns the bound fd.
 */
static int setup_rx(void)
{
	int fd, ret;

	fd = socket(cfg_l3_inner, SOCK_DGRAM, 0);
	if (fd == -1)
		error(1, errno, "socket rx");

	if (cfg_l3_inner == PF_INET)
		ret = bind(fd, (void *) &in_daddr4, sizeof(in_daddr4));
	else
		ret = bind(fd, (void *) &in_daddr6, sizeof(in_daddr6));
	if (ret)
		error(1, errno, "bind rx");

	return fd;
}
/* Write one packet of `len` bytes to fd. Aborts the program on error or
 * short write; returns 1 (the number of packets sent) so callers can
 * accumulate a tx counter.
 */
static int do_tx(int fd, const char *pkt, int len)
{
	int written = write(fd, pkt, len);

	if (written == -1)
		error(1, errno, "send");
	if (written != len)
		error(1, errno, "send: len (%d < %d)\n", written, len);

	return 1;
}
/* Poll a single fd for `events` up to `timeout` ms. Returns the number
 * of ready descriptors (0 on timeout, 1 when ready); aborts on poll
 * failure or if anything other than POLLIN is reported.
 */
static int do_poll(int fd, short events, int timeout)
{
	struct pollfd pfd = { .fd = fd, .events = events };
	int nready = poll(&pfd, 1, timeout);

	if (nready == -1)
		error(1, errno, "poll");
	if (nready && !(pfd.revents & POLLIN))
		error(1, errno, "poll: unexpected event 0x%x\n", pfd.revents);

	return nready;
}
/* Drain everything currently queued on fd without blocking, one byte per
 * recv. Each received byte must equal cfg_payload_char (the start of the
 * expected payload) or the program aborts. Returns the number of
 * datagrams read.
 */
static int do_rx(int fd)
{
	int count = 0;
	char byte;

	for (;;) {
		int r = recv(fd, &byte, 1, MSG_DONTWAIT);

		if (r == -1) {
			if (errno == EAGAIN)
				break;	/* queue drained */
			error(1, errno, "recv");
		}
		if (byte != cfg_payload_char)
			error(1, 0, "recv: payload mismatch");
		count++;
	}

	return count;
}
/* Run the transmit/receive loop.
 *
 * Sends cfg_num_pkt packets (or sends continuously for cfg_num_secs
 * seconds, reporting counters once per second), receiving in lock-step
 * unless running tx-only/rx-only. Afterwards, waits up to 100 ms for
 * straggler packets. Returns the process exit status: 0 when every sent
 * packet arrived — or, with cfg_expect_failure, when none arrived.
 */
static int do_main(void)
{
	unsigned long tstop, treport, tcur;
	int fdt = -1, fdr = -1, len, tx = 0, rx = 0;

	if (!cfg_only_tx)
		fdr = setup_rx();
	if (!cfg_only_rx)
		fdt = setup_tx();

	len = build_packet();

	tcur = util_gettime();
	treport = tcur + 1000;
	tstop = tcur + (cfg_num_secs * 1000);

	while (1) {
		if (!cfg_only_rx)
			tx += do_tx(fdt, buf, len);

		if (!cfg_only_tx)
			rx += do_rx(fdr);

		if (cfg_num_secs) {
			/* timed mode: report and reset counters every second */
			tcur = util_gettime();
			if (tcur >= tstop)
				break;

			if (tcur >= treport) {
				fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);
				tx = 0;
				rx = 0;
				treport = tcur + 1000;
			}
		} else {
			/* counted mode: stop after cfg_num_pkt sends */
			if (tx == cfg_num_pkt)
				break;
		}
	}

	/* read straggler packets, if any */
	if (rx < tx) {
		tstop = util_gettime() + 100;
		while (rx < tx) {
			tcur = util_gettime();
			if (tcur >= tstop)
				break;

			do_poll(fdr, POLLIN, tstop - tcur);
			rx += do_rx(fdr);
		}
	}

	fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);

	if (fdr != -1 && close(fdr))
		error(1, errno, "close rx");
	if (fdt != -1 && close(fdt))
		error(1, errno, "close tx");

	/*
	 * success (== 0) only if received all packets
	 * unless failure is expected, in which case none must arrive.
	 */
	if (cfg_expect_failure)
		return rx != 0;
	else
		return rx != tx;
}
/* Print the usage synopsis to stderr and exit with status 1.
 * Fix: the "[-s <osrc>" option was missing its closing bracket,
 * making it look like -d/-S/-D were nested under -s.
 */
static void __attribute__((noreturn)) usage(const char *filepath)
{
	fprintf(stderr, "Usage: %s [-e gre|gue|bare|none] [-i 4|6] [-l len] "
			"[-O 4|6] [-o 4|6] [-n num] [-t secs] [-R] [-T] "
			"[-s <osrc>] [-d <odst>] [-S <isrc>] [-D <idst>] "
			"[-x <otos>] [-X <itos>] [-f <isport>] [-F]\n",
		filepath);
	exit(1);
}
/* Parse a presentation-format address string into `addr` (an in_addr or
 * in6_addr, per `family`). Exits via error(3) on malformed input or an
 * unsupported family.
 *
 * The parameter was renamed from `optarg` to `str`: the old name
 * shadowed getopt's global `optarg`, which is confusing and trips
 * -Wshadow. (C callers pass by position, so this is fully compatible.)
 */
static void parse_addr(int family, void *addr, const char *str)
{
	int ret;

	ret = inet_pton(family, str, addr);
	if (ret == -1)
		error(1, errno, "inet_pton");	/* EAFNOSUPPORT */
	if (ret == 0)
		error(1, 0, "inet_pton: bad string");
}
/* Convenience wrappers: parse a textual address directly into the
 * address member of a v4/v6 sockaddr. Exit on bad input.
 */
static void parse_addr4(struct sockaddr_in *addr, const char *optarg)
{
	parse_addr(AF_INET, &addr->sin_addr, optarg);
}

static void parse_addr6(struct sockaddr_in6 *addr, const char *optarg)
{
	parse_addr(AF_INET6, &addr->sin6_addr, optarg);
}
/* Map an option argument of "4"/"6" to PF_INET/PF_INET6. Any other
 * value prints the usage message and terminates (usage() never
 * returns, so falling off the end here is unreachable).
 */
static int parse_protocol_family(const char *filepath, const char *optarg)
{
	if (strcmp(optarg, "4") == 0)
		return PF_INET;
	if (strcmp(optarg, "6") == 0)
		return PF_INET6;

	usage(filepath);
}
/* Parse every command-line option into the cfg_* globals, then validate
 * cross-option constraints: -R/-T are exclusive, an encapsulation (-e)
 * requires an outer family (-o) and vice versa, and the extra prefix
 * (-O) also requires encapsulation. Defaults the inner family to IPv6
 * and upgrades bare IPIP to IPv6-in-IP when the inner family is IPv6.
 */
static void parse_opts(int argc, char **argv)
{
	int c;

	while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) {
		switch (c) {
		case 'd':
			/* outer destination: family comes from -o */
			if (cfg_l3_outer == AF_UNSPEC)
				error(1, 0, "-d must be preceded by -o");
			if (cfg_l3_outer == AF_INET)
				parse_addr4(&out_daddr4, optarg);
			else
				parse_addr6(&out_daddr6, optarg);
			break;
		case 'D':
			/* inner destination: family comes from -i */
			if (cfg_l3_inner == AF_UNSPEC)
				error(1, 0, "-D must be preceded by -i");
			if (cfg_l3_inner == AF_INET)
				parse_addr4(&in_daddr4, optarg);
			else
				parse_addr6(&in_daddr6, optarg);
			break;
		case 'e':
			/* encapsulation type */
			if (!strcmp(optarg, "gre"))
				cfg_encap_proto = IPPROTO_GRE;
			else if (!strcmp(optarg, "gue"))
				cfg_encap_proto = IPPROTO_UDP;
			else if (!strcmp(optarg, "bare"))
				cfg_encap_proto = IPPROTO_IPIP;
			else if (!strcmp(optarg, "none"))
				cfg_encap_proto = IPPROTO_IP;	/* == 0 */
			else
				usage(argv[0]);
			break;
		case 'f':
			cfg_src_port = strtol(optarg, NULL, 0);
			break;
		case 'F':
			/* expect the filter to drop everything */
			cfg_expect_failure = true;
			break;
		case 'h':
			usage(argv[0]);
			break;
		case 'i':
			if (!strcmp(optarg, "4"))
				cfg_l3_inner = PF_INET;
			else if (!strcmp(optarg, "6"))
				cfg_l3_inner = PF_INET6;
			else
				usage(argv[0]);
			break;
		case 'l':
			cfg_payload_len = strtol(optarg, NULL, 0);
			break;
		case 'n':
			cfg_num_pkt = strtol(optarg, NULL, 0);
			break;
		case 'o':
			cfg_l3_outer = parse_protocol_family(argv[0], optarg);
			break;
		case 'O':
			cfg_l3_extra = parse_protocol_family(argv[0], optarg);
			break;
		case 'R':
			cfg_only_rx = true;
			break;
		case 's':
			/* outer source (family defaults to v6 before -o) */
			if (cfg_l3_outer == AF_INET)
				parse_addr4(&out_saddr4, optarg);
			else
				parse_addr6(&out_saddr6, optarg);
			break;
		case 'S':
			/* inner source (family defaults to v6 before -i) */
			if (cfg_l3_inner == AF_INET)
				parse_addr4(&in_saddr4, optarg);
			else
				parse_addr6(&in_saddr6, optarg);
			break;
		case 't':
			cfg_num_secs = strtol(optarg, NULL, 0);
			break;
		case 'T':
			cfg_only_tx = true;
			break;
		case 'x':
			cfg_dsfield_outer = strtol(optarg, NULL, 0);
			break;
		case 'X':
			cfg_dsfield_inner = strtol(optarg, NULL, 0);
			break;
		}
	}

	if (cfg_only_rx && cfg_only_tx)
		error(1, 0, "options: cannot combine rx-only and tx-only");

	if (cfg_encap_proto && cfg_l3_outer == AF_UNSPEC)
		error(1, 0, "options: must specify outer with encap");
	else if ((!cfg_encap_proto) && cfg_l3_outer != AF_UNSPEC)
		error(1, 0, "options: cannot combine no-encap and outer");
	else if ((!cfg_encap_proto) && cfg_l3_extra != AF_UNSPEC)
		error(1, 0, "options: cannot combine no-encap and extra");

	if (cfg_l3_inner == AF_UNSPEC)
		cfg_l3_inner = AF_INET6;
	if (cfg_l3_inner == AF_INET6 && cfg_encap_proto == IPPROTO_IPIP)
		cfg_encap_proto = IPPROTO_IPV6;

	/* RFC 6040 4.2:
	 *   on decap, if outer encountered congestion (CE == 0x3),
	 *   but inner cannot encode ECN (NoECT == 0x0), then drop packet.
	 */
	if (((cfg_dsfield_outer & 0x3) == 0x3) &&
	    ((cfg_dsfield_inner & 0x3) == 0x0))
		cfg_expect_failure = true;
}
/* Echo the effective configuration (inner/outer/extra addresses and the
 * encap protocol) to stderr before the run starts.
 */
static void print_opts(void)
{
	if (cfg_l3_inner == PF_INET6) {
		util_printaddr("inner.dest6", (void *) &in_daddr6);
		util_printaddr("inner.source6", (void *) &in_saddr6);
	} else {
		util_printaddr("inner.dest4", (void *) &in_daddr4);
		util_printaddr("inner.source4", (void *) &in_saddr4);
	}

	if (!cfg_l3_outer)
		return;
	fprintf(stderr, "encap proto: %u\n", cfg_encap_proto);

	if (cfg_l3_outer == PF_INET6) {
		util_printaddr("outer.dest6", (void *) &out_daddr6);
		util_printaddr("outer.source6", (void *) &out_saddr6);
	} else {
		util_printaddr("outer.dest4", (void *) &out_daddr4);
		util_printaddr("outer.source4", (void *) &out_saddr4);
	}

	if (!cfg_l3_extra)
		return;

	/* Fix: select the family of the extra prefix by cfg_l3_extra, not
	 * cfg_l3_outer (copy-paste from the outer block above). Mixing a
	 * v6 outer with a v4 extra prefix previously printed the wrong
	 * sockaddrs.
	 */
	if (cfg_l3_extra == PF_INET6) {
		util_printaddr("extra.dest6", (void *) &extra_daddr6);
		util_printaddr("extra.source6", (void *) &extra_saddr6);
	} else {
		util_printaddr("extra.dest4", (void *) &extra_daddr4);
		util_printaddr("extra.source4", (void *) &extra_saddr4);
	}
}
/* Entry point: parse configuration, echo it, then run the tx/rx loop
 * and propagate its pass/fail result as the exit status.
 */
int main(int argc, char **argv)
{
	parse_opts(argc, argv);
	print_opts();
	return do_main();
}
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Load BPF flow dissector and verify it correctly dissects traffic
export TESTNAME=test_flow_dissector
unmount=0
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
msg="skip all tests:"
if [ $UID != 0 ]; then
echo $msg please run this as root >&2
exit $ksft_skip
fi
# This test needs to be run in a network namespace with in_netns.sh. Check if
# this is the case and run it with in_netns.sh if it is being run in the root
# namespace.
if [[ -z $(ip netns identify $$) ]]; then
../net/in_netns.sh "$0" "$@"
exit $?
fi
# Determine selftest success via shell exit code
# Trap handler: report PASS/FAIL from the script's exit status, then do a
# best-effort cleanup of the tc hooks and the dissector attachment, and
# unmount bpffs if this script was the one that mounted it.
# NOTE: the $? test must stay the first command so it reads the status
# that triggered the trap.
exit_handler()
{
	if (( $? == 0 )); then
		echo "selftests: $TESTNAME [PASS]";
	else
		echo "selftests: $TESTNAME [FAILED]";
	fi

	# cleanup must not abort on individual failures
	set +e

	# Cleanup
	tc filter del dev lo ingress pref 1337 2> /dev/null
	tc qdisc del dev lo ingress 2> /dev/null
	./flow_dissector_load -d 2> /dev/null
	if [ $unmount -ne 0 ]; then
		umount bpffs 2> /dev/null
	fi
}
# Exit script immediately (well caught by trap handler) if any
# program/thing exits with a non-zero status.
set -e

# (Use 'trap -l' to list meaning of numbers)
trap exit_handler 0 2 3 6 9
# Mount BPF file system
if /bin/mount | grep /sys/fs/bpf > /dev/null; then
echo "bpffs already mounted"
else
echo "bpffs not mounted. Mounting..."
unmount=1
/bin/mount bpffs /sys/fs/bpf -t bpf
fi
# Attach BPF program
./flow_dissector_load -p bpf_flow.o -s dissect
# Setup
tc qdisc add dev lo ingress
echo "Testing IPv4..."
# Drops all IP/UDP packets coming from port 9
tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \
udp src_port 9 action drop
# Send 10 IPv4/UDP packets from port 8. Filter should not drop any.
./test_flow_dissector -i 4 -f 8
# Send 10 IPv4/UDP packets from port 9. Filter should drop all.
./test_flow_dissector -i 4 -f 9 -F
# Send 10 IPv4/UDP packets from port 10. Filter should not drop any.
./test_flow_dissector -i 4 -f 10
echo "Testing IPIP..."
# Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any.
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
-D 192.168.0.1 -S 1.1.1.1 -f 8
# Send 10 IPv4/IPv4/UDP packets from port 9. Filter should drop all.
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
-D 192.168.0.1 -S 1.1.1.1 -f 9 -F
# Send 10 IPv4/IPv4/UDP packets from port 10. Filter should not drop any.
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
-D 192.168.0.1 -S 1.1.1.1 -f 10
echo "Testing IPv4 + GRE..."
# Send 10 IPv4/GRE/IPv4/UDP packets from port 8. Filter should not drop any.
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
-D 192.168.0.1 -S 1.1.1.1 -f 8
# Send 10 IPv4/GRE/IPv4/UDP packets from port 9. Filter should drop all.
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
-D 192.168.0.1 -S 1.1.1.1 -f 9 -F
# Send 10 IPv4/GRE/IPv4/UDP packets from port 10. Filter should not drop any.
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
-D 192.168.0.1 -S 1.1.1.1 -f 10
tc filter del dev lo ingress pref 1337
echo "Testing IPv6..."
# Drops all IPv6/UDP packets coming from port 9
tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \
udp src_port 9 action drop
# Send 10 IPv6/UDP packets from port 8. Filter should not drop any.
./test_flow_dissector -i 6 -f 8
# Send 10 IPv6/UDP packets from port 9. Filter should drop all.
./test_flow_dissector -i 6 -f 9 -F
# Send 10 IPv6/UDP packets from port 10. Filter should not drop any.
./test_flow_dissector -i 6 -f 10
exit 0
...@@ -112,13 +112,13 @@ static void test_pkt_access(void) ...@@ -112,13 +112,13 @@ static void test_pkt_access(void)
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4), err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
NULL, NULL, &retval, &duration); NULL, NULL, &retval, &duration);
CHECK(err || errno || retval, "ipv4", CHECK(err || retval, "ipv4",
"err %d errno %d retval %d duration %d\n", "err %d errno %d retval %d duration %d\n",
err, errno, retval, duration); err, errno, retval, duration);
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v6, sizeof(pkt_v6), err = bpf_prog_test_run(prog_fd, 100000, &pkt_v6, sizeof(pkt_v6),
NULL, NULL, &retval, &duration); NULL, NULL, &retval, &duration);
CHECK(err || errno || retval, "ipv6", CHECK(err || retval, "ipv6",
"err %d errno %d retval %d duration %d\n", "err %d errno %d retval %d duration %d\n",
err, errno, retval, duration); err, errno, retval, duration);
bpf_object__close(obj); bpf_object__close(obj);
...@@ -153,14 +153,14 @@ static void test_xdp(void) ...@@ -153,14 +153,14 @@ static void test_xdp(void)
err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != XDP_TX || size != 74 || CHECK(err || retval != XDP_TX || size != 74 ||
iph->protocol != IPPROTO_IPIP, "ipv4", iph->protocol != IPPROTO_IPIP, "ipv4",
"err %d errno %d retval %d size %d\n", "err %d errno %d retval %d size %d\n",
err, errno, retval, size); err, errno, retval, size);
err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != XDP_TX || size != 114 || CHECK(err || retval != XDP_TX || size != 114 ||
iph6->nexthdr != IPPROTO_IPV6, "ipv6", iph6->nexthdr != IPPROTO_IPV6, "ipv6",
"err %d errno %d retval %d size %d\n", "err %d errno %d retval %d size %d\n",
err, errno, retval, size); err, errno, retval, size);
...@@ -185,13 +185,13 @@ static void test_xdp_adjust_tail(void) ...@@ -185,13 +185,13 @@ static void test_xdp_adjust_tail(void)
err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != XDP_DROP, CHECK(err || retval != XDP_DROP,
"ipv4", "err %d errno %d retval %d size %d\n", "ipv4", "err %d errno %d retval %d size %d\n",
err, errno, retval, size); err, errno, retval, size);
err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != XDP_TX || size != 54, CHECK(err || retval != XDP_TX || size != 54,
"ipv6", "err %d errno %d retval %d size %d\n", "ipv6", "err %d errno %d retval %d size %d\n",
err, errno, retval, size); err, errno, retval, size);
bpf_object__close(obj); bpf_object__close(obj);
...@@ -254,14 +254,14 @@ static void test_l4lb(const char *file) ...@@ -254,14 +254,14 @@ static void test_l4lb(const char *file)
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 || CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 ||
*magic != MAGIC_VAL, "ipv4", *magic != MAGIC_VAL, "ipv4",
"err %d errno %d retval %d size %d magic %x\n", "err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic); err, errno, retval, size, *magic);
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 || CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 ||
*magic != MAGIC_VAL, "ipv6", *magic != MAGIC_VAL, "ipv6",
"err %d errno %d retval %d size %d magic %x\n", "err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic); err, errno, retval, size, *magic);
...@@ -343,14 +343,14 @@ static void test_xdp_noinline(void) ...@@ -343,14 +343,14 @@ static void test_xdp_noinline(void)
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != 1 || size != 54 || CHECK(err || retval != 1 || size != 54 ||
*magic != MAGIC_VAL, "ipv4", *magic != MAGIC_VAL, "ipv4",
"err %d errno %d retval %d size %d magic %x\n", "err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic); err, errno, retval, size, *magic);
err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration); buf, &size, &retval, &duration);
CHECK(err || errno || retval != 1 || size != 74 || CHECK(err || retval != 1 || size != 74 ||
*magic != MAGIC_VAL, "ipv6", *magic != MAGIC_VAL, "ipv6",
"err %d errno %d retval %d size %d magic %x\n", "err %d errno %d retval %d size %d magic %x\n",
err, errno, retval, size, *magic); err, errno, retval, size, *magic);
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# add private ipv4 and ipv6 addresses to loopback
readonly V6_INNER='100::a/128'
readonly V4_INNER='192.168.0.1/32'
if getopts ":s" opt; then
readonly SIT_DEV_NAME='sixtofourtest0'
readonly V6_SIT='2::/64'
readonly V4_SIT='172.17.0.1/32'
shift
fi
# Print an error message to stderr and abort the whole script.
fail() {
	echo "error: $*" 1>&2
	exit 1
}
# Add the private v4/v6 test addresses to loopback and, when -s was
# given (V6_SIT set), create and bring up a SIT device with its own
# v4/v6 addresses. Aborts via fail() on any error.
setup() {
	ip -6 addr add "${V6_INNER}" dev lo || fail 'failed to setup v6 address'
	ip -4 addr add "${V4_INNER}" dev lo || fail 'failed to setup v4 address'

	if [[ -n "${V6_SIT}" ]]; then
		ip link add "${SIT_DEV_NAME}" type sit remote any local any \
			|| fail 'failed to add sit'
		ip link set dev "${SIT_DEV_NAME}" up \
			|| fail 'failed to bring sit device up'
		ip -6 addr add "${V6_SIT}" dev "${SIT_DEV_NAME}" \
			|| fail 'failed to setup v6 SIT address'
		ip -4 addr add "${V4_SIT}" dev "${SIT_DEV_NAME}" \
			|| fail 'failed to setup v4 SIT address'
	fi

	sleep 2 # avoid race causing bind to fail
}
# Undo setup(): remove the SIT device and its addresses (if created),
# then the loopback addresses. Runs from the EXIT trap.
cleanup() {
	if [[ -n "${V6_SIT}" ]]; then
		ip -4 addr del "${V4_SIT}" dev "${SIT_DEV_NAME}"
		ip -6 addr del "${V6_SIT}" dev "${SIT_DEV_NAME}"
		ip link del "${SIT_DEV_NAME}"
	fi

	ip -4 addr del "${V4_INNER}" dev lo
	ip -6 addr del "${V6_INNER}" dev lo
}
trap cleanup EXIT
setup
"$@"
exit "$?"
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# setup tunnels for flow dissection test
readonly SUFFIX="test_$(mktemp -u XXXX)"
CONFIG="remote 127.0.0.2 local 127.0.0.1 dev lo"
# Create ipip/gre/sit tunnel devices over loopback (unique per-run
# suffix), list them for the log, then bring them up.
setup() {
	ip link add "ipip_${SUFFIX}" type ipip ${CONFIG}
	ip link add "gre_${SUFFIX}" type gre ${CONFIG}
	ip link add "sit_${SUFFIX}" type sit ${CONFIG}

	echo "tunnels before test:"
	ip tunnel show

	ip link set "ipip_${SUFFIX}" up
	ip link set "gre_${SUFFIX}" up
	ip link set "sit_${SUFFIX}" up
}
# Tear down the tunnels created by setup() and list what remains.
# Runs from the EXIT trap.
cleanup() {
	ip tunnel del "ipip_${SUFFIX}"
	ip tunnel del "gre_${SUFFIX}"
	ip tunnel del "sit_${SUFFIX}"

	echo "tunnels after test:"
	ip tunnel show
}
trap cleanup EXIT
setup
"$@"
exit "$?"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment