Commit 9d6f4177 authored by Daniel Borkmann

Merge branch 'bpf-reuseport-map'

Martin KaFai Lau says:

====================
This series introduces a new map type "BPF_MAP_TYPE_REUSEPORT_SOCKARRAY"
and a new prog type BPF_PROG_TYPE_SK_REUSEPORT.

Here is a snippet from a commit message:

"To unleash the full potential of a bpf prog, it is essential for the
userspace to be capable of directly setting up a bpf map which can then
be consumed by the bpf prog to make a decision.  In this case, to decide
which SO_REUSEPORT sk should serve the incoming request.

By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control
and visibility on where a SO_REUSEPORT sk should be located in a bpf map.
The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that
the bpf prog can directly select a sk from the bpf map.  That will
raise the programmability of the bpf prog attached to a reuseport
group (a group of sk serving the same IP:PORT).

For example, in UDP, the bpf prog can peek into the payload (e.g.
through the "data" pointer introduced in the later patch) to learn
the application level's connection information and then decide which sk
to pick from a bpf map.  The userspace can tightly couple the sk's location
in a bpf map with the application logic in generating the UDP payload's
connection information.  This connection info contract/API stays within the
userspace.

Also, when used with map-in-map, the userspace can switch the
old-server-process's inner map to a new-server-process's inner map
in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)".
The bpf prog will then direct incoming requests to the new process instead
of the old process.  The old process can finish draining the pending
requests (e.g. by "accept()") before closing the old-fds.  [Note that
deleting a fd from a bpf map does not necessarily mean the fd is closed]"
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 74b247f4 91134d84
......@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
}
struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
int array_map_alloc_check(union bpf_attr *attr);
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
......@@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map)
}
#endif
#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
void bpf_sk_reuseport_detach(struct sock *sk);
int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
void *value);
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags);
#else
static inline void bpf_sk_reuseport_detach(struct sock *sk)
{
}
#ifdef CONFIG_BPF_SYSCALL
static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
void *key, void *value)
{
return -EOPNOTSUPP;
}
static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
void *key, void *value,
u64 map_flags)
{
return -EOPNOTSUPP;
}
#endif /* CONFIG_BPF_SYSCALL */
#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
/* verifier prototypes for helper functions called from eBPF programs */
extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
......
......@@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
#ifdef CONFIG_BPF_LIRC_MODE2
BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
#endif
#ifdef CONFIG_INET
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
......@@ -60,4 +63,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
#if defined(CONFIG_XDP_SOCKETS)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
#ifdef CONFIG_INET
BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
#endif
#endif
......@@ -32,6 +32,7 @@ struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
struct xdp_buff;
struct sock_reuseport;
/* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
......@@ -752,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
unsigned int len);
......@@ -833,6 +835,20 @@ void bpf_warn_invalid_xdp_action(u32 act);
struct sock *do_sk_redirect_map(struct sk_buff *skb);
struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
#ifdef CONFIG_INET
/*
 * Run the BPF_PROG_TYPE_SK_REUSEPORT program "prog" for "skb" and return
 * the socket it selected.  reuseport_select_sock() falls back to
 * hash-based selection when the result is NULL.
 * NOTE(review): the socket lookup paths also test the final result with
 * IS_ERR(), so an ERR_PTR() return looks possible -- confirm against the
 * implementation.
 */
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb,
u32 hash);
#else
/* Without CONFIG_INET no program is run and no socket is ever selected. */
static inline struct sock *
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb,
u32 hash)
{
return NULL;
}
#endif
#ifdef CONFIG_BPF_JIT
extern int bpf_jit_enable;
extern int bpf_jit_harden;
......
......@@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
u32 banned_flags);
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
bool match_wildcard);
bool inet_rcv_saddr_any(const struct sock *sk);
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
......
......@@ -5,25 +5,36 @@
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <net/sock.h>
extern spinlock_t reuseport_lock;
/*
 * State shared by all sockets of one SO_REUSEPORT group.  Each member
 * socket refers to it through sk->sk_reuseport_cb.
 */
struct sock_reuseport {
struct rcu_head rcu; /* used by reuseport_free_rcu() to free this struct */
u16 max_socks; /* length of socks */
u16 num_socks; /* elements in socks */
/* The last synq overflow event timestamp of this
 * reuse->socks[] group.
 */
unsigned int synq_overflow_ts;
/* ID stays the same even after the size of socks[] grows.
 * 0 means no ID has been assigned yet (see reuseport_get_id()).
 */
unsigned int reuseport_id;
/* Set from the bind_inany argument of reuseport_alloc(); the bind/hash
 * paths pass inet_rcv_saddr_any(sk) for it.
 */
bool bind_inany;
struct bpf_prog __rcu *prog; /* optional BPF sock selector */
struct sock *socks[0]; /* array of sock pointers */
};
extern int reuseport_alloc(struct sock *sk);
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2);
extern int reuseport_alloc(struct sock *sk, bool bind_inany);
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
bool bind_inany);
extern void reuseport_detach_sock(struct sock *sk);
extern struct sock *reuseport_select_sock(struct sock *sk,
u32 hash,
struct sk_buff *skb,
int hdr_len);
extern struct bpf_prog *reuseport_attach_prog(struct sock *sk,
struct bpf_prog *prog);
extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
int reuseport_get_id(struct sock_reuseport *reuse);
#endif /* _SOCK_REUSEPORT_H */
......@@ -36,6 +36,7 @@
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock_reuseport.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
......@@ -473,9 +474,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
*/
static inline void tcp_synq_overflow(const struct sock *sk)
{
unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
unsigned int last_overflow;
unsigned int now = jiffies;
if (sk->sk_reuseport) {
struct sock_reuseport *reuse;
reuse = rcu_dereference(sk->sk_reuseport_cb);
if (likely(reuse)) {
last_overflow = READ_ONCE(reuse->synq_overflow_ts);
if (time_after32(now, last_overflow + HZ))
WRITE_ONCE(reuse->synq_overflow_ts, now);
return;
}
}
last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
if (time_after32(now, last_overflow + HZ))
tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
}
......@@ -483,9 +497,21 @@ static inline void tcp_synq_overflow(const struct sock *sk)
/* syncookies: no recent synqueue overflow on this listening socket? */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
unsigned int last_overflow;
unsigned int now = jiffies;
if (sk->sk_reuseport) {
struct sock_reuseport *reuse;
reuse = rcu_dereference(sk->sk_reuseport_cb);
if (likely(reuse)) {
last_overflow = READ_ONCE(reuse->synq_overflow_ts);
return time_after32(now, last_overflow +
TCP_SYNCOOKIE_VALID);
}
}
last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID);
}
......
......@@ -126,6 +126,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
};
enum bpf_prog_type {
......@@ -150,6 +151,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
BPF_PROG_TYPE_SK_REUSEPORT,
};
enum bpf_attach_type {
......@@ -2113,6 +2115,14 @@ union bpf_attr {
* the shared data.
* Return
* Pointer to the local storage area.
*
* int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
* Description
* Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
* It checks that the selected sk matches the incoming
* request in the skb.
* Return
* 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2196,7 +2206,8 @@ union bpf_attr {
FN(rc_keydown), \
FN(skb_cgroup_id), \
FN(get_current_cgroup_id), \
FN(get_local_storage),
FN(get_local_storage), \
FN(sk_select_reuseport),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2413,6 +2424,30 @@ struct sk_msg_md {
__u32 local_port; /* stored in host byte order */
};
/*
 * Metadata about an incoming request to a SO_REUSEPORT group.  A pointer
 * to this struct is the first argument of bpf_sk_select_reuseport().
 */
struct sk_reuseport_md {
/*
* Start of directly accessible data. It begins from
* the tcp/udp header.
*/
void *data;
void *data_end; /* End of directly accessible data */
/*
* Total length of packet (starting from the tcp/udp header).
* Note that the directly accessible bytes (data_end - data)
* could be less than this "len". Those bytes could be
* indirectly read by a helper "bpf_skb_load_bytes()".
*/
__u32 len;
/*
* Eth protocol in the mac header (network byte order). e.g.
* ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
*/
__u32 eth_protocol;
__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany; /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
};
#define BPF_TAG_SIZE 8
struct bpf_prog_info {
......
......@@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
endif
......@@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
}
/* Called from syscall */
static int array_map_alloc_check(union bpf_attr *attr)
int array_map_alloc_check(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2018 Facebook
*/
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/sock_diag.h>
#include <net/sock_reuseport.h>
/*
 * A reuseport sockarray map: the generic bpf_map followed by one
 * RCU-protected sock pointer per map entry.
 */
struct reuseport_array {
/* Must stay the first member: reuseport_array() casts a bpf_map pointer to this struct. */
struct bpf_map map;
/* max_entries slots, zero-initialized at allocation; NULL means the slot is empty. */
struct sock __rcu *ptrs[];
};
/*
 * Convert a generic bpf_map pointer into the reuseport_array embedding it.
 * This is a plain cast, so it relies on "map" being the first member of
 * struct reuseport_array.
 */
static struct reuseport_array *reuseport_array(struct bpf_map *map)
{
	struct reuseport_array *array = (struct reuseport_array *)map;

	return array;
}
/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
socks = sk->sk_user_data;
if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
* Do not move this NULL assignment outside of
* sk->sk_callback_lock because there is
* a race with reuseport_array_free()
* which does not hold the reuseport_lock.
*/
RCU_INIT_POINTER(*socks, NULL);
}
write_unlock_bh(&sk->sk_callback_lock);
}
/*
 * Validate the creation attributes: the value must be a u32 or a u64,
 * everything else is checked by the generic array map code.
 */
static int reuseport_array_alloc_check(union bpf_attr *attr)
{
	switch (attr->value_size) {
	case sizeof(u32):
	case sizeof(u64):
		return array_map_alloc_check(attr);
	default:
		return -EINVAL;
	}
}
static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = *(u32 *)key;
if (unlikely(index >= array->map.max_entries))
return NULL;
return rcu_dereference(array->ptrs[index]);
}
/*
 * Called from syscall only.
 *
 * Empty the slot at "key".  Returns -E2BIG for an out-of-range index and
 * -ENOENT when the slot holds no socket.
 */
static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
	struct reuseport_array *array = reuseport_array(map);
	u32 slot = *(u32 *)key;
	int err = -ENOENT;
	struct sock *sk;

	if (slot >= map->max_entries)
		return -E2BIG;

	/* Lockless peek so that an empty slot never takes reuseport_lock */
	if (!rcu_access_pointer(array->ptrs[slot]))
		return -ENOENT;

	spin_lock_bh(&reuseport_lock);

	/* Re-read under the lock: the slot may have been emptied meanwhile */
	sk = rcu_dereference_protected(array->ptrs[slot],
				       lockdep_is_held(&reuseport_lock));
	if (sk) {
		write_lock_bh(&sk->sk_callback_lock);
		WRITE_ONCE(sk->sk_user_data, NULL);
		RCU_INIT_POINTER(array->ptrs[slot], NULL);
		write_unlock_bh(&sk->sk_callback_lock);
		err = 0;
	}

	spin_unlock_bh(&reuseport_lock);

	return err;
}
/*
 * Detach every socket still stored in the map and release the map memory.
 */
static void reuseport_array_free(struct bpf_map *map)
{
	struct reuseport_array *array = reuseport_array(map);
	struct sock *sk;
	u32 idx;

	synchronize_rcu();

	/*
	 * From here on ops->map_*_elem() can no longer reach this array,
	 * so the only possible racer is bpf_sk_reuseport_detach(), which
	 * is triggered by close() or disconnect().
	 *
	 * Both sides remove the sk from "array" and it does not matter
	 * which one gets there first.  What matters is that
	 * bpf_sk_reuseport_detach() must not touch "array" once it has
	 * been freed.  It reaches "array" only through sk->sk_user_data
	 * and only with sk->sk_callback_lock held.  That is sufficient
	 * because "array" is not freed before every sk->sk_user_data has
	 * stopped referencing it.
	 *
	 * For that reason "reuseport_lock" is not needed here.
	 */

	/* reuseport_lock is not taken, so sk is accessed under rcu_read_lock() */
	rcu_read_lock();
	for (idx = 0; idx < map->max_entries; idx++) {
		sk = rcu_dereference(array->ptrs[idx]);
		if (!sk)
			continue;

		write_lock_bh(&sk->sk_callback_lock);
		/*
		 * A plain store is enough, WRITE_ONCE() is not needed:
		 * at this point nobody reads sk_user_data without
		 * holding sk->sk_callback_lock.
		 */
		sk->sk_user_data = NULL;
		write_unlock_bh(&sk->sk_callback_lock);

		RCU_INIT_POINTER(array->ptrs[idx], NULL);
	}
	rcu_read_unlock();

	/*
	 * No sk->sk_user_data references this "array" any more,
	 * so it can be freed now.
	 */
	bpf_map_area_free(array);
}
/*
 * Allocate a reuseport array sized for attr->max_entries sockets.
 *
 * Requires CAP_SYS_ADMIN.  Returns the embedded bpf_map on success or an
 * ERR_PTR() carrying -EPERM, -ENOMEM or the memlock precharge error.
 */
static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct reuseport_array *array;
	u64 array_size, cost;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	/* header plus one sock pointer per entry */
	array_size = sizeof(*array) +
		     (u64)attr->max_entries * sizeof(struct sock *);

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	/* the memlock charge is counted in whole pages */
	cost = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	err = bpf_map_precharge_memlock(cost);
	if (err)
		return ERR_PTR(err);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size, numa_node);
	if (!array)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	array->map.pages = cost;

	return &array->map;
}
/*
 * Syscall-side lookup: report the socket cookie of the sk stored at "key".
 *
 * The cookie is a u64, so maps created with a smaller value size get
 * -ENOSPC.  Returns -ENOENT when no socket is found at "key".
 */
int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	int err = -ENOENT;
	struct sock *sk;

	if (map->value_size != sizeof(u64))
		return -ENOSPC;

	rcu_read_lock();
	sk = reuseport_array_lookup_elem(map, key);
	if (sk) {
		*(u64 *)value = sock_gen_cookie(sk);
		err = 0;
	}
	rcu_read_unlock();

	return err;
}
/*
 * Decide whether "nsk" may replace "osk" (the current occupant of the
 * slot, possibly NULL) under the given update flags.
 *
 * Returns 0 when the update is allowed, otherwise -EEXIST, -ENOENT,
 * -ENOTSUPP, -EINVAL or -EBUSY.
 */
static int
reuseport_array_update_check(const struct reuseport_array *array,
			     const struct sock *nsk,
			     const struct sock *osk,
			     const struct sock_reuseport *nsk_reuse,
			     u32 map_flags)
{
	/* BPF_NOEXIST needs an empty slot, BPF_EXIST an occupied one */
	if (map_flags == BPF_NOEXIST && osk)
		return -EEXIST;

	if (map_flags == BPF_EXIST && !osk)
		return -ENOENT;

	/* Only TCP or UDP, IPv4 or IPv6, stream or datagram sockets */
	if (!(nsk->sk_protocol == IPPROTO_UDP ||
	      nsk->sk_protocol == IPPROTO_TCP) ||
	    !(nsk->sk_family == AF_INET || nsk->sk_family == AF_INET6) ||
	    !(nsk->sk_type == SOCK_STREAM || nsk->sk_type == SOCK_DGRAM))
		return -ENOTSUPP;

	/*
	 * The sk has to be hashed already, which means listening for
	 * TCP or bound for UDP, and it has to be a SO_REUSEPORT sk,
	 * i.e. its reuse pointer cannot be NULL.
	 *
	 * The sk is later used in a bpf helper that is protected by
	 * rcu_read_lock().
	 */
	if (!(sock_flag(nsk, SOCK_RCU_FREE) && sk_hashed(nsk) && nsk_reuse))
		return -EINVAL;

	/* READ_ONCE because the sk->sk_callback_lock may not be held here */
	return READ_ONCE(nsk->sk_user_data) ? -EBUSY : 0;
}
/*
 * Store the socket referred to by a file descriptor into slot "key".
 *
 * Called from syscall only.
 * "value" holds the fd, read as a u64 or as an int depending on
 * map->value_size.
 * The "nsk" is reached through the file obtained by sockfd_lookup(),
 * which is released by the fput() at the end of this function.
 * The "osk" and "reuse" are protected by reuseport_lock.
 *
 * Returns 0 on success or a negative errno.
 */
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct reuseport_array *array = reuseport_array(map);
struct sock *free_osk = NULL, *osk, *nsk;
struct sock_reuseport *reuse;
u32 index = *(u32 *)key;
struct socket *socket;
int err, fd;
/* Flag values above BPF_EXIST are rejected */
if (map_flags > BPF_EXIST)
return -EINVAL;
if (index >= map->max_entries)
return -E2BIG;
if (map->value_size == sizeof(u64)) {
u64 fd64 = *(u64 *)value;
/* The value must fit in a non-negative int before it is narrowed */
if (fd64 > S32_MAX)
return -EINVAL;
fd = fd64;
} else {
fd = *(int *)value;
}
socket = sockfd_lookup(fd, &err);
if (!socket)
return err;
nsk = socket->sk;
if (!nsk) {
err = -EINVAL;
goto put_file;
}
/* Quick checks before taking reuseport_lock */
err = reuseport_array_update_check(array, nsk,
rcu_access_pointer(array->ptrs[index]),
rcu_access_pointer(nsk->sk_reuseport_cb),
map_flags);
if (err)
goto put_file;
spin_lock_bh(&reuseport_lock);
/*
* Some of the checks only need reuseport_lock,
* but they are all done under sk_callback_lock as
* well, for simplicity.
*/
write_lock_bh(&nsk->sk_callback_lock);
osk = rcu_dereference_protected(array->ptrs[index],
lockdep_is_held(&reuseport_lock));
reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
/* Repeat the checks now that the values are stable under the locks */
err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
if (err)
goto put_file_unlock;
/* Ensure reuse->reuseport_id is set */
err = reuseport_get_id(reuse);
if (err < 0)
goto put_file_unlock;
/* nsk remembers its slot so that bpf_sk_reuseport_detach() can clear it */
WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]);
rcu_assign_pointer(array->ptrs[index], nsk);
free_osk = osk;
/* reuseport_get_id() returned the id on success, so reset err */
err = 0;
put_file_unlock:
write_unlock_bh(&nsk->sk_callback_lock);
/* The replaced socket, if any, no longer belongs to this slot */
if (free_osk) {
write_lock_bh(&free_osk->sk_callback_lock);
WRITE_ONCE(free_osk->sk_user_data, NULL);
write_unlock_bh(&free_osk->sk_callback_lock);
}
spin_unlock_bh(&reuseport_lock);
put_file:
fput(socket->file);
return err;
}
/* Called from syscall */
static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
void *next_key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
*next = 0;
return 0;
}
if (index == array->map.max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
/*
 * Operations of BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.
 *
 * No .map_update_elem is set here.
 */
const struct bpf_map_ops reuseport_array_ops = {
	/* creation and destruction */
	.map_alloc_check	= reuseport_array_alloc_check,
	.map_alloc		= reuseport_array_alloc,
	.map_free		= reuseport_array_free,
	/* element access */
	.map_lookup_elem	= reuseport_array_lookup_elem,
	.map_delete_elem	= reuseport_array_delete_elem,
	.map_get_next_key	= reuseport_array_get_next_key,
};
......@@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
err = bpf_fd_htab_map_lookup_elem(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
......@@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
attr->flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* rcu_read_lock() is not needed */
err = bpf_fd_reuseport_array_update_elem(map, key, value,
attr->flags);
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
......
......@@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_REUSEPORT:
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
......@@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_msg_redirect_hash)
goto error;
break;
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
if (func_id != BPF_FUNC_sk_select_reuseport)
goto error;
break;
default:
break;
}
......@@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
goto error;
break;
case BPF_FUNC_sk_select_reuseport:
if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
goto error;
break;
default:
break;
}
......
This diff is collapsed.
......@@ -8,11 +8,34 @@
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
static DEFINE_SPINLOCK(reuseport_lock);
DEFINE_SPINLOCK(reuseport_lock);
#define REUSEPORT_MIN_ID 1
static DEFINE_IDA(reuseport_ida);
/*
 * Return the ID of the reuseport group, allocating one on first use.
 *
 * IDs start at REUSEPORT_MIN_ID, so a stored ID of 0 means "not assigned
 * yet".  Returns the ID, or a negative value when the allocation fails.
 */
int reuseport_get_id(struct sock_reuseport *reuse)
{
	int id;

	if (!reuse->reuseport_id) {
		/* Called under reuseport_lock, hence GFP_ATOMIC */
		id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
				    GFP_ATOMIC);
		if (id < 0)
			return id;

		reuse->reuseport_id = id;
	}

	return reuse->reuseport_id;
}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
......@@ -29,7 +52,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
return reuse;
}
int reuseport_alloc(struct sock *sk)
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
......@@ -41,9 +64,17 @@ int reuseport_alloc(struct sock *sk)
/* Allocation attempts can occur concurrently via the setsockopt path
* and the bind/hash path. Nothing to do when we lose the race.
*/
if (rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock)))
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
if (reuse) {
/* Only set reuse->bind_inany if the bind_inany is true.
* Otherwise, it will overwrite the reuse->bind_inany
* which was set by the bind/hash path.
*/
if (bind_inany)
reuse->bind_inany = bind_inany;
goto out;
}
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
......@@ -53,6 +84,7 @@ int reuseport_alloc(struct sock *sk)
reuse->socks[0] = sk;
reuse->num_socks = 1;
reuse->bind_inany = bind_inany;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
......@@ -78,9 +110,12 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
for (i = 0; i < reuse->num_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
......@@ -99,8 +134,9 @@ static void reuseport_free_rcu(struct rcu_head *head)
struct sock_reuseport *reuse;
reuse = container_of(head, struct sock_reuseport, rcu);
if (reuse->prog)
bpf_prog_destroy(reuse->prog);
sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
if (reuse->reuseport_id)
ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
......@@ -110,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head)
* @sk2: Socket belonging to the existing reuseport group.
* May return ENOMEM and not add socket to group under memory pressure.
*/
int reuseport_add_sock(struct sock *sk, struct sock *sk2)
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
struct sock_reuseport *old_reuse, *reuse;
if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
int err = reuseport_alloc(sk2);
int err = reuseport_alloc(sk2, bind_inany);
if (err)
return err;
......@@ -160,6 +196,14 @@ void reuseport_detach_sock(struct sock *sk)
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
/* At least one of the sk in this reuseport group is added to
* a bpf map. Notify the bpf side. The bpf map logic will
* remove the sk if it is indeed added to a bpf map.
*/
if (reuse->reuseport_id)
bpf_sk_reuseport_detach(sk);
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
for (i = 0; i < reuse->num_socks; i++) {
......@@ -175,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk)
}
EXPORT_SYMBOL(reuseport_detach_sock);
static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
{
struct sk_buff *nskb = NULL;
u32 index;
......@@ -238,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk,
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
if (prog && skb)
sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
if (!prog || !skb)
goto select_by_hash;
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
else
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
if (!sk2)
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
......@@ -252,12 +302,21 @@ struct sock *reuseport_select_sock(struct sock *sk,
}
EXPORT_SYMBOL(reuseport_select_sock);
struct bpf_prog *
reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
if (sk_unhashed(sk) && sk->sk_reuseport) {
int err = reuseport_alloc(sk, false);
if (err)
return err;
} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
/* The socket wasn't bound with SO_REUSEPORT */
return -EINVAL;
}
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
......@@ -266,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
rcu_assign_pointer(reuse->prog, prog);
spin_unlock_bh(&reuseport_lock);
return old_prog;
sk_reuseport_prog_free(old_prog);
return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
......@@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);
/*
 * Tell whether "sk" is bound to the wildcard ("any") receive address.
 * AF_INET6 sockets are checked against their IPv6 receive address when
 * IPv6 is enabled; all other sockets use sk_rcv_saddr.
 */
bool inet_rcv_saddr_any(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
#endif

	return sk->sk_rcv_saddr == 0;
}
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
......
......@@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net,
saddr, sport, daddr, hnum,
dif, sdif);
if (result)
return result;
goto done;
/* Lookup lhash2 with INADDR_ANY */
......@@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net,
if (ilb2->count > ilb->count)
goto port_lookup;
return inet_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
dif, sdif);
result = inet_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
dif, sdif);
goto done;
port_lookup:
sk_for_each_rcu(sk, &ilb->head) {
......@@ -352,12 +353,15 @@ struct sock *__inet_lookup_listener(struct net *net,
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
return result;
goto done;
}
result = sk;
hiscore = score;
}
}
done:
if (unlikely(IS_ERR(result)))
return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
......@@ -567,10 +571,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
inet_csk(sk2)->icsk_bind_hash == tb &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false))
return reuseport_add_sock(sk, sk2);
return reuseport_add_sock(sk, sk2,
inet_rcv_saddr_any(sk));
}
return reuseport_alloc(sk);
return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
int __inet_hash(struct sock *sk, struct sock *osk)
......
......@@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false)) {
return reuseport_add_sock(sk, sk2);
return reuseport_add_sock(sk, sk2,
inet_rcv_saddr_any(sk));
}
}
return reuseport_alloc(sk);
return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
/**
......@@ -498,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
}
if (unlikely(IS_ERR(result)))
return NULL;
return result;
}
begin:
......@@ -512,6 +515,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (unlikely(IS_ERR(result)))
return NULL;
if (result)
return result;
}
......
......@@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net,
saddr, sport, daddr, hnum,
dif, sdif);
if (result)
return result;
goto done;
/* Lookup lhash2 with in6addr_any */
......@@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net,
if (ilb2->count > ilb->count)
goto port_lookup;
return inet6_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
dif, sdif);
result = inet6_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
dif, sdif);
goto done;
port_lookup:
sk_for_each(sk, &ilb->head) {
......@@ -214,12 +215,15 @@ struct sock *inet6_lookup_listener(struct net *net,
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
return result;
goto done;
}
result = sk;
hiscore = score;
}
}
done:
if (unlikely(IS_ERR(result)))
return NULL;
return result;
}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
......
......@@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
exact_dif, hslot2,
skb);
}
if (unlikely(IS_ERR(result)))
return NULL;
return result;
}
begin:
......@@ -249,6 +251,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (unlikely(IS_ERR(result)))
return NULL;
if (result)
return result;
}
......
......@@ -126,6 +126,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
};
enum bpf_prog_type {
......@@ -150,6 +151,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
BPF_PROG_TYPE_SK_REUSEPORT,
};
enum bpf_attach_type {
......@@ -2113,6 +2115,14 @@ union bpf_attr {
* the shared data.
* Return
* Pointer to the local storage area.
*
* int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
* Description
* Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
* It checks that the selected sk matches the incoming
* request in the skb.
* Return
* 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2196,7 +2206,8 @@ union bpf_attr {
FN(rc_keydown), \
FN(skb_cgroup_id), \
FN(get_current_cgroup_id), \
FN(get_local_storage),
FN(get_local_storage), \
FN(sk_select_reuseport),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2413,6 +2424,30 @@ struct sk_msg_md {
__u32 local_port; /* stored in host byte order */
};
/*
 * Context structure handed to a reuseport-selection BPF program (it is the
 * first argument of bpf_sk_select_reuseport()).  It describes the incoming
 * packet, starting at the transport (tcp/udp) header.
 */
struct sk_reuseport_md {
/*
* Start of directly accessible data. It begins from
* the tcp/udp header.
*/
void *data;
void *data_end; /* End of directly accessible data */
/*
* Total length of packet (starting from the tcp/udp header).
* Note that the directly accessible bytes (data_end - data)
* could be less than this "len". Those bytes could be
* indirectly read by a helper "bpf_skb_load_bytes()".
*/
__u32 len;
/*
* Eth protocol in the mac header (network byte order). e.g.
* ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
*/
__u32 eth_protocol;
__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany; /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
};
#define BPF_TAG_SIZE 8
struct bpf_prog_info {
......
......@@ -92,6 +92,7 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
attr.btf_key_type_id = create_attr->btf_key_type_id;
attr.btf_value_type_id = create_attr->btf_value_type_id;
attr.map_ifindex = create_attr->map_ifindex;
attr.inner_map_fd = create_attr->inner_map_fd;
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
......
......@@ -39,6 +39,7 @@ struct bpf_create_map_attr {
__u32 btf_key_type_id;
__u32 btf_value_type_id;
__u32 map_ifindex;
__u32 inner_map_fd;
};
int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
......
......@@ -1501,6 +1501,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_SK_REUSEPORT:
return false;
case BPF_PROG_TYPE_UNSPEC:
case BPF_PROG_TYPE_KPROBE:
......
......@@ -23,7 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
test_socket_cookie test_cgroup_storage
test_socket_cookie test_cgroup_storage test_select_reuseport
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \
......@@ -34,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
......
......@@ -111,6 +111,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
int size, int flags) =
(void *) BPF_FUNC_skb_get_xfrm_state;
static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) =
(void *) BPF_FUNC_sk_select_reuseport;
static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
(void *) BPF_FUNC_get_stack;
static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
......@@ -173,6 +175,8 @@ struct bpf_map_def {
static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) =
(void *) BPF_FUNC_skb_load_bytes;
static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) =
(void *) BPF_FUNC_skb_load_bytes_relative;
static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) =
(void *) BPF_FUNC_skb_store_bytes;
static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) =
......
......@@ -44,4 +44,8 @@ static inline unsigned int bpf_num_possible_cpus(void)
name[bpf_num_possible_cpus()]
#define bpf_percpu(name, cpu) name[(cpu)].v
/*
 * Number of elements in the array x.  Only meaningful when x is a real
 * array; the sizeof division gives a wrong answer for a pointer.
 * Guarded so that an earlier definition wins.
 */
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
#endif /* __BPF_UTIL__ */
......@@ -18,10 +18,7 @@
#include "../../../include/linux/filter.h"
#include "bpf_rlimit.h"
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
#include "bpf_util.h"
#define MAX_INSNS 512
#define MAX_MATCHES 16
......
......@@ -19,6 +19,7 @@
#include <bpf/btf.h>
#include "bpf_rlimit.h"
#include "bpf_util.h"
static uint32_t pass_cnt;
static uint32_t error_cnt;
......@@ -93,10 +94,6 @@ static int __base_pr(const char *format, ...)
#define MAX_NR_RAW_TYPES 1024
#define BTF_LOG_BUF_SIZE 65535
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
static struct args {
unsigned int raw_test_num;
unsigned int file_test_num;
......
......@@ -17,7 +17,8 @@
#include <stdlib.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
......@@ -26,8 +27,21 @@
#include "bpf_util.h"
#include "bpf_rlimit.h"
#ifndef ENOTSUPP
#define ENOTSUPP 524
#endif
static int map_flags;
/*
 * CHECK(condition, tag, format...) - abort the test run when condition holds.
 *
 * When condition is true, prints "<function>(<line>):FAIL:<tag> " followed
 * by the printf-style message built from format, then calls exit(-1).
 * When condition is false nothing is printed and execution continues.
 *
 * Relies on the GNU statement-expression and named variadic macro
 * parameter ("format...") extensions.
 */
#define CHECK(condition, tag, format...) ({ \
int __ret = !!(condition); \
if (__ret) { \
printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \
printf(format); \
exit(-1); \
} \
})
static void test_hashmap(int task, void *data)
{
long long key, next_key, first_key, value;
......@@ -1150,6 +1164,250 @@ static void test_map_wronly(void)
assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM);
}
/*
 * Build one SO_REUSEPORT group of @n IPv6 sockets of socket type @type.
 *
 * Every socket is bound to the wildcard address.  The first socket binds
 * with port 0 and getsockname() then stores the resulting address in s6, so
 * the remaining sockets bind to that same address and port.  SOCK_STREAM
 * sockets are additionally put into the listening state.
 *
 * Along the way, verify that slot 0 of the reuseport array @map_fd refuses
 * (EINVAL) a socket that has not been bound yet and, for SOCK_STREAM, a
 * bound socket that is not listening yet.
 *
 * @fds64 and @sk_cookies must each have room for @n entries; entry i
 * receives the fd and the SO_COOKIE value of the i-th socket.  Any failure
 * terminates the process through CHECK().
 */
static void prepare_reuseport_grp(int type, int map_fd,
				  __s64 *fds64, __u64 *sk_cookies,
				  unsigned int n)
{
	/*
	 * The designated initializer zeroes every member that is not named,
	 * so sin6_flowinfo and sin6_scope_id are well defined when the
	 * structure is handed to bind().
	 */
	struct sockaddr_in6 s6 = { .sin6_family = AF_INET6 };
	socklen_t optlen, addrlen;
	const __u32 index0 = 0;
	const int optval = 1;
	unsigned int i;
	__u64 sk_cookie;
	__s64 fd64;
	int err;

	s6.sin6_addr = in6addr_any;
	s6.sin6_port = 0;
	addrlen = sizeof(s6);
	optlen = sizeof(sk_cookie);

	for (i = 0; i < n; i++) {
		fd64 = socket(AF_INET6, type, 0);
		CHECK(fd64 == -1, "socket()",
		      "sock_type:%d fd64:%lld errno:%d\n",
		      type, fd64, errno);

		err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT,
				 &optval, sizeof(optval));
		CHECK(err == -1, "setsockopt(SO_REUSEPORT)",
		      "err:%d errno:%d\n", err, errno);

		/* reuseport_array does not allow unbound sk */
		err = bpf_map_update_elem(map_fd, &index0, &fd64,
					  BPF_ANY);
		CHECK(err != -1 || errno != EINVAL,
		      "reuseport array update unbound sk",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);

		err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6));
		CHECK(err == -1, "bind()",
		      "sock_type:%d err:%d errno:%d\n", type, err, errno);

		if (i == 0) {
			/*
			 * Remember the address (including the port) of the
			 * first socket so the following sockets reuse it.
			 */
			err = getsockname(fd64, (struct sockaddr *)&s6,
					  &addrlen);
			CHECK(err == -1, "getsockname()",
			      "sock_type:%d err:%d errno:%d\n",
			      type, err, errno);
		}

		err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie,
				 &optlen);
		CHECK(err == -1, "getsockopt(SO_COOKIE)",
		      "sock_type:%d err:%d errno:%d\n", type, err, errno);

		if (type == SOCK_STREAM) {
			/*
			 * reuseport_array does not allow
			 * non-listening tcp sk.
			 */
			err = bpf_map_update_elem(map_fd, &index0, &fd64,
						  BPF_ANY);
			CHECK(err != -1 || errno != EINVAL,
			      "reuseport array update non-listening sk",
			      "sock_type:%d err:%d errno:%d\n",
			      type, err, errno);
			err = listen(fd64, 0);
			CHECK(err == -1, "listen()",
			      "sock_type:%d, err:%d errno:%d\n",
			      type, err, errno);
		}

		fds64[i] = fd64;
		sk_cookies[i] = sk_cookie;
	}
}
/*
 * Exercise a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map from user space.
 *
 * With a map whose value size is 8 bytes, for both SOCK_STREAM and
 * SOCK_DGRAM reuseport groups:
 *   - out-of-range and empty-slot lookup/update/delete error codes,
 *   - BPF_EXIST / BPF_NOEXIST / BPF_ANY update semantics,
 *   - a socket already stored in the array cannot be stored again (EBUSY),
 *   - delete followed by re-add, and the looked-up value being the socket
 *     cookie,
 *   - the slot being emptied once the sockets are closed.
 * Then: a SOCK_RAW socket is rejected with ENOTSUPP, and with a map whose
 * value size is 4 bytes an update by 32 bit fd works while a lookup fails
 * with ENOSPC.
 *
 * Any unexpected result terminates the process through CHECK().
 */
static void test_reuseport_array(void)
{
/*
 * Pick the index (0 or 1) of the group socket to use for the next update.
 * After a failed update (err != 0) the socket at "last" is still outside
 * the map and can be tried again; after a successful one it is stored in
 * the map, so switch to the other socket.
 */
#define REUSEPORT_FD_IDX(err, last) ({ (err) ? (last) : !(last); })

	const __u32 array_size = 4, index0 = 0, index3 = 3;
	int types[2] = { SOCK_STREAM, SOCK_DGRAM }, type;
	__u64 grpa_cookies[2], sk_cookie, map_cookie;
	__s64 grpa_fds64[2] = { -1, -1 }, fd64 = -1;
	const __u32 bad_index = array_size;
	unsigned int t, f;
	__u32 fds_idx = 0;
	int map_fd, err;
	int fd;

	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
				sizeof(__u32), sizeof(__u64), array_size, 0);
	CHECK(map_fd == -1, "reuseport array create",
	      "map_fd:%d, errno:%d\n", map_fd, errno);

	/* Test lookup/update/delete with invalid index */
	err = bpf_map_delete_elem(map_fd, &bad_index);
	CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries",
	      "err:%d errno:%d\n", err, errno);

	err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY);
	CHECK(err != -1 || errno != E2BIG,
	      "reuseport array update >=max_entries",
	      "err:%d errno:%d\n", err, errno);

	err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie);
	CHECK(err != -1 || errno != ENOENT,
	      "reuseport array lookup >=max_entries",
	      "err:%d errno:%d\n", err, errno);

	/* Test lookup/delete non existence elem */
	err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
	CHECK(err != -1 || errno != ENOENT,
	      "reuseport array lookup not-exist elem",
	      "err:%d errno:%d\n", err, errno);
	err = bpf_map_delete_elem(map_fd, &index3);
	CHECK(err != -1 || errno != ENOENT,
	      "reuseport array del not-exist elem",
	      "err:%d errno:%d\n", err, errno);

	for (t = 0; t < ARRAY_SIZE(types); t++) {
		type = types[t];

		prepare_reuseport_grp(type, map_fd, grpa_fds64,
				      grpa_cookies, ARRAY_SIZE(grpa_fds64));

		/* Test BPF_* update flags */
		/* BPF_EXIST failure case */
		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
					  BPF_EXIST);
		CHECK(err != -1 || errno != ENOENT,
		      "reuseport array update empty elem BPF_EXIST",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);
		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);

		/* BPF_NOEXIST success case */
		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
					  BPF_NOEXIST);
		CHECK(err == -1,
		      "reuseport array update empty elem BPF_NOEXIST",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);
		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);

		/* BPF_EXIST success case. */
		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
					  BPF_EXIST);
		CHECK(err == -1,
		      "reuseport array update same elem BPF_EXIST",
		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);

		/* BPF_NOEXIST failure case */
		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
					  BPF_NOEXIST);
		CHECK(err != -1 || errno != EEXIST,
		      "reuseport array update non-empty elem BPF_NOEXIST",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);
		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);

		/* BPF_ANY case (always succeed) */
		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
					  BPF_ANY);
		CHECK(err == -1,
		      "reuseport array update same sk with BPF_ANY",
		      "sock_type:%d err:%d errno:%d\n", type, err, errno);

		/* This is the socket that now sits at index3 */
		fd64 = grpa_fds64[fds_idx];
		sk_cookie = grpa_cookies[fds_idx];

		/* The same sk cannot be added to reuseport_array twice */
		err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY);
		CHECK(err != -1 || errno != EBUSY,
		      "reuseport array update same sk with same index",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);

		err = bpf_map_update_elem(map_fd, &index0, &fd64, BPF_ANY);
		CHECK(err != -1 || errno != EBUSY,
		      "reuseport array update same sk with different index",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);

		/* Test delete elem */
		err = bpf_map_delete_elem(map_fd, &index3);
		CHECK(err == -1, "reuseport array delete sk",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);

		/* Add it back with BPF_NOEXIST */
		err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
		CHECK(err == -1,
		      "reuseport array re-add with BPF_NOEXIST after del",
		      "sock_type:%d err:%d errno:%d\n", type, err, errno);

		/* Test cookie */
		err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
		CHECK(err == -1 || sk_cookie != map_cookie,
		      "reuseport array lookup re-added sk",
		      "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llx\n",
		      type, err, errno, sk_cookie, map_cookie);

		/* Test elem removed by close() */
		for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++)
			close(grpa_fds64[f]);
		err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
		CHECK(err != -1 || errno != ENOENT,
		      "reuseport array lookup after close()",
		      "sock_type:%d err:%d errno:%d\n",
		      type, err, errno);
	}

	/* Test SOCK_RAW */
	fd64 = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP);
	/* Report the socket() result itself, not the stale err from above */
	CHECK(fd64 == -1, "socket(SOCK_RAW)", "fd64:%lld errno:%d\n",
	      fd64, errno);
	err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
	CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW",
	      "err:%d errno:%d\n", err, errno);
	close(fd64);

	/* Close the 64 bit value map */
	close(map_fd);

	/* Test 32 bit fd */
	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
				sizeof(__u32), sizeof(__u32), array_size, 0);
	CHECK(map_fd == -1, "reuseport array create",
	      "map_fd:%d, errno:%d\n", map_fd, errno);
	prepare_reuseport_grp(SOCK_STREAM, map_fd, &fd64, &sk_cookie, 1);
	fd = fd64;
	err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST);
	CHECK(err == -1, "reuseport array update 32 bit fd",
	      "err:%d errno:%d\n", err, errno);
	/* The test expects a lookup on the 4-byte-value map to fail with ENOSPC */
	err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
	CHECK(err != -1 || errno != ENOSPC,
	      "reuseport array lookup 32 bit fd",
	      "err:%d errno:%d\n", err, errno);
	close(fd);
	close(map_fd);
}
static void run_all_tests(void)
{
test_hashmap(0, NULL);
......@@ -1170,6 +1428,8 @@ static void run_all_tests(void)
test_map_rdonly();
test_map_wronly();
test_reuseport_array();
}
int main(void)
......
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2018 Facebook */
#ifndef __TEST_SELECT_REUSEPORT_COMMON_H
#define __TEST_SELECT_REUSEPORT_COMMON_H
#include <linux/types.h>
/*
 * Outcome recorded by the test BPF program for each packet it handles.
 * The value is used as the index into result_map, whose size is NR_RESULTS.
 *
 * The order matters: the program returns SK_DROP for every value below
 * PASS and SK_PASS for PASS and everything after it.
 */
enum result {
DROP_ERR_INNER_MAP,
DROP_ERR_SKB_DATA,
DROP_ERR_SK_SELECT_REUSEPORT,
DROP_MISC,
PASS,
PASS_ERR_SK_SELECT_REUSEPORT,
NR_RESULTS,
};
/*
 * Command carried in the packet payload, directly after the tcp/udp header,
 * and read by the test BPF program.
 */
struct cmd {
/* Index into the reuseport array of the socket to select */
__u32 reuseport_index;
/*
 * When bpf_sk_select_reuseport() fails: non-zero makes the program
 * still pass the packet (PASS_ERR_SK_SELECT_REUSEPORT), zero makes it
 * drop the packet (DROP_ERR_SK_SELECT_REUSEPORT).
 */
__u32 pass_on_failure;
};
/*
 * What the test BPF program observed for the last packet.  The program
 * fills it from struct sk_reuseport_md and from the packet headers and
 * stores it in data_check_map.
 */
struct data_check {
__u32 ip_protocol;
/*
 * Source and destination addresses as read from the network header:
 * 8 bytes for IPv4, 32 bytes for IPv6.
 */
__u32 skb_addrs[8];
/* [0] is the source port, [1] the destination port, as found in the header */
__u16 skb_ports[2];
__u16 eth_protocol;
__u8 bind_inany;
/*
 * Zero-length marker.  NOTE(review): presumably the user space side
 * compares everything before this member for exact equality and
 * handles len/hash separately -- confirm against the test runner.
 */
__u8 equal_check_end[0];
__u32 len;
__u32 hash;
};
#endif
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */
#include <stdlib.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <linux/if_ether.h>
#include "bpf_endian.h"
#include "bpf_helpers.h"
#include "test_select_reuseport_common.h"
int _version SEC("version") = 1;
#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif
/*
 * Single-slot array of maps.  The program looks up slot 0 to obtain the
 * inner map that it passes to bpf_sk_select_reuseport().
 */
struct bpf_map_def SEC("maps") outer_map = {
.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = 1,
};
/* One counter per enum result value; bumped once per program run. */
struct bpf_map_def SEC("maps") result_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = NR_RESULTS,
};
/*
 * One-shot index override.  When slot 0 holds a value other than -1 the
 * program uses it instead of cmd->reuseport_index and resets it to -1.
 */
struct bpf_map_def SEC("maps") tmp_index_ovr_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(int),
.max_entries = 1,
};
/* Source line of the GOTO_DONE() that ended the last program run. */
struct bpf_map_def SEC("maps") linum_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = 1,
};
/* struct data_check collected during the last program run. */
struct bpf_map_def SEC("maps") data_check_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct data_check),
.max_entries = 1,
};
/*
 * Finish the program with the given result: store _result in the local
 * variable "result", store the current source line in the local variable
 * "linum", and jump to the "done" label.  The enclosing function must
 * provide all three.
 */
#define GOTO_DONE(_result) ({ \
result = (_result); \
linum = __LINE__; \
goto done; \
})
/*
 * Test program: pick the reuseport socket named by a struct cmd that the
 * sender placed in the payload right after the tcp/udp header.
 *
 * While doing so it records what it saw (struct data_check), which
 * GOTO_DONE() ended the run (line number) and a per-result counter, each in
 * its own map.
 *
 * Returns SK_PASS when the final result is PASS or
 * PASS_ERR_SK_SELECT_REUSEPORT, SK_DROP otherwise (including when the
 * result counter cannot be looked up).
 */
SEC("select_by_skb_data")
int _select_by_skb_data(struct sk_reuseport_md *reuse_md)
{
__u32 linum, index = 0, flags = 0, index_zero = 0;
__u32 *result_cnt, *linum_value;
struct data_check data_check = {};
struct cmd *cmd, cmd_copy;
void *data, *data_end;
void *reuseport_array;
enum result result;
int *index_ovr;
int err;
data = reuse_md->data;
data_end = reuse_md->data_end;
/* Copy the context fields; they are written to data_check_map at "done" */
data_check.len = reuse_md->len;
data_check.eth_protocol = reuse_md->eth_protocol;
data_check.ip_protocol = reuse_md->ip_protocol;
data_check.hash = reuse_md->hash;
data_check.bind_inany = reuse_md->bind_inany;
/*
 * Fetch the source and destination addresses from the network header:
 * 8 bytes starting at saddr for IPv4.  Anything that is not ETH_P_IP is
 * read with the IPv6 layout (32 bytes starting at saddr).
 */
if (data_check.eth_protocol == bpf_htons(ETH_P_IP)) {
if (bpf_skb_load_bytes_relative(reuse_md,
offsetof(struct iphdr, saddr),
data_check.skb_addrs, 8,
BPF_HDR_START_NET))
GOTO_DONE(DROP_MISC);
} else {
if (bpf_skb_load_bytes_relative(reuse_md,
offsetof(struct ipv6hdr, saddr),
data_check.skb_addrs, 32,
BPF_HDR_START_NET))
GOTO_DONE(DROP_MISC);
}
/*
* The ip_protocol could be a compile time decision
* if the bpf_prog.o is dedicated to either TCP or
* UDP.
*
* Otherwise, reuse_md->ip_protocol or
* the protocol field in the iphdr can be used.
*/
if (data_check.ip_protocol == IPPROTO_TCP) {
struct tcphdr *th = data;
/* The fixed part of the tcp header must be directly readable */
if (th + 1 > data_end)
GOTO_DONE(DROP_MISC);
data_check.skb_ports[0] = th->source;
data_check.skb_ports[1] = th->dest;
/* The cmd sits after the tcp header, whose length is doff * 4 bytes */
if ((th->doff << 2) + sizeof(*cmd) > data_check.len)
GOTO_DONE(DROP_ERR_SKB_DATA);
/* For tcp the cmd is always copied out through the helper */
if (bpf_skb_load_bytes(reuse_md, th->doff << 2, &cmd_copy,
sizeof(cmd_copy)))
GOTO_DONE(DROP_MISC);
cmd = &cmd_copy;
} else if (data_check.ip_protocol == IPPROTO_UDP) {
struct udphdr *uh = data;
/* The udp header must be directly readable */
if (uh + 1 > data_end)
GOTO_DONE(DROP_MISC);
data_check.skb_ports[0] = uh->source;
data_check.skb_ports[1] = uh->dest;
if (sizeof(struct udphdr) + sizeof(*cmd) > data_check.len)
GOTO_DONE(DROP_ERR_SKB_DATA);
/*
 * Read the cmd in place when it lies inside the directly accessible
 * area, otherwise copy it out through the helper.
 */
if (data + sizeof(struct udphdr) + sizeof(*cmd) > data_end) {
if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr),
&cmd_copy, sizeof(cmd_copy)))
GOTO_DONE(DROP_MISC);
cmd = &cmd_copy;
} else {
cmd = data + sizeof(struct udphdr);
}
} else {
/* Neither tcp nor udp */
GOTO_DONE(DROP_MISC);
}
/* Slot 0 of outer_map yields the map to select the socket from */
reuseport_array = bpf_map_lookup_elem(&outer_map, &index_zero);
if (!reuseport_array)
GOTO_DONE(DROP_ERR_INNER_MAP);
index = cmd->reuseport_index;
/* A value other than -1 overrides the index once and is then reset */
index_ovr = bpf_map_lookup_elem(&tmp_index_ovr_map, &index_zero);
if (!index_ovr)
GOTO_DONE(DROP_MISC);
if (*index_ovr != -1) {
index = *index_ovr;
*index_ovr = -1;
}
err = bpf_sk_select_reuseport(reuse_md, reuseport_array, &index,
flags);
if (!err)
GOTO_DONE(PASS);
/* Selection failed: the cmd decides between passing and dropping */
if (cmd->pass_on_failure)
GOTO_DONE(PASS_ERR_SK_SELECT_REUSEPORT);
else
GOTO_DONE(DROP_ERR_SK_SELECT_REUSEPORT);
done:
/* Without a counter slot nothing is recorded and the packet is dropped */
result_cnt = bpf_map_lookup_elem(&result_map, &result);
if (!result_cnt)
return SK_DROP;
bpf_map_update_elem(&linum_map, &index_zero, &linum, BPF_ANY);
bpf_map_update_elem(&data_check_map, &index_zero, &data_check, BPF_ANY);
(*result_cnt)++;
/* enum result is ordered so that every DROP_* value sorts before PASS */
return result < PASS ? SK_DROP : SK_PASS;
}
char _license[] SEC("license") = "GPL";
......@@ -14,10 +14,7 @@
#include "cgroup_helpers.h"
#include "bpf_rlimit.h"
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
#include "bpf_util.h"
#define CG_PATH "/foo"
#define MAX_INSNS 512
......
......@@ -20,15 +20,12 @@
#include "cgroup_helpers.h"
#include "bpf_rlimit.h"
#include "bpf_util.h"
#ifndef ENOTSUPP
# define ENOTSUPP 524
#endif
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
#define CG_PATH "/foo"
#define CONNECT4_PROG_PATH "./connect4_prog.o"
#define CONNECT6_PROG_PATH "./connect6_prog.o"
......
......@@ -42,12 +42,9 @@
#endif
#include "bpf_rlimit.h"
#include "bpf_rand.h"
#include "bpf_util.h"
#include "../../../include/linux/filter.h"
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
#define MAX_INSNS BPF_MAXINSNS
#define MAX_FIXUPS 8
#define MAX_NR_MAPS 8
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment