Commit 271b955e authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2018-07-01

The following pull-request contains BPF updates for your *net* tree.

The main changes are:

1) A bpf_fib_lookup() helper fix to change the API before freeze to
   return an encoding of the FIB lookup result and return the nexthop
   device index in the params struct (instead of device index as return
   code that we had before), from David.

2) Various BPF JIT fixes to address syzkaller fallout, that is, do not
   reject progs when set_memory_*() fails since it could still be RO.
   Also arm32 JIT was not using bpf_jit_binary_lock_ro() API which was
   an issue, and a memory leak in s390 JIT found during review, from
   Daniel.

3) Multiple fixes for sockmap/hash to address most of the syzkaller
   triggered bugs. Usage with IPv6 was crashing, a GPF in bpf_tcp_close(),
   a missing sock_map_release() routine to hook up to callbacks, and a
   fix for an omitted bucket lock in sock_close(), from John.

4) Two bpftool fixes to remove duplicated error message on program load,
   and another one to close the libbpf object after program load. One
   additional fix for nfp driver's BPF offload to avoid stopping offload
   completely if replace of program failed, from Jakub.

5) Couple of BPF selftest fixes that bail out in some of the test
   scripts if the user does not have the right privileges, from Jeffrin.

6) Fixes in test_bpf for s390 when CONFIG_BPF_JIT_ALWAYS_ON is set
   where we need to set the flag that some of the test cases are expected
   to fail, from Kleber.

7) Fix to detangle BPF_LIRC_MODE2 dependency from CONFIG_CGROUP_BPF
   since it has no relation to it and lirc2 users often have configs
   without cgroups enabled and thus would not be able to use it, from Sean.

8) Fix a selftest failure in sockmap by removing a useless setrlimit()
   call that would set a too low limit where at the same time we are
   already including bpf_rlimit.h that does the job, from Yonghong.

9) Fix BPF selftest config with missing NET_SCHED, from Anders.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 35e8c7ba bf2b866a
...@@ -1844,7 +1844,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ...@@ -1844,7 +1844,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
/* there are 2 passes here */ /* there are 2 passes here */
bpf_jit_dump(prog->len, image_size, 2, ctx.target); bpf_jit_dump(prog->len, image_size, 2, ctx.target);
set_memory_ro((unsigned long)header, header->pages); bpf_jit_binary_lock_ro(header);
prog->bpf_func = (void *)ctx.target; prog->bpf_func = (void *)ctx.target;
prog->jited = 1; prog->jited = 1;
prog->jited_len = image_size; prog->jited_len = image_size;
......
...@@ -1286,6 +1286,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) ...@@ -1286,6 +1286,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
goto free_addrs; goto free_addrs;
} }
if (bpf_jit_prog(&jit, fp)) { if (bpf_jit_prog(&jit, fp)) {
bpf_jit_binary_free(header);
fp = orig_fp; fp = orig_fp;
goto free_addrs; goto free_addrs;
} }
......
...@@ -207,29 +207,19 @@ void lirc_bpf_free(struct rc_dev *rcdev) ...@@ -207,29 +207,19 @@ void lirc_bpf_free(struct rc_dev *rcdev)
bpf_prog_array_free(rcdev->raw->progs); bpf_prog_array_free(rcdev->raw->progs);
} }
int lirc_prog_attach(const union bpf_attr *attr) int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{ {
struct bpf_prog *prog;
struct rc_dev *rcdev; struct rc_dev *rcdev;
int ret; int ret;
if (attr->attach_flags) if (attr->attach_flags)
return -EINVAL; return -EINVAL;
prog = bpf_prog_get_type(attr->attach_bpf_fd,
BPF_PROG_TYPE_LIRC_MODE2);
if (IS_ERR(prog))
return PTR_ERR(prog);
rcdev = rc_dev_get_from_fd(attr->target_fd); rcdev = rc_dev_get_from_fd(attr->target_fd);
if (IS_ERR(rcdev)) { if (IS_ERR(rcdev))
bpf_prog_put(prog);
return PTR_ERR(rcdev); return PTR_ERR(rcdev);
}
ret = lirc_bpf_attach(rcdev, prog); ret = lirc_bpf_attach(rcdev, prog);
if (ret)
bpf_prog_put(prog);
put_device(&rcdev->dev); put_device(&rcdev->dev);
......
...@@ -81,10 +81,10 @@ nfp_bpf_xdp_offload(struct nfp_app *app, struct nfp_net *nn, ...@@ -81,10 +81,10 @@ nfp_bpf_xdp_offload(struct nfp_app *app, struct nfp_net *nn,
ret = nfp_net_bpf_offload(nn, prog, running, extack); ret = nfp_net_bpf_offload(nn, prog, running, extack);
/* Stop offload if replace not possible */ /* Stop offload if replace not possible */
if (ret && prog) if (ret)
nfp_bpf_xdp_offload(app, nn, NULL, extack); return ret;
nn->dp.bpf_offload_xdp = prog && !ret; nn->dp.bpf_offload_xdp = !!prog;
return ret; return ret;
} }
......
...@@ -188,12 +188,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, ...@@ -188,12 +188,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
\ \
__ret; \ __ret; \
}) })
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog);
int cgroup_bpf_prog_detach(const union bpf_attr *attr,
enum bpf_prog_type ptype);
int cgroup_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr);
#else #else
struct bpf_prog;
struct cgroup_bpf {}; struct cgroup_bpf {};
static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype,
struct bpf_prog *prog)
{
return -EINVAL;
}
static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr,
enum bpf_prog_type ptype)
{
return -EINVAL;
}
static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
return -EINVAL;
}
#define cgroup_bpf_enabled (0) #define cgroup_bpf_enabled (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
......
...@@ -696,6 +696,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) ...@@ -696,6 +696,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
int sockmap_get_from_fd(const union bpf_attr *attr, int type,
struct bpf_prog *prog);
#else #else
static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{ {
...@@ -714,6 +716,12 @@ static inline int sock_map_prog(struct bpf_map *map, ...@@ -714,6 +716,12 @@ static inline int sock_map_prog(struct bpf_map *map,
{ {
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type,
struct bpf_prog *prog)
{
return -EINVAL;
}
#endif #endif
#if defined(CONFIG_XDP_SOCKETS) #if defined(CONFIG_XDP_SOCKETS)
......
...@@ -5,11 +5,12 @@ ...@@ -5,11 +5,12 @@
#include <uapi/linux/bpf.h> #include <uapi/linux/bpf.h>
#ifdef CONFIG_BPF_LIRC_MODE2 #ifdef CONFIG_BPF_LIRC_MODE2
int lirc_prog_attach(const union bpf_attr *attr); int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int lirc_prog_detach(const union bpf_attr *attr); int lirc_prog_detach(const union bpf_attr *attr);
int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
#else #else
static inline int lirc_prog_attach(const union bpf_attr *attr) static inline int lirc_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog)
{ {
return -EINVAL; return -EINVAL;
} }
......
...@@ -470,9 +470,7 @@ struct sock_fprog_kern { ...@@ -470,9 +470,7 @@ struct sock_fprog_kern {
}; };
struct bpf_binary_header { struct bpf_binary_header {
u16 pages; u32 pages;
u16 locked:1;
/* Some arches need word alignment for their instructions */ /* Some arches need word alignment for their instructions */
u8 image[] __aligned(4); u8 image[] __aligned(4);
}; };
...@@ -481,7 +479,7 @@ struct bpf_prog { ...@@ -481,7 +479,7 @@ struct bpf_prog {
u16 pages; /* Number of allocated pages */ u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */ u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */ jit_requested:1,/* archs need to JIT the prog */
locked:1, /* Program image locked? */ undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
gpl_compatible:1, /* Is filter GPL compatible? */ gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */ cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */ dst_needed:1, /* Do we need dst entry? */
...@@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) ...@@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
{ {
#ifdef CONFIG_ARCH_HAS_SET_MEMORY fp->undo_set_mem = 1;
fp->locked = 1; set_memory_ro((unsigned long)fp, fp->pages);
if (set_memory_ro((unsigned long)fp, fp->pages))
fp->locked = 0;
#endif
} }
static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
{ {
#ifdef CONFIG_ARCH_HAS_SET_MEMORY if (fp->undo_set_mem)
if (fp->locked) { set_memory_rw((unsigned long)fp, fp->pages);
WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
/* In case set_memory_rw() fails, we want to be the first
* to crash here instead of some random place later on.
*/
fp->locked = 0;
}
#endif
} }
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{ {
#ifdef CONFIG_ARCH_HAS_SET_MEMORY set_memory_ro((unsigned long)hdr, hdr->pages);
hdr->locked = 1;
if (set_memory_ro((unsigned long)hdr, hdr->pages))
hdr->locked = 0;
#endif
} }
static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
{ {
#ifdef CONFIG_ARCH_HAS_SET_MEMORY set_memory_rw((unsigned long)hdr, hdr->pages);
if (hdr->locked) {
WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages));
/* In case set_memory_rw() fails, we want to be the first
* to crash here instead of some random place later on.
*/
hdr->locked = 0;
}
#endif
} }
static inline struct bpf_binary_header * static inline struct bpf_binary_header *
...@@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp) ...@@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp)
return (void *)addr; return (void *)addr;
} }
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp)
{
if (!fp->locked)
return -ENOLCK;
if (fp->jited) {
const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
if (!hdr->locked)
return -ENOLCK;
}
return 0;
}
#endif
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
static inline int sk_filter(struct sock *sk, struct sk_buff *skb) static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{ {
......
...@@ -1857,7 +1857,8 @@ union bpf_attr { ...@@ -1857,7 +1857,8 @@ union bpf_attr {
* is resolved), the nexthop address is returned in ipv4_dst * is resolved), the nexthop address is returned in ipv4_dst
* or ipv6_dst based on family, smac is set to mac address of * or ipv6_dst based on family, smac is set to mac address of
* egress device, dmac is set to nexthop mac address, rt_metric * egress device, dmac is set to nexthop mac address, rt_metric
* is set to metric from route (IPv4/IPv6 only). * is set to metric from route (IPv4/IPv6 only), and ifindex
* is set to the device index of the nexthop from the FIB lookup.
* *
* *plen* argument is the size of the passed in struct. * *plen* argument is the size of the passed in struct.
* *flags* argument can be a combination of one or more of the * *flags* argument can be a combination of one or more of the
...@@ -1873,9 +1874,10 @@ union bpf_attr { ...@@ -1873,9 +1874,10 @@ union bpf_attr {
* *ctx* is either **struct xdp_md** for XDP programs or * *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs. * **struct sk_buff** tc cls_act programs.
* Return * Return
* Egress device index on success, 0 if packet needs to continue * * < 0 if any input argument is invalid
* up the stack for further processing or a negative error in case * * 0 on success (packet is forwarded, nexthop neighbor exists)
* of failure. * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
* * packet is not forwarded or needs assist from full stack
* *
* int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
* Description * Description
...@@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args { ...@@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args {
#define BPF_FIB_LOOKUP_DIRECT BIT(0) #define BPF_FIB_LOOKUP_DIRECT BIT(0)
#define BPF_FIB_LOOKUP_OUTPUT BIT(1) #define BPF_FIB_LOOKUP_OUTPUT BIT(1)
enum {
BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */
BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */
BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */
BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */
BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */
BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */
BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */
BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */
};
struct bpf_fib_lookup { struct bpf_fib_lookup {
/* input: network family for lookup (AF_INET, AF_INET6) /* input: network family for lookup (AF_INET, AF_INET6)
* output: network family of egress nexthop * output: network family of egress nexthop
...@@ -2625,7 +2639,11 @@ struct bpf_fib_lookup { ...@@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
/* total length of packet from network header - used for MTU check */ /* total length of packet from network header - used for MTU check */
__u16 tot_len; __u16 tot_len;
__u32 ifindex; /* L3 device index for lookup */
/* input: L3 device index for lookup
* output: device index from FIB lookup
*/
__u32 ifindex;
union { union {
/* inputs to lookup */ /* inputs to lookup */
......
...@@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, ...@@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret; return ret;
} }
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
struct cgroup *cgrp;
int ret;
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
attr->attach_flags);
cgroup_put(cgrp);
return ret;
}
int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
struct bpf_prog *prog;
struct cgroup *cgrp;
int ret;
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
prog = NULL;
ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
if (prog)
bpf_prog_put(prog);
cgroup_put(cgrp);
return ret;
}
int cgroup_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct cgroup *cgrp;
int ret;
cgrp = cgroup_get_from_fd(attr->query.target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
ret = cgroup_bpf_query(cgrp, attr, uattr);
cgroup_put(cgrp);
return ret;
}
/** /**
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic * @sk: The socket sending or receiving traffic
......
...@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, ...@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_fill_ill_insns(hdr, size); bpf_fill_ill_insns(hdr, size);
hdr->pages = size / PAGE_SIZE; hdr->pages = size / PAGE_SIZE;
hdr->locked = 0;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr)); PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1); start = (get_random_int() % hole) & ~(alignment - 1);
...@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) ...@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
return 0; return 0;
} }
static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
{
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
int i, err;
for (i = 0; i < fp->aux->func_cnt; i++) {
err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
if (err)
return err;
}
return bpf_prog_check_pages_ro_single(fp);
#endif
return 0;
}
static void bpf_prog_select_func(struct bpf_prog *fp) static void bpf_prog_select_func(struct bpf_prog *fp)
{ {
#ifndef CONFIG_BPF_JIT_ALWAYS_ON #ifndef CONFIG_BPF_JIT_ALWAYS_ON
...@@ -1524,17 +1506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) ...@@ -1524,17 +1506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* all eBPF JITs might immediately support all features. * all eBPF JITs might immediately support all features.
*/ */
*err = bpf_check_tail_call(fp); *err = bpf_check_tail_call(fp);
if (*err)
return fp;
/* Checkpoint: at this point onwards any cBPF -> eBPF or
* native eBPF program is read-only. If we failed to change
* the page attributes (e.g. allocation failure from
* splitting large pages), then reject the whole program
* in order to guarantee not ending up with any W+X pages
* from BPF side in kernel.
*/
*err = bpf_prog_check_pages_ro_locked(fp);
return fp; return fp;
} }
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
......
...@@ -72,6 +72,7 @@ struct bpf_htab { ...@@ -72,6 +72,7 @@ struct bpf_htab {
u32 n_buckets; u32 n_buckets;
u32 elem_size; u32 elem_size;
struct bpf_sock_progs progs; struct bpf_sock_progs progs;
struct rcu_head rcu;
}; };
struct htab_elem { struct htab_elem {
...@@ -89,8 +90,8 @@ enum smap_psock_state { ...@@ -89,8 +90,8 @@ enum smap_psock_state {
struct smap_psock_map_entry { struct smap_psock_map_entry {
struct list_head list; struct list_head list;
struct sock **entry; struct sock **entry;
struct htab_elem *hash_link; struct htab_elem __rcu *hash_link;
struct bpf_htab *htab; struct bpf_htab __rcu *htab;
}; };
struct smap_psock { struct smap_psock {
...@@ -120,6 +121,7 @@ struct smap_psock { ...@@ -120,6 +121,7 @@ struct smap_psock {
struct bpf_prog *bpf_parse; struct bpf_prog *bpf_parse;
struct bpf_prog *bpf_verdict; struct bpf_prog *bpf_verdict;
struct list_head maps; struct list_head maps;
spinlock_t maps_lock;
/* Back reference used when sock callback trigger sockmap operations */ /* Back reference used when sock callback trigger sockmap operations */
struct sock *sock; struct sock *sock;
...@@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
static int bpf_tcp_sendpage(struct sock *sk, struct page *page, static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags); int offset, size_t size, int flags);
static void bpf_tcp_close(struct sock *sk, long timeout);
static inline struct smap_psock *smap_psock_sk(const struct sock *sk) static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{ {
...@@ -161,7 +164,42 @@ static bool bpf_tcp_stream_read(const struct sock *sk) ...@@ -161,7 +164,42 @@ static bool bpf_tcp_stream_read(const struct sock *sk)
return !empty; return !empty;
} }
static struct proto tcp_bpf_proto; enum {
SOCKMAP_IPV4,
SOCKMAP_IPV6,
SOCKMAP_NUM_PROTS,
};
enum {
SOCKMAP_BASE,
SOCKMAP_TX,
SOCKMAP_NUM_CONFIGS,
};
static struct proto *saved_tcpv6_prot __read_mostly;
static DEFINE_SPINLOCK(tcpv6_prot_lock);
static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
struct proto *base)
{
prot[SOCKMAP_BASE] = *base;
prot[SOCKMAP_BASE].close = bpf_tcp_close;
prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
prot[SOCKMAP_TX] = prot[SOCKMAP_BASE];
prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg;
prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage;
}
static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
{
int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
sk->sk_prot = &bpf_tcp_prots[family][conf];
}
static int bpf_tcp_init(struct sock *sk) static int bpf_tcp_init(struct sock *sk)
{ {
struct smap_psock *psock; struct smap_psock *psock;
...@@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk) ...@@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk)
psock->save_close = sk->sk_prot->close; psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot; psock->sk_proto = sk->sk_prot;
if (psock->bpf_tx_msg) { /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; if (sk->sk_family == AF_INET6 &&
tcp_bpf_proto.sendpage = bpf_tcp_sendpage; unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; spin_lock_bh(&tcpv6_prot_lock);
tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; if (likely(sk->sk_prot != saved_tcpv6_prot)) {
build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
} }
spin_unlock_bh(&tcpv6_prot_lock);
sk->sk_prot = &tcp_bpf_proto; }
update_sk_prot(sk, psock);
rcu_read_unlock(); rcu_read_unlock();
return 0; return 0;
} }
...@@ -219,16 +260,54 @@ static void bpf_tcp_release(struct sock *sk) ...@@ -219,16 +260,54 @@ static void bpf_tcp_release(struct sock *sk)
rcu_read_unlock(); rcu_read_unlock();
} }
static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
u32 hash, void *key, u32 key_size)
{
struct htab_elem *l;
hlist_for_each_entry_rcu(l, head, hash_node) {
if (l->hash == hash && !memcmp(&l->key, key, key_size))
return l;
}
return NULL;
}
static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
return &__select_bucket(htab, hash)->head;
}
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{ {
atomic_dec(&htab->count); atomic_dec(&htab->count);
kfree_rcu(l, rcu); kfree_rcu(l, rcu);
} }
static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
struct smap_psock *psock)
{
struct smap_psock_map_entry *e;
spin_lock_bh(&psock->maps_lock);
e = list_first_entry_or_null(&psock->maps,
struct smap_psock_map_entry,
list);
if (e)
list_del(&e->list);
spin_unlock_bh(&psock->maps_lock);
return e;
}
static void bpf_tcp_close(struct sock *sk, long timeout) static void bpf_tcp_close(struct sock *sk, long timeout)
{ {
void (*close_fun)(struct sock *sk, long timeout); void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e, *tmp; struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp; struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock; struct smap_psock *psock;
struct sock *osk; struct sock *osk;
...@@ -247,7 +326,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout) ...@@ -247,7 +326,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
*/ */
close_fun = psock->save_close; close_fun = psock->save_close;
write_lock_bh(&sk->sk_callback_lock);
if (psock->cork) { if (psock->cork) {
free_start_sg(psock->sock, psock->cork); free_start_sg(psock->sock, psock->cork);
kfree(psock->cork); kfree(psock->cork);
...@@ -260,20 +338,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout) ...@@ -260,20 +338,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(md); kfree(md);
} }
list_for_each_entry_safe(e, tmp, &psock->maps, list) { e = psock_map_pop(sk, psock);
while (e) {
if (e->entry) { if (e->entry) {
osk = cmpxchg(e->entry, sk, NULL); osk = cmpxchg(e->entry, sk, NULL);
if (osk == sk) { if (osk == sk) {
list_del(&e->list);
smap_release_sock(psock, sk); smap_release_sock(psock, sk);
} }
} else { } else {
hlist_del_rcu(&e->hash_link->hash_node); struct htab_elem *link = rcu_dereference(e->hash_link);
smap_release_sock(psock, e->hash_link->sk); struct bpf_htab *htab = rcu_dereference(e->htab);
free_htab_elem(e->htab, e->hash_link); struct hlist_head *head;
struct htab_elem *l;
struct bucket *b;
b = __select_bucket(htab, link->hash);
head = &b->head;
raw_spin_lock_bh(&b->lock);
l = lookup_elem_raw(head,
link->hash, link->key,
htab->map.key_size);
/* If another thread deleted this object skip deletion.
* The refcnt on psock may or may not be zero.
*/
if (l) {
hlist_del_rcu(&link->hash_node);
smap_release_sock(psock, link->sk);
free_htab_elem(htab, link);
} }
raw_spin_unlock_bh(&b->lock);
}
e = psock_map_pop(sk, psock);
} }
write_unlock_bh(&sk->sk_callback_lock);
rcu_read_unlock(); rcu_read_unlock();
close_fun(sk, timeout); close_fun(sk, timeout);
} }
...@@ -1111,8 +1207,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, ...@@ -1111,8 +1207,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
static int bpf_tcp_ulp_register(void) static int bpf_tcp_ulp_register(void)
{ {
tcp_bpf_proto = tcp_prot; build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
tcp_bpf_proto.close = bpf_tcp_close;
/* Once BPF TX ULP is registered it is never unregistered. It /* Once BPF TX ULP is registered it is never unregistered. It
* will be in the ULP list for the lifetime of the system. Doing * will be in the ULP list for the lifetime of the system. Doing
* duplicate registers is not a problem. * duplicate registers is not a problem.
...@@ -1357,7 +1452,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) ...@@ -1357,7 +1452,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{ {
if (refcount_dec_and_test(&psock->refcnt)) { if (refcount_dec_and_test(&psock->refcnt)) {
tcp_cleanup_ulp(sock); tcp_cleanup_ulp(sock);
write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock); smap_stop_sock(psock, sock);
write_unlock_bh(&sock->sk_callback_lock);
clear_bit(SMAP_TX_RUNNING, &psock->state); clear_bit(SMAP_TX_RUNNING, &psock->state);
rcu_assign_sk_user_data(sock, NULL); rcu_assign_sk_user_data(sock, NULL);
call_rcu_sched(&psock->rcu, smap_destroy_psock); call_rcu_sched(&psock->rcu, smap_destroy_psock);
...@@ -1508,6 +1605,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node) ...@@ -1508,6 +1605,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node)
INIT_LIST_HEAD(&psock->maps); INIT_LIST_HEAD(&psock->maps);
INIT_LIST_HEAD(&psock->ingress); INIT_LIST_HEAD(&psock->ingress);
refcount_set(&psock->refcnt, 1); refcount_set(&psock->refcnt, 1);
spin_lock_init(&psock->maps_lock);
rcu_assign_sk_user_data(sock, psock); rcu_assign_sk_user_data(sock, psock);
sock_hold(sock); sock_hold(sock);
...@@ -1564,18 +1662,32 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) ...@@ -1564,18 +1662,32 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
return ERR_PTR(err); return ERR_PTR(err);
} }
static void smap_list_remove(struct smap_psock *psock, static void smap_list_map_remove(struct smap_psock *psock,
struct sock **entry, struct sock **entry)
struct htab_elem *hash_link)
{ {
struct smap_psock_map_entry *e, *tmp; struct smap_psock_map_entry *e, *tmp;
spin_lock_bh(&psock->maps_lock);
list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_for_each_entry_safe(e, tmp, &psock->maps, list) {
if (e->entry == entry || e->hash_link == hash_link) { if (e->entry == entry)
list_del(&e->list); list_del(&e->list);
break;
} }
spin_unlock_bh(&psock->maps_lock);
}
static void smap_list_hash_remove(struct smap_psock *psock,
struct htab_elem *hash_link)
{
struct smap_psock_map_entry *e, *tmp;
spin_lock_bh(&psock->maps_lock);
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
struct htab_elem *c = rcu_dereference(e->hash_link);
if (c == hash_link)
list_del(&e->list);
} }
spin_unlock_bh(&psock->maps_lock);
} }
static void sock_map_free(struct bpf_map *map) static void sock_map_free(struct bpf_map *map)
...@@ -1601,7 +1713,6 @@ static void sock_map_free(struct bpf_map *map) ...@@ -1601,7 +1713,6 @@ static void sock_map_free(struct bpf_map *map)
if (!sock) if (!sock)
continue; continue;
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock); psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the /* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens * sk_callback_lock before this case but after xchg happens
...@@ -1609,10 +1720,9 @@ static void sock_map_free(struct bpf_map *map) ...@@ -1609,10 +1720,9 @@ static void sock_map_free(struct bpf_map *map)
* to be null and queued for garbage collection. * to be null and queued for garbage collection.
*/ */
if (likely(psock)) { if (likely(psock)) {
smap_list_remove(psock, &stab->sock_map[i], NULL); smap_list_map_remove(psock, &stab->sock_map[i]);
smap_release_sock(psock, sock); smap_release_sock(psock, sock);
} }
write_unlock_bh(&sock->sk_callback_lock);
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -1661,17 +1771,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) ...@@ -1661,17 +1771,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (!sock) if (!sock)
return -EINVAL; return -EINVAL;
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock); psock = smap_psock_sk(sock);
if (!psock) if (!psock)
goto out; goto out;
if (psock->bpf_parse) if (psock->bpf_parse)
smap_stop_sock(psock, sock); smap_stop_sock(psock, sock);
smap_list_remove(psock, &stab->sock_map[k], NULL); smap_list_map_remove(psock, &stab->sock_map[k]);
smap_release_sock(psock, sock); smap_release_sock(psock, sock);
out: out:
write_unlock_bh(&sock->sk_callback_lock);
return 0; return 0;
} }
...@@ -1752,7 +1860,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, ...@@ -1752,7 +1860,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
} }
} }
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock); psock = smap_psock_sk(sock);
/* 2. Do not allow inheriting programs if psock exists and has /* 2. Do not allow inheriting programs if psock exists and has
...@@ -1809,7 +1916,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, ...@@ -1809,7 +1916,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
if (err) if (err)
goto out_free; goto out_free;
smap_init_progs(psock, verdict, parse); smap_init_progs(psock, verdict, parse);
write_lock_bh(&sock->sk_callback_lock);
smap_start_sock(psock, sock); smap_start_sock(psock, sock);
write_unlock_bh(&sock->sk_callback_lock);
} }
/* 4. Place psock in sockmap for use and stop any programs on /* 4. Place psock in sockmap for use and stop any programs on
...@@ -1819,9 +1928,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, ...@@ -1819,9 +1928,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
*/ */
if (map_link) { if (map_link) {
e->entry = map_link; e->entry = map_link;
spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps); list_add_tail(&e->list, &psock->maps);
spin_unlock_bh(&psock->maps_lock);
} }
write_unlock_bh(&sock->sk_callback_lock);
return err; return err;
out_free: out_free:
smap_release_sock(psock, sock); smap_release_sock(psock, sock);
...@@ -1832,7 +1942,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, ...@@ -1832,7 +1942,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
} }
if (tx_msg) if (tx_msg)
bpf_prog_put(tx_msg); bpf_prog_put(tx_msg);
write_unlock_bh(&sock->sk_callback_lock);
kfree(e); kfree(e);
return err; return err;
} }
...@@ -1869,10 +1978,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, ...@@ -1869,10 +1978,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
if (osock) { if (osock) {
struct smap_psock *opsock = smap_psock_sk(osock); struct smap_psock *opsock = smap_psock_sk(osock);
write_lock_bh(&osock->sk_callback_lock); smap_list_map_remove(opsock, &stab->sock_map[i]);
smap_list_remove(opsock, &stab->sock_map[i], NULL);
smap_release_sock(opsock, osock); smap_release_sock(opsock, osock);
write_unlock_bh(&osock->sk_callback_lock);
} }
out: out:
return err; return err;
...@@ -1915,6 +2022,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) ...@@ -1915,6 +2022,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
return 0; return 0;
} }
int sockmap_get_from_fd(const union bpf_attr *attr, int type,
struct bpf_prog *prog)
{
int ufd = attr->target_fd;
struct bpf_map *map;
struct fd f;
int err;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
err = sock_map_prog(map, prog, attr->attach_type);
fdput(f);
return err;
}
static void *sock_map_lookup(struct bpf_map *map, void *key) static void *sock_map_lookup(struct bpf_map *map, void *key)
{ {
return NULL; return NULL;
...@@ -2043,14 +2168,13 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) ...@@ -2043,14 +2168,13 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return ERR_PTR(err); return ERR_PTR(err);
} }
static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) static void __bpf_htab_free(struct rcu_head *rcu)
{ {
return &htab->buckets[hash & (htab->n_buckets - 1)]; struct bpf_htab *htab;
}
static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) htab = container_of(rcu, struct bpf_htab, rcu);
{ bpf_map_area_free(htab->buckets);
return &__select_bucket(htab, hash)->head; kfree(htab);
} }
static void sock_hash_free(struct bpf_map *map) static void sock_hash_free(struct bpf_map *map)
...@@ -2069,16 +2193,18 @@ static void sock_hash_free(struct bpf_map *map) ...@@ -2069,16 +2193,18 @@ static void sock_hash_free(struct bpf_map *map)
*/ */
rcu_read_lock(); rcu_read_lock();
for (i = 0; i < htab->n_buckets; i++) { for (i = 0; i < htab->n_buckets; i++) {
struct hlist_head *head = select_bucket(htab, i); struct bucket *b = __select_bucket(htab, i);
struct hlist_head *head;
struct hlist_node *n; struct hlist_node *n;
struct htab_elem *l; struct htab_elem *l;
raw_spin_lock_bh(&b->lock);
head = &b->head;
hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_for_each_entry_safe(l, n, head, hash_node) {
struct sock *sock = l->sk; struct sock *sock = l->sk;
struct smap_psock *psock; struct smap_psock *psock;
hlist_del_rcu(&l->hash_node); hlist_del_rcu(&l->hash_node);
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock); psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get /* This check handles a racing sock event that can get
* the sk_callback_lock before this case but after xchg * the sk_callback_lock before this case but after xchg
...@@ -2086,16 +2212,15 @@ static void sock_hash_free(struct bpf_map *map) ...@@ -2086,16 +2212,15 @@ static void sock_hash_free(struct bpf_map *map)
* (psock) to be null and queued for garbage collection. * (psock) to be null and queued for garbage collection.
*/ */
if (likely(psock)) { if (likely(psock)) {
smap_list_remove(psock, NULL, l); smap_list_hash_remove(psock, l);
smap_release_sock(psock, sock); smap_release_sock(psock, sock);
} }
write_unlock_bh(&sock->sk_callback_lock); free_htab_elem(htab, l);
kfree(l);
} }
raw_spin_unlock_bh(&b->lock);
} }
rcu_read_unlock(); rcu_read_unlock();
bpf_map_area_free(htab->buckets); call_rcu(&htab->rcu, __bpf_htab_free);
kfree(htab);
} }
static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
...@@ -2122,19 +2247,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, ...@@ -2122,19 +2247,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
return l_new; return l_new;
} }
static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
u32 hash, void *key, u32 key_size)
{
struct htab_elem *l;
hlist_for_each_entry_rcu(l, head, hash_node) {
if (l->hash == hash && !memcmp(&l->key, key, key_size))
return l;
}
return NULL;
}
static inline u32 htab_map_hash(const void *key, u32 key_len) static inline u32 htab_map_hash(const void *key, u32 key_len)
{ {
return jhash(key, key_len, 0); return jhash(key, key_len, 0);
...@@ -2254,9 +2366,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, ...@@ -2254,9 +2366,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto bucket_err; goto bucket_err;
} }
e->hash_link = l_new; rcu_assign_pointer(e->hash_link, l_new);
e->htab = container_of(map, struct bpf_htab, map); rcu_assign_pointer(e->htab,
container_of(map, struct bpf_htab, map));
spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps); list_add_tail(&e->list, &psock->maps);
spin_unlock_bh(&psock->maps_lock);
/* add new element to the head of the list, so that /* add new element to the head of the list, so that
* concurrent search will find it before old elem * concurrent search will find it before old elem
...@@ -2266,7 +2381,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, ...@@ -2266,7 +2381,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
psock = smap_psock_sk(l_old->sk); psock = smap_psock_sk(l_old->sk);
hlist_del_rcu(&l_old->hash_node); hlist_del_rcu(&l_old->hash_node);
smap_list_remove(psock, NULL, l_old); smap_list_hash_remove(psock, l_old);
smap_release_sock(psock, l_old->sk); smap_release_sock(psock, l_old->sk);
free_htab_elem(htab, l_old); free_htab_elem(htab, l_old);
} }
...@@ -2326,7 +2441,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) ...@@ -2326,7 +2441,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
struct smap_psock *psock; struct smap_psock *psock;
hlist_del_rcu(&l->hash_node); hlist_del_rcu(&l->hash_node);
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock); psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the /* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens * sk_callback_lock before this case but after xchg happens
...@@ -2334,10 +2448,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) ...@@ -2334,10 +2448,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
* to be null and queued for garbage collection. * to be null and queued for garbage collection.
*/ */
if (likely(psock)) { if (likely(psock)) {
smap_list_remove(psock, NULL, l); smap_list_hash_remove(psock, l);
smap_release_sock(psock, sock); smap_release_sock(psock, sock);
} }
write_unlock_bh(&sock->sk_callback_lock);
free_htab_elem(htab, l); free_htab_elem(htab, l);
ret = 0; ret = 0;
} }
...@@ -2383,6 +2496,7 @@ const struct bpf_map_ops sock_hash_ops = { ...@@ -2383,6 +2496,7 @@ const struct bpf_map_ops sock_hash_ops = {
.map_get_next_key = sock_hash_get_next_key, .map_get_next_key = sock_hash_get_next_key,
.map_update_elem = sock_hash_update_elem, .map_update_elem = sock_hash_update_elem,
.map_delete_elem = sock_hash_delete_elem, .map_delete_elem = sock_hash_delete_elem,
.map_release_uref = sock_map_release,
}; };
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
......
...@@ -1483,8 +1483,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) ...@@ -1483,8 +1483,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
return err; return err;
} }
#ifdef CONFIG_CGROUP_BPF
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
enum bpf_attach_type attach_type) enum bpf_attach_type attach_type)
{ {
...@@ -1499,40 +1497,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ...@@ -1499,40 +1497,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
/*
 * sockmap_get_from_fd - look up the map behind attr->target_fd and call
 * sock_map_prog() on it
 * @attr:   bpf() attributes; target_fd, attach_bpf_fd and attach_type are read
 * @type:   program type passed to bpf_prog_get_type() when attaching
 * @attach: true to fetch the program from attr->attach_bpf_fd first,
 *          false to call sock_map_prog() with a NULL program
 *
 * The program reference taken here is dropped only when sock_map_prog()
 * fails; on success it is left in place.
 *
 * Return: 0 on success, otherwise the error from the map lookup, the
 * program lookup, or sock_map_prog().
 */
static int sockmap_get_from_fd(const union bpf_attr *attr,
			       int type, bool attach)
{
	struct bpf_prog *prog = NULL;
	struct bpf_map *map;
	struct fd f;
	int err;

	f = fdget(attr->target_fd);
	map = __bpf_map_get(f);
	/*
	 * NOTE(review): no fdput() here - presumably __bpf_map_get() drops
	 * the fd on failure; confirm.
	 */
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (attach) {
		prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
		if (IS_ERR(prog)) {
			fdput(f);
			return PTR_ERR(prog);
		}
	}

	err = sock_map_prog(map, prog, attr->attach_type);
	/* The fd is released before the program on the error path. */
	fdput(f);
	if (err && prog)
		bpf_prog_put(prog);

	return err;
}
#define BPF_F_ATTACH_MASK \ #define BPF_F_ATTACH_MASK \
(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
...@@ -1540,7 +1504,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) ...@@ -1540,7 +1504,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
{ {
enum bpf_prog_type ptype; enum bpf_prog_type ptype;
struct bpf_prog *prog; struct bpf_prog *prog;
struct cgroup *cgrp;
int ret; int ret;
if (!capable(CAP_NET_ADMIN)) if (!capable(CAP_NET_ADMIN))
...@@ -1577,12 +1540,15 @@ static int bpf_prog_attach(const union bpf_attr *attr) ...@@ -1577,12 +1540,15 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE; ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break; break;
case BPF_SK_MSG_VERDICT: case BPF_SK_MSG_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); ptype = BPF_PROG_TYPE_SK_MSG;
break;
case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); ptype = BPF_PROG_TYPE_SK_SKB;
break;
case BPF_LIRC_MODE2: case BPF_LIRC_MODE2:
return lirc_prog_attach(attr); ptype = BPF_PROG_TYPE_LIRC_MODE2;
break;
default: default:
return -EINVAL; return -EINVAL;
} }
...@@ -1596,18 +1562,20 @@ static int bpf_prog_attach(const union bpf_attr *attr) ...@@ -1596,18 +1562,20 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return -EINVAL; return -EINVAL;
} }
cgrp = cgroup_get_from_fd(attr->target_fd); switch (ptype) {
if (IS_ERR(cgrp)) { case BPF_PROG_TYPE_SK_SKB:
bpf_prog_put(prog); case BPF_PROG_TYPE_SK_MSG:
return PTR_ERR(cgrp); ret = sockmap_get_from_fd(attr, ptype, prog);
break;
case BPF_PROG_TYPE_LIRC_MODE2:
ret = lirc_prog_attach(attr, prog);
break;
default:
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
} }
ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
attr->attach_flags);
if (ret) if (ret)
bpf_prog_put(prog); bpf_prog_put(prog);
cgroup_put(cgrp);
return ret; return ret;
} }
...@@ -1616,9 +1584,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) ...@@ -1616,9 +1584,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr)
{ {
enum bpf_prog_type ptype; enum bpf_prog_type ptype;
struct bpf_prog *prog;
struct cgroup *cgrp;
int ret;
if (!capable(CAP_NET_ADMIN)) if (!capable(CAP_NET_ADMIN))
return -EPERM; return -EPERM;
...@@ -1651,29 +1616,17 @@ static int bpf_prog_detach(const union bpf_attr *attr) ...@@ -1651,29 +1616,17 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE; ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break; break;
case BPF_SK_MSG_VERDICT: case BPF_SK_MSG_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
case BPF_LIRC_MODE2: case BPF_LIRC_MODE2:
return lirc_prog_detach(attr); return lirc_prog_detach(attr);
default: default:
return -EINVAL; return -EINVAL;
} }
cgrp = cgroup_get_from_fd(attr->target_fd); return cgroup_bpf_prog_detach(attr, ptype);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
prog = NULL;
ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
if (prog)
bpf_prog_put(prog);
cgroup_put(cgrp);
return ret;
} }
#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
...@@ -1681,9 +1634,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) ...@@ -1681,9 +1634,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
static int bpf_prog_query(const union bpf_attr *attr, static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr) union bpf_attr __user *uattr)
{ {
struct cgroup *cgrp;
int ret;
if (!capable(CAP_NET_ADMIN)) if (!capable(CAP_NET_ADMIN))
return -EPERM; return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY)) if (CHECK_ATTR(BPF_PROG_QUERY))
...@@ -1711,14 +1661,9 @@ static int bpf_prog_query(const union bpf_attr *attr, ...@@ -1711,14 +1661,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
default: default:
return -EINVAL; return -EINVAL;
} }
cgrp = cgroup_get_from_fd(attr->query.target_fd);
if (IS_ERR(cgrp)) return cgroup_bpf_prog_query(attr, uattr);
return PTR_ERR(cgrp);
ret = cgroup_bpf_query(cgrp, attr, uattr);
cgroup_put(cgrp);
return ret;
} }
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
...@@ -2365,7 +2310,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz ...@@ -2365,7 +2310,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET: case BPF_OBJ_GET:
err = bpf_obj_get(&attr); err = bpf_obj_get(&attr);
break; break;
#ifdef CONFIG_CGROUP_BPF
case BPF_PROG_ATTACH: case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr); err = bpf_prog_attach(&attr);
break; break;
...@@ -2375,7 +2319,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz ...@@ -2375,7 +2319,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_QUERY: case BPF_PROG_QUERY:
err = bpf_prog_query(&attr, uattr); err = bpf_prog_query(&attr, uattr);
break; break;
#endif
case BPF_PROG_TEST_RUN: case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr); err = bpf_prog_test_run(&attr, uattr);
break; break;
......
...@@ -5282,21 +5282,31 @@ static struct bpf_test tests[] = { ...@@ -5282,21 +5282,31 @@ static struct bpf_test tests[] = {
{ /* Mainly checking JIT here. */ { /* Mainly checking JIT here. */
"BPF_MAXINSNS: Ctx heavy transformations", "BPF_MAXINSNS: Ctx heavy transformations",
{ }, { },
#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
CLASSIC | FLAG_EXPECTED_FAIL,
#else
CLASSIC, CLASSIC,
#endif
{ }, { },
{ {
{ 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }, { 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) },
{ 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) } { 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }
}, },
.fill_helper = bpf_fill_maxinsns6, .fill_helper = bpf_fill_maxinsns6,
.expected_errcode = -ENOTSUPP,
}, },
{ /* Mainly checking JIT here. */ { /* Mainly checking JIT here. */
"BPF_MAXINSNS: Call heavy transformations", "BPF_MAXINSNS: Call heavy transformations",
{ }, { },
#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
#else
CLASSIC | FLAG_NO_DATA, CLASSIC | FLAG_NO_DATA,
#endif
{ }, { },
{ { 1, 0 }, { 10, 0 } }, { { 1, 0 }, { 10, 0 } },
.fill_helper = bpf_fill_maxinsns7, .fill_helper = bpf_fill_maxinsns7,
.expected_errcode = -ENOTSUPP,
}, },
{ /* Mainly checking JIT here. */ { /* Mainly checking JIT here. */
"BPF_MAXINSNS: Jump heavy test", "BPF_MAXINSNS: Jump heavy test",
...@@ -5347,18 +5357,28 @@ static struct bpf_test tests[] = { ...@@ -5347,18 +5357,28 @@ static struct bpf_test tests[] = {
{ {
"BPF_MAXINSNS: exec all MSH", "BPF_MAXINSNS: exec all MSH",
{ }, { },
#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
CLASSIC | FLAG_EXPECTED_FAIL,
#else
CLASSIC, CLASSIC,
#endif
{ 0xfa, 0xfb, 0xfc, 0xfd, }, { 0xfa, 0xfb, 0xfc, 0xfd, },
{ { 4, 0xababab83 } }, { { 4, 0xababab83 } },
.fill_helper = bpf_fill_maxinsns13, .fill_helper = bpf_fill_maxinsns13,
.expected_errcode = -ENOTSUPP,
}, },
{ {
"BPF_MAXINSNS: ld_abs+get_processor_id", "BPF_MAXINSNS: ld_abs+get_processor_id",
{ }, { },
#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
CLASSIC | FLAG_EXPECTED_FAIL,
#else
CLASSIC, CLASSIC,
#endif
{ }, { },
{ { 1, 0xbee } }, { { 1, 0xbee } },
.fill_helper = bpf_fill_ld_abs_get_processor_id, .fill_helper = bpf_fill_ld_abs_get_processor_id,
.expected_errcode = -ENOTSUPP,
}, },
/* /*
* LD_IND / LD_ABS on fragmented SKBs * LD_IND / LD_ABS on fragmented SKBs
......
...@@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, ...@@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
memcpy(params->smac, dev->dev_addr, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0; params->h_vlan_TCI = 0;
params->h_vlan_proto = 0; params->h_vlan_proto = 0;
params->ifindex = dev->ifindex;
return dev->ifindex; return 0;
} }
#endif #endif
...@@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
/* verify forwarding is enabled on this interface */ /* verify forwarding is enabled on this interface */
in_dev = __in_dev_get_rcu(dev); in_dev = __in_dev_get_rcu(dev);
if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
return 0; return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) { if (flags & BPF_FIB_LOOKUP_OUTPUT) {
fl4.flowi4_iif = 1; fl4.flowi4_iif = 1;
...@@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
tb = fib_get_table(net, tbid); tb = fib_get_table(net, tbid);
if (unlikely(!tb)) if (unlikely(!tb))
return 0; return BPF_FIB_LKUP_RET_NOT_FWDED;
err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
} else { } else {
...@@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
} }
if (err || res.type != RTN_UNICAST) if (err) {
return 0; /* map fib lookup errors to RTN_ type */
if (err == -EINVAL)
return BPF_FIB_LKUP_RET_BLACKHOLE;
if (err == -EHOSTUNREACH)
return BPF_FIB_LKUP_RET_UNREACHABLE;
if (err == -EACCES)
return BPF_FIB_LKUP_RET_PROHIBIT;
return BPF_FIB_LKUP_RET_NOT_FWDED;
}
if (res.type != RTN_UNICAST)
return BPF_FIB_LKUP_RET_NOT_FWDED;
if (res.fi->fib_nhs > 1) if (res.fi->fib_nhs > 1)
fib_select_path(net, &res, &fl4, NULL); fib_select_path(net, &res, &fl4, NULL);
...@@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) { if (check_mtu) {
mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
if (params->tot_len > mtu) if (params->tot_len > mtu)
return 0; return BPF_FIB_LKUP_RET_FRAG_NEEDED;
} }
nh = &res.fi->fib_nh[res.nh_sel]; nh = &res.fi->fib_nh[res.nh_sel];
/* do not handle lwt encaps right now */ /* do not handle lwt encaps right now */
if (nh->nh_lwtstate) if (nh->nh_lwtstate)
return 0; return BPF_FIB_LKUP_RET_UNSUPP_LWT;
dev = nh->nh_dev; dev = nh->nh_dev;
if (unlikely(!dev))
return 0;
if (nh->nh_gw) if (nh->nh_gw)
params->ipv4_dst = nh->nh_gw; params->ipv4_dst = nh->nh_gw;
...@@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
* rcu_read_lock_bh is not needed here * rcu_read_lock_bh is not needed here
*/ */
neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
if (neigh) if (!neigh)
return bpf_fib_set_fwd_params(params, neigh, dev); return BPF_FIB_LKUP_RET_NO_NEIGH;
return 0; return bpf_fib_set_fwd_params(params, neigh, dev);
} }
#endif #endif
...@@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
/* link local addresses are never forwarded */ /* link local addresses are never forwarded */
if (rt6_need_strict(dst) || rt6_need_strict(src)) if (rt6_need_strict(dst) || rt6_need_strict(src))
return 0; return BPF_FIB_LKUP_RET_NOT_FWDED;
dev = dev_get_by_index_rcu(net, params->ifindex); dev = dev_get_by_index_rcu(net, params->ifindex);
if (unlikely(!dev)) if (unlikely(!dev))
...@@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
idev = __in6_dev_get_safely(dev); idev = __in6_dev_get_safely(dev);
if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
return 0; return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) { if (flags & BPF_FIB_LOOKUP_OUTPUT) {
fl6.flowi6_iif = 1; fl6.flowi6_iif = 1;
...@@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
tb = ipv6_stub->fib6_get_table(net, tbid); tb = ipv6_stub->fib6_get_table(net, tbid);
if (unlikely(!tb)) if (unlikely(!tb))
return 0; return BPF_FIB_LKUP_RET_NOT_FWDED;
f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
} else { } else {
...@@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
} }
if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
return 0; return BPF_FIB_LKUP_RET_NOT_FWDED;
if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
switch (f6i->fib6_type) {
case RTN_BLACKHOLE:
return BPF_FIB_LKUP_RET_BLACKHOLE;
case RTN_UNREACHABLE:
return BPF_FIB_LKUP_RET_UNREACHABLE;
case RTN_PROHIBIT:
return BPF_FIB_LKUP_RET_PROHIBIT;
default:
return BPF_FIB_LKUP_RET_NOT_FWDED;
}
}
if (unlikely(f6i->fib6_flags & RTF_REJECT || if (f6i->fib6_type != RTN_UNICAST)
f6i->fib6_type != RTN_UNICAST)) return BPF_FIB_LKUP_RET_NOT_FWDED;
return 0;
if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
...@@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) { if (check_mtu) {
mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
if (params->tot_len > mtu) if (params->tot_len > mtu)
return 0; return BPF_FIB_LKUP_RET_FRAG_NEEDED;
} }
if (f6i->fib6_nh.nh_lwtstate) if (f6i->fib6_nh.nh_lwtstate)
return 0; return BPF_FIB_LKUP_RET_UNSUPP_LWT;
if (f6i->fib6_flags & RTF_GATEWAY) if (f6i->fib6_flags & RTF_GATEWAY)
*dst = f6i->fib6_nh.nh_gw; *dst = f6i->fib6_nh.nh_gw;
...@@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, ...@@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
*/ */
neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
ndisc_hashfn, dst, dev); ndisc_hashfn, dst, dev);
if (neigh) if (!neigh)
return bpf_fib_set_fwd_params(params, neigh, dev); return BPF_FIB_LKUP_RET_NO_NEIGH;
return 0; return bpf_fib_set_fwd_params(params, neigh, dev);
} }
#endif #endif
...@@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, ...@@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
struct bpf_fib_lookup *, params, int, plen, u32, flags) struct bpf_fib_lookup *, params, int, plen, u32, flags)
{ {
struct net *net = dev_net(skb->dev); struct net *net = dev_net(skb->dev);
int index = -EAFNOSUPPORT; int rc = -EAFNOSUPPORT;
if (plen < sizeof(*params)) if (plen < sizeof(*params))
return -EINVAL; return -EINVAL;
...@@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, ...@@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
switch (params->family) { switch (params->family) {
#if IS_ENABLED(CONFIG_INET) #if IS_ENABLED(CONFIG_INET)
case AF_INET: case AF_INET:
index = bpf_ipv4_fib_lookup(net, params, flags, false); rc = bpf_ipv4_fib_lookup(net, params, flags, false);
break; break;
#endif #endif
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
case AF_INET6: case AF_INET6:
index = bpf_ipv6_fib_lookup(net, params, flags, false); rc = bpf_ipv6_fib_lookup(net, params, flags, false);
break; break;
#endif #endif
} }
if (index > 0) { if (!rc) {
struct net_device *dev; struct net_device *dev;
dev = dev_get_by_index_rcu(net, index); dev = dev_get_by_index_rcu(net, params->ifindex);
if (!is_skb_forwardable(dev, skb)) if (!is_skb_forwardable(dev, skb))
index = 0; rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
} }
return index; return rc;
} }
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
......
...@@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) ...@@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
struct ethhdr *eth = data; struct ethhdr *eth = data;
struct ipv6hdr *ip6h; struct ipv6hdr *ip6h;
struct iphdr *iph; struct iphdr *iph;
int out_index;
u16 h_proto; u16 h_proto;
u64 nh_off; u64 nh_off;
int rc;
nh_off = sizeof(*eth); nh_off = sizeof(*eth);
if (data + nh_off > data_end) if (data + nh_off > data_end)
...@@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) ...@@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
fib_params.ifindex = ctx->ingress_ifindex; fib_params.ifindex = ctx->ingress_ifindex;
out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
/* verify egress index has xdp support /* verify egress index has xdp support
* TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
...@@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) ...@@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
* NOTE: without verification that egress index supports XDP * NOTE: without verification that egress index supports XDP
* forwarding packets are dropped. * forwarding packets are dropped.
*/ */
if (out_index > 0) { if (rc == 0) {
if (h_proto == htons(ETH_P_IP)) if (h_proto == htons(ETH_P_IP))
ip_decrease_ttl(iph); ip_decrease_ttl(iph);
else if (h_proto == htons(ETH_P_IPV6)) else if (h_proto == htons(ETH_P_IPV6))
...@@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) ...@@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
memcpy(eth->h_source, fib_params.smac, ETH_ALEN); memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
return bpf_redirect_map(&tx_port, out_index, 0); return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
} }
return XDP_PASS; return XDP_PASS;
......
...@@ -694,15 +694,19 @@ static int do_load(int argc, char **argv) ...@@ -694,15 +694,19 @@ static int do_load(int argc, char **argv)
return -1; return -1;
} }
if (do_pin_fd(prog_fd, argv[1])) { if (do_pin_fd(prog_fd, argv[1]))
p_err("failed to pin program"); goto err_close_obj;
return -1;
}
if (json_output) if (json_output)
jsonw_null(json_wtr); jsonw_null(json_wtr);
bpf_object__close(obj);
return 0; return 0;
err_close_obj:
bpf_object__close(obj);
return -1;
} }
static int do_help(int argc, char **argv) static int do_help(int argc, char **argv)
......
...@@ -6,6 +6,7 @@ CONFIG_TEST_BPF=m ...@@ -6,6 +6,7 @@ CONFIG_TEST_BPF=m
CONFIG_CGROUP_BPF=y CONFIG_CGROUP_BPF=y
CONFIG_NETDEVSIM=m CONFIG_NETDEVSIM=m
CONFIG_NET_CLS_ACT=y CONFIG_NET_CLS_ACT=y
CONFIG_NET_SCHED=y
CONFIG_NET_SCH_INGRESS=y CONFIG_NET_SCH_INGRESS=y
CONFIG_NET_IPIP=y CONFIG_NET_IPIP=y
CONFIG_IPV6=y CONFIG_IPV6=y
......
#!/bin/sh #!/bin/sh
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
msg="skip all tests:"
if [ "$(id -u)" != "0" ]; then
echo $msg please run this as root >&2
exit $ksft_skip
fi
SRC_TREE=../../../../ SRC_TREE=../../../../
test_run() test_run()
......
#!/bin/bash #!/bin/bash
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
msg="skip all tests:"
if [ $UID != 0 ]; then
echo $msg please run this as root >&2
exit $ksft_skip
fi
GREEN='\033[0;92m' GREEN='\033[0;92m'
RED='\033[0;31m' RED='\033[0;31m'
NC='\033[0m' # No Color NC='\033[0m' # No Color
......
...@@ -21,6 +21,15 @@ ...@@ -21,6 +21,15 @@
# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this # An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this
# datagram can be read on NS6 when binding to fb00::6. # datagram can be read on NS6 when binding to fb00::6.
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
msg="skip all tests:"
if [ $UID != 0 ]; then
echo $msg please run this as root >&2
exit $ksft_skip
fi
TMP_FILE="/tmp/selftest_lwt_seg6local.txt" TMP_FILE="/tmp/selftest_lwt_seg6local.txt"
cleanup() cleanup()
......
...@@ -1413,18 +1413,12 @@ static int test_suite(void) ...@@ -1413,18 +1413,12 @@ static int test_suite(void)
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
int iov_count = 1, length = 1024, rate = 1; int iov_count = 1, length = 1024, rate = 1;
struct sockmap_options options = {0}; struct sockmap_options options = {0};
int opt, longindex, err, cg_fd = 0; int opt, longindex, err, cg_fd = 0;
char *bpf_file = BPF_SOCKMAP_FILENAME; char *bpf_file = BPF_SOCKMAP_FILENAME;
int test = PING_PONG; int test = PING_PONG;
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return 1;
}
if (argc < 2) if (argc < 2)
return test_suite(); return test_suite();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment