Commit 41f57cfd authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Alexei Starovoitov says:

====================
pull-request: bpf 2020-02-19

The following pull-request contains BPF updates for your *net* tree.

We've added 10 non-merge commits during the last 10 day(s) which contain
a total of 10 files changed, 93 insertions(+), 31 deletions(-).

The main changes are:

1) batched bpf hashtab fixes from Brian and Yonghong.

2) various selftests and libbpf fixes.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents fca07a93 b9aff38d
@@ -1045,9 +1045,9 @@ union bpf_attr {
  * 		supports redirection to the egress interface, and accepts no
  * 		flag at all.
  *
- * 		The same effect can be attained with the more generic
- * 		**bpf_redirect_map**\ (), which requires specific maps to be
- * 		used but offers better performance.
+ * 		The same effect can also be attained with the more generic
+ * 		**bpf_redirect_map**\ (), which uses a BPF map to store the
+ * 		redirect target instead of providing it directly to the helper.
  * 	Return
  * 		For XDP, the helper returns **XDP_REDIRECT** on success or
  * 		**XDP_ABORTED** on error. For other program types, the values
@@ -1611,13 +1611,11 @@ union bpf_attr {
  * 		the caller. Any higher bits in the *flags* argument must be
  * 		unset.
  *
- * 		When used to redirect packets to net devices, this helper
- * 		provides a high performance increase over **bpf_redirect**\ ().
- * 		This is due to various implementation details of the underlying
- * 		mechanisms, one of which is the fact that **bpf_redirect_map**\
- * 		() tries to send packet as a "bulk" to the device.
+ * 		See also bpf_redirect(), which only supports redirecting to an
+ * 		ifindex, but doesn't require a map to do so.
  * 	Return
- * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ * 		**XDP_REDIRECT** on success, or the value of the two lower bits
+ * 		of the **flags* argument on error.
  *
  * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
  * 	Description
...
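The helper documentation updated above contrasts bpf_redirect() and bpf_redirect_map(). As an illustration only (not part of this commit), a minimal XDP program using bpf_redirect_map() with a DEVMAP could look like the sketch below; the map name, section name and key are hypothetical, and passing XDP_PASS in the flags argument relies on the lower-two-bits fallback described in the updated Return text.

/* Hedged sketch: XDP redirect through a DEVMAP, assuming a libbpf-style
 * BTF map definition. Names (tx_port, key 0) are illustrative only.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} tx_port SEC(".maps");

SEC("xdp")
int xdp_redirect_map_prog(struct xdp_md *ctx)
{
	__u32 key = 0;	/* hypothetical slot, filled in by user space */

	/* If the lookup fails, the two lower bits of the flags value
	 * (here XDP_PASS) are returned instead of XDP_ABORTED.
	 */
	return bpf_redirect_map(&tx_port, key, XDP_PASS);
}

char _license[] SEC("license") = "GPL";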
@@ -4142,7 +4142,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
  * EFAULT - verifier bug
  * 0 - 99% match. The last 1% is validated by the verifier.
  */
-int btf_check_func_type_match(struct bpf_verifier_log *log,
+static int btf_check_func_type_match(struct bpf_verifier_log *log,
 			      struct btf *btf1, const struct btf_type *t1,
 			      struct btf *btf2, const struct btf_type *t2)
 {
...
@@ -56,6 +56,7 @@ struct htab_elem {
 		union {
 			struct bpf_htab *htab;
 			struct pcpu_freelist_node fnode;
+			struct htab_elem *batch_flink;
 		};
 	};
 };
@@ -126,6 +127,17 @@ static void htab_free_elems(struct bpf_htab *htab)
 	bpf_map_area_free(htab->elems);
 }
 
+/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
+ * (bucket_lock). If both locks need to be acquired together, the lock
+ * order is always lru_lock -> bucket_lock and this only happens in
+ * bpf_lru_list.c logic. For example, certain code path of
+ * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
+ * will acquire lru_lock first followed by acquiring bucket_lock.
+ *
+ * In hashtab.c, to avoid deadlock, lock acquisition of
+ * bucket_lock followed by lru_lock is not allowed. In such cases,
+ * bucket_lock needs to be released first before acquiring lru_lock.
+ */
 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 					  u32 hash)
 {
@@ -1256,10 +1268,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
 	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
 	u32 batch, max_count, size, bucket_size;
+	struct htab_elem *node_to_free = NULL;
 	u64 elem_map_flags, map_flags;
 	struct hlist_nulls_head *head;
 	struct hlist_nulls_node *n;
-	unsigned long flags;
+	unsigned long flags = 0;
+	bool locked = false;
 	struct htab_elem *l;
 	struct bucket *b;
 	int ret = 0;
@@ -1319,15 +1333,25 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	dst_val = values;
 	b = &htab->buckets[batch];
 	head = &b->head;
+	/* do not grab the lock unless need it (bucket_cnt > 0). */
+	if (locked)
+		raw_spin_lock_irqsave(&b->lock, flags);
 
 	bucket_cnt = 0;
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		bucket_cnt++;
+
+	if (bucket_cnt && !locked) {
+		locked = true;
+		goto again_nocopy;
+	}
+
 	if (bucket_cnt > (max_count - total)) {
 		if (total == 0)
 			ret = -ENOSPC;
+		/* Note that since bucket_cnt > 0 here, it is implicit
+		 * that the locked was grabbed, so release it.
+		 */
 		raw_spin_unlock_irqrestore(&b->lock, flags);
 		rcu_read_unlock();
 		this_cpu_dec(bpf_prog_active);
@@ -1337,6 +1361,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	if (bucket_cnt > bucket_size) {
 		bucket_size = bucket_cnt;
+		/* Note that since bucket_cnt > 0 here, it is implicit
+		 * that the locked was grabbed, so release it.
+		 */
 		raw_spin_unlock_irqrestore(&b->lock, flags);
 		rcu_read_unlock();
 		this_cpu_dec(bpf_prog_active);
@@ -1346,6 +1373,10 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 		goto alloc;
 	}
 
+	/* Next block is only safe to run if you have grabbed the lock */
+	if (!locked)
+		goto next_batch;
+
 	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
 		memcpy(dst_key, l->key, key_size);
@@ -1370,16 +1401,33 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 		}
 		if (do_delete) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			if (is_lru_map)
-				bpf_lru_push_free(&htab->lru, &l->lru_node);
-			else
+
+			/* bpf_lru_push_free() will acquire lru_lock, which
+			 * may cause deadlock. See comments in function
+			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
+			 * after releasing the bucket lock.
+			 */
+			if (is_lru_map) {
+				l->batch_flink = node_to_free;
+				node_to_free = l;
+			} else {
 				free_htab_elem(htab, l);
+			}
 		}
 		dst_key += key_size;
 		dst_val += value_size;
 	}
 
 	raw_spin_unlock_irqrestore(&b->lock, flags);
+	locked = false;
+
+	while (node_to_free) {
+		l = node_to_free;
+		node_to_free = node_to_free->batch_flink;
+		bpf_lru_push_free(&htab->lru, &l->lru_node);
+	}
+
+next_batch:
 	/* If we are not copying data, we can go to next bucket and avoid
 	 * unlocking the rcu.
 	 */
...
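The hashtab.c changes above take the bucket lock only when the bucket is non-empty and, per the lock-ordering comment added next to prealloc_lru_pop(), defer bpf_lru_push_free() until after the bucket lock is dropped by chaining elements through the new batch_flink field. As an illustration only, here is a minimal generic C sketch of that defer-then-free pattern (pthread-based, hypothetical names, not kernel code):

/* Unlink nodes while holding the bucket lock, chain them on a private
 * list, and only call the "free" path that would take another lock
 * after the bucket lock has been released. b->lock is assumed to have
 * been initialised with pthread_spin_init(); build with -lpthread.
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;        /* bucket chain */
	struct node *batch_flink; /* private deferral list, like batch_flink above */
	int key;
};

struct bucket {
	pthread_spinlock_t lock;
	struct node *head;
};

/* Stand-in for a free path that takes a second lock internally;
 * it must never be called while b->lock is held.
 */
static void free_node_slowpath(struct node *n)
{
	free(n);
}

static void drain_bucket(struct bucket *b)
{
	struct node *to_free = NULL, *n;

	pthread_spin_lock(&b->lock);
	while ((n = b->head)) {
		b->head = n->next;
		n->batch_flink = to_free;  /* defer, don't free under the lock */
		to_free = n;
	}
	pthread_spin_unlock(&b->lock);

	while (to_free) {                  /* safe: bucket lock already dropped */
		n = to_free;
		to_free = to_free->batch_flink;
		free_node_slowpath(n);
	}
}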
@@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 	ulen = info->jited_prog_len;
 	info->jited_prog_len = aux->offload->jited_len;
-	if (info->jited_prog_len & ulen) {
+	if (info->jited_prog_len && ulen) {
 		uinsns = u64_to_user_ptr(info->jited_prog_insns);
 		ulen = min_t(u32, info->jited_prog_len, ulen);
 		if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
...
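The one-character fix above replaces a bitwise AND with a logical AND: with '&', two non-zero lengths whose set bits do not overlap evaluate to 0 and the JITed image is never copied out. A standalone illustration of the pitfall (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned int jited_prog_len = 4, ulen = 2;

	printf("bitwise:  %u\n", jited_prog_len & ulen);  /* 0 -> branch wrongly skipped */
	printf("logical:  %d\n", jited_prog_len && ulen); /* 1 -> branch taken */
	return 0;
}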
@@ -217,6 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 static void xsk_flush(struct xdp_sock *xs)
 {
 	xskq_prod_submit(xs->rx);
+	__xskq_cons_release(xs->umem->fq);
 	sock_def_readable(&xs->sk);
 }
 
@@ -304,6 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
 	rcu_read_lock();
 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+		__xskq_cons_release(xs->tx);
 		xs->sk.sk_write_space(&xs->sk);
 	}
 	rcu_read_unlock();
...
@@ -271,7 +271,8 @@ static inline void xskq_cons_release(struct xsk_queue *q)
 {
 	/* To improve performance, only update local state here.
 	 * Reflect this to global state when we get new entries
-	 * from the ring in xskq_cons_get_entries().
+	 * from the ring in xskq_cons_get_entries() and whenever
+	 * Rx or Tx processing are completed in the NAPI loop.
 	 */
 	q->cached_cons++;
 }
...
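The comment above describes the AF_XDP consumer-ring convention: per-entry processing only bumps a private cached index, and the shared consumer index is published later (now also when Rx/Tx processing completes, via the __xskq_cons_release() calls added in xsk.c above). The following is a simplified, hypothetical userspace C sketch of that cached-consumer-index pattern, not the kernel implementation:

#include <stdatomic.h>
#include <stdint.h>

struct ring {
	_Atomic uint32_t producer;  /* written by the producer */
	_Atomic uint32_t consumer;  /* published consumer index */
	uint32_t cached_cons;       /* consumer-private, cheap to bump */
	uint32_t mask;              /* ring size minus one (power of two) */
	uint64_t *desc;
};

/* Per-entry fast path: no write to the shared consumer index. */
static inline int ring_cons_peek(struct ring *r, uint64_t *val)
{
	uint32_t prod = atomic_load_explicit(&r->producer, memory_order_acquire);

	if (r->cached_cons == prod)
		return 0;                        /* ring empty */
	*val = r->desc[r->cached_cons & r->mask];
	r->cached_cons++;                        /* local state only */
	return 1;
}

/* Done with a batch (e.g. at the end of a poll loop): publish how far we
 * have consumed so the producer can reuse the slots.
 */
static inline void ring_cons_release(struct ring *r)
{
	atomic_store_explicit(&r->consumer, r->cached_cons, memory_order_release);
}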
@@ -1045,9 +1045,9 @@ union bpf_attr {
  * 		supports redirection to the egress interface, and accepts no
  * 		flag at all.
  *
- * 		The same effect can be attained with the more generic
- * 		**bpf_redirect_map**\ (), which requires specific maps to be
- * 		used but offers better performance.
+ * 		The same effect can also be attained with the more generic
+ * 		**bpf_redirect_map**\ (), which uses a BPF map to store the
+ * 		redirect target instead of providing it directly to the helper.
  * 	Return
  * 		For XDP, the helper returns **XDP_REDIRECT** on success or
  * 		**XDP_ABORTED** on error. For other program types, the values
@@ -1611,13 +1611,11 @@ union bpf_attr {
  * 		the caller. Any higher bits in the *flags* argument must be
  * 		unset.
  *
- * 		When used to redirect packets to net devices, this helper
- * 		provides a high performance increase over **bpf_redirect**\ ().
- * 		This is due to various implementation details of the underlying
- * 		mechanisms, one of which is the fact that **bpf_redirect_map**\
- * 		() tries to send packet as a "bulk" to the device.
+ * 		See also bpf_redirect(), which only supports redirecting to an
+ * 		ifindex, but doesn't require a map to do so.
  * 	Return
- * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ * 		**XDP_REDIRECT** on success, or the value of the two lower bits
+ * 		of the **flags* argument on error.
  *
  * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
  * 	Description
...
@@ -24,6 +24,7 @@
 #include <endian.h>
 #include <fcntl.h>
 #include <errno.h>
+#include <ctype.h>
 #include <asm/unistd.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -1283,7 +1284,7 @@ static size_t bpf_map_mmap_sz(const struct bpf_map *map)
 static char *internal_map_name(struct bpf_object *obj,
 			       enum libbpf_map_type type)
 {
-	char map_name[BPF_OBJ_NAME_LEN];
+	char map_name[BPF_OBJ_NAME_LEN], *p;
 	const char *sfx = libbpf_type_to_btf_name[type];
 	int sfx_len = max((size_t)7, strlen(sfx));
 	int pfx_len = min((size_t)BPF_OBJ_NAME_LEN - sfx_len - 1,
@@ -1292,6 +1293,11 @@ static char *internal_map_name(struct bpf_object *obj,
 	snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name,
 		 sfx_len, libbpf_type_to_btf_name[type]);
 
+	/* sanitise map name to characters allowed by kernel */
+	for (p = map_name; *p && p < map_name + sizeof(map_name); p++)
+		if (!isalnum(*p) && *p != '_' && *p != '.')
+			*p = '_';
+
 	return strdup(map_name);
 }
...
@@ -506,8 +506,10 @@ static void test_syncookie(int type, sa_family_t family)
 		.pass_on_failure = 0,
 	};
 
-	if (type != SOCK_STREAM)
+	if (type != SOCK_STREAM) {
+		test__skip();
 		return;
+	}
 
 	/*
 	 * +1 for TCP-SYN and
@@ -822,8 +824,10 @@ void test_select_reuseport(void)
 		goto out;
 
 	saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL);
+	if (saved_tcp_fo < 0)
+		goto out;
 	saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL);
-	if (saved_tcp_syncookie < 0 || saved_tcp_syncookie < 0)
+	if (saved_tcp_syncookie < 0)
 		goto out;
 
 	if (enable_fastopen())
...
@@ -3,6 +3,11 @@
 #include "test_progs.h"
 
+#define TCP_REPAIR		19	/* TCP sock is under repair right now */
+
+#define TCP_REPAIR_ON		1
+#define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
+
 static int connected_socket_v4(void)
 {
 	struct sockaddr_in addr = {
...
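The TCP_REPAIR constants added above are typically used to obtain a "connected" TCP socket without any real peer or packet exchange: put the socket into repair mode so connect() succeeds immediately, then leave repair mode without sending window probes. A hedged sketch of that pattern (requires CAP_NET_ADMIN; the helper name is hypothetical, and the defines mirror the ones added above for older libc headers):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#define TCP_REPAIR		19
#define TCP_REPAIR_ON		1
#define TCP_REPAIR_OFF_NO_WP	-1

static int fake_connected_socket(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(80),
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	int s, repair = TCP_REPAIR_ON;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s < 0)
		return -1;
	/* Repair mode: connect() only fills in the socket state,
	 * no SYN is sent, so no listener is needed on port 80.
	 */
	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR, &repair, sizeof(repair)) ||
	    connect(s, (struct sockaddr *)&addr, sizeof(addr)))
		goto err;
	repair = TCP_REPAIR_OFF_NO_WP;
	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR, &repair, sizeof(repair)))
		goto err;
	return s;
err:
	close(s);
	return -1;
}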