Commit 1eb3dee1 authored by Jakub Kicinski's avatar Jakub Kicinski

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2023-10-02

We've added 11 non-merge commits during the last 12 day(s) which contain
a total of 12 files changed, 176 insertions(+), 41 deletions(-).

The main changes are:

1) Fix BPF verifier to reset backtrack_state masks on global function
   exit as otherwise subsequent precision tracking would reuse them,
   from Andrii Nakryiko.

2) Several sockmap fixes for available bytes accounting,
   from John Fastabend.

3) Reject sk_msg egress redirects to non-TCP sockets given this
   is only supported for TCP sockets today, from Jakub Sitnicki.

4) Fix a syzkaller splat in bpf_mprog when hitting maximum program
   limits with BPF_F_BEFORE directive, from Daniel Borkmann
   and Nikolay Aleksandrov.

5) Fix BPF memory allocator to use kmalloc_size_roundup() to adjust
   size_index for selecting a bpf_mem_cache, from Hou Tao.

6) Fix arch_prepare_bpf_trampoline return code for s390 JIT,
   from Song Liu.

7) Fix bpf_trampoline_get when CONFIG_BPF_JIT is turned off,
   from Leon Hwang.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  bpf: Use kmalloc_size_roundup() to adjust size_index
  selftest/bpf: Add various selftests for program limits
  bpf, mprog: Fix maximum program check on mprog attachment
  bpf, sockmap: Reject sk_msg egress redirects to non-TCP sockets
  bpf, sockmap: Add tests for MSG_F_PEEK
  bpf, sockmap: Do not inc copied_seq when PEEK flag set
  bpf: tcp_read_skb needs to pop skb regardless of seq
  bpf: unconditionally reset backtrack_state masks on global func exit
  bpf: Fix tr dereferencing
  selftests/bpf: Check bpf_cubic_acked() is called via struct_ops
  s390/bpf: Let arch_prepare_bpf_trampoline return program size
====================

Link: https://lore.kernel.org/r/20231002113417.2309-1-daniel@iogearbox.netSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 51e7a666 9077fc22
......@@ -2513,7 +2513,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
return -E2BIG;
}
return ret;
return tjit.common.prg;
}
bool bpf_jit_supports_subprog_tailcalls(void)
......
......@@ -1307,7 +1307,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
static inline struct bpf_trampoline *bpf_trampoline_get(u64 key,
struct bpf_attach_target_info *tgt_info)
{
return ERR_PTR(-EOPNOTSUPP);
return NULL;
}
static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
#define DEFINE_BPF_DISPATCHER(name)
......
......@@ -965,37 +965,31 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
static __init int bpf_mem_cache_adjust_size(void)
{
unsigned int size, index;
unsigned int size;
/* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
* up-to 256-bytes.
/* Adjusting the indexes in size_index() according to the object_size
* of underlying slab cache, so bpf_mem_alloc() will select a
* bpf_mem_cache with unit_size equal to the object_size of
* the underlying slab cache.
*
* The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is
* 256-bytes, so only do adjustment for [8-bytes, 192-bytes].
*/
size = KMALLOC_MIN_SIZE;
if (size <= 192)
index = size_index[(size - 1) / 8];
else
index = fls(size - 1) - 1;
for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
size_index[(size - 1) / 8] = index;
for (size = 192; size >= 8; size -= 8) {
unsigned int kmalloc_size, index;
/* The minimal alignment is 64-bytes, so disable 96-bytes cache and
* use 128-bytes cache instead.
*/
if (KMALLOC_MIN_SIZE >= 64) {
index = size_index[(128 - 1) / 8];
for (size = 64 + 8; size <= 96; size += 8)
size_index[(size - 1) / 8] = index;
}
kmalloc_size = kmalloc_size_roundup(size);
if (kmalloc_size == size)
continue;
/* The minimal alignment is 128-bytes, so disable 192-bytes cache and
* use 256-bytes cache instead.
*/
if (KMALLOC_MIN_SIZE >= 128) {
index = fls(256 - 1) - 1;
for (size = 128 + 8; size <= 192; size += 8)
if (kmalloc_size <= 192)
index = size_index[(kmalloc_size - 1) / 8];
else
index = fls(kmalloc_size - 1) - 1;
/* Only overwrite if necessary */
if (size_index[(size - 1) / 8] != index)
size_index[(size - 1) / 8] = index;
}
......
......@@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry,
goto out;
}
idx = tidx;
} else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
ret = -ERANGE;
goto out;
}
if (flags & BPF_F_BEFORE) {
tidx = bpf_mprog_pos_before(entry, &rtuple);
......
......@@ -4047,10 +4047,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_reg_mask(bt));
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
if (reg->type != SCALAR_VALUE) {
bt_clear_reg(bt, i);
continue;
}
if (reg->type == SCALAR_VALUE)
reg->precise = true;
}
return 0;
......
......@@ -668,6 +668,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
sk = __sock_map_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
......@@ -1267,6 +1269,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
sk = __sock_hash_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
......
......@@ -1621,16 +1621,13 @@ EXPORT_SYMBOL(tcp_read_sock);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 seq = tp->copied_seq;
struct sk_buff *skb;
int copied = 0;
u32 offset;
if (sk->sk_state == TCP_LISTEN)
return -ENOTCONN;
while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
u8 tcp_flags;
int used;
......@@ -1643,14 +1640,11 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
copied = used;
break;
}
seq += used;
copied += used;
if (tcp_flags & TCPHDR_FIN) {
++seq;
if (tcp_flags & TCPHDR_FIN)
break;
}
}
return copied;
}
EXPORT_SYMBOL(tcp_read_skb);
......
......@@ -222,6 +222,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
int *addr_len)
{
struct tcp_sock *tcp = tcp_sk(sk);
int peek = flags & MSG_PEEK;
u32 seq = tcp->copied_seq;
struct sk_psock *psock;
int copied = 0;
......@@ -311,6 +312,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
copied = -EAGAIN;
}
out:
if (!peek)
WRITE_ONCE(tcp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
if (copied > 0)
......
......@@ -185,6 +185,8 @@ static void test_cubic(void)
do_test("bpf_cubic", NULL);
ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called");
bpf_link__destroy(link);
bpf_cubic__destroy(cubic_skel);
}
......
......@@ -475,6 +475,55 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
test_sockmap_drop_prog__destroy(drop);
}
static void test_sockmap_skb_verdict_peek(void)
{
int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail;
struct test_sockmap_pass_prog *pass;
char snd[256] = "0123456789";
char rcv[256] = "0";
pass = test_sockmap_pass_prog__open_and_load();
if (!ASSERT_OK_PTR(pass, "open_and_load"))
return;
verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
map = bpf_map__fd(pass->maps.sock_map_rx);
err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0);
if (!ASSERT_OK(err, "bpf_prog_attach"))
goto out;
s = socket_loopback(AF_INET, SOCK_STREAM);
if (!ASSERT_GT(s, -1, "socket_loopback(s)"))
goto out;
err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1);
if (!ASSERT_OK(err, "create_pairs(s)"))
goto out;
err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST);
if (!ASSERT_OK(err, "bpf_map_update_elem(c1)"))
goto out_close;
sent = xsend(p1, snd, sizeof(snd), 0);
ASSERT_EQ(sent, sizeof(snd), "xsend(p1)");
recvd = recv(c1, rcv, sizeof(rcv), MSG_PEEK);
ASSERT_EQ(recvd, sizeof(rcv), "recv(c1)");
err = ioctl(c1, FIONREAD, &avail);
ASSERT_OK(err, "ioctl(FIONREAD) error");
ASSERT_EQ(avail, sizeof(snd), "after peek ioctl(FIONREAD)");
recvd = recv(c1, rcv, sizeof(rcv), 0);
ASSERT_EQ(recvd, sizeof(rcv), "recv(p0)");
err = ioctl(c1, FIONREAD, &avail);
ASSERT_OK(err, "ioctl(FIONREAD) error");
ASSERT_EQ(avail, 0, "after read ioctl(FIONREAD)");
out_close:
close(c1);
close(p1);
out:
test_sockmap_pass_prog__destroy(pass);
}
void test_sockmap_basic(void)
{
if (test__start_subtest("sockmap create_update_free"))
......@@ -515,4 +564,6 @@ void test_sockmap_basic(void)
test_sockmap_skb_verdict_fionread(true);
if (test__start_subtest("sockmap skb_verdict fionread on drop"))
test_sockmap_skb_verdict_fionread(false);
if (test__start_subtest("sockmap skb_verdict msg_f_peek"))
test_sockmap_skb_verdict_peek();
}
......@@ -2378,3 +2378,87 @@ void serial_test_tc_opts_chain_mixed(void)
test_tc_chain_mixed(BPF_TCX_INGRESS);
test_tc_chain_mixed(BPF_TCX_EGRESS);
}
static int generate_dummy_prog(void)
{
const struct bpf_insn prog_insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn);
LIBBPF_OPTS(bpf_prog_load_opts, opts);
const size_t log_buf_sz = 256;
char *log_buf;
int fd = -1;
log_buf = malloc(log_buf_sz);
if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc"))
return fd;
opts.log_buf = log_buf;
opts.log_size = log_buf_sz;
log_buf[0] = '\0';
opts.log_level = 0;
fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, "tcx_prog", "GPL",
prog_insns, prog_insn_cnt, &opts);
ASSERT_STREQ(log_buf, "", "log_0");
ASSERT_GE(fd, 0, "prog_fd");
free(log_buf);
return fd;
}
static void test_tc_opts_max_target(int target, int flags, bool relative)
{
int err, ifindex, i, prog_fd, last_fd = -1;
LIBBPF_OPTS(bpf_prog_attach_opts, opta);
const int max_progs = 63;
ASSERT_OK(system("ip link add dev tcx_opts1 type veth peer name tcx_opts2"), "add veth");
ifindex = if_nametoindex("tcx_opts1");
ASSERT_NEQ(ifindex, 0, "non_zero_ifindex");
assert_mprog_count_ifindex(ifindex, target, 0);
for (i = 0; i < max_progs; i++) {
prog_fd = generate_dummy_prog();
if (!ASSERT_GE(prog_fd, 0, "dummy_prog"))
goto cleanup;
err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta);
if (!ASSERT_EQ(err, 0, "prog_attach"))
goto cleanup;
assert_mprog_count_ifindex(ifindex, target, i + 1);
if (i == max_progs - 1 && relative)
last_fd = prog_fd;
else
close(prog_fd);
}
prog_fd = generate_dummy_prog();
if (!ASSERT_GE(prog_fd, 0, "dummy_prog"))
goto cleanup;
opta.flags = flags;
if (last_fd > 0)
opta.relative_fd = last_fd;
err = bpf_prog_attach_opts(prog_fd, ifindex, target, &opta);
ASSERT_EQ(err, -ERANGE, "prog_64_attach");
assert_mprog_count_ifindex(ifindex, target, max_progs);
close(prog_fd);
cleanup:
if (last_fd > 0)
close(last_fd);
ASSERT_OK(system("ip link del dev tcx_opts1"), "del veth");
ASSERT_EQ(if_nametoindex("tcx_opts1"), 0, "dev1_removed");
ASSERT_EQ(if_nametoindex("tcx_opts2"), 0, "dev2_removed");
}
void serial_test_tc_opts_max(void)
{
test_tc_opts_max_target(BPF_TCX_INGRESS, 0, false);
test_tc_opts_max_target(BPF_TCX_EGRESS, 0, false);
test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_BEFORE, false);
test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_BEFORE, true);
test_tc_opts_max_target(BPF_TCX_INGRESS, BPF_F_AFTER, true);
test_tc_opts_max_target(BPF_TCX_EGRESS, BPF_F_AFTER, false);
}
......@@ -490,6 +490,8 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay)
}
}
int bpf_cubic_acked_called = 0;
void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
const struct ack_sample *sample)
{
......@@ -497,6 +499,7 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
struct bictcp *ca = inet_csk_ca(sk);
__u32 delay;
bpf_cubic_acked_called = 1;
/* Some calls are for duplicates without timetamps */
if (sample->rtt_us < 0)
return;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment