Commit 5ee35abb authored by Alexei Starovoitov

Merge branch 'bpf: Remove recursion check for struct_ops prog'

Martin KaFai Lau says:

====================

From: Martin KaFai Lau <martin.lau@kernel.org>

The struct_ops is sharing the tracing trampoline's enter/exit
functions, which track prog->active to avoid recursion.  It turns
out a struct_ops bpf prog can hit this prog->active check and be
unnecessarily skipped.  eg. '.ssthresh' may run in_task() and then
be interrupted by a softirq that runs the same '.ssthresh'.

The kernel does not call the tcp-cc's ops recursively,
so this set removes the recursion check for struct_ops progs.
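
To see why the shared enter path skips a struct_ops prog, here is a
minimal sketch of the prog->active recursion guard it uses (simplified
from __bpf_prog_enter() in kernel/bpf/trampoline.c; run_ctx handling
and miss accounting are trimmed, and the function name is illustrative):

	/* Each prog has a per-cpu 'active' counter.  If a softirq runs the
	 * same struct_ops prog while an in_task() run is in flight on this
	 * cpu, the counter is already 1, the increment returns 2, and the
	 * prog run is skipped -- even though nothing actually recursed.
	 */
	static u64 prog_enter_recur_sketch(struct bpf_prog *prog)
	{
		rcu_read_lock();
		migrate_disable();
		if (unlikely(this_cpu_inc_return(*(prog->active)) != 1))
			return 0;	/* caller skips the prog run */
		return bpf_prog_start_time();
	}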

v3:
- Clear bpf_chg_cc_inprogress in the newly cloned tcp_sock
  in tcp_create_openreq_child(), because the listen sk can
  be cloned without the lock being held. (Eric Dumazet)

v2:
- v1 [0] turned into a long discussion on a few cases, and also on
  whether it needs to follow the bpf_run_ctx chain when a tracing
  bpf_run_ctx (kprobe/trace/trampoline) is running in between.

  That is a good signal that the behavior is not obvious enough to
  reason about, and that a tradeoff for a more straightforward
  approach is needed.

  This revision uses one bit out of an existing 1-byte hole
  in the tcp_sock.  It is in Patch 4.

  [0]: https://lore.kernel.org/bpf/20220922225616.3054840-1-kafai@fb.com/T/#md98d40ac5ec295fdadef476c227a3401b2b6b911
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 8526f0d6 3411c5b6
arch/x86/net/bpf_jit_comp.c
@@ -1836,6 +1836,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (p->aux->sleepable) {
 		enter = __bpf_prog_enter_sleepable;
 		exit = __bpf_prog_exit_sleepable;
+	} else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) {
+		enter = __bpf_prog_enter_struct_ops;
+		exit = __bpf_prog_exit_struct_ops;
 	} else if (p->expected_attach_type == BPF_LSM_CGROUP) {
 		enter = __bpf_prog_enter_lsm_cgroup;
 		exit = __bpf_prog_exit_lsm_cgroup;

include/linux/bpf.h
@@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
 					struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
 					struct bpf_tramp_run_ctx *run_ctx);
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx);
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);

include/linux/tcp.h
@@ -388,6 +388,12 @@ struct tcp_sock {
 	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
					 * values defined in uapi/linux/tcp.h
					 */
+	u8	bpf_chg_cc_inprogress:1; /* In the middle of
+					  * bpf_setsockopt(TCP_CONGESTION),
+					  * it is to avoid the bpf_tcp_cc->init()
+					  * to recur itself by calling
+					  * bpf_setsockopt(TCP_CONGESTION, "itself").
+					  */
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
 #else
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0

kernel/bpf/trampoline.c
@@ -964,6 +964,29 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
 	rcu_read_unlock_trace();
 }
 
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+	return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__releases(RCU)
+{
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+	update_prog_stats(prog, start);
+	migrate_enable();
+	rcu_read_unlock();
+}
+
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
 {
 	percpu_ref_get(&tr->pcref);

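With this pair in place, the trampoline generated for a struct_ops prog
conceptually behaves like the sketch below; this is illustrative C, not
the emitted assembly, and 'args'/'ret' are placeholder names:

	struct bpf_tramp_run_ctx run_ctx;
	u64 start;

	/* no prog->active load/increment in the struct_ops enter path */
	start = __bpf_prog_enter_struct_ops(prog, &run_ctx);
	if (start)	/* a timestamp or NO_START_TIME, never 0 */
		ret = prog->bpf_func(args, prog->insnsi);
	__bpf_prog_exit_struct_ops(prog, start, &run_ctx);

so the struct_ops prog always runs, while rcu_read_lock() and
migrate_disable() still provide the same execution context as the other
enter/exit variants.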
net/core/filter.c
@@ -5102,6 +5102,59 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 	return 0;
 }
 
+static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
+				      int *optlen, bool getopt)
+{
+	struct tcp_sock *tp;
+	int ret;
+
+	if (*optlen < 2)
+		return -EINVAL;
+
+	if (getopt) {
+		if (!inet_csk(sk)->icsk_ca_ops)
+			return -EINVAL;
+
+		/* BPF expects NULL-terminated tcp-cc string */
+		optval[--(*optlen)] = '\0';
+		return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+					 KERNEL_SOCKPTR(optval),
+					 KERNEL_SOCKPTR(optlen));
+	}
+
+	/* "cdg" is the only cc that alloc a ptr
+	 * in inet_csk_ca area.  The bpf-tcp-cc may
+	 * overwrite this ptr after switching to cdg.
+	 */
+	if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
+		return -ENOTSUPP;
+
+	/* It stops this looping
+	 *
+	 * .init => bpf_setsockopt(tcp_cc) => .init =>
+	 * bpf_setsockopt(tcp_cc)" => .init => ....
+	 *
+	 * The second bpf_setsockopt(tcp_cc) is not allowed
+	 * in order to break the loop when both .init
+	 * are the same bpf prog.
+	 *
+	 * This applies even the second bpf_setsockopt(tcp_cc)
+	 * does not cause a loop.  This limits only the first
+	 * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
+	 * pick a fallback cc (eg. peer does not support ECN)
+	 * and the second '.init' cannot fallback to
+	 * another.
+	 */
+	tp = tcp_sk(sk);
+	if (tp->bpf_chg_cc_inprogress)
+		return -EBUSY;
+
+	tp->bpf_chg_cc_inprogress = 1;
+	ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				KERNEL_SOCKPTR(optval), *optlen);
+	tp->bpf_chg_cc_inprogress = 0;
+	return ret;
+}
+
 static int sol_tcp_sockopt(struct sock *sk, int optname,
 			   char *optval, int *optlen,
 			   bool getopt)
@@ -5125,9 +5178,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 			return -EINVAL;
 		break;
 	case TCP_CONGESTION:
-		if (*optlen < 2)
-			return -EINVAL;
-		break;
+		return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
 	case TCP_SAVED_SYN:
 		if (*optlen < 1)
 			return -EINVAL;
@@ -5152,13 +5203,6 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 			return 0;
 		}
 
-		if (optname == TCP_CONGESTION) {
-			if (!inet_csk(sk)->icsk_ca_ops)
-				return -EINVAL;
-			/* BPF expects NULL-terminated tcp-cc string */
-			optval[--(*optlen)] = '\0';
-		}
-
 		return do_tcp_getsockopt(sk, SOL_TCP, optname,
 					 KERNEL_SOCKPTR(optval),
 					 KERNEL_SOCKPTR(optlen));
@@ -5285,12 +5329,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
 	   int, optname, char *, optval, int, optlen)
 {
-	if (level == SOL_TCP && optname == TCP_CONGESTION) {
-		if (optlen >= sizeof("cdg") - 1 &&
-		    !strncmp("cdg", optval, optlen))
-			return -ENOTSUPP;
-	}
-
 	return _bpf_setsockopt(sk, level, optname, optval, optlen);
 }

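Taken together, sol_tcp_sockopt_congestion() still allows the pattern a
bpf-tcp-cc legitimately needs: the first '.init' may switch to a
fallback cc once, while any '.init' reached recursively through that
bpf_setsockopt(TCP_CONGESTION) gets -EBUSY.  A sketch modeled on the
bpf_dctcp selftest; the prog name 'ca_init' and the fallback choice are
illustrative, and bpf_tcp_helpers.h is assumed to provide tcp_sk(),
TCP_ECN_OK and SOL_TCP as it does for the selftests:

	#include <linux/types.h>
	#include <linux/tcp.h>
	#include <errno.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>
	#include "bpf_tcp_helpers.h"

	char _license[] SEC("license") = "GPL";

	SEC("struct_ops/ca_init")
	void BPF_PROG(ca_init, struct sock *sk)
	{
		const char fallback[] = "cubic";
		struct tcp_sock *tp = tcp_sk(sk);
		int err;

		if (!(tp->ecn_flags & TCP_ECN_OK)) {
			/* First .init: bpf_chg_cc_inprogress is clear,
			 * so this one-level fallback is allowed.
			 */
			err = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
					     (void *)fallback, sizeof(fallback));
			/* If this .init itself recurred from another
			 * bpf_setsockopt(TCP_CONGESTION), err is -EBUSY
			 * and the switch is refused.
			 */
		}
	}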
net/ipv4/tcp_minisocks.c
@@ -541,6 +541,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->fastopen_req = NULL;
 	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
 
+	newtp->bpf_chg_cc_inprogress = 0;
 	tcp_bpf_clone(sk, newsk);
 
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -290,6 +290,10 @@ static void test_dctcp_fallback(void)
 		goto done;
 	ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res");
 	ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res");
+	/* All setsockopt(TCP_CONGESTION) in the recurred
+	 * bpf_dctcp->init() should fail with -EBUSY.
+	 */
+	ASSERT_EQ(dctcp_skel->bss->ebusy_cnt, 3, "ebusy_cnt");
 
 	err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len);
 	if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)"))

tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/tcp.h>
+#include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_tcp_helpers.h"
@@ -23,6 +24,7 @@ const char tcp_cdg[] = "cdg";
 char cc_res[TCP_CA_NAME_MAX];
 int tcp_cdg_res = 0;
 int stg_result = 0;
+int ebusy_cnt = 0;
 
 struct {
 	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
@@ -64,16 +66,23 @@ void BPF_PROG(dctcp_init, struct sock *sk)
 
 	if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
 		/* Switch to fallback */
-		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
-			       (void *)fallback, sizeof(fallback));
-		/* Switch back to myself which the bpf trampoline
-		 * stopped calling dctcp_init recursively.
+		if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				   (void *)fallback, sizeof(fallback)) == -EBUSY)
+			ebusy_cnt++;
+
+		/* Switch back to myself and the recurred dctcp_init()
+		 * will get -EBUSY for all bpf_setsockopt(TCP_CONGESTION),
+		 * except the last "cdg" one.
 		 */
-		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
-			       (void *)bpf_dctcp, sizeof(bpf_dctcp));
+		if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				   (void *)bpf_dctcp, sizeof(bpf_dctcp)) == -EBUSY)
+			ebusy_cnt++;
+
 		/* Switch back to fallback */
-		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
-			       (void *)fallback, sizeof(fallback));
+		if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				   (void *)fallback, sizeof(fallback)) == -EBUSY)
+			ebusy_cnt++;
+
 		/* Expecting -ENOTSUPP for tcp_cdg_res */
 		tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
 					     (void *)tcp_cdg, sizeof(tcp_cdg));