Commit 1cf4a0cc authored by Daniel Borkmann

Merge branch 'bpf-sockmap-fixes-and-improvements'

John Fastabend says:

====================
Set of bpf fixes and improvements to make sockmap with kTLS usable
with "real" applications. This set came as the fallout of pulling
kTLS+sockmap into Cilium[1] and running it in a container environment.

Roughly broken into three parts,

Patches 1-3: resolve/improve handling of the size field in sk_msg_md
             (see the program sketch below)
Patch     4: it became difficult to use this in Cilium when the
             SK_PASS verdict was not correctly handled. So handle
             the case correctly.
Patches 5-8: Set of issues found while running OpenSSL TX kTLS
             enabled applications. This resolves the most obvious
             issues and gets applications using kTLS TX up and
             running with sock{map|hash}.
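
A minimal sk_msg program sketch (illustrative only, not taken from
these patches; program and section names are made up) that reads the
sk_msg_md 'size' field covered by patches 1-3 and returns the SK_PASS
verdict fixed by patch 4:

  /* SEC() is normally provided by bpf_helpers.h; defined here so the
   * sketch stays self-contained.
   */
  #include <linux/bpf.h>

  #define SEC(name) __attribute__((section(name), used))

  SEC("sk_msg")
  int msg_verdict_prog(struct sk_msg_md *msg)
  {
          /* 'size' reports the total number of bytes in the message */
          if (msg->size == 0)
                  return SK_DROP;

          /* With patch 4 applied, SK_PASS delivers the data instead of
           * stalling the receiving application.
           */
          return SK_PASS;
  }

  char _license[] SEC("license") = "GPL";

Such a program is attached to the sockmap/sockhash with the
BPF_SK_MSG_VERDICT attach type, e.g. via
bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0).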

Other than the "sk_msg, zap ingress queue on psock down" (PATCH 6/8)
which can potentially cause a WARNING the issues fixed in this
series do not cause kernel side warnings, BUG, etc. but instead
cause stalls and other odd behavior in the user space applications
when using kTLS with BPF policies applied.
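
For reference, an application enables kTLS TX roughly as follows
(a minimal sketch based on Documentation/networking/tls.rst; the
helper name is made up, and the key/IV/salt/record-sequence values
must come from the application's completed TLS handshake):

  #include <string.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <netinet/tcp.h>
  #include <linux/tls.h>

  #ifndef SOL_TLS
  #define SOL_TLS 282     /* not exposed by older libc headers */
  #endif
  #ifndef TCP_ULP
  #define TCP_ULP 31
  #endif

  static int enable_ktls_tx(int fd, const unsigned char *key,
                            const unsigned char *iv,
                            const unsigned char *salt,
                            const unsigned char *rec_seq)
  {
          struct tls12_crypto_info_aes_gcm_128 ci = {
                  .info = {
                          .version = TLS_1_2_VERSION,
                          .cipher_type = TLS_CIPHER_AES_GCM_128,
                  },
          };

          memcpy(ci.key, key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
          memcpy(ci.iv, iv, TLS_CIPHER_AES_GCM_128_IV_SIZE);
          memcpy(ci.salt, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
          memcpy(ci.rec_seq, rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);

          /* Attach the tls ULP, then install TX crypto state; data sent
           * afterwards is encrypted by the kernel (kTLS TX), which is
           * the path the BPF policies above apply to.
           */
          if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")))
                  return -1;
          return setsockopt(fd, SOL_TLS, TLS_TX, &ci, sizeof(ci));
  }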

Primarily tested with 'curl' compiled against the latest OpenSSL,
and with 'openssl s_client/s_server' containers using the Cilium
network plugin with docker/k8s. Some basic testing with httpd was
also done. Cilium CI tests will be added shortly to cover these
cases as well. We can also now run 'wrk' and other test and
benchmarking tools.

We have two more sets of patches currently under testing that
will be sent shortly to address a few more issues. First, the
OpenSSL RX kTLS side breaks when both sk_msg and sk_skb_verdict
programs are used with kTLS: the sk_skb_verdict programs are
not enforced. Second, skmsg needs to call into the TCP stack to
indicate consumed data.
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 77ea5f4c 28cb6f1e
@@ -36,9 +36,7 @@ struct sk_msg_sg {
struct scatterlist data[MAX_MSG_FRAGS + 1];
};
/* UAPI in filter.c depends on struct sk_msg_sg being first element. If
* this is moved filter.c also must be updated.
*/
/* UAPI in filter.c depends on struct sk_msg_sg being first element. */
struct sk_msg {
struct sk_msg_sg sg;
void *data;
@@ -419,6 +417,14 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
sk_psock_drop(sk, psock);
}
static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
{
if (psock->parser.enabled)
psock->parser.saved_data_ready(sk);
else
sk->sk_data_ready(sk);
}
static inline void psock_set_prog(struct bpf_prog **pprog,
struct bpf_prog *prog)
{
@@ -286,6 +286,7 @@ struct ucred {
#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */
#define MSG_MORE 0x8000 /* Sender will send more */
#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */
#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */
#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
#define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */
#define MSG_EOF MSG_FIN
@@ -454,6 +454,15 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx)
return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx;
}
static inline bool tls_sw_has_ctx_tx(const struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
if (!ctx)
return false;
return !!tls_sw_ctx_tx(ctx);
}
static inline struct tls_offload_context_rx *
tls_offload_ctx_rx(const struct tls_context *tls_ctx)
{
@@ -6313,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size,
if (type == BPF_WRITE)
return false;
if (off % size != 0)
return false;
switch (off) {
case offsetof(struct sk_msg_md, data):
info->reg_type = PTR_TO_PACKET;
@@ -6324,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
default:
case bpf_ctx_range(struct sk_msg_md, family):
case bpf_ctx_range(struct sk_msg_md, remote_ip4):
case bpf_ctx_range(struct sk_msg_md, local_ip4):
case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
case bpf_ctx_range(struct sk_msg_md, remote_port):
case bpf_ctx_range(struct sk_msg_md, local_port):
case bpf_ctx_range(struct sk_msg_md, size):
if (size != sizeof(__u32))
return false;
}
if (off < 0 || off >= sizeof(struct sk_msg_md))
return false;
if (off % size != 0)
break;
default:
return false;
}
return true;
}
@@ -7418,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
int off;
#endif
/* convert ctx uses the fact sg element is first in struct */
BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
switch (si->off) {
case offsetof(struct sk_msg_md, data):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
@@ -403,7 +403,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
msg->skb = skb;
sk_psock_queue_msg(psock, msg);
sk->sk_data_ready(sk);
sk_psock_data_ready(sk, psock);
return copied;
}
@@ -572,6 +572,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
rcu_assign_sk_user_data(sk, NULL);
sk_psock_cork_free(psock);
sk_psock_zap_ingress(psock);
sk_psock_restore_proto(sk, psock);
write_lock_bh(&sk->sk_callback_lock);
@@ -669,6 +670,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
bool ingress;
switch (verdict) {
case __SK_PASS:
sk_other = psock->sk;
if (sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
goto out_free;
}
if (atomic_read(&sk_other->sk_rmem_alloc) <=
sk_other->sk_rcvbuf) {
struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
tcp->bpf.flags |= BPF_F_INGRESS;
skb_queue_tail(&psock->ingress_skb, skb);
schedule_work(&psock->work);
break;
}
goto out_free;
case __SK_REDIRECT:
sk_other = tcp_skb_bpf_redirect_fetch(skb);
if (unlikely(!sk_other))
@@ -735,7 +752,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
}
/* Called with socket lock held. */
static void sk_psock_data_ready(struct sock *sk)
static void sk_psock_strp_data_ready(struct sock *sk)
{
struct sk_psock *psock;
@@ -783,7 +800,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
return;
parser->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_data_ready;
sk->sk_data_ready = sk_psock_strp_data_ready;
sk->sk_write_space = sk_psock_write_space;
parser->enabled = true;
}
@@ -8,6 +8,7 @@
#include <linux/wait.h>
#include <net/inet_common.h>
#include <net/tls.h>
static bool tcp_bpf_stream_read(const struct sock *sk)
{
@@ -198,7 +199,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
msg->sg.start = i;
msg->sg.size -= apply_bytes;
sk_psock_queue_msg(psock, tmp);
sk->sk_data_ready(sk);
sk_psock_data_ready(sk, psock);
} else {
sk_msg_free(sk, tmp);
kfree(tmp);
@@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
u32 off;
while (1) {
bool has_tx_ulp;
sge = sk_msg_elem(msg, msg->sg.start);
size = (apply && apply_bytes < sge->length) ?
apply_bytes : sge->length;
@@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
tcp_rate_check_app_limited(sk);
retry:
ret = do_tcp_sendpages(sk, page, off, size, flags);
has_tx_ulp = tls_sw_has_ctx_tx(sk);
if (has_tx_ulp) {
flags |= MSG_SENDPAGE_NOPOLICY;
ret = kernel_sendpage_locked(sk,
page, off, size, flags);
} else {
ret = do_tcp_sendpages(sk, page, off, size, flags);
}
if (ret <= 0)
return ret;
if (apply)
@@ -55,6 +55,8 @@ enum {
static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
static struct proto *saved_tcpv4_prot;
static DEFINE_MUTEX(tcpv4_prot_mutex);
static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_mutex);
static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
@@ -690,6 +692,16 @@ static int tls_init(struct sock *sk)
mutex_unlock(&tcpv6_prot_mutex);
}
if (ip_ver == TLSV4 &&
unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) {
mutex_lock(&tcpv4_prot_mutex);
if (likely(sk->sk_prot != saved_tcpv4_prot)) {
build_protos(tls_prots[TLSV4], sk->sk_prot);
smp_store_release(&saved_tcpv4_prot, sk->sk_prot);
}
mutex_unlock(&tcpv4_prot_mutex);
}
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
update_sk_prot(sk, ctx);
@@ -721,8 +733,6 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
static int __init tls_register(void)
{
build_protos(tls_prots[TLSV4], &tcp_prot);
tls_sw_proto_ops = inet_stream_ops;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
@@ -686,12 +686,13 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
struct sk_psock *psock;
struct sock *sk_redir;
struct tls_rec *rec;
bool enospc, policy;
int err = 0, send;
u32 delta = 0;
bool enospc;
policy = !(flags & MSG_SENDPAGE_NOPOLICY);
psock = sk_psock_get(sk);
if (!psock)
if (!psock || !policy)
return tls_push_record(sk, flags, record_type);
more_data:
enospc = sk_msg_full(msg);
@@ -1017,8 +1018,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
return copied ? copied : ret;
}
int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
int tls_sw_do_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
@@ -1033,15 +1034,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
int ret = 0;
bool eor;
if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_SENDPAGE_NOTLAST))
return -ENOTSUPP;
/* No MSG_EOR from splice, only look at MSG_MORE */
eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST));
lock_sock(sk);
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
/* Wait till there is any pending write on socket */
@@ -1145,10 +1138,34 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
}
sendpage_end:
ret = sk_stream_error(sk, flags, ret);
release_sock(sk);
return copied ? copied : ret;
}
int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
return -ENOTSUPP;
return tls_sw_do_sendpage(sk, page, offset, size, flags);
}
int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
int ret;
if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
return -ENOTSUPP;
lock_sock(sk);
ret = tls_sw_do_sendpage(sk, page, offset, size, flags);
release_sock(sk);
return ret;
}
static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
int flags, long timeo, int *err)
{
@@ -1879,7 +1879,7 @@ static struct bpf_test tests[] = {
offsetof(struct sk_msg_md, size) + 4),
BPF_EXIT_INSN(),
},
.errstr = "R0 !read_ok",
.errstr = "invalid bpf_context access",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SK_MSG,
},