Commit d105fa98 authored by Alexei Starovoitov

Merge branch 'skb_sk-sk_fullsock-tcp_sock'

Martin KaFai Lau says:

====================
This series adds __sk_buff->sk, "struct bpf_tcp_sock",
BPF_FUNC_sk_fullsock and BPF_FUNC_tcp_sock.  Together, they provide
a common way to expose the members of "struct tcp_sock" and
"struct bpf_sock" for the bpf_prog to access.

The patch series first adds a bpf_sock pointer to __sk_buff
and a new helper BPF_FUNC_sk_fullsock.

It then adds BPF_FUNC_tcp_sock to get a bpf_tcp_sock
pointer from a bpf_sock pointer.

The current use case is to allow a cg_skb_bpf_prog to provide
per-cgroup traffic policing/shaping.

Please see the individual patches for details.

v2:
- Patch 1 depends on
  commit d6238766 ("bpf: Fix narrow load on a bpf_sock returned from sk_lookup()")
  in the bpf branch.
- Add sk_to_full_sk() to bpf_sk_fullsock() and bpf_tcp_sock()
  such that there is a way to access the listener's sk and tcp_sk
  when __sk_buff->sk is a request_sock.
  The comments in the uapi bpf.h are updated accordingly.
- bpf_ctx_range_till() is used in bpf_sock_common_is_valid_access()
  in patch 1.  Saved a few lines.
- Patch 2 is new in v2 and it adds "state", "dst_ip4", "dst_ip6" and
  "dst_port" to the bpf_sock.  Narrow load is allowed on them.
  The "state" (i.e. sk_state) has already been used in
  INET_DIAG (e.g. ss -t) and getsockopt(TCP_INFO).
- While at it in the new patch 2, also allow narrow load on some
  existing fields of the bpf_sock, which are "family", "type", "protocol"
  and "src_port".  Only allow loading from first byte for now.
  i.e. does not allow narrow load starting from the 2nd byte.
- Add some narrow load tests to the test_verifier's sock.c
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 5f456649 e0b27b3f
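
For orientation before the diff, here is a minimal, hypothetical sketch of the
per-cgroup policing use case described above.  The program name, section name
and retransmit threshold are illustrative only; bpf_sk_fullsock() and
bpf_tcp_sock() are the helpers this series adds (their bpf_helpers.h
declarations appear in a hunk below):

/* Illustrative sketch only (not part of this series): drop egress packets
 * for a cgroup once the TCP connection has seen more than an arbitrary
 * number of retransmits.
 */
#include <linux/bpf.h>
#include <netinet/in.h>
#include "bpf_helpers.h"

SEC("cgroup_skb/egress")
int cg_skb_policer(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;	/* may be NULL or not a fullsock */
	struct bpf_tcp_sock *tp;

	if (!sk)
		return 1;		/* 1 == allow */

	/* bpf_sk_fullsock() also maps a request_sock to its listener
	 * via sk_to_full_sk(), per the v2 note above.
	 */
	sk = bpf_sk_fullsock(sk);
	if (!sk || sk->protocol != IPPROTO_TCP)
		return 1;

	tp = bpf_tcp_sock(sk);
	if (!tp)
		return 1;

	return tp->total_retrans > 100 ? 0 : 1;	/* 0 == drop */
}

char _license[] SEC("license") = "GPL";
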
......@@ -194,6 +194,7 @@ enum bpf_arg_type {
ARG_ANYTHING, /* any (initialized) argument is ok */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */
ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
};
/* type of values returned from helper functions */
......@@ -203,6 +204,7 @@ enum bpf_return_type {
RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */
RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */
RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */
};
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
......@@ -256,6 +258,10 @@ enum bpf_reg_type {
PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
PTR_TO_SOCKET, /* reg points to struct bpf_sock */
PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
PTR_TO_SOCK_COMMON, /* reg points to sock_common */
PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
};
/* The information passed from prog-specific *_is_valid_access
......@@ -920,6 +926,9 @@ void bpf_user_rnd_init_once(void);
u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
#if defined(CONFIG_NET)
bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info);
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info);
u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
......@@ -928,6 +937,12 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog,
u32 *target_size);
#else
static inline bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
return false;
}
static inline bool bpf_sock_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
......@@ -944,4 +959,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
}
#endif
#ifdef CONFIG_INET
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info);
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog,
u32 *target_size);
#else
static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
return false;
}
static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog,
u32 *target_size)
{
return 0;
}
#endif /* CONFIG_INET */
#endif /* _LINUX_BPF_H */
......@@ -2329,6 +2329,23 @@ union bpf_attr {
* "**y**".
* Return
* 0
*
* struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
* Description
* This helper gets a **struct bpf_sock** pointer such
* that all the fields in bpf_sock can be accessed.
* Return
* A **struct bpf_sock** pointer on success, or NULL in
* case of failure.
*
* struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
* Description
* This helper gets a **struct bpf_tcp_sock** pointer from a
* **struct bpf_sock** pointer.
*
* Return
* A **struct bpf_tcp_sock** pointer on success, or NULL in
* case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2425,7 +2442,9 @@ union bpf_attr {
FN(msg_pop_data), \
FN(rc_pointer_rel), \
FN(spin_lock), \
FN(spin_unlock),
FN(spin_unlock), \
FN(sk_fullsock), \
FN(tcp_sock),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2545,6 +2564,7 @@ struct __sk_buff {
__u64 tstamp;
__u32 wire_len;
__u32 gso_segs;
__bpf_md_ptr(struct bpf_sock *, sk);
};
struct bpf_tunnel_key {
......@@ -2596,14 +2616,52 @@ struct bpf_sock {
__u32 protocol;
__u32 mark;
__u32 priority;
/* IP address also allows 1 and 2 bytes access */
__u32 src_ip4;
__u32 src_ip6[4];
__u32 src_port; /* host byte order */
__u32 dst_port; /* network byte order */
__u32 dst_ip4;
__u32 dst_ip6[4];
__u32 state;
};
struct bpf_tcp_sock {
__u32 snd_cwnd; /* Sending congestion window */
__u32 srtt_us; /* smoothed round trip time << 3 in usecs */
__u32 rtt_min;
__u32 snd_ssthresh; /* Slow start size threshold */
__u32 rcv_nxt; /* What we want to receive next */
__u32 snd_nxt; /* Next sequence we send */
__u32 snd_una; /* First byte we want an ack for */
__u32 mss_cache; /* Cached effective mss, not including SACKS */
__u32 ecn_flags; /* ECN status bits. */
__u32 rate_delivered; /* saved rate sample: packets delivered */
__u32 rate_interval_us; /* saved rate sample: time elapsed */
__u32 packets_out; /* Packets which are "in flight" */
__u32 retrans_out; /* Retransmitted packets out */
__u32 total_retrans; /* Total retransmits for entire connection */
__u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
* total number of segments in.
*/
__u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
* total number of data segments in.
*/
__u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
* The total number of segments sent.
*/
__u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
* total number of data segments sent.
*/
__u32 lost_out; /* Lost packets */
__u32 sacked_out; /* SACK'd packets */
__u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
* sum(delta(rcv_nxt)), or how many bytes
* were acked.
*/
__u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
};
......
This diff is collapsed.
This diff is collapsed.
......@@ -2329,6 +2329,23 @@ union bpf_attr {
* "**y**".
* Return
* 0
*
* struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
* Description
* This helper gets a **struct bpf_sock** pointer such
* that all the fields in bpf_sock can be accessed.
* Return
* A **struct bpf_sock** pointer on success, or NULL in
* case of failure.
*
* struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
* Description
* This helper gets a **struct bpf_tcp_sock** pointer from a
* **struct bpf_sock** pointer.
*
* Return
* A **struct bpf_tcp_sock** pointer on success, or NULL in
* case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2425,7 +2442,9 @@ union bpf_attr {
FN(msg_pop_data), \
FN(rc_pointer_rel), \
FN(spin_lock), \
FN(spin_unlock),
FN(spin_unlock), \
FN(sk_fullsock), \
FN(tcp_sock),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2545,6 +2564,7 @@ struct __sk_buff {
__u64 tstamp;
__u32 wire_len;
__u32 gso_segs;
__bpf_md_ptr(struct bpf_sock *, sk);
};
struct bpf_tunnel_key {
......@@ -2596,14 +2616,52 @@ struct bpf_sock {
__u32 protocol;
__u32 mark;
__u32 priority;
/* IP address also allows 1 and 2 bytes access */
__u32 src_ip4;
__u32 src_ip6[4];
__u32 src_port; /* host byte order */
__u32 dst_port; /* network byte order */
__u32 dst_ip4;
__u32 dst_ip6[4];
__u32 state;
};
struct bpf_tcp_sock {
__u32 snd_cwnd; /* Sending congestion window */
__u32 srtt_us; /* smoothed round trip time << 3 in usecs */
__u32 rtt_min;
__u32 snd_ssthresh; /* Slow start size threshold */
__u32 rcv_nxt; /* What we want to receive next */
__u32 snd_nxt; /* Next sequence we send */
__u32 snd_una; /* First byte we want an ack for */
__u32 mss_cache; /* Cached effective mss, not including SACKS */
__u32 ecn_flags; /* ECN status bits. */
__u32 rate_delivered; /* saved rate sample: packets delivered */
__u32 rate_interval_us; /* saved rate sample: time elapsed */
__u32 packets_out; /* Packets which are "in flight" */
__u32 retrans_out; /* Retransmitted packets out */
__u32 total_retrans; /* Total retransmits for entire connection */
__u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
* total number of segments in.
*/
__u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
* total number of data segments in.
*/
__u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
* The total number of segments sent.
*/
__u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
* total number of data segments sent.
*/
__u32 lost_out; /* Lost packets */
__u32 sacked_out; /* SACK'd packets */
__u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
* sum(delta(rcv_nxt)), or how many bytes
* were acked.
*/
__u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
};
......
......@@ -23,7 +23,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \
test_netcnt test_tcpnotify_user
test_netcnt test_tcpnotify_user test_sock_fields
BPF_OBJ_FILES = \
test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \
......@@ -35,7 +35,8 @@ BPF_OBJ_FILES = \
sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \
xdp_dummy.o test_map_in_map.o test_spin_lock.o test_map_lock.o
xdp_dummy.o test_map_in_map.o test_spin_lock.o test_map_lock.o \
test_sock_fields_kern.o
# Objects are built with default compilation flags and with sub-register
# code-gen enabled.
......@@ -111,6 +112,7 @@ $(OUTPUT)/test_progs: trace_helpers.c
$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c
$(OUTPUT)/test_cgroup_storage: cgroup_helpers.c
$(OUTPUT)/test_netcnt: cgroup_helpers.c
$(OUTPUT)/test_sock_fields: cgroup_helpers.c
.PHONY: force
......
......@@ -176,6 +176,10 @@ static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) =
(void *) BPF_FUNC_spin_lock;
static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) =
(void *) BPF_FUNC_spin_unlock;
static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) =
(void *) BPF_FUNC_sk_fullsock;
static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) =
(void *) BPF_FUNC_tcp_sock;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
......
......@@ -48,4 +48,13 @@ static inline unsigned int bpf_num_possible_cpus(void)
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
#ifndef sizeof_field
#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
#endif
#ifndef offsetofend
#define offsetofend(TYPE, MEMBER) \
(offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
#endif
#endif /* __BPF_UTIL__ */
This diff is collapsed.
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/bpf.h>
#include <netinet/in.h>
#include <stdbool.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
enum bpf_array_idx {
SRV_IDX,
CLI_IDX,
__NR_BPF_ARRAY_IDX,
};
struct bpf_map_def SEC("maps") addr_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct sockaddr_in6),
.max_entries = __NR_BPF_ARRAY_IDX,
};
struct bpf_map_def SEC("maps") sock_result_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct bpf_sock),
.max_entries = __NR_BPF_ARRAY_IDX,
};
struct bpf_map_def SEC("maps") tcp_sock_result_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct bpf_tcp_sock),
.max_entries = __NR_BPF_ARRAY_IDX,
};
struct bpf_map_def SEC("maps") linum_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = 1,
};
static bool is_loopback6(__u32 *a6)
{
return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1);
}
static void skcpy(struct bpf_sock *dst,
const struct bpf_sock *src)
{
dst->bound_dev_if = src->bound_dev_if;
dst->family = src->family;
dst->type = src->type;
dst->protocol = src->protocol;
dst->mark = src->mark;
dst->priority = src->priority;
dst->src_ip4 = src->src_ip4;
dst->src_ip6[0] = src->src_ip6[0];
dst->src_ip6[1] = src->src_ip6[1];
dst->src_ip6[2] = src->src_ip6[2];
dst->src_ip6[3] = src->src_ip6[3];
dst->src_port = src->src_port;
dst->dst_ip4 = src->dst_ip4;
dst->dst_ip6[0] = src->dst_ip6[0];
dst->dst_ip6[1] = src->dst_ip6[1];
dst->dst_ip6[2] = src->dst_ip6[2];
dst->dst_ip6[3] = src->dst_ip6[3];
dst->dst_port = src->dst_port;
dst->state = src->state;
}
static void tpcpy(struct bpf_tcp_sock *dst,
const struct bpf_tcp_sock *src)
{
dst->snd_cwnd = src->snd_cwnd;
dst->srtt_us = src->srtt_us;
dst->rtt_min = src->rtt_min;
dst->snd_ssthresh = src->snd_ssthresh;
dst->rcv_nxt = src->rcv_nxt;
dst->snd_nxt = src->snd_nxt;
dst->snd_una = src->snd_una;
dst->mss_cache = src->mss_cache;
dst->ecn_flags = src->ecn_flags;
dst->rate_delivered = src->rate_delivered;
dst->rate_interval_us = src->rate_interval_us;
dst->packets_out = src->packets_out;
dst->retrans_out = src->retrans_out;
dst->total_retrans = src->total_retrans;
dst->segs_in = src->segs_in;
dst->data_segs_in = src->data_segs_in;
dst->segs_out = src->segs_out;
dst->data_segs_out = src->data_segs_out;
dst->lost_out = src->lost_out;
dst->sacked_out = src->sacked_out;
dst->bytes_received = src->bytes_received;
dst->bytes_acked = src->bytes_acked;
}
#define RETURN { \
linum = __LINE__; \
bpf_map_update_elem(&linum_map, &idx0, &linum, 0); \
return 1; \
}
SEC("cgroup_skb/egress")
int read_sock_fields(struct __sk_buff *skb)
{
__u32 srv_idx = SRV_IDX, cli_idx = CLI_IDX, idx;
struct sockaddr_in6 *srv_sa6, *cli_sa6;
struct bpf_tcp_sock *tp, *tp_ret;
struct bpf_sock *sk, *sk_ret;
__u32 linum, idx0 = 0;
sk = skb->sk;
if (!sk || sk->state == 10)
RETURN;
sk = bpf_sk_fullsock(sk);
if (!sk || sk->family != AF_INET6 || sk->protocol != IPPROTO_TCP ||
!is_loopback6(sk->src_ip6))
RETURN;
tp = bpf_tcp_sock(sk);
if (!tp)
RETURN;
srv_sa6 = bpf_map_lookup_elem(&addr_map, &srv_idx);
cli_sa6 = bpf_map_lookup_elem(&addr_map, &cli_idx);
if (!srv_sa6 || !cli_sa6)
RETURN;
if (sk->src_port == bpf_ntohs(srv_sa6->sin6_port))
idx = srv_idx;
else if (sk->src_port == bpf_ntohs(cli_sa6->sin6_port))
idx = cli_idx;
else
RETURN;
sk_ret = bpf_map_lookup_elem(&sock_result_map, &idx);
tp_ret = bpf_map_lookup_elem(&tcp_sock_result_map, &idx);
if (!sk_ret || !tp_ret)
RETURN;
skcpy(sk_ret, sk);
tpcpy(tp_ret, tp);
RETURN;
}
char _license[] SEC("license") = "GPL";
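
A hypothetical user-space counterpart (the actual test_sock_fields.c lives in
a collapsed diff above): once the object is loaded and the fd of
tcp_sock_result_map is known, the snapshots stored by the program can be read
back with libbpf's bpf_map_lookup_elem().  The function below is only a
sketch, assuming that map fd is already available:

/* Hypothetical reader for the result map filled in by read_sock_fields()
 * above; assumes the caller obtained tcp_sock_result_fd via libbpf after
 * loading test_sock_fields_kern.o.
 */
#include <stdio.h>
#include <linux/bpf.h>		/* struct bpf_tcp_sock */
#include <bpf/bpf.h>		/* bpf_map_lookup_elem() */

static int dump_srv_tcp_sock(int tcp_sock_result_fd)
{
	struct bpf_tcp_sock tp;
	__u32 idx = 0;		/* SRV_IDX in the bpf prog above */

	if (bpf_map_lookup_elem(tcp_sock_result_fd, &idx, &tp))
		return -1;

	printf("snd_cwnd:%u total_retrans:%u bytes_acked:%llu\n",
	       tp.snd_cwnd, tp.total_retrans,
	       (unsigned long long)tp.bytes_acked);
	return 0;
}
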
......@@ -547,7 +547,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.errstr = "cannot write into socket",
.errstr = "cannot write into sock",
.result = REJECT,
},
{
......@@ -562,7 +562,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.errstr = "invalid bpf_sock access off=0 size=8",
.errstr = "invalid sock access off=0 size=8",
.result = REJECT,
},
{
......
This diff is collapsed.
......@@ -365,7 +365,7 @@
},
.result = REJECT,
//.errstr = "same insn cannot be used with different pointers",
.errstr = "cannot write into socket",
.errstr = "cannot write into sock",
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
{
......