Commit 4e285455 authored by Quentin Monnet

sync bpf compat headers with latest net-next, update doc for helpers

- Update links in doc (make them point from net-next to linux, when
  relevant).
- Add helpers bpf_xdp_adjust_tail() and bpf_skb_get_xfrm_state() to
  documentation and headers.
- Synchronise helpers with latest net-next.
parent 3bb44859
@@ -106,7 +106,7 @@ Alphabetical order
Helper | Kernel version | Commit
-------|----------------|-------
-`BPF_FUNC_bind()` | 4.17 | [`d74bad4e74ee`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=d74bad4e74ee373787a9ae24197c17b7cdc428d5)
+`BPF_FUNC_bind()` | 4.17 | [`d74bad4e74ee`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d74bad4e74ee373787a9ae24197c17b7cdc428d5)
`BPF_FUNC_clone_redirect()` | 4.2 | [`3896d655f4d4`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3896d655f4d491c67d669a15f275a39f713410f8)
`BPF_FUNC_csum_diff()` | 4.6 | [`7d672345ed29`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7d672345ed295b1356a5d9f7111da1d1d7d65867)
`BPF_FUNC_csum_update()` | 4.9 | [`36bbef52c7eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=36bbef52c7eb646ed6247055a2acd3851e317857)
@@ -131,10 +131,10 @@ Helper | Kernel version | Commit
`BPF_FUNC_map_delete_elem()` | 3.19 | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
`BPF_FUNC_map_lookup_elem()` | 3.19 | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
`BPF_FUNC_map_update_elem()` | 3.19 | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
-`BPF_FUNC_msg_apply_bytes()` | 4.17 | [`2a100317c9eb`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=2a100317c9ebc204a166f16294884fbf9da074ce)
-`BPF_FUNC_msg_cork_bytes()` | 4.17 | [`91843d540a13`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=91843d540a139eb8070bcff8aa10089164436deb)
-`BPF_FUNC_msg_pull_data()` | 4.17 | [`015632bb30da`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=015632bb30daaaee64e1bcac07570860e0bf3092)
-`BPF_FUNC_msg_redirect_map()` | 4.17 | [`4f738adba30a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=4f738adba30a7cfc006f605707e7aee847ffefa0)
+`BPF_FUNC_msg_apply_bytes()` | 4.17 | [`2a100317c9eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a100317c9ebc204a166f16294884fbf9da074ce)
+`BPF_FUNC_msg_cork_bytes()` | 4.17 | [`91843d540a13`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91843d540a139eb8070bcff8aa10089164436deb)
+`BPF_FUNC_msg_pull_data()` | 4.17 | [`015632bb30da`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=015632bb30daaaee64e1bcac07570860e0bf3092)
+`BPF_FUNC_msg_redirect_map()` | 4.17 | [`4f738adba30a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4f738adba30a7cfc006f605707e7aee847ffefa0)
`BPF_FUNC_perf_event_output()` | 4.4 | [`a43eec304259`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a43eec304259a6c637f4014a6d4767159b6a3aa3)
`BPF_FUNC_perf_event_read()` | 4.3 | [`35578d798400`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=35578d7984003097af2b1e34502bc943d40c1804)
`BPF_FUNC_perf_event_read_value()` | 4.15 | [`908432ca84fc`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=908432ca84fc229e906ba164219e9ad0fe56f755)
@@ -155,6 +155,7 @@ Helper | Kernel version | Commit
`BPF_FUNC_skb_change_type()` | 4.8 | [`d2485c4242a8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d2485c4242a826fdf493fd3a27b8b792965b9b9e)
`BPF_FUNC_skb_get_tunnel_key()` | 4.3 | [`d3aa45ce6b94`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3aa45ce6b94c65b83971257317867db13e5f492)
`BPF_FUNC_skb_get_tunnel_opt()` | 4.6 | [`14ca0751c96f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460)
+`BPF_FUNC_skb_get_xfrm_state()` | 4.18 | [`12bed760a78d`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=12bed760a78da6e12ac8252fec64d019a9eac523)
`BPF_FUNC_skb_load_bytes()` | 4.5 | [`05c74e5e53f6`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=05c74e5e53f6cb07502c3e6a820f33e2777b6605)
`BPF_FUNC_skb_pull_data()` | 4.9 | [`36bbef52c7eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=36bbef52c7eb646ed6247055a2acd3851e317857)
`BPF_FUNC_skb_set_tunnel_key()` | 4.3 | [`d3aa45ce6b94`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3aa45ce6b94c65b83971257317867db13e5f492)
@@ -168,5 +169,6 @@ Helper | Kernel version | Commit
`BPF_FUNC_trace_printk()` | 4.1 | [`9c959c863f82`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9c959c863f8217a2ff3d7c296e8223654d240569)
`BPF_FUNC_xdp_adjust_head()` | 4.10 | [`17bedab27231`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=17bedab2723145d17b14084430743549e6943d03)
`BPF_FUNC_xdp_adjust_meta()` | 4.15 | [`de8f3a83b0a0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=de8f3a83b0a0fddb2cf56e7a718127e9619ea3da)
+`BPF_FUNC_xdp_adjust_tail()` | 4.18 | [`b32cc5b9a346`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=b32cc5b9a346319c171e3ad905e0cddda032b5eb)
`BPF_FUNC_override_return()` | 4.16 | [`9802d86585db`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9802d86585db91655c7d1929a4f6bbe0952ea88e)
`BPF_FUNC_sock_ops_cb_flags_set()` | 4.16 | [`b13d88072172`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b13d880721729384757f235166068c315326f4a1)
@@ -95,6 +95,7 @@ enum bpf_cmd {
	BPF_OBJ_GET_INFO_BY_FD,
	BPF_PROG_QUERY,
	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
};

enum bpf_map_type {
@@ -279,6 +280,9 @@ union bpf_attr {
		 */
		char map_name[BPF_OBJ_NAME_LEN];
		__u32 map_ifindex; /* ifindex of netdev to create on */
+		__u32 btf_fd; /* fd pointing to a BTF type data */
+		__u32 btf_key_id; /* BTF type_id of the key */
+		__u32 btf_value_id; /* BTF type_id of the value */
	};

	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -363,398 +367,1406 @@ union bpf_attr {
		__u64 name;
		__u32 prog_fd;
	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64 btf;
+		__aligned_u64 btf_log_buf;
+		__u32 btf_size;
+		__u32 btf_log_size;
+		__u32 btf_log_level;
+	};
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
* developers about the multiple available eBPF helper functions. It can be
* parsed and used to produce a manual page. The workflow is the following,
* and requires the rst2man utility:
*
* $ ./scripts/bpf_helpers_doc.py \
* --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
* $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
* $ man /tmp/bpf-helpers.7
*
* Note that in order to produce this external documentation, some RST
* formatting is used in the descriptions to get "bold" and "italics" in
* manual pages. Also note that the few trailing white spaces are
* intentional, removing them would break paragraphs for rst2man.
*
* Start of BPF helper function descriptions:
*
* void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
* Description
* Perform a lookup in *map* for an entry associated to *key*.
* Return
* Map value associated to *key*, or **NULL** if no entry was
* found.
*
* int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
* Description
* Add or update the value of the entry associated to *key* in
* *map* with *value*. *flags* is one of:
*
* **BPF_NOEXIST**
* The entry for *key* must not exist in the map.
* **BPF_EXIST**
* The entry for *key* must already exist in the map.
* **BPF_ANY**
* No condition on the existence of the entry for *key*.
*
* Flag value **BPF_NOEXIST** cannot be used for maps of types
* **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all
* elements always exist); the helper would return an error.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_map_delete_elem(struct bpf_map *map, const void *key)
* Description
* Delete entry with *key* from *map*.
* Return
* 0 on success, or a negative error in case of failure.
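*
* A usage sketch for these three map helpers (an illustration, not
* part of the original header; *my_map* is assumed to be an existing
* **BPF_MAP_TYPE_HASH** with **u32** keys and **long** values):
*
* ::
*
* u32 key = 42; // example key
* long init_val = 1;
* long *value = bpf_map_lookup_elem(&my_map, &key);
*
* if (value)
* __sync_fetch_and_add(value, 1);
* else
* bpf_map_update_elem(&my_map, &key, &init_val, BPF_NOEXIST);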
*
* int bpf_probe_read(void *dst, u32 size, const void *src)
* Description
* For tracing programs, safely attempt to read *size* bytes from
* address *src* and store the data in *dst*.
* Return
* 0 on success, or a negative error in case of failure.
*
* u64 bpf_ktime_get_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Return
* Current *ktime*.
*
* int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
* Description
* This helper is a "printk()-like" facility for debugging. It
* prints a message defined by format *fmt* (of size *fmt_size*)
* to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
* available. It can take up to three additional **u64**
* arguments (as for all eBPF helpers, the total number of
* arguments is limited to five).
*
* Each time the helper is called, it appends a line to the trace.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
* *README* file under the same directory). However, it usually
* defaults to something like:
*
* ::
*
* telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
*
* In the above:
*
* * ``telnet`` is the name of the current task.
* * ``470`` is the PID of the current task.
* * ``001`` is the CPU number on which the task is
* running.
* * In ``.N..``, each character refers to a set of
* options (whether irqs are enabled, scheduling
* options, whether hard/softirqs are running, level of
* preempt_disabled respectively). **N** means that
* **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
* are set.
* * ``419421.045894`` is a timestamp.
* * ``0x00000001`` is a fake value used by BPF for the
* instruction pointer register.
* * ``<formatted msg>`` is the message formatted with
* *fmt*.
*
* The conversion specifiers supported by *fmt* are similar, but
* more limited than for printk(). They are **%d**, **%i**,
* **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
* **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
* of field, padding with zeroes, etc.) is available, and the
* helper will return **-EINVAL** (but print nothing) if it
* encounters an unknown specifier.
*
* Also, note that **bpf_trace_printk**\ () is slow, and should
* only be used for debugging purposes. For this reason, a notice
* block (spanning several lines) is printed to kernel logs and
* states that the helper should not be used "for production use"
* the first time this helper is used (or more precisely, when
* **trace_printk**\ () buffers are allocated). For passing values
* to user space, perf events should be preferred.
* Return
* The number of bytes written to the buffer, or a negative error
* in case of failure.
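*
* A typical debugging call could look as follows (an illustration,
* not part of the original header, shown for a program holding a
* *skb* context); note that, as in the kernel samples, the format
* string is built on the stack:
*
* ::
*
* char fmt[] = "skb length: %u\n";
*
* bpf_trace_printk(fmt, sizeof(fmt), skb->len);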
*
* u32 bpf_get_prandom_u32(void)
* Description
* Get a pseudo-random number.
*
* From a security point of view, this helper uses its own
* pseudo-random internal state, and cannot be used to infer the
* seed of other random functions in the kernel. However, it is
* essential to note that the generator used by the helper is not
* cryptographically secure.
* Return
* A random 32-bit unsigned value.
*
* u32 bpf_get_smp_processor_id(void)
* Description
* Get the SMP (symmetric multiprocessing) processor id. Note that
* all programs run with preemption disabled, which means that the
* SMP processor id is stable during all the execution of the
* program.
* Return
* The SMP id of the processor running the program.
*
* int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
* Description
* Store *len* bytes from address *from* into the packet
* associated to *skb*, at *offset*. *flags* are a combination of
* **BPF_F_RECOMPUTE_CSUM** (automatically recompute the
* checksum for the packet after storing the bytes) and
* **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
* **->swhash** and *skb*\ **->l4hash** to 0).
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
* Description
* Recompute the layer 3 (e.g. IP) checksum for the packet
* associated to *skb*. Computation is incremental, so the helper
* must know the former value of the header field that was
* modified (*from*), the new value of this field (*to*), and the
* number of bytes (2 or 4) for this field, stored in *size*.
* Alternatively, it is possible to store the difference between
* the previous and the new values of the header field in *to*, by
* setting *from* and *size* to 0. For both methods, *offset*
* indicates the location of the IP checksum within the packet.
*
* This helper works in combination with **bpf_csum_diff**\ (),
* which does not update the checksum in-place, but offers more
* flexibility and can handle sizes larger than 2 or 4 for the
* checksum to update.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
* Description
* Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
* packet associated to *skb*. Computation is incremental, so the
* helper must know the former value of the header field that was
* modified (*from*), the new value of this field (*to*), and the
* number of bytes (2 or 4) for this field, stored on the lowest
* four bits of *flags*. Alternatively, it is possible to store
* the difference between the previous and the new values of the
* header field in *to*, by setting *from* and the four lowest
* bits of *flags* to 0. For both methods, *offset* indicates the
* location of the IP checksum within the packet. In addition to
* the size of the field, actual flags can be added to *flags*
* (with a bitwise OR). With **BPF_F_MARK_MANGLED_0**, a null checksum is left
* untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
* for updates resulting in a null checksum the value is set to
* **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
* the checksum is to be computed against a pseudo-header.
*
* This helper works in combination with **bpf_csum_diff**\ (),
* which does not update the checksum in-place, but offers more
* flexibility and can handle sizes larger than 2 or 4 for the
* checksum to update.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
* Description
* This special helper is used to trigger a "tail call", or in
* other words, to jump into another eBPF program. The same stack
* frame is used (but values on stack and in registers for the
* caller are not accessible to the callee). This mechanism allows
* for program chaining, either for raising the maximum number of
* available eBPF instructions, or to execute given programs in
* conditional blocks. For security reasons, there is an upper
* limit to the number of successive tail calls that can be
* performed.
*
* Upon call of this helper, the program attempts to jump into a
* program referenced at index *index* in *prog_array_map*, a
* special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
* *ctx*, a pointer to the context.
*
* If the call succeeds, the kernel immediately runs the first
* instruction of the new program. This is not a function call,
* and it never returns to the previous program. If the call
* fails, then the helper has no effect, and the caller continues
* to run its subsequent instructions. A call can fail if the
* destination program for the jump does not exist (i.e. *index*
* is greater than or equal to the number of entries in *prog_array_map*), or
* if the maximum number of tail calls has been reached for this
* chain of programs. This limit is defined in the kernel by the
* macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
* which is currently set to 32.
* Return
* 0 on success, or a negative error in case of failure.
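*
* A common usage pattern, here in a TC classifier (an illustration,
* not part of the original header; *prog_array* is assumed to be a
* **BPF_MAP_TYPE_PROG_ARRAY** map populated from user space, and
* *NEXT_PROG_INDEX* a placeholder index):
*
* ::
*
* bpf_tail_call(ctx, &prog_array, NEXT_PROG_INDEX);
*
* // Reached only if the tail call failed, e.g. no
* // program loaded at NEXT_PROG_INDEX: fall back.
* return TC_ACT_OK;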
*
* int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
* Description
* Clone and redirect the packet associated to *skb* to another
* net device of index *ifindex*. Both ingress and egress
* interfaces can be used for redirection. The **BPF_F_INGRESS**
* value in *flags* is used to make the distinction (ingress path
* is selected if the flag is present, egress path otherwise).
* This is the only flag supported for now.
*
* In comparison with **bpf_redirect**\ () helper,
* **bpf_clone_redirect**\ () has the associated cost of
* duplicating the packet buffer, but this can be executed out of
* the eBPF program. Conversely, **bpf_redirect**\ () is more
* efficient, but it is handled through an action code where the
* redirection happens only after the eBPF program has returned.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* u64 bpf_get_current_pid_tgid(void)
* Return
* A 64-bit integer containing the current tgid and pid, and
* created as such:
* *current_task*\ **->tgid << 32 \|**
* *current_task*\ **->pid**.
*
* u64 bpf_get_current_uid_gid(void)
* Return
* A 64-bit integer containing the current GID and UID, and
* created as such: *current_gid* **<< 32 \|** *current_uid*.
*
* int bpf_get_current_comm(char *buf, u32 size_of_buf)
* Description
* Copy the **comm** attribute of the current task into *buf* of
* *size_of_buf*. The **comm** attribute contains the name of
* the executable (excluding the path) for the current task. The
* *size_of_buf* must be strictly positive. On success, the
* helper makes sure that the *buf* is NUL-terminated. On failure,
* it is filled with zeroes.
* Return
* 0 on success, or a negative error in case of failure.
*
* u32 bpf_get_cgroup_classid(struct sk_buff *skb)
* Description
* Retrieve the classid for the current task, i.e. for the net_cls
* cgroup to which *skb* belongs.
*
* This helper can be used on TC egress path, but not on ingress.
*
* The net_cls cgroup provides an interface to tag network packets
* based on a user-provided identifier for all traffic coming from
* the tasks belonging to the related cgroup. See also the related
* kernel documentation, available from the Linux sources in file
* *Documentation/cgroup-v1/net_cls.txt*.
*
* The Linux kernel has two versions for cgroups: there are
* cgroups v1 and cgroups v2. Both are available to users, who can
* use a mixture of them, but note that the net_cls cgroup is for
* cgroup v1 only. This makes it incompatible with BPF programs
* run on cgroups, which is a cgroup-v2-only feature (a socket can
* only hold data for one version of cgroups at a time).
*
* This helper is only available if the kernel was compiled with
* the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
* "**y**" or to "**m**".
* Return
* The classid, or 0 for the default unconfigured classid.
*
* int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
* Description
* Push a *vlan_tci* (VLAN tag control information) of protocol
* *vlan_proto* to the packet associated to *skb*, then update
* the checksum. Note that if *vlan_proto* is different from
* **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
* be **ETH_P_8021Q**.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_vlan_pop(struct sk_buff *skb)
* Description
* Pop a VLAN header from the packet associated to *skb*.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
* Description
* Get tunnel metadata. This helper takes a pointer *key* to an
* empty **struct bpf_tunnel_key** of **size**, that will be
* filled with tunnel metadata for the packet associated to *skb*.
* The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
* indicates that the tunnel is based on IPv6 protocol instead of
* IPv4.
*
* The **struct bpf_tunnel_key** is an object that generalizes the
* principal parameters used by various tunneling protocols into a
* single struct. This way, it can be used to easily make a
* decision based on the contents of the encapsulation header,
* "summarized" in this struct. In particular, it holds the IP
* address of the remote end (IPv4 or IPv6, depending on the case)
* in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
* this struct exposes the *key*\ **->tunnel_id**, which is
* generally mapped to a VNI (Virtual Network Identifier), making
* it programmable together with the **bpf_skb_set_tunnel_key**\
* () helper.
*
* Let's imagine that the following code is part of a program
* attached to the TC ingress interface, on one end of a GRE
* tunnel, and is supposed to filter out all messages coming from
* remote ends with IPv4 address other than 10.0.0.1:
*
* ::
*
* int ret;
* struct bpf_tunnel_key key = {};
*
* ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
* if (ret < 0)
* return TC_ACT_SHOT; // drop packet
*
* if (key.remote_ipv4 != 0x0a000001)
* return TC_ACT_SHOT; // drop packet
*
* return TC_ACT_OK; // accept packet
*
* This interface can also be used with all encapsulation devices
* that can operate in "collect metadata" mode: instead of having
* one network device per specific configuration, the "collect
* metadata" mode only requires a single device where the
* configuration can be extracted from this helper.
*
* This can be used together with various tunnels such as VXLan,
* Geneve, GRE or IP in IP (IPIP).
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
* Description
* Populate tunnel metadata for packet associated to *skb*. The
* tunnel metadata is set to the contents of *key*, of *size*. The
* *flags* can be set to a combination of the following values:
*
* **BPF_F_TUNINFO_IPV6**
* Indicate that the tunnel is based on IPv6 protocol
* instead of IPv4.
* **BPF_F_ZERO_CSUM_TX**
* For IPv4 packets, add a flag to tunnel metadata
* indicating that checksum computation should be skipped
* and checksum set to zeroes.
* **BPF_F_DONT_FRAGMENT**
* Add a flag to tunnel metadata indicating that the
* packet should not be fragmented.
* **BPF_F_SEQ_NUMBER**
* Add a flag to tunnel metadata indicating that a
* sequence number should be added to tunnel header before
* sending the packet. This flag was added for GRE
* encapsulation, but might be used with other protocols
* as well in the future.
*
* Here is a typical usage on the transmit path:
*
* ::
*
* struct bpf_tunnel_key key;
* populate key ...
* bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
* bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
*
* See also the description of the **bpf_skb_get_tunnel_key**\ ()
* helper for additional information.
* Return
* 0 on success, or a negative error in case of failure.
*
* u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
* Description
* Read the value of a perf event counter. This helper relies on a
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
* the perf event counter is selected when *map* is updated with
* perf event file descriptors. The *map* is an array whose size
* is the number of available CPUs, and each cell contains a value
* relative to one CPU. The value to retrieve is indicated by
* *flags*, that contains the index of the CPU to look up, masked
* with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
* **BPF_F_CURRENT_CPU** to indicate that the value for the
* current CPU should be retrieved.
*
* Note that before Linux 4.13, only hardware perf events can be
* retrieved.
*
* Also, be aware that the newer helper
* **bpf_perf_event_read_value**\ () is recommended over
* **bpf_perf_event_read**\ () in general. The latter has some ABI
* quirks where error and counter value are used as a return code
* (which is wrong to do since ranges may overlap). This issue is
* fixed with **bpf_perf_event_read_value**\ (), which at the same time
* provides more features over the **bpf_perf_event_read**\ ()
* interface. Please refer to the description of
* **bpf_perf_event_read_value**\ () for details.
* Return
* The value of the perf event counter read from the map, or a
* negative error code in case of failure.
*
* int bpf_redirect(u32 ifindex, u64 flags)
* Description
* Redirect the packet to another net device of index *ifindex*.
* This helper is somewhat similar to **bpf_clone_redirect**\
* (), except that the packet is not cloned, which provides
* increased performance.
*
* Except for XDP, both ingress and egress interfaces can be used
* for redirection. The **BPF_F_INGRESS** value in *flags* is used
* to make the distinction (ingress path is selected if the flag
* is present, egress path otherwise). Currently, XDP only
* supports redirection to the egress interface, and accepts no
* flag at all.
*
* The same effect can be attained with the more generic
* **bpf_redirect_map**\ (), which requires specific maps to be
* used but offers better performance.
* Return
* For XDP, the helper returns **XDP_REDIRECT** on success or
* **XDP_ABORTED** on error. For other program types, the values
* are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
* error.
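*
* A minimal TC classifier built on this helper (an illustration,
* not part of the original header; *TARGET_IFINDEX* is a
* placeholder for the index of the target device):
*
* ::
*
* SEC("classifier")
* int redirect_all(struct __sk_buff *skb)
* {
* return bpf_redirect(TARGET_IFINDEX, 0); // egress path
* }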
*
* u32 bpf_get_route_realm(struct sk_buff *skb)
* Description
* Retrieve the realm of the route, that is to say the
* **tclassid** field of the destination for the *skb*. The
* identifier retrieved is a user-provided tag, similar to the
* one used with the net_cls cgroup (see description for
* **bpf_get_cgroup_classid**\ () helper), but here this tag is
* held by a route (a destination entry), not by a task.
*
* Retrieving this identifier works with the clsact TC egress hook
* (see also **tc-bpf(8)**), or alternatively on conventional
* classful egress qdiscs, but not on TC ingress path. In case of
* clsact TC egress hook, this has the advantage that, internally,
* the destination entry has not been dropped yet in the transmit
* path. Therefore, the destination entry does not need to be
* artificially held via **netif_keep_dst**\ () for a classful
* qdisc until the *skb* is freed.
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_IP_ROUTE_CLASSID** configuration option.
* Return
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
* int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
* event must have the following attributes: **PERF_SAMPLE_RAW**
* as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
* **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
*
* The *flags* are used to indicate the index in *map* for which
* the value must be put, masked with **BPF_F_INDEX_MASK**.
* Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
* to indicate that the index of the current CPU core should be
* used.
*
* The value to write, of *size*, is passed through eBPF stack and
* pointed by *data*.
*
* The context of the program, *ctx*, also needs to be passed to the
* helper.
*
* On user space, a program willing to read the values needs to
* call **perf_event_open**\ () on the perf event (either for
* one or for all CPUs) and to store the file descriptor into the
* *map*. This must be done before the eBPF program can send data
* into it. An example is available in file
* *samples/bpf/trace_output_user.c* in the Linux kernel source
* tree (the eBPF program counterpart is in
* *samples/bpf/trace_output_kern.c*).
*
* **bpf_perf_event_output**\ () achieves better performance
* than **bpf_trace_printk**\ () for sharing data with user
* space, and is much better suited for streaming data from eBPF
* programs.
*
* Note that this helper is not restricted to tracing use cases
* and can be used with programs attached to TC or XDP as well,
* where it allows for passing data to user space listeners. Data
* can be:
*
* * Only custom structs,
* * Only the packet payload, or
* * A combination of both.
* Return
* 0 on success, or a negative error in case of failure.
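*
* A sketch of the eBPF side (an illustration, not part of the
* original header; *events* is assumed to be a
* **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map created from user space):
*
* ::
*
* struct event {
* u32 pid;
* u64 ts;
* } ev;
*
* ev.pid = bpf_get_current_pid_tgid() >> 32;
* ev.ts = bpf_ktime_get_ns();
*
* bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
* &ev, sizeof(ev));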
*
* int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
* Description
* This helper was provided as an easy way to load data from a
* packet. It can be used to load *len* bytes from *offset* from
* the packet associated to *skb*, into the buffer pointed by
* *to*.
*
* Since Linux 4.7, usage of this helper has mostly been replaced
* by "direct packet access", enabling packet data to be
* manipulated with *skb*\ **->data** and *skb*\ **->data_end**
* pointing respectively to the first byte of packet data and to
* the byte after the last byte of packet data. However, it
* remains useful if one wishes to read large quantities of data
* at once from a packet into the eBPF stack.
* Return
* 0 on success, or a negative error in case of failure.
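*
* For example, copying a full Ethernet header onto the stack in a
* TC program (an illustration, not part of the original header):
*
* ::
*
* struct ethhdr eth;
*
* if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0)
* return TC_ACT_SHOT;
*
* // eth.h_proto can now be inspected safely, even if
* // the header sits in a non-linear part of the skb.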
*
* int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
* on which the tracing program is executed, and a pointer to a
* *map* of type **BPF_MAP_TYPE_STACK_TRACE**.
*
* The last argument, *flags*, holds the number of stack frames to
* skip (from 0 to 255), masked with
* **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
* a combination of the following flags:
*
* **BPF_F_USER_STACK**
* Collect a user space stack instead of a kernel stack.
* **BPF_F_FAST_STACK_CMP**
* Compare stacks by hash only.
* **BPF_F_REUSE_STACKID**
* If two different stacks hash into the same *stackid*,
* discard the old one.
*
* The stack id retrieved is a 32 bit long integer handle which
* can be further combined with other data (including other stack
* ids) and used as a key into maps. This can be useful for
* generating a variety of graphs (such as flame graphs or off-cpu
* graphs).
*
* For walking a stack, this helper is an improvement over
* **bpf_probe_read**\ (), which can be used with unrolled loops
* but is not efficient and consumes a lot of eBPF instructions.
* Instead, **bpf_get_stackid**\ () can collect up to
* **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
* this limit can be controlled with the **sysctl** program, and
* that it should be manually increased in order to profile long
* user stacks (such as stacks for Java programs). To do so, use:
*
* ::
*
* # sysctl kernel.perf_event_max_stack=<new value>
*
* Return
* The positive or null stack id on success, or a negative error
* in case of failure.
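*
* A sketch for a sampling profiler (an illustration, not part of
* the original header; *stack_traces* is assumed to be a
* **BPF_MAP_TYPE_STACK_TRACE** map and *counts* a
* **BPF_MAP_TYPE_HASH** map with **int** keys and **u64** values):
*
* ::
*
* u64 one = 1, *count;
* int stackid = bpf_get_stackid(ctx, &stack_traces,
* BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP);
*
* if (stackid < 0)
* return 0;
*
* count = bpf_map_lookup_elem(&counts, &stackid);
* if (count)
* __sync_fetch_and_add(count, 1);
* else
* bpf_map_update_elem(&counts, &stackid, &one, BPF_NOEXIST);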
*
* s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
* Description
* Compute a checksum difference, from the raw buffer pointed by
* *from*, of length *from_size* (that must be a multiple of 4),
* towards the raw buffer pointed by *to*, of size *to_size*
* (same remark). An optional *seed* can be added to the value
* (this can be cascaded, the seed may come from a previous call
* to the helper).
*
* This is flexible enough to be used in several ways:
*
* * With *from_size* == 0, *to_size* > 0 and *seed* set to
* checksum, it can be used when pushing new data.
* * With *from_size* > 0, *to_size* == 0 and *seed* set to
* checksum, it can be used when removing data from a packet.
* * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
* can be used to compute a diff. Note that *from_size* and
* *to_size* do not need to be equal.
*
* This helper can be used in combination with
* **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
* which one can feed in the difference computed with
* **bpf_csum_diff**\ ().
* Return
* The checksum result, or a negative error code in case of
* failure.
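*
* For instance, feeding the result to **bpf_l4_csum_replace**\ ()
* after rewriting a 4-byte address field (an illustration, not part
* of the original header; *old_addr*, *new_addr* and *csum_off* are
* placeholders filled in by program logic):
*
* ::
*
* __be32 old_addr, new_addr; // former and new field values
* s64 diff;
*
* diff = bpf_csum_diff(&old_addr, 4, &new_addr, 4, 0);
* if (diff >= 0)
* bpf_l4_csum_replace(skb, csum_off, 0, diff,
* BPF_F_PSEUDO_HDR); // size 0: *to* is a diff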
*
* int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
* Description
* Retrieve tunnel options metadata for the packet associated to
* *skb*, and store the raw tunnel option data to the buffer *opt*
* of *size*.
*
* This helper can be used with encapsulation devices that can
* operate in "collect metadata" mode (please refer to the related
* note in the description of **bpf_skb_get_tunnel_key**\ () for
* more details). A particular example where this can be used is
* in combination with the Geneve encapsulation protocol, where it
* allows for pushing (with the **bpf_skb_set_tunnel_opt**\ () helper)
* and retrieving arbitrary TLVs (Type-Length-Value headers) from
* the eBPF program. This allows for full customization of these
* headers.
* Return
* The size of the option data retrieved.
*
* int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
* Description
* Set tunnel options metadata for the packet associated to *skb*
* to the option data contained in the raw buffer *opt* of *size*.
*
* See also the description of the **bpf_skb_get_tunnel_opt**\ ()
* helper for additional information.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
* Description
* Change the protocol of the *skb* to *proto*. Currently
* supported are transitions from IPv4 to IPv6, and from IPv6 to
* IPv4. The helper takes care of the groundwork for the
* transition, including resizing the socket buffer. The eBPF
* program is expected to fill the new headers, if any, via
* **bpf_skb_store_bytes**\ () and to recompute the checksums with
* **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
* (). The main case for this helper is to perform NAT64
* operations out of an eBPF program.
*
* Internally, the GSO type is marked as dodgy so that headers are
* checked and segments are recalculated by the GSO/GRO engine.
* The size for GSO target is adapted as well.
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_change_type(struct sk_buff *skb, u32 type)
* Description
* Change the packet type for the packet associated to *skb*. This
* comes down to setting *skb*\ **->pkt_type** to *type*, except
* the eBPF program does not have write access to *skb*\
* **->pkt_type** beside this helper. Using a helper here allows
* for graceful handling of errors.
*
* The major use case is to change incoming *skb*s to
* **PACKET_HOST** in a programmatic way instead of having to
* recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
* example.
*
* Note that *type* only allows certain values. At this time, they
* are:
*
* **PACKET_HOST**
* Packet is for us.
* **PACKET_BROADCAST**
* Send packet to all.
* **PACKET_MULTICAST**
* Send packet to group.
* **PACKET_OTHERHOST**
* Send packet to someone else.
* Return
* 0 on success, or a negative error in case of failure.
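*
* For instance (an illustration, not part of the original header),
* re-classifying a mirrored packet as local traffic:
*
* ::
*
* if (skb->pkt_type == PACKET_OTHERHOST)
* bpf_skb_change_type(skb, PACKET_HOST);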
*
* int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
* Description
* Check whether *skb* is a descendant of the cgroup2 held by
* *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
* Return
* The return value depends on the result of the test, and can be:
*
* * 0, if the *skb* failed the cgroup2 descendant test.
* * 1, if the *skb* succeeded the cgroup2 descendant test.
* * A negative error code, if an error occurred.
*
* u32 bpf_get_hash_recalc(struct sk_buff *skb)
* Description
* Retrieve the hash of the packet, *skb*\ **->hash**. If it is
* not set, in particular if the hash was cleared due to mangling,
* recompute this hash. Later accesses to the hash can be done
* directly with *skb*\ **->hash**.
*
* Calling **bpf_set_hash_invalid**\ (), changing a packet
* protocol with **bpf_skb_change_proto**\ (), or calling
* **bpf_skb_store_bytes**\ () with the
* **BPF_F_INVALIDATE_HASH** flag are actions susceptible to clear
* the hash and to trigger a new computation for the next call to
* **bpf_get_hash_recalc**\ ().
* Return
* The 32-bit hash.
*
* u64 bpf_get_current_task(void)
* Return
* A pointer to the current task struct.
*
* int bpf_probe_write_user(void *dst, const void *src, u32 len)
* Description
* Attempt in a safe way to write *len* bytes from the buffer
* *src* to *dst* in memory. It only works for threads that are in
* user context, and *dst* must be a valid user space address.
*
* This helper should not be used to implement any kind of
* security mechanism because of TOC-TOU attacks, but rather to
* debug, divert, and manipulate execution of semi-cooperative
* processes.
*
* Keep in mind that this feature is meant for experiments, and it
* has a risk of crashing the system and running programs.
* Therefore, when an eBPF program using this helper is attached,
* a warning including PID and process name is printed to kernel
* logs.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
* Description
* Check whether the probe is being run in the context of a given
* subset of the cgroup2 hierarchy. The cgroup2 to test is held by
* *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
* Return
* The return value depends on the result of the test, and can be:
*
* * 0, if the *skb* task belongs to the cgroup2.
* * 1, if the *skb* task does not belong to the cgroup2.
* * A negative error code, if an error occurred.
*
* int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
* Description
* Resize (trim or grow) the packet associated to *skb* to the
* new *len*. The *flags* are reserved for future usage, and must
* be left at zero.
*
* The basic idea is that the helper performs the needed work to
* change the size of the packet, then the eBPF program rewrites
* the rest via helpers like **bpf_skb_store_bytes**\ (),
* **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
* and others. This helper is a slow path utility intended for
* replies with control messages. And because it is targeted for
* slow path, the helper itself can afford to be slow: it
* implicitly linearizes, unclones and drops offloads from the
* *skb*.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
* Description
* Pull in non-linear data in case the *skb* is non-linear and not
* all of *len* are part of the linear section. Make *len* bytes
* from *skb* readable and writable. If a zero value is passed for
* *len*, then the whole length of the *skb* is pulled.
*
* This helper is only needed for reading and writing with direct
* packet access.
*
* For direct packet access, testing that offsets to access
* are within packet boundaries (test on *skb*\ **->data_end**) is
* susceptible to fail if offsets are invalid, or if the requested
* data is in non-linear parts of the *skb*. On failure the
* program can just bail out, or in the case of a non-linear
* buffer, use a helper to make the data available. The
* **bpf_skb_load_bytes**\ () helper is a first solution to access
* the data. Another one consists in using **bpf_skb_pull_data**\ ()
* to pull in once the non-linear parts, then retesting and
* eventually access the data.
*
* At the same time, this also makes sure the *skb* is uncloned,
* which is a necessary condition for direct write. As this needs
* to be an invariant for the write part only, the verifier
* detects writes and adds a prologue that is calling
* **bpf_skb_pull_data()** to effectively unclone the *skb* from
* the very beginning in case it is indeed cloned.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
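*
* The usual pattern is to retry the bounds check after pulling (an
* illustration, not part of the original header):
*
* ::
*
* void *data = (void *)(long)skb->data;
* void *data_end = (void *)(long)skb->data_end;
*
* if (data + ETH_HLEN > data_end) {
* if (bpf_skb_pull_data(skb, ETH_HLEN))
* return TC_ACT_SHOT;
*
* // Reload the pointers: the underlying buffer
* // may have changed.
* data = (void *)(long)skb->data;
* data_end = (void *)(long)skb->data_end;
* if (data + ETH_HLEN > data_end)
* return TC_ACT_SHOT;
* }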
*
* s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
* Description
* Add the checksum *csum* into *skb*\ **->csum** in case the
* driver has supplied a checksum for the entire packet into that
* field. Return an error otherwise. This helper is intended to be
* used in combination with **bpf_csum_diff**\ (), in particular
* when the checksum needs to be updated after data has been
* written into the packet through direct packet access.
* Return
* The checksum on success, or a negative error code in case of
* failure.
*
* void bpf_set_hash_invalid(struct sk_buff *skb)
* Description
* Invalidate the current *skb*\ **->hash**. It can be used after
* mangling on headers through direct packet access, in order to
* indicate that the hash is outdated and to trigger a
* recalculation the next time the kernel tries to access this
* hash or when the **bpf_get_hash_recalc**\ () helper is called.
*
* int bpf_get_numa_node_id(void)
* Description
* Return the id of the current NUMA node. The primary use case
* for this helper is the selection of sockets for the local NUMA
* node, when the program is attached to sockets using the
* **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
* but the helper is also available to other eBPF program types,
* similarly to **bpf_get_smp_processor_id**\ ().
* Return
* The id of current NUMA node.
*
* int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
* Description
* Grows headroom of packet associated to *skb* and adjusts the
* offset of the MAC header accordingly, adding *len* bytes of
* space. It automatically extends and reallocates memory as
* required.
*
* This helper can be used on a layer 3 *skb* to push a MAC header
* for redirection into a layer 2 device.
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
* it is possible to use a negative value for *delta*. This helper
* can be used to prepare the packet for pushing or popping
* headers.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
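*
* For instance, popping an assumed 4-byte custom tag from the
* front of the frame could look like this sketch:
*
* ::
*
* void *data, *data_end;
*
* if (bpf_xdp_adjust_head(xdp_md, 4)) // move data forward
* return XDP_DROP;
* // all pointers were invalidated, reload and re-check
* data = (void *)(long)xdp_md->data;
* data_end = (void *)(long)xdp_md->data_end;
* if (data + sizeof(struct ethhdr) > data_end)
* return XDP_DROP;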
*
* int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe address
* *unsafe_ptr* to *dst*. The *size* should include the
* terminating NUL byte. In case the string length is smaller than
* *size*, the target is not padded with further NUL bytes. If the
* string length is larger than *size*, just *size*-1 bytes are
* copied and the last byte is set to NUL.
*
* On success, the length of the copied string is returned. This
* makes this helper useful in tracing programs for reading
* strings, and more importantly to get its length at runtime. See
* the following snippet:
*
* ::
*
* SEC("kprobe/sys_open")
* void bpf_sys_open(struct pt_regs *ctx)
* {
* char buf[PATHLEN]; // PATHLEN is defined to 256
* int res = bpf_probe_read_str(buf, sizeof(buf),
* ctx->di);
*
* // Consume buf, for example push it to
* // userspace via bpf_perf_event_output(); we
* // can use res (the string length) as event
* // size, after checking its boundaries.
* }
*
* In comparison, using the **bpf_probe_read**\ () helper here to
* read the string would require estimating the length at compile
* time, and would often result in copying more memory than
* necessary.
*
* Another useful use case is when parsing individual process
* arguments or individual environment variables navigating
* *current*\ **->mm->arg_start** and *current*\
* **->mm->env_start**: using this helper and the return value,
* one can quickly iterate at the right offset of the memory area.
* Return
* On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative
* value.
*
* u64 bpf_get_socket_cookie(struct sk_buff *skb)
* Description
* If the **struct sk_buff** pointed by *skb* has a known socket,
* retrieve the cookie (generated by the kernel) of this socket.
* If no cookie has been set yet, generate a new cookie. Once
* generated, the socket cookie remains stable for the life of the
* socket. This helper can be useful for monitoring per socket
* networking traffic statistics as it provides a unique socket
* identifier per namespace.
* Return
* An 8-byte long non-decreasing number on success, or 0 if the
* socket field is missing inside *skb*.
*
* u32 bpf_get_socket_uid(struct sk_buff *skb)
* Return
* The owner UID of the socket associated to *skb*. If the socket
* is **NULL**, or if it is not a full socket (i.e. if it is a
* time-wait or a request socket instead), **overflowuid** value
* is returned (note that **overflowuid** might also be the actual
* UID value for the socket).
*
* u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
* Description
* Set the full hash for *skb* (set the field *skb*\ **->hash**)
* to value *hash*.
* Return
* 0
*
* int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
* Description
* Emulate a call to **setsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
* which the option resides and the name *optname* of the option
* must be specified, see **setsockopt(2)** for more information.
* The option value of length *optlen* is pointed by *optval*.
*
* This helper actually implements a subset of **setsockopt()**.
* It supports the following *level*\ s:
*
* * **SOL_SOCKET**, which supports the following *optname*\ s:
* **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
* **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
* * **IPPROTO_TCP**, which supports the following *optname*\ s:
* **TCP_CONGESTION**, **TCP_BPF_IW**,
* **TCP_BPF_SNDCWND_CLAMP**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
* Return
* 0 on success, or a negative error in case of failure.
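*
* For example, a **BPF_PROG_TYPE_SOCK_OPS** program could switch
* a connection to an assumed available congestion control
* algorithm with a sketch like:
*
* ::
*
* char cc[] = "bbr"; // assumed compiled in the kernel
*
* bpf_setsockopt(skops, IPPROTO_TCP, TCP_CONGESTION,
* cc, sizeof(cc));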
*
* int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
* Description
* Grow or shrink the room for data in the packet associated to
* *skb* by *len_diff*, and according to the selected *mode*.
*
* There is a single supported mode at this time:
*
* * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
* Description
* Redirect the packet to the endpoint referenced by *map* at
* index *key*. Depending on its type, this *map* can contain
* references to net devices (for forwarding packets through other
* ports), or to CPUs (for redirecting XDP frames to another CPU;
* but this is only implemented for native XDP (with driver
* support) as of this writing).
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* When used to redirect packets to net devices, this helper
* provides a high performance increase over **bpf_redirect**\ ().
* This is due to various implementation details of the underlying
* mechanisms, one of which is the fact that **bpf_redirect_map**\
* () tries to send packets as a "bulk" to the device.
* Return
* **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
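*
* A minimal XDP sketch, assuming a hypothetical map *tx_ports* of
* type **BPF_MAP_TYPE_DEVMAP** populated from user space:
*
* ::
*
* SEC("xdp")
* int xdp_redirect_prog(struct xdp_md *ctx)
* {
* // send everything through the device at index 0
* return bpf_redirect_map(&tx_ports, 0, 0);
* }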
*
* int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
* Description
* Redirect the packet to the socket referenced by *map* (of type
* **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
* egress interfaces can be used for redirection. The
* **BPF_F_INGRESS** value in *flags* is used to make the
* distinction (ingress path is selected if the flag is present,
* egress path otherwise). This is the only flag supported for now.
* Return
* **SK_PASS** on success, or **SK_DROP** on error.
*
* int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
* Description
* Add an entry to, or update a *map* referencing sockets. The
* *skops* is used as a new value for the entry associated to
* *key*. *flags* is one of:
*
* **BPF_NOEXIST**
* The entry for *key* must not exist in the map.
* **BPF_EXIST**
* The entry for *key* must already exist in the map.
* **BPF_ANY**
* No condition on the existence of the entry for *key*.
*
* If the *map* has eBPF programs (parser and verdict), those will
* be inherited by the socket being added. If the socket is
* already attached to eBPF programs, this results in an error.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust the address pointed by *xdp_md*\ **->data_meta** by
* *delta* (which can be positive or negative). Note that this
* operation modifies the address stored in *xdp_md*\ **->data**,
* so the latter must be loaded only after the helper has been
* called.
*
* The use of *xdp_md*\ **->data_meta** is optional and programs
* are not required to use it. The rationale is that when the
* packet is processed with XDP (e.g. as a DoS filter), it is
* possible to push further meta data along with it before passing
* to the stack, and to give the guarantee that an ingress eBPF
* program attached as a TC classifier on the same device can pick
* this up for further post-processing. Since TC works with socket
* buffers, it remains possible to set from XDP the **mark** or
* **priority** pointers, or other pointers for the socket buffer.
* Having this scratch space generic and programmable allows for
* more flexibility as the user is free to store whatever meta
* data they need.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
* Description
* Read the value of a perf event counter, and store it into *buf*
* of size *buf_size*. This helper relies on a *map* of type
* **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
* counter is selected when *map* is updated with perf event file
* descriptors. The *map* is an array whose size is the number of
* available CPUs, and each cell contains a value relative to one
* CPU. The value to retrieve is indicated by *flags*, that
* contains the index of the CPU to look up, masked with
* **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
* **BPF_F_CURRENT_CPU** to indicate that the value for the
* current CPU should be retrieved.
*
* This helper behaves in a way close to
* **bpf_perf_event_read**\ () helper, save that instead of
* just returning the value observed, it fills the *buf*
* structure. This allows for additional data to be retrieved: in
* particular, the enabled and running times (in *buf*\
* **->enabled** and *buf*\ **->running**, respectively) are
* copied. In general, **bpf_perf_event_read_value**\ () is
* recommended over **bpf_perf_event_read**\ (), which has some
* ABI issues and provides fewer functionalities.
*
* These values are interesting, because hardware PMU (Performance
* Monitoring Unit) counters are limited resources. When there are
* more PMU-based perf events opened than available counters, the
* kernel multiplexes these events so that each event gets a
* certain percentage (but not all) of the PMU time. When
* multiplexing happens, the number of samples or the counter
* value will not reflect the totals that would be observed
* without multiplexing, making comparisons between runs difficult.
* Typically, the counter value should be normalized before
* comparing to other experiments. The usual normalization is done
* as follows.
*
* ::
*
* normalized_counter = counter * t_enabled / t_running
*
* Where t_enabled is the time enabled for the event and t_running
* is the time running for the event since the last normalization.
* The enabled and running times are accumulated since the perf
* event was opened. To compute a scaling factor between two
* invocations of an eBPF program, users can use the CPU id as the
* key (which is typical for the perf array usage model) to
* remember the previous value and do the calculation inside the
* eBPF program.
* Return
* 0 on success, or a negative error in case of failure.
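*
* A sketch of this normalization with the values filled by the
* helper (assuming a map *counters* of type
* **BPF_MAP_TYPE_PERF_EVENT_ARRAY**):
*
* ::
*
* struct bpf_perf_event_value val = {};
* u64 normalized = 0;
* int err;
*
* err = bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
* &val, sizeof(val));
* if (!err && val.running)
* normalized = val.counter * val.enabled / val.running;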
*
* int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
* Description
* For an eBPF program attached to a perf event, retrieve the
* value of the event counter associated to *ctx* and store it in
* the structure pointed by *buf* and of size *buf_size*. Enabled
* and running times are also stored in the structure (see
* description of helper **bpf_perf_event_read_value**\ () for
* more details).
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
* Description
* Emulate a call to **getsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
* which the option resides and the name *optname* of the option
* must be specified, see **getsockopt(2)** for more information.
* The retrieved value is stored in the structure pointed by
* *optval* and of length *optlen*.
*
* This helper actually implements a subset of **getsockopt()**.
* It supports the following *level*\ s:
*
* * **IPPROTO_TCP**, which supports *optname*
* **TCP_CONGESTION**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.
* The first argument is the context *regs* on which the kprobe
* works.
*
* This helper works by setting the PC (program counter)
* to an override function which is run in place of the original
* probed function. This means the probed function is not run at
* all. The replacement function just returns with the required
* value.
*
* This helper has security implications, and thus is subject to
* restrictions. It is only available if the kernel was compiled
* with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
* option, and in this case it only works on functions tagged with
* **ALLOW_ERROR_INJECTION** in the kernel code.
*
* Also, the helper is only available for the architectures that
* have the **CONFIG_FUNCTION_ERROR_INJECTION** option. As of this
* writing, the x86 architecture is the only one to support this
* feature.
* Return
* 0
*
* int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
* Description
* Attempt to set the value of the **bpf_sock_ops_cb_flags** field
* for the full TCP socket associated to *bpf_sock* to
* *argval*.
*
* The primary use of this field is to determine if there should
* be calls to eBPF programs of type
* **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
* code. A program of the same type can change its value, per
* connection and as necessary, when the connection is
* established. This field is directly accessible for reading, but
* this helper must be used for updates in order to return an
* error if an eBPF program tries to set a callback that is not
* supported in the current kernel.
*
* The supported callback values that *argval* can combine are:
*
* * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
* * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
* * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
*
* Here are some examples of where one could call such eBPF
* program:
*
* * When RTO fires.
* * When a packet is retransmitted.
* * When the connection terminates.
* * When a packet is sent.
* * When a packet is received.
* Return
* Code **-EINVAL** if the socket is not a full TCP socket;
* otherwise, a positive number containing the bits that could not
* be set is returned (which comes down to 0 if all bits were set
* as required).
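*
* For instance, to request callbacks for retransmissions and TCP
* state changes (sketch; *skops* is the program's context):
*
* ::
*
* // non-zero return bits would mean unsupported callbacks
* bpf_sock_ops_cb_flags_set(skops,
* BPF_SOCK_OPS_RETRANS_CB_FLAG |
* BPF_SOCK_OPS_STATE_CB_FLAG);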
*
* int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
* Description
* This helper is used in programs implementing policies at the
* socket level. If the message *msg* is allowed to pass (i.e. if
* the verdict eBPF program returns **SK_PASS**), redirect it to
* the socket referenced by *map* (of type
* **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
* egress interfaces can be used for redirection. The
* **BPF_F_INGRESS** value in *flags* is used to make the
* distinction (ingress path is selected if the flag is present,
* egress path otherwise). This is the only flag supported for now.
* Return
* **SK_PASS** on success, or **SK_DROP** on error.
*
* int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
* Description
* For socket policies, apply the verdict of the eBPF program to
* the next *bytes* (number of bytes) of message *msg*.
*
* For example, this helper can be used in the following cases:
*
* * A single **sendmsg**\ () or **sendfile**\ () system call
* contains multiple logical messages that the eBPF program is
* supposed to read and for which it should apply a verdict.
* * An eBPF program only cares to read the first *bytes* of a
* *msg*. If the message has a large payload, then setting up
* and calling the eBPF program repeatedly for all bytes, even
* though the verdict is already known, would create unnecessary
* overhead.
*
* When called from within an eBPF program, the helper sets a
* counter internal to the BPF infrastructure, that is used to
* apply the last verdict to the next *bytes*. If *bytes* is
* smaller than the current data being processed from a
* **sendmsg**\ () or **sendfile**\ () system call, the first
* *bytes* will be sent and the eBPF program will be re-run with
* the pointer for start of data pointing to byte number *bytes*
* **+ 1**. If *bytes* is larger than the current data being
* processed, then the eBPF verdict will be applied to multiple
* **sendmsg**\ () or **sendfile**\ () calls until *bytes* are
* consumed.
*
* Note that if a socket closes with the internal counter holding
* a non-zero value, this is not a problem because data is not
* being buffered for *bytes* and is sent as it is received.
* Return
* 0
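*
* A sketch for a verdict program that validates an assumed
* fixed-size record header and lets the rest of the record
* through without being re-run:
*
* ::
*
* // apply this verdict to the next 1024 bytes
* bpf_msg_apply_bytes(msg, 1024);
* return SK_PASS;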
*
* int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
* Description
* For socket policies, prevent the execution of the verdict eBPF
* program for message *msg* until *bytes* (byte number) have been
* accumulated.
*
* This can be used when one needs a specific number of bytes
* before a verdict can be assigned, even if the data spans
* multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
* case would be a user calling **sendmsg**\ () repeatedly with
* 1-byte long message segments. Obviously, this is bad for
* performance, but it is still valid. If the eBPF program needs
* *bytes* bytes to validate a header, this helper can be used to
* prevent the eBPF program from being called again until *bytes*
* have been accumulated.
* Return
* 0
*
* int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
* Description
* For socket policies, pull in non-linear data from user space
* for *msg* and set pointers *msg*\ **->data** and *msg*\
* **->data_end** to *start* and *end* byte offsets into *msg*,
* respectively.
*
* If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
* *msg* it can only parse data that the (**data**, **data_end**)
* pointers have already consumed. For **sendmsg**\ () hooks this
* is likely the first scatterlist element. But for calls relying
* on the **sendpage** handler (e.g. **sendfile**\ ()) this will
* be the range (**0**, **0**) because the data is shared with
* user space and by default the objective is to avoid allowing
* user space to modify data while (or after) eBPF verdict is
* being decided. This helper can be used to pull in data and to
* set the start and end pointer to given values. Data will be
* copied if necessary (i.e. if data was not linear and if start
* and end pointers do not point to the same chunk).
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
* Description
* Bind the socket associated to *ctx* to the address pointed by
* *addr*, of length *addr_len*. This allows for making outgoing
* connection from the desired IP address, which can be useful for
* example when all processes inside a cgroup should use one
* single IP address on a host that has multiple IP addresses configured.
*
* This helper works for IPv4 and IPv6, TCP and UDP sockets. The
* domain (*addr*\ **->sa_family**) must be **AF_INET** (or
* **AF_INET6**). Looking for a free port to bind to can be
* expensive, therefore binding to port is not permitted by the
* helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
* must be set to zero.
* Return
* 0 on success, or a negative error in case of failure.
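*
* A sketch for a program attached to the **connect(2)** path of a
* cgroup, forcing an assumed source address 10.0.0.42 (the port
* is left to zero, as required):
*
* ::
*
* struct sockaddr_in sa = {};
*
* sa.sin_family = AF_INET;
* // bpf_htonl() from bpf_endian.h, 10.0.0.42
* sa.sin_addr.s_addr = bpf_htonl(0x0a00002a);
* if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
* return 0; // reject the connection
* return 1;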
*
* int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
* only possible to shrink the packet as of this writing,
* therefore *delta* must be a negative integer.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
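*
* For instance, trimming a frame to an assumed maximum sample
* size before passing it up could look like this sketch:
*
* ::
*
* int len = xdp_md->data_end - xdp_md->data;
*
* if (len > 256 && bpf_xdp_adjust_tail(xdp_md, 256 - len))
* return XDP_DROP; // negative delta shrinks the frame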
*
* int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
* Description
* Retrieve the XFRM state (IP transform framework, see also
* **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
*
* The retrieved value is stored in the **struct bpf_xfrm_state**
* pointed by *xfrm_state* and of length *size*.
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_XFRM** configuration option.
* Return
* 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
...@@ -821,7 +1833,9 @@ union bpf_attr {
FN(msg_apply_bytes), \
FN(msg_cork_bytes), \
FN(msg_pull_data), \
FN(bind), \
FN(xdp_adjust_tail), \
FN(skb_get_xfrm_state),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
...@@ -927,6 +1941,19 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
};
/* user accessible mirror of in-kernel xfrm_state.
* new fields can only be added to the end of this structure
*/
struct bpf_xfrm_state {
__u32 reqid;
__u32 spi; /* Stored in network byte order */
__u16 family;
union {
__u32 remote_ipv4; /* Stored in network byte order */
__u32 remote_ipv6[4]; /* Stored in network byte order */
};
};
/* Generic BPF return codes which all BPF program types may support.
* The values are binary compatible with their TC_ACT_* counter-part to
* provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
...@@ -1017,6 +2044,7 @@ struct bpf_prog_info {
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
__u32 gpl_compatible:1;
__u64 netns_dev;
__u64 netns_ino;
} __attribute__((aligned(8)));
...
...@@ -96,6 +96,7 @@ enum bpf_cmd {
BPF_OBJ_GET_INFO_BY_FD,
BPF_PROG_QUERY,
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
};
enum bpf_map_type {
...@@ -280,6 +281,9 @@ union bpf_attr {
*/
char map_name[BPF_OBJ_NAME_LEN];
__u32 map_ifindex; /* ifindex of netdev to create on */
__u32 btf_fd; /* fd pointing to a BTF type data */
__u32 btf_key_id; /* BTF type_id of the key */
__u32 btf_value_id; /* BTF type_id of the value */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
...@@ -364,398 +368,1406 @@ union bpf_attr {
__u64 name;
__u32 prog_fd;
} raw_tracepoint;
struct { /* anonymous struct for BPF_BTF_LOAD */
__aligned_u64 btf;
__aligned_u64 btf_log_buf;
__u32 btf_size;
__u32 btf_log_size;
__u32 btf_log_level;
};
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
* developers about the multiple available eBPF helper functions. It can be
* parsed and used to produce a manual page. The workflow is the following,
* and requires the rst2man utility:
*
* $ ./scripts/bpf_helpers_doc.py \
* --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
* $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
* $ man /tmp/bpf-helpers.7
*
* Note that in order to produce this external documentation, some RST
* formatting is used in the descriptions to get "bold" and "italics" in
* manual pages. Also note that the few trailing white spaces are
* intentional, removing them would break paragraphs for rst2man.
*
* Start of BPF helper function descriptions:
*
* void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
* Description
* Perform a lookup in *map* for an entry associated to *key*.
* Return
* Map value associated to *key*, or **NULL** if no entry was
* found.
*
* int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
* Description
* Add or update the value of the entry associated to *key* in
* *map* with *value*. *flags* is one of:
*
* **BPF_NOEXIST**
* The entry for *key* must not exist in the map.
* **BPF_EXIST**
* The entry for *key* must already exist in the map.
* **BPF_ANY**
* No condition on the existence of the entry for *key*.
*
* Flag value **BPF_NOEXIST** cannot be used for maps of types
* **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all
* elements always exist); the helper would return an error.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_map_delete_elem(struct bpf_map *map, const void *key)
* Description
* Delete entry with *key* from *map*.
* Return
* 0 on success, or a negative error in case of failure.
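*
* As an illustration of the lookup and update helpers above, a
* sketch of a per-key packet counter, assuming a hypothetical
* array map *counters* declared elsewhere:
*
* ::
*
* u32 key = 0;
* u64 init = 1, *count;
*
* count = bpf_map_lookup_elem(&counters, &key);
* if (count)
* __sync_fetch_and_add(count, 1);
* else
* bpf_map_update_elem(&counters, &key, &init, BPF_ANY);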
*
* int bpf_probe_read(void *dst, u32 size, const void *src)
* Description
* For tracing programs, safely attempt to read *size* bytes from
* address *src* and store the data in *dst*.
* Return
* 0 on success, or a negative error in case of failure.
*
* u64 bpf_ktime_get_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Return
* Current *ktime*.
*
* int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
* Description
* This helper is a "printk()-like" facility for debugging. It
* prints a message defined by format *fmt* (of size *fmt_size*)
* to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
* available. It can take up to three additional **u64**
* arguments (as for all eBPF helpers, the total number of
* arguments is limited to five).
*
* Each time the helper is called, it appends a line to the trace.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
* *README* file under the same directory). However, it usually
* defaults to something like:
*
* ::
*
* telnet-470 [001] .N.. 419421.045894: 0x00000001: <formatted msg>
*
* In the above:
*
* * ``telnet`` is the name of the current task.
* * ``470`` is the PID of the current task.
* * ``001`` is the CPU number on which the task is
* running.
* * In ``.N..``, each character refers to a set of
* options (whether irqs are enabled, scheduling
* options, whether hard/softirqs are running, level of
* preempt_disabled respectively). **N** means that
* **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
* are set.
* * ``419421.045894`` is a timestamp.
* * ``0x00000001`` is a fake value used by BPF for the
* instruction pointer register.
* * ``<formatted msg>`` is the message formatted with
* *fmt*.
*
* The conversion specifiers supported by *fmt* are similar, but
* more limited than for printk(). They are **%d**, **%i**,
* **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
* **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
* of field, padding with zeroes, etc.) is available, and the
* helper will return **-EINVAL** (but print nothing) if it
* encounters an unknown specifier.
*
* Also, note that **bpf_trace_printk**\ () is slow, and should
* only be used for debugging purposes. For this reason, a notice
* block (spanning several lines) is printed to kernel logs and
* states that the helper should not be used "for production use"
* the first time this helper is used (or more precisely, when
* **trace_printk**\ () buffers are allocated). For passing values
* to user space, perf events should be preferred.
* Return
* The number of bytes written to the buffer, or a negative error
* in case of failure.
*
* u32 bpf_get_prandom_u32(void)
* Description
* Get a pseudo-random number.
*
* From a security point of view, this helper uses its own
* pseudo-random internal state, and cannot be used to infer the
* seed of other random functions in the kernel. However, it is
* essential to note that the generator used by the helper is not
* cryptographically secure.
* Return
* A random 32-bit unsigned value.
*
* u32 bpf_get_smp_processor_id(void)
* Description
* Get the SMP (symmetric multiprocessing) processor id. Note that
* all programs run with preemption disabled, which means that the
* SMP processor id is stable throughout the execution of the
* program.
* Return
* The SMP id of the processor running the program.
*
* int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
* Description
* Store *len* bytes from address *from* into the packet
* associated to *skb*, at *offset*. *flags* are a combination of
* **BPF_F_RECOMPUTE_CSUM** (automatically recompute the
* checksum for the packet after storing the bytes) and
* **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
* **->swhash** and *skb*\ **->l4hash** to 0).
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
* Description
* Recompute the layer 3 (e.g. IP) checksum for the packet
* associated to *skb*. Computation is incremental, so the helper
* must know the former value of the header field that was
* modified (*from*), the new value of this field (*to*), and the
* number of bytes (2 or 4) for this field, stored in *size*.
* Alternatively, it is possible to store the difference between
* the previous and the new values of the header field in *to*, by
* setting *from* and *size* to 0. For both methods, *offset*
* indicates the location of the IP checksum within the packet.
*
* This helper works in combination with **bpf_csum_diff**\ (),
* which does not update the checksum in-place, but offers more
* flexibility and can handle sizes larger than 2 or 4 for the
* checksum to update.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
* Description
* Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
* packet associated to *skb*. Computation is incremental, so the
* helper must know the former value of the header field that was
* modified (*from*), the new value of this field (*to*), and the
* number of bytes (2 or 4) for this field, stored in the lowest
* four bits of *flags*. Alternatively, it is possible to store
* the difference between the previous and the new values of the
* header field in *to*, by setting *from* and the four lowest
* bits of *flags* to 0. For both methods, *offset* indicates the
* location of the IP checksum within the packet. In addition to
* the size of the field, actual flags can be added to *flags*
* (bitwise OR). With **BPF_F_MARK_MANGLED_0**, a null checksum is left
* untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
* for updates resulting in a null checksum the value is set to
* **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
* the checksum is to be computed against a pseudo-header.
*
* This helper works in combination with **bpf_csum_diff**\ (),
* which does not update the checksum in-place, but offers more
* flexibility and can handle sizes larger than 2 or 4 for the
* checksum to update.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
* Description
* This special helper is used to trigger a "tail call", or in
* other words, to jump into another eBPF program. The same stack
* frame is used (but values on stack and in registers for the
* caller are not accessible to the callee). This mechanism allows
* for program chaining, either for raising the maximum number of
* available eBPF instructions, or to execute given programs in
* conditional blocks. For security reasons, there is an upper
* limit to the number of successive tail calls that can be
* performed.
*
* Upon call of this helper, the program attempts to jump into a
* program referenced at index *index* in *prog_array_map*, a
* special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
* *ctx*, a pointer to the context.
*
* If the call succeeds, the kernel immediately runs the first
* instruction of the new program. This is not a function call,
* and it never returns to the previous program. If the call
* fails, then the helper has no effect, and the caller continues
* to run its subsequent instructions. A call can fail if the
* destination program for the jump does not exist (i.e. *index*
* is greater than or equal to the number of entries in *prog_array_map*), or
* if the maximum number of tail calls has been reached for this
* chain of programs. This limit is defined in the kernel by the
* macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
* which is currently set to 32.
* Return
* 0 on success, or a negative error in case of failure.
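*
* A sketch of a dispatcher on an assumed protocol number *proto*,
* with a hypothetical program array *jmp_table* populated from
* user space (default action assumes an XDP context):
*
* ::
*
* bpf_tail_call(ctx, &jmp_table, proto);
* // only reached if the tail call failed
* return XDP_PASS;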
*
* int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
* Description
* Clone and redirect the packet associated to *skb* to another
* net device of index *ifindex*. Both ingress and egress
* interfaces can be used for redirection. The **BPF_F_INGRESS**
* value in *flags* is used to make the distinction (ingress path
* is selected if the flag is present, egress path otherwise).
* This is the only flag supported for now.
*
* In comparison with **bpf_redirect**\ () helper,
* **bpf_clone_redirect**\ () has the associated cost of
* duplicating the packet buffer, but this can be executed out of
* the eBPF program. Conversely, **bpf_redirect**\ () is more
* efficient, but it is handled through an action code where the
* redirection happens only after the eBPF program has returned.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* u64 bpf_get_current_pid_tgid(void)
* Return
* A 64-bit integer containing the current tgid and pid, and
* created as such:
* *current_task*\ **->tgid << 32 \|**
* *current_task*\ **->pid**.
*
* u64 bpf_get_current_uid_gid(void)
* Return
* A 64-bit integer containing the current GID and UID, and
* created as such: *current_gid* **<< 32 \|** *current_uid*.
*
* int bpf_get_current_comm(char *buf, u32 size_of_buf)
* Description
* Copy the **comm** attribute of the current task into *buf* of
* *size_of_buf*. The **comm** attribute contains the name of
* the executable (excluding the path) for the current task. The
* *size_of_buf* must be strictly positive. On success, the
* helper makes sure that the *buf* is NUL-terminated. On failure,
* it is filled with zeroes.
* Return
* 0 on success, or a negative error in case of failure.
*
* u32 bpf_get_cgroup_classid(struct sk_buff *skb)
* Description
* Retrieve the classid for the current task, i.e. for the net_cls
* cgroup to which *skb* belongs.
*
* This helper can be used on TC egress path, but not on ingress.
*
* The net_cls cgroup provides an interface to tag network packets
* based on a user-provided identifier for all traffic coming from
* the tasks belonging to the related cgroup. See also the related
* kernel documentation, available from the Linux sources in file
* *Documentation/cgroup-v1/net_cls.txt*.
*
* The Linux kernel has two versions for cgroups: there are
* cgroups v1 and cgroups v2. Both are available to users, who can
* use a mixture of them, but note that the net_cls cgroup is for
* cgroup v1 only. This makes it incompatible with BPF programs
* run on cgroups, which is a cgroup-v2-only feature (a socket can
* only hold data for one version of cgroups at a time).
*
* This helper is only available if the kernel was compiled with
* the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
* "**y**" or to "**m**".
* Return
* The classid, or 0 for the default unconfigured classid.
*
* int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
* Description
* Push a *vlan_tci* (VLAN tag control information) of protocol
* *vlan_proto* to the packet associated to *skb*, then update
* the checksum. Note that if *vlan_proto* is different from
* **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
* be **ETH_P_8021Q**.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_vlan_pop(struct sk_buff *skb)
* Description
* Pop a VLAN header from the packet associated to *skb*.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
* Description
* Get tunnel metadata. This helper takes a pointer *key* to an
* empty **struct bpf_tunnel_key** of **size**, that will be
* filled with tunnel metadata for the packet associated to *skb*.
* The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
* indicates that the tunnel is based on IPv6 protocol instead of
* IPv4.
*
* The **struct bpf_tunnel_key** is an object that generalizes the
* principal parameters used by various tunneling protocols into a
* single struct. This way, it can be used to easily make a
* decision based on the contents of the encapsulation header,
* "summarized" in this struct. In particular, it holds the IP
* address of the remote end (IPv4 or IPv6, depending on the case)
* in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
* this struct exposes the *key*\ **->tunnel_id**, which is
* generally mapped to a VNI (Virtual Network Identifier), making
* it programmable together with the **bpf_skb_set_tunnel_key**\
* () helper.
*
* Let's imagine that the following code is part of a program
* attached to the TC ingress interface, on one end of a GRE
* tunnel, and is supposed to filter out all messages coming from
* remote ends with IPv4 address other than 10.0.0.1:
*
* ::
*
* int ret;
* struct bpf_tunnel_key key = {};
*
* ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
* if (ret < 0)
* return TC_ACT_SHOT; // drop packet
*
* if (key.remote_ipv4 != 0x0a000001)
* return TC_ACT_SHOT; // drop packet
*
* return TC_ACT_OK; // accept packet
*
* This interface can also be used with all encapsulation devices
* that can operate in "collect metadata" mode: instead of having
* one network device per specific configuration, the "collect
* metadata" mode only requires a single device where the
* configuration can be extracted from this helper.
*
* This can be used together with various tunnels such as VXLan,
* Geneve, GRE or IP in IP (IPIP).
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
* Description
* Populate tunnel metadata for packet associated to *skb*. The
* tunnel metadata is set to the contents of *key*, of *size*. The
* *flags* can be set to a combination of the following values:
*
* **BPF_F_TUNINFO_IPV6**
* Indicate that the tunnel is based on IPv6 protocol
* instead of IPv4.
* **BPF_F_ZERO_CSUM_TX**
* For IPv4 packets, add a flag to tunnel metadata
* indicating that checksum computation should be skipped
* and checksum set to zeroes.
* **BPF_F_DONT_FRAGMENT**
* Add a flag to tunnel metadata indicating that the
* packet should not be fragmented.
* **BPF_F_SEQ_NUMBER**
* Add a flag to tunnel metadata indicating that a
* sequence number should be added to tunnel header before
* sending the packet. This flag was added for GRE
* encapsulation, but might be used with other protocols
* as well in the future.
*
* Here is a typical usage on the transmit path:
*
* ::
*
* struct bpf_tunnel_key key;
* populate key ...
* bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
* bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
*
* See also the description of the **bpf_skb_get_tunnel_key**\ ()
* helper for additional information.
* Return
* 0 on success, or a negative error in case of failure.
*
* u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
* Description
* Read the value of a perf event counter. This helper relies on a
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
* the perf event counter is selected when *map* is updated with
* perf event file descriptors. The *map* is an array whose size
* is the number of available CPUs, and each cell contains a value
* relative to one CPU. The value to retrieve is indicated by
* *flags*, that contains the index of the CPU to look up, masked
* with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
* **BPF_F_CURRENT_CPU** to indicate that the value for the
* current CPU should be retrieved.
*
* Note that before Linux 4.13, only hardware perf events could
* be retrieved.
*
* Also, be aware that the newer helper
* **bpf_perf_event_read_value**\ () is recommended over
* **bpf_perf_event_read**\ () in general. The latter has some ABI
* quirks where error and counter value are used as a return code
* (which is wrong to do since ranges may overlap). This issue is
* fixed with **bpf_perf_event_read_value**\ (), which at the same time
* provides more features over the **bpf_perf_event_read**\ ()
* interface. Please refer to the description of
* **bpf_perf_event_read_value**\ () for details.
* Return
* The value of the perf event counter read from the map, or a
* negative error code in case of failure.
*
* int bpf_redirect(u32 ifindex, u64 flags)
* Description
* Redirect the packet to another net device of index *ifindex*.
* This helper is somewhat similar to **bpf_clone_redirect**\
* (), except that the packet is not cloned, which provides
* increased performance.
*
* Except for XDP, both ingress and egress interfaces can be used
* for redirection. The **BPF_F_INGRESS** value in *flags* is used
* to make the distinction (ingress path is selected if the flag
* is present, egress path otherwise). Currently, XDP only
* supports redirection to the egress interface, and accepts no
* flag at all.
*
* The same effect can be attained with the more generic
* **bpf_redirect_map**\ (), which requires specific maps to be
* used but offers better performance.
* Return
* For XDP, the helper returns **XDP_REDIRECT** on success or
* **XDP_ABORTED** on error. For other program types, the values
* are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
* error.
*
* u32 bpf_get_route_realm(struct sk_buff *skb)
* Description
* Retrieve the realm of the route, that is to say the
* **tclassid** field of the destination for the *skb*. The
* indentifier retrieved is a user-provided tag, similar to the
* one used with the net_cls cgroup (see description for
* **bpf_get_cgroup_classid**\ () helper), but here this tag is
* held by a route (a destination entry), not by a task.
*
* Retrieving this identifier works with the clsact TC egress hook
* (see also **tc-bpf(8)**), or alternatively on conventional
* classful egress qdiscs, but not on TC ingress path. In case of
* clsact TC egress hook, this has the advantage that, internally,
* the destination entry has not been dropped yet in the transmit
* path. Therefore, the destination entry does not need to be
* artificially held via **netif_keep_dst**\ () for a classful
* qdisc until the *skb* is freed.
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_IP_ROUTE_CLASSID** configuration option.
* Return
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
* int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
* event must have the following attributes: **PERF_SAMPLE_RAW**
* as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
* **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
*
* The *flags* are used to indicate the index in *map* for which
* the value must be put, masked with **BPF_F_INDEX_MASK**.
* Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
* to indicate that the index of the current CPU core should be
* used.
*
* The value to write, of *size*, is passed through the eBPF
* stack and pointed by *data*.
*
* The context of the program *ctx* also needs to be passed to
* the helper.
*
* In user space, a program willing to read the values needs to
* call **perf_event_open**\ () on the perf event (either for
* one or for all CPUs) and to store the file descriptor into the
* *map*. This must be done before the eBPF program can send data
* into it. An example is available in file
* *samples/bpf/trace_output_user.c* in the Linux kernel source
* tree (the eBPF program counterpart is in
* *samples/bpf/trace_output_kern.c*).
*
* **bpf_perf_event_output**\ () achieves better performance
* than **bpf_trace_printk**\ () for sharing data with user
* space, and is much better suited for streaming data from eBPF
* programs.
*
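* As a sketch, pushing a small custom struct to user space from
* a tracing program (the *events* map and **struct event** are
* assumptions made for this example):
*
* ::
*
* struct event { u32 pid; u64 ts; };
*
* struct event e = {};
* e.pid = bpf_get_current_pid_tgid() >> 32;
* e.ts = bpf_ktime_get_ns();
* bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
*                       &e, sizeof(e));
*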
* Note that this helper is not restricted to tracing use cases
* and can be used with programs attached to TC or XDP as well,
* where it allows for passing data to user space listeners. Data
* can be:
*
* * Only custom structs,
* * Only the packet payload, or
* * A combination of both.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
* Description
* This helper was provided as an easy way to load data from a
* packet. It can be used to load *len* bytes from *offset* from
* the packet associated to *skb*, into the buffer pointed by
* *to*.
*
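* For instance, in a TC classifier, copying the first bytes of
* a packet into a stack buffer for parsing could be sketched as
* (the 40-byte size is an arbitrary example):
*
* ::
*
* char hdr[40];
*
* if (bpf_skb_load_bytes(skb, 0, hdr, sizeof(hdr)) < 0)
*         return TC_ACT_SHOT;
* // parse hdr[] on the stack
*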
* Since Linux 4.7, usage of this helper has mostly been replaced
* by "direct packet access", enabling packet data to be
* manipulated with *skb*\ **->data** and *skb*\ **->data_end**
* pointing respectively to the first byte of packet data and to
* the byte after the last byte of packet data. However, it
* remains useful if one wishes to read large quantities of data
* at once from a packet into the eBPF stack.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
* on which the tracing program is executed, and a pointer to a
* *map* of type **BPF_MAP_TYPE_STACK_TRACE**.
*
* The last argument, *flags*, holds the number of stack frames to
* skip (from 0 to 255), masked with
* **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
* a combination of the following flags:
*
* **BPF_F_USER_STACK**
* Collect a user space stack instead of a kernel stack.
* **BPF_F_FAST_STACK_CMP**
* Compare stacks by hash only.
* **BPF_F_REUSE_STACKID**
* If two different stacks hash into the same *stackid*,
* discard the old one.
*
* The stack id retrieved is a 32 bit long integer handle which
* can be further combined with other data (including other stack
* ids) and used as a key into maps. This can be useful for
* generating a variety of graphs (such as flame graphs or off-cpu
* graphs).
*
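* A sketch counting occurrences of each kernel stack (the
* *stack_traces* and *counts* map names are assumptions of this
* example):
*
* ::
*
* int id = bpf_get_stackid(ctx, &stack_traces, 0);
*
* if (id >= 0) {
*         u64 *val = bpf_map_lookup_elem(&counts, &id);
*         if (val)
*                 __sync_fetch_and_add(val, 1);
* }
*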
* For walking a stack, this helper is an improvement over
* **bpf_probe_read**\ (), which can be used with unrolled loops
* but is not efficient and consumes a lot of eBPF instructions.
* Instead, **bpf_get_stackid**\ () can collect up to
* **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
* this limit can be controlled with the **sysctl** program, and
* that it should be manually increased in order to profile long
* user stacks (such as stacks for Java programs). To do so, use:
*
* ::
*
* # sysctl kernel.perf_event_max_stack=<new value>
*
* Return
* The positive or null stack id on success, or a negative error
* in case of failure.
*
* s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
* Description
* Compute a checksum difference, from the raw buffer pointed by
* *from*, of length *from_size* (that must be a multiple of 4),
* towards the raw buffer pointed by *to*, of size *to_size*
* (same remark). An optional *seed* can be added to the value
* (this can be cascaded, the seed may come from a previous call
* to the helper).
*
* This is flexible enough to be used in several ways:
*
* * With *from_size* == 0, *to_size* > 0 and *seed* set to
* checksum, it can be used when pushing new data.
* * With *from_size* > 0, *to_size* == 0 and *seed* set to
* checksum, it can be used when removing data from a packet.
* * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
* can be used to compute a diff. Note that *from_size* and
* *to_size* do not need to be equal.
*
* This helper can be used in combination with
* **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
* which one can feed in the difference computed with
* **bpf_csum_diff**\ ().
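*
* For instance, computing the difference for a rewritten IPv4
* address and feeding it to **bpf_l4_csum_replace**\ () could be
* sketched as follows (*csum_off* is an assumed, precomputed
* offset to the L4 checksum field):
*
* ::
*
* s64 diff = bpf_csum_diff(&old_ip, 4, &new_ip, 4, 0);
*
* if (diff >= 0)
*         bpf_l4_csum_replace(skb, csum_off, 0, diff,
*                             BPF_F_PSEUDO_HDR);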
* Return
* The checksum result, or a negative error code in case of
* failure.
*
* int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
* Description
* Retrieve tunnel options metadata for the packet associated to
* *skb*, and store the raw tunnel option data to the buffer *opt*
* of *size*.
*
* This helper can be used with encapsulation devices that can
* operate in "collect metadata" mode (please refer to the related
* note in the description of **bpf_skb_get_tunnel_key**\ () for
* more details). A particular example where this can be used is
* in combination with the Geneve encapsulation protocol, where it
* allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper)
* and retrieving arbitrary TLVs (Type-Length-Value headers) from
* the eBPF program. This allows for full customization of these
* headers.
* Return
* The size of the option data retrieved.
*
* int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
* Description
* Set tunnel options metadata for the packet associated to *skb*
* to the option data contained in the raw buffer *opt* of *size*.
*
* See also the description of the **bpf_skb_get_tunnel_opt**\ ()
* helper for additional information.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
* Description
* Change the protocol of the *skb* to *proto*. Currently
* supported are transitions from IPv4 to IPv6, and from IPv6 to
* IPv4. The helper takes care of the groundwork for the
* transition, including resizing the socket buffer. The eBPF
* program is expected to fill the new headers, if any, via
* **bpf_skb_store_bytes**\ () and to recompute the checksums with
* **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
* (). The main case for this helper is to perform NAT64
* operations out of an eBPF program.
*
* Internally, the GSO type is marked as dodgy so that headers are
* checked and segments are recalculated by the GSO/GRO engine.
* The size for GSO target is adapted as well.
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_change_type(struct sk_buff *skb, u32 type)
* Description
* Change the packet type for the packet associated to *skb*. This
* comes down to setting *skb*\ **->pkt_type** to *type*, except
* the eBPF program does not have write access to *skb*\
* **->pkt_type** besides this helper. Using a helper here allows
* for graceful handling of errors.
*
* The major use case is to change incoming *skb*s to
* **PACKET_HOST** in a programmatic way instead of having to
* recirculate via **bpf_redirect**\ (..., **BPF_F_INGRESS**), for
* example.
*
* Note that *type* only allows certain values. At this time, they
* are:
*
* **PACKET_HOST**
* Packet is for us.
* **PACKET_BROADCAST**
* Send packet to all.
* **PACKET_MULTICAST**
* Send packet to group.
* **PACKET_OTHERHOST**
* Send packet to someone else.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
* Description
* Check whether *skb* is a descendant of the cgroup2 held by
* *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
* Return
* The return value depends on the result of the test, and can be:
*
* * 0, if the *skb* failed the cgroup2 descendant test.
* * 1, if the *skb* succeeded the cgroup2 descendant test.
* * A negative error code, if an error occurred.
*
* u32 bpf_get_hash_recalc(struct sk_buff *skb)
* Description
* Retrieve the hash of the packet, *skb*\ **->hash**. If it is
* not set, in particular if the hash was cleared due to mangling,
* recompute this hash. Later accesses to the hash can be done
* directly with *skb*\ **->hash**.
*
* Calling **bpf_set_hash_invalid**\ (), changing a packet
* protocol with **bpf_skb_change_proto**\ (), or calling
* **bpf_skb_store_bytes**\ () with the
* **BPF_F_INVALIDATE_HASH** flag are actions susceptible to clear
* the hash and to trigger a new computation for the next call to
* **bpf_get_hash_recalc**\ ().
* Return
* The 32-bit hash.
*
* u64 bpf_get_current_task(void)
* Return
* A pointer to the current task struct.
*
* int bpf_probe_write_user(void *dst, const void *src, u32 len)
* Description
* Attempt in a safe way to write *len* bytes from the buffer
* *src* to *dst* in memory. It only works for threads that are in
* user context, and *dst* must be a valid user space address.
*
* This helper should not be used to implement any kind of
* security mechanism because of TOC-TOU attacks, but rather to
* debug, divert, and manipulate execution of semi-cooperative
* processes.
*
* Keep in mind that this feature is meant for experiments, and it
* has a risk of crashing the system and running programs.
* Therefore, when an eBPF program using this helper is attached,
* a warning including PID and process name is printed to kernel
* logs.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
* Description
* Check whether the probe is being run in the context of a given
* subset of the cgroup2 hierarchy. The cgroup2 to test is held by
* *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
* Return
* The return value depends on the result of the test, and can be:
*
* * 0, if current task does not belong to the cgroup2.
* * 1, if current task belongs to the cgroup2.
* * A negative error code, if an error occurred.
*
* int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
* Description
* Resize (trim or grow) the packet associated to *skb* to the
* new *len*. The *flags* are reserved for future usage, and must
* be left at zero.
*
* The basic idea is that the helper performs the needed work to
* change the size of the packet, then the eBPF program rewrites
* the rest via helpers like **bpf_skb_store_bytes**\ (),
* **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
* and others. This helper is a slow path utility intended for
* replies with control messages. And because it is targeted for
* slow path, the helper itself can afford to be slow: it
* implicitly linearizes, unclones and drops offloads from the
* *skb*.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
* Description
* Pull in non-linear data in case the *skb* is non-linear and not
* all of *len* are part of the linear section. Make *len* bytes
* from *skb* readable and writable. If a zero value is passed for
* *len*, then the whole length of the *skb* is pulled.
*
* This helper is only needed for reading and writing with direct
* packet access.
*
* For direct packet access, testing that offsets to access
* are within packet boundaries (test on *skb*\ **->data_end**) is
* susceptible to fail if offsets are invalid, or if the requested
* data is in non-linear parts of the *skb*. On failure the
* program can just bail out, or in the case of a non-linear
* buffer, use a helper to make the data available. The
* **bpf_skb_load_bytes**\ () helper is a first solution to access
* the data. Another one consists in using **bpf_skb_pull_data**\
* () to pull in the non-linear parts once, then retesting and
* eventually accessing the data.
*
* At the same time, this also makes sure the *skb* is uncloned,
* which is a necessary condition for direct write. As this needs
* to be an invariant for the write part only, the verifier
* detects writes and adds a prologue that is calling
* **bpf_skb_pull_data**\ () to effectively unclone the *skb* from
* the very beginning in case it is indeed cloned.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
* Description
* Add the checksum *csum* into *skb*\ **->csum** in case the
* driver has supplied a checksum for the entire packet into that
* field. Return an error otherwise. This helper is intended to be
* used in combination with **bpf_csum_diff**\ (), in particular
* when the checksum needs to be updated after data has been
* written into the packet through direct packet access.
* Return
* The checksum on success, or a negative error code in case of
* failure.
*
* void bpf_set_hash_invalid(struct sk_buff *skb)
* Description
* Invalidate the current *skb*\ **->hash**. It can be used after
* mangling on headers through direct packet access, in order to
* indicate that the hash is outdated and to trigger a
* recalculation the next time the kernel tries to access this
* hash or when the **bpf_get_hash_recalc**\ () helper is called.
*
* int bpf_get_numa_node_id(void)
* Description
* Return the id of the current NUMA node. The primary use case
* for this helper is the selection of sockets for the local NUMA
* node, when the program is attached to sockets using the
* **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
* but the helper is also available to other eBPF program types,
* similarly to **bpf_get_smp_processor_id**\ ().
* Return
* The id of the current NUMA node.
*
* int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
* Description
* Grow the headroom of the packet associated to *skb* and adjust
* the offset of the MAC header accordingly, adding *len* bytes of
* space. It automatically extends and reallocates memory as
* required.
*
* This helper can be used on a layer 3 *skb* to push a MAC header
* for redirection into a layer 2 device.
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
* it is possible to use a negative value for *delta*. This helper
* can be used to prepare the packet for pushing or popping
* headers.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe address
* *unsafe_ptr* to *dst*. The *size* should include the
* terminating NUL byte. In case the string length is smaller than
* *size*, the target is not padded with further NUL bytes. If the
* string length is larger than *size*, just *size*-1 bytes are
* copied and the last byte is set to NUL.
*
* On success, the length of the copied string is returned. This
* makes this helper useful in tracing programs for reading
* strings, and more importantly to get its length at runtime. See
* the following snippet:
*
* ::
*
* SEC("kprobe/sys_open")
* void bpf_sys_open(struct pt_regs *ctx)
* {
*         char buf[PATHLEN]; // PATHLEN is defined to 256
*         int res = bpf_probe_read_str(buf, sizeof(buf),
*                                      ctx->di);
*
*         // Consume buf, for example push it to
*         // userspace via bpf_perf_event_output(); we
*         // can use res (the string length) as event
*         // size, after checking its boundaries.
* }
*
* In comparison, using the **bpf_probe_read**\ () helper here
* instead to read the string would require estimating the length
* at compile time, and would often result in copying more memory
* than necessary.
*
* Another useful use case is when parsing individual process
* arguments or individual environment variables navigating
* *current*\ **->mm->arg_start** and *current*\
* **->mm->env_start**: using this helper and the return value,
* one can quickly iterate at the right offset of the memory area.
* Return
* On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative
* value.
*
* u64 bpf_get_socket_cookie(struct sk_buff *skb)
* Description
* If the **struct sk_buff** pointed by *skb* has a known socket,
* retrieve the cookie (generated by the kernel) of this socket.
* If no cookie has been set yet, generate a new cookie. Once
* generated, the socket cookie remains stable for the life of the
* socket. This helper can be useful for monitoring per socket
* networking traffic statistics as it provides a unique socket
* identifier per namespace.
* Return
* An 8-byte long non-decreasing number on success, or 0 if the
* socket field is missing inside *skb*.
*
* u32 bpf_get_socket_uid(struct sk_buff *skb)
* Return
* The owner UID of the socket associated to *skb*. If the socket
* is **NULL**, or if it is not a full socket (i.e. if it is a
* time-wait or a request socket instead), **overflowuid** value
* is returned (note that **overflowuid** might also be the actual
* UID value for the socket).
*
* u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
* Description
* Set the full hash for *skb* (set the field *skb*\ **->hash**)
* to value *hash*.
* Return
* 0
*
* int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
* Description
* Emulate a call to **setsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
* which the option resides and the name *optname* of the option
* must be specified, see **setsockopt(2)** for more information.
* The option value of length *optlen* is pointed by *optval*.
*
* This helper actually implements a subset of **setsockopt()**.
* It supports the following *level*\ s:
*
* * **SOL_SOCKET**, which supports the following *optname*\ s:
* **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
* **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
* * **IPPROTO_TCP**, which supports the following *optname*\ s:
* **TCP_CONGESTION**, **TCP_BPF_IW**,
* **TCP_BPF_SNDCWND_CLAMP**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
* Return: 0 on success or negative error code * Return
* * 0 on success, or a negative error in case of failure.
* int bpf_perf_prog_read_value(ctx, buf, buf_size) *
* read perf prog attached perf event counter and enabled/running time * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
* @ctx: pointer to ctx * Description
* @buf: buf to fill * Grow or shrink the room for data in the packet associated to
* @buf_size: size of the buf * *skb* by *len_diff*, and according to the selected *mode*.
* Return : 0 on success or negative error code *
* * There is a single supported mode at this time:
* int bpf_override_return(pt_regs, rc) *
* @pt_regs: pointer to struct pt_regs * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* @rc: the return value to set * (room space is added or removed below the layer 3 header).
* *
* int bpf_msg_redirect_map(map, key, flags) * All values for *flags* are reserved for future usage, and must
* Redirect msg to a sock in map using key as a lookup key for the * be left at zero.
* sock in map. *
* @map: pointer to sockmap * A call to this helper is susceptible to change the underlaying
* @key: key to lookup sock in map * packet buffer. Therefore, at load time, all checks on pointers
* @flags: reserved for future use * previously done by the verifier are invalidated and must be
* Return: SK_PASS * performed again, if the helper is used in combination with
* * direct packet access.
* int bpf_bind(ctx, addr, addr_len) * Return
* Bind socket to address. Only binding to IP is supported, no port can be * 0 on success, or a negative error in case of failure.
* set in addr. *
* @ctx: pointer to context of type bpf_sock_addr * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
* @addr: pointer to struct sockaddr to bind socket to * Description
* @addr_len: length of sockaddr structure * Redirect the packet to the endpoint referenced by *map* at
* Return: 0 on success or negative error code * index *key*. Depending on its type, this *map* can contain
* references to net devices (for forwarding packets through other
* ports), or to CPUs (for redirecting XDP frames to another CPU;
* but this is only implemented for native XDP (with driver
* support) as of this writing).
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
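* A sketch for XDP, with a **BPF_MAP_TYPE_DEVMAP** named
* *tx_ports* (an example name) holding egress interfaces:
*
* ::
*
* SEC("xdp")
* int xdp_redirect_port(struct xdp_md *ctx)
* {
*         return bpf_redirect_map(&tx_ports, 0, 0);
* }
*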
* When used to redirect packets to net devices, this helper
* provides a significant performance increase over
* This is due to various implementation details of the underlying
* mechanisms, one of which is the fact that **bpf_redirect_map**\
* () tries to send packets as a "bulk" to the device.
* Return
* **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
*
* int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
* Description
* Redirect the packet to the socket referenced by *map* (of type
* **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
* egress interfaces can be used for redirection. The
* **BPF_F_INGRESS** value in *flags* is used to make the
* distinction (ingress path is selected if the flag is present,
* egress path otherwise). This is the only flag supported for now.
* Return
* **SK_PASS** on success, or **SK_DROP** on error.
*
* int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
* Description
* Add an entry to, or update, a *map* referencing sockets. The
* *skops* is used as a new value for the entry associated to
* *key*. *flags* is one of:
*
* **BPF_NOEXIST**
* The entry for *key* must not exist in the map.
* **BPF_EXIST**
* The entry for *key* must already exist in the map.
* **BPF_ANY**
* No condition on the existence of the entry for *key*.
*
* If the *map* has eBPF programs (parser and verdict), those will
* be inherited by the socket being added. If the socket is
* already attached to eBPF programs, this results in an error.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust the address pointed by *xdp_md*\ **->data_meta** by
* *delta* (which can be positive or negative). Note that this
* operation modifies the address stored in *xdp_md*\ **->data**,
* so the latter must be loaded only after the helper has been
* called.
*
* The use of *xdp_md*\ **->data_meta** is optional and programs
* are not required to use it. The rationale is that when the
* packet is processed with XDP (e.g. as DoS filter), it is
* possible to push further meta data along with it before passing
* to the stack, and to give the guarantee that an ingress eBPF
* program attached as a TC classifier on the same device can pick
* this up for further post-processing. Since TC works with socket
* buffers, it remains possible to set from XDP the **mark** or
* **priority** fields, or other fields of the socket buffer.
* Having this scratch space generic and programmable allows for
* more flexibility as the user is free to store whatever meta
* data they need.
*
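* A sketch reserving four bytes of metadata for a later TC
* program to pick up (the stored value is arbitrary):
*
* ::
*
* if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(u32)))
*         return XDP_ABORTED;
* u32 *meta = (void *)(long)ctx->data_meta;
* if ((void *)(meta + 1) > (void *)(long)ctx->data)
*         return XDP_ABORTED;
* *meta = 42;
*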
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
* Description
* Read the value of a perf event counter, and store it into *buf*
* of size *buf_size*. This helper relies on a *map* of type
* **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
* counter is selected when *map* is updated with perf event file
* descriptors. The *map* is an array whose size is the number of
* available CPUs, and each cell contains a value relative to one
* CPU. The value to retrieve is indicated by *flags*, that
* contains the index of the CPU to look up, masked with
* **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
* **BPF_F_CURRENT_CPU** to indicate that the value for the
* current CPU should be retrieved.
*
* This helper behaves in a way close to that of the
* **bpf_perf_event_read**\ () helper, save that instead of
* just returning the value observed, it fills the *buf*
* structure. This allows for additional data to be retrieved: in
* particular, the enabled and running times (in *buf*\
* **->enabled** and *buf*\ **->running**, respectively) are
* copied. In general, **bpf_perf_event_read_value**\ () is
* recommended over **bpf_perf_event_read**\ (), which has some
* ABI issues and provides fewer functionalities.
*
* These values are interesting, because hardware PMU (Performance
* Monitoring Unit) counters are limited resources. When there are
* more PMU based perf events opened than available counters,
* the kernel will multiplex these events so each event gets a
* certain percentage (but not all) of the PMU time. In case
* multiplexing happens, the number of samples or the counter
* value will not reflect what it would be when no multiplexing
* occurs. This makes comparison between different runs difficult.
* Typically, the counter value should be normalized before
* comparing to other experiments. The usual normalization is done
* as follows.
*
* ::
*
* normalized_counter = counter * t_enabled / t_running
*
* Where t_enabled is the time enabled for the event and t_running
* is the time running for the event since the last normalization.
* The enabled and running times are accumulated since the perf
* event open. To achieve a scaling factor between two invocations
* of an eBPF program, users can use the CPU id as the key (which
* is typical for the perf array usage model) to remember the
* previous value and do the calculation inside the eBPF program.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
* Description
* For an eBPF program attached to a perf event, retrieve the
* value of the event counter associated to *ctx* and store it in
* the structure pointed by *buf* and of size *buf_size*. Enabled
* and running times are also stored in the structure (see
* description of helper **bpf_perf_event_read_value**\ () for
* more details).
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
* Description
* Emulate a call to **getsockopt()** on the socket associated to
* *bpf_socket*, which must be a full socket. The *level* at
* which the option resides and the name *optname* of the option
* must be specified, see **getsockopt(2)** for more information.
* The retrieved value is stored in the structure pointed by
* *optval* and of length *optlen*.
*
* This helper actually implements a subset of **getsockopt()**.
* It supports the following *level*\ s:
*
* * **IPPROTO_TCP**, which supports *optname*
* **TCP_CONGESTION**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.
* The first argument is the context *regs* on which the kprobe
* works.
*
* This helper works by setting the PC (program counter)
* to an override function which is run in place of the original
* probed function. This means the probed function is not run at
* all. The replacement function just returns with the required
* value.
*
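* A sketch making the probed function report a memory
* allocation failure (**should_failslab** is a kernel function
* tagged for error injection; the choice is an example):
*
* ::
*
* SEC("kprobe/should_failslab")
* int override_slab(struct pt_regs *ctx)
* {
*         bpf_override_return(ctx, -ENOMEM);
*         return 0;
* }
*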
* This helper has security implications, and thus is subject to
* restrictions. It is only available if the kernel was compiled
* with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
* option, and in this case it only works on functions tagged with
* **ALLOW_ERROR_INJECTION** in the kernel code.
*
* Also, the helper is only available for the architectures having
* the **CONFIG_FUNCTION_ERROR_INJECTION** option. As of this
* writing, x86 architecture is the only one to support this
* feature.
* Return
* 0
*
* int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
* Description
* Attempt to set the value of the **bpf_sock_ops_cb_flags** field
* for the full TCP socket associated to *bpf_sock* to
* *argval*.
*
* The primary use of this field is to determine if there should
* be calls to eBPF programs of type
* **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
* code. A program of the same type can change its value, per
* connection and as necessary, when the connection is
* established. This field is directly accessible for reading, but
* this helper must be used for updates in order to return an
* error if an eBPF program tries to set a callback that is not
* supported in the current kernel.
*
* The supported callback values that *argval* can combine are:
*
* * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
* * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
* * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
*
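* For example, enabling the RTO and retransmission callbacks
* could be sketched as follows (*skops* being the program's
* context):
*
* ::
*
* bpf_sock_ops_cb_flags_set(skops,
*                           BPF_SOCK_OPS_RTO_CB_FLAG |
*                           BPF_SOCK_OPS_RETRANS_CB_FLAG);
*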
* Here are some examples of where one could call such eBPF
* program:
*
* * When RTO fires.
* * When a packet is retransmitted.
* * When the connection terminates.
* * When a packet is sent.
* * When a packet is received.
* Return
* Code **-EINVAL** if the socket is not a full TCP socket;
* otherwise, a positive number containing the bits that could not
* be set is returned (which comes down to 0 if all bits were set
* as required).
*
* int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
* Description
* This helper is used in programs implementing policies at the
* socket level. If the message *msg* is allowed to pass (i.e. if
* the verdict eBPF program returns **SK_PASS**), redirect it to
* the socket referenced by *map* (of type
* **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
* egress interfaces can be used for redirection. The
* **BPF_F_INGRESS** value in *flags* is used to make the
* distinction (ingress path is selected if the flag is present,
* egress path otherwise). This is the only flag supported for now.
* Return
* **SK_PASS** on success, or **SK_DROP** on error.
*
* int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
* Description
* For socket policies, apply the verdict of the eBPF program to
* the next *bytes* (number of bytes) of message *msg*.
*
* For example, this helper can be used in the following cases:
*
* * A single **sendmsg**\ () or **sendfile**\ () system call
* contains multiple logical messages that the eBPF program is
* supposed to read and for which it should apply a verdict.
* * An eBPF program only cares to read the first *bytes* of a
* *msg*. If the message has a large payload, then setting up
* and calling the eBPF program repeatedly for all bytes, even
* though the verdict is already known, would create unnecessary
* overhead.
*
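* A verdict program handling a fixed-size application header
* could be sketched as follows (the 64-byte header length is an
* example):
*
* ::
*
* SEC("sk_msg")
* int verdict_hdr(struct sk_msg_md *msg)
* {
*         bpf_msg_apply_bytes(msg, 64);
*         return SK_PASS;
* }
*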
* When called from within an eBPF program, the helper sets a
* counter internal to the BPF infrastructure, that is used to
* apply the last verdict to the next *bytes*. If *bytes* is
* smaller than the current data being processed from a
* **sendmsg**\ () or **sendfile**\ () system call, the first
* *bytes* will be sent and the eBPF program will be re-run with
* the pointer for start of data pointing to byte number *bytes*
* **+ 1**. If *bytes* is larger than the current data being
* processed, then the eBPF verdict will be applied to multiple
* **sendmsg**\ () or **sendfile**\ () calls until *bytes* are
* consumed.
*
* Note that if a socket closes with the internal counter holding
* a non-zero value, this is not a problem because data is not
* being buffered for *bytes* and is sent as it is received.
* Return
* 0
*
* int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
* Description
* For socket policies, prevent the execution of the verdict eBPF
* program for message *msg* until *bytes* (byte number) have been
* accumulated.
*
* This can be used when one needs a specific number of bytes
* before a verdict can be assigned, even if the data spans
* multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
* case would be a user calling **sendmsg**\ () repeatedly with
* 1-byte long message segments. Obviously, this is bad for
* performance, but it is still valid. If the eBPF program needs
* *bytes* bytes to validate a header, this helper can be used to
* prevent the eBPF program from being called again until *bytes*
* have been accumulated.
* Return
* 0
*
* int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
* Description
* For socket policies, pull in non-linear data from user space
* for *msg* and set pointers *msg*\ **->data** and *msg*\
* **->data_end** to *start* and *end* byte offsets into *msg*,
* respectively.
*
* If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
* *msg* it can only parse data that the (**data**, **data_end**)
* pointers have already consumed. For **sendmsg**\ () hooks this
* is likely the first scatterlist element. But for calls relying
* on the **sendpage** handler (e.g. **sendfile**\ ()) this will
* be the range (**0**, **0**) because the data is shared with
* user space and by default the objective is to avoid allowing
* user space to modify data while (or after) eBPF verdict is
* being decided. This helper can be used to pull in data and to
* set the start and end pointer to given values. Data will be
* copied if necessary (i.e. if data was not linear and if start
* and end pointers do not point to the same chunk).
*
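* A sketch making the first 20 bytes of the message readable
* and writable before inspecting them (the length is an example
* value):
*
* ::
*
* if (bpf_msg_pull_data(msg, 0, 20, 0))
*         return SK_DROP;
* // msg->data now covers bytes 0 to 20 of the message
*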
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
* Description
* Bind the socket associated to *ctx* to the address pointed by
* *addr*, of length *addr_len*. This allows for making outgoing
* connections from the desired IP address, which can be useful
* for example when all processes inside a cgroup should use one
* single IP address on a host that has multiple IP addresses
* configured.
*
* This helper works for IPv4 and IPv6, TCP and UDP sockets. The
* domain (*addr*\ **->sa_family**) must be **AF_INET** (or
* **AF_INET6**). Looking for a free port to bind to can be
* expensive, therefore binding to port is not permitted by the
* helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
* must be set to zero.
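*
* A minimal sketch for an attached socket-address program
* (10.0.0.1 is an example source address; the port is left at
* zero as required):
*
* ::
*
* struct sockaddr_in sa = {};
*
* sa.sin_family = AF_INET;
* sa.sin_addr.s_addr = bpf_htonl(0x0a000001); // 10.0.0.1
* bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));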
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
* only possible to shrink the packet as of this writing,
* therefore *delta* must be a negative integer.
*
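* For instance, trimming a frame to at most 64 bytes (an
* arbitrary cutoff for this example) could be sketched as:
*
* ::
*
* int len = (long)ctx->data_end - (long)ctx->data;
*
* if (len > 64)
*         bpf_xdp_adjust_tail(ctx, 64 - len);
*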
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
* performed again, if the helper is used in combination with
* direct packet access.
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
* Description
* Retrieve the XFRM state (IP transform framework, see also
* **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
*
* The retrieved value is stored in the **struct bpf_xfrm_state**
* pointed by *xfrm_state* and of length *size*.
*
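* A sketch checking the SPI of the first transform on the
* packet in a TC program (*expected_spi* is an assumption of
* this example):
*
* ::
*
* struct bpf_xfrm_state xs;
*
* if (!bpf_skb_get_xfrm_state(skb, 0, &xs, sizeof(xs), 0) &&
*     xs.spi == expected_spi)
*         return TC_ACT_OK;
*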
* All values for *flags* are reserved for future usage, and must
* be left at zero.
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_XFRM** configuration option.
* Return
* 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
...@@ -822,7 +1834,9 @@ union bpf_attr {
FN(msg_apply_bytes), \
FN(msg_cork_bytes), \
FN(msg_pull_data), \
FN(bind), \
FN(xdp_adjust_tail), \
FN(skb_get_xfrm_state),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
...@@ -928,6 +1942,19 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
};
/* user accessible mirror of in-kernel xfrm_state.
* new fields can only be added to the end of this structure
*/
struct bpf_xfrm_state {
__u32 reqid;
__u32 spi; /* Stored in network byte order */
__u16 family;
union {
__u32 remote_ipv4; /* Stored in network byte order */
__u32 remote_ipv6[4]; /* Stored in network byte order */
};
};
/* Generic BPF return codes which all BPF program types may support.
* The values are binary compatible with their TC_ACT_* counter-part to
* provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
* programs.
...@@ -1018,6 +2045,7 @@ struct bpf_prog_info {
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
__u32 gpl_compatible:1;
__u64 netns_dev;
__u64 netns_ino;
} __attribute__((aligned(8)));
...
...@@ -352,6 +352,10 @@ static int (*bpf_msg_pull_data)(void *msg, u32 start, u32 end, u64 flags) =
(void *) BPF_FUNC_msg_pull_data;
static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
(void *) BPF_FUNC_bind;
static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
(void *) BPF_FUNC_xdp_adjust_tail;
static int (*bpf_skb_get_xfrm_state)(void *ctx, u32 index, void *xfrm_state, u32 size, u64 flags) =
(void *) BPF_FUNC_skb_get_xfrm_state;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
...
...@@ -154,6 +154,8 @@ static struct bpf_helper helpers[] = {
{"msg_cork_bytes", "4.17"},
{"msg_pull_data", "4.17"},
{"bind", "4.17"},
{"xdp_adjust_tail", "4.18"},
{"skb_get_xfrm_state", "4.18"},
};
static uint64_t ptr_to_u64(void *ptr)
...