Commit 8f50d5c4 authored by Martin KaFai Lau

Merge branch 'Allow struct_ops maps with a large number of programs'

Kui-Feng Lee says:

====================
The BPF struct_ops previously only allowed for one page to be used for
the trampolines of all links in a map. However, we have recently run
out of space due to the large number of BPF program links. By
allocating additional pages when we exhaust an existing page, we can
accommodate more links in a single map.

The variable st_map->image has been changed to st_map->image_pages,
and its type has been changed to an array of pointers to buffers of
PAGE_SIZE. Additional pages are allocated when all existing pages are
exhausted.

The test case loads a struct_ops map with 40 programs. Their
trampolines take about 6.6k+ bytes, i.e. over 1.5 pages, on x86.
---
Major differences from v3:

 - Refactor buffer allocations to bpf_struct_ops_tramp_buf_alloc() and
   bpf_struct_ops_tramp_buf_free().

Major differences from v2:

 - Move image buffer allocation to bpf_struct_ops_prepare_trampoline().

Major differences from v1:

 - Always free pages if failing to update.

 - Allocate 8 pages at most.

v3: https://lore.kernel.org/all/20240224030302.1500343-1-thinker.li@gmail.com/
v2: https://lore.kernel.org/all/20240221225911.757861-1-thinker.li@gmail.com/
v1: https://lore.kernel.org/all/20240216182828.201727-1-thinker.li@gmail.com/
====================
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
parents 01031fd4 93bc28d8
......@@ -1763,7 +1763,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
void *stub_func,
void *image, void *image_end);
void **image, u32 *image_off,
bool allow_alloc);
void bpf_struct_ops_image_free(void *image);
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
if (owner == BPF_MODULE_OWNER)
......
......@@ -18,6 +18,8 @@ struct bpf_struct_ops_value {
char data[] ____cacheline_aligned_in_smp;
};
#define MAX_TRAMP_IMAGE_PAGES 8
struct bpf_struct_ops_map {
struct bpf_map map;
struct rcu_head rcu;
......@@ -30,12 +32,11 @@ struct bpf_struct_ops_map {
*/
struct bpf_link **links;
u32 links_cnt;
/* image is a page that has all the trampolines
u32 image_pages_cnt;
/* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
* A PAGE_SIZE "image" is enough to store all trampoline for
* "links[]".
*/
void *image;
void *image_pages[MAX_TRAMP_IMAGE_PAGES];
/* The owner module's btf. */
struct btf *btf;
/* uvalue->data stores the kernel struct
......@@ -116,6 +117,31 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
return true;
}
/* Allocate one PAGE_SIZE trampoline image.
 *
 * The page is charged via bpf_jit_charge_modmem() before it is allocated.
 *
 * Returns the image on success. On failure returns an ERR_PTR(): the error
 * reported by bpf_jit_charge_modmem(), or -ENOMEM when
 * arch_alloc_bpf_trampoline() fails (the charge is rolled back in that
 * case). NULL is never returned.
 */
static void *bpf_struct_ops_image_alloc(void)
{
	void *page;
	int ret = bpf_jit_charge_modmem(PAGE_SIZE);

	if (ret)
		return ERR_PTR(ret);

	page = arch_alloc_bpf_trampoline(PAGE_SIZE);
	if (page)
		return page;

	/* Allocation failed: give back the charge taken above. */
	bpf_jit_uncharge_modmem(PAGE_SIZE);
	return ERR_PTR(-ENOMEM);
}
/* Release a trampoline image of PAGE_SIZE bytes and drop its
 * bpf_jit_uncharge_modmem() accounting.
 *
 * @image: image to free; a NULL pointer is accepted and ignored.
 */
void bpf_struct_ops_image_free(void *image)
{
	if (!image)
		return;

	arch_free_bpf_trampoline(image, PAGE_SIZE);
	bpf_jit_uncharge_modmem(PAGE_SIZE);
}
#define MAYBE_NULL_SUFFIX "__nullable"
#define MAX_STUB_NAME 128
......@@ -461,6 +487,15 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
}
}
/* Free every trampoline image page currently recorded in @st_map and
 * reset the page count to zero.
 *
 * Each slot is cleared after its page is released so that image_pages[]
 * never holds a dangling pointer once image_pages_cnt has been reset.
 */
static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
{
	/* Unsigned index: image_pages_cnt is an unsigned 32-bit counter. */
	unsigned int i;

	for (i = 0; i < st_map->image_pages_cnt; i++) {
		bpf_struct_ops_image_free(st_map->image_pages[i]);
		st_map->image_pages[i] = NULL;
	}
	st_map->image_pages_cnt = 0;
}
static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{
const struct btf_member *member;
......@@ -506,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
void *stub_func, void *image, void *image_end)
void *stub_func,
void **_image, u32 *_image_off,
bool allow_alloc)
{
u32 flags = BPF_TRAMP_F_INDIRECT;
u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
void *image = *_image;
int size;
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
......@@ -518,12 +556,32 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
if (size < 0)
return size;
if (size > (unsigned long)image_end - (unsigned long)image)
return -E2BIG;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
if (size <= 0)
return size ? : -EFAULT;
/* Allocate image buffer if necessary */
if (!image || size > PAGE_SIZE - image_off) {
if (!allow_alloc)
return -E2BIG;
image = bpf_struct_ops_image_alloc();
if (IS_ERR(image))
return PTR_ERR(image);
image_off = 0;
}
size = arch_prepare_bpf_trampoline(NULL, image + image_off,
image + PAGE_SIZE,
model, flags, tlinks, stub_func);
if (size <= 0) {
if (image != *_image)
bpf_struct_ops_image_free(image);
return size ? : -EFAULT;
}
*_image = image;
*_image_off = image_off + size;
return 0;
}
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
......@@ -539,8 +597,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_tramp_links *tlinks;
void *udata, *kdata;
int prog_fd, err;
void *image, *image_end;
u32 i;
u32 i, trampoline_start, image_off = 0;
void *cur_image = NULL, *image = NULL;
if (flags)
return -EINVAL;
......@@ -578,8 +636,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
image_end = st_map->image + PAGE_SIZE;
module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
......@@ -658,28 +714,39 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
&bpf_struct_ops_link_lops, prog);
st_map->links[i] = &link->link;
trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[i],
*(void **)(st_ops->cfi_stubs + moff),
image, image_end);
&st_ops->func_models[i],
*(void **)(st_ops->cfi_stubs + moff),
&image, &image_off,
st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
if (err)
goto reset_unlock;
if (cur_image != image) {
st_map->image_pages[st_map->image_pages_cnt++] = image;
cur_image = image;
trampoline_start = 0;
}
if (err < 0)
goto reset_unlock;
*(void **)(kdata + moff) = image + cfi_get_offset();
image += err;
*(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
}
if (st_ops->validate) {
err = st_ops->validate(kdata);
if (err)
goto reset_unlock;
}
for (i = 0; i < st_map->image_pages_cnt; i++)
arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0;
if (st_ops->validate) {
err = st_ops->validate(kdata);
if (err)
goto reset_unlock;
}
arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
......@@ -688,7 +755,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* This refcnt increment on the map here after
......@@ -711,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock:
bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
......@@ -780,10 +846,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
if (st_map->image) {
arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
bpf_jit_uncharge_modmem(PAGE_SIZE);
}
bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
......@@ -893,20 +956,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->st_ops_desc = st_ops_desc;
map = &st_map->map;
ret = bpf_jit_charge_modmem(PAGE_SIZE);
if (ret)
goto errout_free;
st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!st_map->image) {
/* __bpf_struct_ops_map_free() uses st_map->image as flag
* for "charged or not". In this case, we need to unchange
* here.
*/
bpf_jit_uncharge_modmem(PAGE_SIZE);
ret = -ENOMEM;
goto errout_free;
}
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->links_cnt = btf_type_vlen(t);
st_map->links =
......
......@@ -91,6 +91,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
struct bpf_tramp_link *link = NULL;
void *image = NULL;
unsigned int op_idx;
u32 image_off = 0;
int prog_ret;
s32 type_id;
int err;
......@@ -114,12 +115,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
goto out;
}
image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!image) {
err = -ENOMEM;
goto out;
}
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
......@@ -133,7 +128,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[op_idx],
&dummy_ops_test_ret_function,
image, image + PAGE_SIZE);
&image, &image_off,
true);
if (err < 0)
goto out;
......@@ -147,7 +143,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
err = -EFAULT;
out:
kfree(args);
arch_free_bpf_trampoline(image, PAGE_SIZE);
bpf_struct_ops_image_free(image);
if (link)
bpf_link_put(&link->link);
kfree(tlinks);
......
......@@ -146,11 +146,7 @@ EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
{
struct tcp_congestion_ops *existing;
int ret;
ret = tcp_validate_congestion_control(ca);
if (ret)
return ret;
int ret = 0;
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
......
......@@ -43,6 +43,50 @@ struct bpf_testmod_ops {
int b;
} unsupported;
int data;
/* The following pointers are used to test the maps having multiple
* pages of trampolines.
*/
int (*tramp_1)(int value);
int (*tramp_2)(int value);
int (*tramp_3)(int value);
int (*tramp_4)(int value);
int (*tramp_5)(int value);
int (*tramp_6)(int value);
int (*tramp_7)(int value);
int (*tramp_8)(int value);
int (*tramp_9)(int value);
int (*tramp_10)(int value);
int (*tramp_11)(int value);
int (*tramp_12)(int value);
int (*tramp_13)(int value);
int (*tramp_14)(int value);
int (*tramp_15)(int value);
int (*tramp_16)(int value);
int (*tramp_17)(int value);
int (*tramp_18)(int value);
int (*tramp_19)(int value);
int (*tramp_20)(int value);
int (*tramp_21)(int value);
int (*tramp_22)(int value);
int (*tramp_23)(int value);
int (*tramp_24)(int value);
int (*tramp_25)(int value);
int (*tramp_26)(int value);
int (*tramp_27)(int value);
int (*tramp_28)(int value);
int (*tramp_29)(int value);
int (*tramp_30)(int value);
int (*tramp_31)(int value);
int (*tramp_32)(int value);
int (*tramp_33)(int value);
int (*tramp_34)(int value);
int (*tramp_35)(int value);
int (*tramp_36)(int value);
int (*tramp_37)(int value);
int (*tramp_38)(int value);
int (*tramp_39)(int value);
int (*tramp_40)(int value);
};
#endif /* _BPF_TESTMOD_H */
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <test_progs.h>
#include "struct_ops_multi_pages.skel.h"
/* Load the struct_ops_multi_pages skeleton and attach its multi_pages map.
 *
 * The trampolines of skel->maps.multi_pages are expected to need more than
 * one page in total (at least on x86), so a successful attach exercises the
 * multi-page trampoline image path.
 */
static void do_struct_ops_multi_pages(void)
{
	struct struct_ops_multi_pages *obj;
	struct bpf_link *attached;

	obj = struct_ops_multi_pages__open_and_load();
	if (ASSERT_OK_PTR(obj, "struct_ops_multi_pages_open_and_load")) {
		attached = bpf_map__attach_struct_ops(obj->maps.multi_pages);
		ASSERT_OK_PTR(attached, "attach_multi_pages");

		/* Tear down in reverse order of creation. */
		bpf_link__destroy(attached);
		struct_ops_multi_pages__destroy(obj);
	}
}
/* Test entry point: runs the "multi_pages" subtest when it is selected. */
void test_struct_ops_multi_pages(void)
{
	if (!test__start_subtest("multi_pages"))
		return;

	do_struct_ops_multi_pages();
}
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "../bpf_testmod/bpf_testmod.h"
char _license[] SEC("license") = "GPL";
/* Define a BPF program named tramp_<x>, placed in section
 * "struct_ops/tramp_<x>", that takes a single int argument and returns it
 * unchanged.
 */
#define TRAMP(x) \
SEC("struct_ops/tramp_" #x) \
int BPF_PROG(tramp_ ## x, int a) \
{ \
return a; \
}
/* Instantiate 40 programs, tramp_1 .. tramp_40, via TRAMP(). */
TRAMP(1)
TRAMP(2)
TRAMP(3)
TRAMP(4)
TRAMP(5)
TRAMP(6)
TRAMP(7)
TRAMP(8)
TRAMP(9)
TRAMP(10)
TRAMP(11)
TRAMP(12)
TRAMP(13)
TRAMP(14)
TRAMP(15)
TRAMP(16)
TRAMP(17)
TRAMP(18)
TRAMP(19)
TRAMP(20)
TRAMP(21)
TRAMP(22)
TRAMP(23)
TRAMP(24)
TRAMP(25)
TRAMP(26)
TRAMP(27)
TRAMP(28)
TRAMP(29)
TRAMP(30)
TRAMP(31)
TRAMP(32)
TRAMP(33)
TRAMP(34)
TRAMP(35)
TRAMP(36)
TRAMP(37)
TRAMP(38)
TRAMP(39)
TRAMP(40)
/* Designated initializer: set member tramp_<x> to the program tramp_<x>. */
#define F_TRAMP(x) .tramp_ ## x = (void *)tramp_ ## x
/* struct_ops map, placed in the ".struct_ops.link" section, with all 40
 * tramp_N members of struct bpf_testmod_ops populated.
 */
SEC(".struct_ops.link")
struct bpf_testmod_ops multi_pages = {
F_TRAMP(1),
F_TRAMP(2),
F_TRAMP(3),
F_TRAMP(4),
F_TRAMP(5),
F_TRAMP(6),
F_TRAMP(7),
F_TRAMP(8),
F_TRAMP(9),
F_TRAMP(10),
F_TRAMP(11),
F_TRAMP(12),
F_TRAMP(13),
F_TRAMP(14),
F_TRAMP(15),
F_TRAMP(16),
F_TRAMP(17),
F_TRAMP(18),
F_TRAMP(19),
F_TRAMP(20),
F_TRAMP(21),
F_TRAMP(22),
F_TRAMP(23),
F_TRAMP(24),
F_TRAMP(25),
F_TRAMP(26),
F_TRAMP(27),
F_TRAMP(28),
F_TRAMP(29),
F_TRAMP(30),
F_TRAMP(31),
F_TRAMP(32),
F_TRAMP(33),
F_TRAMP(34),
F_TRAMP(35),
F_TRAMP(36),
F_TRAMP(37),
F_TRAMP(38),
F_TRAMP(39),
F_TRAMP(40),
};
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment