Commit 187e2af0 authored by Kui-Feng Lee's avatar Kui-Feng Lee Committed by Martin KaFai Lau

bpf: struct_ops supports more than one page for trampolines.

The BPF struct_ops previously only allowed one page of trampolines.
Each function pointer of a struct_ops is implemented by a struct_ops
bpf program. Each struct_ops bpf program requires a trampoline.
The following selftest patch shows each page can hold a little more
than 20 trampolines.

While one page is more than enough for the tcp-cc usecase,
the sched_ext use case shows that one page is not always enough and hits
the one page limit. This patch overcomes the one page limit by allocating
another page when needed and it is limited to a total of
MAX_IMAGE_PAGES (8) pages which is more than enough for
reasonable usages.

The variable st_map->image has been changed to st_map->image_pages, and
its type has been changed to an array of pointers to pages.
Signed-off-by: default avatarKui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240224223418.526631-3-thinker.li@gmail.comSigned-off-by: default avatarMartin KaFai Lau <martin.lau@kernel.org>
parent 73e4f9e6
...@@ -1763,7 +1763,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, ...@@ -1763,7 +1763,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link, struct bpf_tramp_link *link,
const struct btf_func_model *model, const struct btf_func_model *model,
void *stub_func, void *stub_func,
void *image, void *image_end); void **image, u32 *image_off,
bool allow_alloc);
void bpf_struct_ops_image_free(void *image);
static inline bool bpf_try_module_get(const void *data, struct module *owner) static inline bool bpf_try_module_get(const void *data, struct module *owner)
{ {
if (owner == BPF_MODULE_OWNER) if (owner == BPF_MODULE_OWNER)
......
...@@ -18,6 +18,8 @@ struct bpf_struct_ops_value { ...@@ -18,6 +18,8 @@ struct bpf_struct_ops_value {
char data[] ____cacheline_aligned_in_smp; char data[] ____cacheline_aligned_in_smp;
}; };
#define MAX_TRAMP_IMAGE_PAGES 8
struct bpf_struct_ops_map { struct bpf_struct_ops_map {
struct bpf_map map; struct bpf_map map;
struct rcu_head rcu; struct rcu_head rcu;
...@@ -30,12 +32,11 @@ struct bpf_struct_ops_map { ...@@ -30,12 +32,11 @@ struct bpf_struct_ops_map {
*/ */
struct bpf_link **links; struct bpf_link **links;
u32 links_cnt; u32 links_cnt;
/* image is a page that has all the trampolines u32 image_pages_cnt;
/* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog. * that stores the func args before calling the bpf_prog.
* A PAGE_SIZE "image" is enough to store all trampoline for
* "links[]".
*/ */
void *image; void *image_pages[MAX_TRAMP_IMAGE_PAGES];
/* The owner moduler's btf. */ /* The owner moduler's btf. */
struct btf *btf; struct btf *btf;
/* uvalue->data stores the kernel struct /* uvalue->data stores the kernel struct
...@@ -116,6 +117,31 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id, ...@@ -116,6 +117,31 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
return true; return true;
} }
static void *bpf_struct_ops_image_alloc(void)
{
void *image;
int err;
err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
return ERR_PTR(err);
image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!image) {
bpf_jit_uncharge_modmem(PAGE_SIZE);
return ERR_PTR(-ENOMEM);
}
return image;
}
void bpf_struct_ops_image_free(void *image)
{
if (image) {
arch_free_bpf_trampoline(image, PAGE_SIZE);
bpf_jit_uncharge_modmem(PAGE_SIZE);
}
}
#define MAYBE_NULL_SUFFIX "__nullable" #define MAYBE_NULL_SUFFIX "__nullable"
#define MAX_STUB_NAME 128 #define MAX_STUB_NAME 128
...@@ -461,6 +487,15 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) ...@@ -461,6 +487,15 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
} }
} }
static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
{
int i;
for (i = 0; i < st_map->image_pages_cnt; i++)
bpf_struct_ops_image_free(st_map->image_pages[i]);
st_map->image_pages_cnt = 0;
}
static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data) static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{ {
const struct btf_member *member; const struct btf_member *member;
...@@ -506,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { ...@@ -506,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link, struct bpf_tramp_link *link,
const struct btf_func_model *model, const struct btf_func_model *model,
void *stub_func, void *image, void *image_end) void *stub_func,
void **_image, u32 *_image_off,
bool allow_alloc)
{ {
u32 flags = BPF_TRAMP_F_INDIRECT; u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
void *image = *_image;
int size; int size;
tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].links[0] = link;
...@@ -518,12 +556,32 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, ...@@ -518,12 +556,32 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
flags |= BPF_TRAMP_F_RET_FENTRY_RET; flags |= BPF_TRAMP_F_RET_FENTRY_RET;
size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
if (size < 0) if (size <= 0)
return size; return size ? : -EFAULT;
if (size > (unsigned long)image_end - (unsigned long)image)
return -E2BIG; /* Allocate image buffer if necessary */
return arch_prepare_bpf_trampoline(NULL, image, image_end, if (!image || size > PAGE_SIZE - image_off) {
if (!allow_alloc)
return -E2BIG;
image = bpf_struct_ops_image_alloc();
if (IS_ERR(image))
return PTR_ERR(image);
image_off = 0;
}
size = arch_prepare_bpf_trampoline(NULL, image + image_off,
image + PAGE_SIZE,
model, flags, tlinks, stub_func); model, flags, tlinks, stub_func);
if (size <= 0) {
if (image != *_image)
bpf_struct_ops_image_free(image);
return size ? : -EFAULT;
}
*_image = image;
*_image_off = image_off + size;
return 0;
} }
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
...@@ -539,8 +597,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, ...@@ -539,8 +597,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_tramp_links *tlinks; struct bpf_tramp_links *tlinks;
void *udata, *kdata; void *udata, *kdata;
int prog_fd, err; int prog_fd, err;
void *image, *image_end; u32 i, trampoline_start, image_off = 0;
u32 i; void *cur_image = NULL, *image = NULL;
if (flags) if (flags)
return -EINVAL; return -EINVAL;
...@@ -578,8 +636,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, ...@@ -578,8 +636,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data; udata = &uvalue->data;
kdata = &kvalue->data; kdata = &kvalue->data;
image = st_map->image;
image_end = st_map->image + PAGE_SIZE;
module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) { for_each_member(i, t, member) {
...@@ -658,15 +714,24 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, ...@@ -658,15 +714,24 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
&bpf_struct_ops_link_lops, prog); &bpf_struct_ops_link_lops, prog);
st_map->links[i] = &link->link; st_map->links[i] = &link->link;
trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link, err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[i], &st_ops->func_models[i],
*(void **)(st_ops->cfi_stubs + moff), *(void **)(st_ops->cfi_stubs + moff),
image, image_end); &image, &image_off,
st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
if (err)
goto reset_unlock;
if (cur_image != image) {
st_map->image_pages[st_map->image_pages_cnt++] = image;
cur_image = image;
trampoline_start = 0;
}
if (err < 0) if (err < 0)
goto reset_unlock; goto reset_unlock;
*(void **)(kdata + moff) = image + cfi_get_offset(); *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
image += err;
/* put prog_id to udata */ /* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id; *(unsigned long *)(udata + moff) = prog->aux->id;
...@@ -677,10 +742,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, ...@@ -677,10 +742,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err) if (err)
goto reset_unlock; goto reset_unlock;
} }
for (i = 0; i < st_map->image_pages_cnt; i++)
arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
if (st_map->map.map_flags & BPF_F_LINK) { if (st_map->map.map_flags & BPF_F_LINK) {
err = 0; err = 0;
arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
/* Let bpf_link handle registration & unregistration. /* Let bpf_link handle registration & unregistration.
* *
* Pair with smp_load_acquire() during lookup_elem(). * Pair with smp_load_acquire() during lookup_elem().
...@@ -689,7 +755,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, ...@@ -689,7 +755,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock; goto unlock;
} }
arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
err = st_ops->reg(kdata); err = st_ops->reg(kdata);
if (likely(!err)) { if (likely(!err)) {
/* This refcnt increment on the map here after /* This refcnt increment on the map here after
...@@ -712,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, ...@@ -712,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to * there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps. * a sub-system through different struct_ops's maps.
*/ */
arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock: reset_unlock:
bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map); bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size); memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size); memset(kvalue, 0, map->value_size);
...@@ -781,10 +846,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) ...@@ -781,10 +846,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links) if (st_map->links)
bpf_struct_ops_map_put_progs(st_map); bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links); bpf_map_area_free(st_map->links);
if (st_map->image) { bpf_struct_ops_map_free_image(st_map);
arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
bpf_jit_uncharge_modmem(PAGE_SIZE);
}
bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map); bpf_map_area_free(st_map);
} }
...@@ -894,20 +956,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) ...@@ -894,20 +956,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->st_ops_desc = st_ops_desc; st_map->st_ops_desc = st_ops_desc;
map = &st_map->map; map = &st_map->map;
ret = bpf_jit_charge_modmem(PAGE_SIZE);
if (ret)
goto errout_free;
st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!st_map->image) {
/* __bpf_struct_ops_map_free() uses st_map->image as flag
* for "charged or not". In this case, we need to unchange
* here.
*/
bpf_jit_uncharge_modmem(PAGE_SIZE);
ret = -ENOMEM;
goto errout_free;
}
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->links_cnt = btf_type_vlen(t); st_map->links_cnt = btf_type_vlen(t);
st_map->links = st_map->links =
......
...@@ -91,6 +91,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, ...@@ -91,6 +91,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
struct bpf_tramp_link *link = NULL; struct bpf_tramp_link *link = NULL;
void *image = NULL; void *image = NULL;
unsigned int op_idx; unsigned int op_idx;
u32 image_off = 0;
int prog_ret; int prog_ret;
s32 type_id; s32 type_id;
int err; int err;
...@@ -114,12 +115,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, ...@@ -114,12 +115,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
goto out; goto out;
} }
image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!image) {
err = -ENOMEM;
goto out;
}
link = kzalloc(sizeof(*link), GFP_USER); link = kzalloc(sizeof(*link), GFP_USER);
if (!link) { if (!link) {
err = -ENOMEM; err = -ENOMEM;
...@@ -133,7 +128,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, ...@@ -133,7 +128,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
err = bpf_struct_ops_prepare_trampoline(tlinks, link, err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[op_idx], &st_ops->func_models[op_idx],
&dummy_ops_test_ret_function, &dummy_ops_test_ret_function,
image, image + PAGE_SIZE); &image, &image_off,
true);
if (err < 0) if (err < 0)
goto out; goto out;
...@@ -147,7 +143,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, ...@@ -147,7 +143,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
err = -EFAULT; err = -EFAULT;
out: out:
kfree(args); kfree(args);
arch_free_bpf_trampoline(image, PAGE_SIZE); bpf_struct_ops_image_free(image);
if (link) if (link)
bpf_link_put(&link->link); bpf_link_put(&link->link);
kfree(tlinks); kfree(tlinks);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment