Commit 80123f0a authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'bpf_prog_pack allocator'

Song Liu says:

====================

Changes v8 => v9:
1. Fix an error with multi function program, in 4/9.

Changes v7 => v8:
1. Rebase and fix conflicts.
2. Lock text_mutex for text_poke_copy. (Daniel)

Changes v6 => v7:
1. Redesign the interface between generic and arch logic, based on feedback
   from Alexei and Ilya.
2. Split 6/7 of v6 to 7/9 and 8/9 in v7, for cleaner logic.
3. Add bpf_arch_text_copy in 6/9.

Changes v5 => v6:
1. Make jit_hole_buffer 128 byte long. Only fill the first and last 128
   bytes of header with INT3. (Alexei)
2. Use kvmalloc for temporary buffer. (Alexei)
3. Rename tmp_header/tmp_image => rw_header/rw_image. Remove tmp_image from
   x64_jit_data. (Alexei)
4. Change fall back round_up_to in bpf_jit_binary_alloc_pack() from
   BPF_PROG_MAX_PACK_PROG_SIZE to PAGE_SIZE.

Changes v4 => v5:
1. Do not use atomic64 for bpf_jit_current. (Alexei)

Changes v3 => v4:
1. Rename text_poke_jit() => text_poke_copy(). (Peter)
2. Change comment style. (Peter)

Changes v2 => v3:
1. Fix tailcall.

Changes v1 => v2:
1. Use text_poke instead of writing through linear mapping. (Peter)
2. Avoid making changes to non-x86_64 code.

Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this could also add significant
pressure to instruction TLB. High iTLB pressure usually causes slow down
for the whole system, which includes visible performance degradation for
production workloads.

This set tries to solve this problem with customized allocator that pack
multiple programs into a huge page.

Patches 1-6 prepare the work. Patch 7 contains key logic of bpf_prog_pack
allocator. Patch 8 contains bpf_jit_binary_pack_alloc logic on top of
bpf_prog_pack allocator. Patch 9 uses this allocator in x86_64 jit.
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 128dac5f 1022a549
......@@ -159,6 +159,7 @@ config X86
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
......
......@@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
......
......@@ -1102,6 +1102,40 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
return __text_poke(addr, opcode, len);
}
/**
* text_poke_copy - Copy instructions into (an unused part of) RX memory
* @addr: address to modify
* @opcode: source of the copy
* @len: length to copy, could be more than 2x PAGE_SIZE
*
* Not safe against concurrent execution; useful for JITs to dump
* new code blocks into unused regions of RX memory. Can be used in
* conjunction with synchronize_rcu_tasks() to wait for existing
* execution to quiesce after having made sure no existing functions
* pointers are live.
*/
void *text_poke_copy(void *addr, const void *opcode, size_t len)
{
unsigned long start = (unsigned long)addr;
size_t patched = 0;
if (WARN_ON_ONCE(core_kernel_text(start)))
return NULL;
mutex_lock(&text_mutex);
while (patched < len) {
unsigned long ptr = start + patched;
size_t s;
s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
__text_poke((void *)ptr, opcode + patched, s);
patched += s;
}
mutex_unlock(&text_mutex);
return addr;
}
static void do_sync_core(void *info)
{
sync_core();
......
......@@ -330,8 +330,7 @@ static int emit_jump(u8 **pprog, void *func, void *ip)
}
static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr,
const bool text_live)
void *old_addr, void *new_addr)
{
const u8 *nop_insn = x86_nops[5];
u8 old_insn[X86_PATCH_SIZE];
......@@ -365,10 +364,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
goto out;
ret = 1;
if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
if (text_live)
text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
else
memcpy(ip, new_insn, X86_PATCH_SIZE);
text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
ret = 0;
}
out:
......@@ -384,7 +380,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
/* BPF poking in modules is not supported */
return -EINVAL;
return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
}
#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8)
......@@ -558,24 +554,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
mutex_lock(&array->aux->poke_mutex);
target = array->ptrs[poke->tail_call.key];
if (target) {
/* Plain memcpy is used when image is not live yet
* and still not locked as read-only. Once poke
* location is active (poke->tailcall_target_stable),
* any parallel bpf_arch_text_poke() might occur
* still on the read-write image until we finally
* locked it as read-only. Both modifications on
* the given image are under text_mutex to avoid
* interference.
*/
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP, NULL,
(u8 *)target->bpf_func +
poke->adj_off, false);
poke->adj_off);
BUG_ON(ret < 0);
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
(u8 *)poke->tailcall_target +
X86_PATCH_SIZE, NULL, false);
X86_PATCH_SIZE, NULL);
BUG_ON(ret < 0);
}
WRITE_ONCE(poke->tailcall_target_stable, true);
......@@ -866,7 +853,7 @@ static void emit_nops(u8 **pprog, int len)
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
......@@ -893,8 +880,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
push_callee_regs(&prog, callee_regs_used);
ilen = prog - temp;
if (image)
memcpy(image + proglen, temp, ilen);
if (rw_image)
memcpy(rw_image + proglen, temp, ilen);
proglen += ilen;
addrs[0] = proglen;
prog = temp;
......@@ -1323,6 +1310,9 @@ st: if (is_imm8(insn->off))
pr_err("extable->insn doesn't fit into 32-bit\n");
return -EFAULT;
}
/* switch ex to rw buffer for writes */
ex = (void *)rw_image + ((void *)ex - (void *)image);
ex->insn = delta;
ex->data = EX_TYPE_BPF;
......@@ -1705,7 +1695,7 @@ st: if (is_imm8(insn->off))
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
memcpy(image + proglen, temp, ilen);
memcpy(rw_image + proglen, temp, ilen);
}
proglen += ilen;
addrs[i] = proglen;
......@@ -2246,6 +2236,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
}
struct x64_jit_data {
struct bpf_binary_header *rw_header;
struct bpf_binary_header *header;
int *addrs;
u8 *image;
......@@ -2258,6 +2249,7 @@ struct x64_jit_data {
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *rw_header = NULL;
struct bpf_binary_header *header = NULL;
struct bpf_prog *tmp, *orig_prog = prog;
struct x64_jit_data *jit_data;
......@@ -2266,6 +2258,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
bool tmp_blinded = false;
bool extra_pass = false;
bool padding = false;
u8 *rw_image = NULL;
u8 *image = NULL;
int *addrs;
int pass;
......@@ -2301,6 +2294,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
oldproglen = jit_data->proglen;
image = jit_data->image;
header = jit_data->header;
rw_header = jit_data->rw_header;
rw_image = (void *)rw_header + ((void *)image - (void *)header);
extra_pass = true;
padding = true;
goto skip_init_addrs;
......@@ -2331,12 +2326,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
for (pass = 0; pass < MAX_PASSES || image; pass++) {
if (!padding && pass >= PADDING_PASSES)
padding = true;
proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding);
if (proglen <= 0) {
out_image:
image = NULL;
if (header)
bpf_jit_binary_free(header);
bpf_jit_binary_pack_free(header, rw_header);
prog = orig_prog;
goto out_addrs;
}
......@@ -2360,8 +2355,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
sizeof(struct exception_table_entry);
/* allocate module memory for x86 insns and extable */
header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
&image, align, jit_fill_hole);
header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size,
&image, align, &rw_header, &rw_image,
jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
......@@ -2377,14 +2373,22 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (image) {
if (!prog->is_func || extra_pass) {
/*
* bpf_jit_binary_pack_finalize fails in two scenarios:
* 1) header is not pointing to proper module memory;
* 2) the arch doesn't support bpf_arch_text_copy().
*
* Both cases are serious bugs that we should not continue.
*/
BUG_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header));
bpf_tail_call_direct_fixup(prog);
bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
jit_data->rw_header = rw_header;
}
prog->bpf_func = (void *)image;
prog->jited = 1;
......@@ -2412,3 +2416,10 @@ bool bpf_jit_supports_kfunc_call(void)
{
return true;
}
void *bpf_arch_text_copy(void *dst, void *src, size_t len)
{
if (text_poke_copy(dst, src, len) == NULL)
return ERR_PTR(-EINVAL);
return dst;
}
......@@ -846,8 +846,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
int bpf_jit_charge_modmem(u32 pages);
void bpf_jit_uncharge_modmem(u32 pages);
int bpf_jit_charge_modmem(u32 size);
void bpf_jit_uncharge_modmem(u32 size);
bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
#else
static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
......@@ -953,6 +953,7 @@ struct bpf_prog_aux {
bool sleepable;
bool tail_call_reachable;
bool xdp_has_frags;
bool use_bpf_prog_pack;
struct hlist_node tramp_hlist;
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
const struct btf_type *attach_func_proto;
......@@ -2362,6 +2363,8 @@ enum bpf_text_poke_type {
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
void *bpf_arch_text_copy(void *dst, void *src, size_t len);
struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
......
......@@ -548,7 +548,7 @@ struct sock_fprog_kern {
#define BPF_IMAGE_ALIGNMENT 8
struct bpf_binary_header {
u32 pages;
u32 size;
u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};
......@@ -886,17 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
set_vm_flush_reset_perms(hdr);
set_memory_ro((unsigned long)hdr, hdr->pages);
set_memory_x((unsigned long)hdr, hdr->pages);
}
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
unsigned long real_start = (unsigned long)fp->bpf_func;
unsigned long addr = real_start & PAGE_MASK;
return (void *)addr;
set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
}
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
......@@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
unsigned int alignment,
struct bpf_binary_header **rw_hdr,
u8 **rw_image,
bpf_jit_fill_hole_t bpf_fill_ill_insns);
int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header);
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header);
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
struct bpf_jit_poke_descriptor *poke);
......
......@@ -537,13 +537,10 @@ long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
unsigned long addr = (unsigned long)hdr;
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
......@@ -808,6 +805,133 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
/*
* BPF program pack allocator.
*
* Most BPF programs are pretty small. Allocating a hole page for each
* program is sometime a waste. Many small bpf program also adds pressure
* to instruction TLB. To solve this issue, we introduce a BPF program pack
* allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
* to host BPF programs.
*/
#define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE
#define BPF_PROG_CHUNK_SHIFT 6
#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
struct bpf_prog_pack {
struct list_head list;
void *ptr;
unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
};
#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE
#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
static DEFINE_MUTEX(pack_mutex);
static LIST_HEAD(pack_list);
static struct bpf_prog_pack *alloc_new_pack(void)
{
struct bpf_prog_pack *pack;
pack = kzalloc(sizeof(*pack), GFP_KERNEL);
if (!pack)
return NULL;
pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
if (!pack->ptr) {
kfree(pack);
return NULL;
}
bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
return pack;
}
static void *bpf_prog_pack_alloc(u32 size)
{
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
struct bpf_prog_pack *pack;
unsigned long pos;
void *ptr = NULL;
if (size > BPF_PROG_MAX_PACK_PROG_SIZE) {
size = round_up(size, PAGE_SIZE);
ptr = module_alloc(size);
if (ptr) {
set_vm_flush_reset_perms(ptr);
set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
}
return ptr;
}
mutex_lock(&pack_mutex);
list_for_each_entry(pack, &pack_list, list) {
pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
nbits, 0);
if (pos < BPF_PROG_CHUNK_COUNT)
goto found_free_area;
}
pack = alloc_new_pack();
if (!pack)
goto out;
pos = 0;
found_free_area:
bitmap_set(pack->bitmap, pos, nbits);
ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
out:
mutex_unlock(&pack_mutex);
return ptr;
}
static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
{
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
unsigned long pos;
void *pack_ptr;
if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) {
module_memfree(hdr);
return;
}
pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
mutex_lock(&pack_mutex);
list_for_each_entry(tmp, &pack_list, list) {
if (tmp->ptr == pack_ptr) {
pack = tmp;
break;
}
}
if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
goto out;
nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
bitmap_clear(pack->bitmap, pos, nbits);
if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
BPF_PROG_CHUNK_COUNT, 0) == 0) {
list_del(&pack->list);
module_memfree(pack->ptr);
kfree(pack);
}
out:
mutex_unlock(&pack_mutex);
}
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
......@@ -833,12 +957,11 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
int bpf_jit_charge_modmem(u32 pages)
int bpf_jit_charge_modmem(u32 size)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
if (!bpf_capable()) {
atomic_long_sub(pages, &bpf_jit_current);
atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
......@@ -846,9 +969,9 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
void bpf_jit_uncharge_modmem(u32 pages)
void bpf_jit_uncharge_modmem(u32 size)
{
atomic_long_sub(pages, &bpf_jit_current);
atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
......@@ -867,7 +990,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
u32 size, hole, start, pages;
u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
......@@ -877,20 +1000,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (bpf_jit_charge_modmem(pages))
if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
bpf_jit_uncharge_modmem(pages);
bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
hdr->pages = pages;
hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
......@@ -903,10 +1025,113 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
u32 pages = hdr->pages;
u32 size = hdr->size;
bpf_jit_free_exec(hdr);
bpf_jit_uncharge_modmem(pages);
bpf_jit_uncharge_modmem(size);
}
/* Allocate jit binary from bpf_prog_pack allocator.
* Since the allocated memory is RO+X, the JIT engine cannot write directly
* to the memory. To solve this problem, a RW buffer is also allocated at
* as the same time. The JIT engine should calculate offsets based on the
* RO memory address, but write JITed program to the RW buffer. Once the
* JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
* the JITed program to the RO memory.
*/
struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
struct bpf_binary_header **rw_header,
u8 **rw_image,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *ro_header;
u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
/* add 16 bytes for a random section of illegal instructions */
size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
if (bpf_jit_charge_modmem(size))
return NULL;
ro_header = bpf_prog_pack_alloc(size);
if (!ro_header) {
bpf_jit_uncharge_modmem(size);
return NULL;
}
*rw_header = kvmalloc(size, GFP_KERNEL);
if (!*rw_header) {
bpf_prog_pack_free(ro_header);
bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(*rw_header, size);
(*rw_header)->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
start = (get_random_int() % hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
*rw_image = &(*rw_header)->image[start];
return ro_header;
}
/* Copy JITed text from rw_header to its final location, the ro_header. */
int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
void *ptr;
ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
kvfree(rw_header);
if (IS_ERR(ptr)) {
bpf_prog_pack_free(ro_header);
return PTR_ERR(ptr);
}
prog->aux->use_bpf_prog_pack = true;
return 0;
}
/* bpf_jit_binary_pack_free is called in two different scenarios:
* 1) when the program is freed after;
* 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
* For case 2), we need to free both the RO memory and the RW buffer.
* Also, ro_header->size in 2) is not properly set yet, so rw_header->size
* is used for uncharge.
*/
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
u32 size = rw_header ? rw_header->size : ro_header->size;
bpf_prog_pack_free(ro_header);
kvfree(rw_header);
bpf_jit_uncharge_modmem(size);
}
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
unsigned long real_start = (unsigned long)fp->bpf_func;
unsigned long addr;
if (fp->aux->use_bpf_prog_pack)
addr = real_start & BPF_PROG_CHUNK_MASK;
else
addr = real_start & PAGE_MASK;
return (void *)addr;
}
/* This symbol is only overridden by archs that have different
......@@ -918,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
bpf_jit_binary_free(hdr);
if (fp->aux->use_bpf_prog_pack)
bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
else
bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
......@@ -2445,6 +2673,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
{
return ERR_PTR(-ENOTSUPP);
}
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
......
......@@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
bpf_jit_uncharge_modmem(1);
bpf_jit_uncharge_modmem(PAGE_SIZE);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
......@@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
err = bpf_jit_charge_modmem(1);
err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
goto out_free_im;
......@@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
out_free_image:
bpf_jit_free_exec(im->image);
out_uncharge:
bpf_jit_uncharge_modmem(1);
bpf_jit_uncharge_modmem(PAGE_SIZE);
out_free_im:
kfree(im);
out:
......
......@@ -13067,6 +13067,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
prog->jited_len = func[0]->jited_len;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_jit_attempt_done(prog);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment