Commit 6dbae03b authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'optimize-bpf_tail_call'

Daniel Borkmann says:

====================
This gets rid of indirect jumps for BPF tail calls whenever possible.
The series adds emission for *direct* jumps for tail call maps in order
to avoid the retpoline overhead from a493a87f ("bpf, x64: implement
retpoline for tail call") for situations that allow for it, meaning,
for known constant keys at verification time which are used as index
into the tail call map. See patch 7/8 for more general details.

Thanks!

v1  -> v2:
  - added more test cases
  - u8 ip_stable -> bool (Andrii)
  - removed bpf_map_poke_{un,}lock and simplified the code (Andrii)
  - added break into prog_array_map_poke_untrack since there's just
    one prog (Andrii)
  - fixed typo: for for in commit msg (Andrii)
  - reworked __bpf_arch_text_poke (Andrii)
  - added subtests, and comment on tests themselves, NULL-NULL
    transistion (Andrii)
  - in constant map key tracking I've moved the map_poke_track callback
    to once we've finished creating the poke tab as otherwise concurrent
    access from tail call map would blow up (since we realloc the table)
rfc -> v1:
  - Applied Alexei's and Andrii's feeback from
    https://lore.kernel.org/bpf/cover.1573779287.git.daniel@iogearbox.net/T/#t
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents c4781e37 79d49ba0
......@@ -203,8 +203,9 @@ struct jit_context {
/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64
/* number of bytes emit_call() needs to generate call instruction */
#define X86_CALL_SIZE 5
/* Number of bytes emit_patch() needs to generate instructions */
#define X86_PATCH_SIZE 5
#define PROLOGUE_SIZE 25
......@@ -215,7 +216,7 @@ struct jit_context {
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
{
u8 *prog = *pprog;
int cnt = X86_CALL_SIZE;
int cnt = X86_PATCH_SIZE;
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
......@@ -238,6 +239,123 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
*pprog = prog;
}
static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
{
u8 *prog = *pprog;
int cnt = 0;
s64 offset;
offset = func - (ip + X86_PATCH_SIZE);
if (!is_simm32(offset)) {
pr_err("Target call %p is out of range\n", func);
return -ERANGE;
}
EMIT1_off32(opcode, offset);
*pprog = prog;
return 0;
}
static int emit_call(u8 **pprog, void *func, void *ip)
{
return emit_patch(pprog, func, ip, 0xE8);
}
static int emit_jump(u8 **pprog, void *func, void *ip)
{
return emit_patch(pprog, func, ip, 0xE9);
}
static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr,
const bool text_live)
{
int (*emit_patch_fn)(u8 **pprog, void *func, void *ip);
const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
u8 old_insn[X86_PATCH_SIZE] = {};
u8 new_insn[X86_PATCH_SIZE] = {};
u8 *prog;
int ret;
switch (t) {
case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP:
emit_patch_fn = emit_call;
break;
case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP:
emit_patch_fn = emit_jump;
break;
default:
return -ENOTSUPP;
}
switch (t) {
case BPF_MOD_NOP_TO_CALL:
case BPF_MOD_NOP_TO_JUMP:
if (!old_addr && new_addr) {
memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
prog = new_insn;
ret = emit_patch_fn(&prog, new_addr, ip);
if (ret)
return ret;
break;
}
return -ENXIO;
case BPF_MOD_CALL_TO_CALL:
case BPF_MOD_JUMP_TO_JUMP:
if (old_addr && new_addr) {
prog = old_insn;
ret = emit_patch_fn(&prog, old_addr, ip);
if (ret)
return ret;
prog = new_insn;
ret = emit_patch_fn(&prog, new_addr, ip);
if (ret)
return ret;
break;
}
return -ENXIO;
case BPF_MOD_CALL_TO_NOP:
case BPF_MOD_JUMP_TO_NOP:
if (old_addr && !new_addr) {
memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
prog = old_insn;
ret = emit_patch_fn(&prog, old_addr, ip);
if (ret)
return ret;
break;
}
return -ENXIO;
default:
return -ENOTSUPP;
}
ret = -EBUSY;
mutex_lock(&text_mutex);
if (memcmp(ip, old_insn, X86_PATCH_SIZE))
goto out;
if (text_live)
text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
else
memcpy(ip, new_insn, X86_PATCH_SIZE);
ret = 0;
out:
mutex_unlock(&text_mutex);
return ret;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr)
{
if (!is_kernel_text((long)ip) &&
!is_bpf_text_address((long)ip))
/* BPF poking in modules is not supported */
return -EINVAL;
return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
}
/*
* Generate the following code:
*
......@@ -252,7 +370,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
* goto *(prog->bpf_func + prologue_size);
* out:
*/
static void emit_bpf_tail_call(u8 **pprog)
static void emit_bpf_tail_call_indirect(u8 **pprog)
{
u8 *prog = *pprog;
int label1, label2, label3;
......@@ -319,6 +437,69 @@ static void emit_bpf_tail_call(u8 **pprog)
*pprog = prog;
}
static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
u8 **pprog, int addr, u8 *image)
{
u8 *prog = *pprog;
int cnt = 0;
/*
* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
EMIT2(X86_JA, 14); /* ja out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */
poke->ip = image + (addr - X86_PATCH_SIZE);
poke->adj_off = PROLOGUE_SIZE;
memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
/* out: */
*pprog = prog;
}
static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
{
static const enum bpf_text_poke_type type = BPF_MOD_NOP_TO_JUMP;
struct bpf_jit_poke_descriptor *poke;
struct bpf_array *array;
struct bpf_prog *target;
int i, ret;
for (i = 0; i < prog->aux->size_poke_tab; i++) {
poke = &prog->aux->poke_tab[i];
WARN_ON_ONCE(READ_ONCE(poke->ip_stable));
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
continue;
array = container_of(poke->tail_call.map, struct bpf_array, map);
mutex_lock(&array->aux->poke_mutex);
target = array->ptrs[poke->tail_call.key];
if (target) {
/* Plain memcpy is used when image is not live yet
* and still not locked as read-only. Once poke
* location is active (poke->ip_stable), any parallel
* bpf_arch_text_poke() might occur still on the
* read-write image until we finally locked it as
* read-only. Both modifications on the given image
* are under text_mutex to avoid interference.
*/
ret = __bpf_arch_text_poke(poke->ip, type, NULL,
(u8 *)target->bpf_func +
poke->adj_off, false);
BUG_ON(ret < 0);
}
WRITE_ONCE(poke->ip_stable, true);
mutex_unlock(&array->aux->poke_mutex);
}
}
static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
u32 dst_reg, const u32 imm32)
{
......@@ -480,72 +661,6 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
*pprog = prog;
}
static int emit_call(u8 **pprog, void *func, void *ip)
{
u8 *prog = *pprog;
int cnt = 0;
s64 offset;
offset = func - (ip + X86_CALL_SIZE);
if (!is_simm32(offset)) {
pr_err("Target call %p is out of range\n", func);
return -EINVAL;
}
EMIT1_off32(0xE8, offset);
*pprog = prog;
return 0;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr)
{
u8 old_insn[X86_CALL_SIZE] = {};
u8 new_insn[X86_CALL_SIZE] = {};
u8 *prog;
int ret;
if (!is_kernel_text((long)ip) &&
!is_bpf_text_address((long)ip))
/* BPF trampoline in modules is not supported */
return -EINVAL;
if (old_addr) {
prog = old_insn;
ret = emit_call(&prog, old_addr, (void *)ip);
if (ret)
return ret;
}
if (new_addr) {
prog = new_insn;
ret = emit_call(&prog, new_addr, (void *)ip);
if (ret)
return ret;
}
ret = -EBUSY;
mutex_lock(&text_mutex);
switch (t) {
case BPF_MOD_NOP_TO_CALL:
if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
goto out;
text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
break;
case BPF_MOD_CALL_TO_CALL:
if (memcmp(ip, old_insn, X86_CALL_SIZE))
goto out;
text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
break;
case BPF_MOD_CALL_TO_NOP:
if (memcmp(ip, old_insn, X86_CALL_SIZE))
goto out;
text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
break;
}
ret = 0;
out:
mutex_unlock(&text_mutex);
return ret;
}
static bool ex_handler_bpf(const struct exception_table_entry *x,
struct pt_regs *regs, int trapnr,
unsigned long error_code, unsigned long fault_addr)
......@@ -1013,7 +1128,11 @@ xadd: if (is_imm8(insn->off))
break;
case BPF_JMP | BPF_TAIL_CALL:
emit_bpf_tail_call(&prog);
if (imm32)
emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
&prog, addrs[i], image);
else
emit_bpf_tail_call_indirect(&prog);
break;
/* cond jump */
......@@ -1394,7 +1513,7 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
*/
orig_call += X86_CALL_SIZE;
orig_call += X86_PATCH_SIZE;
prog = image;
......@@ -1571,6 +1690,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (image) {
if (!prog->is_func || extra_pass) {
bpf_tail_call_direct_fixup(prog);
bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
......
......@@ -22,6 +22,7 @@ struct bpf_verifier_env;
struct bpf_verifier_log;
struct perf_event;
struct bpf_prog;
struct bpf_prog_aux;
struct bpf_map;
struct sock;
struct seq_file;
......@@ -64,6 +65,12 @@ struct bpf_map_ops {
const struct btf_type *key_type,
const struct btf_type *value_type);
/* Prog poke tracking helpers. */
int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
struct bpf_prog *new);
/* Direct value access helpers. */
int (*map_direct_value_addr)(const struct bpf_map *map,
u64 *imm, u32 off);
......@@ -488,6 +495,24 @@ struct bpf_func_info_aux {
bool unreliable;
};
enum bpf_jit_poke_reason {
BPF_POKE_REASON_TAIL_CALL,
};
/* Descriptor of pokes pointing /into/ the JITed image. */
struct bpf_jit_poke_descriptor {
void *ip;
union {
struct {
struct bpf_map *map;
u32 key;
} tail_call;
};
bool ip_stable;
u8 adj_off;
u16 reason;
};
struct bpf_prog_aux {
atomic64_t refcnt;
u32 used_map_cnt;
......@@ -513,6 +538,8 @@ struct bpf_prog_aux {
const char *attach_func_name;
struct bpf_prog **func;
void *jit_data; /* JIT specific data. arch dependent */
struct bpf_jit_poke_descriptor *poke_tab;
u32 size_poke_tab;
struct latch_tree_node ksym_tnode;
struct list_head ksym_lnode;
const struct bpf_prog_ops *ops;
......@@ -560,17 +587,26 @@ struct bpf_prog_aux {
};
};
struct bpf_array_aux {
/* 'Ownership' of prog array is claimed by the first program that
* is going to use this map or by the first program which FD is
* stored in the map to make sure that all callers and callees have
* the same prog type and JITed flag.
*/
enum bpf_prog_type type;
bool jited;
/* Programs with direct jumps into programs part of this array. */
struct list_head poke_progs;
struct bpf_map *map;
struct mutex poke_mutex;
struct work_struct work;
};
struct bpf_array {
struct bpf_map map;
u32 elem_size;
u32 index_mask;
/* 'ownership' of prog_array is claimed by the first program that
* is going to use this map or by the first program which FD is stored
* in the map to make sure that all callers and callees have the same
* prog_type and JITed flag
*/
enum bpf_prog_type owner_prog_type;
bool owner_jited;
struct bpf_array_aux *aux;
union {
char value[0] __aligned(8);
void *ptrs[0] __aligned(8);
......@@ -1031,6 +1067,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
{
return -ENOTSUPP;
}
static inline void bpf_map_put(struct bpf_map *map)
{
}
#endif /* CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
......@@ -1284,10 +1324,16 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
#endif /* CONFIG_INET */
enum bpf_text_poke_type {
/* All call-related pokes. */
BPF_MOD_NOP_TO_CALL,
BPF_MOD_CALL_TO_CALL,
BPF_MOD_CALL_TO_NOP,
/* All jump-related pokes. */
BPF_MOD_NOP_TO_JUMP,
BPF_MOD_JUMP_TO_JUMP,
BPF_MOD_JUMP_TO_NOP,
};
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
......
......@@ -293,7 +293,7 @@ struct bpf_verifier_state_list {
struct bpf_insn_aux_data {
union {
enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
unsigned long map_state; /* pointer/poison value for maps */
unsigned long map_ptr_state; /* pointer/poison value for maps */
s32 call_imm; /* saved imm field of call insn */
u32 alu_limit; /* limit for add/sub register with pointer */
struct {
......@@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
u32 map_off; /* offset from value base address */
};
};
u64 map_key_state; /* constant (32 bit) key tracking for maps */
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
int sanitize_stack_off; /* stack slot to be cleared */
bool seen; /* this insn was processed by the verifier */
......
......@@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
struct bpf_jit_poke_descriptor *poke);
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
const struct bpf_insn *insn, bool extra_pass,
u64 *func_addr, bool *func_addr_fixed);
......@@ -1055,6 +1058,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
return false;
}
static inline int
bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
struct bpf_jit_poke_descriptor *poke)
{
return -ENOTSUPP;
}
static inline void bpf_jit_free(struct bpf_prog *fp)
{
bpf_prog_unlock_free(fp);
......
......@@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
old_ptr = xchg(array->ptrs + index, new_ptr);
if (map->ops->map_poke_run) {
mutex_lock(&array->aux->poke_mutex);
old_ptr = xchg(array->ptrs + index, new_ptr);
map->ops->map_poke_run(map, index, old_ptr, new_ptr);
mutex_unlock(&array->aux->poke_mutex);
} else {
old_ptr = xchg(array->ptrs + index, new_ptr);
}
if (old_ptr)
map->ops->map_fd_put_ptr(old_ptr);
return 0;
}
......@@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
if (index >= array->map.max_entries)
return -E2BIG;
old_ptr = xchg(array->ptrs + index, NULL);
if (map->ops->map_poke_run) {
mutex_lock(&array->aux->poke_mutex);
old_ptr = xchg(array->ptrs + index, NULL);
map->ops->map_poke_run(map, index, old_ptr, NULL);
mutex_unlock(&array->aux->poke_mutex);
} else {
old_ptr = xchg(array->ptrs + index, NULL);
}
if (old_ptr) {
map->ops->map_fd_put_ptr(old_ptr);
return 0;
......@@ -671,17 +686,205 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
struct prog_poke_elem {
struct list_head list;
struct bpf_prog_aux *aux;
};
static int prog_array_map_poke_track(struct bpf_map *map,
struct bpf_prog_aux *prog_aux)
{
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
int ret = 0;
aux = container_of(map, struct bpf_array, map)->aux;
mutex_lock(&aux->poke_mutex);
list_for_each_entry(elem, &aux->poke_progs, list) {
if (elem->aux == prog_aux)
goto out;
}
elem = kmalloc(sizeof(*elem), GFP_KERNEL);
if (!elem) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&elem->list);
/* We must track the program's aux info at this point in time
* since the program pointer itself may not be stable yet, see
* also comment in prog_array_map_poke_run().
*/
elem->aux = prog_aux;
list_add_tail(&elem->list, &aux->poke_progs);
out:
mutex_unlock(&aux->poke_mutex);
return ret;
}
static void prog_array_map_poke_untrack(struct bpf_map *map,
struct bpf_prog_aux *prog_aux)
{
struct prog_poke_elem *elem, *tmp;
struct bpf_array_aux *aux;
aux = container_of(map, struct bpf_array, map)->aux;
mutex_lock(&aux->poke_mutex);
list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
if (elem->aux == prog_aux) {
list_del_init(&elem->list);
kfree(elem);
break;
}
}
mutex_unlock(&aux->poke_mutex);
}
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
enum bpf_text_poke_type type;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
if (!old && new)
type = BPF_MOD_NOP_TO_JUMP;
else if (old && !new)
type = BPF_MOD_JUMP_TO_NOP;
else if (old && new)
type = BPF_MOD_JUMP_TO_JUMP;
else
return;
aux = container_of(map, struct bpf_array, map)->aux;
WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
list_for_each_entry(elem, &aux->poke_progs, list) {
struct bpf_jit_poke_descriptor *poke;
int i, ret;
for (i = 0; i < elem->aux->size_poke_tab; i++) {
poke = &elem->aux->poke_tab[i];
/* Few things to be aware of:
*
* 1) We can only ever access aux in this context, but
* not aux->prog since it might not be stable yet and
* there could be danger of use after free otherwise.
* 2) Initially when we start tracking aux, the program
* is not JITed yet and also does not have a kallsyms
* entry. We skip these as poke->ip_stable is not
* active yet. The JIT will do the final fixup before
* setting it stable. The various poke->ip_stable are
* successively activated, so tail call updates can
* arrive from here while JIT is still finishing its
* final fixup for non-activated poke entries.
* 3) On program teardown, the program's kallsym entry gets
* removed out of RCU callback, but we can only untrack
* from sleepable context, therefore bpf_arch_text_poke()
* might not see that this is in BPF text section and
* bails out with -EINVAL. As these are unreachable since
* RCU grace period already passed, we simply skip them.
* 4) Also programs reaching refcount of zero while patching
* is in progress is okay since we're protected under
* poke_mutex and untrack the programs before the JIT
* buffer is freed. When we're still in the middle of
* patching and suddenly kallsyms entry of the program
* gets evicted, we just skip the rest which is fine due
* to point 3).
* 5) Any other error happening below from bpf_arch_text_poke()
* is a unexpected bug.
*/
if (!READ_ONCE(poke->ip_stable))
continue;
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
continue;
if (poke->tail_call.map != map ||
poke->tail_call.key != key)
continue;
ret = bpf_arch_text_poke(poke->ip, type,
old ? (u8 *)old->bpf_func +
poke->adj_off : NULL,
new ? (u8 *)new->bpf_func +
poke->adj_off : NULL);
BUG_ON(ret < 0 && ret != -EINVAL);
}
}
}
static void prog_array_map_clear_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
bpf_fd_array_map_clear(map);
bpf_map_put(map);
}
static void prog_array_map_clear(struct bpf_map *map)
{
struct bpf_array_aux *aux = container_of(map, struct bpf_array,
map)->aux;
bpf_map_inc(map);
schedule_work(&aux->work);
}
static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
struct bpf_array_aux *aux;
struct bpf_map *map;
aux = kzalloc(sizeof(*aux), GFP_KERNEL);
if (!aux)
return ERR_PTR(-ENOMEM);
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
kfree(aux);
return map;
}
container_of(map, struct bpf_array, map)->aux = aux;
aux->map = map;
return map;
}
static void prog_array_map_free(struct bpf_map *map)
{
struct prog_poke_elem *elem, *tmp;
struct bpf_array_aux *aux;
aux = container_of(map, struct bpf_array, map)->aux;
list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
list_del_init(&elem->list);
kfree(elem);
}
kfree(aux);
fd_array_map_free(map);
}
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = fd_array_map_free,
.map_alloc = prog_array_map_alloc,
.map_free = prog_array_map_free,
.map_poke_track = prog_array_map_poke_track,
.map_poke_untrack = prog_array_map_poke_untrack,
.map_poke_run = prog_array_map_poke_run,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = bpf_fd_array_map_clear,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
};
......
......@@ -256,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
{
if (fp->aux) {
free_percpu(fp->aux->stats);
kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
vfree(fp);
......@@ -756,6 +757,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
return ret;
}
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
struct bpf_jit_poke_descriptor *poke)
{
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
static const u32 poke_tab_max = 1024;
u32 slot = prog->aux->size_poke_tab;
u32 size = slot + 1;
if (size > poke_tab_max)
return -ENOSPC;
if (poke->ip || poke->ip_stable || poke->adj_off)
return -EINVAL;
switch (poke->reason) {
case BPF_POKE_REASON_TAIL_CALL:
if (!poke->tail_call.map)
return -EINVAL;
break;
default:
return -EINVAL;
}
tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
if (!tab)
return -ENOMEM;
memcpy(&tab[slot], poke, sizeof(*poke));
prog->aux->size_poke_tab = size;
prog->aux->poke_tab = tab;
return slot;
}
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
......@@ -1691,18 +1725,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array,
if (fp->kprobe_override)
return false;
if (!array->owner_prog_type) {
if (!array->aux->type) {
/* There's no owner yet where we could check for
* compatibility.
*/
array->owner_prog_type = fp->type;
array->owner_jited = fp->jited;
array->aux->type = fp->type;
array->aux->jited = fp->jited;
return true;
}
return array->owner_prog_type == fp->type &&
array->owner_jited == fp->jited;
return array->aux->type == fp->type &&
array->aux->jited == fp->jited;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
......@@ -2003,12 +2036,40 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
: 0;
}
static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
{
enum bpf_cgroup_storage_type stype;
for_each_cgroup_storage_type(stype) {
if (!aux->cgroup_storage[stype])
continue;
bpf_cgroup_storage_release(aux->prog,
aux->cgroup_storage[stype]);
}
}
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
struct bpf_map *map;
int i;
bpf_free_cgroup_storage(aux);
for (i = 0; i < aux->used_map_cnt; i++) {
map = aux->used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
bpf_map_put(map);
}
kfree(aux->used_maps);
}
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
int i;
aux = container_of(work, struct bpf_prog_aux, work);
bpf_free_used_maps(aux);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
......
......@@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
if (IS_ERR(inner_map))
return inner_map;
/* prog_array->owner_prog_type and owner_jited
* is a runtime binding. Doing static check alone
* in the verifier is not enough.
/* prog_array->aux->{type,jited} is a runtime binding.
* Doing static check alone in the verifier is not enough.
*/
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
......
......@@ -25,12 +25,13 @@
#include <linux/nospec.h>
#include <uapi/linux/btf.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
IS_FD_HASH(map))
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
......@@ -389,13 +390,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_map *map = filp->private_data;
const struct bpf_array *array;
u32 owner_prog_type = 0;
u32 owner_jited = 0;
u32 type = 0, jited = 0;
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
owner_prog_type = array->owner_prog_type;
owner_jited = array->owner_jited;
type = array->aux->type;
jited = array->aux->jited;
}
seq_printf(m,
......@@ -415,12 +415,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->memory.pages * 1ULL << PAGE_SHIFT,
map->id,
READ_ONCE(map->frozen));
if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
owner_prog_type);
seq_printf(m, "owner_jited:\t%u\n",
owner_jited);
if (type) {
seq_printf(m, "owner_prog_type:\t%u\n", type);
seq_printf(m, "owner_jited:\t%u\n", jited);
}
}
#endif
......@@ -881,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
} else if (IS_FD_ARRAY(map)) {
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
err = bpf_fd_htab_map_lookup_elem(map, key, value);
......@@ -1008,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
} else if (IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
attr->flags);
goto out;
}
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
......@@ -1090,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map)) {
err = map->ops->map_delete_elem(map, key);
goto out;
}
preempt_disable();
......@@ -1302,25 +1306,6 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
return 0;
}
/* drop refcnt on maps used by eBPF program and free auxilary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
enum bpf_cgroup_storage_type stype;
int i;
for_each_cgroup_storage_type(stype) {
if (!aux->cgroup_storage[stype])
continue;
bpf_cgroup_storage_release(aux->prog,
aux->cgroup_storage[stype]);
}
for (i = 0; i < aux->used_map_cnt; i++)
bpf_map_put(aux->used_maps[i]);
kfree(aux->used_maps);
}
int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
......@@ -1415,7 +1400,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
kvfree(aux->func_info);
kfree(aux->func_info_aux);
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
......
......@@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
#define BPF_COMPLEXITY_LIMIT_STATES 64
#define BPF_MAP_KEY_POISON (1ULL << 63)
#define BPF_MAP_KEY_SEEN (1ULL << 62)
#define BPF_MAP_PTR_UNPRIV 1UL
#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
POISON_POINTER_DELTA))
......@@ -178,12 +181,12 @@ struct bpf_verifier_stack_elem {
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
}
static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
{
return aux->map_state & BPF_MAP_PTR_UNPRIV;
return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
}
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
......@@ -191,8 +194,31 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
{
BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
unpriv |= bpf_map_ptr_unpriv(aux);
aux->map_state = (unsigned long)map |
(unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
aux->map_ptr_state = (unsigned long)map |
(unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
}
static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
{
return aux->map_key_state & BPF_MAP_KEY_POISON;
}
static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
{
return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
}
static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
{
return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
}
static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
{
bool poisoned = bpf_map_key_poisoned(aux);
aux->map_key_state = state | BPF_MAP_KEY_SEEN |
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
struct bpf_call_arg_meta {
......@@ -4090,15 +4116,49 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EACCES;
}
if (!BPF_MAP_PTR(aux->map_state))
if (!BPF_MAP_PTR(aux->map_ptr_state))
bpf_map_ptr_store(aux, meta->map_ptr,
meta->map_ptr->unpriv_array);
else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
meta->map_ptr->unpriv_array);
return 0;
}
static int
record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
{
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
struct bpf_reg_state *regs = cur_regs(env), *reg;
struct bpf_map *map = meta->map_ptr;
struct tnum range;
u64 val;
if (func_id != BPF_FUNC_tail_call)
return 0;
if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
verbose(env, "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
range = tnum_range(0, map->max_entries - 1);
reg = &regs[BPF_REG_3];
if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
else if (!bpf_map_key_poisoned(aux) &&
bpf_map_key_immediate(aux) != val)
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
static int check_reference_leak(struct bpf_verifier_env *env)
{
struct bpf_func_state *state = cur_func(env);
......@@ -4173,6 +4233,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (err)
return err;
err = record_func_key(env, &meta, func_id, insn_idx);
if (err)
return err;
/* Mark slots with STACK_MISC in case of raw mode, stack offset
* is inferred from register state.
*/
......@@ -9065,6 +9129,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
static int fixup_bpf_calls(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
......@@ -9073,7 +9138,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
int i, cnt, delta = 0;
int i, ret, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
......@@ -9217,6 +9282,26 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
if (prog->jit_requested && !expect_blinding &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
!bpf_map_ptr_unpriv(aux)) {
struct bpf_jit_poke_descriptor desc = {
.reason = BPF_POKE_REASON_TAIL_CALL,
.tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
.tail_call.key = bpf_map_key_immediate(aux),
};
ret = bpf_jit_add_poke_descriptor(prog, &desc);
if (ret < 0) {
verbose(env, "adding tail call poke descriptor failed\n");
return ret;
}
insn->imm = ret + 1;
continue;
}
if (!bpf_map_ptr_unpriv(aux))
continue;
......@@ -9231,7 +9316,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
return -EINVAL;
}
map_ptr = BPF_MAP_PTR(aux->map_state);
map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
......@@ -9265,7 +9350,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
map_ptr = BPF_MAP_PTR(aux->map_state);
map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
ops = map_ptr->ops;
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
......@@ -9345,6 +9430,23 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm = fn->func - __bpf_call_base;
}
/* Since poke tab is now finalized, publish aux to tracker. */
for (i = 0; i < prog->aux->size_poke_tab; i++) {
map_ptr = prog->aux->poke_tab[i].tail_call.map;
if (!map_ptr->ops->map_poke_track ||
!map_ptr->ops->map_poke_untrack ||
!map_ptr->ops->map_poke_run) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
if (ret < 0) {
verbose(env, "tracking tail call prog failed\n");
return ret;
}
}
return 0;
}
......
This diff is collapsed.
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include "bpf_helpers.h"
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 3);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");
#define TAIL_FUNC(x) \
SEC("classifier/" #x) \
int bpf_func_##x(struct __sk_buff *skb) \
{ \
return x; \
}
TAIL_FUNC(0)
TAIL_FUNC(1)
TAIL_FUNC(2)
SEC("classifier")
int entry(struct __sk_buff *skb)
{
/* Multiple locations to make sure we patch
* all of them.
*/
bpf_tail_call(skb, &jmp_table, 0);
bpf_tail_call(skb, &jmp_table, 0);
bpf_tail_call(skb, &jmp_table, 0);
bpf_tail_call(skb, &jmp_table, 0);
bpf_tail_call(skb, &jmp_table, 1);
bpf_tail_call(skb, &jmp_table, 1);
bpf_tail_call(skb, &jmp_table, 1);
bpf_tail_call(skb, &jmp_table, 1);
bpf_tail_call(skb, &jmp_table, 2);
bpf_tail_call(skb, &jmp_table, 2);
bpf_tail_call(skb, &jmp_table, 2);
bpf_tail_call(skb, &jmp_table, 2);
return 3;
}
char __license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include "bpf_helpers.h"
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 5);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");
SEC("classifier/0")
int bpf_func_0(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 1);
return 0;
}
SEC("classifier/1")
int bpf_func_1(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 2);
return 1;
}
SEC("classifier/2")
int bpf_func_2(struct __sk_buff *skb)
{
return 2;
}
SEC("classifier/3")
int bpf_func_3(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 4);
return 3;
}
SEC("classifier/4")
int bpf_func_4(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 3);
return 4;
}
SEC("classifier")
int entry(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 0);
/* Check multi-prog update. */
bpf_tail_call(skb, &jmp_table, 2);
/* Check tail call limit. */
bpf_tail_call(skb, &jmp_table, 3);
return 3;
}
char __license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include "bpf_helpers.h"
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 1);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");
static volatile int count;
SEC("classifier/0")
int bpf_func_0(struct __sk_buff *skb)
{
count++;
bpf_tail_call(skb, &jmp_table, 0);
return 1;
}
SEC("classifier")
int entry(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 0);
return 0;
}
char __license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include "bpf_helpers.h"
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 3);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");
static volatile int selector;
#define TAIL_FUNC(x) \
SEC("classifier/" #x) \
int bpf_func_##x(struct __sk_buff *skb) \
{ \
return x; \
}
TAIL_FUNC(0)
TAIL_FUNC(1)
TAIL_FUNC(2)
SEC("classifier")
int entry(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, selector);
return 3;
}
char __license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include "bpf_helpers.h"
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 3);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");
static volatile int selector;
#define TAIL_FUNC(x) \
SEC("classifier/" #x) \
int bpf_func_##x(struct __sk_buff *skb) \
{ \
return x; \
}
TAIL_FUNC(0)
TAIL_FUNC(1)
TAIL_FUNC(2)
SEC("classifier")
int entry(struct __sk_buff *skb)
{
int idx = 0;
if (selector == 1234)
idx = 1;
else if (selector == 5678)
idx = 2;
bpf_tail_call(skb, &jmp_table, idx);
return 3;
}
char __license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment