Commit 557c0c6e authored by Alexei Starovoitov, committed by David S. Miller

bpf: convert stackmap to pre-allocation

It was observed that calling bpf_get_stackid() from a kprobe inside
slub or from spin_unlock causes a similar deadlock as with the hashmap,
therefore convert stackmap to use pre-allocated memory.

call_rcu() is no longer a feasible mechanism, since delayed freeing
causes bpf_get_stackid() to fail unpredictably when the number of actual
stacks is significantly less than the user-requested max_entries.
Since elements are no longer freed into slub, we can push elements into
the freelist immediately and let them be recycled (see the sketch after
the race diagram below).
However, a very unlikely race between user space map_lookup() and
program-side recycling is possible:
     cpu0                          cpu1
     ----                          ----
user does lookup(stackidX)
starts copying ips into buffer
                                   delete(stackidX)
                                   calls bpf_get_stackid()
                                   which recycles the element and
                                   overwrites with new stack trace
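
To make the recycling concrete, here is a minimal user-space sketch of
the pre-allocation scheme (an illustration only, not the kernel code):
every bucket comes out of one pre-allocated pool and cycles between a
free list and the buckets[] table, so the update path never calls an
allocator. The struct bucket layout, the fixed ip[8] payload and the
freelist_push()/freelist_pop()/prealloc() helpers are stand-ins invented
for this sketch; the patch's real pcpu_freelist is per-cpu and
lock-protected.

  #include <stdlib.h>

  struct bucket {
          struct bucket *next;      /* free-list linkage ('fnode' in the patch) */
          unsigned int nr;          /* number of stored ips */
          unsigned long ip[8];      /* stack trace payload */
  };

  static struct bucket *free_head;  /* single-threaded stand-in for pcpu_freelist */

  static void freelist_push(struct bucket *b)
  {
          b->next = free_head;
          free_head = b;
  }

  static struct bucket *freelist_pop(void)
  {
          struct bucket *b = free_head;

          if (b)
                  free_head = b->next;
          return b;                 /* NULL: all buckets in use, the -ENOMEM case */
  }

  /* pre-allocate everything up front, as prealloc_elems_and_freelist() does */
  static int prealloc(unsigned int max_entries)
  {
          struct bucket *pool = calloc(max_entries, sizeof(*pool));
          unsigned int i;

          if (!pool)
                  return -1;
          for (i = 0; i < max_entries; i++)
                  freelist_push(&pool[i]);
          return 0;
  }

An update pops a bucket, fills it, and pushes whatever element it
displaced straight back, so the pool never runs dry before max_entries
distinct stacks are stored.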

To avoid user space seeing a partial stack trace consisting of two
merged stack traces, do bucket = xchg(, NULL); copy; xchg(, bucket);
so that a consistent stack trace is always delivered to user space
(sketched below).
Now we can move the memset(, 0) of the left-over element value from the
critical path of bpf_get_stackid() into the slow path of the user space
lookup.
Also disallow lookup() from a bpf program, since it's useless and a
program shouldn't be messing with the collected stack trace.
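
As an illustration of that protocol, here is a simplified user-space
sketch of the lookup side, using C11 atomic_exchange() in place of the
kernel's xchg(). The buckets[1024] array, the bucket layout and the
freelist_push() stub are assumptions made for the example, not the
patch's code.

  #include <stdatomic.h>
  #include <stddef.h>
  #include <string.h>

  struct bucket {
          unsigned int nr;
          unsigned long ip[8];
  };

  static _Atomic(struct bucket *) buckets[1024];  /* stand-in for smap->buckets[] */

  static void freelist_push(struct bucket *b)
  {
          (void)b;  /* stub: the real code returns 'b' to the per-cpu freelist */
  }

  static int stackmap_copy(unsigned int id, void *value, size_t value_size)
  {
          struct bucket *bucket, *old;
          size_t trace_len;

          /* take the bucket out of the table so a concurrent update cannot recycle it */
          bucket = atomic_exchange(&buckets[id], NULL);
          if (!bucket)
                  return -1;        /* -ENOENT in the patch */

          /* copy a consistent snapshot, zeroing the unused tail of the value */
          trace_len = bucket->nr * sizeof(bucket->ip[0]);
          memcpy(value, bucket->ip, trace_len);
          memset((char *)value + trace_len, 0, value_size - trace_len);

          /* put it back; if an update slipped in meanwhile, recycle the displaced bucket */
          old = atomic_exchange(&buckets[id], bucket);
          if (old)
                  freelist_push(old);
          return 0;
  }

While the bucket is parked out of the table, a racing bpf_get_stackid()
sees an empty slot and pops a fresh element from the freelist instead of
recycling the one being copied, so user space never observes a
half-overwritten trace.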

Note that a similar race between user space lookup and kernel-side
updates is also present in the hashmap, but it's not a new race: bpf
programs were always allowed to modify hash and array map elements while
user space is copying them.

Fixes: d5a3b1f6 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 823707b6
@@ -195,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
                            u64 flags);
 int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
                             u64 flags);
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
...
@@ -10,9 +10,10 @@
 #include <linux/vmalloc.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include "percpu_freelist.h"
 
 struct stack_map_bucket {
-        struct rcu_head rcu;
+        struct pcpu_freelist_node fnode;
         u32 hash;
         u32 nr;
         u64 ip[];
@@ -20,10 +21,34 @@ struct stack_map_bucket {
 struct bpf_stack_map {
         struct bpf_map map;
+        void *elems;
+        struct pcpu_freelist freelist;
         u32 n_buckets;
-        struct stack_map_bucket __rcu *buckets[];
+        struct stack_map_bucket *buckets[];
 };
 
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+        int err;
+
+        smap->elems = vzalloc(elem_size * smap->map.max_entries);
+        if (!smap->elems)
+                return -ENOMEM;
+
+        err = pcpu_freelist_init(&smap->freelist);
+        if (err)
+                goto free_elems;
+
+        pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+                               smap->map.max_entries);
+        return 0;
+
+free_elems:
+        vfree(smap->elems);
+        return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
@@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
         smap->n_buckets = n_buckets;
         smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+        err = bpf_map_precharge_memlock(smap->map.pages);
+        if (err)
+                goto free_smap;
+
         err = get_callchain_buffers();
         if (err)
                 goto free_smap;
 
+        err = prealloc_elems_and_freelist(smap);
+        if (err)
+                goto put_buffers;
+
         return &smap->map;
 
+put_buffers:
+        put_callchain_buffers();
 free_smap:
         kvfree(smap);
         return ERR_PTR(err);
@@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
         ips = trace->ip + skip + init_nr;
         hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
         id = hash & (smap->n_buckets - 1);
-        bucket = rcu_dereference(smap->buckets[id]);
+        bucket = READ_ONCE(smap->buckets[id]);
         if (bucket && bucket->hash == hash) {
                 if (flags & BPF_F_FAST_STACK_CMP)
@@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
         if (bucket && !(flags & BPF_F_REUSE_STACKID))
                 return -EEXIST;
 
-        new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
-                             GFP_ATOMIC | __GFP_NOWARN);
+        new_bucket = (struct stack_map_bucket *)
+                pcpu_freelist_pop(&smap->freelist);
         if (unlikely(!new_bucket))
                 return -ENOMEM;
 
         memcpy(new_bucket->ip, ips, trace_len);
-        memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
         new_bucket->hash = hash;
         new_bucket->nr = trace_nr;
 
         old_bucket = xchg(&smap->buckets[id], new_bucket);
         if (old_bucket)
-                kfree_rcu(old_bucket, rcu);
+                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
         return id;
 }
@@ -160,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
         .arg3_type = ARG_ANYTHING,
 };
 
-/* Called from syscall or from eBPF program */
+/* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+        return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 {
         struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-        struct stack_map_bucket *bucket;
-        u32 id = *(u32 *)key;
+        struct stack_map_bucket *bucket, *old_bucket;
+        u32 id = *(u32 *)key, trace_len;
 
         if (unlikely(id >= smap->n_buckets))
-                return NULL;
-
-        bucket = rcu_dereference(smap->buckets[id]);
-        return bucket ? bucket->ip : NULL;
+                return -ENOENT;
+
+        bucket = xchg(&smap->buckets[id], NULL);
+        if (!bucket)
+                return -ENOENT;
+
+        trace_len = bucket->nr * sizeof(u64);
+        memcpy(value, bucket->ip, trace_len);
+        memset(value + trace_len, 0, map->value_size - trace_len);
+
+        old_bucket = xchg(&smap->buckets[id], bucket);
+        if (old_bucket)
+                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+        return 0;
 }
 
 static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
         old_bucket = xchg(&smap->buckets[id], NULL);
         if (old_bucket) {
-                kfree_rcu(old_bucket, rcu);
+                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
                 return 0;
         } else {
                 return -ENOENT;
@@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 static void stack_map_free(struct bpf_map *map)
 {
         struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-        int i;
 
+        /* wait for bpf programs to complete before freeing stack map */
         synchronize_rcu();
 
-        for (i = 0; i < smap->n_buckets; i++)
-                if (smap->buckets[i])
-                        kfree_rcu(smap->buckets[i], rcu);
+        vfree(smap->elems);
+        pcpu_freelist_destroy(&smap->freelist);
         kvfree(smap);
         put_callchain_buffers();
 }
...
@@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr)
                 err = bpf_percpu_hash_copy(map, key, value);
         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
                 err = bpf_percpu_array_copy(map, key, value);
+        } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+                err = bpf_stackmap_copy(map, key, value);
         } else {
                 rcu_read_lock();
                 ptr = map->ops->map_lookup_elem(map, key);
...