Commit 5bebd3e3 authored by Alexei Starovoitov

Merge branch 'remove-unnecessary-synchronizations-in-cpumap'

Hou Tao says:

====================
Remove unnecessary synchronizations in cpumap

From: Hou Tao <houtao1@huawei.com>

Hi,

This is the formal patchset to remove unnecessary synchronizations in
cpu-map after addressing comments and collecting Rvb tags from Toke
Høiland-Jørgensen (big thanks to Toke). Patch #1 removes the unnecessary
rcu_barrier() when freeing bpf_cpu_map_entry and replaces it with
queue_rcu_work(). Patch #2 removes the unnecessary call_rcu() and
queue_work() when destroying cpu-map and does the freeing directly.
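
For readers unfamiliar with the pattern patch #1 switches to, the snippet
below is a minimal sketch of how queue_rcu_work() is typically used (an
illustrative example only, not the cpumap code itself; struct foo,
foo_free() and foo_release() are invented names):

  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct foo {
          struct rcu_work free_work;      /* replaces rcu_head + work_struct */
  };

  /* Runs in process context after an RCU grace period has elapsed,
   * so it is allowed to sleep (e.g. to call kthread_stop()).
   */
  static void foo_free(struct work_struct *work)
  {
          struct foo *f = container_of(to_rcu_work(work), struct foo, free_work);

          kfree(f);
  }

  static void foo_release(struct foo *f)
  {
          /* Run foo_free() on system_wq once a grace period has passed. */
          INIT_RCU_WORK(&f->free_work, foo_free);
          queue_rcu_work(system_wq, &f->free_work);
  }

Compared with the old call_rcu() + schedule_work() combination, the callback
runs in workqueue (process) context, so the kthread can be stopped and the
entry freed in a single place.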

The patchset was tested by using xdp_redirect_cpu and virtio-net. Both
xdp-mode and skb-mode have been exercised and no issues were reported.
As usual, comments and suggestions are always welcome.

Change Log:
v1:
  * address comments from Toke Høiland-Jørgensen
  * add Rvb tags from Toke Høiland-Jørgensen
  * update outdated comment in cpu_map_delete_elem()

RFC: https://lore.kernel.org/bpf/20230728023030.1906124-1-houtao@huaweicloud.com
====================

Link: https://lore.kernel.org/r/20230816045959.358059-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 0a55264c c2e42ddf
@@ -68,11 +68,8 @@ struct bpf_cpu_map_entry {
         struct bpf_cpumap_val value;
         struct bpf_prog *prog;
 
-        atomic_t refcnt; /* Control when this struct can be free'ed */
-        struct rcu_head rcu;
-
-        struct work_struct kthread_stop_wq;
         struct completion kthread_running;
+        struct rcu_work free_work;
 };
 
 struct bpf_cpu_map {
@@ -117,11 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
         return &cmap->map;
 }
 
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-        atomic_inc(&rcpu->refcnt);
-}
-
 static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 {
         /* The tear-down procedure should have made sure that queue is
@@ -142,35 +134,6 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
         }
 }
 
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-        if (atomic_dec_and_test(&rcpu->refcnt)) {
-                if (rcpu->prog)
-                        bpf_prog_put(rcpu->prog);
-                /* The queue should be empty at this point */
-                __cpu_map_ring_cleanup(rcpu->queue);
-                ptr_ring_cleanup(rcpu->queue, NULL);
-                kfree(rcpu->queue);
-                kfree(rcpu);
-        }
-}
-
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
-        struct bpf_cpu_map_entry *rcpu;
-
-        rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
-        /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
-         * as it waits until all in-flight call_rcu() callbacks complete.
-         */
-        rcu_barrier();
-
-        /* kthread_stop will wake_up_process and wait for it to complete */
-        kthread_stop(rcpu->kthread);
-}
-
 static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
                                      struct list_head *listp,
                                      struct xdp_cpumap_stats *stats)
@@ -395,7 +358,6 @@ static int cpu_map_kthread_run(void *data)
         }
         __set_current_state(TASK_RUNNING);
 
-        put_cpu_map_entry(rcpu);
         return 0;
 }
 
@@ -472,9 +434,6 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
         if (IS_ERR(rcpu->kthread))
                 goto free_prog;
 
-        get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
-        get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
-
         /* Make sure kthread runs on a single CPU */
         kthread_bind(rcpu->kthread, cpu);
         wake_up_process(rcpu->kthread);
@@ -501,40 +460,40 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
         return NULL;
 }
 
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
 {
         struct bpf_cpu_map_entry *rcpu;
 
         /* This cpu_map_entry have been disconnected from map and one
          * RCU grace-period have elapsed. Thus, XDP cannot queue any
          * new packets and cannot change/set flush_needed that can
          * find this entry.
          */
-        rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+        rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
 
+        /* kthread_stop will wake_up_process and wait for it to complete.
+         * cpu_map_kthread_run() makes sure the pointer ring is empty
+         * before exiting.
+         */
+        kthread_stop(rcpu->kthread);
+
+        if (rcpu->prog)
+                bpf_prog_put(rcpu->prog);
+        /* The queue should be empty at this point */
+        __cpu_map_ring_cleanup(rcpu->queue);
+        ptr_ring_cleanup(rcpu->queue, NULL);
+        kfree(rcpu->queue);
         free_percpu(rcpu->bulkq);
-        /* Cannot kthread_stop() here, last put free rcpu resources */
-        put_cpu_map_entry(rcpu);
+        kfree(rcpu);
 }
 
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
  */
 static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
                                     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
@@ -543,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 
         old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
         if (old_rcpu) {
-                call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
-                INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
-                schedule_work(&old_rcpu->kthread_stop_wq);
+                INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+                queue_rcu_work(system_wq, &old_rcpu->free_work);
         }
 }
 
@@ -557,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key)
         if (key_cpu >= map->max_entries)
                 return -EINVAL;
 
-        /* notice caller map_delete_elem() use preempt_disable() */
+        /* notice caller map_delete_elem() uses rcu_read_lock() */
         __cpu_map_entry_replace(cmap, key_cpu, NULL);
         return 0;
 }
@@ -608,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map)
         /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
          * so the bpf programs (can be more than one that used this map) were
          * disconnected from events. Wait for outstanding critical sections in
-         * these programs to complete. The rcu critical section only guarantees
-         * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
-         * It does __not__ ensure pending flush operations (if any) are
-         * complete.
+         * these programs to complete. synchronize_rcu() below not only
+         * guarantees no further "XDP/bpf-side" reads against
+         * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+         * (if any) are completed.
          */
         synchronize_rcu();
 
-        /* For cpu_map the remote CPUs can still be using the entries
-         * (struct bpf_cpu_map_entry).
-         */
+        /* The only possible user of bpf_cpu_map_entry is
+         * cpu_map_kthread_run().
+         */
         for (i = 0; i < cmap->map.max_entries; i++) {
                 struct bpf_cpu_map_entry *rcpu;
@@ -626,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map)
                 if (!rcpu)
                         continue;
 
-                /* bq flush and cleanup happens after RCU grace-period */
-                __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+                /* Stop kthread and cleanup entry directly */
+                __cpu_map_entry_free(&rcpu->free_work.work);
         }
         bpf_map_area_free(cmap->cpu_map);
         bpf_map_area_free(cmap);