Commit d5e9aeda authored by Alexei Starovoitov

Merge branch 'Optimize performance of update hash-map when free is zero'

Feng Zhou says:

====================
From: Feng Zhou <zhoufeng.zf@bytedance.com>

We encountered a bad case on a big system with 96 CPUs where
alloc_htab_elem() could take as long as 1ms. The reason is that once a
preallocated hashtab has no free elements, an update attempt will still
grab the spinlocks of all CPUs while searching the per-CPU freelists.
With multiple concurrent updaters, the lock contention becomes severe.

0001: Use head->first to check whether the free list is empty before
taking the lock (see the sketch right after this list).
0002: Add a benchmark that reproduces this worst case.
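As a minimal userspace sketch of the 0001 idea (illustrative only: C11
atomics and a pthread spinlock stand in for the kernel's
READ_ONCE/WRITE_ONCE and raw_spin_lock; none of these names come from
the patch itself):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct node { struct node *next; };

struct freelist_head {
    pthread_spinlock_t lock;
    _Atomic(struct node *) first;  /* stand-in for READ_ONCE/WRITE_ONCE on head->first */
};

static struct node *pop(struct freelist_head *head)
{
    struct node *n;

    /* Unlocked peek: if the list looks empty, skip the lock entirely.
     * The peek is racy by design; the locked re-read below stays
     * authoritative, so a stale answer only costs (or saves) one lock trip.
     */
    if (!atomic_load_explicit(&head->first, memory_order_relaxed))
        return NULL;

    pthread_spin_lock(&head->lock);
    n = atomic_load_explicit(&head->first, memory_order_relaxed);
    if (n)
        atomic_store_explicit(&head->first, n->next, memory_order_relaxed);
    pthread_spin_unlock(&head->lock);
    return n;
}

The READ_ONCE/WRITE_ONCE annotations (relaxed atomics above) only keep
the unlocked peek tear-free; correctness still comes from the lock. The
worst a stale read can do is skip a list that just became non-empty,
which the caller tolerates by probing the other CPUs' lists.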

Changelog:
v5->v6: Addressed comments from Alexei Starovoitov.
- Adjust the commit log.
Details here:
https://lore.kernel.org/all/20220608021050.47279-1-zhoufeng.zf@bytedance.com/

v4->v5: Addressed comments from Alexei Starovoitov.
- Use head->first.
- Use cpu + max_entries.
Details here:
https://lore.kernel.org/bpf/20220601084149.13097-1-zhoufeng.zf@bytedance.com/

v3->v4: Addressed comments from Daniel Borkmann.
- Use READ_ONCE/WRITE_ONCE.
Details here:
https://lore.kernel.org/all/20220530091340.53443-1-zhoufeng.zf@bytedance.com/

v2->v3: Addressed comments from Alexei Starovoitov and Andrii Nakryiko.
- Adjust the way the benchmark is run.
- Adjust the code format.
Details here:
https://lore.kernel.org/all/20220524075306.32306-1-zhoufeng.zf@bytedance.com/T/

v1->v2: Addressed comments from Alexei Starovoitov.
- Add a benchmark to reproduce the issue.
- Adjust the code format to avoid adding extra indentation.
Details here:
https://lore.kernel.org/all/877ac441-045b-1844-6938-fcaee5eee7f2@bytedance.com/T/
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents fe928335 89eda984
kernel/bpf/percpu_freelist.c

@@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
 					   struct pcpu_freelist_node *node)
 {
 	node->next = head->first;
-	head->first = node;
+	WRITE_ONCE(head->first, node);
 }
 
 static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -130,14 +130,17 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
+		if (!READ_ONCE(head->first))
+			goto next_cpu;
 		raw_spin_lock(&head->lock);
 		node = head->first;
 		if (node) {
-			head->first = node->next;
+			WRITE_ONCE(head->first, node->next);
 			raw_spin_unlock(&head->lock);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
+next_cpu:
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
@@ -146,10 +149,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 	}
 
 	/* per cpu lists are all empty, try extralist */
+	if (!READ_ONCE(s->extralist.first))
+		return NULL;
 	raw_spin_lock(&s->extralist.lock);
 	node = s->extralist.first;
 	if (node)
-		s->extralist.first = node->next;
+		WRITE_ONCE(s->extralist.first, node->next);
 	raw_spin_unlock(&s->extralist.lock);
 	return node;
 }
@@ -164,15 +169,18 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
+		if (!READ_ONCE(head->first))
+			goto next_cpu;
 		if (raw_spin_trylock(&head->lock)) {
 			node = head->first;
 			if (node) {
-				head->first = node->next;
+				WRITE_ONCE(head->first, node->next);
 				raw_spin_unlock(&head->lock);
 				return node;
 			}
 			raw_spin_unlock(&head->lock);
 		}
+next_cpu:
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
@@ -181,11 +189,11 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 	}
 
 	/* cannot pop from per cpu lists, try extralist */
-	if (!raw_spin_trylock(&s->extralist.lock))
+	if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
 		return NULL;
 	node = s->extralist.first;
 	if (node)
-		s->extralist.first = node->next;
+		WRITE_ONCE(s->extralist.first, node->next);
 	raw_spin_unlock(&s->extralist.lock);
 	return node;
 }
tools/testing/selftests/bpf/Makefile

@@ -560,6 +560,7 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
 $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
 $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
 $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
+$(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -571,7 +572,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 	$(OUTPUT)/bench_ringbufs.o \
 	$(OUTPUT)/bench_bloom_filter_map.o \
 	$(OUTPUT)/bench_bpf_loop.o \
-	$(OUTPUT)/bench_strncmp.o
+	$(OUTPUT)/bench_strncmp.o \
+	$(OUTPUT)/bench_bpf_hashmap_full_update.o
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
tools/testing/selftests/bpf/bench.c

@@ -396,6 +396,7 @@ extern const struct bench bench_hashmap_with_bloom;
 extern const struct bench bench_bpf_loop;
 extern const struct bench bench_strncmp_no_helper;
 extern const struct bench bench_strncmp_helper;
+extern const struct bench bench_bpf_hashmap_full_update;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -430,6 +431,7 @@ static const struct bench *benchs[] = {
 	&bench_bpf_loop,
 	&bench_strncmp_no_helper,
 	&bench_strncmp_helper,
+	&bench_bpf_hashmap_full_update,
 };
 
 static void setup_benchmark()
tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c (new file)

// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Bytedance */
#include <argp.h>
#include "bench.h"
#include "bpf_hashmap_full_update_bench.skel.h"
#include "bpf_util.h"

/* BPF triggering benchmarks */
static struct ctx {
	struct bpf_hashmap_full_update_bench *skel;
} ctx;

#define MAX_LOOP_NUM 10000

static void validate(void)
{
	if (env.consumer_cnt != 1) {
		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
		exit(1);
	}
}

static void *producer(void *input)
{
	while (true) {
		/* trigger the bpf program */
		syscall(__NR_getpgid);
	}

	return NULL;
}

static void *consumer(void *input)
{
	return NULL;
}

static void measure(struct bench_res *res)
{
}

static void setup(void)
{
	struct bpf_link *link;
	int map_fd, i, max_entries;

	setup_libbpf();

	ctx.skel = bpf_hashmap_full_update_bench__open_and_load();
	if (!ctx.skel) {
		fprintf(stderr, "failed to open skeleton\n");
		exit(1);
	}

	ctx.skel->bss->nr_loops = MAX_LOOP_NUM;

	link = bpf_program__attach(ctx.skel->progs.benchmark);
	if (!link) {
		fprintf(stderr, "failed to attach program!\n");
		exit(1);
	}

	/* fill hash_map */
	map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
	max_entries = bpf_map__max_entries(ctx.skel->maps.hash_map_bench);
	for (i = 0; i < max_entries; i++)
		bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
}

void hashmap_report_final(struct bench_res res[], int res_cnt)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	int i;

	for (i = 0; i < nr_cpus; i++) {
		u64 time = ctx.skel->bss->percpu_time[i];

		if (!time)
			continue;

		printf("%d:hash_map_full_perf %lld events per sec\n",
		       i, ctx.skel->bss->nr_loops * 1000000000ll / time);
	}
}

const struct bench bench_bpf_hashmap_full_update = {
	.name = "bpf-hashmap-ful-update",
	.validate = validate,
	.setup = setup,
	.producer_thread = producer,
	.consumer_thread = consumer,
	.measure = measure,
	.report_progress = NULL,
	.report_final = hashmap_report_final,
};
tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh (new file)

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

source ./benchs/run_common.sh

set -eufo pipefail

nr_threads=`expr $(cat /proc/cpuinfo | grep "processor"| wc -l) - 1`
summary=$($RUN_BENCH -p $nr_threads bpf-hashmap-ful-update)
printf "$summary"
printf "\n"
tools/testing/selftests/bpf/progs/bpf_hashmap_full_update_bench.c (new file)

// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Bytedance */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"

char _license[] SEC("license") = "GPL";

#define MAX_ENTRIES 1000

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, MAX_ENTRIES);
} hash_map_bench SEC(".maps");

u64 __attribute__((__aligned__(256))) percpu_time[256];
u64 nr_loops;

static int loop_update_callback(__u32 index, u32 *key)
{
	u64 init_val = 1;

	bpf_map_update_elem(&hash_map_bench, key, &init_val, BPF_ANY);
	return 0;
}

SEC("fentry/" SYS_PREFIX "sys_getpgid")
int benchmark(void *ctx)
{
	u32 cpu = bpf_get_smp_processor_id();
	/* key above the prefilled 0..MAX_ENTRIES-1 range: the map is already
	 * full, so every update exercises the no-free-elements path
	 */
	u32 key = cpu + MAX_ENTRIES;
	u64 start_time = bpf_ktime_get_ns();

	bpf_loop(nr_loops, loop_update_callback, &key, 0);
	percpu_time[cpu & 255] = bpf_ktime_get_ns() - start_time;
	return 0;
}