Commit 8c984209 authored by Yang Jihong, committed by Arnaldo Carvalho de Melo

perf kwork top: Implements BPF-based cpu usage statistics

Use BPF to collect per-task CPU usage statistics, based on perf BPF skeletons: a BPF program attached to the sched_switch tracepoint accumulates each task's runtime in BPF maps, and the tool reads and reports the results when tracing stops.

Example usage:

  # perf kwork top -h

   Usage: perf kwork top [<options>]

      -b, --use-bpf         Use BPF to measure task cpu usage
      -C, --cpu <cpu>       list of cpus to profile
      -i, --input <file>    input file name
      -n, --name <name>     event name to profile
      -s, --sort <key[,key2...]>
                            sort by key(s): rate, runtime, tid
          --time <str>      Time span for analysis (start,stop)

  #
  # perf kwork -k sched top -b
  Starting trace, Hit <Ctrl+C> to stop and report
  ^C
  Total  : 160702.425 ms, 8 cpus
  %Cpu(s):  36.00% id,   0.00% hi,   0.00% si
  %Cpu0   [||||||||||||||||||              61.66%]
  %Cpu1   [||||||||||||||||||              61.27%]
  %Cpu2   [|||||||||||||||||||             66.40%]
  %Cpu3   [||||||||||||||||||              61.28%]
  %Cpu4   [||||||||||||||||||              61.82%]
  %Cpu5   [|||||||||||||||||||||||         77.41%]
  %Cpu6   [||||||||||||||||||              61.73%]
  %Cpu7   [||||||||||||||||||              63.25%]

        PID     SPID    %CPU           RUNTIME  COMMAND
    -------------------------------------------------------------
          0        0   38.72       8089.463 ms  [swapper/1]
          0        0   38.71       8084.547 ms  [swapper/3]
          0        0   38.33       8007.532 ms  [swapper/0]
          0        0   38.26       7992.985 ms  [swapper/6]
          0        0   38.17       7971.865 ms  [swapper/4]
          0        0   36.74       7447.765 ms  [swapper/7]
          0        0   33.59       6486.942 ms  [swapper/2]
          0        0   22.58       3771.268 ms  [swapper/5]
       9545     9351    2.48        447.136 ms  sched-messaging
       9574     9351    2.09        418.583 ms  sched-messaging
       9724     9351    2.05        372.407 ms  sched-messaging
       9531     9351    2.01        368.804 ms  sched-messaging
       9512     9351    2.00        362.250 ms  sched-messaging
       9514     9351    1.95        357.767 ms  sched-messaging
       9538     9351    1.86        384.476 ms  sched-messaging
       9712     9351    1.84        386.490 ms  sched-messaging
       9723     9351    1.83        380.021 ms  sched-messaging
       9722     9351    1.82        382.738 ms  sched-messaging
       9517     9351    1.81        354.794 ms  sched-messaging
       9559     9351    1.79        344.305 ms  sched-messaging
       9725     9351    1.77        365.315 ms  sched-messaging
  <SNIP>

  # perf kwork -k sched top -b -n perf
  Starting trace, Hit <Ctrl+C> to stop and report
  ^C
  Total  : 151563.332 ms, 8 cpus
  %Cpu(s):  26.49% id,   0.00% hi,   0.00% si
  %Cpu0   [                                 0.01%]
  %Cpu1   [                                 0.00%]
  %Cpu2   [                                 0.00%]
  %Cpu3   [                                 0.00%]
  %Cpu4   [                                 0.00%]
  %Cpu5   [                                 0.00%]
  %Cpu6   [                                 0.00%]
  %Cpu7   [                                 0.00%]

        PID     SPID    %CPU           RUNTIME  COMMAND
    -------------------------------------------------------------
       9754     9754    0.01          2.303 ms  perf

  #
  # perf kwork -k sched top -b -C 2,3,4
  Starting trace, Hit <Ctrl+C> to stop and report
  ^C
  Total  :  48016.721 ms, 3 cpus
  %Cpu(s):  27.82% id,   0.00% hi,   0.00% si
  %Cpu2   [||||||||||||||||||||||          74.68%]
  %Cpu3   [|||||||||||||||||||||           71.06%]
  %Cpu4   [|||||||||||||||||||||           70.91%]

        PID     SPID    %CPU           RUNTIME  COMMAND
    -------------------------------------------------------------
          0        0   29.08       4734.998 ms  [swapper/4]
          0        0   28.93       4710.029 ms  [swapper/3]
          0        0   25.31       3912.363 ms  [swapper/2]
      10248    10158    1.62        264.931 ms  sched-messaging
      10253    10158    1.62        265.136 ms  sched-messaging
      10158    10158    1.60        263.013 ms  bash
      10360    10158    1.49        243.639 ms  sched-messaging
      10413    10158    1.48        238.604 ms  sched-messaging
      10531    10158    1.47        234.067 ms  sched-messaging
      10400    10158    1.47        240.631 ms  sched-messaging
      10355    10158    1.47        230.586 ms  sched-messaging
      10377    10158    1.43        234.835 ms  sched-messaging
      10526    10158    1.42        232.045 ms  sched-messaging
      10298    10158    1.41        222.396 ms  sched-messaging
      10410    10158    1.38        221.853 ms  sched-messaging
      10364    10158    1.38        226.042 ms  sched-messaging
      10480    10158    1.36        213.633 ms  sched-messaging
      10370    10158    1.36        223.620 ms  sched-messaging
      10553    10158    1.34        217.169 ms  sched-messaging
      10291    10158    1.34        211.516 ms  sched-messaging
      10251    10158    1.34        218.813 ms  sched-messaging
      10522    10158    1.33        218.498 ms  sched-messaging
      10288    10158    1.33        216.787 ms  sched-messaging
  <SNIP>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Link: https://lore.kernel.org/r/20230812084917.169338-15-yangjihong1@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent aa172a5a
@@ -33,6 +33,7 @@ There are several variants of 'perf kwork':
   perf kwork latency -b
   perf kwork timehist
   perf kwork top
+  perf kwork top -b

 By default it shows the individual work events such as irq, workqueue,
 including the run time and delay (time between raise and actual entry):
@@ -181,6 +182,10 @@ OPTIONS for 'perf kwork timehist'
 OPTIONS for 'perf kwork top'
 ---------------------------------

+-b::
+--use-bpf::
+        Use BPF to measure task cpu usage.
+
 -C::
 --cpu::
         Only show events for the given CPU(s) (comma separated list).
...
@@ -1032,6 +1032,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
 SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
 SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
 SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
+SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
 SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
 SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
...
@@ -1619,8 +1619,14 @@ static void top_print_header(struct perf_kwork *kwork __maybe_unused)
         int ret;

         printf("\n ");
-        ret = printf(" %*s %*s %*s %-*s",
+        ret = printf(" %*s %s%*s%s %*s %*s %-*s",
                      PRINT_PID_WIDTH, "PID",
+                     kwork->use_bpf ? " " : "",
+                     kwork->use_bpf ? PRINT_PID_WIDTH : 0,
+                     kwork->use_bpf ? "SPID" : "",
+                     kwork->use_bpf ? " " : "",
                      PRINT_CPU_USAGE_WIDTH, "%CPU",
                      PRINT_RUNTIME_HEADER_WIDTH + RPINT_DECIMAL_WIDTH, "RUNTIME",
                      PRINT_TASK_NAME_WIDTH, "COMMAND");
@@ -1639,6 +1645,12 @@ static int top_print_work(struct perf_kwork *kwork __maybe_unused, struct kwork_work *work)
          */
         ret += printf(" %*ld ", PRINT_PID_WIDTH, work->id);

+        /*
+         * tgid
+         */
+        if (kwork->use_bpf)
+                ret += printf(" %*d ", PRINT_PID_WIDTH, work->tgid);
+
         /*
          * cpu usage
          */
@@ -1656,7 +1668,13 @@ static int top_print_work(struct perf_kwork *kwork __maybe_unused, struct kwork_work *work)
         /*
          * command
          */
-        ret += printf(" %-*s", PRINT_TASK_NAME_WIDTH, work->name);
+        if (kwork->use_bpf)
+                ret += printf(" %s%s%s",
+                              work->is_kthread ? "[" : "",
+                              work->name,
+                              work->is_kthread ? "]" : "");
+        else
+                ret += printf(" %-*s", PRINT_TASK_NAME_WIDTH, work->name);

         printf("\n");
         return ret;
@@ -2153,6 +2171,36 @@ static void perf_kwork__top_report(struct perf_kwork *kwork)
         printf("\n");
 }

+static int perf_kwork__top_bpf(struct perf_kwork *kwork)
+{
+        int ret;
+
+        signal(SIGINT, sig_handler);
+        signal(SIGTERM, sig_handler);
+
+        ret = perf_kwork__top_prepare_bpf(kwork);
+        if (ret)
+                return -1;
+
+        printf("Starting trace, Hit <Ctrl+C> to stop and report\n");
+
+        perf_kwork__top_start();
+
+        /*
+         * a simple pause, wait here for stop signal
+         */
+        pause();
+
+        perf_kwork__top_finish();
+
+        perf_kwork__top_read_bpf(kwork);
+
+        perf_kwork__top_cleanup_bpf();
+
+        return 0;
+}
+
 static int perf_kwork__top(struct perf_kwork *kwork)
 {
         struct __top_cpus_runtime *cpus_runtime;
@@ -2165,7 +2213,11 @@ static int perf_kwork__top(struct perf_kwork *kwork)
         kwork->top_stat.cpus_runtime = cpus_runtime;
         bitmap_zero(kwork->top_stat.all_cpus_bitmap, MAX_NR_CPUS);

-        ret = perf_kwork__read_events(kwork);
+        if (kwork->use_bpf)
+                ret = perf_kwork__top_bpf(kwork);
+        else
+                ret = perf_kwork__read_events(kwork);
+
         if (ret)
                 goto out;
@@ -2380,6 +2432,10 @@ int cmd_kwork(int argc, const char **argv)
                            "Time span for analysis (start,stop)"),
         OPT_STRING('i', "input", &input_name, "file",
                    "input file name"),
+#ifdef HAVE_BPF_SKEL
+        OPT_BOOLEAN('b', "use-bpf", &kwork.use_bpf,
+                    "Use BPF to measure task cpu usage"),
+#endif
         OPT_PARENT(kwork_options)
         };
         const char *kwork_usage[] = {
...
@@ -165,6 +165,7 @@ endif
 ifeq ($(CONFIG_LIBTRACEEVENT),y)
   perf-$(CONFIG_PERF_BPF_SKEL) += bpf_kwork.o
+  perf-$(CONFIG_PERF_BPF_SKEL) += bpf_kwork_top.o
 endif

 perf-$(CONFIG_LIBELF) += symbol-elf.o
...
// SPDX-License-Identifier: GPL-2.0
/*
 * bpf_kwork_top.c
 *
 * Copyright (c) 2022 Huawei Inc, Yang Jihong <yangjihong1@huawei.com>
 */

#include <time.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

#include <linux/time64.h>

#include "util/debug.h"
#include "util/evsel.h"
#include "util/kwork.h"

#include <bpf/bpf.h>
#include <perf/cpumap.h>

#include "util/bpf_skel/kwork_top.skel.h"

/*
 * This should be in sync with "util/kwork_top.bpf.c"
 */
#define MAX_COMMAND_LEN 16

struct time_data {
        __u64 timestamp;
};

struct work_data {
        __u64 runtime;
};

struct task_data {
        __u32 tgid;
        __u32 is_kthread;
        char comm[MAX_COMMAND_LEN];
};

struct work_key {
        __u32 type;
        __u32 pid;
        __u64 task_p;
};

struct task_key {
        __u32 pid;
        __u32 cpu;
};

struct kwork_class_bpf {
        struct kwork_class *class;
        void (*load_prepare)(void);
};
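/* Handle to the opened/loaded BPF skeleton, valid between prepare and cleanup */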
static struct kwork_top_bpf *skel;

void perf_kwork__top_start(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        skel->bss->from_timestamp = (u64)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
        skel->bss->enabled = 1;
        pr_debug("perf kwork top start at: %lld\n", skel->bss->from_timestamp);
}

void perf_kwork__top_finish(void)
{
        struct timespec ts;

        skel->bss->enabled = 0;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        skel->bss->to_timestamp = (u64)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
        pr_debug("perf kwork top finish at: %lld\n", skel->bss->to_timestamp);
}

static void sched_load_prepare(void)
{
        bpf_program__set_autoload(skel->progs.on_switch, true);
}

static struct kwork_class_bpf kwork_sched_bpf = {
        .load_prepare = sched_load_prepare,
};

static struct kwork_class_bpf *
kwork_class_bpf_supported_list[KWORK_CLASS_MAX] = {
        [KWORK_CLASS_SCHED] = &kwork_sched_bpf,
};

static bool valid_kwork_class_type(enum kwork_class_type type)
{
        return type >= 0 && type < KWORK_CLASS_MAX ? true : false;
}
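/* Populate the BPF-side CPU filter map with the CPUs requested via -C */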
static int setup_filters(struct perf_kwork *kwork)
{
        u8 val = 1;
        int i, nr_cpus, fd;
        struct perf_cpu_map *map;

        if (kwork->cpu_list) {
                fd = bpf_map__fd(skel->maps.kwork_top_cpu_filter);
                if (fd < 0) {
                        pr_debug("Invalid cpu filter fd\n");
                        return -1;
                }

                map = perf_cpu_map__new(kwork->cpu_list);
                if (!map) {
                        pr_debug("Invalid cpu_list\n");
                        return -1;
                }

                nr_cpus = libbpf_num_possible_cpus();
                for (i = 0; i < perf_cpu_map__nr(map); i++) {
                        struct perf_cpu cpu = perf_cpu_map__cpu(map, i);

                        if (cpu.cpu >= nr_cpus) {
                                perf_cpu_map__put(map);
                                pr_err("Requested cpu %d too large\n", cpu.cpu);
                                return -1;
                        }
                        bpf_map_update_elem(fd, &cpu.cpu, &val, BPF_ANY);
                }
                perf_cpu_map__put(map);

                skel->bss->has_cpu_filter = 1;
        }

        return 0;
}

int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork __maybe_unused)
{
        struct bpf_program *prog;
        struct kwork_class *class;
        struct kwork_class_bpf *class_bpf;
        enum kwork_class_type type;

        skel = kwork_top_bpf__open();
        if (!skel) {
                pr_debug("Failed to open kwork top skeleton\n");
                return -1;
        }

        /*
         * set all progs to non-autoload,
         * then set corresponding progs according to config
         */
        bpf_object__for_each_program(prog, skel->obj)
                bpf_program__set_autoload(prog, false);

        list_for_each_entry(class, &kwork->class_list, list) {
                type = class->type;
                if (!valid_kwork_class_type(type) ||
                    !kwork_class_bpf_supported_list[type]) {
                        pr_err("Unsupported bpf trace class %s\n", class->name);
                        goto out;
                }

                class_bpf = kwork_class_bpf_supported_list[type];
                class_bpf->class = class;

                if (class_bpf->load_prepare)
                        class_bpf->load_prepare();
        }

        if (kwork_top_bpf__load(skel)) {
                pr_debug("Failed to load kwork top skeleton\n");
                goto out;
        }

        if (setup_filters(kwork))
                goto out;

        if (kwork_top_bpf__attach(skel)) {
                pr_debug("Failed to attach kwork top skeleton\n");
                goto out;
        }

        return 0;

out:
        kwork_top_bpf__destroy(skel);
        return -1;
}
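/* Fill in the tgid, kthread flag and command name that the BPF program collected for this task */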
static void read_task_info(struct kwork_work *work)
{
        int fd;
        struct task_data data;
        struct task_key key = {
                .pid = work->id,
                .cpu = work->cpu,
        };

        fd = bpf_map__fd(skel->maps.kwork_top_tasks);
        if (fd < 0) {
                pr_debug("Invalid top tasks map fd\n");
                return;
        }

        if (!bpf_map_lookup_elem(fd, &key, &data)) {
                work->tgid = data.tgid;
                work->is_kthread = data.is_kthread;
                work->name = strdup(data.comm);
        }
}

static int add_work(struct perf_kwork *kwork, struct work_key *key,
                    struct work_data *data, int cpu)
{
        struct kwork_class_bpf *bpf_trace;
        struct kwork_work *work;
        struct kwork_work tmp = {
                .id = key->pid,
                .cpu = cpu,
                .name = NULL,
        };
        enum kwork_class_type type = key->type;

        if (!valid_kwork_class_type(type)) {
                pr_debug("Invalid class type %d to add work\n", type);
                return -1;
        }

        bpf_trace = kwork_class_bpf_supported_list[type];
        tmp.class = bpf_trace->class;

        work = perf_kwork_add_work(kwork, tmp.class, &tmp);
        if (!work)
                return -1;

        work->total_runtime = data->runtime;
        read_task_info(work);

        return 0;
}

int perf_kwork__top_read_bpf(struct perf_kwork *kwork)
{
        int i, fd, nr_cpus;
        struct work_data *data;
        struct work_key key, prev;

        fd = bpf_map__fd(skel->maps.kwork_top_works);
        if (fd < 0) {
                pr_debug("Invalid top runtime fd\n");
                return -1;
        }

        nr_cpus = libbpf_num_possible_cpus();
        data = calloc(nr_cpus, sizeof(struct work_data));
        if (!data)
                return -1;

        memset(&prev, 0, sizeof(prev));
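        /*
         * Walk every key in the per-CPU works map; each lookup returns
         * one work_data entry per possible CPU.
         */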
        while (!bpf_map_get_next_key(fd, &prev, &key)) {
                if ((bpf_map_lookup_elem(fd, &key, data)) != 0) {
                        pr_debug("Failed to lookup top elem\n");
                        free(data);
                        return -1;
                }
                for (i = 0; i < nr_cpus; i++) {
                        if (data[i].runtime == 0)
                                continue;

                        if (add_work(kwork, &key, &data[i], i)) {
                                free(data);
                                return -1;
                        }
                }
                prev = key;
        }
        free(data);
        return 0;
}

void perf_kwork__top_cleanup_bpf(void)
{
        kwork_top_bpf__destroy(skel);
}
...
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022, Huawei

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/*
 * This should be in sync with "util/kwork.h"
 */
enum kwork_class_type {
        KWORK_CLASS_IRQ,
        KWORK_CLASS_SOFTIRQ,
        KWORK_CLASS_WORKQUEUE,
        KWORK_CLASS_SCHED,
        KWORK_CLASS_MAX,
};

#define MAX_ENTRIES     102400
#define MAX_NR_CPUS     2048
#define PF_KTHREAD      0x00200000
#define MAX_COMMAND_LEN 16

struct time_data {
        __u64 timestamp;
};

struct work_data {
        __u64 runtime;
};

struct task_data {
        __u32 tgid;
        __u32 is_kthread;
        char comm[MAX_COMMAND_LEN];
};

struct work_key {
        __u32 type;
        __u32 pid;
        __u64 task_p;
};

struct task_key {
        __u32 pid;
        __u32 cpu;
};
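/* Per-task storage recording the timestamp of the task's last sched-in */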
struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct time_data);
} kwork_top_task_time SEC(".maps");
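/* Task metadata (tgid, kthread flag, comm), keyed by pid + cpu */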
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(struct task_key));
        __uint(value_size, sizeof(struct task_data));
        __uint(max_entries, MAX_ENTRIES);
} kwork_top_tasks SEC(".maps");
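/* Accumulated runtime per work, with one value slot per CPU (per-CPU hash) */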
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
        __uint(key_size, sizeof(struct work_key));
        __uint(value_size, sizeof(struct work_data));
        __uint(max_entries, MAX_ENTRIES);
} kwork_top_works SEC(".maps");
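/* CPUs selected with -C; the presence of a key means that CPU is profiled */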
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(u32));
        __uint(value_size, sizeof(u8));
        __uint(max_entries, MAX_NR_CPUS);
} kwork_top_cpu_filter SEC(".maps");

int enabled = 0;
int has_cpu_filter = 0;
__u64 from_timestamp = 0;
__u64 to_timestamp = 0;

static __always_inline int cpu_is_filtered(__u32 cpu)
{
        __u8 *cpu_val;

        if (has_cpu_filter) {
                cpu_val = bpf_map_lookup_elem(&kwork_top_cpu_filter, &cpu);
                if (!cpu_val)
                        return 1;
        }

        return 0;
}

static __always_inline void update_task_info(struct task_struct *task, __u32 cpu)
{
        struct task_key key = {
                .pid = task->pid,
                .cpu = cpu,
        };

        if (!bpf_map_lookup_elem(&kwork_top_tasks, &key)) {
                struct task_data data = {
                        .tgid = task->tgid,
                        .is_kthread = task->flags & PF_KTHREAD ? 1 : 0,
                };
                BPF_CORE_READ_STR_INTO(&data.comm, task, comm);

                bpf_map_update_elem(&kwork_top_tasks, &key, &data, BPF_ANY);
        }
}

static __always_inline void update_work(struct work_key *key, __u64 delta)
{
        struct work_data *data;

        data = bpf_map_lookup_elem(&kwork_top_works, key);
        if (data) {
                data->runtime += delta;
        } else {
                struct work_data new_data = {
                        .runtime = delta,
                };

                bpf_map_update_elem(&kwork_top_works, key, &new_data, BPF_ANY);
        }
}
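/* Charge the elapsed runtime to the task leaving the CPU and record its metadata */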
static void on_sched_out(struct task_struct *task, __u64 ts, __u32 cpu)
{
        __u64 delta;
        struct time_data *pelem;

        pelem = bpf_task_storage_get(&kwork_top_task_time, task, NULL, 0);
        if (pelem)
                delta = ts - pelem->timestamp;
        else
                delta = ts - from_timestamp;

        struct work_key key = {
                .type = KWORK_CLASS_SCHED,
                .pid = task->pid,
                .task_p = (__u64)task,
        };

        update_work(&key, delta);
        update_task_info(task, cpu);
}
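/* Stamp the sched-in time for the task entering the CPU */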
static void on_sched_in(struct task_struct *task, __u64 ts)
{
        struct time_data *pelem;

        pelem = bpf_task_storage_get(&kwork_top_task_time, task, NULL,
                                     BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (pelem)
                pelem->timestamp = ts;
}
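/* BTF-enabled sched_switch tracepoint: args are (preempt, prev, next) */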
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
        struct task_struct *prev, *next;

        prev = (struct task_struct *)ctx[1];
        next = (struct task_struct *)ctx[2];

        if (!enabled)
                return 0;

        __u32 cpu = bpf_get_smp_processor_id();

        if (cpu_is_filtered(cpu))
                return 0;

        __u64 ts = bpf_ktime_get_ns();

        on_sched_out(prev, ts, cpu);
        on_sched_in(next, ts);

        return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
...
@@ -135,6 +135,8 @@ struct kwork_work {
          * top report
          */
         u32 cpu_usage;
+        u32 tgid;
+        bool is_kthread;
 };

 struct kwork_class {
@@ -264,6 +266,13 @@ void perf_kwork__report_cleanup_bpf(void);
 void perf_kwork__trace_start(void);
 void perf_kwork__trace_finish(void);

+int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork);
+int perf_kwork__top_read_bpf(struct perf_kwork *kwork);
+void perf_kwork__top_cleanup_bpf(void);
+void perf_kwork__top_start(void);
+void perf_kwork__top_finish(void);
+
 #else /* !HAVE_BPF_SKEL */

 static inline int
@@ -283,6 +292,23 @@ static inline void perf_kwork__report_cleanup_bpf(void) {}
 static inline void perf_kwork__trace_start(void) {}
 static inline void perf_kwork__trace_finish(void) {}

+static inline int
+perf_kwork__top_prepare_bpf(struct perf_kwork *kwork __maybe_unused)
+{
+        return -1;
+}
+
+static inline int
+perf_kwork__top_read_bpf(struct perf_kwork *kwork __maybe_unused)
+{
+        return -1;
+}
+
+static inline void perf_kwork__top_cleanup_bpf(void) {}
+static inline void perf_kwork__top_start(void) {}
+static inline void perf_kwork__top_finish(void) {}
+
 #endif /* HAVE_BPF_SKEL */

 #endif /* PERF_UTIL_KWORK_H */