Commit 39b656ee authored by Ingo Molnar

Merge tag 'perf-core-for-mingo-5.5-20191011' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

perf trace:

  Arnaldo Carvalho de Melo:

  - Reuse the strace-like syscall_arg_fmt->scnprintf() beautification routines
    (convert integer arguments into strings, like open flags, etc) in tracepoint
    arguments.

    For now the type-based scnprintf routines (pid_t, umode_t, etc.) and the
    ones based on well-known arg names ("fd", etc.) get associated with
    tracepoint args of that type.

    A tracepoint-only arg, "msr", for the msr:{write,read}_msr tracepoints,
    gets added as an initial step.

  - Introduce syscall_arg_fmt->strtoul() methods to be the reverse operation
    of ->scnprintf(), i.e. to go from a string to an integer.
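
    As an illustration, here is a minimal C sketch of that two-way pairing,
    loosely modeled on the strarray tables used by the beautifiers in
    tools/perf/trace/beauty/ (struct and function signatures are simplified
    assumptions here, not the exact in-tree definitions):

      #include <stdio.h>
      #include <string.h>

      /* integer -> string table backing a beautifier */
      struct strarray {
              const char **entries;
              int nr_entries;
      };

      /* ->scnprintf() direction: integer to string, used when formatting */
      static size_t strarray__scnprintf(struct strarray *sa, char *bf,
                                        size_t size, int val)
      {
              if (val >= 0 && val < sa->nr_entries && sa->entries[val])
                      return snprintf(bf, size, "%s", sa->entries[val]);
              return snprintf(bf, size, "%d", val); /* fall back to the number */
      }

      /* ->strtoul() direction: string to integer, used when parsing --filter */
      static int strarray__strtoul(struct strarray *sa, const char *bf,
                                   unsigned long *val)
      {
              int i;

              for (i = 0; i < sa->nr_entries; i++) {
                      if (sa->entries[i] && !strcmp(sa->entries[i], bf)) {
                              *val = i;
                              return 1;
                      }
              }
              return 0; /* unknown string, filter setup can then error out */
      }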

  - Implement --filter, just like in 'perf record': it affects the tracepoint
    events specified thus far in the command line, and uses the ->strtoul()
    methods to map strings in the tables associated with the beautifiers to
    the integers the in-kernel tracepoint (eBPF later) filters expect, e.g.:

     # perf trace --max-events 1 -e sched:*ipi --filter="cpu==1 || cpu==2"
      0.000 as/24630 sched:sched_wake_idle_without_ipi(cpu: 1)
     #

     # perf trace --max-events 1 --max-stack=32 -e msr:* --filter="msr==IA32_TSC_DEADLINE"
      207.000 cc1/19963 msr:write_msr(msr: IA32_TSC_DEADLINE, val: 5442316760822)
                                        do_trace_write_msr ([kernel.kallsyms])
                                        do_trace_write_msr ([kernel.kallsyms])
                                        lapic_next_deadline ([kernel.kallsyms])
                                        clockevents_program_event ([kernel.kallsyms])
                                        hrtimer_interrupt ([kernel.kallsyms])
                                        smp_apic_timer_interrupt ([kernel.kallsyms])
                                        apic_timer_interrupt ([kernel.kallsyms])
                                        [0x6ff66c] (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        [0x7047c3] (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        [0x707708] (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        execute_one_pass (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        [0x4f3d37] (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        [0x4f3d49] (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        execute_pass_list (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        cgraph_node::expand (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        [0x2625b4] (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        symbol_table::finalize_compilation_unit (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        [0x5ae8b9] (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        toplev::main (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        main (/usr/lib/gcc-cross/alpha-linux-gnu/8/cc1)
                                        [0x26b6a] (/usr/lib/x86_64-linux-gnu/libc-2.29.so)
     #
     # perf trace --max-events 8 -e msr:* --filter="msr==IA32_SPEC_CTRL"
         0.000 :13281/13281 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6)
         0.063 migration/3/25 msr:write_msr(msr: IA32_SPEC_CTRL)
         0.217 kworker/u16:1-/4826 msr:write_msr(msr: IA32_SPEC_CTRL)
         0.687 rcu_sched/11 msr:write_msr(msr: IA32_SPEC_CTRL)
         0.696 :13280/13280 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6)
         0.305 :13281/13281 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6)
         0.355 :13274/13274 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6)
         2.743 kworker/u16:0-/6711 msr:write_msr(msr: IA32_SPEC_CTRL)
     #
     # perf trace --max-events 8 --cpu 1 -e msr:* --filter="msr!=IA32_SPEC_CTRL && msr!=IA32_TSC_DEADLINE && msr != FS_BASE"
           0.000 mtr-packet/30819 msr:write_msr(msr: 0x830, val: 68719479037)
           0.096 :0/0 msr:read_msr(msr: IA32_TSC_ADJUST)
         238.925 mtr-packet/30819 msr:write_msr(msr: 0x830, val: 8589936893)
         511.010 :0/0 msr:write_msr(msr: 0x830, val: 68719479037)
        1005.052 :0/0 msr:read_msr(msr: IA32_TSC_ADJUST)
        1235.131 CPU 0/KVM/3750 msr:write_msr(msr: 0x830, val: 4294969595)
        1235.195 CPU 0/KVM/3750 msr:read_msr(msr: IA32_SYSENTER_ESP, val: -2199023037952)
        1235.201 CPU 0/KVM/3750 msr:read_msr(msr: IA32_APICBASE, val: 4276096000)
     #

  - Default to not using libtraceevent and its plugins for beautifying
    tracepoint arguments, since now we're reusing the strace-like beautifiers.
    Use --libtraceevent_print (just --libtrace is unambiguous and can be used
    as a shorthand) to go back to those beautifiers.

    This will help in the transition, as can be seen in some of the sched
    tracepoints that still need some work in the libbeauty-based mode:

    # trace --no-inherit -e msr:*,*sleep,sched:* sleep 1
         0.000 (         ): sched:sched_waking(comm: "trace", pid: 3319 (trace), prio: 120, success: 1)
         0.006 (         ): sched:sched_wakeup(comm: "trace", pid: 3319 (trace), prio: 120, success: 1)
         0.348 (         ): sched:sched_process_exec(filename: 140212596720100, pid: 3319 (sleep), old_pid: 3319 (sleep))
         0.490 (         ): msr:write_msr(msr: FS_BASE, val: 139631189321088)
         0.670 (         ): nanosleep(rqtp: 0x7ffc52c23bc0)                                    ...
         0.674 (         ): sched:sched_stat_runtime(comm: "sleep", pid: 3319 (sleep), runtime: 659259, vruntime: 78942418342)
         0.675 (         ): sched:sched_switch(prev_comm: "sleep", prev_pid: 3319 (sleep), prev_prio: 120, prev_state: 1, next_comm: "swapper/0", next_prio: 120)
      1001.059 (         ): sched:sched_waking(comm: "sleep", pid: 3319 (sleep), prio: 120, success: 1)
      1001.098 (         ): sched:sched_wakeup(comm: "sleep", pid: 3319 (sleep), prio: 120, success: 1)
         0.670 (1000.504 ms):  ... [continued]: nanosleep())                                        = 0
      1001.456 (         ): sched:sched_process_exit(comm: "sleep", pid: 3319 (sleep), prio: 120)
    # trace --libtrace --no-inherit -e msr:*,*sleep,sched:* sleep 1
         0.000 (         ): sched:sched_waking(comm=trace pid=3323 prio=120 target_cpu=000)
         0.007 (         ): sched:sched_wakeup(comm=trace pid=3323 prio=120 target_cpu=000)
         0.382 (         ): sched:sched_process_exec(filename=/usr/bin/sleep pid=3323 old_pid=3323)
         0.525 (         ): msr:write_msr(c0000100, value 7f5d508a0580)
         0.713 (         ): nanosleep(rqtp: 0x7fff487fb4a0)                                    ...
         0.717 (         ): sched:sched_stat_runtime(comm=sleep pid=3323 runtime=617722 [ns] vruntime=78957731636 [ns])
         0.719 (         ): sched:sched_switch(prev_comm=sleep prev_pid=3323 prev_prio=120 prev_state=S ==> next_comm=swapper/0 next_pid=0 next_prio=120)
      1001.117 (         ): sched:sched_waking(comm=sleep pid=3323 prio=120 target_cpu=000)
      1001.157 (         ): sched:sched_wakeup(comm=sleep pid=3323 prio=120 target_cpu=000)
         0.713 (1000.522 ms):  ... [continued]: nanosleep())                                        = 0
      1001.538 (         ): sched:sched_process_exit(comm=sleep pid=3323 prio=120)
    #

  - Make -v (verbose) mode be honoured for .perfconfig-based trace.add_events,
    to help in diagnosing problems with building eBPF events (-e source.c).
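
    For reference, a trace.add_events entry in ~/.perfconfig looks like the
    following (the source file path is illustrative):

      [trace]
              add_events = ~/git/linux/tools/perf/examples/bpf/augmented_raw_syscalls.c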

  - When using eBPF syscall payload augmentation, do not show strace-like
    syscall lines when the user specified only tracepoint events, bringing
    the behaviour in line with that of not using eBPF augmentation.

Intel PT:

  exported-sql-viewer GUI:

  Adrian Hunter:

  - Add LookupModel, HBoxLayout, VBoxLayout, global time range calculations
    so as to add a time chart by CPU.

perf script:

  Andi Kleen:

  - Allow --time (to specify a time span of interest) with --reltime
    (timestamps relative to start).
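
    A hypothetical session combining the two, limiting the output to the
    first 10% of the trace while printing timestamps relative to its start:

     # perf script --reltime --time 0%-10%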

perf diff:

  Jin Yao:

  - Report noise for cycles diff, i.e. a histogram + stddev.
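
    A hypothetical invocation; per the new documentation this option should
    be combined with '-c cycles':

     # perf diff -c cycles --cycles-hist perf.data.old perf.data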

perf annotate:

  Arnaldo Carvalho de Melo:

  - Initialize env->cpuid when running in live mode (perf top), as it
    is used in some of the per arch annotation init routines.

samples bpf:

  Björn Töpel:

  - Fixup fallout of using tools/perf/perf-sys.h from outside tools/perf.

Core:

  Ian Rogers:

  - Avoid 'sample_reg_masks' being const + weak, as this breaks with some
    compilers that constant-propagate from the weak symbol.
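
    A C sketch of the hazard, with illustrative file and symbol names (not
    the actual perf code): compiling the translation unit that holds the
    weak const default, the optimizer may treat the const value as final
    and fold it into users, so the strong override in another object file
    is never observed at run time.

      /* default.c: weak const default plus a user in the same file */
      const int nr_regs __attribute__((weak)) = 0;

      int have_regs(void)
      {
              /*
               * Seeing a const 0 at compile time, some compilers
               * constant-propagate and emit "return 0" here, even though
               * a strong nr_regs elsewhere overrides the weak one at
               * link time.
               */
              return nr_regs != 0;
      }

      /* arch.c: strong override, silently ignored by the folded code above */
      const int nr_regs = 16;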

libperf:

  - First part of moving the perf_mmap class from tools/perf to libperf.

  - Propagate CFLAGS to libperf from the tools/perf Makefile.
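
    A minimal sketch of the consumer loop the moved API expects, following
    the usage comment in the new tools/perf/lib/mmap.c (obtaining the
    struct perf_mmap pointer from the evlist is elided here, as libperf
    does not export an iterator for it yet):

      #include <perf/mmap.h>
      #include <perf/event.h>

      static void drain(struct perf_mmap *map)
      {
              union perf_event *event;

              if (perf_mmap__read_init(map) < 0) /* e.g. -EAGAIN: no data */
                      return;

              while ((event = perf_mmap__read_event(map)) != NULL) {
                      /* ... process event->header.type, payload, etc ... */
                      perf_mmap__consume(map); /* mark the event as read */
              }

              perf_mmap__read_done(map); /* required for overwrite mode */
      }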

Vendor events:

  John Garry:

  - Add an entry in MAINTAINERS with reviewers for the perf tool arm64
    pmu-events files.
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parents 4f5cafb5 cebf7d51
......@@ -12769,6 +12769,13 @@ F: arch/*/events/*
F: arch/*/events/*/*
F: tools/perf/
PERFORMANCE EVENTS SUBSYSTEM ARM64 PMU EVENTS
R: John Garry <john.garry@huawei.com>
R: Will Deacon <will@kernel.org>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
F: tools/perf/pmu-events/arch/arm64/
PERSONALITY HANDLING
M: Christoph Hellwig <hch@infradead.org>
L: linux-abi-devel@lists.sourceforge.net
......
......@@ -176,6 +176,7 @@ KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/perf
KBUILD_HOSTCFLAGS += -DHAVE_ATTR_TEST=0
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
......
......@@ -561,6 +561,11 @@ trace.*::
trace.show_zeros::
Do not suppress syscall arguments that are equal to zero.
trace.tracepoint_beautifiers::
Use "libtraceevent" to use that library to augment the tracepoint arguments,
"libbeauty", the default, to use the same argument beautifiers used in the
strace-like sys_enter+sys_exit lines.
llvm.*::
llvm.clang-path::
Path to clang. If omit, search it from $PATH.
......
......@@ -95,6 +95,11 @@ OPTIONS
diff.compute config option. See COMPARISON METHODS section for
more info.
--cycles-hist::
Report a histogram and the standard deviation for cycles data.
It can help us to judge if the reported cycles data is noisy or
not. This option should be used with '-c cycles'.
-p::
--period::
Show period values for both compared hist entries.
......
......@@ -42,6 +42,11 @@ OPTIONS
Prefixing with ! shows all syscalls but the ones specified. You may
need to escape it.
--filter=<filter>::
Event filter. This option should follow an event selector (-e) which
selects tracepoint event(s).
-D msecs::
--delay msecs::
After starting the program, wait msecs before measuring. This is useful to
......@@ -219,6 +224,11 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
may happen, for instance, when a thread gets migrated to a different CPU
while processing a syscall.
--libtraceevent_print::
Use libtraceevent to print tracepoint arguments. By default 'perf trace' uses
the same beautifiers used in the strace-like enter+exit lines to augment the
tracepoint arguments.
--map-dump::
Dump BPF maps setup by events passed via -e, for instance the augmented_raw_syscalls
living in tools/perf/examples/bpf/augmented_raw_syscalls.c. For now this
......
......@@ -188,7 +188,7 @@ endif
# Treat warnings as errors unless directed not to
ifneq ($(WERROR),0)
CFLAGS += -Werror
CORE_CFLAGS += -Werror
CXXFLAGS += -Werror
endif
......@@ -198,9 +198,9 @@ endif
ifeq ($(DEBUG),0)
ifeq ($(CC_NO_CLANG), 0)
CFLAGS += -O3
CORE_CFLAGS += -O3
else
CFLAGS += -O6
CORE_CFLAGS += -O6
endif
endif
......@@ -245,12 +245,12 @@ FEATURE_CHECK_LDFLAGS-libaio = -lrt
FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
CFLAGS += -fno-omit-frame-pointer
CFLAGS += -ggdb3
CFLAGS += -funwind-tables
CFLAGS += -Wall
CFLAGS += -Wextra
CFLAGS += -std=gnu99
CORE_CFLAGS += -fno-omit-frame-pointer
CORE_CFLAGS += -ggdb3
CORE_CFLAGS += -funwind-tables
CORE_CFLAGS += -Wall
CORE_CFLAGS += -Wextra
CORE_CFLAGS += -std=gnu99
CXXFLAGS += -std=gnu++11 -fno-exceptions -fno-rtti
CXXFLAGS += -Wall
......@@ -272,12 +272,12 @@ include $(FEATURES_DUMP)
endif
ifeq ($(feature-stackprotector-all), 1)
CFLAGS += -fstack-protector-all
CORE_CFLAGS += -fstack-protector-all
endif
ifeq ($(DEBUG),0)
ifeq ($(feature-fortify-source), 1)
CFLAGS += -D_FORTIFY_SOURCE=2
CORE_CFLAGS += -D_FORTIFY_SOURCE=2
endif
endif
......@@ -301,10 +301,12 @@ INC_FLAGS += -I$(src-perf)/util
INC_FLAGS += -I$(src-perf)
INC_FLAGS += -I$(srctree)/tools/lib/
CFLAGS += $(INC_FLAGS)
CORE_CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
CFLAGS += $(CORE_CFLAGS) $(INC_FLAGS)
CXXFLAGS += $(INC_FLAGS)
CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
LIBPERF_CFLAGS := $(CORE_CFLAGS) $(EXTRA_CFLAGS)
ifeq ($(feature-sync-compare-and-swap), 1)
CFLAGS += -DHAVE_SYNC_COMPARE_AND_SWAP_SUPPORT
......
......@@ -407,6 +407,7 @@ linux_uapi_dir := $(srctree)/tools/include/uapi/linux
asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic
arch_asm_uapi_dir := $(srctree)/tools/arch/$(SRCARCH)/include/uapi/asm/
x86_arch_asm_uapi_dir := $(srctree)/tools/arch/x86/include/uapi/asm/
x86_arch_asm_dir := $(srctree)/tools/arch/x86/include/asm/
beauty_outdir := $(OUTPUT)trace/beauty/generated
beauty_ioctl_outdir := $(beauty_outdir)/ioctl
......@@ -543,6 +544,12 @@ x86_arch_prctl_code_tbl := $(srctree)/tools/perf/trace/beauty/x86_arch_prctl.sh
$(x86_arch_prctl_code_array): $(x86_arch_asm_uapi_dir)/prctl.h $(x86_arch_prctl_code_tbl)
$(Q)$(SHELL) '$(x86_arch_prctl_code_tbl)' $(x86_arch_asm_uapi_dir) > $@
x86_arch_MSRs_array := $(beauty_outdir)/x86_arch_MSRs_array.c
x86_arch_MSRs_tbl := $(srctree)/tools/perf/trace/beauty/tracepoints/x86_msr.sh
$(x86_arch_MSRs_array): $(x86_arch_asm_dir)/msr-index.h $(x86_arch_MSRs_tbl)
$(Q)$(SHELL) '$(x86_arch_MSRs_tbl)' $(x86_arch_asm_dir) > $@
rename_flags_array := $(beauty_outdir)/rename_flags_array.c
rename_flags_tbl := $(srctree)/tools/perf/trace/beauty/rename_flags.sh
......@@ -677,6 +684,7 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc
$(perf_ioctl_array) \
$(prctl_option_array) \
$(usbdevfs_ioctl_array) \
$(x86_arch_MSRs_array) \
$(x86_arch_prctl_code_array) \
$(rename_flags_array) \
$(arch_errno_name_array) \
......@@ -761,7 +769,7 @@ $(LIBBPF)-clean:
$(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) clean >/dev/null
$(LIBPERF): FORCE
$(Q)$(MAKE) -C $(LIBPERF_DIR) O=$(OUTPUT) $(OUTPUT)libperf.a
$(Q)$(MAKE) -C $(LIBPERF_DIR) EXTRA_CFLAGS="$(LIBPERF_CFLAGS)" O=$(OUTPUT) $(OUTPUT)libperf.a
$(LIBPERF)-clean:
$(call QUIET_CLEAN, libperf)
......@@ -981,6 +989,7 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea
$(OUTPUT)$(perf_ioctl_array) \
$(OUTPUT)$(prctl_option_array) \
$(OUTPUT)$(usbdevfs_ioctl_array) \
$(OUTPUT)$(x86_arch_MSRs_array) \
$(OUTPUT)$(x86_arch_prctl_code_array) \
$(OUTPUT)$(rename_flags_array) \
$(OUTPUT)$(arch_errno_name_array) \
......
perf-y += perf_regs.o
perf-$(CONFIG_DWARF) += dwarf-regs.o
perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
......
// SPDX-License-Identifier: GPL-2.0
#include "../../util/perf_regs.h"
const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
};
perf-y += header.o
perf-y += perf_regs.o
perf-y += sym-handling.o
perf-$(CONFIG_DWARF) += dwarf-regs.o
perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
......
// SPDX-License-Identifier: GPL-2.0
#include "../../util/perf_regs.h"
const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
};
perf-y += perf_regs.o
perf-$(CONFIG_DWARF) += dwarf-regs.o
perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
// SPDX-License-Identifier: GPL-2.0
#include "../../util/perf_regs.h"
const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
};
perf-y += perf_regs.o
perf-$(CONFIG_DWARF) += dwarf-regs.o
perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
// SPDX-License-Identifier: GPL-2.0
#include "../../util/perf_regs.h"
const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
};
perf-y += header.o
perf-y += kvm-stat.o
perf-y += perf_regs.o
perf-$(CONFIG_DWARF) += dwarf-regs.o
perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
......
// SPDX-License-Identifier: GPL-2.0
#include "../../util/perf_regs.h"
const struct sample_reg sample_reg_masks[] = {
SMPL_REG_END
};
......@@ -9,6 +9,7 @@
#include <sys/prctl.h>
#include <perf/cpumap.h>
#include <perf/evlist.h>
#include <perf/mmap.h>
#include "debug.h"
#include "parse-events.h"
......@@ -117,10 +118,10 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe
for (i = 0; i < evlist->core.nr_mmaps; i++) {
md = &evlist->mmap[i];
if (perf_mmap__read_init(md) < 0)
if (perf_mmap__read_init(&md->core) < 0)
continue;
while ((event = perf_mmap__read_event(md)) != NULL) {
while ((event = perf_mmap__read_event(&md->core)) != NULL) {
struct perf_sample sample;
if (event->header.type != PERF_RECORD_COMM ||
......@@ -139,9 +140,9 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe
comm2_time = sample.time;
}
next_event:
perf_mmap__consume(md);
perf_mmap__consume(&md->core);
}
perf_mmap__read_done(md);
perf_mmap__read_done(&md->core);
}
if (!comm1_time || !comm2_time)
......
......@@ -23,6 +23,7 @@
#include "util/time-utils.h"
#include "util/annotate.h"
#include "util/map.h"
#include "util/spark.h"
#include <linux/err.h>
#include <linux/zalloc.h>
#include <subcmd/pager.h>
......@@ -53,6 +54,7 @@ enum {
PERF_HPP_DIFF__FORMULA,
PERF_HPP_DIFF__DELTA_ABS,
PERF_HPP_DIFF__CYCLES,
PERF_HPP_DIFF__CYCLES_HIST,
PERF_HPP_DIFF__MAX_INDEX
};
......@@ -87,6 +89,7 @@ static bool force;
static bool show_period;
static bool show_formula;
static bool show_baseline_only;
static bool cycles_hist;
static unsigned int sort_compute = 1;
static s64 compute_wdiff_w1;
......@@ -164,6 +167,10 @@ static struct header_column {
[PERF_HPP_DIFF__CYCLES] = {
.name = "[Program Block Range] Cycles Diff",
.width = 70,
},
[PERF_HPP_DIFF__CYCLES_HIST] = {
.name = "stddev/Hist",
.width = NUM_SPARKS + 9,
}
};
......@@ -610,6 +617,9 @@ static void init_block_info(struct block_info *bi, struct symbol *sym,
bi->cycles_aggr = ch->cycles_aggr;
bi->num = ch->num;
bi->num_aggr = ch->num_aggr;
memcpy(bi->cycles_spark, ch->cycles_spark,
NUM_SPARKS * sizeof(u64));
}
static int process_block_per_sym(struct hist_entry *he)
......@@ -689,6 +699,21 @@ static struct hist_entry *get_block_pair(struct hist_entry *he,
return NULL;
}
static void init_spark_values(unsigned long *svals, int num)
{
for (int i = 0; i < num; i++)
svals[i] = 0;
}
static void update_spark_value(unsigned long *svals, int num,
struct stats *stats, u64 val)
{
int n = stats->n;
if (n < num)
svals[n] = val;
}
static void compute_cycles_diff(struct hist_entry *he,
struct hist_entry *pair)
{
......@@ -697,6 +722,26 @@ static void compute_cycles_diff(struct hist_entry *he,
pair->diff.cycles =
pair->block_info->cycles_aggr / pair->block_info->num_aggr -
he->block_info->cycles_aggr / he->block_info->num_aggr;
if (!cycles_hist)
return;
init_stats(&pair->diff.stats);
init_spark_values(pair->diff.svals, NUM_SPARKS);
for (int i = 0; i < pair->block_info->num; i++) {
u64 val;
if (i >= he->block_info->num || i >= NUM_SPARKS)
break;
val = labs(pair->block_info->cycles_spark[i] -
he->block_info->cycles_spark[i]);
update_spark_value(pair->diff.svals, NUM_SPARKS,
&pair->diff.stats, val);
update_stats(&pair->diff.stats, val);
}
}
}
......@@ -1255,6 +1300,9 @@ static const struct option options[] = {
"Show period values."),
OPT_BOOLEAN('F', "formula", &show_formula,
"Show formula."),
OPT_BOOLEAN(0, "cycles-hist", &cycles_hist,
"Show cycles histogram and standard deviation "
"- WARNING: use only with -c cycles."),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
......@@ -1462,6 +1510,90 @@ static int hpp__color_cycles(struct perf_hpp_fmt *fmt,
return __hpp__color_compare(fmt, hpp, he, COMPUTE_CYCLES);
}
static int all_zero(unsigned long *vals, int len)
{
int i;
for (i = 0; i < len; i++)
if (vals[i] != 0)
return 0;
return 1;
}
static int print_cycles_spark(char *bf, int size, unsigned long *svals, u64 n)
{
int printed;
if (n <= 1)
return 0;
if (n > NUM_SPARKS)
n = NUM_SPARKS;
if (all_zero(svals, n))
return 0;
printed = print_spark(bf, size, svals, n);
printed += scnprintf(bf + printed, size - printed, " ");
return printed;
}
static int hpp__color_cycles_hist(struct perf_hpp_fmt *fmt,
struct perf_hpp *hpp, struct hist_entry *he)
{
struct diff_hpp_fmt *dfmt =
container_of(fmt, struct diff_hpp_fmt, fmt);
struct hist_entry *pair = get_pair_fmt(he, dfmt);
struct block_hist *bh = container_of(he, struct block_hist, he);
struct block_hist *bh_pair;
struct hist_entry *block_he;
char spark[32], buf[128];
double r;
int ret, pad;
if (!pair) {
if (bh->block_idx)
hpp->skip = true;
goto no_print;
}
bh_pair = container_of(pair, struct block_hist, he);
block_he = hists__get_entry(&bh_pair->block_hists, bh->block_idx);
if (!block_he) {
hpp->skip = true;
goto no_print;
}
ret = print_cycles_spark(spark, sizeof(spark), block_he->diff.svals,
block_he->diff.stats.n);
r = rel_stddev_stats(stddev_stats(&block_he->diff.stats),
avg_stats(&block_he->diff.stats));
if (ret) {
/*
* Padding spaces if number of sparks less than NUM_SPARKS
* otherwise the output is not aligned.
*/
pad = NUM_SPARKS - ((ret - 1) / 3);
scnprintf(buf, sizeof(buf), "%s%5.1f%% %s", "\u00B1", r, spark);
ret = scnprintf(hpp->buf, hpp->size, "%*s",
dfmt->header_width, buf);
if (pad) {
ret += scnprintf(hpp->buf + ret, hpp->size - ret,
"%-*s", pad, " ");
}
return ret;
}
no_print:
return scnprintf(hpp->buf, hpp->size, "%*s",
dfmt->header_width, " ");
}
static void
hpp__entry_unpair(struct hist_entry *he, int idx, char *buf, size_t size)
{
......@@ -1667,6 +1799,10 @@ static void data__hpp_register(struct data__file *d, int idx)
fmt->color = hpp__color_cycles;
fmt->sort = hist_entry__cmp_nop;
break;
case PERF_HPP_DIFF__CYCLES_HIST:
fmt->color = hpp__color_cycles_hist;
fmt->sort = hist_entry__cmp_nop;
break;
default:
fmt->sort = hist_entry__cmp_nop;
break;
......@@ -1692,10 +1828,14 @@ static int ui_init(void)
* PERF_HPP_DIFF__DELTA
* PERF_HPP_DIFF__RATIO
* PERF_HPP_DIFF__WEIGHTED_DIFF
* PERF_HPP_DIFF__CYCLES
*/
data__hpp_register(d, i ? compute_2_hpp[compute] :
PERF_HPP_DIFF__BASELINE);
if (cycles_hist && i)
data__hpp_register(d, PERF_HPP_DIFF__CYCLES_HIST);
/*
* And the rest:
*
......@@ -1850,6 +1990,9 @@ int cmd_diff(int argc, const char **argv)
if (quiet)
perf_quiet_option();
if (cycles_hist && (compute != COMPUTE_CYCLES))
usage_with_options(diff_usage, options);
symbol__annotation_init();
if (symbol__init(NULL) < 0)
......
......@@ -46,6 +46,7 @@
#include <semaphore.h>
#include <signal.h>
#include <math.h>
#include <perf/mmap.h>
static const char *get_filename_for_perf_kvm(void)
{
......@@ -759,14 +760,14 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx,
*mmap_time = ULLONG_MAX;
md = &evlist->mmap[idx];
err = perf_mmap__read_init(md);
err = perf_mmap__read_init(&md->core);
if (err < 0)
return (err == -EAGAIN) ? 0 : -1;
while ((event = perf_mmap__read_event(md)) != NULL) {
while ((event = perf_mmap__read_event(&md->core)) != NULL) {
err = perf_evlist__parse_sample_timestamp(evlist, event, &timestamp);
if (err) {
perf_mmap__consume(md);
perf_mmap__consume(&md->core);
pr_err("Failed to parse sample\n");
return -1;
}
......@@ -776,7 +777,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx,
* FIXME: Here we can't consume the event, as perf_session__queue_event will
* point to it, and it'll get possibly overwritten by the kernel.
*/
perf_mmap__consume(md);
perf_mmap__consume(&md->core);
if (err) {
pr_err("Failed to enqueue sample: %d\n", err);
......@@ -793,7 +794,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx,
break;
}
perf_mmap__read_done(md);
perf_mmap__read_done(&md->core);
return n;
}
......
......@@ -197,7 +197,7 @@ static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
* every aio write request started in record__aio_push() so
* decrement it because the request is now complete.
*/
perf_mmap__put(md);
perf_mmap__put(&md->core);
rc = 1;
} else {
/*
......@@ -276,7 +276,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size
if (record__comp_enabled(aio->rec)) {
size = zstd_compress(aio->rec->session, aio->data + aio->size,
perf_mmap__mmap_len(map) - aio->size,
mmap__mmap_len(map) - aio->size,
buf, size);
} else {
memcpy(aio->data + aio->size, buf, size);
......@@ -293,7 +293,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size
* after started aio request completion or at record__aio_push()
* if the request failed to start.
*/
perf_mmap__get(map);
perf_mmap__get(&map->core);
}
aio->size += size;
......@@ -332,7 +332,7 @@ static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
* map->refcount is decremented in record__aio_complete() after
* aio write operation finishes successfully.
*/
perf_mmap__put(map);
perf_mmap__put(&map->core);
}
return ret;
......@@ -488,7 +488,7 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
struct record *rec = to;
if (record__comp_enabled(rec)) {
size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
bf = map->data;
}
......
......@@ -3605,11 +3605,6 @@ int cmd_script(int argc, const char **argv)
}
}
if (script.time_str && reltime) {
fprintf(stderr, "Don't combine --reltime with --time\n");
return -1;
}
if (itrace_synth_opts.callchain &&
itrace_synth_opts.callchain_sz > scripting_max_stack)
scripting_max_stack = itrace_synth_opts.callchain_sz;
......
......@@ -82,6 +82,7 @@
#include <linux/err.h>
#include <linux/ctype.h>
#include <perf/mmap.h>
static volatile int done;
static volatile int resize;
......@@ -869,10 +870,10 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
union perf_event *event;
md = opts->overwrite ? &evlist->overwrite_mmap[idx] : &evlist->mmap[idx];
if (perf_mmap__read_init(md) < 0)
if (perf_mmap__read_init(&md->core) < 0)
return;
while ((event = perf_mmap__read_event(md)) != NULL) {
while ((event = perf_mmap__read_event(&md->core)) != NULL) {
int ret;
ret = perf_evlist__parse_sample_timestamp(evlist, event, &last_timestamp);
......@@ -883,7 +884,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
if (ret)
break;
perf_mmap__consume(md);
perf_mmap__consume(&md->core);
if (top->qe.rotate) {
pthread_mutex_lock(&top->qe.mutex);
......@@ -893,7 +894,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
}
}
perf_mmap__read_done(md);
perf_mmap__read_done(&md->core);
}
static void perf_top__mmap_read(struct perf_top *top)
......@@ -1560,6 +1561,17 @@ int cmd_top(int argc, const char **argv)
status = perf_config(perf_top_config, &top);
if (status)
return status;
/*
* Since the per arch annotation init routine may need the cpuid, read
* it here, since we are not getting this from the perf.data header.
*/
status = perf_env__read_cpuid(&perf_env);
if (status) {
pr_err("Couldn't read the cpuid for this machine: %s\n",
str_error_r(errno, errbuf, sizeof(errbuf)));
goto out_delete_evlist;
}
top.evlist->env = &perf_env;
argc = parse_options(argc, argv, options, top_usage, 0);
if (argc)
......
......@@ -28,6 +28,7 @@ arch/x86/include/asm/disabled-features.h
arch/x86/include/asm/required-features.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/inat_types.h
arch/x86/include/asm/msr-index.h
arch/x86/include/uapi/asm/prctl.h
arch/x86/lib/x86-opcode-map.txt
arch/x86/tools/gen-insn-attr-x86.awk
......
......@@ -3,6 +3,7 @@ libperf-y += cpumap.o
libperf-y += threadmap.o
libperf-y += evsel.o
libperf-y += evlist.o
libperf-y += mmap.o
libperf-y += zalloc.o
libperf-y += xyarray.o
libperf-y += lib.o
......
......@@ -172,8 +172,9 @@ install_headers:
$(call do_install,include/perf/cpumap.h,$(prefix)/include/perf,644); \
$(call do_install,include/perf/threadmap.h,$(prefix)/include/perf,644); \
$(call do_install,include/perf/evlist.h,$(prefix)/include/perf,644); \
$(call do_install,include/perf/evsel.h,$(prefix)/include/perf,644);
$(call do_install,include/perf/event.h,$(prefix)/include/perf,644);
$(call do_install,include/perf/evsel.h,$(prefix)/include/perf,644); \
$(call do_install,include/perf/event.h,$(prefix)/include/perf,644); \
$(call do_install,include/perf/mmap.h,$(prefix)/include/perf,644);
install_pkgconfig: $(LIBPERF_PC)
$(call QUIET_INSTALL, $(LIBPERF_PC)) \
......
......@@ -5,11 +5,12 @@
#include <stdio.h>
#include <stdarg.h>
#include <unistd.h>
#include <linux/compiler.h>
#include <perf/core.h>
#include <internal/lib.h>
#include "internal.h"
static int __base_pr(enum libperf_print_level level, const char *format,
static int __base_pr(enum libperf_print_level level __maybe_unused, const char *format,
va_list args)
{
return vfprintf(stderr, format, args);
......
......@@ -8,13 +8,20 @@
#include <internal/evlist.h>
#include <internal/evsel.h>
#include <internal/xyarray.h>
#include <internal/mmap.h>
#include <internal/cpumap.h>
#include <internal/threadmap.h>
#include <internal/xyarray.h>
#include <internal/lib.h>
#include <linux/zalloc.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <sys/mman.h>
#include <perf/cpumap.h>
#include <perf/threadmap.h>
#include <api/fd/array.h>
......@@ -27,6 +34,7 @@ void perf_evlist__init(struct perf_evlist *evlist)
INIT_HLIST_HEAD(&evlist->heads[i]);
INIT_LIST_HEAD(&evlist->entries);
evlist->nr_entries = 0;
fdarray__init(&evlist->pollfd, 64);
}
static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
......@@ -101,8 +109,36 @@ perf_evlist__next(struct perf_evlist *evlist, struct perf_evsel *prev)
return next;
}
static void perf_evlist__purge(struct perf_evlist *evlist)
{
struct perf_evsel *pos, *n;
perf_evlist__for_each_entry_safe(evlist, n, pos) {
list_del_init(&pos->node);
perf_evsel__delete(pos);
}
evlist->nr_entries = 0;
}
void perf_evlist__exit(struct perf_evlist *evlist)
{
perf_cpu_map__put(evlist->cpus);
perf_thread_map__put(evlist->threads);
evlist->cpus = NULL;
evlist->threads = NULL;
fdarray__exit(&evlist->pollfd);
}
void perf_evlist__delete(struct perf_evlist *evlist)
{
if (evlist == NULL)
return;
perf_evlist__munmap(evlist);
perf_evlist__close(evlist);
perf_evlist__purge(evlist);
perf_evlist__exit(evlist);
free(evlist);
}
......@@ -277,7 +313,295 @@ int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd,
return pos;
}
static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd,
void *arg __maybe_unused)
{
struct perf_mmap *map = fda->priv[fd].ptr;
if (map)
perf_mmap__put(map);
}
int perf_evlist__filter_pollfd(struct perf_evlist *evlist, short revents_and_mask)
{
return fdarray__filter(&evlist->pollfd, revents_and_mask,
perf_evlist__munmap_filtered, NULL);
}
int perf_evlist__poll(struct perf_evlist *evlist, int timeout)
{
return fdarray__poll(&evlist->pollfd, timeout);
}
static struct perf_mmap* perf_evlist__alloc_mmap(struct perf_evlist *evlist, bool overwrite)
{
int i;
struct perf_mmap *map;
evlist->nr_mmaps = perf_cpu_map__nr(evlist->cpus);
if (perf_cpu_map__empty(evlist->cpus))
evlist->nr_mmaps = perf_thread_map__nr(evlist->threads);
map = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
if (!map)
return NULL;
for (i = 0; i < evlist->nr_mmaps; i++) {
/*
* When the perf_mmap() call is made we grab one refcount, plus
* one extra to let perf_mmap__consume() get the last
* events after all real references (perf_mmap__get()) are
* dropped.
*
* Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and
* thus does perf_mmap__get() on it.
*/
perf_mmap__init(&map[i], overwrite, NULL);
}
return map;
}
static void perf_evlist__set_sid_idx(struct perf_evlist *evlist,
struct perf_evsel *evsel, int idx, int cpu,
int thread)
{
struct perf_sample_id *sid = SID(evsel, cpu, thread);
sid->idx = idx;
if (evlist->cpus && cpu >= 0)
sid->cpu = evlist->cpus->map[cpu];
else
sid->cpu = -1;
if (!evsel->system_wide && evlist->threads && thread >= 0)
sid->tid = perf_thread_map__pid(evlist->threads, thread);
else
sid->tid = -1;
}
static struct perf_mmap*
perf_evlist__mmap_cb_get(struct perf_evlist *evlist, bool overwrite, int idx)
{
struct perf_mmap *map = &evlist->mmap[idx];
if (overwrite) {
if (!evlist->mmap_ovw) {
evlist->mmap_ovw = perf_evlist__alloc_mmap(evlist, true);
if (!evlist->mmap_ovw)
return NULL;
}
map = &evlist->mmap_ovw[idx];
}
return map;
}
#define FD(e, x, y) (*(int *) xyarray__entry(e->fd, x, y))
static int
perf_evlist__mmap_cb_mmap(struct perf_mmap *map, struct perf_mmap_param *mp,
int output, int cpu)
{
return perf_mmap__mmap(map, mp, output, cpu);
}
static int
mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
int idx, struct perf_mmap_param *mp, int cpu_idx,
int thread, int *_output, int *_output_overwrite)
{
int evlist_cpu = perf_cpu_map__cpu(evlist->cpus, cpu_idx);
struct perf_evsel *evsel;
int revent;
perf_evlist__for_each_entry(evlist, evsel) {
bool overwrite = evsel->attr.write_backward;
struct perf_mmap *map;
int *output, fd, cpu;
if (evsel->system_wide && thread)
continue;
cpu = perf_cpu_map__idx(evsel->cpus, evlist_cpu);
if (cpu == -1)
continue;
map = ops->get(evlist, overwrite, idx);
if (map == NULL)
return -ENOMEM;
if (overwrite) {
mp->prot = PROT_READ;
output = _output_overwrite;
} else {
mp->prot = PROT_READ | PROT_WRITE;
output = _output;
}
fd = FD(evsel, cpu, thread);
if (*output == -1) {
*output = fd;
/*
* The last one will be done at perf_mmap__consume(), so that we
* make sure we don't prevent tools from consuming every last event in
* the ring buffer.
*
* I.e. we can get the POLLHUP meaning that the fd doesn't exist
* anymore, but the last events for it are still in the ring buffer,
* waiting to be consumed.
*
* Tools can chose to ignore this at their own discretion, but the
* evlist layer can't just drop it when filtering events in
* perf_evlist__filter_pollfd().
*/
refcount_set(&map->refcnt, 2);
if (ops->mmap(map, mp, *output, evlist_cpu) < 0)
return -1;
} else {
if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0)
return -1;
perf_mmap__get(map);
}
revent = !overwrite ? POLLIN : 0;
if (!evsel->system_wide &&
perf_evlist__add_pollfd(evlist, fd, map, revent) < 0) {
perf_mmap__put(map);
return -1;
}
if (evsel->attr.read_format & PERF_FORMAT_ID) {
if (perf_evlist__id_add_fd(evlist, evsel, cpu, thread,
fd) < 0)
return -1;
perf_evlist__set_sid_idx(evlist, evsel, idx, cpu,
thread);
}
}
return 0;
}
static int
mmap_per_thread(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
struct perf_mmap_param *mp)
{
int thread;
int nr_threads = perf_thread_map__nr(evlist->threads);
for (thread = 0; thread < nr_threads; thread++) {
int output = -1;
int output_overwrite = -1;
if (ops->idx)
ops->idx(evlist, mp, thread, false);
if (mmap_per_evsel(evlist, ops, thread, mp, 0, thread,
&output, &output_overwrite))
goto out_unmap;
}
return 0;
out_unmap:
perf_evlist__munmap(evlist);
return -1;
}
static int
mmap_per_cpu(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
struct perf_mmap_param *mp)
{
int nr_threads = perf_thread_map__nr(evlist->threads);
int nr_cpus = perf_cpu_map__nr(evlist->cpus);
int cpu, thread;
for (cpu = 0; cpu < nr_cpus; cpu++) {
int output = -1;
int output_overwrite = -1;
if (ops->idx)
ops->idx(evlist, mp, cpu, true);
for (thread = 0; thread < nr_threads; thread++) {
if (mmap_per_evsel(evlist, ops, cpu, mp, cpu,
thread, &output, &output_overwrite))
goto out_unmap;
}
}
return 0;
out_unmap:
perf_evlist__munmap(evlist);
return -1;
}
int perf_evlist__mmap_ops(struct perf_evlist *evlist,
struct perf_evlist_mmap_ops *ops,
struct perf_mmap_param *mp)
{
struct perf_evsel *evsel;
const struct perf_cpu_map *cpus = evlist->cpus;
const struct perf_thread_map *threads = evlist->threads;
if (!ops || !ops->get || !ops->mmap)
return -EINVAL;
if (!evlist->mmap)
evlist->mmap = perf_evlist__alloc_mmap(evlist, false);
if (!evlist->mmap)
return -ENOMEM;
perf_evlist__for_each_entry(evlist, evsel) {
if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
evsel->sample_id == NULL &&
perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0)
return -ENOMEM;
}
if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0)
return -ENOMEM;
if (perf_cpu_map__empty(cpus))
return mmap_per_thread(evlist, ops, mp);
return mmap_per_cpu(evlist, ops, mp);
}
int perf_evlist__mmap(struct perf_evlist *evlist, int pages)
{
struct perf_mmap_param mp;
struct perf_evlist_mmap_ops ops = {
.get = perf_evlist__mmap_cb_get,
.mmap = perf_evlist__mmap_cb_mmap,
};
evlist->mmap_len = (pages + 1) * page_size;
mp.mask = evlist->mmap_len - page_size - 1;
return perf_evlist__mmap_ops(evlist, &ops, &mp);
}
void perf_evlist__munmap(struct perf_evlist *evlist)
{
int i;
if (evlist->mmap) {
for (i = 0; i < evlist->nr_mmaps; i++)
perf_mmap__munmap(&evlist->mmap[i]);
}
if (evlist->mmap_ovw) {
for (i = 0; i < evlist->nr_mmaps; i++)
perf_mmap__munmap(&evlist->mmap_ovw[i]);
}
zfree(&evlist->mmap);
zfree(&evlist->mmap_ovw);
}
......@@ -11,6 +11,7 @@
struct perf_cpu_map;
struct perf_thread_map;
struct perf_mmap_param;
struct perf_evlist {
struct list_head entries;
......@@ -22,12 +23,33 @@ struct perf_evlist {
size_t mmap_len;
struct fdarray pollfd;
struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
struct perf_mmap *mmap;
struct perf_mmap *mmap_ovw;
};
typedef void
(*perf_evlist_mmap__cb_idx_t)(struct perf_evlist*, struct perf_mmap_param*, int, bool);
typedef struct perf_mmap*
(*perf_evlist_mmap__cb_get_t)(struct perf_evlist*, bool, int);
typedef int
(*perf_evlist_mmap__cb_mmap_t)(struct perf_mmap*, struct perf_mmap_param*, int, int);
struct perf_evlist_mmap_ops {
perf_evlist_mmap__cb_idx_t idx;
perf_evlist_mmap__cb_get_t get;
perf_evlist_mmap__cb_mmap_t mmap;
};
int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd,
void *ptr, short revent);
int perf_evlist__mmap_ops(struct perf_evlist *evlist,
struct perf_evlist_mmap_ops *ops,
struct perf_mmap_param *mp);
void perf_evlist__exit(struct perf_evlist *evlist);
/**
* __perf_evlist__for_each_entry - iterate thru all the evsels
* @list: list_head instance to iterate
......@@ -60,6 +82,24 @@ int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd,
#define perf_evlist__for_each_entry_reverse(evlist, evsel) \
__perf_evlist__for_each_entry_reverse(&(evlist)->entries, evsel)
/**
* __perf_evlist__for_each_entry_safe - safely iterate thru all the evsels
* @list: list_head instance to iterate
* @tmp: struct evsel temp iterator
* @evsel: struct evsel iterator
*/
#define __perf_evlist__for_each_entry_safe(list, tmp, evsel) \
list_for_each_entry_safe(evsel, tmp, list, node)
/**
* perf_evlist__for_each_entry_safe - safely iterate thru all the evsels
* @evlist: evlist instance to iterate
* @evsel: struct evsel iterator
* @tmp: struct evsel temp iterator
*/
#define perf_evlist__for_each_entry_safe(evlist, tmp, evsel) \
__perf_evlist__for_each_entry_safe(&(evlist)->entries, tmp, evsel)
static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist)
{
return list_entry(evlist->entries.next, struct perf_evsel, node);
......
......@@ -10,23 +10,45 @@
/* perf sample has 16 bits size limit */
#define PERF_SAMPLE_MAX_SIZE (1 << 16)
struct perf_mmap;
typedef void (*libperf_unmap_cb_t)(struct perf_mmap *map);
/**
* struct perf_mmap - perf's ring buffer mmap details
*
* @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this
*/
struct perf_mmap {
void *base;
int mask;
int fd;
int cpu;
refcount_t refcnt;
u64 prev;
u64 start;
u64 end;
bool overwrite;
u64 flush;
char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
void *base;
int mask;
int fd;
int cpu;
refcount_t refcnt;
u64 prev;
u64 start;
u64 end;
bool overwrite;
u64 flush;
libperf_unmap_cb_t unmap_cb;
char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
};
struct perf_mmap_param {
int prot;
int mask;
};
size_t perf_mmap__mmap_len(struct perf_mmap *map);
void perf_mmap__init(struct perf_mmap *map, bool overwrite,
libperf_unmap_cb_t unmap_cb);
int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp,
int fd, int cpu);
void perf_mmap__munmap(struct perf_mmap *map);
void perf_mmap__get(struct perf_mmap *map);
void perf_mmap__put(struct perf_mmap *map);
u64 perf_mmap__read_head(struct perf_mmap *map);
#endif /* __LIBPERF_INTERNAL_MMAP_H */
......@@ -12,6 +12,8 @@ enum libperf_print_level {
LIBPERF_WARN,
LIBPERF_INFO,
LIBPERF_DEBUG,
LIBPERF_DEBUG2,
LIBPERF_DEBUG3,
};
typedef int (*libperf_print_fn_t)(enum libperf_print_level level,
......
......@@ -32,5 +32,10 @@ LIBPERF_API void perf_evlist__set_maps(struct perf_evlist *evlist,
struct perf_cpu_map *cpus,
struct perf_thread_map *threads);
LIBPERF_API int perf_evlist__poll(struct perf_evlist *evlist, int timeout);
LIBPERF_API int perf_evlist__filter_pollfd(struct perf_evlist *evlist,
short revents_and_mask);
LIBPERF_API int perf_evlist__mmap(struct perf_evlist *evlist, int pages);
LIBPERF_API void perf_evlist__munmap(struct perf_evlist *evlist);
#endif /* __LIBPERF_EVLIST_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LIBPERF_MMAP_H
#define __LIBPERF_MMAP_H
#include <perf/core.h>
struct perf_mmap;
union perf_event;
LIBPERF_API void perf_mmap__consume(struct perf_mmap *map);
LIBPERF_API int perf_mmap__read_init(struct perf_mmap *map);
LIBPERF_API void perf_mmap__read_done(struct perf_mmap *map);
LIBPERF_API union perf_event *perf_mmap__read_event(struct perf_mmap *map);
#endif /* __LIBPERF_MMAP_H */
......@@ -14,5 +14,7 @@ do { \
#define pr_warning(fmt, ...) __pr(LIBPERF_WARN, fmt, ##__VA_ARGS__)
#define pr_info(fmt, ...) __pr(LIBPERF_INFO, fmt, ##__VA_ARGS__)
#define pr_debug(fmt, ...) __pr(LIBPERF_DEBUG, fmt, ##__VA_ARGS__)
#define pr_debug2(fmt, ...) __pr(LIBPERF_DEBUG2, fmt, ##__VA_ARGS__)
#define pr_debug3(fmt, ...) __pr(LIBPERF_DEBUG3, fmt, ##__VA_ARGS__)
#endif /* __LIBPERF_INTERNAL_H */
......@@ -40,6 +40,13 @@ LIBPERF_0.0.1 {
perf_evlist__next;
perf_evlist__set_maps;
perf_evlist__poll;
perf_evlist__mmap;
perf_evlist__munmap;
perf_evlist__filter_pollfd;
perf_mmap__consume;
perf_mmap__read_init;
perf_mmap__read_done;
perf_mmap__read_event;
local:
*;
};
// SPDX-License-Identifier: GPL-2.0
#include <sys/mman.h>
#include <inttypes.h>
#include <asm/bug.h>
#include <errno.h>
#include <string.h>
#include <linux/ring_buffer.h>
#include <linux/perf_event.h>
#include <perf/mmap.h>
#include <perf/event.h>
#include <internal/mmap.h>
#include <internal/lib.h>
#include <linux/kernel.h>
#include "internal.h"
void perf_mmap__init(struct perf_mmap *map, bool overwrite,
libperf_unmap_cb_t unmap_cb)
{
map->fd = -1;
map->overwrite = overwrite;
map->unmap_cb = unmap_cb;
refcount_set(&map->refcnt, 0);
}
size_t perf_mmap__mmap_len(struct perf_mmap *map)
{
return map->mask + 1 + page_size;
}
int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp,
int fd, int cpu)
{
map->prev = 0;
map->mask = mp->mask;
map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
MAP_SHARED, fd, 0);
if (map->base == MAP_FAILED) {
map->base = NULL;
return -1;
}
map->fd = fd;
map->cpu = cpu;
return 0;
}
void perf_mmap__munmap(struct perf_mmap *map)
{
if (map && map->base != NULL) {
munmap(map->base, perf_mmap__mmap_len(map));
map->base = NULL;
map->fd = -1;
refcount_set(&map->refcnt, 0);
}
if (map && map->unmap_cb)
map->unmap_cb(map);
}
void perf_mmap__get(struct perf_mmap *map)
{
refcount_inc(&map->refcnt);
}
void perf_mmap__put(struct perf_mmap *map)
{
BUG_ON(map->base && refcount_read(&map->refcnt) == 0);
if (refcount_dec_and_test(&map->refcnt))
perf_mmap__munmap(map);
}
static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
{
ring_buffer_write_tail(md->base, tail);
}
u64 perf_mmap__read_head(struct perf_mmap *map)
{
return ring_buffer_read_head(map->base);
}
static bool perf_mmap__empty(struct perf_mmap *map)
{
struct perf_event_mmap_page *pc = map->base;
return perf_mmap__read_head(map) == map->prev && !pc->aux_size;
}
void perf_mmap__consume(struct perf_mmap *map)
{
if (!map->overwrite) {
u64 old = map->prev;
perf_mmap__write_tail(map, old);
}
if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map))
perf_mmap__put(map);
}
static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end)
{
struct perf_event_header *pheader;
u64 evt_head = *start;
int size = mask + 1;
pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start);
pheader = (struct perf_event_header *)(buf + (*start & mask));
while (true) {
if (evt_head - *start >= (unsigned int)size) {
pr_debug("Finished reading overwrite ring buffer: rewind\n");
if (evt_head - *start > (unsigned int)size)
evt_head -= pheader->size;
*end = evt_head;
return 0;
}
pheader = (struct perf_event_header *)(buf + (evt_head & mask));
if (pheader->size == 0) {
pr_debug("Finished reading overwrite ring buffer: get start\n");
*end = evt_head;
return 0;
}
evt_head += pheader->size;
pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
}
WARN_ONCE(1, "Shouldn't get here\n");
return -1;
}
/*
* Report the start and end of the available data in ringbuffer
*/
static int __perf_mmap__read_init(struct perf_mmap *md)
{
u64 head = perf_mmap__read_head(md);
u64 old = md->prev;
unsigned char *data = md->base + page_size;
unsigned long size;
md->start = md->overwrite ? head : old;
md->end = md->overwrite ? old : head;
if ((md->end - md->start) < md->flush)
return -EAGAIN;
size = md->end - md->start;
if (size > (unsigned long)(md->mask) + 1) {
if (!md->overwrite) {
WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
md->prev = head;
perf_mmap__consume(md);
return -EAGAIN;
}
/*
* Backward ring buffer is full. We still have a chance to read
* most of data from it.
*/
if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end))
return -EINVAL;
}
return 0;
}
int perf_mmap__read_init(struct perf_mmap *map)
{
/*
* Check if event was unmapped due to a POLLHUP/POLLERR.
*/
if (!refcount_read(&map->refcnt))
return -ENOENT;
return __perf_mmap__read_init(map);
}
/*
* Mandatory for overwrite mode
* The direction of overwrite mode is backward.
* The last perf_mmap__read() will set tail to map->core.prev.
* Need to correct the map->core.prev to head which is the end of next read.
*/
void perf_mmap__read_done(struct perf_mmap *map)
{
/*
* Check if event was unmapped due to a POLLHUP/POLLERR.
*/
if (!refcount_read(&map->refcnt))
return;
map->prev = perf_mmap__read_head(map);
}
/* When check_messup is true, 'end' must points to a good entry */
static union perf_event *perf_mmap__read(struct perf_mmap *map,
u64 *startp, u64 end)
{
unsigned char *data = map->base + page_size;
union perf_event *event = NULL;
int diff = end - *startp;
if (diff >= (int)sizeof(event->header)) {
size_t size;
event = (union perf_event *)&data[*startp & map->mask];
size = event->header.size;
if (size < sizeof(event->header) || diff < (int)size)
return NULL;
/*
* Event straddles the mmap boundary -- header should always
* be inside due to u64 alignment of output.
*/
if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) {
unsigned int offset = *startp;
unsigned int len = min(sizeof(*event), size), cpy;
void *dst = map->event_copy;
do {
cpy = min(map->mask + 1 - (offset & map->mask), len);
memcpy(dst, &data[offset & map->mask], cpy);
offset += cpy;
dst += cpy;
len -= cpy;
} while (len);
event = (union perf_event *)map->event_copy;
}
*startp += size;
}
return event;
}
/*
* Read event from ring buffer one by one.
* Return one event for each call.
*
* Usage:
* perf_mmap__read_init()
* while(event = perf_mmap__read_event()) {
* //process the event
* perf_mmap__consume()
* }
* perf_mmap__read_done()
*/
union perf_event *perf_mmap__read_event(struct perf_mmap *map)
{
union perf_event *event;
/*
* Check if event was unmapped due to a POLLHUP/POLLERR.
*/
if (!refcount_read(&map->refcnt))
return NULL;
/* non-overwirte doesn't pause the ringbuffer */
if (!map->overwrite)
map->end = perf_mmap__read_head(map);
event = perf_mmap__read(map, &map->start, map->end);
if (!map->overwrite)
map->prev = map->start;
return event;
}
......@@ -15,7 +15,9 @@ void test_attr__init(void);
void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
int fd, int group_fd, unsigned long flags);
#define HAVE_ATTR_TEST
#ifndef HAVE_ATTR_TEST
#define HAVE_ATTR_TEST 1
#endif
static inline int
sys_perf_event_open(struct perf_event_attr *attr,
......@@ -27,7 +29,7 @@ sys_perf_event_open(struct perf_event_attr *attr,
fd = syscall(__NR_perf_event_open, attr, pid, cpu,
group_fd, flags);
#ifdef HAVE_ATTR_TEST
#if HAVE_ATTR_TEST
if (unlikely(test_attr__enabled))
test_attr__open(attr, pid, cpu, fd, group_fd, flags);
#endif
......
......@@ -13,6 +13,7 @@
#include "util/mmap.h"
#include <errno.h>
#include <linux/string.h>
#include <perf/mmap.h>
#define NR_ITERS 111
......@@ -37,8 +38,8 @@ static int count_samples(struct evlist *evlist, int *sample_count,
struct mmap *map = &evlist->overwrite_mmap[i];
union perf_event *event;
perf_mmap__read_init(map);
while ((event = perf_mmap__read_event(map)) != NULL) {
perf_mmap__read_init(&map->core);
while ((event = perf_mmap__read_event(&map->core)) != NULL) {
const u32 type = event->header.type;
switch (type) {
......@@ -53,7 +54,7 @@ static int count_samples(struct evlist *evlist, int *sample_count,
return TEST_FAIL;
}
}
perf_mmap__read_done(map);
perf_mmap__read_done(&map->core);
}
return TEST_OK;
}
......
......@@ -15,6 +15,7 @@
#include <linux/string.h>
#include <api/fs/fs.h>
#include <bpf/bpf.h>
#include <perf/mmap.h>
#include "tests.h"
#include "llvm.h"
#include "debug.h"
......@@ -184,16 +185,16 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
struct mmap *md;
md = &evlist->mmap[i];
if (perf_mmap__read_init(md) < 0)
if (perf_mmap__read_init(&md->core) < 0)
continue;
while ((event = perf_mmap__read_event(md)) != NULL) {
while ((event = perf_mmap__read_event(&md->core)) != NULL) {
const u32 type = event->header.type;
if (type == PERF_RECORD_SAMPLE)
count ++;
}
perf_mmap__read_done(md);
perf_mmap__read_done(&md->core);
}
if (count != expect) {
......
......@@ -10,6 +10,7 @@
#include <sys/param.h>
#include <perf/cpumap.h>
#include <perf/evlist.h>
#include <perf/mmap.h>
#include "debug.h"
#include "dso.h"
......@@ -425,16 +426,16 @@ static int process_events(struct machine *machine, struct evlist *evlist,
for (i = 0; i < evlist->core.nr_mmaps; i++) {
md = &evlist->mmap[i];
if (perf_mmap__read_init(md) < 0)
if (perf_mmap__read_init(&md->core) < 0)
continue;
while ((event = perf_mmap__read_event(md)) != NULL) {
while ((event = perf_mmap__read_event(&md->core)) != NULL) {
ret = process_event(machine, evlist, event, state);
perf_mmap__consume(md);
perf_mmap__consume(&md->core);
if (ret < 0)
return ret;
}
perf_mmap__read_done(md);
perf_mmap__read_done(&md->core);
}
return 0;
}
......
......@@ -5,6 +5,7 @@
#include <sys/prctl.h>
#include <perf/cpumap.h>
#include <perf/evlist.h>
#include <perf/mmap.h>
#include "debug.h"
#include "parse-events.h"
......@@ -38,17 +39,17 @@ static int find_comm(struct evlist *evlist, const char *comm)
found = 0;
for (i = 0; i < evlist->core.nr_mmaps; i++) {
md = &evlist->mmap[i];
if (perf_mmap__read_init(md) < 0)
if (perf_mmap__read_init(&md->core) < 0)
continue;
while ((event = perf_mmap__read_event(md)) != NULL) {
while ((event = perf_mmap__read_event(&md->core)) != NULL) {
if (event->header.type == PERF_RECORD_COMM &&
(pid_t)event->comm.pid == getpid() &&
(pid_t)event->comm.tid == getpid() &&
strcmp(event->comm.comm, comm) == 0)
found += 1;
perf_mmap__consume(md);
perf_mmap__consume(&md->core);
}
perf_mmap__read_done(md);
perf_mmap__read_done(&md->core);
}
return found;
}
......
......@@ -17,3 +17,4 @@ perf-y += sockaddr.o
perf-y += socket.o
perf-y += statx.o
perf-y += sync_file_range.o
perf-y += tracepoints/
......@@ -95,6 +95,7 @@ perf-y += cloexec.o
perf-y += call-path.o
perf-y += rwsem.o
perf-y += thread-stack.o
perf-y += spark.o
perf-$(CONFIG_AUXTRACE) += auxtrace.o
perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
perf-$(CONFIG_AUXTRACE) += intel-pt.o
......