Merge tag 'perf-core-for-mingo-4.15-20171023' of...

Merge tag 'perf-core-for-mingo-4.15-20171023' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo: - Update vendor events JSON metrics for Intel's Broadwell, Broadwell Server, Haswell, Haswell Server, IvyBridge, IvyTown, JakeTown, Sandy Bridge, Skylake and SkyLake Server (Andi Kleen) - Add vendor event file for Intel's Goldmont Plus V1 (Kan Liang) - Move perf_mmap methods from 'perf record' and evlist.c to a separate mmap.[ch] pair, to better separate things and pave the way for further work on multithreading tools (Arnaldo Carvalho de Melo) - Do not check ABI headers in a detached tarball build, as it the kernel headers from where we copied tools/include/ are by definition not available (Arnaldo Carvalho de Melo) - Make 'perf script' use fprintf() like printing, i.e. receiving a FILE pointer so that it gets consistent with other tools/ code and allows for printing to per-event files (Arnaldo Carvalho de Melo) - Error handling fixes (resource release on exit) for 'perf script' and 'perf kmem' (Christophe JAILLET) - Make some 'perf event attr' tests optional on virtual machines, where tested counters are not available (Jiri Olsa) Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@kernel.org>

Merge tag 'perf-core-for-mingo-4.15-20171023' of...
Merge tag 'perf-core-for-mingo-4.15-20171023' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo: - Update vendor events JSON metrics for Intel's Broadwell, Broadwell Server, Haswell, Haswell Server, IvyBridge, IvyTown, JakeTown, Sandy Bridge, Skylake and SkyLake Server (Andi Kleen) - Add vendor event file for Intel's Goldmont Plus V1 (Kan Liang) - Move perf_mmap methods from 'perf record' and evlist.c to a separate mmap.[ch] pair, to better separate things and pave the way for further work on multithreading tools (Arnaldo Carvalho de Melo) - Do not check ABI headers in a detached tarball build, as it the kernel headers from where we copied tools/include/ are by definition not available (Arnaldo Carvalho de Melo) - Make 'perf script' use fprintf() like printing, i.e. receiving a FILE pointer so that it gets consistent with other tools/ code and allows for printing to per-event files (Arnaldo Carvalho de Melo) - Error handling fixes (resource release on exit) for 'perf script' and 'perf kmem' (Christophe JAILLET) - Make some 'perf event attr' tests optional on virtual machines, where tested counters are not available (Jiri Olsa) Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@kernel.org>
9b7c8547 · Ingo Molnar · 8776fe75 · 65db92e0 · 9b7c8547 · 9b7c8547
Commit 9b7c8547 authored Oct 24, 2017 by Ingo Molnar
46 changed files
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -204,7 +204,7 @@ For example Intel Core CPUs typically have four generic performance counters
 for the core, plus three fixed counters for instructions, cycles and
 ref-cycles. Some special events have restrictions on which counter they
 can schedule, and may not support multiple instances in a single group.
-When too many events are specified in the group none of them will not
+When too many events are specified in the group some of them will not
 be measured.

 Globally pinned events can limit the number of counters available for

--- a/tools/perf/arch/arm/annotate/instructions.c
+++ b/tools/perf/arch/arm/annotate/instructions.c
+#include <linux/compiler.h>
 #include <sys/types.h>
 #include <regex.h>

@@ -23,7 +24,7 @@ static struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const c
 	return ops;
 }

-static int arm__annotate_init(struct arch *arch)
+static int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
 {
 	struct arm_annotate *arm;
 	int err;

--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ b/tools/perf/arch/arm64/annotate/instructions.c
+#include <linux/compiler.h>
 #include <sys/types.h>
 #include <regex.h>

@@ -25,7 +26,7 @@ static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const
 	return ops;
 }

-static int arm64__annotate_init(struct arch *arch)
+static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
 {
 	struct arm64_annotate *arm;
 	int err;

--- a/tools/perf/arch/powerpc/annotate/instructions.c
+++ b/tools/perf/arch/powerpc/annotate/instructions.c
+#include <linux/compiler.h>
+
 static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name)
 {
 	int i;
@@ -46,7 +48,7 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con
 	return ops;
 }

-static int powerpc__annotate_init(struct arch *arch)
+static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
 {
 	if (!arch->initialized) {
 		arch->initialized = true;

--- a/tools/perf/arch/s390/annotate/instructions.c
+++ b/tools/perf/arch/s390/annotate/instructions.c
+#include <linux/compiler.h>
+
 static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
 {
 	struct ins_ops *ops = NULL;
@@ -19,7 +21,7 @@ static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *na
 	return ops;
 }

-static int s390__annotate_init(struct arch *arch)
+static int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
 {
 	if (!arch->initialized) {
 		arch->initialized = true;

--- a/tools/perf/arch/x86/annotate/instructions.c
+++ b/tools/perf/arch/x86/annotate/instructions.c
@@ -122,3 +122,17 @@ static int x86__cpuid_parse(struct arch *arch, char *cpuid)

 	return -1;
 }
+
+static int x86__annotate_init(struct arch *arch, char *cpuid)
+{
+	int err = 0;
+
+	if (arch->initialized)
+		return 0;
+
+	if (cpuid)
+		err = x86__cpuid_parse(arch, cpuid);
+
+	arch->initialized = true;
+	return err;
+}
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -1983,7 +1983,8 @@ int cmd_kmem(int argc, const char **argv)

 	if (perf_time__parse_str(&ptime, time_str) != 0) {
 		pr_err("Invalid time string\n");
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out_delete;
 	}

 	if (!strcmp(argv[0], "stat")) {

--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -129,107 +129,12 @@ static int process_synthesized_event(struct perf_tool *tool,
 	return record__write(rec, event, event->header.size);
 }

-static int
-backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
+static int record__pushfn(void *to, void *bf, size_t size)
 {
-	struct perf_event_header *pheader;
-	u64 evt_head = head;
-	int size = mask + 1;
-
-	pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head);
-	pheader = (struct perf_event_header *)(buf + (head & mask));
-	*start = head;
-	while (true) {
-		if (evt_head - head >= (unsigned int)size) {
-			pr_debug("Finished reading backward ring buffer: rewind\n");
-			if (evt_head - head > (unsigned int)size)
-				evt_head -= pheader->size;
-			*end = evt_head;
-			return 0;
-		}
-
-		pheader = (struct perf_event_header *)(buf + (evt_head & mask));
-
-		if (pheader->size == 0) {
-			pr_debug("Finished reading backward ring buffer: get start\n");
-			*end = evt_head;
-			return 0;
-		}
-
-		evt_head += pheader->size;
-		pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
-	}
-	WARN_ONCE(1, "Shouldn't get here\n");
-	return -1;
-}
-
-static int
-rb_find_range(void *data, int mask, u64 head, u64 old,
-	      u64 *start, u64 *end, bool backward)
-{
-	if (!backward) {
-		*start = old;
-		*end = head;
-		return 0;
-	}
-
-	return backward_rb_find_range(data, mask, head, start, end);
-}
-
-static int
-record__mmap_read(struct record *rec, struct perf_mmap *md,
-		  bool overwrite, bool backward)
-{
-	u64 head = perf_mmap__read_head(md);
-	u64 old = md->prev;
-	u64 end = head, start = old;
-	unsigned char *data = md->base + page_size;
-	unsigned long size;
-	void *buf;
-	int rc = 0;
-
-	if (rb_find_range(data, md->mask, head,
-			  old, &start, &end, backward))
-		return -1;
-
-	if (start == end)
-		return 0;
+	struct record *rec = to;

 	rec->samples++;
-
-	size = end - start;
-	if (size > (unsigned long)(md->mask) + 1) {
-		WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
-
-		md->prev = head;
-		perf_mmap__consume(md, overwrite || backward);
-		return 0;
-	}
-
-	if ((start & md->mask) + size != (end & md->mask)) {
-		buf = &data[start & md->mask];
-		size = md->mask + 1 - (start & md->mask);
-		start += size;
-
-		if (record__write(rec, buf, size) < 0) {
-			rc = -1;
-			goto out;
-		}
-	}
-
-	buf = &data[start & md->mask];
-	size = end - start;
-	start += size;
-
-	if (record__write(rec, buf, size) < 0) {
-		rc = -1;
-		goto out;
-	}
-
-	md->prev = head;
-	perf_mmap__consume(md, overwrite || backward);
-out:
-	return rc;
+	return record__write(rec, bf, size);
 }

 static volatile int done;
@@ -576,8 +481,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 		struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap;

 		if (maps[i].base) {
-			if (record__mmap_read(rec, &maps[i],
-					      evlist->overwrite, backward) != 0) {
+			if (perf_mmap__push(&maps[i], evlist->overwrite, backward, rec, record__pushfn) != 0) {
 				rc = -1;
 				goto out;
 			}

--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1828,16 +1828,14 @@ static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evs
 	goto out_put;
 }

-static void bpf_output__printer(enum binary_printer_ops op,
-				unsigned int val, void *extra)
+static int bpf_output__printer(enum binary_printer_ops op,
+			       unsigned int val, void *extra __maybe_unused, FILE *fp)
 {
-	FILE *output = extra;
 	unsigned char ch = (unsigned char)val;

 	switch (op) {
 	case BINARY_PRINT_CHAR_DATA:
-		fprintf(output, "%c", isprint(ch) ? ch : '.');
-		break;
+		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
 	case BINARY_PRINT_DATA_BEGIN:
 	case BINARY_PRINT_LINE_BEGIN:
 	case BINARY_PRINT_ADDR:
@@ -1850,13 +1848,15 @@ static void bpf_output__printer(enum binary_printer_ops op,
 	default:
 		break;
 	}
+
+	return 0;
 }

 static void bpf_output__fprintf(struct trace *trace,
 				struct perf_sample *sample)
 {
-	print_binary(sample->raw_data, sample->raw_size, 8,
-		     bpf_output__printer, trace->output);
+	binary__fprintf(sample->raw_data, sample->raw_size, 8,
+			bpf_output__printer, NULL, trace->output);
 }

 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,

--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -57,6 +57,11 @@ check () {
 }


+# Check if we have the kernel headers (tools/perf/../../include), else
+# we're probably on a detached tarball, so no point in trying to check
+# differences.
+test -d ../../include || exit 0
+
 # simple diff check
 for i in $HEADERS; do
  check $i -B

--- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
+        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,19 +49,19 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-	"MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED ) ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
@@ -79,13 +79,13 @@
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-	"MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED)) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
@@ -97,7 +97,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
+        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,19 +49,19 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-	"MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED ) ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7* ITLB_MISSES.WALK_COMPLETED )) ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
@@ -79,13 +79,13 @@
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED) ) / (2*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7*(DTLB_STORE_MISSES.WALK_COMPLETED+DTLB_LOAD_MISSES.WALK_COMPLETED+ITLB_MISSES.WALK_COMPLETED) ) / (2*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles))",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
@@ -97,7 +97,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/pmu-events/arch/x86/goldmontplus/cache.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/cache.json
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/frontend.json
+[
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line and that cache line is in the ICache (hit).  The event strives to count on a cache line basis, so that multiple accesses which hit in a single cache line count as one ICACHE.HIT.  Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture.",
+        "EventCode": "0x80",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "ICACHE.HIT",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "References per ICache line that are available in the ICache (hit). This event counts differently than Intel processors based on Silvermont microarchitecture"
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts requests to the Instruction Cache (ICache)  for one or more bytes in an ICache Line and that cache line is not in the ICache (miss).  The event strives to count on a cache line basis, so that multiple accesses which miss in a single cache line count as one ICACHE.MISS.  Specifically, the event counts when straight line code crosses the cache line boundary, or when a branch target is to a new line, and that cache line is not in the ICache. This event counts differently than Intel processors based on Silvermont microarchitecture.",
+        "EventCode": "0x80",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "ICACHE.MISSES",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "References per ICache line that are not available in the ICache (miss). This event counts differently than Intel processors based on Silvermont microarchitecture"
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts requests to the Instruction Cache (ICache) for one or more bytes in an ICache Line.  The event strives to count on a cache line basis, so that multiple fetches to a single cache line count as one ICACHE.ACCESS.  Specifically, the event counts when accesses from straight line code crosses the cache line boundary, or when a branch target is to a new line.\r\nThis event counts differently than Intel processors based on Silvermont microarchitecture.",
+        "EventCode": "0x80",
+        "Counter": "0,1,2,3",
+        "UMask": "0x3",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "ICACHE.ACCESSES",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "References per ICache line. This event counts differently than Intel processors based on Silvermont microarchitecture"
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts the number of times the Microcode Sequencer (MS) starts a flow of uops from the MSROM. It does not count every time a uop is read from the MSROM.  The most common case that this counts is when a micro-coded instruction is encountered by the front end of the machine.  Other cases include when an instruction encounters a fault, trap, or microcode assist of any sort that initiates a flow of uops.  The event will count MS startups for uops that are speculative, and subsequently cleared by branch mispredict or a machine clear.",
+        "EventCode": "0xE7",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "MS_DECODED.MS_ENTRY",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "MS decode starts"
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts the number of times the prediction (from the predecode cache) for instruction length is incorrect.",
+        "EventCode": "0xE9",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "DECODE_RESTRICTION.PREDECODE_WRONG",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Decode restrictions due to predicting wrong instruction length"
+    }
+]
\ No newline at end of file
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/memory.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/memory.json
+[
+    {
+        "PEBS": "2",
+        "CollectPEBSRecord": "2",
+        "PublicDescription": "Counts when a memory load of a uop spans a page boundary (a split) is retired.",
+        "EventCode": "0x13",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "MISALIGN_MEM_REF.LOAD_PAGE_SPLIT",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Load uops that split a page (Precise event capable)"
+    },
+    {
+        "PEBS": "2",
+        "CollectPEBSRecord": "2",
+        "PublicDescription": "Counts when a memory store of a uop spans a page boundary (a split) is retired.",
+        "EventCode": "0x13",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "MISALIGN_MEM_REF.STORE_PAGE_SPLIT",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Store uops that split a page (Precise event capable)"
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts machine clears due to memory ordering issues.  This occurs when a snoop request happens and the machine is uncertain if memory ordering will be preserved - as another core is in the process of modifying the data.",
+        "EventCode": "0xC3",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "20003",
+        "BriefDescription": "Machine clears due to memory ordering issue"
+    }
+]
\ No newline at end of file
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/other.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/other.json
+[
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts cycles that fetch is stalled due to any reason. That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes.  This will include cycles due to an ITLB miss, ICache miss and other events.",
+        "EventCode": "0x86",
+        "Counter": "0,1,2,3",
+        "UMask": "0x0",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "FETCH_STALL.ALL",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Cycles code-fetch stalled due to any reason."
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts cycles that fetch is stalled due to an outstanding ITLB miss. That is, the decoder queue is able to accept bytes, but the fetch unit is unable to provide bytes due to an ITLB miss.  Note: this event is not the same as page walk cycles to retrieve an instruction translation.",
+        "EventCode": "0x86",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "FETCH_STALL.ITLB_FILL_PENDING_CYCLES",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Cycles the code-fetch stalls and an ITLB miss is outstanding."
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts the number of issue slots per core cycle that were not consumed by the backend due to either a full resource  in the backend (RESOURCE_FULL) or due to the processor recovering from some event (RECOVERY).",
+        "EventCode": "0xCA",
+        "Counter": "0,1,2,3",
+        "UMask": "0x0",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "ISSUE_SLOTS_NOT_CONSUMED.ANY",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Unfilled issue slots per cycle"
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts the number of issue slots per core cycle that were not consumed because of a full resource in the backend.  Including but not limited to resources such as the Re-order Buffer (ROB), reservation stations (RS), load/store buffers, physical registers, or any other needed machine resource that is currently unavailable.   Note that uops must be available for consumption in order for this event to fire.  If a uop is not available (Instruction Queue is empty), this event will not count.",
+        "EventCode": "0xCA",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "ISSUE_SLOTS_NOT_CONSUMED.RESOURCE_FULL",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Unfilled issue slots per cycle because of a full resource in the backend"
+    },
+    {
+        "CollectPEBSRecord": "1",
+        "PublicDescription": "Counts the number of issue slots per core cycle that were not consumed by the backend because allocation is stalled waiting for a mispredicted jump to retire or other branch-like conditions (e.g. the event is relevant during certain microcode flows).   Counts all issue slots blocked while within this window including slots where uops were not available in the Instruction Queue.",
+        "EventCode": "0xCA",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "ISSUE_SLOTS_NOT_CONSUMED.RECOVERY",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Unfilled issue slots per cycle to recover"
+    },
+    {
+        "CollectPEBSRecord": "2",
+        "PublicDescription": "Counts hardware interrupts received by the processor.",
+        "EventCode": "0xCB",
+        "Counter": "0,1,2,3",
+        "UMask": "0x1",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "HW_INTERRUPTS.RECEIVED",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "203",
+        "BriefDescription": "Hardware interrupts received"
+    },
+    {
+        "CollectPEBSRecord": "2",
+        "PublicDescription": "Counts the number of core cycles during which interrupts are masked (disabled). Increments by 1 each core cycle that EFLAGS.IF is 0, regardless of whether interrupts are pending or not.",
+        "EventCode": "0xCB",
+        "Counter": "0,1,2,3",
+        "UMask": "0x2",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "HW_INTERRUPTS.MASKED",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Cycles hardware interrupts are masked"
+    },
+    {
+        "CollectPEBSRecord": "2",
+        "PublicDescription": "Counts core cycles during which there are pending interrupts, but interrupts are masked (EFLAGS.IF = 0).",
+        "EventCode": "0xCB",
+        "Counter": "0,1,2,3",
+        "UMask": "0x4",
+        "PEBScounters": "0,1,2,3",
+        "EventName": "HW_INTERRUPTS.PENDING_AND_MASKED",
+        "PDIR_COUNTER": "na",
+        "SampleAfterValue": "200003",
+        "BriefDescription": "Cycles pending interrupts are masked"
+    }
+]
\ No newline at end of file
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/pipeline.json
--- a/tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/goldmontplus/virtual-memory.json
--- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
+        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,19 +49,19 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-	"MetricExpr": "( UOPS_EXECUTED.CORE / 2 / ( cpu@uops_executed.core\\,cmask\\=1@ / 2)) if #SMT_on else (UOPS_EXECUTED.CORE / cpu@uops_executed.core\\,cmask\\=1@)",
+        "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - ( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION ) ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION )) ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
@@ -79,13 +79,13 @@
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },

--- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
+        "MetricExpr": "min( 1 , IDQ.MITE_UOPS / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 16 * ( ICACHE.HIT + ICACHE.MISSES ) / 4.0 ) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,19 +49,19 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-	"MetricExpr": "( UOPS_EXECUTED.CORE / 2 / ( cpu@uops_executed.core\\,cmask\\=1@ / 2)) if #SMT_on else UOPS_EXECUTED.CORE / cpu@uops_executed.core\\,cmask\\=1@",
+        "MetricExpr": "( UOPS_EXECUTED.CORE / 2 / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) ) if #SMT_on else UOPS_EXECUTED.CORE / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - ( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION ) ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFDATA_STALL  - (( 14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION )) ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
@@ -79,13 +79,13 @@
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },

--- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,19 +49,19 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
@@ -79,13 +79,13 @@
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
@@ -97,7 +97,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,19 +49,19 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / ( cpu@uops_executed.core\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE.IFETCH_STALL ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
@@ -79,13 +79,13 @@
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( cpu@l1d_pend_miss.pending_cycles\\,any\\=1@ / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION ) / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
@@ -97,7 +97,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,13 +49,13 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-	"MetricExpr": "UOPS_DISPATCHED.THREAD / ( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@",
+        "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
@@ -73,7 +73,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -9,6 +9,7 @@ GenuineIntel-6-27,v4,bonnell,core
 GenuineIntel-6-36,v4,bonnell,core
 GenuineIntel-6-35,v4,bonnell,core
 GenuineIntel-6-5C,v8,goldmont,core
+GenuineIntel-6-7A,v1,goldmontplus,core
 GenuineIntel-6-3C,v24,haswell,core
 GenuineIntel-6-45,v24,haswell,core
 GenuineIntel-6-46,v24,haswell,core

--- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ( (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 32 * ( ICACHE.HIT + ICACHE.MISSES ) / 4) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,13 +49,13 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-	"MetricExpr": "UOPS_DISPATCHED.THREAD / ( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@",
+        "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
@@ -73,7 +73,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE ) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2* FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4*( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8* SIMD_FP_256.PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
@@ -13,7 +13,7 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )",
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ((UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
@@ -25,7 +25,7 @@
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -37,7 +37,7 @@
    },
    {
        "BriefDescription": "Total issue-pipeline slots",
-        "MetricExpr": "4*( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,19 +49,19 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / ( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "2* ( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL  - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL  - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
@@ -73,19 +73,19 @@
    },
    {
        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads",
-	"MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS_PS + MEM_LOAD_RETIRED.FB_HIT_PS )",
        "MetricGroup": "Memory_Bound;Memory_Lat",
        "MetricName": "Load_Miss_Real_Latency"
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles )",
+        "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles) )",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
@@ -97,7 +97,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
@@ -13,19 +13,19 @@
    },
    {
        "BriefDescription": "Rough Estimation of fraction of fetched lines bytes that were likely consumed by program instructions",
-        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / (UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )",
+        "MetricExpr": "min( 1 , UOPS_ISSUED.ANY / ((UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY) * 64 * ( ICACHE_64B.IFTAG_HIT + ICACHE_64B.IFTAG_MISS ) / 4.1) )",
        "MetricGroup": "Frontend",
        "MetricName": "IFetch_Line_Utilization"
    },
    {
-        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
+        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)",
        "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
        "MetricGroup": "DSB; Frontend_Bandwidth",
        "MetricName": "DSB_Coverage"
    },
    {
        "BriefDescription": "Cycles Per Instruction (threaded)",
-        "MetricExpr": "1 / INST_RETIRED.ANY / cycles",
+        "MetricExpr": "1 / (INST_RETIRED.ANY / cycles)",
        "MetricGroup": "Pipeline;Summary",
        "MetricName": "CPI"
    },
@@ -36,8 +36,8 @@
        "MetricName": "CLKS"
    },
    {
-        "BriefDescription": "Total issue-pipeline slots (per-core)",
-        "MetricExpr": "4*cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )",
+        "BriefDescription": "Total issue-pipeline slots",
+        "MetricExpr": "4*(( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "TopDownL1",
        "MetricName": "SLOTS"
    },
@@ -49,25 +49,25 @@
    },
    {
        "BriefDescription": "Instructions Per Cycle (per physical core)",
-        "MetricExpr": "INST_RETIRED.ANY / cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )",
+        "MetricExpr": "INST_RETIRED.ANY / (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles)",
        "MetricGroup": "SMT",
        "MetricName": "CoreIPC"
    },
    {
        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / ( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
        "MetricGroup": "Pipeline;Ports_Utilization",
        "MetricName": "ILP"
    },
    {
        "BriefDescription": "Average Branch Address Clear Cost (fraction of cycles)",
-        "MetricExpr": "( RS_EVENTS.EMPTY_CYCLES - (ICACHE_16B.IFDATA_STALL +2* ICACHE_16B.IFDATA_STALL:c1:e1) - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END",
+        "MetricExpr": "2* (( RS_EVENTS.EMPTY_CYCLES - ICACHE_16B.IFDATA_STALL  - ICACHE_64B.IFTAG_STALL ) / RS_EVENTS.EMPTY_END)",
        "MetricGroup": "Unknown_Branches",
        "MetricName": "BAClear_Cost"
    },
    {
        "BriefDescription": "Core actual clocks when any thread is active on the physical core",
-        "MetricExpr": "CPU_CLK_UNHALTED.THREAD if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if 1 else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 )",
+        "MetricExpr": "( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD",
        "MetricGroup": "SMT",
        "MetricName": "CORE_CLKS"
    },
@@ -79,34 +79,16 @@
    },
    {
        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / ( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (( L1D_PEND_MISS.PENDING_CYCLES_ANY / 2) if #SMT_on else L1D_PEND_MISS.PENDING_CYCLES)",
        "MetricGroup": "Memory_Bound;Memory_BW",
        "MetricName": "MLP"
    },
    {
        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
-        "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles if not #SMT_on else (( CPU_CLK_UNHALTED.THREAD / 2) * (CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK )) if #EBS_Mode else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) )",
+        "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * (( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else cycles) )",
        "MetricGroup": "TLB",
        "MetricName": "Page_Walks_Utilization"
    },
-    {
-        "BriefDescription": "L1 cache miss per kilo instruction for demand loads",
-        "MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS_PS / INST_RETIRED.ANY",
-        "MetricGroup": "Cache_Misses;",
-        "MetricName": "L1MPKI"
-    },
-    {
-        "BriefDescription": "L2 cache miss per kilo instruction for demand loads",
-        "MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS_PS / INST_RETIRED.ANY",
-        "MetricGroup": "Cache_Misses;",
-        "MetricName": "L2MPKI"
-    },
-    {
-        "BriefDescription": "L3 cache miss per kilo instruction for demand loads",
-        "MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS_PS / INST_RETIRED.ANY",
-        "MetricGroup": "Cache_Misses;",
-        "MetricName": "L3MPKI"
-    },
    {
        "BriefDescription": "Average CPU Utilization",
        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
@@ -115,7 +97,7 @@
    },
    {
        "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16* FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1000000000 / duration_time",
+        "MetricExpr": "(( 1*( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2* FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4*( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8* FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )) / 1000000000 / duration_time",
        "MetricGroup": "FLOPS;Summary",
        "MetricName": "GFLOPs"
    },

--- a/tools/perf/tests/attr/test-stat-C0
+++ b/tools/perf/tests/attr/test-stat-C0
@@ -7,3 +7,4 @@ ret     = 1
 # events are disabled by default when attached to cpu
 disabled=1
 enable_on_exec=0
+optional=1
--- a/tools/perf/tests/attr/test-stat-basic
+++ b/tools/perf/tests/attr/test-stat-basic
@@ -4,3 +4,4 @@ args    = -e cycles kill >/dev/null 2>&1
 ret     = 1

 [event:base-stat]
+optional=1
--- a/tools/perf/tests/attr/test-stat-default
+++ b/tools/perf/tests/attr/test-stat-default
@@ -32,6 +32,7 @@ config=2
 fd=5
 type=0
 config=0
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
 [event6:base-stat]
@@ -52,15 +53,18 @@ optional=1
 fd=8
 type=0
 config=1
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS
 [event9:base-stat]
 fd=9
 type=0
 config=4
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES
 [event10:base-stat]
 fd=10
 type=0
 config=5
+optional=1
--- a/tools/perf/tests/attr/test-stat-detailed-1
+++ b/tools/perf/tests/attr/test-stat-detailed-1
@@ -33,6 +33,7 @@ config=2
 fd=5
 type=0
 config=0
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
 [event6:base-stat]
@@ -53,18 +54,21 @@ optional=1
 fd=8
 type=0
 config=1
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS
 [event9:base-stat]
 fd=9
 type=0
 config=4
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES
 [event10:base-stat]
 fd=10
 type=0
 config=5
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
@@ -74,6 +78,7 @@ config=5
 fd=11
 type=3
 config=0
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
@@ -83,6 +88,7 @@ config=0
 fd=12
 type=3
 config=65536
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
@@ -92,6 +98,7 @@ config=65536
 fd=13
 type=3
 config=2
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
@@ -101,3 +108,4 @@ config=2
 fd=14
 type=3
 config=65538
+optional=1
--- a/tools/perf/tests/attr/test-stat-detailed-2
+++ b/tools/perf/tests/attr/test-stat-detailed-2
@@ -33,6 +33,7 @@ config=2
 fd=5
 type=0
 config=0
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
 [event6:base-stat]
@@ -53,18 +54,21 @@ optional=1
 fd=8
 type=0
 config=1
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS
 [event9:base-stat]
 fd=9
 type=0
 config=4
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES
 [event10:base-stat]
 fd=10
 type=0
 config=5
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
@@ -74,6 +78,7 @@ config=5
 fd=11
 type=3
 config=0
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
@@ -83,6 +88,7 @@ config=0
 fd=12
 type=3
 config=65536
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
@@ -92,6 +98,7 @@ config=65536
 fd=13
 type=3
 config=2
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
@@ -101,6 +108,7 @@ config=2
 fd=14
 type=3
 config=65538
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
@@ -120,6 +128,7 @@ optional=1
 fd=16
 type=3
 config=65537
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
@@ -129,6 +138,7 @@ config=65537
 fd=17
 type=3
 config=3
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
@@ -138,6 +148,7 @@ config=3
 fd=18
 type=3
 config=65539
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
@@ -147,6 +158,7 @@ config=65539
 fd=19
 type=3
 config=4
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
@@ -156,3 +168,4 @@ config=4
 fd=20
 type=3
 config=65540
+optional=1
--- a/tools/perf/tests/attr/test-stat-detailed-3
+++ b/tools/perf/tests/attr/test-stat-detailed-3
@@ -33,6 +33,7 @@ config=2
 fd=5
 type=0
 config=0
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
 [event6:base-stat]
@@ -53,18 +54,21 @@ optional=1
 fd=8
 type=0
 config=1
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_INSTRUCTIONS
 [event9:base-stat]
 fd=9
 type=0
 config=4
+optional=1

 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_BRANCH_MISSES
 [event10:base-stat]
 fd=10
 type=0
 config=5
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
@@ -74,6 +78,7 @@ config=5
 fd=11
 type=3
 config=0
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
@@ -83,6 +88,7 @@ config=0
 fd=12
 type=3
 config=65536
+optional=1

 # PERF_TYPE_HW_CACHE /
 #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
@@ -92,6 +98,7 @@ config=65536
 fd=13
 type=3
 config=2
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
@@ -101,6 +108,7 @@ config=2
 fd=14
 type=3
 config=65538
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
@@ -120,6 +128,7 @@ optional=1
 fd=16
 type=3
 config=65537
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
@@ -129,6 +138,7 @@ config=65537
 fd=17
 type=3
 config=3
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
@@ -138,6 +148,7 @@ config=3
 fd=18
 type=3
 config=65539
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
@@ -147,6 +158,7 @@ config=65539
 fd=19
 type=3
 config=4
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
@@ -156,6 +168,7 @@ config=4
 fd=20
 type=3
 config=65540
+optional=1

 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |

--- a/tools/perf/tests/attr/test-stat-no-inherit
+++ b/tools/perf/tests/attr/test-stat-no-inherit
@@ -5,3 +5,4 @@ ret     = 1

 [event:base-stat]
 inherit=0
+optional=1
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -13,6 +13,7 @@ libperf-y += find_bit.o
 libperf-y += kallsyms.o
 libperf-y += levenshtein.o
 libperf-y += llvm-utils.o
+libperf-y += mmap.o
 libperf-y += memswap.o
 libperf-y += parse-events.o
 libperf-y += perf_regs.o

--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -49,10 +49,9 @@ struct arch {
 	void		*priv;
 	unsigned int	model;
 	unsigned int	family;
-	int		(*init)(struct arch *arch);
+	int		(*init)(struct arch *arch, char *cpuid);
 	bool		(*ins_is_fused)(struct arch *arch, const char *ins1,
 					const char *ins2);
-	int		(*cpuid_parse)(struct arch *arch, char *cpuid);
 	struct		{
 		char comment_char;
 		char skip_functions_char;
@@ -132,10 +131,10 @@ static struct arch architectures[] = {
 	},
 	{
 		.name = "x86",
+		.init = x86__annotate_init,
 		.instructions = x86__instructions,
 		.nr_instructions = ARRAY_SIZE(x86__instructions),
 		.ins_is_fused = x86__ins_is_fused,
-		.cpuid_parse = x86__cpuid_parse,
 		.objdump =  {
 			.comment_char = '#',
 		},
@@ -1447,16 +1446,13 @@ int symbol__disassemble(struct symbol *sym, struct map *map,
 		*parch = arch;

 	if (arch->init) {
-		err = arch->init(arch);
+		err = arch->init(arch, cpuid);
 		if (err) {
 			pr_err("%s: failed to initialize %s arch priv area\n", __func__, arch->name);
 			return err;
 		}
 	}

-	if (arch->cpuid_parse && cpuid)
-		arch->cpuid_parse(arch, cpuid);
-
 	pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
 		 symfs_filename, sym->name, map->unmap_ip(map, sym->start),
 		 map->unmap_ip(map, sym->end));

--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -111,50 +111,53 @@ int dump_printf(const char *fmt, ...)
 	return ret;
 }

-static void trace_event_printer(enum binary_printer_ops op,
-				unsigned int val, void *extra)
+static int trace_event_printer(enum binary_printer_ops op,
+			       unsigned int val, void *extra, FILE *fp)
 {
 	const char *color = PERF_COLOR_BLUE;
 	union perf_event *event = (union perf_event *)extra;
 	unsigned char ch = (unsigned char)val;
+	int printed = 0;

 	switch (op) {
 	case BINARY_PRINT_DATA_BEGIN:
-		printf(".");
-		color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
+		printed += fprintf(fp, ".");
+		printed += color_fprintf(fp, color, "\n. ... raw event: size %d bytes\n",
 					 event->header.size);
 		break;
 	case BINARY_PRINT_LINE_BEGIN:
-		printf(".");
+		printed += fprintf(fp, ".");
 		break;
 	case BINARY_PRINT_ADDR:
-		color_fprintf(stdout, color, "  %04x: ", val);
+		printed += color_fprintf(fp, color, "  %04x: ", val);
 		break;
 	case BINARY_PRINT_NUM_DATA:
-		color_fprintf(stdout, color, " %02x", val);
+		printed += color_fprintf(fp, color, " %02x", val);
 		break;
 	case BINARY_PRINT_NUM_PAD:
-		color_fprintf(stdout, color, "   ");
+		printed += color_fprintf(fp, color, "   ");
 		break;
 	case BINARY_PRINT_SEP:
-		color_fprintf(stdout, color, "  ");
+		printed += color_fprintf(fp, color, "  ");
 		break;
 	case BINARY_PRINT_CHAR_DATA:
-		color_fprintf(stdout, color, "%c",
+		printed += color_fprintf(fp, color, "%c",
 			      isprint(ch) ? ch : '.');
 		break;
 	case BINARY_PRINT_CHAR_PAD:
-		color_fprintf(stdout, color, " ");
+		printed += color_fprintf(fp, color, " ");
 		break;
 	case BINARY_PRINT_LINE_END:
-		color_fprintf(stdout, color, "\n");
+		printed += color_fprintf(fp, color, "\n");
 		break;
 	case BINARY_PRINT_DATA_END:
-		printf("\n");
+		printed += fprintf(fp, "\n");
 		break;
 	default:
 		break;
 	}
+
+	return printed;
 }

 void trace_event(union perf_event *event)

--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -11,8 +11,8 @@
 #include "../perf.h"
 #include "event.h"
 #include "evsel.h"
+#include "mmap.h"
 #include "util.h"
-#include "auxtrace.h"
 #include <signal.h>
 #include <unistd.h>

@@ -24,55 +24,6 @@ struct record_opts;
 #define PERF_EVLIST__HLIST_BITS 8
 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)

-/**
- * struct perf_mmap - perf's ring buffer mmap details
- *
- * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this
- */
-struct perf_mmap {
-	void		 *base;
-	int		 mask;
-	int		 fd;
-	refcount_t	 refcnt;
-	u64		 prev;
-	struct auxtrace_mmap auxtrace_mmap;
-	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
-};
-
-static inline size_t
-perf_mmap__mmap_len(struct perf_mmap *map)
-{
-	return map->mask + 1 + page_size;
-}
-
-/*
- * State machine of bkw_mmap_state:
- *
- *                     .________________(forbid)_____________.
- *                     |                                     V
- * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY
- *                     ^  ^              |   ^               |
- *                     |  |__(forbid)____/   |___(forbid)___/|
- *                     |                                     |
- *                      \_________________(3)_______________/
- *
- * NOTREADY     : Backward ring buffers are not ready
- * RUNNING      : Backward ring buffers are recording
- * DATA_PENDING : We are required to collect data from backward ring buffers
- * EMPTY        : We have collected data from backward ring buffers.
- *
- * (0): Setup backward ring buffer
- * (1): Pause ring buffers for reading
- * (2): Read from ring buffers
- * (3): Resume ring buffers for recording
- */
-enum bkw_mmap_state {
-	BKW_MMAP_NOTREADY,
-	BKW_MMAP_RUNNING,
-	BKW_MMAP_DATA_PENDING,
-	BKW_MMAP_EMPTY,
-};
-
 struct perf_evlist {
 	struct list_head entries;
 	struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
@@ -177,12 +128,6 @@ struct perf_sample_id *perf_evlist__id2sid(struct perf_evlist *evlist, u64 id);

 void perf_evlist__toggle_bkw_mmap(struct perf_evlist *evlist, enum bkw_mmap_state state);

-union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup);
-union perf_event *perf_mmap__read_backward(struct perf_mmap *map);
-
-void perf_mmap__read_catchup(struct perf_mmap *md);
-void perf_mmap__consume(struct perf_mmap *md, bool overwrite);
-
 union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx);

 union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist,
@@ -286,25 +231,6 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp);
 int perf_evlist__strerror_open(struct perf_evlist *evlist, int err, char *buf, size_t size);
 int perf_evlist__strerror_mmap(struct perf_evlist *evlist, int err, char *buf, size_t size);

-static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
-{
-	struct perf_event_mmap_page *pc = mm->base;
-	u64 head = ACCESS_ONCE(pc->data_head);
-	rmb();
-	return head;
-}
-
-static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
-{
-	struct perf_event_mmap_page *pc = md->base;
-
-	/*
-	 * ensure all reads are done before we write the tail out.
-	 */
-	mb();
-	pc->data_tail = tail;
-}
-
 bool perf_evlist__can_select_event(struct perf_evlist *evlist, const char *str);
 void perf_evlist__to_front(struct perf_evlist *evlist,
 			   struct perf_evsel *move_evsel);

--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
+#ifndef __PERF_MMAP_H
+#define __PERF_MMAP_H 1
+
+#include <linux/compiler.h>
+#include <linux/refcount.h>
+#include <linux/types.h>
+#include <asm/barrier.h>
+#include <stdbool.h>
+#include "auxtrace.h"
+#include "event.h"
+
+/**
+ * struct perf_mmap - perf's ring buffer mmap details
+ *
+ * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this
+ */
+struct perf_mmap {
+	void		 *base;
+	int		 mask;
+	int		 fd;
+	refcount_t	 refcnt;
+	u64		 prev;
+	struct auxtrace_mmap auxtrace_mmap;
+	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
+};
+
+/*
+ * State machine of bkw_mmap_state:
+ *
+ *                     .________________(forbid)_____________.
+ *                     |                                     V
+ * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY
+ *                     ^  ^              |   ^               |
+ *                     |  |__(forbid)____/   |___(forbid)___/|
+ *                     |                                     |
+ *                      \_________________(3)_______________/
+ *
+ * NOTREADY     : Backward ring buffers are not ready
+ * RUNNING      : Backward ring buffers are recording
+ * DATA_PENDING : We are required to collect data from backward ring buffers
+ * EMPTY        : We have collected data from backward ring buffers.
+ *
+ * (0): Setup backward ring buffer
+ * (1): Pause ring buffers for reading
+ * (2): Read from ring buffers
+ * (3): Resume ring buffers for recording
+ */
+enum bkw_mmap_state {
+	BKW_MMAP_NOTREADY,
+	BKW_MMAP_RUNNING,
+	BKW_MMAP_DATA_PENDING,
+	BKW_MMAP_EMPTY,
+};
+
+struct mmap_params {
+	int			    prot, mask;
+	struct auxtrace_mmap_params auxtrace_mp;
+};
+
+int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd);
+void perf_mmap__munmap(struct perf_mmap *map);
+
+void perf_mmap__get(struct perf_mmap *map);
+void perf_mmap__put(struct perf_mmap *map);
+
+void perf_mmap__consume(struct perf_mmap *map, bool overwrite);
+
+void perf_mmap__read_catchup(struct perf_mmap *md);
+
+static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
+{
+	struct perf_event_mmap_page *pc = mm->base;
+	u64 head = ACCESS_ONCE(pc->data_head);
+	rmb();
+	return head;
+}
+
+static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
+{
+	struct perf_event_mmap_page *pc = md->base;
+
+	/*
+	 * ensure all reads are done before we write the tail out.
+	 */
+	mb();
+	pc->data_tail = tail;
+}
+
+union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup);
+union perf_event *perf_mmap__read_backward(struct perf_mmap *map);
+
+int perf_mmap__push(struct perf_mmap *md, bool overwrite, bool backward,
+		    void *to, int push(void *to, void *buf, size_t size));
+
+size_t perf_mmap__mmap_len(struct perf_mmap *map);
+
+#endif /*__PERF_MMAP_H */
--- a/tools/perf/util/namespaces.h
+++ b/tools/perf/util/namespaces.h
@@ -9,9 +9,10 @@
 #ifndef __PERF_NAMESPACES_H
 #define __PERF_NAMESPACES_H

-#include "../perf.h"
-#include <linux/list.h>
+#include <sys/types.h>
+#include <linux/perf_event.h>
 #include <linux/refcount.h>
+#include <linux/types.h>

 struct namespaces_event;


--- a/tools/perf/util/print_binary.c
+++ b/tools/perf/util/print_binary.c
--- a/tools/perf/util/print_binary.h
+++ b/tools/perf/util/print_binary.h
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources