Commit a193cc75 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'perf-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

 - Rework & fix the event forwarding logic by extending the core
   interface.

   This fixes AMD PMU events that have to be forwarded from the
   core PMU to the IBS PMU.

 - Add self-tests to test AMD IBS invocation via core PMU events

 - Clean up Intel FixCntrCtl MSR encoding & handling

* tag 'perf-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf: Re-instate the linear PMU search
  perf/x86/intel: Define bit macros for FixCntrCtl MSR
  perf test: Add selftest to test IBS invocation via core pmu events
  perf/core: Remove pmu linear searching code
  perf/ibs: Fix interface via core pmu events
  perf/core: Rework forwarding of {task|cpu}-clock events
parents bc6cb4d5 228020b4
......@@ -374,7 +374,7 @@ static int amd_pmu_hw_config(struct perf_event *event)
/* pass precise event sampling to ibs: */
if (event->attr.precise_ip && get_ibs_caps())
return -ENOENT;
return forward_event_to_ibs(event);
if (has_branch_stack(event) && !x86_pmu.lbr_nr)
return -EOPNOTSUPP;
......
......@@ -190,7 +190,7 @@ static struct perf_ibs *get_ibs_pmu(int type)
}
/*
* Use IBS for precise event sampling:
* core pmu config -> IBS config
*
* perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
* perf record -a -e r076:p ... # same as -e cpu-cycles:p
......@@ -199,25 +199,9 @@ static struct perf_ibs *get_ibs_pmu(int type)
* IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
* MSRC001_1033) is used to select either cycle or micro-ops counting
* mode.
*
* The rip of IBS samples has skid 0. Thus, IBS supports precise
* levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
* rip is invalid when IBS was not able to record the rip correctly.
* We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
*
*/
static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
{
switch (event->attr.precise_ip) {
case 0:
return -ENOENT;
case 1:
case 2:
break;
default:
return -EOPNOTSUPP;
}
switch (event->attr.type) {
case PERF_TYPE_HARDWARE:
switch (event->attr.config) {
......@@ -243,22 +227,37 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
return -EOPNOTSUPP;
}
/*
* The rip of IBS samples has skid 0. Thus, IBS supports precise
* levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
* rip is invalid when IBS was not able to record the rip correctly.
* We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
*/
int forward_event_to_ibs(struct perf_event *event)
{
u64 config = 0;
if (!event->attr.precise_ip || event->attr.precise_ip > 2)
return -EOPNOTSUPP;
if (!core_pmu_ibs_config(event, &config)) {
event->attr.type = perf_ibs_op.pmu.type;
event->attr.config = config;
}
return -ENOENT;
}
static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs;
u64 max_cnt, config;
int ret;
perf_ibs = get_ibs_pmu(event->attr.type);
if (perf_ibs) {
config = event->attr.config;
} else {
perf_ibs = &perf_ibs_op;
ret = perf_ibs_precise_event(event, &config);
if (ret)
return ret;
}
if (!perf_ibs)
return -ENOENT;
config = event->attr.config;
if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
......
......@@ -2461,7 +2461,7 @@ static void intel_pmu_disable_fixed(struct perf_event *event)
intel_clear_masks(event, idx);
mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4);
mask = intel_fixed_bits_by_idx(idx - INTEL_PMC_IDX_FIXED, INTEL_FIXED_BITS_MASK);
cpuc->fixed_ctrl_val &= ~mask;
}
......@@ -2760,25 +2760,25 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
* if requested:
*/
if (!event->attr.precise_ip)
bits |= 0x8;
bits |= INTEL_FIXED_0_ENABLE_PMI;
if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
bits |= 0x2;
bits |= INTEL_FIXED_0_USER;
if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
bits |= 0x1;
bits |= INTEL_FIXED_0_KERNEL;
/*
* ANY bit is supported in v3 and up
*/
if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
bits |= 0x4;
bits |= INTEL_FIXED_0_ANYTHREAD;
idx -= INTEL_PMC_IDX_FIXED;
bits <<= (idx * 4);
mask = 0xfULL << (idx * 4);
bits = intel_fixed_bits_by_idx(idx, bits);
mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
}
cpuc->fixed_ctrl_val &= ~mask;
......
......@@ -32,11 +32,21 @@
#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
#define INTEL_FIXED_BITS_MASK 0xFULL
#define INTEL_FIXED_BITS_STRIDE 4
#define INTEL_FIXED_0_KERNEL (1ULL << 0)
#define INTEL_FIXED_0_USER (1ULL << 1)
#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2)
#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3)
#define HSW_IN_TX (1ULL << 32)
#define HSW_IN_TX_CHECKPOINTED (1ULL << 33)
#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34)
#define ICL_FIXED_0_ADAPTIVE (1ULL << 32)
#define intel_fixed_bits_by_idx(_idx, _bits) \
((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
#define AMD64_EVENTSEL_GUESTONLY (1ULL << 40)
#define AMD64_EVENTSEL_HOSTONLY (1ULL << 41)
......@@ -478,8 +488,10 @@ struct pebs_xmm {
#ifdef CONFIG_X86_LOCAL_APIC
extern u32 get_ibs_caps(void);
extern int forward_event_to_ibs(struct perf_event *event);
#else
static inline u32 get_ibs_caps(void) { return 0; }
static inline int forward_event_to_ibs(struct perf_event *event) { return -ENOENT; }
#endif
#ifdef CONFIG_PERF_EVENTS
......
......@@ -295,6 +295,8 @@ struct perf_event_pmu_context;
struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL))
/**
* struct pmu - generic performance monitoring unit
*/
......@@ -827,6 +829,14 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
/*
* Certain events gets forwarded to another pmu internally by over-
* writing kernel copy of event->attr.type without user being aware
* of it. event->orig_type contains original 'type' requested by
* user.
*/
__u32 orig_type;
#endif /* CONFIG_PERF_EVENTS */
};
......
......@@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event)
return;
send_sig_perf((void __user *)event->pending_addr,
event->attr.type, event->attr.sig_data);
event->orig_type, event->attr.sig_data);
}
/*
......@@ -9951,6 +9951,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
swevent_hlist_put();
}
static struct pmu perf_cpu_clock; /* fwd declaration */
static struct pmu perf_task_clock;
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
......@@ -9966,7 +9969,10 @@ static int perf_swevent_init(struct perf_event *event)
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
event->attr.type = perf_cpu_clock.type;
return -ENOENT;
case PERF_COUNT_SW_TASK_CLOCK:
event->attr.type = perf_task_clock.type;
return -ENOENT;
default:
......@@ -11098,7 +11104,7 @@ static void cpu_clock_event_read(struct perf_event *event)
static int cpu_clock_event_init(struct perf_event *event)
{
if (event->attr.type != PERF_TYPE_SOFTWARE)
if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
......@@ -11119,6 +11125,7 @@ static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.dev = PMU_NULL_DEV,
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
......@@ -11179,7 +11186,7 @@ static void task_clock_event_read(struct perf_event *event)
static int task_clock_event_init(struct perf_event *event)
{
if (event->attr.type != PERF_TYPE_SOFTWARE)
if (event->attr.type != perf_task_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
......@@ -11200,6 +11207,7 @@ static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.dev = PMU_NULL_DEV,
.event_init = task_clock_event_init,
.add = task_clock_event_add,
......@@ -11427,31 +11435,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto unlock;
pmu->type = -1;
if (!name)
goto skip_type;
if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
ret = -EINVAL;
goto free_pdc;
}
pmu->name = name;
if (type != PERF_TYPE_SOFTWARE) {
if (type >= 0)
max = type;
if (type >= 0)
max = type;
ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
if (ret < 0)
goto free_pdc;
ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
if (ret < 0)
goto free_pdc;
WARN_ON(type >= 0 && ret != type);
WARN_ON(type >= 0 && ret != type);
type = ret;
}
type = ret;
pmu->type = type;
if (pmu_bus_running) {
if (pmu_bus_running && !pmu->dev) {
ret = pmu_dev_alloc(pmu);
if (ret)
goto free_idr;
}
skip_type:
ret = -ENOMEM;
pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
if (!pmu->cpu_pmu_context)
......@@ -11493,16 +11501,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
/*
* Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
* since these cannot be in the IDR. This way the linear search
* is fast, provided a valid software event is provided.
*/
if (type == PERF_TYPE_SOFTWARE || !name)
list_add_rcu(&pmu->entry, &pmus);
else
list_add_tail_rcu(&pmu->entry, &pmus);
list_add_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
......@@ -11511,12 +11510,13 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
return ret;
free_dev:
device_del(pmu->dev);
put_device(pmu->dev);
if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
device_del(pmu->dev);
put_device(pmu->dev);
}
free_idr:
if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
idr_remove(&pmu_idr, pmu->type);
free_pdc:
free_percpu(pmu->pmu_disable_count);
......@@ -11537,9 +11537,8 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
if (pmu_bus_running) {
idr_remove(&pmu_idr, pmu->type);
if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
if (pmu->nr_addr_filters)
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
......@@ -11613,6 +11612,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
idx = srcu_read_lock(&pmus_srcu);
/*
* Save original type before calling pmu->event_init() since certain
* pmus overwrites event->attr.type to forward event to another pmu.
*/
event->orig_type = event->attr.type;
/* Try parent's PMU first: */
if (event->parent && event->parent->pmu) {
pmu = event->parent->pmu;
......@@ -13652,8 +13657,8 @@ void __init perf_event_init(void)
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
perf_pmu_register(&perf_cpu_clock, NULL, -1);
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
perf_pmu_register(&perf_task_clock, "task_clock", -1);
perf_tp_register();
perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
......@@ -13696,7 +13701,7 @@ static int __init perf_event_sysfs_init(void)
goto unlock;
list_for_each_entry(pmu, &pmus, entry) {
if (!pmu->name || pmu->type < 0)
if (pmu->dev)
continue;
ret = pmu_dev_alloc(pmu);
......
......@@ -11,6 +11,7 @@ int test__intel_pt_pkt_decoder(struct test_suite *test, int subtest);
int test__intel_pt_hybrid_compat(struct test_suite *test, int subtest);
int test__bp_modify(struct test_suite *test, int subtest);
int test__x86_sample_parsing(struct test_suite *test, int subtest);
int test__amd_ibs_via_core_pmu(struct test_suite *test, int subtest);
extern struct test_suite *arch_tests[];
......
......@@ -5,3 +5,4 @@ perf-y += arch-tests.o
perf-y += sample-parsing.o
perf-$(CONFIG_AUXTRACE) += insn-x86.o intel-pt-test.o
perf-$(CONFIG_X86_64) += bp-modify.o
perf-y += amd-ibs-via-core-pmu.o
// SPDX-License-Identifier: GPL-2.0
#include "arch-tests.h"
#include "linux/perf_event.h"
#include "tests/tests.h"
#include "pmu.h"
#include "pmus.h"
#include "../perf-sys.h"
#include "debug.h"
#define NR_SUB_TESTS 5
static struct sub_tests {
int type;
unsigned long config;
bool valid;
} sub_tests[NR_SUB_TESTS] = {
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, true },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, false },
{ PERF_TYPE_RAW, 0x076, true },
{ PERF_TYPE_RAW, 0x0C1, true },
{ PERF_TYPE_RAW, 0x012, false },
};
static int event_open(int type, unsigned long config)
{
struct perf_event_attr attr;
memset(&attr, 0, sizeof(struct perf_event_attr));
attr.type = type;
attr.size = sizeof(struct perf_event_attr);
attr.config = config;
attr.disabled = 1;
attr.precise_ip = 1;
attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
attr.sample_period = 100000;
return sys_perf_event_open(&attr, -1, 0, -1, 0);
}
int test__amd_ibs_via_core_pmu(struct test_suite *test __maybe_unused,
int subtest __maybe_unused)
{
struct perf_pmu *ibs_pmu;
int ret = TEST_OK;
int fd, i;
if (list_empty(&pmus))
perf_pmu__scan(NULL);
ibs_pmu = perf_pmu__find("ibs_op");
if (!ibs_pmu)
return TEST_SKIP;
for (i = 0; i < NR_SUB_TESTS; i++) {
fd = event_open(sub_tests[i].type, sub_tests[i].config);
pr_debug("type: 0x%x, config: 0x%lx, fd: %d - ", sub_tests[i].type,
sub_tests[i].config, fd);
if ((sub_tests[i].valid && fd == -1) ||
(!sub_tests[i].valid && fd > 0)) {
pr_debug("Fail\n");
ret = TEST_FAIL;
} else {
pr_debug("Pass\n");
}
if (fd > 0)
close(fd);
}
return ret;
}
......@@ -22,6 +22,7 @@ struct test_suite suite__intel_pt = {
DEFINE_SUITE("x86 bp modify", bp_modify);
#endif
DEFINE_SUITE("x86 Sample parsing", x86_sample_parsing);
DEFINE_SUITE("AMD IBS via core pmu", amd_ibs_via_core_pmu);
struct test_suite *arch_tests[] = {
#ifdef HAVE_DWARF_UNWIND_SUPPORT
......@@ -35,5 +36,6 @@ struct test_suite *arch_tests[] = {
&suite__bp_modify,
#endif
&suite__x86_sample_parsing,
&suite__amd_ibs_via_core_pmu,
NULL,
};
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment