Commit e9d7f7cd authored by Yan, Zheng, committed by Ingo Molnar

perf/x86/intel: Add basic Haswell LBR call stack support

Haswell has a new feature that utilizes the existing LBR facility to
record call chains. To enable this feature, the JCC, NEAR_IND_JMP,
NEAR_REL_JMP, FAR_BRANCH and EN_CALLSTACK bits in LBR_SELECT must be
set to 1, and the NEAR_REL_CALL, NEAR_IND_CALL and NEAR_RET bits must
be cleared. Due to a hardware bug in Haswell, this feature doesn't
work well with FREEZE_LBRS_ON_PMI.
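
For illustration, that LBR_SELECT programming reduces to the sketch
below. The names follow the kernel's perf_event_intel_lbr.c defines
rather than the SDM's; bit positions 6-9 appear in the diff below,
while positions 2-5 are assumed from the surrounding kernel source.
Only the branch-type bits are shown; the privilege-level bits (0-1)
are filled in separately from the event's user/kernel request.

    /* Illustrative sketch, not kernel code as-is. */
    #define LBR_JCC         (1 << 2)  /* suppress conditional branches */
    #define LBR_REL_CALL    (1 << 3)  /* suppress relative calls */
    #define LBR_IND_CALL    (1 << 4)  /* suppress indirect calls */
    #define LBR_RETURN      (1 << 5)  /* suppress near returns */
    #define LBR_IND_JMP     (1 << 6)  /* suppress indirect jumps */
    #define LBR_REL_JMP     (1 << 7)  /* suppress relative jumps */
    #define LBR_FAR         (1 << 8)  /* suppress far branches */
    #define LBR_CALL_STACK  (1 << 9)  /* enable call-stack mode */

    /*
     * Call-stack mode: suppress everything except calls and returns
     * (their suppress bits stay 0) and set the enable bit.
     */
    #define LBR_SELECT_CALLSTACK \
            (LBR_JCC | LBR_IND_JMP | LBR_REL_JMP | LBR_FAR | LBR_CALL_STACK)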

When the call stack feature is enabled, the LBR stack captures
unfiltered call data normally, but as return instructions are
executed, the most recent branch record is flushed from the on-chip
registers in a last-in first-out (LIFO) manner. Branch information
for completed leaf functions is therefore not retained, while the
call stack of the main execution path is preserved.
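
A toy user-space model of that LIFO behavior (purely illustrative;
none of these names exist in the kernel): calls push a record,
returns pop the most recent one, so completed leaf calls leave no
trace while the live call chain survives.

    #include <stdio.h>

    #define LBR_DEPTH 16            /* Haswell's LBR is 16 entries deep */

    struct lbr_entry { unsigned long from, to; };

    static struct lbr_entry lbr[LBR_DEPTH];
    static int top;                 /* number of live records */

    static void on_call(unsigned long from, unsigned long to)
    {
            if (top < LBR_DEPTH)
                    lbr[top++] = (struct lbr_entry){ from, to };
    }

    static void on_return(void)
    {
            if (top > 0)
                    top--;          /* leaf's record is flushed, LIFO */
    }

    int main(void)
    {
            on_call(0x1000, 0x2000);        /* main -> foo */
            on_call(0x2010, 0x3000);        /* foo -> leaf */
            on_return();                    /* leaf returns: record gone */

            /* only main -> foo remains: the active call stack */
            for (int i = 0; i < top; i++)
                    printf("%#lx -> %#lx\n", lbr[i].from, lbr[i].to);
            return 0;
    }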

This patch defines a separate lbr_sel map for Haswell. The map contains
a new entry for the call stack feature.
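
The hw filter side of the diff also changes how the final LBR_SELECT
value is derived from the capture mask. Since the low bits of
LBR_SELECT are suppress bits while EN_CALLSTACK is an enable bit, a
plain inversion no longer works, so the new code XORs instead. A
minimal sketch of the identity, assuming LBR_SEL_MASK covers bits 0-8
as in the kernel source:

    #define LBR_SEL_MASK    0x1ff   /* bits 0-8: suppress bits */

    /*
     * Invert the suppress bits, pass enable bits (above bit 8) through:
     * mask ^ LBR_SEL_MASK == (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
     */
    static unsigned long lbr_select_config(unsigned long mask)
    {
            return mask ^ LBR_SEL_MASK;
    }

Applied to the Haswell call-stack map entry below (LBR_REL_CALL |
LBR_IND_CALL | LBR_RETURN | LBR_CALL_STACK), this clears the suppress
bits for calls and returns and sets the remaining ones plus the enable
bit, matching the bit settings listed above (modulo the privilege
bits, which come from the event's user/kernel request).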
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 2a0ad3b3
@@ -517,7 +517,11 @@ struct x86_pmu {
 };

 enum {
-        PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+        PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+        PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
+
+        PERF_SAMPLE_BRANCH_CALL_STACK =
+                1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
 };

 #define x86_add_quirk(func_) \
@@ -551,6 +555,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \

 extern struct x86_pmu x86_pmu __read_mostly;

+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+        return x86_pmu.lbr_sel_map &&
+               x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
 DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

 int x86_perf_event_set_period(struct perf_event *event);
@@ -754,6 +764,8 @@ void intel_pmu_lbr_init_atom(void);

 void intel_pmu_lbr_init_snb(void);

+void intel_pmu_lbr_init_hsw(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);

 int p4_pmu_init(void);
...
@@ -2537,7 +2537,7 @@ __init int intel_pmu_init(void)
                 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
                 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));

-                intel_pmu_lbr_init_snb();
+                intel_pmu_lbr_init_hsw();

                 x86_pmu.event_constraints = intel_hsw_event_constraints;
                 x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
...
@@ -39,6 +39,7 @@ static enum {
 #define LBR_IND_JMP_BIT         6 /* do not capture indirect jumps */
 #define LBR_REL_JMP_BIT         7 /* do not capture relative jumps */
 #define LBR_FAR_BIT             8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT      9 /* enable call stack */

 #define LBR_KERNEL      (1 << LBR_KERNEL_BIT)
 #define LBR_USER        (1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
 #define LBR_REL_JMP     (1 << LBR_REL_JMP_BIT)
 #define LBR_IND_JMP     (1 << LBR_IND_JMP_BIT)
 #define LBR_FAR         (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK  (1 << LBR_CALL_STACK_BIT)

 #define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -92,6 +94,7 @@ enum {
         X86_BR_ABORT            = 1 << 12,/* transaction abort */
         X86_BR_IN_TX            = 1 << 13,/* in transaction */
         X86_BR_NO_TX            = 1 << 14,/* not in transaction */
+        X86_BR_CALL_STACK       = 1 << 15,/* call stack */
 };

 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -373,7 +376,7 @@ void intel_pmu_lbr_read(void)
  * - in case there is no HW filter
  * - in case the HW filter has errata or limitations
  */
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 {
         u64 br_type = event->attr.branch_sample_type;
         int mask = 0;
@@ -410,11 +413,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
         if (br_type & PERF_SAMPLE_BRANCH_COND)
                 mask |= X86_BR_JCC;

+        if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+                if (!x86_pmu_has_lbr_callstack())
+                        return -EOPNOTSUPP;
+                if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+                        return -EINVAL;
+                mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+                        X86_BR_CALL_STACK;
+        }
+
         /*
          * stash actual user request into reg, it may
          * be used by fixup code for some CPU
          */
         event->hw.branch_reg.reg = mask;
+        return 0;
 }

 /*
@@ -443,8 +456,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
         reg = &event->hw.branch_reg;
         reg->idx = EXTRA_REG_LBR;

-        /* LBR_SELECT operates in suppress mode so invert mask */
-        reg->config = ~mask & x86_pmu.lbr_sel_mask;
+        /*
+         * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+         * in suppress mode. So LBR_SELECT should be set to
+         * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+         */
+        reg->config = mask ^ x86_pmu.lbr_sel_mask;

         return 0;
 }
@@ -462,7 +479,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
         /*
          * setup SW LBR filter
          */
-        intel_pmu_setup_sw_lbr_filter(event);
+        ret = intel_pmu_setup_sw_lbr_filter(event);
+        if (ret)
+                return ret;

         /*
          * setup HW LBR filter, if any
@@ -732,6 +751,20 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
         [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
 };

+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+        [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+        [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+        [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+        [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+        [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
+        [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
+                                                | LBR_FAR,
+        [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
+        [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+        [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
+                                                | LBR_RETURN | LBR_CALL_STACK,
+};
+
 /* core */
 void __init intel_pmu_lbr_init_core(void)
 {
@@ -788,6 +821,20 @@ void __init intel_pmu_lbr_init_snb(void)
         pr_cont("16-deep LBR, ");
 }

+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+        x86_pmu.lbr_nr   = 16;
+        x86_pmu.lbr_tos  = MSR_LBR_TOS;
+        x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+        x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+        x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+        x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+        pr_cont("16-deep LBR, ");
+}
+
 /* atom */
 void __init intel_pmu_lbr_init_atom(void)
 {
...
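
For context, a hedged sketch of how user space would eventually
request this mode. In this commit PERF_SAMPLE_BRANCH_CALL_STACK is
still kernel-internal (it sits past PERF_SAMPLE_BRANCH_MAX_SHIFT), so
the sketch assumes the later patch in this series that exposes the
flag through the perf_event_attr UAPI:

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    /* returns a perf fd sampling cycles with LBR call-stack data */
    static int open_callstack_counter(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
            /* assumes the flag is visible in the UAPI header */
            attr.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
                                      PERF_SAMPLE_BRANCH_USER;

            /* this thread, any CPU */
            return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    }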