Commit 8b077e4a authored by Kan Liang, committed by Ingo Molnar

perf/x86/intel/lbr: Optimize context switches for the LBR call stack

Context switches with perf LBR call stack context are fairly expensive
because they do a lot of MSR writes. Currently we unconditionally do the
expensive operation when LBR call stack is enabled. It's not necessary
for some common cases, e.g., task -> other kernel thread -> same task.
The LBR registers are not changed, hence they don't need to be
rewritten/restored.

Introduce per-CPU variables to track the last LBR call stack context.
If the same context is scheduled in, the rewrite/restore is not
required, with the following two exceptions:

 - The LBR registers may be modified by a normal LBR event, i.e., adding
   a new LBR event or scheduling an existing LBR event. In both cases,
   the LBR registers are reset first. The last LBR call stack information
   is cleared in intel_pmu_lbr_reset(). Restoring the LBR registers is
   required.

 - The LBR registers are initialized to zero in C6.
   If the LBR register that TOS points to is cleared, C6 must have been
   entered while the task was swapped out. Restoring the LBR registers is
   required as well.

These exceptions are not common.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: acme@kernel.org
Cc: eranian@google.com
Link: https://lore.kernel.org/lkml/1528213126-4312-2-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 0592e57b
...@@ -216,6 +216,8 @@ static void intel_pmu_lbr_reset_64(void) ...@@ -216,6 +216,8 @@ static void intel_pmu_lbr_reset_64(void)
void intel_pmu_lbr_reset(void) void intel_pmu_lbr_reset(void)
{ {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr) if (!x86_pmu.lbr_nr)
return; return;
...@@ -223,6 +225,9 @@ void intel_pmu_lbr_reset(void) ...@@ -223,6 +225,9 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_32(); intel_pmu_lbr_reset_32();
else else
intel_pmu_lbr_reset_64(); intel_pmu_lbr_reset_64();
cpuc->last_task_ctx = NULL;
cpuc->last_log_id = 0;
} }
/* /*
...@@ -334,6 +339,7 @@ static inline u64 rdlbr_to(unsigned int idx) ...@@ -334,6 +339,7 @@ static inline u64 rdlbr_to(unsigned int idx)
static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
{ {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int i; int i;
unsigned lbr_idx, mask; unsigned lbr_idx, mask;
u64 tos; u64 tos;
...@@ -344,8 +350,20 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) ...@@ -344,8 +350,20 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
return; return;
} }
mask = x86_pmu.lbr_nr - 1;
tos = task_ctx->tos; tos = task_ctx->tos;
/*
* Does not restore the LBR registers, if
* - No one else touched them, and
* - Did not enter C6
*/
if ((task_ctx == cpuc->last_task_ctx) &&
(task_ctx->log_id == cpuc->last_log_id) &&
rdlbr_from(tos)) {
task_ctx->lbr_stack_state = LBR_NONE;
return;
}
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) { for (i = 0; i < task_ctx->valid_lbrs; i++) {
lbr_idx = (tos - i) & mask; lbr_idx = (tos - i) & mask;
wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
...@@ -369,6 +387,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) ...@@ -369,6 +387,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
{ {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned lbr_idx, mask; unsigned lbr_idx, mask;
u64 tos, from; u64 tos, from;
int i; int i;
...@@ -393,6 +412,9 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) ...@@ -393,6 +412,9 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
task_ctx->valid_lbrs = i; task_ctx->valid_lbrs = i;
task_ctx->tos = tos; task_ctx->tos = tos;
task_ctx->lbr_stack_state = LBR_VALID; task_ctx->lbr_stack_state = LBR_VALID;
cpuc->last_task_ctx = task_ctx;
cpuc->last_log_id = ++task_ctx->log_id;
} }
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
......
...@@ -163,6 +163,7 @@ struct intel_excl_cntrs { ...@@ -163,6 +163,7 @@ struct intel_excl_cntrs {
unsigned core_id; /* per-core: core id */ unsigned core_id; /* per-core: core id */
}; };
struct x86_perf_task_context;
#define MAX_LBR_ENTRIES 32 #define MAX_LBR_ENTRIES 32
enum { enum {
...@@ -214,6 +215,8 @@ struct cpu_hw_events { ...@@ -214,6 +215,8 @@ struct cpu_hw_events {
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel; struct er_account *lbr_sel;
u64 br_sel; u64 br_sel;
struct x86_perf_task_context *last_task_ctx;
int last_log_id;
/* /*
* Intel host/guest exclude bits * Intel host/guest exclude bits
...@@ -651,6 +654,7 @@ struct x86_perf_task_context { ...@@ -651,6 +654,7 @@ struct x86_perf_task_context {
int valid_lbrs; int valid_lbrs;
int lbr_callstack_users; int lbr_callstack_users;
int lbr_stack_state; int lbr_stack_state;
int log_id;
}; };
#define x86_add_quirk(func_) \ #define x86_add_quirk(func_) \
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment