Commit d31fc13f authored by Kan Liang, committed by Ingo Molnar

perf/x86/intel: Fix event update for auto-reload

There is a bug when reading event->count with large PEBS enabled.

Here is an example:

  # ./read_count
  0x71f0
  0x122c0
  0x1000000001c54
  0x100000001257d
  0x200000000bdc5

In fixed-period mode, the auto-reload mechanism can be enabled for
PEBS events, but the calculation of event->count does not take the
auto-reload values into account.

Anyone who reads event->count will get the wrong result, e.g. via x86_pmu_read().
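
The read_count reproducer above is not part of this patch; a minimal,
hypothetical sketch of such a test (event choice, period and flags are
illustrative assumptions, not taken from the patch) could look like:

  /*
   * Hypothetical reproducer sketch: open a fixed-period, precise (PEBS)
   * event and read its count in a loop via read(), which ends up in
   * x86_pmu_read(). All parameters are illustrative.
   */
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>

  int main(void)
  {
          struct perf_event_attr attr;
          unsigned long long count;
          int fd, i;

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = PERF_TYPE_HARDWARE;
          attr.config = PERF_COUNT_HW_INSTRUCTIONS;
          attr.sample_period = 0x10000;   /* fixed period -> auto-reload candidate */
          attr.precise_ip = 2;            /* request PEBS */
          attr.exclude_kernel = 1;        /* allow running unprivileged */

          fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }

          for (i = 0; i < 5; i++) {
                  if (read(fd, &count, sizeof(count)) != sizeof(count))
                          return 1;
                  printf("%#llx\n", count);
          }

          close(fd);
          return 0;
  }

Whether the kernel actually uses auto-reload/large PEBS for such an event
depends on the CPU and on the sampling configuration.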

This bug has been present since the auto-reload mechanism was enabled by
commit:

  851559e3 ("perf/x86/intel: Use the PEBS auto reload mechanism when possible")

Introduce intel_pmu_save_and_restart_reload() to calculate the
event->count only for auto-reload.

Since the counter increments a negative counter value and overflows on
the sign switch, giving the interval:

        [-period, 0]

the difference between two consecutive reads is:

 A) value2 - value1;
    when no overflows have happened in between,
 B) (0 - value1) + (value2 - (-period));
    when one overflow happened in between,
 C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
    when @n overflows happened in between.

Here A) is the obvious difference; B) is the extension to the discrete
interval, where the first term counts to the top of the current interval
and the second term counts from the bottom of the next interval; and C)
is the extension to multiple intervals, where the middle term accounts
for the whole intervals covered in between.

The equation for all cases is:

    value2 - value1 + n * period
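
To make the arithmetic concrete (all numbers below are illustrative and
not taken from the patch): with period = 100000, a first read of
value1 = -60000, a second read of value2 = -90000 and n = 2 reloads in
between, the counter accumulated

    (0 - (-60000)) + (2 - 1) * 100000 + (-90000 - (-100000))
      = 60000 + 100000 + 10000
      = 170000

which is exactly value2 - value1 + n * period = -90000 - (-60000) + 2 * 100000.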

Previously, event->count was updated right before the sample output.
But for case A there is no PEBS record ready, so it needs to be handled
specially.

Remove the auto-reload code from x86_perf_event_set_period(), since we
will no longer call that function in this case.

Based-on-code-from: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Fixes: 851559e3 ("perf/x86/intel: Use the PEBS auto reload mechanism when possible")
Link: http://lkml.kernel.org/r/1518474035-21006-2-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 82d71ed0
@@ -1156,8 +1156,6 @@ int x86_perf_event_set_period(struct perf_event *event)
 
         per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
-        if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
-            local64_read(&hwc->prev_count) != (u64)-left) {
         /*
          * The hw event starts counting from this event offset,
          * mark it to be able to extra future deltas:
@@ -1165,7 +1163,6 @@ int x86_perf_event_set_period(struct perf_event *event)
          */
         local64_set(&hwc->prev_count, (u64)-left);
 
         wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
-        }
 
         /*
          * Due to erratum on certan cpu we need
@@ -1306,17 +1306,84 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
         return NULL;
 }
 
+/*
+ * Special variant of intel_pmu_save_and_restart() for auto-reload.
+ */
+static int
+intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
+{
+        struct hw_perf_event *hwc = &event->hw;
+        int shift = 64 - x86_pmu.cntval_bits;
+        u64 period = hwc->sample_period;
+        u64 prev_raw_count, new_raw_count;
+        s64 new, old;
+
+        WARN_ON(!period);
+
+        /*
+         * drain_pebs() only happens when the PMU is disabled.
+         */
+        WARN_ON(this_cpu_read(cpu_hw_events.enabled));
+
+        prev_raw_count = local64_read(&hwc->prev_count);
+        rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+        local64_set(&hwc->prev_count, new_raw_count);
+
+        /*
+         * Since the counter increments a negative counter value and
+         * overflows on the sign switch, giving the interval:
+         *
+         *   [-period, 0]
+         *
+         * the difference between two consequtive reads is:
+         *
+         *   A) value2 - value1;
+         *      when no overflows have happened in between,
+         *
+         *   B) (0 - value1) + (value2 - (-period));
+         *      when one overflow happened in between,
+         *
+         *   C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
+         *      when @n overflows happened in between.
+         *
+         * Here A) is the obvious difference, B) is the extension to the
+         * discrete interval, where the first term is to the top of the
+         * interval and the second term is from the bottom of the next
+         * interval and C) the extension to multiple intervals, where the
+         * middle term is the whole intervals covered.
+         *
+         * An equivalent of C, by reduction, is:
+         *
+         *   value2 - value1 + n * period
+         */
+        new = ((s64)(new_raw_count << shift) >> shift);
+        old = ((s64)(prev_raw_count << shift) >> shift);
+        local64_add(new - old + count * period, &event->count);
+
+        perf_event_update_userpage(event);
+
+        return 0;
+}
+
 static void __intel_pmu_pebs_event(struct perf_event *event,
                                    struct pt_regs *iregs,
                                    void *base, void *top,
                                    int bit, int count)
 {
+        struct hw_perf_event *hwc = &event->hw;
         struct perf_sample_data data;
         struct pt_regs regs;
         void *at = get_next_pebs_record_by_bit(base, top, bit);
 
-        if (!intel_pmu_save_and_restart(event) &&
-            !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+        if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+                /*
+                 * Now, auto-reload is only enabled in fixed period mode.
+                 * The reload value is always hwc->sample_period.
+                 * May need to change it, if auto-reload is enabled in
+                 * freq mode later.
+                 */
+                intel_pmu_save_and_restart_reload(event, count);
+        } else if (!intel_pmu_save_and_restart(event))
                 return;
 
         while (count > 1) {
@@ -1368,8 +1435,11 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
                 return;
 
         n = top - at;
-        if (n <= 0)
+        if (n <= 0) {
+                if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+                        intel_pmu_save_and_restart_reload(event, 0);
                 return;
+        }
 
         __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
 }
@@ -1392,8 +1462,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
         ds->pebs_index = ds->pebs_buffer_base;
 
-        if (unlikely(base >= top))
+        if (unlikely(base >= top)) {
+                /*
+                 * The drain_pebs() could be called twice in a short period
+                 * for auto-reload event in pmu::read(). There are no
+                 * overflows have happened in between.
+                 * It needs to call intel_pmu_save_and_restart_reload() to
+                 * update the event->count for this case.
+                 */
+                for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
+                                 x86_pmu.max_pebs_events) {
+                        event = cpuc->events[bit];
+                        if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+                                intel_pmu_save_and_restart_reload(event, 0);
+                }
                 return;
+        }
 
         for (at = base; at < top; at += x86_pmu.pebs_record_size) {
                 struct pebs_record_nhm *p = at;