Commit aaba9801 authored by Ingo Molnar

perf_counter, x86: Make NMI lockups more robust

We have a debug check that detects stuck NMIs and returns with
the PMU disabled in the global ctrl MSR - but I managed to trigger
a situation where this was not enough to deassert the NMI.

So clear/reset the full PMU and keep the disable count balanced when
exiting from here. This way the box produces a debug warning but
stays up and is more debuggable.

[ Impact: in case of PMU related bugs, recover more gracefully ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 79202ba9
...@@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) ...@@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
intel_pmu_enable_counter(hwc, idx); intel_pmu_enable_counter(hwc, idx);
} }
/*
 * Write zero to the PMU counter MSRs of the CPU this runs on.
 *
 * Does nothing when x86_pmu reports no generic counters. Otherwise, with
 * local interrupts disabled, it logs a message naming the current CPU and
 * then zeroes:
 *   - the eventsel and perfctr MSR of every generic counter, and
 *   - every MSR from MSR_ARCH_PERFMON_FIXED_CTR0 upward, one per entry
 *     counted by x86_pmu.num_counters_fixed.
 *
 * The results of the individual MSR writes are not inspected.
 */
static void intel_pmu_reset(void)
{
	unsigned long irq_flags;
	int i;

	/* No generic counters reported: leave the hardware untouched. */
	if (x86_pmu.num_counters == 0)
		return;

	local_irq_save(irq_flags);

	printk("clearing PMU state on CPU#%d\n", smp_processor_id());

	/* Per generic counter: zero eventsel first, then perfctr. */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		checking_wrmsrl(x86_pmu.eventsel + i, 0ull);
		checking_wrmsrl(x86_pmu.perfctr + i, 0ull);
	}

	/* Fixed counters are addressed consecutively from FIXED_CTR0. */
	for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + i, 0ull);
	}

	local_irq_restore(irq_flags);
}
/* /*
* This handler is triggered by the local APIC, so the APIC IRQ handling * This handler is triggered by the local APIC, so the APIC IRQ handling
* rules apply: * rules apply:
...@@ -750,6 +774,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) ...@@ -750,6 +774,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
if (++loops > 100) { if (++loops > 100) {
WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
perf_counter_print_debug(); perf_counter_print_debug();
intel_pmu_reset();
perf_enable();
return 1; return 1;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment