Commit 71a84402 authored by Yazen Ghannam, committed by Borislav Petkov

x86/MCE/AMD: Don't report L1 BTB MCA errors on some family 17h models

AMD family 17h Models 10h-2Fh may report a high number of L1 BTB MCA
errors under certain conditions. The errors are benign and can safely be
ignored. However, the high error rate may cause the MCA threshold
counter to overflow causing a high rate of thresholding interrupts.

In addition, users may see the errors reported through the AMD MCE
decoder module, even with the interrupt disabled, due to MCA polling.

Clear the "Counter Present" bit in the Instruction Fetch bank's
MCA_MISC0 register. This will prevent enabling MCA thresholding on this
bank which will prevent the high interrupt rate due to this error.

Define an AMD-specific function to filter these errors from the MCE
event pool so that they don't get reported during early boot.

Rename filter function in EDAC/mce_amd to avoid a naming conflict, while
at it.

 [ bp: Move function prototype to the internal header and
   massage/cleanup, fix typos. ]
Reported-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "clemej@gmail.com" <clemej@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Pu Wen <puwen@hygon.cn>
Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: Shirish S <Shirish.S@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: x86-ml <x86@kernel.org>
Cc: <stable@vger.kernel.org> # 5.0.x: c95b323d: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
Cc: <stable@vger.kernel.org> # 5.0.x: 30aa3d26: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
Cc: <stable@vger.kernel.org> # 5.0.x: 9308fd40: x86/MCE: Group AMD function prototypes in <asm/mce.h>
Cc: <stable@vger.kernel.org> # 5.0.x
Link: https://lkml.kernel.org/r/20190325163410.171021-2-Yazen.Ghannam@amd.com
parent 45d4b7b9
...@@ -563,33 +563,59 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, ...@@ -563,33 +563,59 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
return offset; return offset;
} }
bool amd_filter_mce(struct mce *m)
{
enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
u8 xec = (m->status >> 16) & 0x3F;
/* See Family 17h Models 10h-2Fh Erratum #1114. */
if (c->x86 == 0x17 &&
c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
bank_type == SMCA_IF && xec == 10)
return true;
return false;
}
/* /*
* Turn off MC4_MISC thresholding banks on all family 0x15 models since * Turn off thresholding banks for the following conditions:
* they're not supported there. * - MC4_MISC thresholding is not supported on Family 0x15.
* - Prevent possible spurious interrupts from the IF bank on Family 0x17
* Models 0x10-0x2F due to Erratum #1114.
*/ */
void disable_err_thresholding(struct cpuinfo_x86 *c) void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{ {
int i; int i, num_msrs;
u64 hwcr; u64 hwcr;
bool need_toggle; bool need_toggle;
u32 msrs[] = { u32 msrs[NR_BLOCKS];
0x00000413, /* MC4_MISC0 */
0xc0000408, /* MC4_MISC1 */ if (c->x86 == 0x15 && bank == 4) {
}; msrs[0] = 0x00000413; /* MC4_MISC0 */
msrs[1] = 0xc0000408; /* MC4_MISC1 */
num_msrs = 2;
} else if (c->x86 == 0x17 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
if (c->x86 != 0x15) if (smca_get_bank_type(bank) != SMCA_IF)
return; return;
msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
num_msrs = 1;
} else {
return;
}
rdmsrl(MSR_K7_HWCR, hwcr); rdmsrl(MSR_K7_HWCR, hwcr);
/* McStatusWrEn has to be set */ /* McStatusWrEn has to be set */
need_toggle = !(hwcr & BIT(18)); need_toggle = !(hwcr & BIT(18));
if (need_toggle) if (need_toggle)
wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
/* Clear CntP bit safely */ /* Clear CntP bit safely */
for (i = 0; i < ARRAY_SIZE(msrs); i++) for (i = 0; i < num_msrs; i++)
msr_clear_bit(msrs[i], 62); msr_clear_bit(msrs[i], 62);
/* restore old settings */ /* restore old settings */
...@@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) ...@@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block, cpu = smp_processor_id(); unsigned int bank, block, cpu = smp_processor_id();
int offset = -1; int offset = -1;
disable_err_thresholding(c);
for (bank = 0; bank < mca_cfg.banks; ++bank) { for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca) if (mce_flags.smca)
smca_configure(bank, cpu); smca_configure(bank, cpu);
disable_err_thresholding(c, bank);
for (block = 0; block < NR_BLOCKS; ++block) { for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block); address = get_block_address(address, low, high, bank, block);
if (!address) if (!address)
......
...@@ -1777,6 +1777,9 @@ static void __mcheck_cpu_init_timer(void) ...@@ -1777,6 +1777,9 @@ static void __mcheck_cpu_init_timer(void)
bool filter_mce(struct mce *m) bool filter_mce(struct mce *m)
{ {
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return amd_filter_mce(m);
return false; return false;
} }
......
...@@ -176,4 +176,10 @@ extern struct mca_msr_regs msr_ops; ...@@ -176,4 +176,10 @@ extern struct mca_msr_regs msr_ops;
/* Decide whether to add MCE record to MCE event pool or filter it out. */ /* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m); extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
/* AMD-specific MCE filter: returns true when the record should be dropped. */
extern bool amd_filter_mce(struct mce *m);
#else
/* Stub used when CONFIG_X86_MCE_AMD is not set: never filters a record. */
static inline bool amd_filter_mce(struct mce *m) { return false; }
#endif
#endif /* __X86_MCE_INTERNAL_H__ */ #endif /* __X86_MCE_INTERNAL_H__ */
...@@ -1004,7 +1004,7 @@ static inline void amd_decode_err_code(u16 ec) ...@@ -1004,7 +1004,7 @@ static inline void amd_decode_err_code(u16 ec)
/* /*
* Filter out unwanted MCE signatures here. * Filter out unwanted MCE signatures here.
*/ */
static bool amd_filter_mce(struct mce *m) static bool ignore_mce(struct mce *m)
{ {
/* /*
* NB GART TLB error reporting is disabled by default. * NB GART TLB error reporting is disabled by default.
...@@ -1038,7 +1038,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ...@@ -1038,7 +1038,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
unsigned int fam = x86_family(m->cpuid); unsigned int fam = x86_family(m->cpuid);
int ecc; int ecc;
if (amd_filter_mce(m)) if (ignore_mce(m))
return NOTIFY_STOP; return NOTIFY_STOP;
pr_emerg(HW_ERR "%s\n", decode_error_status(m)); pr_emerg(HW_ERR "%s\n", decode_error_status(m));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment