Commit 13324c42 authored by Linus Torvalds

Merge branch 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 CPU feature updates from Thomas Gleixner:
 "Updates for x86 CPU features:

   - Support for UMWAIT/UMONITOR, which allows MWAIT/MONITOR-style wait
     instructions to be used in user space to save power, e.g. in HPC
     workloads which spin wait on synchronization points (a user-space
     usage sketch follows the quoted message below).

     The maximum time a MWAIT can halt in userspace is controlled by the
     kernel and can be adjusted by the sysadmin.

   - Speed up the MTRR handling code on CPUs which support cache
     self-snooping correctly.

     On those CPUs the wbinvd() invocations can be omitted which speeds
     up the MTRR setup by a factor of 50.

   - Support for the new x86 vendor Zhaoxin who develops processors
     based on the VIA Centaur technology.

   - Prevent 'cat /proc/cpuinfo' from disturbing isolated NOHZ_FULL
     CPUs: instead of sending IPIs to those CPUs to retrieve the current
     frequency, report the cached values.

   - The addition and late revert of the FSGSBASE support. The revert
     was required as it turned out that the code still has hard to
     diagnose issues. Yet another engineering trainwreck...

   - Small fixes, cleanups, improvements and the usual new Intel CPU
     family/model addons"
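As context for the UMWAIT/UMONITOR item above, a user-space spin-wait might look roughly like the sketch below. This is illustrative only: the _umonitor()/_umwait() intrinsics and the -mwaitpkg compiler flag are toolchain assumptions, not part of this pull, and the cap configured through the new umwait_control interface still bounds how long the wait may stay in C0.1/C0.2.

	#include <immintrin.h>   /* _umonitor()/_umwait(), assumed; build with -mwaitpkg */
	#include <x86intrin.h>   /* __rdtsc() */
	#include <stdatomic.h>

	/* Spin-wait on *flag with UMONITOR/UMWAIT instead of a busy PAUSE loop. */
	void wait_for_flag(atomic_int *flag, unsigned long long timeout_cycles)
	{
		while (!atomic_load(flag)) {
			_umonitor(flag);               /* arm the address monitor */
			if (atomic_load(flag))         /* re-check after arming */
				break;
			/* control = 0 requests C0.2; the kernel caps the residency */
			_umwait(0, __rdtsc() + timeout_cycles);
		}
	}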

* 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  x86/fsgsbase: Revert FSGSBASE support
  selftests/x86/fsgsbase: Fix some test case bugs
  x86/entry/64: Fix and clean up paranoid_exit
  x86/entry/64: Don't compile ignore_sysret if 32-bit emulation is enabled
  selftests/x86: Test SYSCALL and SYSENTER manually with TF set
  x86/mtrr: Skip cache flushes on CPUs with cache self-snooping
  x86/cpu/intel: Clear cache self-snoop capability in CPUs with known errata
  Documentation/ABI: Document umwait control sysfs interfaces
  x86/umwait: Add sysfs interface to control umwait maximum time
  x86/umwait: Add sysfs interface to control umwait C0.2 state
  x86/umwait: Initialize umwait control values
  x86/cpufeatures: Enumerate user wait instructions
  x86/cpu: Disable frequency requests via aperfmperf IPI for nohz_full CPUs
  x86/acpi/cstate: Add Zhaoxin processors support for cache flush policy in C3
  ACPI, x86: Add Zhaoxin processors support for NONSTOP TSC
  x86/cpu: Create Zhaoxin processors architecture support file
  x86/cpu: Split Tremont based Atoms from the rest
  Documentation/x86/64: Add documentation for GS/FS addressing mode
  x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2
  x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit
  ...
parents ab2486a9 049331f2
...@@ -538,3 +538,26 @@ Description:	Intel Energy and Performance Bias Hint (EPB)
		This attribute is present for all online CPUs supporting the
		Intel EPB feature.
What: /sys/devices/system/cpu/umwait_control
/sys/devices/system/cpu/umwait_control/enable_c02
/sys/devices/system/cpu/umwait_control/max_time
Date: May 2019
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Umwait control
enable_c02: Read/write interface to control umwait C0.2 state
Read returns C0.2 state status:
0: C0.2 is disabled
1: C0.2 is enabled
Write 'y' or '1' or 'on' to enable C0.2 state.
Write 'n' or '0' or 'off' to disable C0.2 state.
The interface is case insensitive.
max_time: Read/write interface to control umwait maximum time
in TSC-quanta that the CPU can reside in either C0.1
or C0.2 state. The time is an unsigned 32-bit number.
Note that a value of zero means there is no limit.
Low order two bits must be zero.
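
As a usage illustration (not part of the patch itself), the two attributes can be driven from user space with a minimal sketch like the one below; it only touches the paths documented above and needs sufficient privileges:

	#include <stdio.h>

	#define UMWAIT_DIR "/sys/devices/system/cpu/umwait_control/"

	int main(void)
	{
		FILE *f;

		/* Cap UMWAIT/TPAUSE residency at 100000 TSC quanta (low two bits zero). */
		f = fopen(UMWAIT_DIR "max_time", "w");
		if (!f)
			return 1;
		fprintf(f, "100000\n");
		fclose(f);

		/* Keep the deeper C0.2 state enabled ('y', '1' or 'on' also work). */
		f = fopen(UMWAIT_DIR "enable_c02", "w");
		if (!f)
			return 1;
		fprintf(f, "1\n");
		fclose(f);
		return 0;
	}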
...@@ -31,7 +31,7 @@ you probably needn't concern yourself with isdn4k-utils.
====================== ===============  ========================================
GNU C                  4.6              gcc --version
GNU make               3.81             make --version
binutils               2.20             ld -v
binutils               2.21             ld -v
flex                   2.5.35           flex --version
bison                  2.0              bison --version
util-linux             2.10o            fdformat --version

...@@ -77,9 +77,7 @@ You will need GNU make 3.81 or later to build the kernel.
Binutils
--------
The build system has, as of 4.13, switched to using thin archives (`ar T`)
rather than incremental linking (`ld -r`) for built-in.a intermediate steps.
This requires binutils 2.20 or newer.
Binutils 2.21 or newer is needed to build the kernel.

pkg-config
----------
......
...@@ -17523,6 +17523,12 @@ Q: https://patchwork.linuxtv.org/project/linux-media/list/
S:	Maintained
F:	drivers/media/dvb-frontends/zd1301_demod*
ZHAOXIN PROCESSOR SUPPORT
M: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
L: linux-kernel@vger.kernel.org
S: Maintained
F: arch/x86/kernel/cpu/zhaoxin.c
ZPOOL COMPRESSED PAGE STORAGE API
M:	Dan Streetman <ddstreet@ieee.org>
L:	linux-mm@kvack.org
......
...@@ -480,3 +480,16 @@ config CPU_SUP_UMC_32
	  CPU might render the kernel unbootable.

	  If unsure, say N.
config CPU_SUP_ZHAOXIN
default y
bool "Support Zhaoxin processors" if PROCESSOR_SELECT
help
This enables detection, tunings and quirks for Zhaoxin processors
You need this enabled if you want your kernel to run on a
Zhaoxin CPU. Disabling this option on other types of CPUs
makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin
CPU might render the kernel unbootable.
If unsure, say N.
...@@ -1692,11 +1692,17 @@ nmi_restore:
	iretq
END(nmi)
#ifndef CONFIG_IA32_EMULATION
/*
* This handles SYSCALL from 32-bit code. There is no way to program
* MSRs to fully disable 32-bit SYSCALL.
*/
ENTRY(ignore_sysret)
	UNWIND_HINT_EMPTY
	mov	$-ENOSYS, %eax
	sysret
END(ignore_sysret)
#endif
ENTRY(rewind_stack_do_exit)
	UNWIND_HINT_FUNC
......
...@@ -1400,6 +1400,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE,  skl_uncore_init),
	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init),
	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE,   icl_uncore_init),
	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI,     icl_uncore_init),
	{},
};
......
...@@ -22,8 +22,8 @@ enum cpuid_leafs
	CPUID_LNX_3,
	CPUID_7_0_EBX,
	CPUID_D_1_EAX,
	CPUID_F_0_EDX,
	CPUID_F_1_EDX,
	CPUID_LNX_4,
	CPUID_7_1_EAX,
	CPUID_8000_0008_EBX,
	CPUID_6_EAX,
	CPUID_8000_000A_EDX,
......
...@@ -239,12 +239,14 @@
#define X86_FEATURE_BMI1		( 9*32+ 3) /* 1st group bit manipulation extensions */
#define X86_FEATURE_HLE			( 9*32+ 4) /* Hardware Lock Elision */
#define X86_FEATURE_AVX2		( 9*32+ 5) /* AVX2 instructions */
#define X86_FEATURE_FDP_EXCPTN_ONLY	( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */
#define X86_FEATURE_SMEP		( 9*32+ 7) /* Supervisor Mode Execution Protection */
#define X86_FEATURE_BMI2		( 9*32+ 8) /* 2nd group bit manipulation extensions */
#define X86_FEATURE_ERMS		( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
#define X86_FEATURE_INVPCID		( 9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM			( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM			( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_ZERO_FCS_FDS	( 9*32+13) /* "" Zero out FPU CS and FPU DS */
#define X86_FEATURE_MPX			( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_RDT_A		( 9*32+15) /* Resource Director Technology Allocation */
#define X86_FEATURE_AVX512F		( 9*32+16) /* AVX-512 Foundation */
...@@ -269,13 +271,19 @@
#define X86_FEATURE_XGETBV1		(10*32+ 2) /* XGETBV with ECX = 1 instruction */
#define X86_FEATURE_XSAVES		(10*32+ 3) /* XSAVES/XRSTORS instructions */

/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */
#define X86_FEATURE_CQM_LLC		(11*32+ 1) /* LLC QoS if 1 */

/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC	(12*32+ 0) /* LLC occupancy monitoring */
#define X86_FEATURE_CQM_MBM_TOTAL	(12*32+ 1) /* LLC Total MBM monitoring */
#define X86_FEATURE_CQM_MBM_LOCAL	(12*32+ 2) /* LLC Local MBM monitoring */

/*
 * Extended auxiliary flags: Linux defined - for features scattered in various
 * CPUID levels like 0xf, etc.
 *
 * Reuse free bits when adding new feature flags!
 */
#define X86_FEATURE_CQM_LLC		(11*32+ 0) /* LLC QoS if 1 */
#define X86_FEATURE_CQM_OCCUP_LLC	(11*32+ 1) /* LLC occupancy monitoring */
#define X86_FEATURE_CQM_MBM_TOTAL	(11*32+ 2) /* LLC Total MBM monitoring */
#define X86_FEATURE_CQM_MBM_LOCAL	(11*32+ 3) /* LLC Local MBM monitoring */

/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */

/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO		(13*32+ 0) /* CLZERO instruction */
...@@ -322,6 +330,7 @@
#define X86_FEATURE_UMIP		(16*32+ 2) /* User Mode Instruction Protection */
#define X86_FEATURE_PKU			(16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE		(16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_WAITPKG		(16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
#define X86_FEATURE_AVX512_VBMI2	(16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
#define X86_FEATURE_GFNI		(16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES		(16*32+ 9) /* Vector AES */
......
...@@ -56,6 +56,7 @@
#define INTEL_FAM6_ICELAKE_XEON_D	0x6C
#define INTEL_FAM6_ICELAKE_DESKTOP	0x7D
#define INTEL_FAM6_ICELAKE_MOBILE	0x7E
#define INTEL_FAM6_ICELAKE_NNPI		0x9D

/* "Small Core" Processors (Atom) */

...@@ -76,6 +77,7 @@
#define INTEL_FAM6_ATOM_GOLDMONT	0x5C /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_X	0x5F /* Denverton */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS	0x7A /* Gemini Lake */

#define INTEL_FAM6_ATOM_TREMONT_X	0x86 /* Jacobsville */

/* Xeon Phi */
......
...@@ -61,6 +61,15 @@
#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT	31
#define MSR_PLATFORM_INFO_CPUID_FAULT		BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)

#define MSR_IA32_UMWAIT_CONTROL			0xe1
#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE	BIT(0)
#define MSR_IA32_UMWAIT_CONTROL_RESERVED	BIT(1)
/*
 * The time field is bit[31:2], but representing a 32bit value with
 * bit[1:0] zero.
 */
#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK	(~0x03U)

#define MSR_PKG_CST_CONFIG_CONTROL	0x000000e2
#define NHM_C3_AUTO_DEMOTE		(1UL << 25)
#define NHM_C1_AUTO_DEMOTE		(1UL << 26)
......
...@@ -144,7 +144,8 @@ enum cpuid_regs_idx {
#define X86_VENDOR_TRANSMETA	7
#define X86_VENDOR_NSC		8
#define X86_VENDOR_HYGON	9
#define X86_VENDOR_NUM		10
#define X86_VENDOR_ZHAOXIN	10
#define X86_VENDOR_NUM		11

#define X86_VENDOR_UNKNOWN	0xff
......
...@@ -64,6 +64,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
		     c->x86_stepping >= 0x0e))
			flags->bm_check = 1;
	}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
/*
* All Zhaoxin CPUs that support C3 share cache.
* And caches should not be flushed by software while
* entering C3 type state.
*/
flags->bm_check = 1;
/*
* On all recent Zhaoxin platforms, ARB_DISABLE is a nop.
* So, set bm_control to zero to indicate that ARB_DISABLE
* is not required while entering C3 type state.
*/
flags->bm_control = 0;
}
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
......
...@@ -24,6 +24,7 @@ obj-y += match.o
obj-y			+= bugs.o
obj-y			+= aperfmperf.o
obj-y			+= cpuid-deps.o
obj-y			+= umwait.o

obj-$(CONFIG_PROC_FS)	+= proc.o
obj-$(CONFIG_X86_FEATURE_NAMES)	+= capflags.o powerflags.o

...@@ -38,6 +39,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin.o

obj-$(CONFIG_X86_MCE)			+= mce/
obj-$(CONFIG_MTRR)			+= mtrr/
......
...@@ -13,6 +13,7 @@
#include <linux/percpu.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/sched/isolation.h>

#include "cpu.h"

...@@ -85,6 +86,9 @@ unsigned int aperfmperf_get_khz(int cpu)
	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return 0;

	if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
		return 0;

	aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
	return per_cpu(samples.khz, cpu);
}

...@@ -101,9 +105,12 @@ void arch_freq_prepare_all(void)
	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return;

	for_each_online_cpu(cpu)
	for_each_online_cpu(cpu) {
		if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
			continue;
		if (!aperfmperf_snapshot_cpu(cpu, now, false))
			wait = true;
	}

	if (wait)
		msleep(APERFMPERF_REFRESH_DELAY_MS);

...@@ -117,6 +124,9 @@ unsigned int arch_freq_get_on_cpu(int cpu)
	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return 0;

	if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
		return 0;

	if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
		return per_cpu(samples.khz, cpu);
......
...@@ -658,8 +658,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
	if (c->x86 < 0x17) {
		/* LLC is at the node level. */
		per_cpu(cpu_llc_id, cpu) = node_id;
	} else if (c->x86 == 0x17 &&
		   c->x86_model >= 0 && c->x86_model <= 0x1F) {
	} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
		/*
		 * LLC is at the core complex level.
		 * Core complex ID is ApicId[3] for these processors.
......
...@@ -801,6 +801,30 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
	}
}
static void init_cqm(struct cpuinfo_x86 *c)
{
if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
c->x86_cache_max_rmid = -1;
c->x86_cache_occ_scale = -1;
return;
}
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = cpuid_ebx(0xf);
if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
u32 eax, ebx, ecx, edx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
}
void get_cpu_cap(struct cpuinfo_x86 *c)
{
	u32 eax, ebx, ecx, edx;

...@@ -823,6 +847,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
		c->x86_capability[CPUID_7_0_EBX] = ebx;
		c->x86_capability[CPUID_7_ECX] = ecx;
		c->x86_capability[CPUID_7_EDX] = edx;
/* Check valid sub-leaf index before accessing it */
if (eax >= 1) {
cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_7_1_EAX] = eax;
}
	}

	/* Extended state features: level 0x0000000d */

...@@ -832,33 +862,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
		c->x86_capability[CPUID_D_1_EAX] = eax;
	}
/* Additional Intel-defined flags: level 0x0000000F */
if (c->cpuid_level >= 0x0000000F) {
/* QoS sub-leaf, EAX=0Fh, ECX=0 */
cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_F_0_EDX] = edx;
if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = ebx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_F_1_EDX] = edx;
if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
(cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
} else {
c->x86_cache_max_rmid = -1;
c->x86_cache_occ_scale = -1;
}
}
	/* AMD-defined flags: level 0x80000001 */
	eax = cpuid_eax(0x80000000);
	c->extended_cpuid_level = eax;

...@@ -889,6 +892,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
	init_scattered_cpuid_features(c);
	init_speculation_control(c);
	init_cqm(c);

	/*
	 * Clear/Set all flags overridden by options, after probe.
......
...@@ -64,6 +64,10 @@ static const struct cpuid_dep cpuid_deps[] = {
	{ X86_FEATURE_AVX512_4VNNIW,		X86_FEATURE_AVX512F   },
	{ X86_FEATURE_AVX512_4FMAPS,		X86_FEATURE_AVX512F   },
	{ X86_FEATURE_AVX512_VPOPCNTDQ,		X86_FEATURE_AVX512F   },
	{ X86_FEATURE_CQM_OCCUP_LLC,		X86_FEATURE_CQM_LLC   },
	{ X86_FEATURE_CQM_MBM_TOTAL,		X86_FEATURE_CQM_LLC   },
	{ X86_FEATURE_CQM_MBM_LOCAL,		X86_FEATURE_CQM_LLC   },
	{ X86_FEATURE_AVX512_BF16,		X86_FEATURE_AVX512VL  },
	{}
};
......
...@@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
	}
}
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
* CPU models in which having conflicting memory types still leads to
* unpredictable behavior, machine check errors, or hangs. Clear this
* feature to prevent its use on machines with known erratas.
*/
static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
{
switch (c->x86_model) {
case INTEL_FAM6_CORE_YONAH:
case INTEL_FAM6_CORE2_MEROM:
case INTEL_FAM6_CORE2_MEROM_L:
case INTEL_FAM6_CORE2_PENRYN:
case INTEL_FAM6_CORE2_DUNNINGTON:
case INTEL_FAM6_NEHALEM:
case INTEL_FAM6_NEHALEM_G:
case INTEL_FAM6_NEHALEM_EP:
case INTEL_FAM6_NEHALEM_EX:
case INTEL_FAM6_WESTMERE:
case INTEL_FAM6_WESTMERE_EP:
case INTEL_FAM6_SANDYBRIDGE:
setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
}
}
static bool ring3mwait_disabled __read_mostly;

static int __init ring3mwait_disable(char *__unused)

...@@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
	}

	check_mpx_erratum(c);
	check_memory_type_self_snoop_errata(c);

	/*
	 * Get the number of SMT siblings early from the extended topology
......
...@@ -743,6 +743,14 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
	/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
	cr0 = read_cr0() | X86_CR0_CD;
	write_cr0(cr0);
/*
* Cache flushing is the most time-consuming step when programming
* the MTRRs. Fortunately, as per the Intel Software Development
* Manual, we can skip it if the processor supports cache self-
* snooping.
*/
if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
		wbinvd();

	/* Save value of CR4 and clear Page Global Enable (bit 7) */

...@@ -760,6 +768,9 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
	/* Disable MTRRs, and set the default type to uncached */
	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);

	/* Again, only flush caches if we have to. */
	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
		wbinvd();
}
......
...@@ -26,6 +26,10 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
	{ X86_FEATURE_APERFMPERF,	CPUID_ECX,  0, 0x00000006, 0 },
	{ X86_FEATURE_EPB,		CPUID_ECX,  3, 0x00000006, 0 },
	{ X86_FEATURE_CQM_LLC,		CPUID_EDX,  1, 0x0000000f, 0 },
	{ X86_FEATURE_CQM_OCCUP_LLC,	CPUID_EDX,  0, 0x0000000f, 1 },
	{ X86_FEATURE_CQM_MBM_TOTAL,	CPUID_EDX,  1, 0x0000000f, 1 },
	{ X86_FEATURE_CQM_MBM_LOCAL,	CPUID_EDX,  2, 0x0000000f, 1 },
	{ X86_FEATURE_CAT_L3,		CPUID_EBX,  1, 0x00000010, 0 },
	{ X86_FEATURE_CAT_L2,		CPUID_EBX,  2, 0x00000010, 0 },
	{ X86_FEATURE_CDP_L3,		CPUID_ECX,  2, 0x00000010, 1 },
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/syscore_ops.h>
#include <linux/suspend.h>
#include <linux/cpu.h>
#include <asm/msr.h>
#define UMWAIT_C02_ENABLE 0
#define UMWAIT_CTRL_VAL(max_time, c02_disable) \
(((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \
((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))
/*
* Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
* umwait max time is 100000 in TSC-quanta and C0.2 is enabled
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
/*
* Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
* the sysfs write functions.
*/
static DEFINE_MUTEX(umwait_lock);
static void umwait_update_control_msr(void * unused)
{
lockdep_assert_irqs_disabled();
wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
}
/*
* The CPU hotplug callback sets the control MSR to the global control
* value.
*
* Disable interrupts so the read of umwait_control_cached and the WRMSR
* are protected against a concurrent sysfs write. Otherwise the sysfs
* write could update the cached value after it had been read on this CPU
* and issue the IPI before the old value had been written. The IPI would
* interrupt, write the new value and after return from IPI the previous
* value would be written by this CPU.
*
* With interrupts disabled the upcoming CPU either sees the new control
* value or the IPI is updating this CPU to the new control value after
* interrupts have been reenabled.
*/
static int umwait_cpu_online(unsigned int cpu)
{
local_irq_disable();
umwait_update_control_msr(NULL);
local_irq_enable();
return 0;
}
/*
* On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
* is the only active CPU at this time. The MSR is set up on the APs via the
* CPU hotplug callback.
*
* This function is invoked on resume from suspend and hibernation. On
* resume from suspend the restore should be not required, but we neither
* trust the firmware nor does it matter if the same value is written
* again.
*/
static void umwait_syscore_resume(void)
{
umwait_update_control_msr(NULL);
}
static struct syscore_ops umwait_syscore_ops = {
.resume = umwait_syscore_resume,
};
/* sysfs interface */
/*
* When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
* Otherwise, C0.2 is enabled.
*/
static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
{
return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
}
static inline u32 umwait_ctrl_max_time(u32 ctrl)
{
return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
}
static inline void umwait_update_control(u32 maxtime, bool c02_enable)
{
u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
if (!c02_enable)
ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;
WRITE_ONCE(umwait_control_cached, ctrl);
/* Propagate to all CPUs */
on_each_cpu(umwait_update_control_msr, NULL, 1);
}
static ssize_t
enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
{
u32 ctrl = READ_ONCE(umwait_control_cached);
return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
}
static ssize_t enable_c02_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
bool c02_enable;
u32 ctrl;
int ret;
ret = kstrtobool(buf, &c02_enable);
if (ret)
return ret;
mutex_lock(&umwait_lock);
ctrl = READ_ONCE(umwait_control_cached);
if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
umwait_update_control(ctrl, c02_enable);
mutex_unlock(&umwait_lock);
return count;
}
static DEVICE_ATTR_RW(enable_c02);
static ssize_t
max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
{
u32 ctrl = READ_ONCE(umwait_control_cached);
return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
}
static ssize_t max_time_store(struct device *kobj,
struct device_attribute *attr,
const char *buf, size_t count)
{
u32 max_time, ctrl;
int ret;
ret = kstrtou32(buf, 0, &max_time);
if (ret)
return ret;
/* bits[1:0] must be zero */
if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
return -EINVAL;
mutex_lock(&umwait_lock);
ctrl = READ_ONCE(umwait_control_cached);
if (max_time != umwait_ctrl_max_time(ctrl))
umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));
mutex_unlock(&umwait_lock);
return count;
}
static DEVICE_ATTR_RW(max_time);
static struct attribute *umwait_attrs[] = {
&dev_attr_enable_c02.attr,
&dev_attr_max_time.attr,
NULL
};
static struct attribute_group umwait_attr_group = {
.attrs = umwait_attrs,
.name = "umwait_control",
};
static int __init umwait_init(void)
{
struct device *dev;
int ret;
if (!boot_cpu_has(X86_FEATURE_WAITPKG))
return -ENODEV;
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
umwait_cpu_online, NULL);
register_syscore_ops(&umwait_syscore_ops);
/*
* Add umwait control interface. Ignore failure, so at least the
* default values are set up in case the machine manages to boot.
*/
dev = cpu_subsys.dev_root;
return sysfs_create_group(&dev->kobj, &umwait_attr_group);
}
device_initcall(umwait_init);
// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <asm/cpufeature.h>
#include "cpu.h"
#define MSR_ZHAOXIN_FCR57 0x00001257
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */
#define RNG_PRESENT (1 << 2)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
{
u32 lo, hi;
/* Test for Extended Feature Flags presence */
if (cpuid_eax(0xC0000000) >= 0xC0000001) {
u32 tmp = cpuid_edx(0xC0000001);
/* Enable ACE unit, if present and disabled */
if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
/* Enable ACE unit */
lo |= ACE_FCR;
wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
pr_info("CPU: Enabled ACE h/w crypto\n");
}
/* Enable RNG unit, if present and disabled */
if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
/* Enable RNG unit */
lo |= RNG_ENABLE;
wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
pr_info("CPU: Enabled h/w RNG\n");
}
/*
* Store Extended Feature Flags as word 5 of the CPU
* capability bit array
*/
c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
}
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
cpu_detect_cache_sizes(c);
}
static void early_init_zhaoxin(struct cpuinfo_x86 *c)
{
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
if (c->cpuid_level >= 0x00000001) {
u32 eax, ebx, ecx, edx;
cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
/*
* If HTT (EDX[28]) is set EBX[16:23] contain the number of
* apicids which are reserved per package. Store the resulting
* shift value for the package management code.
*/
if (edx & (1U << 28))
c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
}
}
static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
msr_ctl = vmx_msr_high | vmx_msr_low;
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
set_cpu_cap(c, X86_FEATURE_VNMI);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
vmx_msr_low, vmx_msr_high);
msr_ctl2 = vmx_msr_high | vmx_msr_low;
if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
(msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
set_cpu_cap(c, X86_FEATURE_EPT);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
set_cpu_cap(c, X86_FEATURE_VPID);
}
}
static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
init_intel_cacheinfo(c);
detect_num_cpu_cores(c);
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
if (c->cpuid_level > 9) {
unsigned int eax = cpuid_eax(10);
/*
* Check for version and the number of counters
* Version(eax[7:0]) can't be 0;
* Counters(eax[15:8]) should be greater than 1;
*/
if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
if (c->x86 >= 0x6)
init_zhaoxin_cap(c);
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
if (cpu_has(c, X86_FEATURE_VMX))
zhaoxin_detect_vmx_virtcap(c);
}
#ifdef CONFIG_X86_32
static unsigned int
zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
return size;
}
#endif
static const struct cpu_dev zhaoxin_cpu_dev = {
.c_vendor = "zhaoxin",
.c_ident = { " Shanghai " },
.c_early_init = early_init_zhaoxin,
.c_init = init_zhaoxin,
#ifdef CONFIG_X86_32
.legacy_cache_size = zhaoxin_size_cache,
#endif
.c_x86_vendor = X86_VENDOR_ZHAOXIN,
};
cpu_dev_register(zhaoxin_cpu_dev);
...@@ -397,22 +397,12 @@ static int putreg(struct task_struct *child,
	case offsetof(struct user_regs_struct,fs_base):
		if (value >= TASK_SIZE_MAX)
			return -EIO;
		/*
		 * When changing the FS base, use do_arch_prctl_64()
		 * to set the index to zero and to set the base
		 * as requested.
		 */
		if (child->thread.fsbase != value)
			return do_arch_prctl_64(child, ARCH_SET_FS, value);
		x86_fsbase_write_task(child, value);
		return 0;
	case offsetof(struct user_regs_struct,gs_base):
		/*
		 * Exactly the same here as the %fs handling above.
		 */
		if (value >= TASK_SIZE_MAX)
			return -EIO;
		if (child->thread.gsbase != value)
			return do_arch_prctl_64(child, ARCH_SET_GS, value);
		x86_gsbase_write_task(child, value);
		return 0;
#endif
	}
......
...@@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = {
	[CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
	[CPUID_7_0_EBX]       = {         7, 0, CPUID_EBX},
	[CPUID_D_1_EAX]       = {       0xd, 1, CPUID_EAX},
	[CPUID_F_0_EDX]       = {       0xf, 0, CPUID_EDX},
	[CPUID_F_1_EDX]       = {       0xf, 1, CPUID_EDX},
	[CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
	[CPUID_6_EAX]         = {         6, 0, CPUID_EAX},
	[CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
......
...@@ -64,6 +64,7 @@ static void power_saving_mwait_init(void)
	case X86_VENDOR_HYGON:
	case X86_VENDOR_AMD:
	case X86_VENDOR_INTEL:
	case X86_VENDOR_ZHAOXIN:
		/*
		 * AMD Fam10h TSC will tick in all
		 * C/P/S0/S1 states when this bit is set.
......
...@@ -196,6 +196,7 @@ static void tsc_check_state(int state)
	case X86_VENDOR_AMD:
	case X86_VENDOR_INTEL:
	case X86_VENDOR_CENTAUR:
	case X86_VENDOR_ZHAOXIN:
		/*
		 * AMD Fam10h TSC will tick in all
		 * C/P/S0/S1 states when this bit is set.
......
...@@ -12,8 +12,9 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
			check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
			protection_keys test_vdso test_vsyscall mov_ss_trap
			protection_keys test_vdso test_vsyscall mov_ss_trap \
			syscall_arg_fault
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
			test_FCMOV test_FCOMI test_FISTTP \
			vdso_restorer
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
......
...@@ -23,6 +23,10 @@
#include <pthread.h>
#include <asm/ldt.h>
#include <sys/mman.h>
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <setjmp.h>

#ifndef __x86_64__
# error This test is 64-bit only

...@@ -31,6 +35,8 @@
static volatile sig_atomic_t want_segv;
static volatile unsigned long segv_addr;

static unsigned short *shared_scratch;

static int nerrs;

static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),

...@@ -71,6 +77,43 @@ static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
}
static jmp_buf jmpbuf;
static void sigill(int sig, siginfo_t *si, void *ctx_void)
{
siglongjmp(jmpbuf, 1);
}
static bool have_fsgsbase;
static inline unsigned long rdgsbase(void)
{
unsigned long gsbase;
asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
return gsbase;
}
static inline unsigned long rdfsbase(void)
{
unsigned long fsbase;
asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
return fsbase;
}
static inline void wrgsbase(unsigned long gsbase)
{
asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
}
static inline void wrfsbase(unsigned long fsbase)
{
asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
}
enum which_base { FS, GS };

static unsigned long read_base(enum which_base which)

...@@ -199,16 +242,13 @@ static void do_remote_base()
	       to_set, hard_zero ? " and clear gs" : "", sel);
}
void do_unexpected_base(void)
{
	/*
	 * The goal here is to try to arrange for GS == 0, GSBASE !=
	 * 0, and for the the kernel the think that GSBASE == 0.
	 *
	 * To make the test as reliable as possible, this uses
	 * explicit descriptorss.  (This is not the only way.  This
	 * could use ARCH_SET_GS with a low, nonzero base, but the
	 * relevant side effect of ARCH_SET_GS could change.)
	 */

static __thread int set_thread_area_entry_number = -1;

static unsigned short load_gs(void)
{
	/*
	 * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think
	 * that GSBASE == 0 (i.e. thread.gsbase == 0).
	 */

	/* Step 1: tell the kernel that we have GSBASE == 0. */

...@@ -228,8 +268,9 @@ void do_unexpected_base(void)
		.useable	 = 0
	};
	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
		printf("\tother thread: using LDT slot 0\n");
		printf("\tusing LDT slot 0\n");
		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
		return 0x7;
	} else {
		/* No modify_ldt for us (configured out, perhaps) */

...@@ -239,7 +280,7 @@ void do_unexpected_base(void)
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
		memcpy(low_desc, &desc, sizeof(desc));

		low_desc->entry_number = -1;
		low_desc->entry_number = set_thread_area_entry_number;

		/* 32-bit set_thread_area */
		long ret;

...@@ -251,18 +292,43 @@ void do_unexpected_base(void)
		if (ret != 0) {
			printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
			return;
			return 0;
		}

		printf("\tother thread: using GDT slot %d\n", desc.entry_number);
		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
		printf("\tusing GDT slot %d\n", desc.entry_number);
		set_thread_area_entry_number = desc.entry_number;

		unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3);
		asm volatile ("mov %0, %%gs" : : "rm" (gs));
		return gs;
	}

	/*
	 * Step 3: set the selector back to zero.  On AMD chips, this will
	 * preserve GSBASE.
	 */
	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
}
}

void test_wrbase(unsigned short index, unsigned long base)
{
	unsigned short newindex;
	unsigned long newbase;

	printf("[RUN]\tGS = 0x%hx, GSBASE = 0x%lx\n", index, base);

	asm volatile ("mov %0, %%gs" : : "rm" (index));
	wrgsbase(base);

	remote_base = 0;
	ftx = 1;
	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
	while (ftx != 0)
		syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);

	asm volatile ("mov %%gs, %0" : "=rm" (newindex));
	newbase = rdgsbase();

	if (newindex == index && newbase == base) {
		printf("[OK]\tIndex and base were preserved\n");
	} else {
		printf("[FAIL]\tAfter switch, GS = 0x%hx and GSBASE = 0x%lx\n",
		       newindex, newbase);
		nerrs++;
	}
}
static void *threadproc(void *ctx)

...@@ -273,12 +339,19 @@ static void *threadproc(void *ctx)
		if (ftx == 3)
			return NULL;

		if (ftx == 1)
			do_remote_base();
		else if (ftx == 2)
			do_unexpected_base();
		else
			errx(1, "helper thread got bad command");

		if (ftx == 1) {
			do_remote_base();
		} else if (ftx == 2) {
			/*
			 * On AMD chips, this causes GSBASE != 0, GS == 0, and
			 * thread.gsbase == 0.
			 */
			load_gs();
			asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
		} else {
			errx(1, "helper thread got bad command");
		}

		ftx = 0;
		syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);

...@@ -367,10 +440,99 @@ static void test_unexpected_base(void)
	}
}
#define USER_REGS_OFFSET(r) offsetof(struct user_regs_struct, r)
static void test_ptrace_write_gsbase(void)
{
int status;
pid_t child = fork();
if (child < 0)
err(1, "fork");
if (child == 0) {
printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n");
*shared_scratch = load_gs();
if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
err(1, "PTRACE_TRACEME");
raise(SIGTRAP);
_exit(0);
}
wait(&status);
if (WSTOPSIG(status) == SIGTRAP) {
unsigned long gs, base;
unsigned long gs_offset = USER_REGS_OFFSET(gs);
unsigned long base_offset = USER_REGS_OFFSET(gs_base);
gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
if (gs != *shared_scratch) {
nerrs++;
printf("[FAIL]\tGS is not prepared with nonzero\n");
goto END;
}
if (ptrace(PTRACE_POKEUSER, child, base_offset, 0xFF) != 0)
err(1, "PTRACE_POKEUSER");
gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
/*
* In a non-FSGSBASE system, the nonzero selector will load
* GSBASE (again). But what is tested here is whether the
* selector value is changed or not by the GSBASE write in
* a ptracer.
*/
if (gs != *shared_scratch) {
nerrs++;
printf("[FAIL]\tGS changed to %lx\n", gs);
/*
* On older kernels, poking a nonzero value into the
* base would zero the selector. On newer kernels,
* this behavior has changed -- poking the base
* changes only the base and, if FSGSBASE is not
* available, this may have no effect.
*/
if (gs == 0)
printf("\tNote: this is expected behavior on older kernels.\n");
} else if (have_fsgsbase && (base != 0xFF)) {
nerrs++;
printf("[FAIL]\tGSBASE changed to %lx\n", base);
} else {
printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : "");
printf("\n");
}
}
END:
ptrace(PTRACE_CONT, child, NULL, NULL);
}
int main()
{
	pthread_t thread;
shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_SHARED, -1, 0);
/* Probe FSGSBASE */
sethandler(SIGILL, sigill, 0);
if (sigsetjmp(jmpbuf, 1) == 0) {
rdfsbase();
have_fsgsbase = true;
printf("\tFSGSBASE instructions are enabled\n");
} else {
printf("\tFSGSBASE instructions are disabled\n");
}
clearhandler(SIGILL);
	sethandler(SIGSEGV, sigsegv, 0);

	check_gs_value(0);

...@@ -417,11 +579,28 @@ int main()
	test_unexpected_base();
if (have_fsgsbase) {
unsigned short ss;
asm volatile ("mov %%ss, %0" : "=rm" (ss));
test_wrbase(0, 0);
test_wrbase(0, 1);
test_wrbase(0, 0x200000000);
test_wrbase(0, 0xffffffffffffffff);
test_wrbase(ss, 0);
test_wrbase(ss, 1);
test_wrbase(ss, 0x200000000);
test_wrbase(ss, 0xffffffffffffffff);
}
	ftx = 3;  /* Kill the thread. */
	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);

	if (pthread_join(thread, NULL) != 0)
		err(1, "pthread_join");

	test_ptrace_write_gsbase();

	return nerrs == 0 ? 0 : 1;
}
...@@ -15,9 +15,30 @@
#include <setjmp.h>
#include <errno.h>
#ifdef __x86_64__
# define WIDTH "q"
#else
# define WIDTH "l"
#endif
/* Our sigaltstack scratch space. */
static unsigned char altstack_data[SIGSTKSZ];
static unsigned long get_eflags(void)
{
unsigned long eflags;
asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
return eflags;
}
static void set_eflags(unsigned long eflags)
{
asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
: : "rm" (eflags) : "flags");
}
#define X86_EFLAGS_TF (1UL << 8)
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
		       int flags)
{

...@@ -35,13 +56,22 @@ static sigjmp_buf jmpbuf;
static volatile sig_atomic_t n_errs;
#ifdef __x86_64__
#define REG_AX REG_RAX
#define REG_IP REG_RIP
#else
#define REG_AX REG_EAX
#define REG_IP REG_EIP
#endif
static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
{
	ucontext_t *ctx = (ucontext_t*)ctx_void;
	long ax = (long)ctx->uc_mcontext.gregs[REG_AX];

	if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
		printf("[FAIL]\tAX had the wrong value: 0x%x\n",
		       ctx->uc_mcontext.gregs[REG_EAX]);
	if (ax != -EFAULT && ax != -ENOSYS) {
		printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
		       (unsigned long)ax);
		n_errs++;
	} else {
		printf("[OK]\tSeems okay\n");

...@@ -50,9 +80,42 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
	siglongjmp(jmpbuf, 1);
}
static volatile sig_atomic_t sigtrap_consecutive_syscalls;
static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
{
/*
* KVM has some bugs that can cause us to stop making progress.
* detect them and complain, but don't infinite loop or fail the
* test.
*/
ucontext_t *ctx = (ucontext_t*)ctx_void;
unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
if (*ip == 0x340f || *ip == 0x050f) {
/* The trap was on SYSCALL or SYSENTER */
sigtrap_consecutive_syscalls++;
if (sigtrap_consecutive_syscalls > 3) {
printf("[WARN]\tGot stuck single-stepping -- you probably have a KVM bug\n");
siglongjmp(jmpbuf, 1);
}
} else {
sigtrap_consecutive_syscalls = 0;
}
}
static void sigill(int sig, siginfo_t *info, void *ctx_void)
{
	ucontext_t *ctx = (ucontext_t*)ctx_void;
	unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];

	if (*ip == 0x0b0f) {
		/* one of the ud2 instructions faulted */
		printf("[OK]\tSYSCALL returned normally\n");
	} else {
		printf("[SKIP]\tIllegal instruction\n");
	}
	siglongjmp(jmpbuf, 1);
}
...@@ -120,9 +183,48 @@ int main()
		      "movl $-1, %%ebp\n\t"
		      "movl $-1, %%esp\n\t"
		      "syscall\n\t"
		      "pushl $0" /* make sure we segfault cleanly */
		      "ud2" /* make sure we recover cleanly */
: : : "memory", "flags");
}
printf("[RUN]\tSYSENTER with TF and invalid state\n");
sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
if (sigsetjmp(jmpbuf, 1) == 0) {
sigtrap_consecutive_syscalls = 0;
set_eflags(get_eflags() | X86_EFLAGS_TF);
asm volatile (
"movl $-1, %%eax\n\t"
"movl $-1, %%ebx\n\t"
"movl $-1, %%ecx\n\t"
"movl $-1, %%edx\n\t"
"movl $-1, %%esi\n\t"
"movl $-1, %%edi\n\t"
"movl $-1, %%ebp\n\t"
"movl $-1, %%esp\n\t"
"sysenter"
: : : "memory", "flags");
}
set_eflags(get_eflags() & ~X86_EFLAGS_TF);
printf("[RUN]\tSYSCALL with TF and invalid state\n");
if (sigsetjmp(jmpbuf, 1) == 0) {
sigtrap_consecutive_syscalls = 0;
set_eflags(get_eflags() | X86_EFLAGS_TF);
asm volatile (
"movl $-1, %%eax\n\t"
"movl $-1, %%ebx\n\t"
"movl $-1, %%ecx\n\t"
"movl $-1, %%edx\n\t"
"movl $-1, %%esi\n\t"
"movl $-1, %%edi\n\t"
"movl $-1, %%ebp\n\t"
"movl $-1, %%esp\n\t"
"syscall\n\t"
"ud2" /* make sure we recover cleanly */
		      : : : "memory", "flags");
	}

	set_eflags(get_eflags() & ~X86_EFLAGS_TF);

	return 0;
}