Commit a35747c3 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Plug a race in the stage-2 mapping code where the IPA and the PA
     would end up being out of sync

   - Make better use of the bitmap API (bitmap_zero, bitmap_zalloc...)

   - FP/SVE/SME documentation update, in the hope that this field
     becomes clearer...

   - Add workaround for Apple SEIS brokenness to a new SoC

   - Random comment fixes

  x86:

   - add MSR_IA32_TSX_CTRL into msrs_to_save

   - fixes for XCR0 handling in SGX enclaves

  Generic:

   - Fix vcpu_array[0] races

   - Fix race between starting a VM and 'reboot -f'"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: VMX: add MSR_IA32_TSX_CTRL into msrs_to_save
  KVM: x86: Don't adjust guest's CPUID.0x12.1 (allowed SGX enclave XFRM)
  KVM: VMX: Don't rely _only_ on CPUID to enforce XCR0 restrictions for ECREATE
  KVM: Fix vcpu_array[0] races
  KVM: VMX: Fix header file dependency of asm/vmx.h
  KVM: Don't enable hardware after a restart/shutdown is initiated
  KVM: Use syscore_ops instead of reboot_notifier to hook restart/shutdown
  KVM: arm64: vgic: Add Apple M2 PRO/MAX cpus to the list of broken SEIS implementations
  KVM: arm64: Clarify host SME state management
  KVM: arm64: Restructure check for SVE support in FP trap handler
  KVM: arm64: Document check for TIF_FOREIGN_FPSTATE
  KVM: arm64: Fix repeated words in comments
  KVM: arm64: Constify start/end/phys fields of the pgtable walker data
  KVM: arm64: Infer PA offset from VA in hyp map walker
  KVM: arm64: Infer the PA offset from IPA in stage-2 map walker
  KVM: arm64: Use the bitmap API to allocate bitmaps
  KVM: arm64: Slightly optimize flush_context()
parents c47d122c b9846a69
......@@ -126,6 +126,10 @@
#define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029
#define APPLE_CPU_PART_M2_BLIZZARD 0x032
#define APPLE_CPU_PART_M2_AVALANCHE 0x033
#define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034
#define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035
#define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038
#define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039
#define AMPERE_CPU_PART_AMPERE1 0xAC3
......@@ -181,6 +185,10 @@
#define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX)
#define MIDR_APPLE_M2_BLIZZARD MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD)
#define MIDR_APPLE_M2_AVALANCHE MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE)
#define MIDR_APPLE_M2_BLIZZARD_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_PRO)
#define MIDR_APPLE_M2_AVALANCHE_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_PRO)
#define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX)
#define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX)
#define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1)
/* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */
......
......@@ -209,6 +209,7 @@ struct kvm_pgtable_visit_ctx {
kvm_pte_t old;
void *arg;
struct kvm_pgtable_mm_ops *mm_ops;
u64 start;
u64 addr;
u64 end;
u32 level;
......
......@@ -81,26 +81,34 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
fpsimd_kvm_prepare();
/*
* We will check TIF_FOREIGN_FPSTATE just before entering the
* guest in kvm_arch_vcpu_ctxflush_fp() and override this to
* FP_STATE_FREE if the flag set.
*/
vcpu->arch.fp_state = FP_STATE_HOST_OWNED;
vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
vcpu_set_flag(vcpu, HOST_SVE_ENABLED);
/*
* We don't currently support SME guests but if we leave
* things in streaming mode then when the guest starts running
* FPSIMD or SVE code it may generate SME traps so as a
* special case if we are in streaming mode we force the host
* state to be saved now and exit streaming mode so that we
* don't have to handle any SME traps for valid guest
* operations. Do this for ZA as well for now for simplicity.
*/
if (system_supports_sme()) {
vcpu_clear_flag(vcpu, HOST_SME_ENABLED);
if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
vcpu_set_flag(vcpu, HOST_SME_ENABLED);
/*
* If PSTATE.SM is enabled then save any pending FP
* state and disable PSTATE.SM. If we leave PSTATE.SM
* enabled and the guest does not enable SME via
* CPACR_EL1.SMEN then operations that should be valid
* may generate SME traps from EL1 to EL1 which we
* can't intercept and which would confuse the guest.
*
* Do the same for PSTATE.ZA in the case where there
* is state in the registers which has not already
* been saved, this is very unlikely to happen.
*/
if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) {
vcpu->arch.fp_state = FP_STATE_FREE;
fpsimd_save_and_flush_cpu_state();
......
......@@ -177,9 +177,17 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
sve_guest = vcpu_has_sve(vcpu);
esr_ec = kvm_vcpu_trap_get_class(vcpu);
/* Don't handle SVE traps for non-SVE vcpus here: */
if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD)
/* Only handle traps the vCPU can support here: */
switch (esr_ec) {
case ESR_ELx_EC_FP_ASIMD:
break;
case ESR_ELx_EC_SVE:
if (!sve_guest)
return false;
break;
default:
return false;
}
/* Valid trap. Switch the context: */
......
......@@ -58,8 +58,9 @@
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;
const u64 start;
u64 addr;
u64 end;
const u64 end;
};
static bool kvm_phys_is_valid(u64 phys)
......@@ -201,6 +202,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
.old = READ_ONCE(*ptep),
.arg = data->walker->arg,
.mm_ops = mm_ops,
.start = data->start,
.addr = data->addr,
.end = data->end,
.level = level,
......@@ -293,6 +295,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker)
{
struct kvm_pgtable_walk_data walk_data = {
.start = ALIGN_DOWN(addr, PAGE_SIZE),
.addr = ALIGN_DOWN(addr, PAGE_SIZE),
.end = PAGE_ALIGN(walk_data.addr + size),
.walker = walker,
......@@ -349,7 +352,7 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
}
struct hyp_map_data {
u64 phys;
const u64 phys;
kvm_pte_t attr;
};
......@@ -407,13 +410,12 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct hyp_map_data *data)
{
u64 phys = data->phys + (ctx->addr - ctx->start);
kvm_pte_t new;
u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
if (!kvm_block_mapping_supported(ctx, phys))
return false;
data->phys += granule;
new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
if (ctx->old == new)
return true;
......@@ -576,7 +578,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
}
struct stage2_map_data {
u64 phys;
const u64 phys;
kvm_pte_t attr;
u8 owner_id;
......@@ -794,20 +796,43 @@ static bool stage2_pte_executable(kvm_pte_t pte)
return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
const struct stage2_map_data *data)
{
u64 phys = data->phys;
/*
* Stage-2 walks to update ownership data are communicated to the map
* walker using an invalid PA. Avoid offsetting an already invalid PA,
* which could overflow and make the address valid again.
*/
if (!kvm_phys_is_valid(phys))
return phys;
/*
* Otherwise, work out the correct PA based on how far the walk has
* gotten.
*/
return phys + (ctx->addr - ctx->start);
}
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
u64 phys = stage2_map_walker_phys_addr(ctx, data);
if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
return false;
return kvm_block_mapping_supported(ctx, data->phys);
return kvm_block_mapping_supported(ctx, phys);
}
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
struct stage2_map_data *data)
{
kvm_pte_t new;
u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
u64 phys = stage2_map_walker_phys_addr(ctx, data);
u64 granule = kvm_granule_size(ctx->level);
struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
......@@ -841,8 +866,6 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
stage2_make_pte(ctx, new);
if (kvm_phys_is_valid(phys))
data->phys += granule;
return 0;
}
......
......@@ -204,7 +204,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
* Size Fault at level 0, as if exceeding PARange.
*
* Non-LPAE guests will only get the external abort, as there
* is no way to to describe the ASF.
* is no way to describe the ASF.
*/
if (vcpu_el1_is_32bit(vcpu) &&
!(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE))
......
......@@ -616,6 +616,10 @@ static const struct midr_range broken_seis[] = {
MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
{},
};
......
......@@ -47,7 +47,7 @@ static void flush_context(void)
int cpu;
u64 vmid;
bitmap_clear(vmid_map, 0, NUM_USER_VMIDS);
bitmap_zero(vmid_map, NUM_USER_VMIDS);
for_each_possible_cpu(cpu) {
vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0);
......@@ -182,8 +182,7 @@ int __init kvm_arm_vmid_alloc_init(void)
*/
WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus());
atomic64_set(&vmid_generation, VMID_FIRST_VERSION);
vmid_map = kcalloc(BITS_TO_LONGS(NUM_USER_VMIDS),
sizeof(*vmid_map), GFP_KERNEL);
vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL);
if (!vmid_map)
return -ENOMEM;
......@@ -192,5 +191,5 @@ int __init kvm_arm_vmid_alloc_init(void)
void __init kvm_arm_vmid_alloc_free(void)
{
kfree(vmid_map);
bitmap_free(vmid_map);
}
......@@ -13,7 +13,9 @@
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/types.h>
#include <uapi/asm/vmx.h>
#include <asm/vmxfeatures.h>
......
......@@ -253,7 +253,6 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
int nent)
{
struct kvm_cpuid_entry2 *best;
u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (best) {
......@@ -292,21 +291,6 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
/*
* Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
* the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
* requested XCR0 value. The enclave's XFRM must be a subset of XCRO
* at the time of EENTER, thus adjust the allowed XFRM by the guest's
* supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
* '1' even on CPUs that don't support XSAVE.
*/
best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
if (best) {
best->ecx &= guest_supported_xcr0 & 0xffffffff;
best->edx &= guest_supported_xcr0 >> 32;
best->ecx |= XFEATURE_MASK_FPSSE;
}
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
......
......@@ -170,12 +170,19 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
return 1;
}
/* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
/*
* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. Note
* that the allowed XFRM (XFeature Request Mask) isn't strictly bound
* by the supported XCR0. FP+SSE *must* be set in XFRM, even if XSAVE
* is unsupported, i.e. even if XCR0 itself is completely unsupported.
*/
if ((u32)miscselect & ~sgx_12_0->ebx ||
(u32)attributes & ~sgx_12_1->eax ||
(u32)(attributes >> 32) & ~sgx_12_1->ebx ||
(u32)xfrm & ~sgx_12_1->ecx ||
(u32)(xfrm >> 32) & ~sgx_12_1->edx) {
(u32)(xfrm >> 32) & ~sgx_12_1->edx ||
xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) ||
(xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
kvm_inject_gp(vcpu, 0);
return 1;
}
......
......@@ -1446,7 +1446,7 @@ static const u32 msrs_to_save_base[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
MSR_IA32_SPEC_CTRL,
MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
......@@ -7155,6 +7155,10 @@ static void kvm_probe_msr_to_save(u32 msr_index)
if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
return;
break;
case MSR_IA32_TSX_CTRL:
if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
return;
break;
default:
break;
}
......
......@@ -3962,18 +3962,19 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
}
vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
BUG_ON(r == -EBUSY);
r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
if (r)
goto unlock_vcpu_destroy;
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
if (r < 0) {
xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
kvm_put_kvm_no_destroy(kvm);
goto unlock_vcpu_destroy;
if (r < 0)
goto kvm_put_xa_release;
if (KVM_BUG_ON(!!xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
r = -EINVAL;
goto kvm_put_xa_release;
}
/*
......@@ -3988,6 +3989,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
kvm_create_vcpu_debugfs(vcpu);
return r;
kvm_put_xa_release:
kvm_put_kvm_no_destroy(kvm);
xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
kvm_dirty_ring_free(&vcpu->dirty_ring);
......@@ -5184,7 +5188,20 @@ static void hardware_disable_all(void)
static int hardware_enable_all(void)
{
atomic_t failed = ATOMIC_INIT(0);
int r = 0;
int r;
/*
* Do not enable hardware virtualization if the system is going down.
* If userspace initiated a forced reboot, e.g. reboot -f, then it's
* possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
* after kvm_reboot() is called. Note, this relies on system_state
* being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
* hook instead of registering a dedicated reboot notifier (the latter
* runs before system_state is updated).
*/
if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
system_state == SYSTEM_RESTART)
return -EBUSY;
/*
* When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
......@@ -5197,6 +5214,8 @@ static int hardware_enable_all(void)
cpus_read_lock();
mutex_lock(&kvm_lock);
r = 0;
kvm_usage_count++;
if (kvm_usage_count == 1) {
on_each_cpu(hardware_enable_nolock, &failed, 1);
......@@ -5213,26 +5232,24 @@ static int hardware_enable_all(void)
return r;
}
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
void *v)
static void kvm_shutdown(void)
{
/*
* Some (well, at least mine) BIOSes hang on reboot if
* in vmx root mode.
*
* And Intel TXT required VMX off for all cpu when system shutdown.
* Disable hardware virtualization and set kvm_rebooting to indicate
* that KVM has asynchronously disabled hardware virtualization, i.e.
* that relevant errors and exceptions aren't entirely unexpected.
* Some flavors of hardware virtualization need to be disabled before
* transferring control to firmware (to perform shutdown/reboot), e.g.
* on x86, virtualization can block INIT interrupts, which are used by
* firmware to pull APs back under firmware control. Note, this path
* is used for both shutdown and reboot scenarios, i.e. neither name is
* 100% comprehensive.
*/
pr_info("kvm: exiting hardware virtualization\n");
kvm_rebooting = true;
on_each_cpu(hardware_disable_nolock, NULL, 1);
return NOTIFY_OK;
}
static struct notifier_block kvm_reboot_notifier = {
.notifier_call = kvm_reboot,
.priority = 0,
};
static int kvm_suspend(void)
{
/*
......@@ -5263,6 +5280,7 @@ static void kvm_resume(void)
static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
.shutdown = kvm_shutdown,
};
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
static int hardware_enable_all(void)
......@@ -5967,7 +5985,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (r)
return r;
register_reboot_notifier(&kvm_reboot_notifier);
register_syscore_ops(&kvm_syscore_ops);
#endif
......@@ -6039,7 +6056,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
err_vcpu_cache:
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
return r;
......@@ -6065,7 +6081,6 @@ void kvm_exit(void)
kvm_async_pf_deinit();
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
kvm_irqfd_exit();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment