Commit 1b21c8db authored by Paolo Bonzini's avatar Paolo Bonzini

Merge tag 'kvmarm-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 updates for Linux 5.10

- New page table code for both hypervisor and guest stage-2
- Introduction of a new EL2-private host context
- Allow EL2 to have its own private per-CPU variables
- Support of PMU event filtering
- Complete rework of the Spectre mitigation
parents 628ade2d 4e5dc64c
......@@ -25,8 +25,10 @@ Returns:
======= ========================================================
-EBUSY The PMU overflow interrupt is already set
-ENXIO The overflow interrupt not set when attempting to get it
-ENODEV PMUv3 not supported
-EFAULT Error reading interrupt number
-ENXIO PMUv3 not supported or the overflow interrupt not set
when attempting to get it
-ENODEV KVM_ARM_VCPU_PMU_V3 feature missing from VCPU
-EINVAL Invalid PMU overflow interrupt number supplied or
trying to set the IRQ number without using an in-kernel
irqchip.
......@@ -45,9 +47,10 @@ all vcpus, while as an SPI it must be a separate number per vcpu.
Returns:
======= ======================================================
-EEXIST Interrupt number already used
-ENODEV PMUv3 not supported or GIC not initialized
-ENXIO PMUv3 not properly configured or in-kernel irqchip not
configured as required prior to calling this attribute
-ENXIO PMUv3 not supported, missing VCPU feature or interrupt
number not set
-EBUSY PMUv3 already initialized
======= ======================================================
......@@ -55,6 +58,52 @@ Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel
virtual GIC implementation, this must be done after initializing the in-kernel
irqchip.
1.3 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_FILTER
-----------------------------------------
:Parameters: in kvm_device_attr.addr the address for a PMU event filter is a
pointer to a struct kvm_pmu_event_filter
:Returns:
======= ======================================================
-ENODEV PMUv3 not supported or GIC not initialized
-ENXIO PMUv3 not properly configured or in-kernel irqchip not
configured as required prior to calling this attribute
-EBUSY PMUv3 already initialized
-EINVAL Invalid filter range
======= ======================================================
Request the installation of a PMU event filter described as follows::
struct kvm_pmu_event_filter {
__u16 base_event;
__u16 nevents;
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
__u8 action;
__u8 pad[3];
};
A filter range is defined as the range [@base_event, @base_event + @nevents),
together with an @action (KVM_PMU_EVENT_ALLOW or KVM_PMU_EVENT_DENY). The
first registered range defines the global policy (global ALLOW if the first
@action is DENY, global DENY if the first @action is ALLOW). Multiple ranges
can be programmed, and must fit within the event space defined by the PMU
architecture (10 bits on ARMv8.0, 16 bits from ARMv8.1 onwards).
Note: "Cancelling" a filter by registering the opposite action for the same
range doesn't change the default action. For example, installing an ALLOW
filter for event range [0:10) as the first filter and then applying a DENY
action for the same range will leave the whole range as disabled.
Restrictions: Event 0 (SW_INCR) is never filtered, as it doesn't count a
hardware event. Filtering event 0x1E (CHAIN) has no effect either, as it
isn't strictly speaking an event. Filtering the cycle counter is possible
using event 0x11 (CPU_CYCLES).
2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
=================================
......
......@@ -1165,32 +1165,6 @@ config UNMAP_KERNEL_AT_EL0
If unsure, say Y.
config HARDEN_BRANCH_PREDICTOR
bool "Harden the branch predictor against aliasing attacks" if EXPERT
default y
help
Speculation attacks against some high-performance processors rely on
being able to manipulate the branch predictor for a victim context by
executing aliasing branches in the attacker context. Such attacks
can be partially mitigated against by clearing internal branch
predictor state and limiting the prediction logic in some situations.
This config option will take CPU-specific actions to harden the
branch predictor against aliasing attacks and may rely on specific
instruction sequences or control bits being set by the system
firmware.
If unsure, say Y.
config ARM64_SSBD
bool "Speculative Store Bypass Disable" if EXPERT
default y
help
This enables mitigation of the bypassing of previous stores
by speculative loads.
If unsure, say Y.
config RODATA_FULL_DEFAULT_ENABLED
bool "Apply r/o permissions of VM areas also to their linear aliases"
default y
......
......@@ -218,6 +218,23 @@ lr .req x30 // link register
str \src, [\tmp, :lo12:\sym]
.endm
/*
* @dst: destination register
*/
#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
.macro this_cpu_offset, dst
mrs \dst, tpidr_el2
.endm
#else
.macro this_cpu_offset, dst
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
mrs \dst, tpidr_el1
alternative_else
mrs \dst, tpidr_el2
alternative_endif
.endm
#endif
/*
* @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP)
* @sym: The name of the per-cpu variable
......@@ -226,11 +243,7 @@ lr .req x30 // link register
.macro adr_this_cpu, dst, sym, tmp
adrp \tmp, \sym
add \dst, \tmp, #:lo12:\sym
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
mrs \tmp, tpidr_el1
alternative_else
mrs \tmp, tpidr_el2
alternative_endif
this_cpu_offset \tmp
add \dst, \dst, \tmp
.endm
......@@ -241,11 +254,7 @@ alternative_endif
*/
.macro ldr_this_cpu dst, sym, tmp
adr_l \dst, \sym
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
mrs \tmp, tpidr_el1
alternative_else
mrs \tmp, tpidr_el2
alternative_endif
this_cpu_offset \tmp
ldr \dst, [\dst, \tmp]
.endm
......
......@@ -31,13 +31,13 @@
#define ARM64_HAS_DCPOP 21
#define ARM64_SVE 22
#define ARM64_UNMAP_KERNEL_AT_EL0 23
#define ARM64_HARDEN_BRANCH_PREDICTOR 24
#define ARM64_SPECTRE_V2 24
#define ARM64_HAS_RAS_EXTN 25
#define ARM64_WORKAROUND_843419 26
#define ARM64_HAS_CACHE_IDC 27
#define ARM64_HAS_CACHE_DIC 28
#define ARM64_HW_DBM 29
#define ARM64_SSBD 30
#define ARM64_SPECTRE_V4 30
#define ARM64_MISMATCHED_CACHE_TYPE 31
#define ARM64_HAS_STAGE2_FWB 32
#define ARM64_HAS_CRC32 33
......
......@@ -698,30 +698,6 @@ static inline bool system_supports_tlb_range(void)
cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
}
#define ARM64_BP_HARDEN_UNKNOWN -1
#define ARM64_BP_HARDEN_WA_NEEDED 0
#define ARM64_BP_HARDEN_NOT_REQUIRED 1
int get_spectre_v2_workaround_state(void);
#define ARM64_SSBD_UNKNOWN -1
#define ARM64_SSBD_FORCE_DISABLE 0
#define ARM64_SSBD_KERNEL 1
#define ARM64_SSBD_FORCE_ENABLE 2
#define ARM64_SSBD_MITIGATED 3
static inline int arm64_get_ssbd_state(void)
{
#ifdef CONFIG_ARM64_SSBD
extern int ssbd_state;
return ssbd_state;
#else
return ARM64_SSBD_UNKNOWN;
#endif
}
void arm64_set_ssbd_mitigation(bool state);
extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2020 Google LLC.
* Written by David Brazdil <dbrazdil@google.com>
*/
#ifndef __ARM64_HYP_IMAGE_H__
#define __ARM64_HYP_IMAGE_H__
/*
* KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
* to separate it from the kernel proper.
*/
#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym
#ifdef LINKER_SCRIPT
/*
* KVM nVHE ELF section names are prefixed with .hyp, to separate them
* from the kernel proper.
*/
#define HYP_SECTION_NAME(NAME) .hyp##NAME
/* Defines an ELF hyp section from input section @NAME and its subsections. */
#define HYP_SECTION(NAME) \
HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }
/*
* Defines a linker script alias of a kernel-proper symbol referenced by
* KVM nVHE hyp code.
*/
#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym;
#endif /* LINKER_SCRIPT */
#endif /* __ARM64_HYP_IMAGE_H__ */
......@@ -7,11 +7,9 @@
#ifndef __ARM_KVM_ASM_H__
#define __ARM_KVM_ASM_H__
#include <asm/hyp_image.h>
#include <asm/virt.h>
#define VCPU_WORKAROUND_2_FLAG_SHIFT 0
#define VCPU_WORKAROUND_2_FLAG (_AC(1, UL) << VCPU_WORKAROUND_2_FLAG_SHIFT)
#define ARM_EXIT_WITH_SERROR_BIT 31
#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
#define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP)
......@@ -38,17 +36,34 @@
#define __SMCCC_WORKAROUND_1_SMC_SZ 36
#define KVM_HOST_SMCCC_ID(id) \
ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
ARM_SMCCC_SMC_64, \
ARM_SMCCC_OWNER_VENDOR_HYP, \
(id))
#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name)
#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0
#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1
#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5
#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6
#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11
#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
#ifndef __ASSEMBLY__
#include <linux/mm.h>
/*
* Translate name of a symbol defined in nVHE hyp to the name seen
* by kernel proper. All nVHE symbols are prefixed by the build system
* to avoid clashes with the VHE variants.
*/
#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym
#define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
#define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[]
......@@ -60,10 +75,53 @@
DECLARE_KVM_VHE_SYM(sym); \
DECLARE_KVM_NVHE_SYM(sym)
#define DECLARE_KVM_VHE_PER_CPU(type, sym) \
DECLARE_PER_CPU(type, sym)
#define DECLARE_KVM_NVHE_PER_CPU(type, sym) \
DECLARE_PER_CPU(type, kvm_nvhe_sym(sym))
#define DECLARE_KVM_HYP_PER_CPU(type, sym) \
DECLARE_KVM_VHE_PER_CPU(type, sym); \
DECLARE_KVM_NVHE_PER_CPU(type, sym)
/*
* Compute pointer to a symbol defined in nVHE percpu region.
* Returns NULL if percpu memory has not been allocated yet.
*/
#define this_cpu_ptr_nvhe_sym(sym) per_cpu_ptr_nvhe_sym(sym, smp_processor_id())
#define per_cpu_ptr_nvhe_sym(sym, cpu) \
({ \
unsigned long base, off; \
base = kvm_arm_hyp_percpu_base[cpu]; \
off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \
(unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \
base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \
})
#if defined(__KVM_NVHE_HYPERVISOR__)
#define CHOOSE_NVHE_SYM(sym) sym
#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym)
/* The nVHE hypervisor shouldn't even try to access VHE symbols */
extern void *__nvhe_undefined_symbol;
#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol
#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol)
#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol)
#elif defined(__KVM_VHE_HYPERVISOR__)
#define CHOOSE_VHE_SYM(sym) sym
#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym)
/* The VHE hypervisor shouldn't even try to access nVHE symbols */
extern void *__vhe_undefined_symbol;
#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol
#define this_cpu_ptr_hyp_sym(sym) (&__vhe_undefined_symbol)
#define per_cpu_ptr_hyp_sym(sym, cpu) (&__vhe_undefined_symbol)
#else
#ifndef __KVM_NVHE_HYPERVISOR__
/*
* BIG FAT WARNINGS:
*
......@@ -75,12 +133,21 @@
* - Don't let the nVHE hypervisor have access to this, as it will
* pick the *wrong* symbol (yes, it runs at EL2...).
*/
#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \
#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \
? CHOOSE_VHE_SYM(sym) \
: CHOOSE_NVHE_SYM(sym))
#else
/* The nVHE hypervisor shouldn't even try to access anything */
extern void *__nvhe_undefined_symbol;
#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol
#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \
? this_cpu_ptr(&sym) \
: this_cpu_ptr_nvhe_sym(sym))
#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \
? per_cpu_ptr(&sym, cpu) \
: per_cpu_ptr_nvhe_sym(sym, cpu))
#define CHOOSE_VHE_SYM(sym) sym
#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym)
#endif
/* Translate a kernel address @ptr into its equivalent linear mapping */
......@@ -98,15 +165,19 @@ struct kvm_vcpu;
struct kvm_s2_mmu;
DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
#define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init)
#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
#define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector)
#ifdef CONFIG_KVM_INDIRECT_VECTORS
extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_SYM(__per_cpu_start);
DECLARE_KVM_NVHE_SYM(__per_cpu_end);
extern atomic_t arm64_el2_vector_last_slot;
DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
#define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
#endif
extern void __kvm_flush_vm_context(void);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
......@@ -149,26 +220,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
addr; \
})
/*
* Home-grown __this_cpu_{ptr,read} variants that always work at HYP,
* provided that sym is really a *symbol* and not a pointer obtained from
* a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps
* sparse quiet.
*/
#define __hyp_this_cpu_ptr(sym) \
({ \
void *__ptr; \
__verify_pcpu_ptr(&sym); \
__ptr = hyp_symbol_addr(sym); \
__ptr += read_sysreg(tpidr_el2); \
(typeof(sym) __kernel __force *)__ptr; \
})
#define __hyp_this_cpu_read(sym) \
({ \
*__hyp_this_cpu_ptr(sym); \
})
#define __KVM_EXTABLE(from, to) \
" .pushsection __kvm_ex_table, \"a\"\n" \
" .align 3\n" \
......@@ -199,20 +250,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
#else /* __ASSEMBLY__ */
.macro hyp_adr_this_cpu reg, sym, tmp
adr_l \reg, \sym
mrs \tmp, tpidr_el2
add \reg, \reg, \tmp
.endm
.macro hyp_ldr_this_cpu reg, sym, tmp
adr_l \reg, \sym
mrs \tmp, tpidr_el2
ldr \reg, [\reg, \tmp]
.endm
.macro get_host_ctxt reg, tmp
hyp_adr_this_cpu \reg, kvm_host_data, \tmp
adr_this_cpu \reg, kvm_host_data, \tmp
add \reg, \reg, #HOST_DATA_CONTEXT
.endm
......@@ -221,6 +260,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm
.macro get_loaded_vcpu vcpu, ctxt
adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu
ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm
.macro set_loaded_vcpu vcpu, ctxt, tmp
adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp
str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
.endm
/*
* KVM extable for unexpected exceptions.
* In the same format _asm_extable, but output to a different section so that
......@@ -236,6 +285,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
.popsection
.endm
#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
#define CPU_LR_OFFSET CPU_XREG_OFFSET(30)
#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8)
/*
* We treat x18 as callee-saved as the host may use it as a platform
* register (e.g. for shadow call stack).
*/
.macro save_callee_saved_regs ctxt
str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm
.macro restore_callee_saved_regs ctxt
// We require \ctxt is not x18-x28
ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm
.macro save_sp_el0 ctxt, tmp
mrs \tmp, sp_el0
str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
.endm
.macro restore_sp_el0 ctxt, tmp
ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
msr sp_el0, \tmp
.endm
#endif
#endif /* __ARM_KVM_ASM_H__ */
......@@ -391,20 +391,6 @@ static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
}
static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu)
{
return vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG;
}
static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu,
bool flag)
{
if (flag)
vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
else
vcpu->arch.workaround_flags &= ~VCPU_WORKAROUND_2_FLAG;
}
static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
{
if (vcpu_mode_is_32bit(vcpu)) {
......
......@@ -11,6 +11,7 @@
#ifndef __ARM64_KVM_HOST_H__
#define __ARM64_KVM_HOST_H__
#include <linux/arm-smccc.h>
#include <linux/bitmap.h>
#include <linux/types.h>
#include <linux/jump_label.h>
......@@ -79,8 +80,8 @@ struct kvm_s2_mmu {
* for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the
* canonical stage-2 page tables.
*/
pgd_t *pgd;
phys_addr_t pgd_phys;
struct kvm_pgtable *pgt;
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
......@@ -110,6 +111,13 @@ struct kvm_arch {
* supported.
*/
bool return_nisv_io_abort_to_user;
/*
* VM-wide PMU filter, implemented as a bitmap and big enough for
* up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
*/
unsigned long *pmu_filter;
unsigned int pmuver;
};
struct kvm_vcpu_fault_info {
......@@ -262,8 +270,6 @@ struct kvm_host_data {
struct kvm_pmu_events pmu_events;
};
typedef struct kvm_host_data kvm_host_data_t;
struct vcpu_reset_state {
unsigned long pc;
unsigned long r0;
......@@ -480,18 +486,15 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
u64 __kvm_call_hyp(void *hypfn, ...);
#define kvm_call_hyp_nvhe(f, ...) \
do { \
DECLARE_KVM_NVHE_SYM(f); \
__kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
} while(0)
#define kvm_call_hyp_nvhe_ret(f, ...) \
({ \
DECLARE_KVM_NVHE_SYM(f); \
__kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \
struct arm_smccc_res res; \
\
arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \
##__VA_ARGS__, &res); \
WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \
\
res.a1; \
})
/*
......@@ -517,7 +520,7 @@ u64 __kvm_call_hyp(void *hypfn, ...);
ret = f(__VA_ARGS__); \
isb(); \
} else { \
ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \
ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \
\
ret; \
......@@ -565,7 +568,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);
static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
......@@ -631,46 +634,6 @@ static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
static inline void kvm_clr_pmu_events(u32 clr) {}
#endif
#define KVM_BP_HARDEN_UNKNOWN -1
#define KVM_BP_HARDEN_WA_NEEDED 0
#define KVM_BP_HARDEN_NOT_REQUIRED 1
static inline int kvm_arm_harden_branch_predictor(void)
{
switch (get_spectre_v2_workaround_state()) {
case ARM64_BP_HARDEN_WA_NEEDED:
return KVM_BP_HARDEN_WA_NEEDED;
case ARM64_BP_HARDEN_NOT_REQUIRED:
return KVM_BP_HARDEN_NOT_REQUIRED;
case ARM64_BP_HARDEN_UNKNOWN:
default:
return KVM_BP_HARDEN_UNKNOWN;
}
}
#define KVM_SSBD_UNKNOWN -1
#define KVM_SSBD_FORCE_DISABLE 0
#define KVM_SSBD_KERNEL 1
#define KVM_SSBD_FORCE_ENABLE 2
#define KVM_SSBD_MITIGATED 3
static inline int kvm_arm_have_ssbd(void)
{
switch (arm64_get_ssbd_state()) {
case ARM64_SSBD_FORCE_DISABLE:
return KVM_SSBD_FORCE_DISABLE;
case ARM64_SSBD_KERNEL:
return KVM_SSBD_KERNEL;
case ARM64_SSBD_FORCE_ENABLE:
return KVM_SSBD_FORCE_ENABLE;
case ARM64_SSBD_MITIGATED:
return KVM_SSBD_MITIGATED;
case ARM64_SSBD_UNKNOWN:
default:
return KVM_SSBD_UNKNOWN;
}
}
void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu);
......
......@@ -12,6 +12,9 @@
#include <asm/alternative.h>
#include <asm/sysreg.h>
DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
......@@ -87,11 +90,11 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
void deactivate_traps_vhe_put(void);
#endif
u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
u64 __guest_enter(struct kvm_vcpu *vcpu);
void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt);
void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
void __noreturn __hyp_do_panic(unsigned long, ...);
void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
#endif
#endif /* __ARM64_KVM_HYP_H__ */
......
......@@ -9,6 +9,7 @@
#include <asm/page.h>
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/cpufeature.h>
/*
......@@ -43,16 +44,6 @@
* HYP_VA_MIN = 1 << (VA_BITS - 1)
* HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
*
* This of course assumes that the trampoline page exists within the
* VA_BITS range. If it doesn't, then it means we're in the odd case
* where the kernel idmap (as well as HYP) uses more levels than the
* kernel runtime page tables (as seen when the kernel is configured
* for 4k pages, 39bits VA, and yet memory lives just above that
* limit, forcing the idmap to use 4 levels of page tables while the
* kernel itself only uses 3). In this particular case, it doesn't
* matter which side of VA_BITS we use, as we're guaranteed not to
* conflict with anything.
*
* When using VHE, there are no separate hyp mappings and all KVM
* functionality is already mapped as part of the main kernel
* mappings, and none of this applies in that case.
......@@ -117,15 +108,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm))
#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))
static inline bool kvm_page_empty(void *ptr)
{
struct page *ptr_page = virt_to_page(ptr);
return page_count(ptr_page) == 1;
}
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
void __iomem **haddr);
......@@ -141,149 +127,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
#define kvm_mk_pmd(ptep) \
__pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
#define kvm_mk_pud(pmdp) \
__pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
#define kvm_mk_p4d(pmdp) \
__p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot)
#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot)
#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot)
#define kvm_pud_pfn(pud) pud_pfn(pud)
#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd)
#define kvm_pud_mkhuge(pud) pud_mkhuge(pud)
static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
pte_val(pte) |= PTE_S2_RDWR;
return pte;
}
static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
{
pmd_val(pmd) |= PMD_S2_RDWR;
return pmd;
}
static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
{
pud_val(pud) |= PUD_S2_RDWR;
return pud;
}
static inline pte_t kvm_s2pte_mkexec(pte_t pte)
{
pte_val(pte) &= ~PTE_S2_XN;
return pte;
}
static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
{
pmd_val(pmd) &= ~PMD_S2_XN;
return pmd;
}
static inline pud_t kvm_s2pud_mkexec(pud_t pud)
{
pud_val(pud) &= ~PUD_S2_XN;
return pud;
}
static inline void kvm_set_s2pte_readonly(pte_t *ptep)
{
pteval_t old_pteval, pteval;
pteval = READ_ONCE(pte_val(*ptep));
do {
old_pteval = pteval;
pteval &= ~PTE_S2_RDWR;
pteval |= PTE_S2_RDONLY;
pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
} while (pteval != old_pteval);
}
static inline bool kvm_s2pte_readonly(pte_t *ptep)
{
return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
}
static inline bool kvm_s2pte_exec(pte_t *ptep)
{
return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
}
static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
{
kvm_set_s2pte_readonly((pte_t *)pmdp);
}
static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
{
return kvm_s2pte_readonly((pte_t *)pmdp);
}
static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
{
return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
}
static inline void kvm_set_s2pud_readonly(pud_t *pudp)
{
kvm_set_s2pte_readonly((pte_t *)pudp);
}
static inline bool kvm_s2pud_readonly(pud_t *pudp)
{
return kvm_s2pte_readonly((pte_t *)pudp);
}
static inline bool kvm_s2pud_exec(pud_t *pudp)
{
return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
}
static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
{
return pud_mkyoung(pud);
}
static inline bool kvm_s2pud_young(pud_t pud)
{
return pud_young(pud);
}
#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
#ifdef __PAGETABLE_PMD_FOLDED
#define hyp_pmd_table_empty(pmdp) (0)
#else
#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
#endif
#ifdef __PAGETABLE_PUD_FOLDED
#define hyp_pud_table_empty(pudp) (0)
#else
#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
#endif
#ifdef __PAGETABLE_P4D_FOLDED
#define hyp_p4d_table_empty(p4dp) (0)
#else
#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
#endif
struct kvm;
......@@ -325,77 +171,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
}
}
static inline void __kvm_flush_dcache_pte(pte_t pte)
{
if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
struct page *page = pte_page(pte);
kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
}
}
static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
{
if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
struct page *page = pmd_page(pmd);
kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
}
}
static inline void __kvm_flush_dcache_pud(pud_t pud)
{
if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
struct page *page = pud_page(pud);
kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
}
}
void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
static inline bool __kvm_cpu_uses_extended_idmap(void)
{
return __cpu_uses_extended_idmap_level();
}
static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
{
return idmap_ptrs_per_pgd;
}
/*
* Can't use pgd_populate here, because the extended idmap adds an extra level
* above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
* idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
*/
static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
pgd_t *hyp_pgd,
pgd_t *merged_hyp_pgd,
unsigned long hyp_idmap_start)
{
int idmap_idx;
u64 pgd_addr;
/*
* Use the first entry to access the HYP mappings. It is
* guaranteed to be free, otherwise we wouldn't use an
* extended idmap.
*/
VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);
/*
* Create another extended level entry that points to the boot HYP map,
* which contains an ID mapping of the HYP init code. We essentially
* merge the boot and runtime HYP maps by doing so, but they don't
* overlap anyway, so this is fine.
*/
idmap_idx = hyp_idmap_start >> VA_BITS;
VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
}
static inline unsigned int kvm_get_vmid_bits(void)
{
int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
......@@ -430,19 +208,17 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
return ret;
}
#ifdef CONFIG_KVM_INDIRECT_VECTORS
/*
* EL2 vectors can be mapped and rerouted in a number of ways,
* depending on the kernel configuration and CPU present:
*
* - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the
* hardening sequence is placed in one of the vector slots, which is
* executed before jumping to the real vectors.
* - If the CPU is affected by Spectre-v2, the hardening sequence is
* placed in one of the vector slots, which is executed before jumping
* to the real vectors.
*
* - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the
* ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the
* hardening sequence is mapped next to the idmap page, and executed
* before jumping to the real vectors.
* - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot
* containing the hardening sequence is mapped next to the idmap page,
* and executed before jumping to the real vectors.
*
* - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an
* empty slot is selected, mapped next to the idmap page, and
......@@ -452,19 +228,16 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
* VHE, as we don't have hypervisor-specific mappings. If the system
* is VHE and yet selects this capability, it will be ignored.
*/
#include <asm/mmu.h>
extern void *__kvm_bp_vect_base;
extern int __kvm_harden_el2_vector_slot;
/* This is called on both VHE and !VHE systems */
static inline void *kvm_get_hyp_vector(void)
{
struct bp_hardening_data *data = arm64_get_bp_hardening_data();
void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
int slot = -1;
if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) {
if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) {
vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
slot = data->hyp_vectors_slot;
}
......@@ -481,102 +254,8 @@ static inline void *kvm_get_hyp_vector(void)
return vect;
}
/* This is only called on a !VHE system */
static inline int kvm_map_vectors(void)
{
/*
* HBP = ARM64_HARDEN_BRANCH_PREDICTOR
* HEL2 = ARM64_HARDEN_EL2_VECTORS
*
* !HBP + !HEL2 -> use direct vectors
* HBP + !HEL2 -> use hardened vectors in place
* !HBP + HEL2 -> allocate one vector slot and use exec mapping
* HBP + HEL2 -> use hardened vertors and use exec mapping
*/
if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) {
__kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
__kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
}
if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
/*
* Always allocate a spare vector slot, as we don't
* know yet which CPUs have a BP hardening slot that
* we can reuse.
*/
__kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
return create_hyp_exec_mappings(vect_pa, size,
&__kvm_bp_vect_base);
}
return 0;
}
#else
static inline void *kvm_get_hyp_vector(void)
{
return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
}
static inline int kvm_map_vectors(void)
{
return 0;
}
#endif
#ifdef CONFIG_ARM64_SSBD
DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
static inline int hyp_map_aux_data(void)
{
int cpu, err;
for_each_possible_cpu(cpu) {
u64 *ptr;
ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu);
err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP);
if (err)
return err;
}
return 0;
}
#else
static inline int hyp_map_aux_data(void)
{
return 0;
}
#endif
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
/*
* Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
* With v8.2 LVA extensions, 'x' should be a minimum of 6 with
* 52bit IPS.
*/
static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
{
int x = ARM64_VTTBR_X(ipa_shift, levels);
return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
}
static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
{
unsigned int x = arm64_vttbr_x(ipa_shift, levels);
return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
}
static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
{
return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
}
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
struct kvm_vmid *vmid = &mmu->vmid;
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2020 Google LLC
* Author: Will Deacon <will@kernel.org>
*/
#ifndef __ARM64_KVM_PGTABLE_H__
#define __ARM64_KVM_PGTABLE_H__
#include <linux/bits.h>
#include <linux/kvm_host.h>
#include <linux/types.h>
typedef u64 kvm_pte_t;
/**
* struct kvm_pgtable - KVM page-table.
* @ia_bits: Maximum input address size, in bits.
* @start_level: Level at which the page-table walk starts.
* @pgd: Pointer to the first top-level entry of the page-table.
* @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
*/
struct kvm_pgtable {
u32 ia_bits;
u32 start_level;
kvm_pte_t *pgd;
/* Stage-2 only */
struct kvm_s2_mmu *mmu;
};
/**
* enum kvm_pgtable_prot - Page-table permissions and attributes.
* @KVM_PGTABLE_PROT_X: Execute permission.
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
*/
enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_X = BIT(0),
KVM_PGTABLE_PROT_W = BIT(1),
KVM_PGTABLE_PROT_R = BIT(2),
KVM_PGTABLE_PROT_DEVICE = BIT(3),
};
#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
/**
* enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
* @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
* entries.
* @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their
* children.
* @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their
* children.
*/
enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_LEAF = BIT(0),
KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
};
typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg);
/**
* struct kvm_pgtable_walker - Hook into a page-table walk.
* @cb: Callback function to invoke during the walk.
* @arg: Argument passed to the callback function.
* @flags: Bitwise-OR of flags to identify the entry types on which to
* invoke the callback function.
*/
struct kvm_pgtable_walker {
const kvm_pgtable_visitor_fn_t cb;
void * const arg;
const enum kvm_pgtable_walk_flags flags;
};
/**
* kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
* @pgt: Uninitialised page-table structure to initialise.
* @va_bits: Maximum virtual address bits.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
/**
* kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
*
* The page-table is assumed to be unreachable by any hardware walkers prior
* to freeing and therefore no TLB invalidation is performed.
*/
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
/**
* kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
* @addr: Virtual address at which to place the mapping.
* @size: Size of the mapping.
* @phys: Physical address of the memory to map.
* @prot: Permissions and attributes for the mapping.
*
* The offset of @addr within a page is ignored, @size is rounded-up to
* the next page boundary and @phys is rounded-down to the previous page
* boundary.
*
* If device attributes are not explicitly requested in @prot, then the
* mapping will be normal, cacheable. Attempts to install a new mapping
* for a virtual address that is already mapped will be rejected with an
* error and a WARN().
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
enum kvm_pgtable_prot prot);
/**
* kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
* @pgt: Uninitialised page-table structure to initialise.
* @kvm: KVM structure representing the guest virtual machine.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
/**
* kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
*
* The page-table is assumed to be unreachable by any hardware walkers prior
* to freeing and therefore no TLB invalidation is performed.
*/
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
/**
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address at which to place the mapping.
* @size: Size of the mapping.
* @phys: Physical address of the memory to map.
* @prot: Permissions and attributes for the mapping.
* @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to
* allocate page-table pages.
*
* The offset of @addr within a page is ignored, @size is rounded-up to
* the next page boundary and @phys is rounded-down to the previous page
* boundary.
*
* If device attributes are not explicitly requested in @prot, then the
* mapping will be normal, cacheable.
*
* Note that this function will both coalesce existing table entries and split
* existing block mappings, relying on page-faults to fault back areas outside
* of the new mapping lazily.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
u64 phys, enum kvm_pgtable_prot prot,
struct kvm_mmu_memory_cache *mc);
/**
* kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address from which to remove the mapping.
* @size: Size of the mapping.
*
* The offset of @addr within a page is ignored and @size is rounded-up to
* the next page boundary.
*
* TLB invalidation is performed for each page-table entry cleared during the
* unmapping operation and the reference count for the page-table page
* containing the cleared entry is decremented, with unreferenced pages being
* freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
* FWB is not supported by the CPU.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
/**
* kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
* without TLB invalidation.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address from which to write-protect,
* @size: Size of the range.
*
* The offset of @addr within a page is ignored and @size is rounded-up to
* the next page boundary.
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
/**
* kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address to identify the page-table entry.
*
* The offset of @addr within a page is ignored.
*
* If there is a valid, leaf page-table entry used to translate @addr, then
* set the access flag in that entry.
*
* Return: The old page-table entry prior to setting the flag, 0 on failure.
*/
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
* kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address to identify the page-table entry.
*
* The offset of @addr within a page is ignored.
*
* If there is a valid, leaf page-table entry used to translate @addr, then
* clear the access flag in that entry.
*
* Note that it is the caller's responsibility to invalidate the TLB after
* calling this function to ensure that the updated permissions are visible
* to the CPUs.
*
* Return: The old page-table entry prior to clearing the flag, 0 on failure.
*/
kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
* page-table entry.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address to identify the page-table entry.
* @prot: Additional permissions to grant for the mapping.
*
* The offset of @addr within a page is ignored.
*
* If there is a valid, leaf page-table entry used to translate @addr, then
* relax the permissions in that entry according to the read, write and
* execute permissions specified by @prot. No permissions are removed, and
* TLB invalidation is performed after updating the entry.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot);
/**
* kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
* access flag set.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address to identify the page-table entry.
*
* The offset of @addr within a page is ignored.
*
* Return: True if the page-table entry has the access flag set, false otherwise.
*/
bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
/**
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
* range.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address from which to flush.
* @size: Size of the range.
*
* The offset of @addr within a page is ignored and @size is rounded-up to
* the next page boundary.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
/**
* kvm_pgtable_walk() - Walk a page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_*_init().
* @addr: Input address for the start of the walk.
* @size: Size of the range to walk.
* @walker: Walker callback description.
*
* The offset of @addr within a page is ignored and @size is rounded-up to
* the next page boundary.
*
* The walker will walk the page-table entries corresponding to the input
* address range specified, visiting entries according to the walker flags.
* Invalid entries are treated as leaf entries. Leaf entries are reloaded
* after invoking the walker callback, allowing the walker to descend into
* a newly installed table.
*
* Returning a negative error code from the walker callback function will
* terminate the walk immediately with the same error code.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker);
#endif /* __ARM64_KVM_PGTABLE_H__ */
......@@ -60,7 +60,7 @@
.endm
/*
* Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will
* Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will
* check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as
* (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and
* then proceed ahead with the save/restore of Pointer Authentication
......@@ -78,7 +78,7 @@ alternative_else_nop_endif
.L__skip_switch\@:
.endm
.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
alternative_if_not ARM64_HAS_ADDRESS_AUTH
b .L__skip_switch\@
alternative_else_nop_endif
......@@ -96,7 +96,7 @@ alternative_else_nop_endif
#else /* !CONFIG_ARM64_PTR_AUTH */
.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
.endm
.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
.endm
#endif /* CONFIG_ARM64_PTR_AUTH */
#endif /* __ASSEMBLY__ */
......
......@@ -45,7 +45,6 @@ struct bp_hardening_data {
bp_hardening_cb_t fn;
};
#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
......@@ -57,21 +56,13 @@ static inline void arm64_apply_bp_hardening(void)
{
struct bp_hardening_data *d;
if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR))
if (!cpus_have_const_cap(ARM64_SPECTRE_V2))
return;
d = arm64_get_bp_hardening_data();
if (d->fn)
d->fn();
}
#else
static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
{
return NULL;
}
static inline void arm64_apply_bp_hardening(void) { }
#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */
extern void arm64_memblock_init(void);
extern void paging_init(void);
......
......@@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off)
:: "r" (off) : "memory");
}
static inline unsigned long __my_cpu_offset(void)
static inline unsigned long __hyp_my_cpu_offset(void)
{
/*
* Non-VHE hyp code runs with preemption disabled. No need to hazard
* the register access against barrier() as in __kern_my_cpu_offset.
*/
return read_sysreg(tpidr_el2);
}
static inline unsigned long __kern_my_cpu_offset(void)
{
unsigned long off;
......@@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void)
return off;
}
#define __my_cpu_offset __my_cpu_offset()
#ifdef __KVM_NVHE_HYPERVISOR__
#define __my_cpu_offset __hyp_my_cpu_offset()
#else
#define __my_cpu_offset __kern_my_cpu_offset()
#endif
#define PERCPU_RW_OPS(sz) \
static inline unsigned long __percpu_read_##sz(void *ptr) \
......@@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd)
#include <asm-generic/percpu.h>
/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */
#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT)
#undef this_cpu_ptr
#define this_cpu_ptr raw_cpu_ptr
#undef __this_cpu_read
#define __this_cpu_read raw_cpu_read
#undef __this_cpu_write
#define __this_cpu_write raw_cpu_write
#endif
#endif /* __ASM_PERCPU_H */
......@@ -156,7 +156,6 @@
#define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */
#define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */
#define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */
#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */
#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
#ifdef CONFIG_ARM64_PA_BITS_52
......@@ -172,34 +171,11 @@
#define PTE_ATTRINDX(t) (_AT(pteval_t, (t)) << 2)
#define PTE_ATTRINDX_MASK (_AT(pteval_t, 7) << 2)
/*
* 2nd stage PTE definitions
*/
#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */
#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */
#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */
#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */
#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */
#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */
#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */
#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */
#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */
/*
* Memory Attribute override for Stage-2 (MemAttr[3:0])
*/
#define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2)
/*
* EL2/HYP PTE/PMD definitions
*/
#define PMD_HYP PMD_SECT_USER
#define PTE_HYP PTE_USER
/*
* Highest possible physical address supported.
*/
......
......@@ -56,7 +56,6 @@ extern bool arm64_use_ng_mappings;
#define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT
#define PAGE_KERNEL __pgprot(PROT_NORMAL)
#define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
......@@ -64,11 +63,6 @@ extern bool arm64_use_ng_mappings;
#define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN)
#define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
#define PAGE_HYP_DEVICE __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN)
#define PAGE_S2_MEMATTR(attr) \
({ \
u64 __val; \
......@@ -79,19 +73,6 @@ extern bool arm64_use_ng_mappings;
__val; \
})
#define PAGE_S2_XN \
({ \
u64 __val; \
if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \
__val = 0; \
else \
__val = PTE_S2_XN; \
__val; \
})
#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
/* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
#define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
......
......@@ -38,6 +38,7 @@
#include <asm/pgtable-hwdef.h>
#include <asm/pointer_auth.h>
#include <asm/ptrace.h>
#include <asm/spectre.h>
#include <asm/types.h>
/*
......@@ -197,40 +198,15 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
regs->pmr_save = GIC_PRIO_IRQON;
}
static inline void set_ssbs_bit(struct pt_regs *regs)
{
regs->pstate |= PSR_SSBS_BIT;
}
static inline void set_compat_ssbs_bit(struct pt_regs *regs)
{
regs->pstate |= PSR_AA32_SSBS_BIT;
}
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
start_thread_common(regs, pc);
regs->pstate = PSR_MODE_EL0t;
if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
set_ssbs_bit(regs);
spectre_v4_enable_task_mitigation(current);
regs->sp = sp;
}
static inline bool is_ttbr0_addr(unsigned long addr)
{
/* entry assembly clears tags for TTBR0 addrs */
return addr < TASK_SIZE;
}
static inline bool is_ttbr1_addr(unsigned long addr)
{
/* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
}
#ifdef CONFIG_COMPAT
static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
......@@ -244,13 +220,23 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
regs->pstate |= PSR_AA32_E_BIT;
#endif
if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
set_compat_ssbs_bit(regs);
spectre_v4_enable_task_mitigation(current);
regs->compat_sp = sp;
}
#endif
static inline bool is_ttbr0_addr(unsigned long addr)
{
/* entry assembly clears tags for TTBR0 addrs */
return addr < TASK_SIZE;
}
static inline bool is_ttbr1_addr(unsigned long addr)
{
/* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
}
/* Forward declaration, a strange C thing */
struct task_struct;
......
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Interface for managing mitigations for Spectre vulnerabilities.
*
* Copyright (C) 2020 Google LLC
* Author: Will Deacon <will@kernel.org>
*/
#ifndef __ASM_SPECTRE_H
#define __ASM_SPECTRE_H
#include <asm/cpufeature.h>
/* Watch out, ordering is important here. */
enum mitigation_state {
SPECTRE_UNAFFECTED,
SPECTRE_MITIGATED,
SPECTRE_VULNERABLE,
};
struct task_struct;
enum mitigation_state arm64_get_spectre_v2_state(void);
bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope);
void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
enum mitigation_state arm64_get_spectre_v4_state(void);
bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope);
void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
void spectre_v4_enable_task_mitigation(struct task_struct *tsk);
#endif /* __ASM_SPECTRE_H */
......@@ -8,7 +8,6 @@
#ifndef __ARM64_S2_PGTABLE_H_
#define __ARM64_S2_PGTABLE_H_
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
/*
......@@ -36,21 +35,6 @@
#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm))
#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1)
/*
* The number of PTRS across all concatenated stage2 tables given by the
* number of bits resolved at the initial level.
* If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
* in which case, stage2_pgd_ptrs will have one entry.
*/
#define pgd_ptrs_shift(ipa, pgdir_shift) \
((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
#define __s2_pgd_ptrs(ipa, lvls) \
(1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
/*
* kvm_mmmu_cache_min_pages() is the number of pages required to install
* a stage-2 translation. We pre-allocate the entry level page table at
......@@ -58,196 +42,6 @@
*/
#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
/* Stage2 PUD definitions when the level is present */
static inline bool kvm_stage2_has_pud(struct kvm *kvm)
{
return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
}
#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d)
static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
pgd_t *pgd, unsigned long address)
{
return p4d_offset(pgd, address);
}
static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
{
}
static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
{
return false;
}
static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
phys_addr_t addr, phys_addr_t end)
{
return end;
}
static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
{
if (kvm_stage2_has_pud(kvm))
return p4d_none(p4d);
else
return 0;
}
static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
{
if (kvm_stage2_has_pud(kvm))
p4d_clear(p4dp);
}
static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
{
if (kvm_stage2_has_pud(kvm))
return p4d_present(p4d);
else
return 1;
}
static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
{
if (kvm_stage2_has_pud(kvm))
p4d_populate(NULL, p4d, pud);
}
static inline pud_t *stage2_pud_offset(struct kvm *kvm,
p4d_t *p4d, unsigned long address)
{
if (kvm_stage2_has_pud(kvm))
return pud_offset(p4d, address);
else
return (pud_t *)p4d;
}
static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
{
if (kvm_stage2_has_pud(kvm))
free_page((unsigned long)pud);
}
static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
{
if (kvm_stage2_has_pud(kvm))
return kvm_page_empty(pudp);
else
return false;
}
static inline phys_addr_t
stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
if (kvm_stage2_has_pud(kvm)) {
phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
return (boundary - 1 < end - 1) ? boundary : end;
} else {
return end;
}
}
/* Stage2 PMD definitions when the level is present */
static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
{
return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
}
#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
{
if (kvm_stage2_has_pmd(kvm))
return pud_none(pud);
else
return 0;
}
static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
{
if (kvm_stage2_has_pmd(kvm))
pud_clear(pud);
}
static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
{
if (kvm_stage2_has_pmd(kvm))
return pud_present(pud);
else
return 1;
}
static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
{
if (kvm_stage2_has_pmd(kvm))
pud_populate(NULL, pud, pmd);
}
static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
pud_t *pud, unsigned long address)
{
if (kvm_stage2_has_pmd(kvm))
return pmd_offset(pud, address);
else
return (pmd_t *)pud;
}
static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
{
if (kvm_stage2_has_pmd(kvm))
free_page((unsigned long)pmd);
}
static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
{
if (kvm_stage2_has_pmd(kvm))
return pud_huge(pud);
else
return 0;
}
static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
{
if (kvm_stage2_has_pmd(kvm))
return kvm_page_empty(pmdp);
else
return 0;
}
static inline phys_addr_t
stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
if (kvm_stage2_has_pmd(kvm)) {
phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
return (boundary - 1 < end - 1) ? boundary : end;
} else {
return end;
}
}
static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
{
return kvm_page_empty(ptep);
}
static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
{
return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
}
static inline phys_addr_t
stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
......@@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
return (boundary - 1 < end - 1) ? boundary : end;
}
/*
* Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
* the architectural page-table level.
*/
#define S2_NO_LEVEL_HINT 0
#define S2_PUD_LEVEL 1
#define S2_PMD_LEVEL 2
#define S2_PTE_LEVEL 3
#endif /* __ARM64_S2_PGTABLE_H_ */
......@@ -159,6 +159,21 @@ struct kvm_sync_regs {
struct kvm_arch_memory_slot {
};
/*
* PMU filter structure. Describe a range of events with a particular
* action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
*/
struct kvm_pmu_event_filter {
__u16 base_event;
__u16 nevents;
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
__u8 action;
__u8 pad[3];
};
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
struct {
......@@ -242,6 +257,15 @@ struct kvm_vcpu_events {
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2
/*
* Only two states can be presented by the host kernel:
* - NOT_REQUIRED: the guest doesn't need to do anything
* - NOT_AVAIL: the guest isn't mitigated (it can still use SSBS if available)
*
* All the other values are deprecated. The host still accepts all
* values (they are ABI), but will narrow them to the above two.
*/
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2)
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0
#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1
......@@ -329,6 +353,7 @@ struct kvm_vcpu_events {
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
#define KVM_ARM_VCPU_PMU_V3_IRQ 0
#define KVM_ARM_VCPU_PMU_V3_INIT 1
#define KVM_ARM_VCPU_PMU_V3_FILTER 2
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
......
......@@ -19,7 +19,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
return_address.o cpuinfo.o cpu_errata.o \
cpufeature.o alternative.o cacheinfo.o \
smp.o smp_spin_table.o topology.o smccc-call.o \
syscall.o
syscall.o proton-pack.o
targets += efi-entry.o
......@@ -59,7 +59,6 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
obj-$(CONFIG_ARM64_SSBD) += ssbd.o
obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
......
......@@ -106,365 +106,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
}
atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
#ifdef CONFIG_KVM_INDIRECT_VECTORS
static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
const char *hyp_vecs_end)
{
void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
int i;
for (i = 0; i < SZ_2K; i += 0x80)
memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
}
static void install_bp_hardening_cb(bp_hardening_cb_t fn,
const char *hyp_vecs_start,
const char *hyp_vecs_end)
{
static DEFINE_RAW_SPINLOCK(bp_lock);
int cpu, slot = -1;
/*
* detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
* we're a guest. Skip the hyp-vectors work.
*/
if (!hyp_vecs_start) {
__this_cpu_write(bp_hardening_data.fn, fn);
return;
}
raw_spin_lock(&bp_lock);
for_each_possible_cpu(cpu) {
if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
break;
}
}
if (slot == -1) {
slot = atomic_inc_return(&arm64_el2_vector_last_slot);
BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
__copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
}
__this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
__this_cpu_write(bp_hardening_data.fn, fn);
raw_spin_unlock(&bp_lock);
}
#else
static void install_bp_hardening_cb(bp_hardening_cb_t fn,
const char *hyp_vecs_start,
const char *hyp_vecs_end)
{
__this_cpu_write(bp_hardening_data.fn, fn);
}
#endif /* CONFIG_KVM_INDIRECT_VECTORS */
#include <linux/arm-smccc.h>
static void __maybe_unused call_smc_arch_workaround_1(void)
{
arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
static void call_hvc_arch_workaround_1(void)
{
arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
static void qcom_link_stack_sanitization(void)
{
u64 tmp;
asm volatile("mov %0, x30 \n"
".rept 16 \n"
"bl . + 4 \n"
".endr \n"
"mov x30, %0 \n"
: "=&r" (tmp));
}
static bool __nospectre_v2;
static int __init parse_nospectre_v2(char *str)
{
__nospectre_v2 = true;
return 0;
}
early_param("nospectre_v2", parse_nospectre_v2);
/*
* -1: No workaround
* 0: No workaround required
* 1: Workaround installed
*/
static int detect_harden_bp_fw(void)
{
bp_hardening_cb_t cb;
void *smccc_start, *smccc_end;
struct arm_smccc_res res;
u32 midr = read_cpuid_id();
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_WORKAROUND_1, &res);
switch ((int)res.a0) {
case 1:
/* Firmware says we're just fine */
return 0;
case 0:
break;
default:
return -1;
}
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
cb = call_hvc_arch_workaround_1;
/* This is a guest, no need to patch KVM vectors */
smccc_start = NULL;
smccc_end = NULL;
break;
#if IS_ENABLED(CONFIG_KVM)
case SMCCC_CONDUIT_SMC:
cb = call_smc_arch_workaround_1;
smccc_start = __smccc_workaround_1_smc;
smccc_end = __smccc_workaround_1_smc +
__SMCCC_WORKAROUND_1_SMC_SZ;
break;
#endif
default:
return -1;
}
if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) ||
((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1))
cb = qcom_link_stack_sanitization;
if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR))
install_bp_hardening_cb(cb, smccc_start, smccc_end);
return 1;
}
DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
int ssbd_state __read_mostly = ARM64_SSBD_KERNEL;
static bool __ssb_safe = true;
static const struct ssbd_options {
const char *str;
int state;
} ssbd_options[] = {
{ "force-on", ARM64_SSBD_FORCE_ENABLE, },
{ "force-off", ARM64_SSBD_FORCE_DISABLE, },
{ "kernel", ARM64_SSBD_KERNEL, },
};
static int __init ssbd_cfg(char *buf)
{
int i;
if (!buf || !buf[0])
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) {
int len = strlen(ssbd_options[i].str);
if (strncmp(buf, ssbd_options[i].str, len))
continue;
ssbd_state = ssbd_options[i].state;
return 0;
}
return -EINVAL;
}
early_param("ssbd", ssbd_cfg);
void __init arm64_update_smccc_conduit(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr,
int nr_inst)
{
u32 insn;
BUG_ON(nr_inst != 1);
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
insn = aarch64_insn_get_hvc_value();
break;
case SMCCC_CONDUIT_SMC:
insn = aarch64_insn_get_smc_value();
break;
default:
return;
}
*updptr = cpu_to_le32(insn);
}
void __init arm64_enable_wa2_handling(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr,
int nr_inst)
{
BUG_ON(nr_inst != 1);
/*
* Only allow mitigation on EL1 entry/exit and guest
* ARCH_WORKAROUND_2 handling if the SSBD state allows it to
* be flipped.
*/
if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL)
*updptr = cpu_to_le32(aarch64_insn_gen_nop());
}
void arm64_set_ssbd_mitigation(bool state)
{
int conduit;
if (!IS_ENABLED(CONFIG_ARM64_SSBD)) {
pr_info_once("SSBD disabled by kernel configuration\n");
return;
}
if (this_cpu_has_cap(ARM64_SSBS)) {
if (state)
asm volatile(SET_PSTATE_SSBS(0));
else
asm volatile(SET_PSTATE_SSBS(1));
return;
}
conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state,
NULL);
WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE);
}
static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
int scope)
{
struct arm_smccc_res res;
bool required = true;
s32 val;
bool this_cpu_safe = false;
int conduit;
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
if (cpu_mitigations_off())
ssbd_state = ARM64_SSBD_FORCE_DISABLE;
/* delay setting __ssb_safe until we get a firmware response */
if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list))
this_cpu_safe = true;
if (this_cpu_has_cap(ARM64_SSBS)) {
if (!this_cpu_safe)
__ssb_safe = false;
required = false;
goto out_printmsg;
}
conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_WORKAROUND_2, &res);
if (conduit == SMCCC_CONDUIT_NONE) {
ssbd_state = ARM64_SSBD_UNKNOWN;
if (!this_cpu_safe)
__ssb_safe = false;
return false;
}
val = (s32)res.a0;
switch (val) {
case SMCCC_RET_NOT_SUPPORTED:
ssbd_state = ARM64_SSBD_UNKNOWN;
if (!this_cpu_safe)
__ssb_safe = false;
return false;
/* machines with mixed mitigation requirements must not return this */
case SMCCC_RET_NOT_REQUIRED:
pr_info_once("%s mitigation not required\n", entry->desc);
ssbd_state = ARM64_SSBD_MITIGATED;
return false;
case SMCCC_RET_SUCCESS:
__ssb_safe = false;
required = true;
break;
case 1: /* Mitigation not required on this CPU */
required = false;
break;
default:
WARN_ON(1);
if (!this_cpu_safe)
__ssb_safe = false;
return false;
}
switch (ssbd_state) {
case ARM64_SSBD_FORCE_DISABLE:
arm64_set_ssbd_mitigation(false);
required = false;
break;
case ARM64_SSBD_KERNEL:
if (required) {
__this_cpu_write(arm64_ssbd_callback_required, 1);
arm64_set_ssbd_mitigation(true);
}
break;
case ARM64_SSBD_FORCE_ENABLE:
arm64_set_ssbd_mitigation(true);
required = true;
break;
default:
WARN_ON(1);
break;
}
out_printmsg:
switch (ssbd_state) {
case ARM64_SSBD_FORCE_DISABLE:
pr_info_once("%s disabled from command-line\n", entry->desc);
break;
case ARM64_SSBD_FORCE_ENABLE:
pr_info_once("%s forced from command-line\n", entry->desc);
break;
}
return required;
}
/* known invulnerable cores */
static const struct midr_range arm64_ssb_cpus[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
{},
};
#ifdef CONFIG_ARM64_ERRATUM_1463225
DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
......@@ -519,83 +160,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \
CAP_MIDR_RANGE_LIST(midr_list)
/* Track overall mitigation state. We are only mitigated if all cores are ok */
static bool __hardenbp_enab = true;
static bool __spectrev2_safe = true;
int get_spectre_v2_workaround_state(void)
{
if (__spectrev2_safe)
return ARM64_BP_HARDEN_NOT_REQUIRED;
if (!__hardenbp_enab)
return ARM64_BP_HARDEN_UNKNOWN;
return ARM64_BP_HARDEN_WA_NEEDED;
}
/*
* List of CPUs that do not need any Spectre-v2 mitigation at all.
*/
static const struct midr_range spectre_v2_safe_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
{ /* sentinel */ }
};
/*
* Track overall bp hardening for all heterogeneous cores in the machine.
* We are only considered "safe" if all booted cores are known safe.
*/
static bool __maybe_unused
check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
{
int need_wa;
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
/* If the CPU has CSV2 set, we're safe */
if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1),
ID_AA64PFR0_CSV2_SHIFT))
return false;
/* Alternatively, we have a list of unaffected CPUs */
if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
return false;
/* Fallback to firmware detection */
need_wa = detect_harden_bp_fw();
if (!need_wa)
return false;
__spectrev2_safe = false;
if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) {
pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n");
__hardenbp_enab = false;
return false;
}
/* forced off */
if (__nospectre_v2 || cpu_mitigations_off()) {
pr_info_once("spectrev2 mitigation disabled by command line option\n");
__hardenbp_enab = false;
return false;
}
if (need_wa < 0) {
pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n");
__hardenbp_enab = false;
}
return (need_wa > 0);
}
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
......@@ -887,9 +451,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
.desc = "Spectre-v2",
.capability = ARM64_SPECTRE_V2,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = check_branch_predictor,
.matches = has_spectre_v2,
.cpu_enable = spectre_v2_enable_mitigation,
},
#ifdef CONFIG_RANDOMIZE_BASE
{
......@@ -899,11 +465,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
.desc = "Speculative Store Bypass Disable",
.capability = ARM64_SSBD,
.desc = "Spectre-v4",
.capability = ARM64_SPECTRE_V4,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = has_ssbd_mitigation,
.midr_range_list = arm64_ssb_cpus,
.matches = has_spectre_v4,
.cpu_enable = spectre_v4_enable_mitigation,
},
#ifdef CONFIG_ARM64_ERRATUM_1418040
{
......@@ -956,40 +522,3 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
{
}
};
ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "Mitigation: __user pointer sanitization\n");
}
ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
char *buf)
{
switch (get_spectre_v2_workaround_state()) {
case ARM64_BP_HARDEN_NOT_REQUIRED:
return sprintf(buf, "Not affected\n");
case ARM64_BP_HARDEN_WA_NEEDED:
return sprintf(buf, "Mitigation: Branch predictor hardening\n");
case ARM64_BP_HARDEN_UNKNOWN:
default:
return sprintf(buf, "Vulnerable\n");
}
}
ssize_t cpu_show_spec_store_bypass(struct device *dev,
struct device_attribute *attr, char *buf)
{
if (__ssb_safe)
return sprintf(buf, "Not affected\n");
switch (ssbd_state) {
case ARM64_SSBD_KERNEL:
case ARM64_SSBD_FORCE_ENABLE:
if (IS_ENABLED(CONFIG_ARM64_SSBD))
return sprintf(buf,
"Mitigation: Speculative Store Bypass disabled via prctl\n");
}
return sprintf(buf, "Vulnerable\n");
}
......@@ -227,7 +227,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0),
ARM64_FTR_END,
......@@ -487,7 +487,7 @@ static const struct arm64_ftr_bits ftr_id_pfr1[] = {
};
static const struct arm64_ftr_bits ftr_id_pfr2[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0),
ARM64_FTR_END,
};
......@@ -1583,48 +1583,6 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
WARN_ON(val & (7 << 27 | 7 << 21));
}
#ifdef CONFIG_ARM64_SSBD
static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
{
if (user_mode(regs))
return 1;
if (instr & BIT(PSTATE_Imm_shift))
regs->pstate |= PSR_SSBS_BIT;
else
regs->pstate &= ~PSR_SSBS_BIT;
arm64_skip_faulting_instruction(regs, 4);
return 0;
}
static struct undef_hook ssbs_emulation_hook = {
.instr_mask = ~(1U << PSTATE_Imm_shift),
.instr_val = 0xd500401f | PSTATE_SSBS,
.fn = ssbs_emulation_handler,
};
static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused)
{
static bool undef_hook_registered = false;
static DEFINE_RAW_SPINLOCK(hook_lock);
raw_spin_lock(&hook_lock);
if (!undef_hook_registered) {
register_undef_hook(&ssbs_emulation_hook);
undef_hook_registered = true;
}
raw_spin_unlock(&hook_lock);
if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
arm64_set_ssbd_mitigation(false);
} else {
arm64_set_ssbd_mitigation(true);
}
}
#endif /* CONFIG_ARM64_SSBD */
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
......@@ -1976,19 +1934,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.field_pos = ID_AA64ISAR0_CRC32_SHIFT,
.min_field_value = 1,
},
#ifdef CONFIG_ARM64_SSBD
{
.desc = "Speculative Store Bypassing Safe (SSBS)",
.capability = ARM64_SSBS,
.type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_cpuid_feature,
.sys_reg = SYS_ID_AA64PFR1_EL1,
.field_pos = ID_AA64PFR1_SSBS_SHIFT,
.sign = FTR_UNSIGNED,
.min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY,
.cpu_enable = cpu_enable_ssbs,
},
#endif
#ifdef CONFIG_ARM64_CNP
{
.desc = "Common not Private translations",
......
......@@ -132,9 +132,8 @@ alternative_else_nop_endif
* them if required.
*/
.macro apply_ssbd, state, tmp1, tmp2
#ifdef CONFIG_ARM64_SSBD
alternative_cb arm64_enable_wa2_handling
b .L__asm_ssbd_skip\@
alternative_cb spectre_v4_patch_fw_mitigation_enable
b .L__asm_ssbd_skip\@ // Patched to NOP
alternative_cb_end
ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1
cbz \tmp2, .L__asm_ssbd_skip\@
......@@ -142,11 +141,10 @@ alternative_cb_end
tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@
mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
mov w1, #\state
alternative_cb arm64_update_smccc_conduit
alternative_cb spectre_v4_patch_fw_mitigation_conduit
nop // Patched to SMC/HVC #0
alternative_cb_end
.L__asm_ssbd_skip\@:
#endif
.endm
.macro kernel_entry, el, regsize = 64
......@@ -697,11 +695,9 @@ el0_irq_naked:
bl trace_hardirqs_off
#endif
#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
tbz x22, #55, 1f
bl do_el0_irq_bp_hardening
1:
#endif
irq_handler
#ifdef CONFIG_TRACE_IRQFLAGS
......
......@@ -332,11 +332,7 @@ int swsusp_arch_suspend(void)
* mitigation off behind our back, let's set the state
* to what we expect it to be.
*/
switch (arm64_get_ssbd_state()) {
case ARM64_SSBD_FORCE_ENABLE:
case ARM64_SSBD_KERNEL:
arm64_set_ssbd_mitigation(true);
}
spectre_v4_enable_mitigation(NULL);
}
local_daif_restore(flags);
......
......@@ -61,16 +61,11 @@ __efistub__ctype = _ctype;
* memory mappings.
*/
#define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym;
/* Alternative callbacks for init-time patching of nVHE hyp code. */
KVM_NVHE_ALIAS(arm64_enable_wa2_handling);
KVM_NVHE_ALIAS(kvm_patch_vector_branch);
KVM_NVHE_ALIAS(kvm_update_va_mask);
/* Global kernel state accessed by nVHE hyp code. */
KVM_NVHE_ALIAS(arm64_ssbd_callback_required);
KVM_NVHE_ALIAS(kvm_host_data);
KVM_NVHE_ALIAS(kvm_vgic_global_state);
/* Kernel constant needed to compute idmap addresses. */
......
......@@ -21,6 +21,7 @@
#include <linux/lockdep.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/unistd.h>
......@@ -421,8 +422,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
cpus_have_const_cap(ARM64_HAS_UAO))
childregs->pstate |= PSR_UAO_BIT;
if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
set_ssbs_bit(childregs);
spectre_v4_enable_task_mitigation(p);
if (system_uses_irq_prio_masking())
childregs->pmr_save = GIC_PRIO_IRQON;
......@@ -472,8 +472,6 @@ void uao_thread_switch(struct task_struct *next)
*/
static void ssbs_thread_switch(struct task_struct *next)
{
struct pt_regs *regs = task_pt_regs(next);
/*
* Nothing to do for kernel threads, but 'regs' may be junk
* (e.g. idle task) so check the flags and bail early.
......@@ -485,18 +483,10 @@ static void ssbs_thread_switch(struct task_struct *next)
* If all CPUs implement the SSBS extension, then we just need to
* context-switch the PSTATE field.
*/
if (cpu_have_feature(cpu_feature(SSBS)))
return;
/* If the mitigation is enabled, then we leave SSBS clear. */
if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) ||
test_tsk_thread_flag(next, TIF_SSBD))
if (cpus_have_const_cap(ARM64_SSBS))
return;
if (compat_user_mode(regs))
set_compat_ssbs_bit(regs);
else if (user_mode(regs))
set_ssbs_bit(regs);
spectre_v4_enable_task_mitigation(next);
}
/*
......@@ -620,6 +610,11 @@ void arch_setup_new_exec(void)
current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
ptrauth_thread_init_user(current);
if (task_spec_ssb_noexec(current)) {
arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS,
PR_SPEC_ENABLE);
}
}
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Handle detection, reporting and mitigation of Spectre v1, v2 and v4, as
* detailed at:
*
* https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
*
* This code was originally written hastily under an awful lot of stress and so
* aspects of it are somewhat hacky. Unfortunately, changing anything in here
* instantly makes me feel ill. Thanks, Jann. Thann.
*
* Copyright (C) 2018 ARM Ltd, All Rights Reserved.
* Copyright (C) 2020 Google LLC
*
* "If there's something strange in your neighbourhood, who you gonna call?"
*
* Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
*/
#include <linux/arm-smccc.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/task_stack.h>
#include <asm/spectre.h>
#include <asm/traps.h>
/*
* We try to ensure that the mitigation state can never change as the result of
* onlining a late CPU.
*/
static void update_mitigation_state(enum mitigation_state *oldp,
enum mitigation_state new)
{
enum mitigation_state state;
do {
state = READ_ONCE(*oldp);
if (new <= state)
break;
/* Userspace almost certainly can't deal with this. */
if (WARN_ON(system_capabilities_finalized()))
break;
} while (cmpxchg_relaxed(oldp, state, new) != state);
}
/*
* Spectre v1.
*
* The kernel can't protect userspace for this one: it's each person for
* themselves. Advertise what we're doing and be done with it.
*/
ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "Mitigation: __user pointer sanitization\n");
}
/*
* Spectre v2.
*
* This one sucks. A CPU is either:
*
* - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2.
* - Mitigated in hardware and listed in our "safe list".
* - Mitigated in software by firmware.
* - Mitigated in software by a CPU-specific dance in the kernel.
* - Vulnerable.
*
* It's not unlikely for different CPUs in a big.LITTLE system to fall into
* different camps.
*/
static enum mitigation_state spectre_v2_state;
static bool __read_mostly __nospectre_v2;
static int __init parse_spectre_v2_param(char *str)
{
__nospectre_v2 = true;
return 0;
}
early_param("nospectre_v2", parse_spectre_v2_param);
static bool spectre_v2_mitigations_off(void)
{
bool ret = __nospectre_v2 || cpu_mitigations_off();
if (ret)
pr_info_once("spectre-v2 mitigation disabled by command line option\n");
return ret;
}
ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
char *buf)
{
switch (spectre_v2_state) {
case SPECTRE_UNAFFECTED:
return sprintf(buf, "Not affected\n");
case SPECTRE_MITIGATED:
return sprintf(buf, "Mitigation: Branch predictor hardening\n");
case SPECTRE_VULNERABLE:
fallthrough;
default:
return sprintf(buf, "Vulnerable\n");
}
}
static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
{
u64 pfr0;
static const struct midr_range spectre_v2_safe_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
{ /* sentinel */ }
};
/* If the CPU has CSV2 set, we're safe */
pfr0 = read_cpuid(ID_AA64PFR0_EL1);
if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT))
return SPECTRE_UNAFFECTED;
/* Alternatively, we have a list of unaffected CPUs */
if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
return SPECTRE_UNAFFECTED;
return SPECTRE_VULNERABLE;
}
#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED (1)
static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void)
{
int ret;
struct arm_smccc_res res;
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_WORKAROUND_1, &res);
ret = res.a0;
switch (ret) {
case SMCCC_RET_SUCCESS:
return SPECTRE_MITIGATED;
case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
return SPECTRE_UNAFFECTED;
default:
fallthrough;
case SMCCC_RET_NOT_SUPPORTED:
return SPECTRE_VULNERABLE;
}
}
bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
{
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED)
return false;
if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED)
return false;
return true;
}
DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
enum mitigation_state arm64_get_spectre_v2_state(void)
{
return spectre_v2_state;
}
#ifdef CONFIG_KVM
#include <asm/cacheflush.h>
#include <asm/kvm_asm.h>
atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
const char *hyp_vecs_end)
{
void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
int i;
for (i = 0; i < SZ_2K; i += 0x80)
memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
}
static void install_bp_hardening_cb(bp_hardening_cb_t fn)
{
static DEFINE_RAW_SPINLOCK(bp_lock);
int cpu, slot = -1;
const char *hyp_vecs_start = __smccc_workaround_1_smc;
const char *hyp_vecs_end = __smccc_workaround_1_smc +
__SMCCC_WORKAROUND_1_SMC_SZ;
/*
* detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
* we're a guest. Skip the hyp-vectors work.
*/
if (!is_hyp_mode_available()) {
__this_cpu_write(bp_hardening_data.fn, fn);
return;
}
raw_spin_lock(&bp_lock);
for_each_possible_cpu(cpu) {
if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
break;
}
}
if (slot == -1) {
slot = atomic_inc_return(&arm64_el2_vector_last_slot);
BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
__copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
}
__this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
__this_cpu_write(bp_hardening_data.fn, fn);
raw_spin_unlock(&bp_lock);
}
#else
static void install_bp_hardening_cb(bp_hardening_cb_t fn)
{
__this_cpu_write(bp_hardening_data.fn, fn);
}
#endif /* CONFIG_KVM */
static void call_smc_arch_workaround_1(void)
{
arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
static void call_hvc_arch_workaround_1(void)
{
arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
static void qcom_link_stack_sanitisation(void)
{
u64 tmp;
asm volatile("mov %0, x30 \n"
".rept 16 \n"
"bl . + 4 \n"
".endr \n"
"mov x30, %0 \n"
: "=&r" (tmp));
}
static enum mitigation_state spectre_v2_enable_fw_mitigation(void)
{
bp_hardening_cb_t cb;
enum mitigation_state state;
state = spectre_v2_get_cpu_fw_mitigation_state();
if (state != SPECTRE_MITIGATED)
return state;
if (spectre_v2_mitigations_off())
return SPECTRE_VULNERABLE;
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
cb = call_hvc_arch_workaround_1;
break;
case SMCCC_CONDUIT_SMC:
cb = call_smc_arch_workaround_1;
break;
default:
return SPECTRE_VULNERABLE;
}
install_bp_hardening_cb(cb);
return SPECTRE_MITIGATED;
}
static enum mitigation_state spectre_v2_enable_sw_mitigation(void)
{
u32 midr;
if (spectre_v2_mitigations_off())
return SPECTRE_VULNERABLE;
midr = read_cpuid_id();
if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) &&
((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1))
return SPECTRE_VULNERABLE;
install_bp_hardening_cb(qcom_link_stack_sanitisation);
return SPECTRE_MITIGATED;
}
void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
enum mitigation_state state;
WARN_ON(preemptible());
state = spectre_v2_get_cpu_hw_mitigation_state();
if (state == SPECTRE_VULNERABLE)
state = spectre_v2_enable_fw_mitigation();
if (state == SPECTRE_VULNERABLE)
state = spectre_v2_enable_sw_mitigation();
update_mitigation_state(&spectre_v2_state, state);
}
/*
* Spectre v4.
*
* If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
* either:
*
* - Mitigated in hardware and listed in our "safe list".
* - Mitigated in hardware via PSTATE.SSBS.
* - Mitigated in software by firmware (sometimes referred to as SSBD).
*
* Wait, that doesn't sound so bad, does it? Keep reading...
*
* A major source of headaches is that the software mitigation is enabled both
* on a per-task basis, but can also be forced on for the kernel, necessitating
* both context-switch *and* entry/exit hooks. To make it even worse, some CPUs
* allow EL0 to toggle SSBS directly, which can end up with the prctl() state
* being stale when re-entering the kernel. The usual big.LITTLE caveats apply,
* so you can have systems that have both firmware and SSBS mitigations. This
* means we actually have to reject late onlining of CPUs with mitigations if
* all of the currently onlined CPUs are safelisted, as the mitigation tends to
* be opt-in for userspace. Yes, really, the cure is worse than the disease.
*
* The only good part is that if the firmware mitigation is present, then it is
* present for all CPUs, meaning we don't have to worry about late onlining of a
* vulnerable CPU if one of the boot CPUs is using the firmware mitigation.
*
* Give me a VAX-11/780 any day of the week...
*/
static enum mitigation_state spectre_v4_state;
/* This is the per-cpu state tracking whether we need to talk to firmware */
DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
enum spectre_v4_policy {
SPECTRE_V4_POLICY_MITIGATION_DYNAMIC,
SPECTRE_V4_POLICY_MITIGATION_ENABLED,
SPECTRE_V4_POLICY_MITIGATION_DISABLED,
};
static enum spectre_v4_policy __read_mostly __spectre_v4_policy;
static const struct spectre_v4_param {
const char *str;
enum spectre_v4_policy policy;
} spectre_v4_params[] = {
{ "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, },
{ "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, },
{ "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, },
};
static int __init parse_spectre_v4_param(char *str)
{
int i;
if (!str || !str[0])
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) {
const struct spectre_v4_param *param = &spectre_v4_params[i];
if (strncmp(str, param->str, strlen(param->str)))
continue;
__spectre_v4_policy = param->policy;
return 0;
}
return -EINVAL;
}
early_param("ssbd", parse_spectre_v4_param);
/*
* Because this was all written in a rush by people working in different silos,
* we've ended up with multiple command line options to control the same thing.
* Wrap these up in some helpers, which prefer disabling the mitigation if faced
* with contradictory parameters. The mitigation is always either "off",
* "dynamic" or "on".
*/
static bool spectre_v4_mitigations_off(void)
{
bool ret = cpu_mitigations_off() ||
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
if (ret)
pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
return ret;
}
/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
static bool spectre_v4_mitigations_dynamic(void)
{
return !spectre_v4_mitigations_off() &&
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC;
}
static bool spectre_v4_mitigations_on(void)
{
return !spectre_v4_mitigations_off() &&
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED;
}
ssize_t cpu_show_spec_store_bypass(struct device *dev,
struct device_attribute *attr, char *buf)
{
switch (spectre_v4_state) {
case SPECTRE_UNAFFECTED:
return sprintf(buf, "Not affected\n");
case SPECTRE_MITIGATED:
return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
case SPECTRE_VULNERABLE:
fallthrough;
default:
return sprintf(buf, "Vulnerable\n");
}
}
enum mitigation_state arm64_get_spectre_v4_state(void)
{
return spectre_v4_state;
}
static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
{
static const struct midr_range spectre_v4_safe_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
{ /* sentinel */ },
};
if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list))
return SPECTRE_UNAFFECTED;
/* CPU features are detected first */
if (this_cpu_has_cap(ARM64_SSBS))
return SPECTRE_MITIGATED;
return SPECTRE_VULNERABLE;
}
static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void)
{
int ret;
struct arm_smccc_res res;
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_WORKAROUND_2, &res);
ret = res.a0;
switch (ret) {
case SMCCC_RET_SUCCESS:
return SPECTRE_MITIGATED;
case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
fallthrough;
case SMCCC_RET_NOT_REQUIRED:
return SPECTRE_UNAFFECTED;
default:
fallthrough;
case SMCCC_RET_NOT_SUPPORTED:
return SPECTRE_VULNERABLE;
}
}
bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope)
{
enum mitigation_state state;
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
state = spectre_v4_get_cpu_hw_mitigation_state();
if (state == SPECTRE_VULNERABLE)
state = spectre_v4_get_cpu_fw_mitigation_state();
return state != SPECTRE_UNAFFECTED;
}
static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
{
if (user_mode(regs))
return 1;
if (instr & BIT(PSTATE_Imm_shift))
regs->pstate |= PSR_SSBS_BIT;
else
regs->pstate &= ~PSR_SSBS_BIT;
arm64_skip_faulting_instruction(regs, 4);
return 0;
}
static struct undef_hook ssbs_emulation_hook = {
.instr_mask = ~(1U << PSTATE_Imm_shift),
.instr_val = 0xd500401f | PSTATE_SSBS,
.fn = ssbs_emulation_handler,
};
static enum mitigation_state spectre_v4_enable_hw_mitigation(void)
{
static bool undef_hook_registered = false;
static DEFINE_RAW_SPINLOCK(hook_lock);
enum mitigation_state state;
/*
* If the system is mitigated but this CPU doesn't have SSBS, then
* we must be on the safelist and there's nothing more to do.
*/
state = spectre_v4_get_cpu_hw_mitigation_state();
if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS))
return state;
raw_spin_lock(&hook_lock);
if (!undef_hook_registered) {
register_undef_hook(&ssbs_emulation_hook);
undef_hook_registered = true;
}
raw_spin_unlock(&hook_lock);
if (spectre_v4_mitigations_off()) {
sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
asm volatile(SET_PSTATE_SSBS(1));
return SPECTRE_VULNERABLE;
}
/* SCTLR_EL1.DSSBS was initialised to 0 during boot */
asm volatile(SET_PSTATE_SSBS(0));
return SPECTRE_MITIGATED;
}
/*
* Patch a branch over the Spectre-v4 mitigation code with a NOP so that
* we fallthrough and check whether firmware needs to be called on this CPU.
*/
void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt,
__le32 *origptr,
__le32 *updptr, int nr_inst)
{
BUG_ON(nr_inst != 1); /* Branch -> NOP */
if (spectre_v4_mitigations_off())
return;
if (cpus_have_final_cap(ARM64_SSBS))
return;
if (spectre_v4_mitigations_dynamic())
*updptr = cpu_to_le32(aarch64_insn_gen_nop());
}
/*
* Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction
* to call into firmware to adjust the mitigation state.
*/
void __init spectre_v4_patch_fw_mitigation_conduit(struct alt_instr *alt,
__le32 *origptr,
__le32 *updptr, int nr_inst)
{
u32 insn;
BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
insn = aarch64_insn_get_hvc_value();
break;
case SMCCC_CONDUIT_SMC:
insn = aarch64_insn_get_smc_value();
break;
default:
return;
}
*updptr = cpu_to_le32(insn);
}
static enum mitigation_state spectre_v4_enable_fw_mitigation(void)
{
enum mitigation_state state;
state = spectre_v4_get_cpu_fw_mitigation_state();
if (state != SPECTRE_MITIGATED)
return state;
if (spectre_v4_mitigations_off()) {
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL);
return SPECTRE_VULNERABLE;
}
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL);
if (spectre_v4_mitigations_dynamic())
__this_cpu_write(arm64_ssbd_callback_required, 1);
return SPECTRE_MITIGATED;
}
void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
enum mitigation_state state;
WARN_ON(preemptible());
state = spectre_v4_enable_hw_mitigation();
if (state == SPECTRE_VULNERABLE)
state = spectre_v4_enable_fw_mitigation();
update_mitigation_state(&spectre_v4_state, state);
}
static void __update_pstate_ssbs(struct pt_regs *regs, bool state)
{
u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
if (state)
regs->pstate |= bit;
else
regs->pstate &= ~bit;
}
void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
{
struct pt_regs *regs = task_pt_regs(tsk);
bool ssbs = false, kthread = tsk->flags & PF_KTHREAD;
if (spectre_v4_mitigations_off())
ssbs = true;
else if (spectre_v4_mitigations_dynamic() && !kthread)
ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD);
__update_pstate_ssbs(regs, ssbs);
}
/*
* The Spectre-v4 mitigation can be controlled via a prctl() from userspace.
* This is interesting because the "speculation disabled" behaviour can be
* configured so that it is preserved across exec(), which means that the
* prctl() may be necessary even when PSTATE.SSBS can be toggled directly
* from userspace.
*/
static void ssbd_prctl_enable_mitigation(struct task_struct *task)
{
task_clear_spec_ssb_noexec(task);
task_set_spec_ssb_disable(task);
set_tsk_thread_flag(task, TIF_SSBD);
}
static void ssbd_prctl_disable_mitigation(struct task_struct *task)
{
task_clear_spec_ssb_noexec(task);
task_clear_spec_ssb_disable(task);
clear_tsk_thread_flag(task, TIF_SSBD);
}
static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
case PR_SPEC_ENABLE:
/* Enable speculation: disable mitigation */
/*
* Force disabled speculation prevents it from being
* re-enabled.
*/
if (task_spec_ssb_force_disable(task))
return -EPERM;
/*
* If the mitigation is forced on, then speculation is forced
* off and we again prevent it from being re-enabled.
*/
if (spectre_v4_mitigations_on())
return -EPERM;
ssbd_prctl_disable_mitigation(task);
break;
case PR_SPEC_FORCE_DISABLE:
/* Force disable speculation: force enable mitigation */
/*
* If the mitigation is forced off, then speculation is forced
* on and we prevent it from being disabled.
*/
if (spectre_v4_mitigations_off())
return -EPERM;
task_set_spec_ssb_force_disable(task);
fallthrough;
case PR_SPEC_DISABLE:
/* Disable speculation: enable mitigation */
/* Same as PR_SPEC_FORCE_DISABLE */
if (spectre_v4_mitigations_off())
return -EPERM;
ssbd_prctl_enable_mitigation(task);
break;
case PR_SPEC_DISABLE_NOEXEC:
/* Disable speculation until execve(): enable mitigation */
/*
* If the mitigation state is forced one way or the other, then
* we must fail now before we try to toggle it on execve().
*/
if (task_spec_ssb_force_disable(task) ||
spectre_v4_mitigations_off() ||
spectre_v4_mitigations_on()) {
return -EPERM;
}
ssbd_prctl_enable_mitigation(task);
task_set_spec_ssb_noexec(task);
break;
default:
return -ERANGE;
}
spectre_v4_enable_task_mitigation(task);
return 0;
}
int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
unsigned long ctrl)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssbd_prctl_set(task, ctrl);
default:
return -ENODEV;
}
}
static int ssbd_prctl_get(struct task_struct *task)
{
switch (spectre_v4_state) {
case SPECTRE_UNAFFECTED:
return PR_SPEC_NOT_AFFECTED;
case SPECTRE_MITIGATED:
if (spectre_v4_mitigations_on())
return PR_SPEC_NOT_AFFECTED;
if (spectre_v4_mitigations_dynamic())
break;
/* Mitigations are disabled, so we're vulnerable. */
fallthrough;
case SPECTRE_VULNERABLE:
fallthrough;
default:
return PR_SPEC_ENABLE;
}
/* Check the mitigation state for this task */
if (task_spec_ssb_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ssb_noexec(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
if (task_spec_ssb_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
}
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssbd_prctl_get(task);
default:
return -ENODEV;
}
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2018 ARM Ltd, All Rights Reserved.
*/
#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
#include <asm/cpufeature.h>
static void ssbd_ssbs_enable(struct task_struct *task)
{
u64 val = is_compat_thread(task_thread_info(task)) ?
PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
task_pt_regs(task)->pstate |= val;
}
static void ssbd_ssbs_disable(struct task_struct *task)
{
u64 val = is_compat_thread(task_thread_info(task)) ?
PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
task_pt_regs(task)->pstate &= ~val;
}
/*
* prctl interface for SSBD
*/
static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
{
int state = arm64_get_ssbd_state();
/* Unsupported */
if (state == ARM64_SSBD_UNKNOWN)
return -ENODEV;
/* Treat the unaffected/mitigated state separately */
if (state == ARM64_SSBD_MITIGATED) {
switch (ctrl) {
case PR_SPEC_ENABLE:
return -EPERM;
case PR_SPEC_DISABLE:
case PR_SPEC_FORCE_DISABLE:
return 0;
}
}
/*
* Things are a bit backward here: the arm64 internal API
* *enables the mitigation* when the userspace API *disables
* speculation*. So much fun.
*/
switch (ctrl) {
case PR_SPEC_ENABLE:
/* If speculation is force disabled, enable is not allowed */
if (state == ARM64_SSBD_FORCE_ENABLE ||
task_spec_ssb_force_disable(task))
return -EPERM;
task_clear_spec_ssb_disable(task);
clear_tsk_thread_flag(task, TIF_SSBD);
ssbd_ssbs_enable(task);
break;
case PR_SPEC_DISABLE:
if (state == ARM64_SSBD_FORCE_DISABLE)
return -EPERM;
task_set_spec_ssb_disable(task);
set_tsk_thread_flag(task, TIF_SSBD);
ssbd_ssbs_disable(task);
break;
case PR_SPEC_FORCE_DISABLE:
if (state == ARM64_SSBD_FORCE_DISABLE)
return -EPERM;
task_set_spec_ssb_disable(task);
task_set_spec_ssb_force_disable(task);
set_tsk_thread_flag(task, TIF_SSBD);
ssbd_ssbs_disable(task);
break;
default:
return -ERANGE;
}
return 0;
}
int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
unsigned long ctrl)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssbd_prctl_set(task, ctrl);
default:
return -ENODEV;
}
}
static int ssbd_prctl_get(struct task_struct *task)
{
switch (arm64_get_ssbd_state()) {
case ARM64_SSBD_UNKNOWN:
return -ENODEV;
case ARM64_SSBD_FORCE_ENABLE:
return PR_SPEC_DISABLE;
case ARM64_SSBD_KERNEL:
if (task_spec_ssb_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ssb_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
case ARM64_SSBD_FORCE_DISABLE:
return PR_SPEC_ENABLE;
default:
return PR_SPEC_NOT_AFFECTED;
}
}
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssbd_prctl_get(task);
default:
return -ENODEV;
}
}
......@@ -72,8 +72,7 @@ void notrace __cpu_suspend_exit(void)
* have turned the mitigation on. If the user has forcefully
* disabled it, make sure their wishes are obeyed.
*/
if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
arm64_set_ssbd_mitigation(false);
spectre_v4_enable_mitigation(NULL);
}
/*
......
......@@ -9,6 +9,7 @@
#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/hyp_image.h>
#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
#include <asm/page.h>
......@@ -21,12 +22,23 @@ ENTRY(_text)
jiffies = jiffies_64;
#ifdef CONFIG_KVM
#define HYPERVISOR_EXTABLE \
. = ALIGN(SZ_8); \
__start___kvm_ex_table = .; \
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
#define HYPERVISOR_PERCPU_SECTION \
. = ALIGN(PAGE_SIZE); \
HYP_SECTION_NAME(.data..percpu) : { \
*(HYP_SECTION_NAME(.data..percpu)) \
}
#else /* CONFIG_KVM */
#define HYPERVISOR_EXTABLE
#define HYPERVISOR_PERCPU_SECTION
#endif
#define HYPERVISOR_TEXT \
/* \
* Align to 4 KB so that \
......@@ -190,6 +202,7 @@ SECTIONS
}
PERCPU_SECTION(L1_CACHE_BYTES)
HYPERVISOR_PERCPU_SECTION
.rela.dyn : ALIGN(8) {
*(.rela .rela*)
......
......@@ -57,9 +57,6 @@ config KVM_ARM_PMU
Adds support for a virtual Performance Monitoring Unit (PMU) in
virtual machines.
config KVM_INDIRECT_VECTORS
def_bool HARDEN_BRANCH_PREDICTOR || RANDOMIZE_BASE
endif # KVM
endif # VIRTUALIZATION
......@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
$(KVM)/vfio.o $(KVM)/irqchip.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \
inject_fault.o regmap.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o \
aarch32.o arch_timer.o \
......
......@@ -46,8 +46,10 @@
__asm__(".arch_extension virt");
#endif
DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
......@@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
int i;
bitmap_free(kvm->arch.pmu_filter);
kvm_vgic_destroy(kvm);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
......@@ -286,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
static_branch_dec(&userspace_irqchip_in_use);
kvm_mmu_free_memory_caches(vcpu);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
......@@ -1259,12 +1263,60 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
}
static unsigned long nvhe_percpu_size(void)
{
return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
}
static unsigned long nvhe_percpu_order(void)
{
unsigned long size = nvhe_percpu_size();
return size ? get_order(size) : 0;
}
static int kvm_map_vectors(void)
{
/*
* SV2 = ARM64_SPECTRE_V2
* HEL2 = ARM64_HARDEN_EL2_VECTORS
*
* !SV2 + !HEL2 -> use direct vectors
* SV2 + !HEL2 -> use hardened vectors in place
* !SV2 + HEL2 -> allocate one vector slot and use exec mapping
* SV2 + HEL2 -> use hardened vectors and use exec mapping
*/
if (cpus_have_const_cap(ARM64_SPECTRE_V2)) {
__kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
__kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
}
if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
/*
* Always allocate a spare vector slot, as we don't
* know yet which CPUs have a BP hardening slot that
* we can reuse.
*/
__kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
return create_hyp_exec_mappings(vect_pa, size,
&__kvm_bp_vect_base);
}
return 0;
}
static void cpu_init_hyp_mode(void)
{
phys_addr_t pgd_ptr;
unsigned long hyp_stack_ptr;
unsigned long vector_ptr;
unsigned long tpidr_el2;
struct arm_smccc_res res;
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
......@@ -1274,12 +1326,13 @@ static void cpu_init_hyp_mode(void)
* kernel's mapping to the linear mapping, and store it in tpidr_el2
* so that we can use adr_l to access per-cpu variables in EL2.
*/
tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
(unsigned long)kvm_ksym_ref(&kvm_host_data));
tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
(unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
pgd_ptr = kvm_mmu_get_httbr();
hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
vector_ptr = (unsigned long)kvm_get_hyp_vector();
hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
/*
* Call initialization code, and switch to the full blown HYP code.
......@@ -1288,14 +1341,16 @@ static void cpu_init_hyp_mode(void)
* cpus_have_const_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
/*
* Disabling SSBD on a non-VHE system requires us to enable SSBS
* at EL2.
*/
if (this_cpu_has_cap(ARM64_SSBS) &&
arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
kvm_call_hyp_nvhe(__kvm_enable_ssbs);
}
}
......@@ -1308,10 +1363,12 @@ static void cpu_hyp_reset(void)
static void cpu_hyp_reinit(void)
{
kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
cpu_hyp_reset();
*this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
else
......@@ -1462,8 +1519,10 @@ static void teardown_hyp_mode(void)
int cpu;
free_hyp_pgds();
for_each_possible_cpu(cpu)
for_each_possible_cpu(cpu) {
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
}
}
/**
......@@ -1496,6 +1555,24 @@ static int init_hyp_mode(void)
per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
}
/*
* Allocate and initialize pages for Hypervisor-mode percpu regions.
*/
for_each_possible_cpu(cpu) {
struct page *page;
void *page_addr;
page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
if (!page) {
err = -ENOMEM;
goto out_err;
}
page_addr = page_address(page);
memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
}
/*
* Map the Hyp-code called directly from the host
*/
......@@ -1540,22 +1617,21 @@ static int init_hyp_mode(void)
}
}
/*
* Map Hyp percpu pages
*/
for_each_possible_cpu(cpu) {
kvm_host_data_t *cpu_data;
char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
char *percpu_end = percpu_begin + nvhe_percpu_size();
cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
if (err) {
kvm_err("Cannot map host CPU state: %d\n", err);
kvm_err("Cannot map hyp percpu region\n");
goto out_err;
}
}
err = hyp_map_aux_data();
if (err)
kvm_err("Cannot map host auxiliary data: %d\n", err);
return 0;
out_err:
......
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2012,2013 - ARM Ltd
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/assembler.h>
#include <asm/cpufeature.h>
/*
* u64 __kvm_call_hyp(void *hypfn, ...);
*
* This is not really a variadic function in the classic C-way and care must
* be taken when calling this to ensure parameters are passed in registers
* only, since the stack will change between the caller and the callee.
*
* Call the function with the first argument containing a pointer to the
* function you wish to call in Hyp mode, and subsequent arguments will be
* passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
* function pointer can be passed). The function being called must be mapped
* in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are
* passed in x0.
*
* A function pointer with a value less than 0xfff has a special meaning,
* and is used to implement hyp stubs in the same way as in
* arch/arm64/kernel/hyp_stub.S.
*/
SYM_FUNC_START(__kvm_call_hyp)
hvc #0
ret
SYM_FUNC_END(__kvm_call_hyp)
......@@ -10,5 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_KVM) += vhe/ nvhe/
obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o
......@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/asm-offsets.h>
#include <asm/assembler.h>
#include <asm/fpsimdmacros.h>
#include <asm/kvm.h>
......@@ -16,66 +15,28 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_ptrauth.h>
#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
.text
/*
* We treat x18 as callee-saved as the host may use it as a platform
* register (e.g. for shadow call stack).
*/
.macro save_callee_saved_regs ctxt
str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm
.macro restore_callee_saved_regs ctxt
// We require \ctxt is not x18-x28
ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm
.macro save_sp_el0 ctxt, tmp
mrs \tmp, sp_el0
str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
.endm
.macro restore_sp_el0 ctxt, tmp
ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
msr sp_el0, \tmp
.endm
/*
* u64 __guest_enter(struct kvm_vcpu *vcpu,
* struct kvm_cpu_context *host_ctxt);
* u64 __guest_enter(struct kvm_vcpu *vcpu);
*/
SYM_FUNC_START(__guest_enter)
// x0: vcpu
// x1: host context
// x2-x17: clobbered by macros
// x1-x17: clobbered by macros
// x29: guest context
// Store the host regs
adr_this_cpu x1, kvm_hyp_ctxt, x2
// Store the hyp regs
save_callee_saved_regs x1
// Save the host's sp_el0
// Save hyp's sp_el0
save_sp_el0 x1, x2
// Now the host state is stored if we have a pending RAS SError it must
// affect the host. If any asynchronous exception is pending we defer
// the guest entry. The DSB isn't necessary before v8.2 as any SError
// would be fatal.
// Now the hyp state is stored if we have a pending RAS SError it must
// affect the host or hyp. If any asynchronous exception is pending we
// defer the guest entry. The DSB isn't necessary before v8.2 as any
// SError would be fatal.
alternative_if ARM64_HAS_RAS_EXTN
dsb nshst
isb
......@@ -86,6 +47,8 @@ alternative_else_nop_endif
ret
1:
set_loaded_vcpu x0, x1, x2
add x29, x0, #VCPU_CONTEXT
// Macro ptrauth_switch_to_guest format:
......@@ -116,6 +79,26 @@ alternative_else_nop_endif
eret
sb
SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
// x2-x29,lr: vcpu regs
// vcpu x0-x1 on the stack
// If the hyp context is loaded, go straight to hyp_panic
get_loaded_vcpu x0, x1
cbz x0, hyp_panic
// The hyp context is saved so make sure it is restored to allow
// hyp_panic to run at hyp and, subsequently, panic to run in the host.
// This makes use of __guest_exit to avoid duplication but sets the
// return address to tail call into hyp_panic. As a side effect, the
// current state is saved to the guest context but it will only be
// accurate if the guest had been completely restored.
adr_this_cpu x0, kvm_hyp_ctxt, x1
adr x1, hyp_panic
str x1, [x0, #CPU_XREG_OFFSET(30)]
get_vcpu_ptr x1, x0
SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// x0: return code
// x1: vcpu
......@@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Store the guest's sp_el0
save_sp_el0 x1, x2
get_host_ctxt x2, x3
adr_this_cpu x2, kvm_hyp_ctxt, x3
// Macro ptrauth_switch_to_guest format:
// ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
// Macro ptrauth_switch_to_hyp format:
// ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3)
// The below macro to save/restore keys is not implemented in C code
// as it may cause Pointer Authentication key signing mismatch errors
// when this feature is enabled for kernel code.
ptrauth_switch_to_host x1, x2, x3, x4, x5
ptrauth_switch_to_hyp x1, x2, x3, x4, x5
// Restore the hosts's sp_el0
// Restore hyp's sp_el0
restore_sp_el0 x2, x3
// Now restore the host regs
// Now restore the hyp regs
restore_callee_saved_regs x2
set_loaded_vcpu xzr, x1, x2
alternative_if ARM64_HAS_RAS_EXTN
// If we have the RAS extensions we can consume a pending error
// without an unmask-SError and isb. The ESB-instruction consumed any
......
......@@ -12,7 +12,6 @@
#include <asm/cpufeature.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/mmu.h>
.macro save_caller_saved_regs_vect
......@@ -41,20 +40,6 @@
.text
.macro do_el2_call
/*
* Shuffle the parameters before calling the function
* pointed to in x0. Assumes parameters in x[1,2,3].
*/
str lr, [sp, #-16]!
mov lr, x0
mov x0, x1
mov x1, x2
mov x2, x3
blr lr
ldr lr, [sp], #16
.endm
el1_sync: // Guest trapped into EL2
mrs x0, esr_el2
......@@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
b.ne el1_trap
#ifdef __KVM_NVHE_HYPERVISOR__
mrs x1, vttbr_el2 // If vttbr is valid, the guest
cbnz x1, el1_hvc_guest // called HVC
/* Here, we're pretty sure the host called HVC. */
ldp x0, x1, [sp], #16
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.hs 1f
/*
* Compute the idmap address of __kvm_handle_stub_hvc and
* jump there. Since we use kimage_voffset, do not use the
* HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
* (by loading it from the constant pool).
*
* Preserve x0-x4, which may contain stub parameters.
*/
ldr x5, =__kvm_handle_stub_hvc
ldr_l x6, kimage_voffset
/* x5 = __pa(x5) */
sub x5, x5, x6
br x5
1:
/*
* Perform the EL2 call
*/
kern_hyp_va x0
do_el2_call
eret
sb
#endif /* __KVM_NVHE_HYPERVISOR__ */
el1_hvc_guest:
/*
* Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
* The workaround has already been applied on the host,
......@@ -116,35 +63,6 @@ el1_hvc_guest:
ARM_SMCCC_ARCH_WORKAROUND_2)
cbnz w1, el1_trap
#ifdef CONFIG_ARM64_SSBD
alternative_cb arm64_enable_wa2_handling
b wa2_end
alternative_cb_end
get_vcpu_ptr x2, x0
ldr x0, [x2, #VCPU_WORKAROUND_FLAGS]
// Sanitize the argument and update the guest flags
ldr x1, [sp, #8] // Guest's x1
clz w1, w1 // Murphy's device:
lsr w1, w1, #5 // w1 = !!w1 without using
eor w1, w1, #1 // the flags...
bfi x0, x1, #VCPU_WORKAROUND_2_FLAG_SHIFT, #1
str x0, [x2, #VCPU_WORKAROUND_FLAGS]
/* Check that we actually need to perform the call */
hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2
cbz x0, wa2_end
mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
smc #0
/* Don't leak data from the SMC call */
mov x3, xzr
wa2_end:
mov x2, xzr
mov x1, xzr
#endif
wa_epilogue:
mov x0, xzr
add sp, sp, #16
......@@ -198,24 +116,7 @@ el2_error:
eret
sb
#ifdef __KVM_NVHE_HYPERVISOR__
SYM_FUNC_START(__hyp_do_panic)
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
PSR_MODE_EL1h)
msr spsr_el2, lr
ldr lr, =panic
msr elr_el2, lr
eret
sb
SYM_FUNC_END(__hyp_do_panic)
#endif
SYM_CODE_START(__hyp_panic)
get_host_ctxt x0, x1
b hyp_panic
SYM_CODE_END(__hyp_panic)
.macro invalid_vector label, target = __hyp_panic
.macro invalid_vector label, target = __guest_exit_panic
.align 2
SYM_CODE_START(\label)
b \target
......@@ -227,7 +128,6 @@ SYM_CODE_END(\label)
invalid_vector el2t_irq_invalid
invalid_vector el2t_fiq_invalid
invalid_vector el2t_error_invalid
invalid_vector el2h_sync_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
invalid_vector el1_fiq_invalid
......@@ -257,10 +157,9 @@ check_preamble_length 661b, 662b
.macro invalid_vect target
.align 7
661:
b \target
nop
stp x0, x1, [sp, #-16]!
662:
ldp x0, x1, [sp], #16
b \target
check_preamble_length 661b, 662b
......@@ -288,7 +187,6 @@ SYM_CODE_START(__kvm_hyp_vector)
valid_vect el1_error // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_vector)
#ifdef CONFIG_KVM_INDIRECT_VECTORS
.macro hyp_ventry
.align 7
1: esb
......@@ -338,4 +236,3 @@ SYM_CODE_START(__bp_harden_hyp_vecs)
1: .org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ
.org 1b
SYM_CODE_END(__bp_harden_hyp_vecs)
#endif
......@@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
......@@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
......
......@@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
}
}
static inline void __activate_vm(struct kvm_s2_mmu *mmu)
{
__load_guest_stage2(mmu);
}
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
u64 par, tmp;
......@@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr)
ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
} while(0)
DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *ctxt;
......@@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
!esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
return false;
ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
__ptrauth_save_key(ctxt, APIA);
__ptrauth_save_key(ctxt, APIB);
__ptrauth_save_key(ctxt, APDA);
......@@ -479,49 +476,15 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
return false;
}
static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu)
{
if (!cpus_have_final_cap(ARM64_SSBD))
return false;
return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG);
}
static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_ARM64_SSBD
/*
* The host runs with the workaround always present. If the
* guest wants it disabled, so be it...
*/
if (__needs_ssbd_off(vcpu) &&
__hyp_this_cpu_read(arm64_ssbd_callback_required))
arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
#endif
}
static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_ARM64_SSBD
/*
* If the guest has disabled the workaround, bring it back on.
*/
if (__needs_ssbd_off(vcpu) &&
__hyp_this_cpu_read(arm64_ssbd_callback_required))
arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
#endif
}
static inline void __kvm_unexpected_el2_exception(void)
{
extern char __guest_exit_panic[];
unsigned long addr, fixup;
struct kvm_cpu_context *host_ctxt;
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
entry = hyp_symbol_addr(__start___kvm_ex_table);
end = hyp_symbol_addr(__stop___kvm_ex_table);
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
while (entry < end) {
addr = (unsigned long)&entry->insn + entry->insn;
......@@ -536,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void)
return;
}
hyp_panic(host_ctxt);
/* Trigger a panic after restoring the hyp context. */
write_sysreg(__guest_exit_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
# SPDX-License-Identifier: GPL-2.0-only
hyp.lds
......@@ -6,44 +6,50 @@
asflags-y := -D__KVM_NVHE_HYPERVISOR__
ccflags-y := -D__KVM_NVHE_HYPERVISOR__
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o
obj-y := $(patsubst %.o,%.hyp.o,$(obj-y))
extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y))
##
## Build rules for compiling nVHE hyp code
## Output of this folder is `kvm_nvhe.o`, a partially linked object
## file containing all nVHE hyp code and data.
##
$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE
hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
obj-y := kvm_nvhe.o
extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
# 1) Compile all source files to `.nvhe.o` object files. The file extension
# avoids file name clashes for files shared with VHE.
$(obj)/%.nvhe.o: $(src)/%.c FORCE
$(call if_changed_rule,cc_o_c)
$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE
$(obj)/%.nvhe.o: $(src)/%.S FORCE
$(call if_changed_rule,as_o_S)
$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE
$(call if_changed,hypcopy)
# Disable reordering functions by GCC (enabled at -O2).
# This pass puts functions into '.text.*' sections to aid the linker
# in optimizing ELF layout. See HYPCOPY comment below for more info.
ccflags-y += $(call cc-option,-fno-reorder-functions)
# 2) Compile linker script.
$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
$(call if_changed_dep,cpp_lds_S)
# 3) Partially link all '.nvhe.o' files and apply the linker script.
# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
# keep the dependency on the target while avoiding an error from
# GNU ld if the linker script is passed to it twice.
LDFLAGS_kvm_nvhe.tmp.o := -r -T
$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
$(call if_changed,ld)
# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
# Prefixes names of ELF symbols with '__kvm_nvhe_'.
$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
$(call if_changed,hypcopy)
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
# and relevant ELF section names to avoid clashes with VHE code/data.
#
# Hyp code is assumed to be in the '.text' section of the input object
# files (with the exception of specialized sections such as
# '.hyp.idmap.text'). This assumption may be broken by a compiler that
# divides code into sections like '.text.unlikely' so as to optimize
# ELF layout. HYPCOPY checks that no such sections exist in the input
# using `objdump`, otherwise they would be linked together with other
# kernel code and not memory-mapped correctly at runtime.
# to avoid clashes with VHE code/data.
quiet_cmd_hypcopy = HYPCOPY $@
cmd_hypcopy = \
if $(OBJDUMP) -h $< | grep -F '.text.'; then \
echo "$@: function reordering not supported in nVHE hyp code" >&2; \
/bin/false; \
fi; \
$(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \
--rename-section=.text=.hyp.text \
$< $@
cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
# Remove ftrace and Shadow Call Stack CFLAGS.
# This is equivalent to the 'notrace' and '__noscs' annotations.
......
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2020 - Google Inc
* Author: Andrew Scull <ascull@google.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
.text
SYM_FUNC_START(__host_exit)
stp x0, x1, [sp, #-16]!
get_host_ctxt x0, x1
ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
/* Store the host regs x2 and x3 */
stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
/* Retrieve the host regs x0-x1 from the stack */
ldp x2, x3, [sp], #16 // x0, x1
/* Store the host regs x0-x1 and x4-x17 */
stp x2, x3, [x0, #CPU_XREG_OFFSET(0)]
stp x4, x5, [x0, #CPU_XREG_OFFSET(4)]
stp x6, x7, [x0, #CPU_XREG_OFFSET(6)]
stp x8, x9, [x0, #CPU_XREG_OFFSET(8)]
stp x10, x11, [x0, #CPU_XREG_OFFSET(10)]
stp x12, x13, [x0, #CPU_XREG_OFFSET(12)]
stp x14, x15, [x0, #CPU_XREG_OFFSET(14)]
stp x16, x17, [x0, #CPU_XREG_OFFSET(16)]
/* Store the host regs x18-x29, lr */
save_callee_saved_regs x0
/* Save the host context pointer in x29 across the function call */
mov x29, x0
bl handle_trap
/* Restore host regs x0-x17 */
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)]
/* x0-7 are use for panic arguments */
__host_enter_for_panic:
ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)]
ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]
/* Restore host regs x18-x29, lr */
restore_callee_saved_regs x29
/* Do not touch any register after this! */
__host_enter_without_restoring:
eret
sb
SYM_FUNC_END(__host_exit)
/*
* void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
*/
SYM_FUNC_START(__hyp_do_panic)
/* Load the format arguments into x1-7 */
mov x6, x3
get_vcpu_ptr x7, x3
mrs x3, esr_el2
mrs x4, far_el2
mrs x5, hpfar_el2
/* Prepare and exit to the host's panic funciton. */
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
PSR_MODE_EL1h)
msr spsr_el2, lr
ldr lr, =panic
msr elr_el2, lr
/*
* Set the panic format string and enter the host, conditionally
* restoring the host context.
*/
cmp x0, xzr
ldr x0, =__hyp_panic_string
b.eq __host_enter_without_restoring
b __host_enter_for_panic
SYM_FUNC_END(__hyp_do_panic)
.macro host_el1_sync_vect
.align 7
.L__vect_start\@:
stp x0, x1, [sp, #-16]!
mrs x0, esr_el2
lsr x0, x0, #ESR_ELx_EC_SHIFT
cmp x0, #ESR_ELx_EC_HVC64
ldp x0, x1, [sp], #16
b.ne __host_exit
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.hs __host_exit
/*
* Compute the idmap address of __kvm_handle_stub_hvc and
* jump there. Since we use kimage_voffset, do not use the
* HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
* (by loading it from the constant pool).
*
* Preserve x0-x4, which may contain stub parameters.
*/
ldr x5, =__kvm_handle_stub_hvc
ldr_l x6, kimage_voffset
/* x5 = __pa(x5) */
sub x5, x5, x6
br x5
.L__vect_end\@:
.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
.error "host_el1_sync_vect larger than vector entry"
.endif
.endm
.macro invalid_host_el2_vect
.align 7
/* If a guest is loaded, panic out of it. */
stp x0, x1, [sp, #-16]!
get_loaded_vcpu x0, x1
cbnz x0, __guest_exit_panic
add sp, sp, #16
/*
* The panic may not be clean if the exception is taken before the host
* context has been saved by __host_exit or after the hyp context has
* been partially clobbered by __host_enter.
*/
b hyp_panic
.endm
.macro invalid_host_el1_vect
.align 7
mov x0, xzr /* restore_host = false */
mrs x1, spsr_el2
mrs x2, elr_el2
mrs x3, par_el1
b __hyp_do_panic
.endm
/*
* The host vector does not use an ESB instruction in order to avoid consuming
* SErrors that should only be consumed by the host. Guest entry is deferred by
* __guest_enter if there are any pending asynchronous exceptions so hyp will
* always return to the host without having consumerd host SErrors.
*
* CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
* host knows about the EL2 vectors already, and there is no point in hiding
* them.
*/
.align 11
SYM_CODE_START(__kvm_hyp_host_vector)
invalid_host_el2_vect // Synchronous EL2t
invalid_host_el2_vect // IRQ EL2t
invalid_host_el2_vect // FIQ EL2t
invalid_host_el2_vect // Error EL2t
invalid_host_el2_vect // Synchronous EL2h
invalid_host_el2_vect // IRQ EL2h
invalid_host_el2_vect // FIQ EL2h
invalid_host_el2_vect // Error EL2h
host_el1_sync_vect // Synchronous 64-bit EL1
invalid_host_el1_vect // IRQ 64-bit EL1
invalid_host_el1_vect // FIQ 64-bit EL1
invalid_host_el1_vect // Error 64-bit EL1
invalid_host_el1_vect // Synchronous 32-bit EL1
invalid_host_el1_vect // IRQ 32-bit EL1
invalid_host_el1_vect // FIQ 32-bit EL1
invalid_host_el1_vect // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_host_vector)
......@@ -4,11 +4,13 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
#include <linux/arm-smccc.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/assembler.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sysreg.h>
......@@ -44,27 +46,37 @@ __invalid:
b .
/*
* x0: HYP pgd
* x1: HYP stack
* x2: HYP vectors
* x3: per-CPU offset
* x0: SMCCC function ID
* x1: HYP pgd
* x2: per-CPU offset
* x3: HYP stack
* x4: HYP vectors
*/
__do_hyp_init:
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc
phys_to_ttbr x4, x0
/* Set tpidr_el2 for use by HYP to free a register */
msr tpidr_el2, x2
mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
cmp x0, x2
b.eq 1f
mov x0, #SMCCC_RET_NOT_SUPPORTED
eret
1: phys_to_ttbr x0, x1
alternative_if ARM64_HAS_CNP
orr x4, x4, #TTBR_CNP_BIT
orr x0, x0, #TTBR_CNP_BIT
alternative_else_nop_endif
msr ttbr0_el2, x4
msr ttbr0_el2, x0
mrs x4, tcr_el1
mov_q x5, TCR_EL2_MASK
and x4, x4, x5
mov x5, #TCR_EL2_RES1
orr x4, x4, x5
mrs x0, tcr_el1
mov_q x1, TCR_EL2_MASK
and x0, x0, x1
mov x1, #TCR_EL2_RES1
orr x0, x0, x1
/*
* The ID map may be configured to use an extended virtual address
......@@ -80,18 +92,18 @@ alternative_else_nop_endif
*
* So use the same T0SZ value we use for the ID map.
*/
ldr_l x5, idmap_t0sz
bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
ldr_l x1, idmap_t0sz
bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
/*
* Set the PS bits in TCR_EL2.
*/
tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
msr tcr_el2, x4
msr tcr_el2, x0
mrs x4, mair_el1
msr mair_el2, x4
mrs x0, mair_el1
msr mair_el2, x0
isb
/* Invalidate the stale TLBs from Bootloader */
......@@ -103,25 +115,22 @@ alternative_else_nop_endif
* as well as the EE bit on BE. Drop the A flag since the compiler
* is allowed to generate unaligned accesses.
*/
mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
CPU_BE( orr x4, x4, #SCTLR_ELx_EE)
mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
CPU_BE( orr x0, x0, #SCTLR_ELx_EE)
alternative_if ARM64_HAS_ADDRESS_AUTH
mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
orr x4, x4, x5
orr x0, x0, x1
alternative_else_nop_endif
msr sctlr_el2, x4
msr sctlr_el2, x0
isb
/* Set the stack and new vectors */
kern_hyp_va x1
mov sp, x1
msr vbar_el2, x2
/* Set tpidr_el2 for use by HYP */
msr tpidr_el2, x3
mov sp, x3
msr vbar_el2, x4
/* Hello, World! */
mov x0, #SMCCC_RET_SUCCESS
eret
SYM_CODE_END(__kvm_hyp_init)
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2020 - Google Inc
* Author: Andrew Scull <ascull@google.com>
*/
#include <hyp/switch.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <kvm/arm_hypercalls.h>
static void handle_host_hcall(unsigned long func_id,
struct kvm_cpu_context *host_ctxt)
{
unsigned long ret = 0;
switch (func_id) {
case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
unsigned long r1 = host_ctxt->regs.regs[1];
struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
break;
}
case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
__kvm_flush_vm_context();
break;
case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
unsigned long r1 = host_ctxt->regs.regs[1];
struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
phys_addr_t ipa = host_ctxt->regs.regs[2];
int level = host_ctxt->regs.regs[3];
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
break;
}
case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
unsigned long r1 = host_ctxt->regs.regs[1];
struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
__kvm_tlb_flush_vmid(kern_hyp_va(mmu));
break;
}
case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
unsigned long r1 = host_ctxt->regs.regs[1];
struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
__kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
break;
}
case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
u64 cntvoff = host_ctxt->regs.regs[1];
__kvm_timer_set_cntvoff(cntvoff);
break;
}
case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
__kvm_enable_ssbs();
break;
case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
ret = __vgic_v3_get_ich_vtr_el2();
break;
case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
ret = __vgic_v3_read_vmcr();
break;
case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
u32 vmcr = host_ctxt->regs.regs[1];
__vgic_v3_write_vmcr(vmcr);
break;
}
case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
__vgic_v3_init_lrs();
break;
case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
ret = __kvm_get_mdcr_el2();
break;
case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
unsigned long r1 = host_ctxt->regs.regs[1];
struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
__vgic_v3_save_aprs(kern_hyp_va(cpu_if));
break;
}
case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
unsigned long r1 = host_ctxt->regs.regs[1];
struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
__vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
break;
}
default:
/* Invalid host HVC. */
host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
return;
}
host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
host_ctxt->regs.regs[1] = ret;
}
void handle_trap(struct kvm_cpu_context *host_ctxt)
{
u64 esr = read_sysreg_el2(SYS_ESR);
unsigned long func_id;
if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
hyp_panic();
func_id = host_ctxt->regs.regs[0];
handle_host_hcall(func_id, host_ctxt);
}
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2020 Google LLC.
* Written by David Brazdil <dbrazdil@google.com>
*
* Linker script used for partial linking of nVHE EL2 object files.
*/
#include <asm/hyp_image.h>
#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/memory.h>
SECTIONS {
HYP_SECTION(.text)
HYP_SECTION_NAME(.data..percpu) : {
PERCPU_INPUT(L1_CACHE_BYTES)
}
}
......@@ -27,6 +27,11 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
/* Non-VHE specific context */
DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
......@@ -42,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
}
write_sysreg(val, cptr_el2);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
......@@ -60,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
extern char __kvm_hyp_host_vector[];
u64 mdcr_el2;
___deactivate_traps(vcpu);
......@@ -91,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
static void __deactivate_vm(struct kvm_vcpu *vcpu)
static void __load_host_stage2(void)
{
write_sysreg(0, vttbr_el2);
}
......@@ -173,9 +181,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmr_sync();
}
vcpu = kern_hyp_va(vcpu);
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
......@@ -194,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg32_restore_state(vcpu);
__sysreg_restore_state_nvhe(guest_ctxt);
__activate_vm(kern_hyp_va(vcpu->arch.hw_mmu));
__load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
__activate_traps(vcpu);
__hyp_vgic_restore_state(vcpu);
......@@ -202,24 +208,20 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__debug_switch_to_guest(vcpu);
__set_guest_arch_workaround_state(vcpu);
do {
/* Jump in the fire! */
exit_code = __guest_enter(vcpu, host_ctxt);
exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
__set_host_arch_workaround_state(vcpu);
__sysreg_save_state_nvhe(guest_ctxt);
__sysreg32_save_state(vcpu);
__timer_disable_traps(vcpu);
__hyp_vgic_save_state(vcpu);
__deactivate_traps(vcpu);
__deactivate_vm(vcpu);
__load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
......@@ -239,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
if (system_uses_irq_prio_masking())
gic_write_pmr(GIC_PRIO_IRQOFF);
host_ctxt->__hyp_running_vcpu = NULL;
return exit_code;
}
void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu;
unsigned long str_va;
bool restore_host = true;
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
vcpu = host_ctxt->__hyp_running_vcpu;
if (read_sysreg(vttbr_el2)) {
if (vcpu) {
__timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
__deactivate_vm(vcpu);
__load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
}
/*
* Force the panic string to be loaded from the literal pool,
* making sure it is a kernel address and not a PC-relative
* reference.
*/
asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string));
__hyp_do_panic(str_va,
spsr, elr,
read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
read_sysreg(hpfar_el2), par, vcpu);
__hyp_do_panic(restore_host, spsr, elr, par);
unreachable();
}
......
......@@ -54,7 +54,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
/*
......@@ -108,7 +107,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
dsb(ishst);
/* Switch to requested VMID */
mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalls12e1is);
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
* No bombay mix was harmed in the writing of this file.
*
* Copyright (C) 2020 Google LLC
* Author: Will Deacon <will@kernel.org>
*/
#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>
#define KVM_PGTABLE_MAX_LEVELS 4U
#define KVM_PTE_VALID BIT(0)
#define KVM_PTE_TYPE BIT(1)
#define KVM_PTE_TYPE_BLOCK 0
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
struct kvm_pgtable_walk_data {
struct kvm_pgtable *pgt;
struct kvm_pgtable_walker *walker;
u64 addr;
u64 end;
};
static u64 kvm_granule_shift(u32 level)
{
/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
}
static u64 kvm_granule_size(u32 level)
{
return BIT(kvm_granule_shift(level));
}
static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
{
u64 granule = kvm_granule_size(level);
/*
* Reject invalid block mappings and don't bother with 4TB mappings for
* 52-bit PAs.
*/
if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
return false;
if (granule > (end - addr))
return false;
return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
}
static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
{
u64 shift = kvm_granule_shift(level);
u64 mask = BIT(PAGE_SHIFT - 3) - 1;
return (data->addr >> shift) & mask;
}
static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
{
u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
u64 mask = BIT(pgt->ia_bits) - 1;
return (addr & mask) >> shift;
}
static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
{
return __kvm_pgd_page_idx(data->pgt, data->addr);
}
static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
{
struct kvm_pgtable pgt = {
.ia_bits = ia_bits,
.start_level = start_level,
};
return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
static bool kvm_pte_valid(kvm_pte_t pte)
{
return pte & KVM_PTE_VALID;
}
static bool kvm_pte_table(kvm_pte_t pte, u32 level)
{
if (level == KVM_PGTABLE_MAX_LEVELS - 1)
return false;
if (!kvm_pte_valid(pte))
return false;
return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}
static u64 kvm_pte_to_phys(kvm_pte_t pte)
{
u64 pa = pte & KVM_PTE_ADDR_MASK;
if (PAGE_SHIFT == 16)
pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
return pa;
}
static kvm_pte_t kvm_phys_to_pte(u64 pa)
{
kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
if (PAGE_SHIFT == 16)
pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
return pte;
}
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
{
return __va(kvm_pte_to_phys(pte));
}
static void kvm_set_invalid_pte(kvm_pte_t *ptep)
{
kvm_pte_t pte = *ptep;
WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
}
static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
{
kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
pte |= KVM_PTE_VALID;
WARN_ON(kvm_pte_valid(old));
smp_store_release(ptep, pte);
}
static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
u32 level)
{
kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
KVM_PTE_TYPE_BLOCK;
pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
pte |= KVM_PTE_VALID;
/* Tolerate KVM recreating the exact same mapping. */
if (kvm_pte_valid(old))
return old == pte;
smp_store_release(ptep, pte);
return true;
}
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag)
{
struct kvm_pgtable_walker *walker = data->walker;
return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
}
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
kvm_pte_t *pgtable, u32 level);
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
kvm_pte_t *ptep, u32 level)
{
int ret = 0;
u64 addr = data->addr;
kvm_pte_t *childp, pte = *ptep;
bool table = kvm_pte_table(pte, level);
enum kvm_pgtable_walk_flags flags = data->walker->flags;
if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
KVM_PGTABLE_WALK_TABLE_PRE);
}
if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
KVM_PGTABLE_WALK_LEAF);
pte = *ptep;
table = kvm_pte_table(pte, level);
}
if (ret)
goto out;
if (!table) {
data->addr += kvm_granule_size(level);
goto out;
}
childp = kvm_pte_follow(pte);
ret = __kvm_pgtable_walk(data, childp, level + 1);
if (ret)
goto out;
if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
KVM_PGTABLE_WALK_TABLE_POST);
}
out:
return ret;
}
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
kvm_pte_t *pgtable, u32 level)
{
u32 idx;
int ret = 0;
if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
return -EINVAL;
for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
kvm_pte_t *ptep = &pgtable[idx];
if (data->addr >= data->end)
break;
ret = __kvm_pgtable_visit(data, ptep, level);
if (ret)
break;
}
return ret;
}
static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
{
u32 idx;
int ret = 0;
struct kvm_pgtable *pgt = data->pgt;
u64 limit = BIT(pgt->ia_bits);
if (data->addr > limit || data->end > limit)
return -ERANGE;
if (!pgt->pgd)
return -EINVAL;
for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
if (ret)
break;
}
return ret;
}
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker)
{
struct kvm_pgtable_walk_data walk_data = {
.pgt = pgt,
.addr = ALIGN_DOWN(addr, PAGE_SIZE),
.end = PAGE_ALIGN(walk_data.addr + size),
.walker = walker,
};
return _kvm_pgtable_walk(&walk_data);
}
struct hyp_map_data {
u64 phys;
kvm_pte_t attr;
};
static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
struct hyp_map_data *data)
{
bool device = prot & KVM_PGTABLE_PROT_DEVICE;
u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
if (!(prot & KVM_PGTABLE_PROT_R))
return -EINVAL;
if (prot & KVM_PGTABLE_PROT_X) {
if (prot & KVM_PGTABLE_PROT_W)
return -EINVAL;
if (device)
return -EINVAL;
} else {
attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
}
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
data->attr = attr;
return 0;
}
static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
{
u64 granule = kvm_granule_size(level), phys = data->phys;
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
data->phys += granule;
return true;
}
static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag, void * const arg)
{
kvm_pte_t *childp;
if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
return 0;
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
if (!childp)
return -ENOMEM;
kvm_set_table_pte(ptep, childp);
return 0;
}
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
enum kvm_pgtable_prot prot)
{
int ret;
struct hyp_map_data map_data = {
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
};
struct kvm_pgtable_walker walker = {
.cb = hyp_map_walker,
.flags = KVM_PGTABLE_WALK_LEAF,
.arg = &map_data,
};
ret = hyp_map_set_prot_attr(prot, &map_data);
if (ret)
return ret;
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
dsb(ishst);
isb();
return ret;
}
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
{
u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
if (!pgt->pgd)
return -ENOMEM;
pgt->ia_bits = va_bits;
pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
pgt->mmu = NULL;
return 0;
}
static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag, void * const arg)
{
free_page((unsigned long)kvm_pte_follow(*ptep));
return 0;
}
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
struct kvm_pgtable_walker walker = {
.cb = hyp_free_walker,
.flags = KVM_PGTABLE_WALK_TABLE_POST,
};
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
free_page((unsigned long)pgt->pgd);
pgt->pgd = NULL;
}
struct stage2_map_data {
u64 phys;
kvm_pte_t attr;
kvm_pte_t *anchor;
struct kvm_s2_mmu *mmu;
struct kvm_mmu_memory_cache *memcache;
};
static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
struct stage2_map_data *data)
{
bool device = prot & KVM_PGTABLE_PROT_DEVICE;
kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
PAGE_S2_MEMATTR(NORMAL);
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
if (!(prot & KVM_PGTABLE_PROT_X))
attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
else if (device)
return -EINVAL;
if (prot & KVM_PGTABLE_PROT_R)
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
if (prot & KVM_PGTABLE_PROT_W)
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
data->attr = attr;
return 0;
}
static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
{
u64 granule = kvm_granule_size(level), phys = data->phys;
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
goto out;
/* There's an existing valid leaf entry, so perform break-before-make */
kvm_set_invalid_pte(ptep);
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
out:
data->phys += granule;
return true;
}
static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
{
if (data->anchor)
return 0;
if (!kvm_block_mapping_supported(addr, end, data->phys, level))
return 0;
kvm_set_invalid_pte(ptep);
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
data->anchor = ptep;
return 0;
}
static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
struct stage2_map_data *data)
{
kvm_pte_t *childp, pte = *ptep;
struct page *page = virt_to_page(ptep);
if (data->anchor) {
if (kvm_pte_valid(pte))
put_page(page);
return 0;
}
if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
goto out_get_page;
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
if (!data->memcache)
return -ENOMEM;
childp = kvm_mmu_memory_cache_alloc(data->memcache);
if (!childp)
return -ENOMEM;
/*
* If we've run into an existing block mapping then replace it with
* a table. Accesses beyond 'end' that fall within the new table
* will be mapped lazily.
*/
if (kvm_pte_valid(pte)) {
kvm_set_invalid_pte(ptep);
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
put_page(page);
}
kvm_set_table_pte(ptep, childp);
out_get_page:
get_page(page);
return 0;
}
static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
{
int ret = 0;
if (!data->anchor)
return 0;
free_page((unsigned long)kvm_pte_follow(*ptep));
put_page(virt_to_page(ptep));
if (data->anchor == ptep) {
data->anchor = NULL;
ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
}
return ret;
}
/*
* This is a little fiddly, as we use all three of the walk flags. The idea
* is that the TABLE_PRE callback runs for table entries on the way down,
* looking for table entries which we could conceivably replace with a
* block entry for this mapping. If it finds one, then it sets the 'anchor'
* field in 'struct stage2_map_data' to point at the table entry, before
* clearing the entry to zero and descending into the now detached table.
*
* The behaviour of the LEAF callback then depends on whether or not the
* anchor has been set. If not, then we're not using a block mapping higher
* up the table and we perform the mapping at the existing leaves instead.
* If, on the other hand, the anchor _is_ set, then we drop references to
* all valid leaves so that the pages beneath the anchor can be freed.
*
* Finally, the TABLE_POST callback does nothing if the anchor has not
* been set, but otherwise frees the page-table pages while walking back up
* the page-table, installing the block entry when it revisits the anchor
* pointer and clearing the anchor to NULL.
*/
static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag, void * const arg)
{
struct stage2_map_data *data = arg;
switch (flag) {
case KVM_PGTABLE_WALK_TABLE_PRE:
return stage2_map_walk_table_pre(addr, end, level, ptep, data);
case KVM_PGTABLE_WALK_LEAF:
return stage2_map_walk_leaf(addr, end, level, ptep, data);
case KVM_PGTABLE_WALK_TABLE_POST:
return stage2_map_walk_table_post(addr, end, level, ptep, data);
}
return -EINVAL;
}
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
u64 phys, enum kvm_pgtable_prot prot,
struct kvm_mmu_memory_cache *mc)
{
int ret;
struct stage2_map_data map_data = {
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
.mmu = pgt->mmu,
.memcache = mc,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
.flags = KVM_PGTABLE_WALK_TABLE_PRE |
KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
.arg = &map_data,
};
ret = stage2_map_set_prot_attr(prot, &map_data);
if (ret)
return ret;
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
dsb(ishst);
return ret;
}
static void stage2_flush_dcache(void *addr, u64 size)
{
if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
return;
__flush_dcache_area(addr, size);
}
static bool stage2_pte_cacheable(kvm_pte_t pte)
{
u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
return memattr == PAGE_S2_MEMATTR(NORMAL);
}
static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
struct kvm_s2_mmu *mmu = arg;
kvm_pte_t pte = *ptep, *childp = NULL;
bool need_flush = false;
if (!kvm_pte_valid(pte))
return 0;
if (kvm_pte_table(pte, level)) {
childp = kvm_pte_follow(pte);
if (page_count(virt_to_page(childp)) != 1)
return 0;
} else if (stage2_pte_cacheable(pte)) {
need_flush = true;
}
/*
* This is similar to the map() path in that we unmap the entire
* block entry and rely on the remaining portions being faulted
* back lazily.
*/
kvm_set_invalid_pte(ptep);
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
put_page(virt_to_page(ptep));
if (need_flush) {
stage2_flush_dcache(kvm_pte_follow(pte),
kvm_granule_size(level));
}
if (childp)
free_page((unsigned long)childp);
return 0;
}
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
struct kvm_pgtable_walker walker = {
.cb = stage2_unmap_walker,
.arg = pgt->mmu,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
};
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
struct stage2_attr_data {
kvm_pte_t attr_set;
kvm_pte_t attr_clr;
kvm_pte_t pte;
u32 level;
};
static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
kvm_pte_t pte = *ptep;
struct stage2_attr_data *data = arg;
if (!kvm_pte_valid(pte))
return 0;
data->level = level;
data->pte = pte;
pte &= ~data->attr_clr;
pte |= data->attr_set;
/*
* We may race with the CPU trying to set the access flag here,
* but worst-case the access flag update gets lost and will be
* set on the next access instead.
*/
if (data->pte != pte)
WRITE_ONCE(*ptep, pte);
return 0;
}
static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
u64 size, kvm_pte_t attr_set,
kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
u32 *level)
{
int ret;
kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
struct stage2_attr_data data = {
.attr_set = attr_set & attr_mask,
.attr_clr = attr_clr & attr_mask,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_attr_walker,
.arg = &data,
.flags = KVM_PGTABLE_WALK_LEAF,
};
ret = kvm_pgtable_walk(pgt, addr, size, &walker);
if (ret)
return ret;
if (orig_pte)
*orig_pte = data.pte;
if (level)
*level = data.level;
return 0;
}
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
return stage2_update_leaf_attrs(pgt, addr, size, 0,
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
NULL, NULL);
}
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
{
kvm_pte_t pte = 0;
stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
&pte, NULL);
dsb(ishst);
return pte;
}
kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
{
kvm_pte_t pte = 0;
stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
&pte, NULL);
/*
* "But where's the TLBI?!", you scream.
* "Over in the core code", I sigh.
*
* See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/
return pte;
}
bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
{
kvm_pte_t pte = 0;
stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
}
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot)
{
int ret;
u32 level;
kvm_pte_t set = 0, clr = 0;
if (prot & KVM_PGTABLE_PROT_R)
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
if (prot & KVM_PGTABLE_PROT_W)
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
if (prot & KVM_PGTABLE_PROT_X)
clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
if (!ret)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
return ret;
}
static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
kvm_pte_t pte = *ptep;
if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
return 0;
stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
return 0;
}
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
struct kvm_pgtable_walker walker = {
.cb = stage2_flush_walker,
.flags = KVM_PGTABLE_WALK_LEAF,
};
if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
return 0;
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
{
size_t pgd_sz;
u64 vtcr = kvm->arch.vtcr;
u32 ia_bits = VTCR_EL2_IPA(vtcr);
u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
if (!pgt->pgd)
return -ENOMEM;
pgt->ia_bits = ia_bits;
pgt->start_level = start_level;
pgt->mmu = &kvm->arch.mmu;
/* Ensure zeroed PGD pages are visible to the hardware walker */
dsb(ishst);
return 0;
}
static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
kvm_pte_t pte = *ptep;
if (!kvm_pte_valid(pte))
return 0;
put_page(virt_to_page(ptep));
if (kvm_pte_table(pte, level))
free_page((unsigned long)kvm_pte_follow(pte));
return 0;
}
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
size_t pgd_sz;
struct kvm_pgtable_walker walker = {
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
};
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
free_pages_exact(pgt->pgd, pgd_sz);
pgt->pgd = NULL;
}
......@@ -28,6 +28,11 @@
const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
/* VHE specific context */
DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
......@@ -59,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
write_sysreg(val, cpacr_el1);
write_sysreg(kvm_get_hyp_vector(), vbar_el1);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
NOKPROBE_SYMBOL(__activate_traps);
......@@ -108,7 +113,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt;
u64 exit_code;
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
......@@ -120,28 +125,24 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
* HCR_EL2.TGE.
*
* We have already configured the guest's stage 1 translation in
* kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm
* before __activate_traps, because __activate_vm configures
* stage 2 translation, and __activate_traps clear HCR_EL2.TGE
* (among other things).
* kvm_vcpu_load_sysregs_vhe above. We must now call
* __load_guest_stage2 before __activate_traps, because
* __load_guest_stage2 configures stage 2 translation, and
* __activate_traps clear HCR_EL2.TGE (among other things).
*/
__activate_vm(vcpu->arch.hw_mmu);
__load_guest_stage2(vcpu->arch.hw_mmu);
__activate_traps(vcpu);
sysreg_restore_guest_state_vhe(guest_ctxt);
__debug_switch_to_guest(vcpu);
__set_guest_arch_workaround_state(vcpu);
do {
/* Jump in the fire! */
exit_code = __guest_enter(vcpu, host_ctxt);
exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
__set_host_arch_workaround_state(vcpu);
sysreg_save_guest_state_vhe(guest_ctxt);
__deactivate_traps(vcpu);
......@@ -192,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return ret;
}
static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
struct kvm_cpu_context *host_ctxt)
static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
vcpu = host_ctxt->__hyp_running_vcpu;
__deactivate_traps(vcpu);
......@@ -208,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
}
NOKPROBE_SYMBOL(__hyp_call_panic);
void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
__hyp_call_panic(spsr, elr, par, host_ctxt);
__hyp_call_panic(spsr, elr, par);
unreachable();
}
......
......@@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
__sysreg_save_user_state(host_ctxt);
/*
......@@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
deactivate_traps_vhe_put();
__sysreg_save_el1_state(guest_ctxt);
......
......@@ -24,27 +24,36 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
feature = smccc_get_arg1(vcpu);
switch (feature) {
case ARM_SMCCC_ARCH_WORKAROUND_1:
switch (kvm_arm_harden_branch_predictor()) {
case KVM_BP_HARDEN_UNKNOWN:
switch (arm64_get_spectre_v2_state()) {
case SPECTRE_VULNERABLE:
break;
case KVM_BP_HARDEN_WA_NEEDED:
case SPECTRE_MITIGATED:
val = SMCCC_RET_SUCCESS;
break;
case KVM_BP_HARDEN_NOT_REQUIRED:
case SPECTRE_UNAFFECTED:
val = SMCCC_RET_NOT_REQUIRED;
break;
}
break;
case ARM_SMCCC_ARCH_WORKAROUND_2:
switch (kvm_arm_have_ssbd()) {
case KVM_SSBD_FORCE_DISABLE:
case KVM_SSBD_UNKNOWN:
switch (arm64_get_spectre_v4_state()) {
case SPECTRE_VULNERABLE:
break;
case KVM_SSBD_KERNEL:
val = SMCCC_RET_SUCCESS;
case SPECTRE_MITIGATED:
/*
* SSBS everywhere: Indicate no firmware
* support, as the SSBS support will be
* indicated to the guest and the default is
* safe.
*
* Otherwise, expose a permanent mitigation
* to the guest, and hide SSBS so that the
* guest stays protected.
*/
if (cpus_have_final_cap(ARM64_SSBS))
break;
case KVM_SSBD_FORCE_ENABLE:
case KVM_SSBD_MITIGATED:
fallthrough;
case SPECTRE_UNAFFECTED:
val = SMCCC_RET_NOT_REQUIRED;
break;
}
......
......@@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
/**
* kvm_inject_undefined - inject an undefined instruction into the guest
* @vcpu: The vCPU in which to inject the exception
*
* It is assumed that this code is called from the VCPU thread and that the
* VCPU therefore is not currently executing guest code.
......
......@@ -14,6 +14,7 @@
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
......@@ -21,9 +22,7 @@
#include "trace.h"
static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
static unsigned long hyp_idmap_start;
......@@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector;
static unsigned long io_map_base;
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
static bool is_iomap(unsigned long flags)
/*
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
* CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
* long will also starve other vCPUs. We have to also make sure that the page
* tables are not freed while we released the lock.
*/
static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
phys_addr_t end,
int (*fn)(struct kvm_pgtable *, u64, u64),
bool resched)
{
return flags & KVM_S2PTE_FLAG_IS_IOMAP;
int ret;
u64 next;
do {
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
if (!pgt)
return -EINVAL;
next = stage2_pgd_addr_end(kvm, addr, end);
ret = fn(pgt, addr, next - addr);
if (ret)
break;
if (resched && next != end)
cond_resched_lock(&kvm->mmu_lock);
} while (addr = next, addr != end);
return ret;
}
#define stage2_apply_range_resched(kvm, addr, end, fn) \
stage2_apply_range(kvm, addr, end, fn, true)
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
......@@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
int level)
{
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
}
/*
* D-Cache management functions. They take the page table entries by
* value, as they are flushing the cache using the kernel mapping (or
* kmap on 32bit).
*/
static void kvm_flush_dcache_pte(pte_t pte)
{
__kvm_flush_dcache_pte(pte);
}
static void kvm_flush_dcache_pmd(pmd_t pmd)
{
__kvm_flush_dcache_pmd(pmd);
}
static void kvm_flush_dcache_pud(pud_t pud)
{
__kvm_flush_dcache_pud(pud);
}
static bool kvm_is_device_pfn(unsigned long pfn)
{
return !pfn_valid(pfn);
}
/**
* stage2_dissolve_pmd() - clear and flush huge PMD entry
* @mmu: pointer to mmu structure to operate on
* @addr: IPA
* @pmd: pmd pointer for IPA
*
* Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
{
if (!pmd_thp_or_huge(*pmd))
return;
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
put_page(virt_to_page(pmd));
}
/**
* stage2_dissolve_pud() - clear and flush huge PUD entry
* @mmu: pointer to mmu structure to operate on
* @addr: IPA
* @pud: pud pointer for IPA
*
* Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
{
struct kvm *kvm = mmu->kvm;
if (!stage2_pud_huge(kvm, *pudp))
return;
stage2_pud_clear(kvm, pudp);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
put_page(virt_to_page(pudp));
}
static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
{
struct kvm *kvm = mmu->kvm;
p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
stage2_pgd_clear(kvm, pgd);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
stage2_p4d_free(kvm, p4d_table);
put_page(virt_to_page(pgd));
}
static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
{
struct kvm *kvm = mmu->kvm;
pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
stage2_p4d_clear(kvm, p4d);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
stage2_pud_free(kvm, pud_table);
put_page(virt_to_page(p4d));
}
static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
{
struct kvm *kvm = mmu->kvm;
pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
VM_BUG_ON(stage2_pud_huge(kvm, *pud));
stage2_pud_clear(kvm, pud);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
stage2_pmd_free(kvm, pmd_table);
put_page(virt_to_page(pud));
}
static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
{
pte_t *pte_table = pte_offset_kernel(pmd, 0);
VM_BUG_ON(pmd_thp_or_huge(*pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
free_page((unsigned long)pte_table);
put_page(virt_to_page(pmd));
}
static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
{
WRITE_ONCE(*ptep, new_pte);
dsb(ishst);
}
static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
{
WRITE_ONCE(*pmdp, new_pmd);
dsb(ishst);
}
static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
{
kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
}
static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
{
WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
dsb(ishst);
}
static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
{
WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
dsb(ishst);
}
static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
{
#ifndef __PAGETABLE_P4D_FOLDED
WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
dsb(ishst);
#endif
}
/*
* Unmapping vs dcache management:
*
......@@ -223,120 +105,19 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
* end up writing old data to disk.
*
* This is why right after unmapping a page/section and invalidating
* the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
* the IO subsystem will never hit in the cache.
* the corresponding TLBs, we flush to make sure the IO subsystem will
* never hit in the cache.
*
* This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
* we then fully enforce cacheability of RAM, no matter what the guest
* does.
*/
static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
phys_addr_t addr, phys_addr_t end)
{
phys_addr_t start_addr = addr;
pte_t *pte, *start_pte;
start_pte = pte = pte_offset_kernel(pmd, addr);
do {
if (!pte_none(*pte)) {
pte_t old_pte = *pte;
kvm_set_pte(pte, __pte(0));
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
/* No need to invalidate the cache for device mappings */
if (!kvm_is_device_pfn(pte_pfn(old_pte)))
kvm_flush_dcache_pte(old_pte);
put_page(virt_to_page(pte));
}
} while (pte++, addr += PAGE_SIZE, addr != end);
if (stage2_pte_table_empty(mmu->kvm, start_pte))
clear_stage2_pmd_entry(mmu, pmd, start_addr);
}
static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
phys_addr_t next, start_addr = addr;
pmd_t *pmd, *start_pmd;
start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
do {
next = stage2_pmd_addr_end(kvm, addr, end);
if (!pmd_none(*pmd)) {
if (pmd_thp_or_huge(*pmd)) {
pmd_t old_pmd = *pmd;
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
kvm_flush_dcache_pmd(old_pmd);
put_page(virt_to_page(pmd));
} else {
unmap_stage2_ptes(mmu, pmd, addr, next);
}
}
} while (pmd++, addr = next, addr != end);
if (stage2_pmd_table_empty(kvm, start_pmd))
clear_stage2_pud_entry(mmu, pud, start_addr);
}
static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
phys_addr_t next, start_addr = addr;
pud_t *pud, *start_pud;
start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
do {
next = stage2_pud_addr_end(kvm, addr, end);
if (!stage2_pud_none(kvm, *pud)) {
if (stage2_pud_huge(kvm, *pud)) {
pud_t old_pud = *pud;
stage2_pud_clear(kvm, pud);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
kvm_flush_dcache_pud(old_pud);
put_page(virt_to_page(pud));
} else {
unmap_stage2_pmds(mmu, pud, addr, next);
}
}
} while (pud++, addr = next, addr != end);
if (stage2_pud_table_empty(kvm, start_pud))
clear_stage2_p4d_entry(mmu, p4d, start_addr);
}
static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
phys_addr_t next, start_addr = addr;
p4d_t *p4d, *start_p4d;
start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
do {
next = stage2_p4d_addr_end(kvm, addr, end);
if (!stage2_p4d_none(kvm, *p4d))
unmap_stage2_puds(mmu, p4d, addr, next);
} while (p4d++, addr = next, addr != end);
if (stage2_p4d_table_empty(kvm, start_p4d))
clear_stage2_pgd_entry(mmu, pgd, start_addr);
}
/**
* unmap_stage2_range -- Clear stage2 page table entries to unmap a range
* @kvm: The VM pointer
* @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
* @may_block: Whether or not we are permitted to block
*
* Clear a range of stage-2 mappings, lowering the various ref-counts. Must
* be called while holding mmu_lock (unless for freeing the stage2 pgd before
......@@ -347,32 +128,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
bool may_block)
{
struct kvm *kvm = mmu->kvm;
pgd_t *pgd;
phys_addr_t addr = start, end = start + size;
phys_addr_t next;
phys_addr_t end = start + size;
assert_spin_locked(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
do {
/*
* Make sure the page table is still active, as another thread
* could have possibly freed the page table, while we released
* the lock.
*/
if (!READ_ONCE(mmu->pgd))
break;
next = stage2_pgd_addr_end(kvm, addr, end);
if (!stage2_pgd_none(kvm, *pgd))
unmap_stage2_p4ds(mmu, pgd, addr, next);
/*
* If the range is too large, release the kvm->mmu_lock
* to prevent starvation and lockup detector warnings.
*/
if (may_block && next != end)
cond_resched_lock(&kvm->mmu_lock);
} while (pgd++, addr = next, addr != end);
WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
may_block));
}
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
......@@ -380,89 +141,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
__unmap_stage2_range(mmu, start, size, true);
}
static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
phys_addr_t addr, phys_addr_t end)
{
pte_t *pte;
pte = pte_offset_kernel(pmd, addr);
do {
if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
kvm_flush_dcache_pte(*pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
}
static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
pmd_t *pmd;
phys_addr_t next;
pmd = stage2_pmd_offset(kvm, pud, addr);
do {
next = stage2_pmd_addr_end(kvm, addr, end);
if (!pmd_none(*pmd)) {
if (pmd_thp_or_huge(*pmd))
kvm_flush_dcache_pmd(*pmd);
else
stage2_flush_ptes(mmu, pmd, addr, next);
}
} while (pmd++, addr = next, addr != end);
}
static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
pud_t *pud;
phys_addr_t next;
pud = stage2_pud_offset(kvm, p4d, addr);
do {
next = stage2_pud_addr_end(kvm, addr, end);
if (!stage2_pud_none(kvm, *pud)) {
if (stage2_pud_huge(kvm, *pud))
kvm_flush_dcache_pud(*pud);
else
stage2_flush_pmds(mmu, pud, addr, next);
}
} while (pud++, addr = next, addr != end);
}
static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
p4d_t *p4d;
phys_addr_t next;
p4d = stage2_p4d_offset(kvm, pgd, addr);
do {
next = stage2_p4d_addr_end(kvm, addr, end);
if (!stage2_p4d_none(kvm, *p4d))
stage2_flush_puds(mmu, p4d, addr, next);
} while (p4d++, addr = next, addr != end);
}
static void stage2_flush_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
phys_addr_t next;
pgd_t *pgd;
pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
do {
next = stage2_pgd_addr_end(kvm, addr, end);
if (!stage2_pgd_none(kvm, *pgd))
stage2_flush_p4ds(mmu, pgd, addr, next);
if (next != end)
cond_resched_lock(&kvm->mmu_lock);
} while (pgd++, addr = next, addr != end);
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}
/**
......@@ -489,338 +174,28 @@ static void stage2_flush_vm(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
}
static void clear_hyp_pgd_entry(pgd_t *pgd)
{
p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
pgd_clear(pgd);
p4d_free(NULL, p4d_table);
put_page(virt_to_page(pgd));
}
static void clear_hyp_p4d_entry(p4d_t *p4d)
{
pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
VM_BUG_ON(p4d_huge(*p4d));
p4d_clear(p4d);
pud_free(NULL, pud_table);
put_page(virt_to_page(p4d));
}
static void clear_hyp_pud_entry(pud_t *pud)
{
pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
VM_BUG_ON(pud_huge(*pud));
pud_clear(pud);
pmd_free(NULL, pmd_table);
put_page(virt_to_page(pud));
}
static void clear_hyp_pmd_entry(pmd_t *pmd)
{
pte_t *pte_table = pte_offset_kernel(pmd, 0);
VM_BUG_ON(pmd_thp_or_huge(*pmd));
pmd_clear(pmd);
pte_free_kernel(NULL, pte_table);
put_page(virt_to_page(pmd));
}
static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
pte_t *pte, *start_pte;
start_pte = pte = pte_offset_kernel(pmd, addr);
do {
if (!pte_none(*pte)) {
kvm_set_pte(pte, __pte(0));
put_page(virt_to_page(pte));
}
} while (pte++, addr += PAGE_SIZE, addr != end);
if (hyp_pte_table_empty(start_pte))
clear_hyp_pmd_entry(pmd);
}
static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
{
phys_addr_t next;
pmd_t *pmd, *start_pmd;
start_pmd = pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
/* Hyp doesn't use huge pmds */
if (!pmd_none(*pmd))
unmap_hyp_ptes(pmd, addr, next);
} while (pmd++, addr = next, addr != end);
if (hyp_pmd_table_empty(start_pmd))
clear_hyp_pud_entry(pud);
}
static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
{
phys_addr_t next;
pud_t *pud, *start_pud;
start_pud = pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
/* Hyp doesn't use huge puds */
if (!pud_none(*pud))
unmap_hyp_pmds(pud, addr, next);
} while (pud++, addr = next, addr != end);
if (hyp_pud_table_empty(start_pud))
clear_hyp_p4d_entry(p4d);
}
static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
{
phys_addr_t next;
p4d_t *p4d, *start_p4d;
start_p4d = p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
/* Hyp doesn't use huge p4ds */
if (!p4d_none(*p4d))
unmap_hyp_puds(p4d, addr, next);
} while (p4d++, addr = next, addr != end);
if (hyp_p4d_table_empty(start_p4d))
clear_hyp_pgd_entry(pgd);
}
static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
{
return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
}
static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
phys_addr_t start, u64 size)
{
pgd_t *pgd;
phys_addr_t addr = start, end = start + size;
phys_addr_t next;
/*
* We don't unmap anything from HYP, except at the hyp tear down.
* Hence, we don't have to invalidate the TLBs here.
*/
pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
do {
next = pgd_addr_end(addr, end);
if (!pgd_none(*pgd))
unmap_hyp_p4ds(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
{
__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
}
static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
{
__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
}
/**
* free_hyp_pgds - free Hyp-mode page tables
*
* Assumes hyp_pgd is a page table used strictly in Hyp-mode and
* therefore contains either mappings in the kernel memory area (above
* PAGE_OFFSET), or device mappings in the idmap range.
*
* boot_hyp_pgd should only map the idmap range, and is only used in
* the extended idmap case.
*/
void free_hyp_pgds(void)
{
pgd_t *id_pgd;
mutex_lock(&kvm_hyp_pgd_mutex);
id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
if (id_pgd) {
/* In case we never called hyp_mmu_init() */
if (!io_map_base)
io_map_base = hyp_idmap_start;
unmap_hyp_idmap_range(id_pgd, io_map_base,
hyp_idmap_start + PAGE_SIZE - io_map_base);
if (hyp_pgtable) {
kvm_pgtable_hyp_destroy(hyp_pgtable);
kfree(hyp_pgtable);
}
if (boot_hyp_pgd) {
free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
boot_hyp_pgd = NULL;
}
if (hyp_pgd) {
unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
(uintptr_t)high_memory - PAGE_OFFSET);
free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
hyp_pgd = NULL;
}
if (merged_hyp_pgd) {
clear_page(merged_hyp_pgd);
free_page((unsigned long)merged_hyp_pgd);
merged_hyp_pgd = NULL;
}
mutex_unlock(&kvm_hyp_pgd_mutex);
}
static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
unsigned long end, unsigned long pfn,
pgprot_t prot)
{
pte_t *pte;
unsigned long addr;
addr = start;
do {
pte = pte_offset_kernel(pmd, addr);
kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
get_page(virt_to_page(pte));
pfn++;
} while (addr += PAGE_SIZE, addr != end);
}
static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
unsigned long end, unsigned long pfn,
pgprot_t prot)
{
pmd_t *pmd;
pte_t *pte;
unsigned long addr, next;
addr = start;
do {
pmd = pmd_offset(pud, addr);
BUG_ON(pmd_sect(*pmd));
if (pmd_none(*pmd)) {
pte = pte_alloc_one_kernel(NULL);
if (!pte) {
kvm_err("Cannot allocate Hyp pte\n");
return -ENOMEM;
}
kvm_pmd_populate(pmd, pte);
get_page(virt_to_page(pmd));
}
next = pmd_addr_end(addr, end);
create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
pfn += (next - addr) >> PAGE_SHIFT;
} while (addr = next, addr != end);
return 0;
}
static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
unsigned long end, unsigned long pfn,
pgprot_t prot)
{
pud_t *pud;
pmd_t *pmd;
unsigned long addr, next;
int ret;
addr = start;
do {
pud = pud_offset(p4d, addr);
if (pud_none_or_clear_bad(pud)) {
pmd = pmd_alloc_one(NULL, addr);
if (!pmd) {
kvm_err("Cannot allocate Hyp pmd\n");
return -ENOMEM;
}
kvm_pud_populate(pud, pmd);
get_page(virt_to_page(pud));
}
next = pud_addr_end(addr, end);
ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
if (ret)
return ret;
pfn += (next - addr) >> PAGE_SHIFT;
} while (addr = next, addr != end);
return 0;
}
static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
unsigned long end, unsigned long pfn,
pgprot_t prot)
static int __create_hyp_mappings(unsigned long start, unsigned long size,
unsigned long phys, enum kvm_pgtable_prot prot)
{
p4d_t *p4d;
pud_t *pud;
unsigned long addr, next;
int ret;
addr = start;
do {
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
pud = pud_alloc_one(NULL, addr);
if (!pud) {
kvm_err("Cannot allocate Hyp pud\n");
return -ENOMEM;
}
kvm_p4d_populate(p4d, pud);
get_page(virt_to_page(p4d));
}
next = p4d_addr_end(addr, end);
ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
if (ret)
return ret;
pfn += (next - addr) >> PAGE_SHIFT;
} while (addr = next, addr != end);
return 0;
}
static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
unsigned long start, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pgd_t *pgd;
p4d_t *p4d;
unsigned long addr, next;
int err = 0;
int err;
mutex_lock(&kvm_hyp_pgd_mutex);
addr = start & PAGE_MASK;
end = PAGE_ALIGN(end);
do {
pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
if (pgd_none(*pgd)) {
p4d = p4d_alloc_one(NULL, addr);
if (!p4d) {
kvm_err("Cannot allocate Hyp p4d\n");
err = -ENOMEM;
goto out;
}
kvm_pgd_populate(pgd, p4d);
get_page(virt_to_page(pgd));
}
next = pgd_addr_end(addr, end);
err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
if (err)
goto out;
pfn += (next - addr) >> PAGE_SHIFT;
} while (addr = next, addr != end);
out:
err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
mutex_unlock(&kvm_hyp_pgd_mutex);
return err;
}
......@@ -845,7 +220,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
* physical pages.
*/
int create_hyp_mappings(void *from, void *to, pgprot_t prot)
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
phys_addr_t phys_addr;
unsigned long virt_addr;
......@@ -862,9 +237,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
int err;
phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
virt_addr, virt_addr + PAGE_SIZE,
__phys_to_pfn(phys_addr),
err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
prot);
if (err)
return err;
......@@ -874,9 +247,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
}
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
unsigned long *haddr, pgprot_t prot)
unsigned long *haddr,
enum kvm_pgtable_prot prot)
{
pgd_t *pgd = hyp_pgd;
unsigned long base;
int ret = 0;
......@@ -908,17 +281,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
if (ret)
goto out;
if (__kvm_cpu_uses_extended_idmap())
pgd = boot_hyp_pgd;
ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
base, base + size,
__phys_to_pfn(phys_addr), prot);
ret = __create_hyp_mappings(base, size, phys_addr, prot);
if (ret)
goto out;
*haddr = base + offset_in_page(phys_addr);
out:
return ret;
}
......@@ -985,480 +352,139 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
}
/**
* kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
*
* Allocates only the stage-2 HW PGD level table(s) of size defined by
* stage2_pgd_size(mmu->kvm).
*
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
phys_addr_t pgd_phys;
pgd_t *pgd;
int cpu;
if (mmu->pgd != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
/* Allocate the HW PGD, making sure that each page gets its own refcount */
pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
if (!pgd)
return -ENOMEM;
pgd_phys = virt_to_phys(pgd);
if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
return -EINVAL;
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
free_pages_exact(pgd, stage2_pgd_size(kvm));
return -ENOMEM;
}
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
mmu->kvm = kvm;
mmu->pgd = pgd;
mmu->pgd_phys = pgd_phys;
mmu->vmid.vmid_gen = 0;
return 0;
}
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
hva_t hva = memslot->userspace_addr;
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t size = PAGE_SIZE * memslot->npages;
hva_t reg_end = hva + size;
/*
* A memory region could potentially cover multiple VMAs, and any holes
* between them, so iterate over all of them to find out if we should
* unmap any of them.
*
* +--------------------------------------------+
* +---------------+----------------+ +----------------+
* | : VMA 1 | VMA 2 | | VMA 3 : |
* +---------------+----------------+ +----------------+
* | memory region |
* +--------------------------------------------+
*/
do {
struct vm_area_struct *vma = find_vma(current->mm, hva);
hva_t vm_start, vm_end;
if (!vma || vma->vm_start >= reg_end)
break;
/*
* Take the intersection of this VMA with the memory region
*/
vm_start = max(hva, vma->vm_start);
vm_end = min(reg_end, vma->vm_end);
if (!(vma->vm_flags & VM_PFNMAP)) {
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
}
hva = vm_end;
} while (hva < reg_end);
}
/**
* stage2_unmap_vm - Unmap Stage-2 RAM mappings
* @kvm: The struct kvm pointer
*
* Go through the memregions and unmap any regular RAM
* backing memory already mapped to the VM.
*/
void stage2_unmap_vm(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int idx;
idx = srcu_read_lock(&kvm->srcu);
mmap_read_lock(current->mm);
spin_lock(&kvm->mmu_lock);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots)
stage2_unmap_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
mmap_read_unlock(current->mm);
srcu_read_unlock(&kvm->srcu, idx);
}
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
struct kvm *kvm = mmu->kvm;
void *pgd = NULL;
spin_lock(&kvm->mmu_lock);
if (mmu->pgd) {
unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
pgd = READ_ONCE(mmu->pgd);
mmu->pgd = NULL;
}
spin_unlock(&kvm->mmu_lock);
/* Free the HW pgd, one page at a time */
if (pgd) {
free_pages_exact(pgd, stage2_pgd_size(kvm));
free_percpu(mmu->last_vcpu_ran);
}
}
static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
phys_addr_t addr)
{
struct kvm *kvm = mmu->kvm;
pgd_t *pgd;
p4d_t *p4d;
pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
if (stage2_pgd_none(kvm, *pgd)) {
if (!cache)
return NULL;
p4d = kvm_mmu_memory_cache_alloc(cache);
stage2_pgd_populate(kvm, pgd, p4d);
get_page(virt_to_page(pgd));
}
return stage2_p4d_offset(kvm, pgd, addr);
}
static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
phys_addr_t addr)
{
struct kvm *kvm = mmu->kvm;
p4d_t *p4d;
pud_t *pud;
p4d = stage2_get_p4d(mmu, cache, addr);
if (stage2_p4d_none(kvm, *p4d)) {
if (!cache)
return NULL;
pud = kvm_mmu_memory_cache_alloc(cache);
stage2_p4d_populate(kvm, p4d, pud);
get_page(virt_to_page(p4d));
}
return stage2_pud_offset(kvm, p4d, addr);
}
static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
phys_addr_t addr)
{
struct kvm *kvm = mmu->kvm;
pud_t *pud;
pmd_t *pmd;
pud = stage2_get_pud(mmu, cache, addr);
if (!pud || stage2_pud_huge(kvm, *pud))
return NULL;
if (stage2_pud_none(kvm, *pud)) {
if (!cache)
return NULL;
pmd = kvm_mmu_memory_cache_alloc(cache);
stage2_pud_populate(kvm, pud, pmd);
get_page(virt_to_page(pud));
}
return stage2_pmd_offset(kvm, pud, addr);
}
static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
struct kvm_mmu_memory_cache *cache,
phys_addr_t addr, const pmd_t *new_pmd)
{
pmd_t *pmd, old_pmd;
retry:
pmd = stage2_get_pmd(mmu, cache, addr);
VM_BUG_ON(!pmd);
old_pmd = *pmd;
/*
* Multiple vcpus faulting on the same PMD entry, can
* lead to them sequentially updating the PMD with the
* same value. Following the break-before-make
* (pmd_clear() followed by tlb_flush()) process can
* hinder forward progress due to refaults generated
* on missing translations.
*
* Skip updating the page table if the entry is
* unchanged.
*/
if (pmd_val(old_pmd) == pmd_val(*new_pmd))
return 0;
if (pmd_present(old_pmd)) {
/*
* If we already have PTE level mapping for this block,
* we must unmap it to avoid inconsistent TLB state and
* leaking the table page. We could end up in this situation
* if the memory slot was marked for dirty logging and was
* reverted, leaving PTE level mappings for the pages accessed
* during the period. So, unmap the PTE level mapping for this
* block and retry, as we could have released the upper level
* table in the process.
*
* Normal THP split/merge follows mmu_notifier callbacks and do
* get handled accordingly.
*/
if (!pmd_thp_or_huge(old_pmd)) {
unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
goto retry;
}
/*
* Mapping in huge pages should only happen through a
* fault. If a page is merged into a transparent huge
* page, the individual subpages of that huge page
* should be unmapped through MMU notifiers before we
* get here.
*
* Merging of CompoundPages is not supported; they
* should become splitting first, unmapped, merged,
* and mapped back in on-demand.
*/
WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
} else {
get_page(virt_to_page(pmd));
}
kvm_set_pmd(pmd, *new_pmd);
return 0;
}
static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
struct kvm_mmu_memory_cache *cache,
phys_addr_t addr, const pud_t *new_pudp)
{
struct kvm *kvm = mmu->kvm;
pud_t *pudp, old_pud;
retry:
pudp = stage2_get_pud(mmu, cache, addr);
VM_BUG_ON(!pudp);
old_pud = *pudp;
/*
* A large number of vcpus faulting on the same stage 2 entry,
* can lead to a refault due to the stage2_pud_clear()/tlb_flush().
* Skip updating the page tables if there is no change.
*/
if (pud_val(old_pud) == pud_val(*new_pudp))
return 0;
if (stage2_pud_present(kvm, old_pud)) {
/*
* If we already have table level mapping for this block, unmap
* the range for this block and retry.
*/
if (!stage2_pud_huge(kvm, old_pud)) {
unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
goto retry;
}
WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
stage2_pud_clear(kvm, pudp);
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
} else {
get_page(virt_to_page(pudp));
}
kvm_set_pud(pudp, *new_pudp);
return 0;
}
/*
* stage2_get_leaf_entry - walk the stage2 VM page tables and return
* true if a valid and present leaf-entry is found. A pointer to the
* leaf-entry is returned in the appropriate level variable - pudpp,
* pmdpp, ptepp.
* kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
*
* Allocates only the stage-2 HW PGD level table(s).
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
struct kvm *kvm = mmu->kvm;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
int cpu, err;
struct kvm_pgtable *pgt;
*pudpp = NULL;
*pmdpp = NULL;
*ptepp = NULL;
pudp = stage2_get_pud(mmu, NULL, addr);
if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
return false;
if (stage2_pud_huge(kvm, *pudp)) {
*pudpp = pudp;
return true;
if (mmu->pgt != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
pmdp = stage2_pmd_offset(kvm, pudp, addr);
if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
return false;
if (pmd_thp_or_huge(*pmdp)) {
*pmdpp = pmdp;
return true;
}
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
if (!pgt)
return -ENOMEM;
ptep = pte_offset_kernel(pmdp, addr);
if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
return false;
err = kvm_pgtable_stage2_init(pgt, kvm);
if (err)
goto out_free_pgtable;
*ptepp = ptep;
return true;
}
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
err = -ENOMEM;
goto out_destroy_pgtable;
}
static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
{
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
bool found;
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
if (!found)
return false;
mmu->kvm = kvm;
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
mmu->vmid.vmid_gen = 0;
return 0;
if (pudp)
return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
else if (pmdp)
return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
else
return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
out_destroy_pgtable:
kvm_pgtable_stage2_destroy(pgt);
out_free_pgtable:
kfree(pgt);
return err;
}
static int stage2_set_pte(struct kvm_s2_mmu *mmu,
struct kvm_mmu_memory_cache *cache,
phys_addr_t addr, const pte_t *new_pte,
unsigned long flags)
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
struct kvm *kvm = mmu->kvm;
pud_t *pud;
pmd_t *pmd;
pte_t *pte, old_pte;
bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
VM_BUG_ON(logging_active && !cache);
hva_t hva = memslot->userspace_addr;
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t size = PAGE_SIZE * memslot->npages;
hva_t reg_end = hva + size;
/* Create stage-2 page table mapping - Levels 0 and 1 */
pud = stage2_get_pud(mmu, cache, addr);
if (!pud) {
/*
* Ignore calls from kvm_set_spte_hva for unallocated
* address ranges.
* A memory region could potentially cover multiple VMAs, and any holes
* between them, so iterate over all of them to find out if we should
* unmap any of them.
*
* +--------------------------------------------+
* +---------------+----------------+ +----------------+
* | : VMA 1 | VMA 2 | | VMA 3 : |
* +---------------+----------------+ +----------------+
* | memory region |
* +--------------------------------------------+
*/
return 0;
}
do {
struct vm_area_struct *vma = find_vma(current->mm, hva);
hva_t vm_start, vm_end;
/*
* While dirty page logging - dissolve huge PUD, then continue
* on to allocate page.
*/
if (logging_active)
stage2_dissolve_pud(mmu, addr, pud);
if (stage2_pud_none(kvm, *pud)) {
if (!cache)
return 0; /* ignore calls from kvm_set_spte_hva */
pmd = kvm_mmu_memory_cache_alloc(cache);
stage2_pud_populate(kvm, pud, pmd);
get_page(virt_to_page(pud));
}
if (!vma || vma->vm_start >= reg_end)
break;
pmd = stage2_pmd_offset(kvm, pud, addr);
if (!pmd) {
/*
* Ignore calls from kvm_set_spte_hva for unallocated
* address ranges.
* Take the intersection of this VMA with the memory region
*/
return 0;
}
vm_start = max(hva, vma->vm_start);
vm_end = min(reg_end, vma->vm_end);
/*
* While dirty page logging - dissolve huge PMD, then continue on to
* allocate page.
*/
if (logging_active)
stage2_dissolve_pmd(mmu, addr, pmd);
/* Create stage-2 page mappings - Level 2 */
if (pmd_none(*pmd)) {
if (!cache)
return 0; /* ignore calls from kvm_set_spte_hva */
pte = kvm_mmu_memory_cache_alloc(cache);
kvm_pmd_populate(pmd, pte);
get_page(virt_to_page(pmd));
if (!(vma->vm_flags & VM_PFNMAP)) {
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
}
hva = vm_end;
} while (hva < reg_end);
}
pte = pte_offset_kernel(pmd, addr);
if (iomap && pte_present(*pte))
return -EFAULT;
/**
* stage2_unmap_vm - Unmap Stage-2 RAM mappings
* @kvm: The struct kvm pointer
*
* Go through the memregions and unmap any regular RAM
* backing memory already mapped to the VM.
*/
void stage2_unmap_vm(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int idx;
/* Create 2nd stage page table mapping - Level 3 */
old_pte = *pte;
if (pte_present(old_pte)) {
/* Skip page table update if there is no change */
if (pte_val(old_pte) == pte_val(*new_pte))
return 0;
idx = srcu_read_lock(&kvm->srcu);
mmap_read_lock(current->mm);
spin_lock(&kvm->mmu_lock);
kvm_set_pte(pte, __pte(0));
kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
} else {
get_page(virt_to_page(pte));
}
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots)
stage2_unmap_memslot(kvm, memslot);
kvm_set_pte(pte, *new_pte);
return 0;
spin_unlock(&kvm->mmu_lock);
mmap_read_unlock(current->mm);
srcu_read_unlock(&kvm->srcu, idx);
}
#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static int stage2_ptep_test_and_clear_young(pte_t *pte)
{
if (pte_young(*pte)) {
*pte = pte_mkold(*pte);
return 1;
}
return 0;
}
#else
static int stage2_ptep_test_and_clear_young(pte_t *pte)
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
return __ptep_test_and_clear_young(pte);
}
#endif
struct kvm *kvm = mmu->kvm;
struct kvm_pgtable *pgt = NULL;
static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
{
return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}
spin_lock(&kvm->mmu_lock);
pgt = mmu->pgt;
if (pgt) {
mmu->pgd_phys = 0;
mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
spin_unlock(&kvm->mmu_lock);
static int stage2_pudp_test_and_clear_young(pud_t *pud)
{
return stage2_ptep_test_and_clear_young((pte_t *)pud);
if (pgt) {
kvm_pgtable_stage2_destroy(pgt);
kfree(pgt);
}
}
/**
......@@ -1468,169 +494,52 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud)
* @guest_ipa: The IPA at which to insert the mapping
* @pa: The physical address of the device
* @size: The size of the mapping
* @writable: Whether or not to create a writable mapping
*/
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable)
{
phys_addr_t addr, end;
phys_addr_t addr;
int ret = 0;
unsigned long pfn;
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
KVM_PGTABLE_PROT_R |
(writable ? KVM_PGTABLE_PROT_W : 0);
end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
pfn = __phys_to_pfn(pa);
for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
if (writable)
pte = kvm_s2pte_mkwrite(pte);
size += offset_in_page(guest_ipa);
guest_ipa &= PAGE_MASK;
for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
goto out;
break;
spin_lock(&kvm->mmu_lock);
ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
KVM_S2PTE_FLAG_IS_IOMAP);
ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
&cache);
spin_unlock(&kvm->mmu_lock);
if (ret)
goto out;
break;
pfn++;
pa += PAGE_SIZE;
}
out:
kvm_mmu_free_memory_cache(&cache);
return ret;
}
/**
* stage2_wp_ptes - write protect PMD range
* @pmd: pointer to pmd entry
* @addr: range start address
* @end: range end address
*/
static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
pte_t *pte;
pte = pte_offset_kernel(pmd, addr);
do {
if (!pte_none(*pte)) {
if (!kvm_s2pte_readonly(pte))
kvm_set_s2pte_readonly(pte);
}
} while (pte++, addr += PAGE_SIZE, addr != end);
}
/**
* stage2_wp_pmds - write protect PUD range
* kvm: kvm instance for the VM
* @pud: pointer to pud entry
* @addr: range start address
* @end: range end address
*/
static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
pmd_t *pmd;
phys_addr_t next;
pmd = stage2_pmd_offset(kvm, pud, addr);
do {
next = stage2_pmd_addr_end(kvm, addr, end);
if (!pmd_none(*pmd)) {
if (pmd_thp_or_huge(*pmd)) {
if (!kvm_s2pmd_readonly(pmd))
kvm_set_s2pmd_readonly(pmd);
} else {
stage2_wp_ptes(pmd, addr, next);
}
}
} while (pmd++, addr = next, addr != end);
}
/**
* stage2_wp_puds - write protect P4D range
* @p4d: pointer to p4d entry
* @addr: range start address
* @end: range end address
*/
static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
pud_t *pud;
phys_addr_t next;
pud = stage2_pud_offset(kvm, p4d, addr);
do {
next = stage2_pud_addr_end(kvm, addr, end);
if (!stage2_pud_none(kvm, *pud)) {
if (stage2_pud_huge(kvm, *pud)) {
if (!kvm_s2pud_readonly(pud))
kvm_set_s2pud_readonly(pud);
} else {
stage2_wp_pmds(mmu, pud, addr, next);
}
}
} while (pud++, addr = next, addr != end);
}
/**
* stage2_wp_p4ds - write protect PGD range
* @pgd: pointer to pgd entry
* @addr: range start address
* @end: range end address
*/
static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
p4d_t *p4d;
phys_addr_t next;
p4d = stage2_p4d_offset(kvm, pgd, addr);
do {
next = stage2_p4d_addr_end(kvm, addr, end);
if (!stage2_p4d_none(kvm, *p4d))
stage2_wp_puds(mmu, p4d, addr, next);
} while (p4d++, addr = next, addr != end);
}
/**
* stage2_wp_range() - write protect stage2 memory region range
* @kvm: The KVM pointer
* @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
pgd_t *pgd;
phys_addr_t next;
pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
do {
/*
* Release kvm_mmu_lock periodically if the memory region is
* large. Otherwise, we may see kernel panics with
* CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
* CONFIG_LOCKDEP. Additionally, holding the lock too long
* will also starve other vCPUs. We have to also make sure
* that the page tables are not freed while we released
* the lock.
*/
cond_resched_lock(&kvm->mmu_lock);
if (!READ_ONCE(mmu->pgd))
break;
next = stage2_pgd_addr_end(kvm, addr, end);
if (stage2_pgd_present(kvm, *pgd))
stage2_wp_p4ds(mmu, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
......@@ -1833,20 +742,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
{
int ret;
int ret = 0;
bool write_fault, writable, force_pte = false;
bool exec_fault, needs_exec;
bool exec_fault;
bool device = false;
unsigned long mmu_seq;
gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
gfn_t gfn;
kvm_pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
unsigned long vma_pagesize, flags = 0;
struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
unsigned long vma_pagesize;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
......@@ -1871,31 +781,41 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else
vma_shift = PAGE_SHIFT;
vma_pagesize = 1ULL << vma_shift;
if (logging_active ||
(vma->vm_flags & VM_PFNMAP) ||
!fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
(vma->vm_flags & VM_PFNMAP)) {
force_pte = true;
vma_pagesize = PAGE_SIZE;
vma_shift = PAGE_SHIFT;
}
/*
* The stage2 has a minimum of 2 level table (For arm64 see
* kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
* use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
* As for PUD huge maps, we must make sure that we have at least
* 3 levels, i.e, PMD is not folded.
*/
if (vma_pagesize == PMD_SIZE ||
(vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
if (vma_shift == PUD_SHIFT &&
!fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
vma_shift = PMD_SHIFT;
if (vma_shift == PMD_SHIFT &&
!fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
force_pte = true;
vma_shift = PAGE_SHIFT;
}
vma_pagesize = 1UL << vma_shift;
if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
fault_ipa &= ~(vma_pagesize - 1);
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
/* We need minimum second+third level pages */
ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
/*
* Permission faults just need to update the existing leaf entry,
* and so normally don't require allocations from the memcache. The
* only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table.
*/
if (fault_status != FSC_PERM || (logging_active && write_fault)) {
ret = kvm_mmu_topup_memory_cache(memcache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
return ret;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
......@@ -1918,28 +838,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (kvm_is_device_pfn(pfn)) {
mem_type = PAGE_S2_DEVICE;
flags |= KVM_S2PTE_FLAG_IS_IOMAP;
} else if (logging_active) {
/*
* Faults on pages in a memslot with logging enabled
* should not be mapped with huge pages (it introduces churn
* and performance degradation), so force a pte mapping.
*/
flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
device = true;
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
* fault.
*/
if (!write_fault)
writable = false;
}
if (exec_fault && is_iomap(flags))
if (exec_fault && device)
return -ENOEXEC;
spin_lock(&kvm->mmu_lock);
pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
......@@ -1950,67 +862,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
if (writable)
if (writable) {
prot |= KVM_PGTABLE_PROT_W;
kvm_set_pfn_dirty(pfn);
mark_page_dirty(kvm, gfn);
}
if (fault_status != FSC_PERM && !is_iomap(flags))
if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
if (exec_fault)
if (exec_fault) {
prot |= KVM_PGTABLE_PROT_X;
invalidate_icache_guest_page(pfn, vma_pagesize);
/*
* If we took an execution fault we have made the
* icache/dcache coherent above and should now let the s2
* mapping be executable.
*
* Write faults (!exec_fault && FSC_PERM) are orthogonal to
* execute permissions, and we preserve whatever we have.
*/
needs_exec = exec_fault ||
(fault_status == FSC_PERM &&
stage2_is_exec(mmu, fault_ipa, vma_pagesize));
/*
* If PUD_SIZE == PMD_SIZE, there is no real PUD level, and
* all we have is a 2-level page table. Trying to map a PUD in
* this case would be fatally wrong.
*/
if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) {
pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
new_pud = kvm_pud_mkhuge(new_pud);
if (writable)
new_pud = kvm_s2pud_mkwrite(new_pud);
if (needs_exec)
new_pud = kvm_s2pud_mkexec(new_pud);
ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
} else if (vma_pagesize == PMD_SIZE) {
pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
new_pmd = kvm_pmd_mkhuge(new_pmd);
if (writable)
new_pmd = kvm_s2pmd_mkwrite(new_pmd);
if (needs_exec)
new_pmd = kvm_s2pmd_mkexec(new_pmd);
ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
} else {
pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
if (writable) {
new_pte = kvm_s2pte_mkwrite(new_pte);
mark_page_dirty(kvm, gfn);
}
if (needs_exec)
new_pte = kvm_s2pte_mkexec(new_pte);
if (device)
prot |= KVM_PGTABLE_PROT_DEVICE;
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
if (fault_status == FSC_PERM && !(logging_active && writable)) {
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else {
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
memcache);
}
out_unlock:
......@@ -2020,46 +896,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return ret;
}
/*
* Resolve the access fault by making the page young again.
* Note that because the faulting entry is guaranteed not to be
* cached in the TLB, we don't need to invalidate anything.
* Only the HW Access Flag updates are supported for Stage 2 (no DBM),
* so there is no need for atomic (pte|pmd)_mkyoung operations.
*/
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
kvm_pfn_t pfn;
bool pfn_valid = false;
pte_t pte;
kvm_pte_t kpte;
struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
spin_lock(&vcpu->kvm->mmu_lock);
if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
goto out;
if (pud) { /* HugeTLB */
*pud = kvm_s2pud_mkyoung(*pud);
pfn = kvm_pud_pfn(*pud);
pfn_valid = true;
} else if (pmd) { /* THP, HugeTLB */
*pmd = pmd_mkyoung(*pmd);
pfn = pmd_pfn(*pmd);
pfn_valid = true;
} else {
*pte = pte_mkyoung(*pte); /* Just a page... */
pfn = pte_pfn(*pte);
pfn_valid = true;
}
out:
mmu = vcpu->arch.hw_mmu;
kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
spin_unlock(&vcpu->kvm->mmu_lock);
if (pfn_valid)
kvm_set_pfn_accessed(pfn);
pte = __pte(kpte);
if (pte_valid(pte))
kvm_set_pfn_accessed(pte_pfn(pte));
}
/**
......@@ -2230,7 +1083,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat
int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end, unsigned flags)
{
if (!kvm->arch.mmu.pgd)
if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_unmap_hva_range(start, end);
......@@ -2240,28 +1093,27 @@ int kvm_unmap_hva_range(struct kvm *kvm,
static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
pte_t *pte = (pte_t *)data;
kvm_pfn_t *pfn = (kvm_pfn_t *)data;
WARN_ON(size != PAGE_SIZE);
/*
* We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
* flag clear because MMU notifiers will have unmapped a huge PMD before
* calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
* therefore stage2_set_pte() never needs to clear out a huge PMD
* through this calling path.
* The MMU notifiers will have unmapped a huge PMD before calling
* ->change_pte() (which in turn calls kvm_set_spte_hva()) and
* therefore we never need to clear out a huge PMD through this
* calling path and a memcache is not required.
*/
stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
__pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
unsigned long end = hva + PAGE_SIZE;
kvm_pfn_t pfn = pte_pfn(pte);
pte_t stage2_pte;
if (!kvm->arch.mmu.pgd)
if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_set_spte_hva(hva);
......@@ -2271,51 +1123,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
return 0;
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pte_t pte;
kvm_pte_t kpte;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
return 0;
if (pud)
return stage2_pudp_test_and_clear_young(pud);
else if (pmd)
return stage2_pmdp_test_and_clear_young(pmd);
else
return stage2_ptep_test_and_clear_young(pte);
kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
pte = __pte(kpte);
return pte_valid(pte) && pte_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
return 0;
if (pud)
return kvm_s2pud_young(*pud);
else if (pmd)
return pmd_young(*pmd);
else
return pte_young(*pte);
return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
if (!kvm->arch.mmu.pgd)
if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_age_hva(start, end);
return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
......@@ -2323,24 +1154,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
if (!kvm->arch.mmu.pgd)
if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_test_age_hva(hva);
return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
kvm_test_age_hva_handler, NULL);
}
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}
phys_addr_t kvm_mmu_get_httbr(void)
{
if (__kvm_cpu_uses_extended_idmap())
return virt_to_phys(merged_hyp_pgd);
else
return virt_to_phys(hyp_pgd);
return __pa(hyp_pgtable->pgd);
}
phys_addr_t kvm_get_idmap_vector(void)
......@@ -2348,14 +1171,10 @@ phys_addr_t kvm_get_idmap_vector(void)
return hyp_idmap_vector;
}
static int kvm_map_idmap_text(pgd_t *pgd)
static int kvm_map_idmap_text(void)
{
int err;
/* Create the idmap in the boot page tables */
err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
hyp_idmap_start, hyp_idmap_end,
__phys_to_pfn(hyp_idmap_start),
unsigned long size = hyp_idmap_end - hyp_idmap_start;
int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
PAGE_HYP_EXEC);
if (err)
kvm_err("Failed to idmap %lx-%lx\n",
......@@ -2367,6 +1186,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
int kvm_mmu_init(void)
{
int err;
u32 hyp_va_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
......@@ -2380,6 +1200,8 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
kern_hyp_va(PAGE_OFFSET),
......@@ -2397,43 +1219,30 @@ int kvm_mmu_init(void)
goto out;
}
hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
if (!hyp_pgd) {
kvm_err("Hyp mode PGD not allocated\n");
err = -ENOMEM;
goto out;
}
if (__kvm_cpu_uses_extended_idmap()) {
boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
hyp_pgd_order);
if (!boot_hyp_pgd) {
kvm_err("Hyp boot PGD not allocated\n");
hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
if (!hyp_pgtable) {
kvm_err("Hyp mode page-table not allocated\n");
err = -ENOMEM;
goto out;
}
err = kvm_map_idmap_text(boot_hyp_pgd);
err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
if (err)
goto out;
goto out_free_pgtable;
merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
if (!merged_hyp_pgd) {
kvm_err("Failed to allocate extra HYP pgd\n");
goto out;
}
__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
hyp_idmap_start);
} else {
err = kvm_map_idmap_text(hyp_pgd);
err = kvm_map_idmap_text();
if (err)
goto out;
}
goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
return 0;
out_destroy_pgtable:
kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
kfree(hyp_pgtable);
hyp_pgtable = NULL;
out:
free_hyp_pgds();
return err;
}
......@@ -2537,7 +1346,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
if (ret)
unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
else
else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
stage2_flush_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
out:
......
......@@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
static u32 kvm_pmu_event_mask(struct kvm *kvm)
{
switch (kvm->arch.pmuver) {
case 1: /* ARMv8.0 */
return GENMASK(9, 0);
case 4: /* ARMv8.1 */
case 5: /* ARMv8.4 */
case 6: /* ARMv8.5 */
return GENMASK(15, 0);
default: /* Shouldn't be here, just for sanity */
WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
return 0;
}
}
/**
* kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
* @vcpu: The vcpu pointer
......@@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
return false;
reg = PMEVTYPER0_EL0 + select_idx;
eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
}
......@@ -495,7 +510,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
/* PMSWINC only applies to ... SW_INC! */
type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
type &= ARMV8_PMU_EVTYPE_EVENT;
type &= kvm_pmu_event_mask(vcpu->kvm);
if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
continue;
......@@ -578,11 +593,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
data = __vcpu_sys_reg(vcpu, reg);
kvm_pmu_stop_counter(vcpu, pmc);
eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
else
eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
/* Software increment event does't need to be backed by a perf event */
if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
pmc->idx != ARMV8_PMU_CYCLE_IDX)
/* Software increment event doesn't need to be backed by a perf event */
if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
return;
/*
* If we have a filter in place and that the event isn't allowed, do
* not install a perf event either.
*/
if (vcpu->kvm->arch.pmu_filter &&
!test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
memset(&attr, 0, sizeof(struct perf_event_attr));
......@@ -594,8 +619,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
attr.config = eventsel;
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
......@@ -679,17 +703,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx)
{
u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
u64 reg, mask;
mask = ARMV8_PMU_EVTYPE_MASK;
mask &= ~ARMV8_PMU_EVTYPE_EVENT;
mask |= kvm_pmu_event_mask(vcpu->kvm);
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
__vcpu_sys_reg(vcpu, reg) = event_type;
__vcpu_sys_reg(vcpu, reg) = data & mask;
kvm_pmu_update_pmc_chained(vcpu, select_idx);
kvm_pmu_create_perf_event(vcpu, select_idx);
}
static int kvm_pmu_probe_pmuver(void)
{
struct perf_event_attr attr = { };
struct perf_event *event;
struct arm_pmu *pmu;
int pmuver = 0xf;
/*
* Create a dummy event that only counts user cycles. As we'll never
* leave this function with the event being live, it will never
* count anything. But it allows us to probe some of the PMU
* details. Yes, this is terrible.
*/
attr.type = PERF_TYPE_RAW;
attr.size = sizeof(attr);
attr.pinned = 1;
attr.disabled = 0;
attr.exclude_user = 0;
attr.exclude_kernel = 1;
attr.exclude_hv = 1;
attr.exclude_host = 1;
attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
attr.sample_period = GENMASK(63, 0);
event = perf_event_create_kernel_counter(&attr, -1, current,
kvm_pmu_perf_overflow, &attr);
if (IS_ERR(event)) {
pr_err_once("kvm: pmu event creation failed %ld\n",
PTR_ERR(event));
return 0xf;
}
if (event->pmu) {
pmu = to_arm_pmu(event->pmu);
if (pmu->pmuver)
pmuver = pmu->pmuver;
}
perf_event_disable(event);
perf_event_release_kernel(event);
return pmuver;
}
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
{
unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
u64 val, mask = 0;
int base, i;
if (!pmceid1) {
val = read_sysreg(pmceid0_el0);
base = 0;
} else {
val = read_sysreg(pmceid1_el0);
base = 32;
}
if (!bmap)
return val;
for (i = 0; i < 32; i += 8) {
u64 byte;
byte = bitmap_get_value8(bmap, base + i);
mask |= byte << i;
byte = bitmap_get_value8(bmap, 0x4000 + base + i);
mask |= byte << (32 + i);
}
return val & mask;
}
bool kvm_arm_support_pmu_v3(void)
{
/*
......@@ -735,15 +837,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
{
if (!kvm_arm_support_pmu_v3())
return -ENODEV;
if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return -ENXIO;
if (vcpu->arch.pmu.created)
return -EBUSY;
if (irqchip_in_kernel(vcpu->kvm)) {
int ret;
......@@ -796,6 +889,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
if (!kvm_arm_support_pmu_v3() ||
!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return -ENODEV;
if (vcpu->arch.pmu.created)
return -EBUSY;
if (!vcpu->kvm->arch.pmuver)
vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
if (vcpu->kvm->arch.pmuver == 0xf)
return -ENODEV;
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ: {
int __user *uaddr = (int __user *)(long)attr->addr;
......@@ -804,9 +910,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return -ENODEV;
if (get_user(irq, uaddr))
return -EFAULT;
......@@ -824,6 +927,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
vcpu->arch.pmu.irq_num = irq;
return 0;
}
case KVM_ARM_VCPU_PMU_V3_FILTER: {
struct kvm_pmu_event_filter __user *uaddr;
struct kvm_pmu_event_filter filter;
int nr_events;
nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1;
uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
if (copy_from_user(&filter, uaddr, sizeof(filter)))
return -EFAULT;
if (((u32)filter.base_event + filter.nevents) > nr_events ||
(filter.action != KVM_PMU_EVENT_ALLOW &&
filter.action != KVM_PMU_EVENT_DENY))
return -EINVAL;
mutex_lock(&vcpu->kvm->lock);
if (!vcpu->kvm->arch.pmu_filter) {
vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL);
if (!vcpu->kvm->arch.pmu_filter) {
mutex_unlock(&vcpu->kvm->lock);
return -ENOMEM;
}
/*
* The default depends on the first applied filter.
* If it allows events, the default is to deny.
* Conversely, if the first filter denies a set of
* events, the default is to allow.
*/
if (filter.action == KVM_PMU_EVENT_ALLOW)
bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events);
else
bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events);
}
if (filter.action == KVM_PMU_EVENT_ALLOW)
bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
else
bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
mutex_unlock(&vcpu->kvm->lock);
return 0;
}
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
......@@ -860,6 +1010,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ:
case KVM_ARM_VCPU_PMU_V3_INIT:
case KVM_ARM_VCPU_PMU_V3_FILTER:
if (kvm_arm_support_pmu_v3() &&
test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return 0;
......
......@@ -31,9 +31,9 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
*/
void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
{
struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
if (!kvm_pmu_switch_needed(attr))
if (!ctx || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
......@@ -47,7 +47,10 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
*/
void kvm_clr_pmu_events(u32 clr)
{
struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
if (!ctx)
return;
ctx->pmu_events.events_host &= ~clr;
ctx->pmu_events.events_guest &= ~clr;
......@@ -173,7 +176,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
return;
preempt_disable();
host = this_cpu_ptr(&kvm_host_data);
host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
......@@ -193,7 +196,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
if (!has_vhe())
return;
host = this_cpu_ptr(&kvm_host_data);
host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
......
......@@ -425,27 +425,30 @@ static int get_kernel_wa_level(u64 regid)
{
switch (regid) {
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
switch (kvm_arm_harden_branch_predictor()) {
case KVM_BP_HARDEN_UNKNOWN:
switch (arm64_get_spectre_v2_state()) {
case SPECTRE_VULNERABLE:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
case KVM_BP_HARDEN_WA_NEEDED:
case SPECTRE_MITIGATED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
case KVM_BP_HARDEN_NOT_REQUIRED:
case SPECTRE_UNAFFECTED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
}
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
switch (kvm_arm_have_ssbd()) {
case KVM_SSBD_FORCE_DISABLE:
switch (arm64_get_spectre_v4_state()) {
case SPECTRE_MITIGATED:
/*
* As for the hypercall discovery, we pretend we
* don't have any FW mitigation if SSBS is there at
* all times.
*/
if (cpus_have_final_cap(ARM64_SSBS))
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
case KVM_SSBD_KERNEL:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
case KVM_SSBD_FORCE_ENABLE:
case KVM_SSBD_MITIGATED:
fallthrough;
case SPECTRE_UNAFFECTED:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
case KVM_SSBD_UNKNOWN:
default:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
case SPECTRE_VULNERABLE:
return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
}
}
......@@ -462,14 +465,8 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
val = kvm_psci_version(vcpu, vcpu->kvm);
break;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
break;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
kvm_arm_get_vcpu_workaround_2_flag(vcpu))
val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
break;
default:
return -ENOENT;
......@@ -527,34 +524,35 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
return -EINVAL;
wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
if (get_kernel_wa_level(reg->id) < wa_level)
return -EINVAL;
/* The enabled bit must not be set unless the level is AVAIL. */
if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
wa_level != val)
if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) &&
(val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL)
return -EINVAL;
/* Are we finished or do we need to check the enable bit ? */
if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
return 0;
/*
* If this kernel supports the workaround to be switched on
* or off, make sure it matches the requested setting.
* Map all the possible incoming states to the only two we
* really want to deal with.
*/
switch (wa_level) {
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
kvm_arm_set_vcpu_workaround_2_flag(vcpu,
val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
switch (val & KVM_REG_FEATURE_LEVEL_MASK) {
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
break;
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
break;
default:
return -EINVAL;
}
/*
* We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
* other way around.
*/
if (get_kernel_wa_level(reg->id) < wa_level)
return -EINVAL;
return 0;
default:
return -ENOENT;
......
......@@ -319,10 +319,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.reset_state.reset = false;
}
/* Default workaround setup is enabled (if supported) */
if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
/* Reset timer */
ret = kvm_timer_vcpu_reset(vcpu);
out:
......@@ -339,7 +335,7 @@ u32 get_kvm_ipa_limit(void)
int kvm_set_ipa_limit(void)
{
unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
unsigned int parange, tgran_2;
u64 mmfr0;
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
......@@ -376,39 +372,11 @@ int kvm_set_ipa_limit(void)
break;
}
pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
/* Clamp the IPA limit to the PA size supported by the kernel */
ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
/*
* Since our stage2 table is dependent on the stage1 page table code,
* we must always honor the following condition:
*
* Number of levels in Stage1 >= Number of levels in Stage2.
*
* So clamp the ipa limit further down to limit the number of levels.
* Since we can concatenate upto 16 tables at entry level, we could
* go upto 4bits above the maximum VA addressable with the current
* number of levels.
*/
va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
va_max += 4;
if (va_max < ipa_max)
ipa_max = va_max;
/*
* If the final limit is lower than the real physical address
* limit of the CPUs, report the reason.
*/
if (ipa_max < pa_max)
pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
(va_max < pa_max) ? "Virtual" : "Physical");
WARN(ipa_max < KVM_PHYS_SHIFT,
"KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
kvm_ipa_limit = ipa_max;
kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
"KVM IPA Size Limit (%d bits) is smaller than default size\n",
kvm_ipa_limit);
kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
return 0;
}
......
......@@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_access_el0_disabled(vcpu))
return false;
if (!(p->Op2 & 1))
pmceid = read_sysreg(pmceid0_el0);
else
pmceid = read_sysreg(pmceid1_el0);
pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
p->regval = pmceid;
......@@ -1131,6 +1128,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
if (!vcpu_has_sve(vcpu))
val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT);
if (!(val & (0xfUL << ID_AA64PFR0_CSV2_SHIFT)) &&
arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
val |= (1UL << ID_AA64PFR0_CSV2_SHIFT);
} else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
(0xfUL << ID_AA64ISAR1_API_SHIFT) |
......
......@@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v)
return 0;
}
static const struct seq_operations vgic_debug_seq_ops = {
static const struct seq_operations vgic_debug_sops = {
.start = vgic_debug_start,
.next = vgic_debug_next,
.stop = vgic_debug_stop,
.show = vgic_debug_show
};
static int debug_open(struct inode *inode, struct file *file)
{
int ret;
ret = seq_open(file, &vgic_debug_seq_ops);
if (!ret) {
struct seq_file *seq;
/* seq_open will have modified file->private_data */
seq = file->private_data;
seq->private = inode->i_private;
}
return ret;
};
static const struct file_operations vgic_debug_fops = {
.owner = THIS_MODULE,
.open = debug_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release
};
DEFINE_SEQ_ATTRIBUTE(vgic_debug);
void vgic_debug_init(struct kvm *kvm)
{
......
......@@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
if (likely(cpu_if->vgic_sre))
kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
if (has_vhe())
__vgic_v3_activate_traps(cpu_if);
......@@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
vgic_v3_vmcr_sync(vcpu);
kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
if (has_vhe())
__vgic_v3_deactivate_traps(cpu_if);
......
......@@ -34,6 +34,7 @@ struct kvm_pmu {
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu);
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1);
void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu);
......@@ -108,6 +109,10 @@ static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
return 0;
}
static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
{
return 0;
}
#endif
#endif
......@@ -49,6 +49,7 @@
#define ARM_SMCCC_OWNER_OEM 3
#define ARM_SMCCC_OWNER_STANDARD 4
#define ARM_SMCCC_OWNER_STANDARD_HYP 5
#define ARM_SMCCC_OWNER_VENDOR_HYP 6
#define ARM_SMCCC_OWNER_TRUSTED_APP 48
#define ARM_SMCCC_OWNER_TRUSTED_APP_END 49
#define ARM_SMCCC_OWNER_TRUSTED_OS 50
......@@ -227,87 +228,67 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define __count_args(...) \
___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0)
#define __constraint_write_0 \
"+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3)
#define __constraint_write_1 \
"+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3)
#define __constraint_write_2 \
"+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3)
#define __constraint_write_3 \
"+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3)
#define __constraint_write_4 __constraint_write_3
#define __constraint_write_5 __constraint_write_4
#define __constraint_write_6 __constraint_write_5
#define __constraint_write_7 __constraint_write_6
#define __constraint_read_0
#define __constraint_read_1
#define __constraint_read_2
#define __constraint_read_3
#define __constraint_read_4 "r" (r4)
#define __constraint_read_5 __constraint_read_4, "r" (r5)
#define __constraint_read_6 __constraint_read_5, "r" (r6)
#define __constraint_read_7 __constraint_read_6, "r" (r7)
#define __constraint_read_0 "r" (arg0)
#define __constraint_read_1 __constraint_read_0, "r" (arg1)
#define __constraint_read_2 __constraint_read_1, "r" (arg2)
#define __constraint_read_3 __constraint_read_2, "r" (arg3)
#define __constraint_read_4 __constraint_read_3, "r" (arg4)
#define __constraint_read_5 __constraint_read_4, "r" (arg5)
#define __constraint_read_6 __constraint_read_5, "r" (arg6)
#define __constraint_read_7 __constraint_read_6, "r" (arg7)
#define __declare_arg_0(a0, res) \
struct arm_smccc_res *___res = res; \
register unsigned long r0 asm("r0") = (u32)a0; \
register unsigned long r1 asm("r1"); \
register unsigned long r2 asm("r2"); \
register unsigned long r3 asm("r3")
register unsigned long arg0 asm("r0") = (u32)a0
#define __declare_arg_1(a0, a1, res) \
typeof(a1) __a1 = a1; \
struct arm_smccc_res *___res = res; \
register unsigned long r0 asm("r0") = (u32)a0; \
register unsigned long r1 asm("r1") = __a1; \
register unsigned long r2 asm("r2"); \
register unsigned long r3 asm("r3")
register unsigned long arg0 asm("r0") = (u32)a0; \
register typeof(a1) arg1 asm("r1") = __a1
#define __declare_arg_2(a0, a1, a2, res) \
typeof(a1) __a1 = a1; \
typeof(a2) __a2 = a2; \
struct arm_smccc_res *___res = res; \
register unsigned long r0 asm("r0") = (u32)a0; \
register unsigned long r1 asm("r1") = __a1; \
register unsigned long r2 asm("r2") = __a2; \
register unsigned long r3 asm("r3")
register unsigned long arg0 asm("r0") = (u32)a0; \
register typeof(a1) arg1 asm("r1") = __a1; \
register typeof(a2) arg2 asm("r2") = __a2
#define __declare_arg_3(a0, a1, a2, a3, res) \
typeof(a1) __a1 = a1; \
typeof(a2) __a2 = a2; \
typeof(a3) __a3 = a3; \
struct arm_smccc_res *___res = res; \
register unsigned long r0 asm("r0") = (u32)a0; \
register unsigned long r1 asm("r1") = __a1; \
register unsigned long r2 asm("r2") = __a2; \
register unsigned long r3 asm("r3") = __a3
register unsigned long arg0 asm("r0") = (u32)a0; \
register typeof(a1) arg1 asm("r1") = __a1; \
register typeof(a2) arg2 asm("r2") = __a2; \
register typeof(a3) arg3 asm("r3") = __a3
#define __declare_arg_4(a0, a1, a2, a3, a4, res) \
typeof(a4) __a4 = a4; \
__declare_arg_3(a0, a1, a2, a3, res); \
register unsigned long r4 asm("r4") = __a4
register typeof(a4) arg4 asm("r4") = __a4
#define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \
typeof(a5) __a5 = a5; \
__declare_arg_4(a0, a1, a2, a3, a4, res); \
register unsigned long r5 asm("r5") = __a5
register typeof(a5) arg5 asm("r5") = __a5
#define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \
typeof(a6) __a6 = a6; \
__declare_arg_5(a0, a1, a2, a3, a4, a5, res); \
register unsigned long r6 asm("r6") = __a6
register typeof(a6) arg6 asm("r6") = __a6
#define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \
typeof(a7) __a7 = a7; \
__declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \
register unsigned long r7 asm("r7") = __a7
register typeof(a7) arg7 asm("r7") = __a7
#define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__)
#define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__)
#define ___constraints(count) \
: __constraint_write_ ## count \
: __constraint_read_ ## count \
: "memory"
#define __constraints(count) ___constraints(count)
......@@ -319,8 +300,13 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
*/
#define __arm_smccc_1_1(inst, ...) \
do { \
register unsigned long r0 asm("r0"); \
register unsigned long r1 asm("r1"); \
register unsigned long r2 asm("r2"); \
register unsigned long r3 asm("r3"); \
__declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \
asm volatile(inst "\n" \
asm volatile(inst "\n" : \
"=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3) \
__constraints(__count_args(__VA_ARGS__))); \
if (___res) \
*___res = (typeof(*___res)){r0, r1, r2, r3}; \
......@@ -366,7 +352,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define __fail_smccc_1_1(...) \
do { \
__declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \
asm ("" __constraints(__count_args(__VA_ARGS__))); \
asm ("" : __constraints(__count_args(__VA_ARGS__))); \
if (___res) \
___res->a0 = SMCCC_RET_NOT_SUPPORTED; \
} while (0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment