Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini: "ARM: - selftest fix - force PTE mapping on device pages provided via VFIO - fix detection of cacheable mapping at S2 - fallback to PMD/PTE mappings for composite huge pages - fix accounting of Stage-2 PGD allocation - fix AArch32 handling of some of the debug registers - simplify host HYP entry - fix stray pointer conversion on nVHE TLB invalidation - fix initialization of the nVHE code - simplify handling of capabilities exposed to HYP - nuke VCPUs caught using a forbidden AArch32 EL0 x86: - new nested virtualization selftest - miscellaneous fixes - make W=1 fixes - reserve new CPUID bit in the KVM leaves" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: vmx: remove unused variable KVM: selftests: Don't require THP to run tests KVM: VMX: eVMCS: make evmcs_sanitize_exec_ctrls() work again KVM: selftests: test behavior of unmapped L2 APIC-access address KVM: x86: Fix NULL dereference at kvm_msr_ignored_check() KVM: x86: replace static const variables with macros KVM: arm64: Handle Asymmetric AArch32 systems arm64: cpufeature: upgrade hyp caps to final arm64: cpufeature: reorder cpus_have_{const, final}_cap() KVM: arm64: Factor out is_{vhe,nvhe}_hyp_code() KVM: arm64: Force PTE mapping on fault resulting in a device mapping KVM: arm64: Use fallback mapping sizes for contiguous huge page sizes KVM: arm64: Fix masks in stage2_pte_cacheable() KVM: arm64: Fix AArch32 handling of DBGD{CCINT,SCRext} and DBGVCR KVM: arm64: Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT KVM: arm64: Drop useless PAN setting on host EL1 to EL2 transition KVM: arm64: Remove leftover kern_hyp_va() in nVHE TLB invalidation KVM: arm64: Don't corrupt tpidr_el2 on failed HVC call x86/kvm: Reserve KVM_FEATURE_MSI_EXT_DEST_ID

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini: "ARM: - selftest fix - force PTE mapping on device pages provided via VFIO - fix detection of cacheable mapping at S2 - fallback to PMD/PTE mappings for composite huge pages - fix accounting of Stage-2 PGD allocation - fix AArch32 handling of some of the debug registers - simplify host HYP entry - fix stray pointer conversion on nVHE TLB invalidation - fix initialization of the nVHE code - simplify handling of capabilities exposed to HYP - nuke VCPUs caught using a forbidden AArch32 EL0 x86: - new nested virtualization selftest - miscellaneous fixes - make W=1 fixes - reserve new CPUID bit in the KVM leaves" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: vmx: remove unused variable KVM: selftests: Don't require THP to run tests KVM: VMX: eVMCS: make evmcs_sanitize_exec_ctrls() work again KVM: selftests: test behavior of unmapped L2 APIC-access address KVM: x86: Fix NULL dereference at kvm_msr_ignored_check() KVM: x86: replace static const variables with macros KVM: arm64: Handle Asymmetric AArch32 systems arm64: cpufeature: upgrade hyp caps to final arm64: cpufeature: reorder cpus_have_{const, final}_cap() KVM: arm64: Factor out is_{vhe,nvhe}_hyp_code() KVM: arm64: Force PTE mapping on fault resulting in a device mapping KVM: arm64: Use fallback mapping sizes for contiguous huge page sizes KVM: arm64: Fix masks in stage2_pte_cacheable() KVM: arm64: Fix AArch32 handling of DBGD{CCINT,SCRext} and DBGVCR KVM: arm64: Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT KVM: arm64: Drop useless PAN setting on host EL1 to EL2 transition KVM: arm64: Remove leftover kern_hyp_va() in nVHE TLB invalidation KVM: arm64: Don't corrupt tpidr_el2 on failed HVC call x86/kvm: Reserve KVM_FEATURE_MSI_EXT_DEST_ID
2d38c80d · Linus Torvalds · c2dc4c07 · 9478dec3 · 2d38c80d · 2d38c80d
Commit 2d38c80d authored Nov 01, 2020 by Linus Torvalds
26 changed files
--- a/Documentation/virt/kvm/cpuid.rst
+++ b/Documentation/virt/kvm/cpuid.rst
@@ -92,6 +92,10 @@ KVM_FEATURE_ASYNC_PF_INT           14          guest checks this feature bit
                                               async pf acknowledgment msr
                                               0x4b564d07.

+KVM_FEATURE_MSI_EXT_DEST_ID        15          guest checks this feature bit
+                                               before using extended destination
+                                               ID bits in MSI address bits 11-5.
+
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24          host will warn if no guest-side
                                               per-cpu warps are expected in
                                               kvmclock

--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -375,6 +375,23 @@ cpucap_multi_entry_cap_matches(const struct arm64_cpu_capabilities *entry,
 	return false;
 }

+static __always_inline bool is_vhe_hyp_code(void)
+{
+	/* Only defined for code run in VHE hyp context */
+	return __is_defined(__KVM_VHE_HYPERVISOR__);
+}
+
+static __always_inline bool is_nvhe_hyp_code(void)
+{
+	/* Only defined for code run in NVHE hyp context */
+	return __is_defined(__KVM_NVHE_HYPERVISOR__);
+}
+
+static __always_inline bool is_hyp_code(void)
+{
+	return is_vhe_hyp_code() || is_nvhe_hyp_code();
+}
+
 extern DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
 extern struct static_key_false cpu_hwcap_keys[ARM64_NCAPS];
 extern struct static_key_false arm64_const_caps_ready;
@@ -428,35 +445,40 @@ static __always_inline bool __cpus_have_const_cap(int num)
 }

 /*
- * Test for a capability, possibly with a runtime check.
+ * Test for a capability without a runtime check.
 *
- * Before capabilities are finalized, this behaves as cpus_have_cap().
+ * Before capabilities are finalized, this will BUG().
 * After capabilities are finalized, this is patched to avoid a runtime check.
 *
 * @num must be a compile-time constant.
 */
-static __always_inline bool cpus_have_const_cap(int num)
+static __always_inline bool cpus_have_final_cap(int num)
 {
 	if (system_capabilities_finalized())
 		return __cpus_have_const_cap(num);
 	else
-		return cpus_have_cap(num);
+		BUG();
 }

 /*
- * Test for a capability without a runtime check.
+ * Test for a capability, possibly with a runtime check for non-hyp code.
 *
- * Before capabilities are finalized, this will BUG().
+ * For hyp code, this behaves the same as cpus_have_final_cap().
+ *
+ * For non-hyp code:
+ * Before capabilities are finalized, this behaves as cpus_have_cap().
 * After capabilities are finalized, this is patched to avoid a runtime check.
 *
 * @num must be a compile-time constant.
 */
-static __always_inline bool cpus_have_final_cap(int num)
+static __always_inline bool cpus_have_const_cap(int num)
 {
-	if (system_capabilities_finalized())
+	if (is_hyp_code())
+		return cpus_have_final_cap(num);
+	else if (system_capabilities_finalized())
 		return __cpus_have_const_cap(num);
 	else
-		BUG();
+		return cpus_have_cap(num);
 }

 static inline void cpus_set_cap(unsigned int num)

--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -239,6 +239,7 @@ enum vcpu_sysreg {
 #define cp14_DBGWCR0	(DBGWCR0_EL1 * 2)
 #define cp14_DBGWVR0	(DBGWVR0_EL1 * 2)
 #define cp14_DBGDCCINT	(MDCCINT_EL1 * 2)
+#define cp14_DBGVCR	(DBGVCR32_EL2 * 2)

 #define NR_COPRO_REGS	(NR_SYS_REGS * 2)


--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -86,13 +86,12 @@ static inline bool is_kernel_in_hyp_mode(void)
 static __always_inline bool has_vhe(void)
 {
 	/*
-	 * The following macros are defined for code specic to VHE/nVHE.
-	 * If has_vhe() is inlined into those compilation units, it can
-	 * be determined statically. Otherwise fall back to caps.
+	 * Code only run in VHE/NVHE hyp context can assume VHE is present or
+	 * absent. Otherwise fall back to caps.
 	 */
-	if (__is_defined(__KVM_VHE_HYPERVISOR__))
+	if (is_vhe_hyp_code())
 		return true;
-	else if (__is_defined(__KVM_NVHE_HYPERVISOR__))
+	else if (is_nvhe_hyp_code())
 		return false;
 	else
 		return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN);

--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -87,7 +87,6 @@ KVM_NVHE_ALIAS(__icache_flags);
 /* Kernel symbols needed for cpus_have_final/const_caps checks. */
 KVM_NVHE_ALIAS(arm64_const_caps_ready);
 KVM_NVHE_ALIAS(cpu_hwcap_keys);
-KVM_NVHE_ALIAS(cpu_hwcaps);

 /* Static keys which are set if a vGIC trap should be handled in hyp. */
 KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);

--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -808,6 +808,25 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)

 		preempt_enable();

+		/*
+		 * The ARMv8 architecture doesn't give the hypervisor
+		 * a mechanism to prevent a guest from dropping to AArch32 EL0
+		 * if implemented by the CPU. If we spot the guest in such
+		 * state and that we decided it wasn't supposed to do so (like
+		 * with the asymmetric AArch32 case), return to userspace with
+		 * a fatal error.
+		 */
+		if (!system_supports_32bit_el0() && vcpu_mode_is_32bit(vcpu)) {
+			/*
+			 * As we have caught the guest red-handed, decide that
+			 * it isn't fit for purpose anymore by making the vcpu
+			 * invalid. The VMM can try and fix it by issuing  a
+			 * KVM_ARM_VCPU_INIT if it really wants to.
+			 */
+			vcpu->arch.target = -1;
+			ret = ARM_EXCEPTION_IL;
+		}
+
 		ret = handle_exit(vcpu, ret);
 	}


--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -17,8 +17,6 @@ SYM_FUNC_START(__host_exit)

 	get_host_ctxt	x0, x1

-	ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
-
 	/* Store the host regs x2 and x3 */
 	stp	x2, x3,   [x0, #CPU_XREG_OFFSET(2)]


--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -57,16 +57,25 @@ __do_hyp_init:
 	cmp	x0, #HVC_STUB_HCALL_NR
 	b.lo	__kvm_handle_stub_hvc

-	/* Set tpidr_el2 for use by HYP to free a register */
-	msr	tpidr_el2, x2
-
-	mov	x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
-	cmp	x0, x2
-	b.eq	1f
+	// We only actively check bits [24:31], and everything
+	// else has to be zero, which we check at build time.
+#if (KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) & 0xFFFFFFFF00FFFFFF)
+#error Unexpected __KVM_HOST_SMCCC_FUNC___kvm_hyp_init value
+#endif
+
+	ror	x0, x0, #24
+	eor	x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 24) & 0xF)
+	ror	x0, x0, #4
+	eor	x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 28) & 0xF)
+	cbz	x0, 1f
 	mov	x0, #SMCCC_RET_NOT_SUPPORTED
 	eret

-1:	phys_to_ttbr x0, x1
+1:
+	/* Set tpidr_el2 for use by HYP to free a register */
+	msr	tpidr_el2, x2
+
+	phys_to_ttbr x0, x1
 alternative_if ARM64_HAS_CNP
 	orr	x0, x0, #TTBR_CNP_BIT
 alternative_else_nop_endif

--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -128,7 +128,6 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
 	struct tlb_inv_context cxt;

 	/* Switch to requested VMID */
-	mmu = kern_hyp_va(mmu);
 	__tlb_switch_to_guest(mmu, &cxt);

 	__tlbi(vmalle1);

--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -635,7 +635,7 @@ static void stage2_flush_dcache(void *addr, u64 size)

 static bool stage2_pte_cacheable(kvm_pte_t pte)
 {
-	u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
+	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
 	return memattr == PAGE_S2_MEMATTR(NORMAL);
 }

@@ -846,7 +846,7 @@ int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
 	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

 	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
-	pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
+	pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
 	if (!pgt->pgd)
 		return -ENOMEM;


--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -787,14 +787,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		vma_shift = PAGE_SHIFT;
 	}

-	if (vma_shift == PUD_SHIFT &&
-	    !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
-	       vma_shift = PMD_SHIFT;
-
-	if (vma_shift == PMD_SHIFT &&
-	    !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
-		force_pte = true;
+	switch (vma_shift) {
+	case PUD_SHIFT:
+		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+			break;
+		fallthrough;
+	case CONT_PMD_SHIFT:
+		vma_shift = PMD_SHIFT;
+		fallthrough;
+	case PMD_SHIFT:
+		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+			break;
+		fallthrough;
+	case CONT_PTE_SHIFT:
 		vma_shift = PAGE_SHIFT;
+		force_pte = true;
+		fallthrough;
+	case PAGE_SHIFT:
+		break;
+	default:
+		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
 	}

 	vma_pagesize = 1UL << vma_shift;
@@ -839,6 +851,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,

 	if (kvm_is_device_pfn(pfn)) {
 		device = true;
+		force_pte = true;
 	} else if (logging_active && !write_fault) {
 		/*
 		 * Only actually map the page as writable if this was a write

--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1897,9 +1897,9 @@ static const struct sys_reg_desc cp14_regs[] = {
 	{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
 	DBG_BCR_BVR_WCR_WVR(1),
 	/* DBGDCCINT */
-	{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32 },
+	{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT },
 	/* DBGDSCRext */
-	{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32 },
+	{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext },
 	DBG_BCR_BVR_WCR_WVR(2),
 	/* DBGDTR[RT]Xint */
 	{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
@@ -1914,7 +1914,7 @@ static const struct sys_reg_desc cp14_regs[] = {
 	{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
 	DBG_BCR_BVR_WCR_WVR(6),
 	/* DBGVCR */
-	{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32 },
+	{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR },
 	DBG_BCR_BVR_WCR_WVR(7),
 	DBG_BCR_BVR_WCR_WVR(8),
 	DBG_BCR_BVR_WCR_WVR(9),

--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -32,6 +32,7 @@
 #define KVM_FEATURE_POLL_CONTROL	12
 #define KVM_FEATURE_PV_SCHED_YIELD	13
 #define KVM_FEATURE_ASYNC_PF_INT	14
+#define KVM_FEATURE_MSI_EXT_DEST_ID	15

 #define KVM_HINTS_REALTIME      0


--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -225,7 +225,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
 {
 	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

-	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
 	       & shadow_nonpresent_or_rsvd_mask;

 	return gpa >> PAGE_SHIFT;
@@ -591,15 +591,15 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 static u64 restore_acc_track_spte(u64 spte)
 {
 	u64 new_spte = spte;
-	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
-			 & shadow_acc_track_saved_bits_mask;
+	u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+			 & SHADOW_ACC_TRACK_SAVED_BITS_MASK;

 	WARN_ON_ONCE(spte_ad_enabled(spte));
 	WARN_ON_ONCE(!is_access_track_spte(spte));

 	new_spte &= ~shadow_acc_track_mask;
-	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
-		      shadow_acc_track_saved_bits_shift);
+	new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+		      SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
 	new_spte |= saved_bits;

 	return new_spte;

--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -55,7 +55,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
 	mask |= shadow_mmio_value | access;
 	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
 	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
-		<< shadow_nonpresent_or_rsvd_mask_len;
+		<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;

 	return mask;
 }
@@ -231,12 +231,12 @@ u64 mark_spte_for_access_track(u64 spte)
 		  !spte_can_locklessly_be_made_writable(spte),
 		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");

-	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
-			  shadow_acc_track_saved_bits_shift),
+	WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+			  SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
 		  "kvm: Access Tracking saved bit locations are not zero\n");

-	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
-		shadow_acc_track_saved_bits_shift;
+	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
+		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
 	spte &= ~shadow_acc_track_mask;

 	return spte;
@@ -245,7 +245,7 @@ u64 mark_spte_for_access_track(u64 spte)
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
 {
 	BUG_ON((u64)(unsigned)access_mask != access_mask);
-	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
 	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
 	shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
 	shadow_mmio_access_mask = access_mask;
@@ -306,9 +306,9 @@ void kvm_mmu_reset_all_pte_masks(void)
 	low_phys_bits = boot_cpu_data.x86_phys_bits;
 	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
 	    !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
-			  52 - shadow_nonpresent_or_rsvd_mask_len)) {
+			  52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
 		low_phys_bits = boot_cpu_data.x86_cache_bits
-			- shadow_nonpresent_or_rsvd_mask_len;
+			- SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
 		shadow_nonpresent_or_rsvd_mask =
 			rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
 	}

--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -104,20 +104,20 @@ extern u64 __read_mostly shadow_acc_track_mask;
 */
 extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
+
 /*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes. We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.
 */
-static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
-						    PT64_EPT_EXECUTABLE_MASK;
-static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
-
-/*
- * The number of high-order 1 bits to use in the mask above.
- */
-static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+					  PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT

 /*
 * In some cases, we need to preserve the GFN of a non-present or reserved

--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -297,14 +297,13 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
 };
 const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);

-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 {
 	vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
 	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;

 	vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
 	vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
-
 }
 #endif


--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -185,7 +185,7 @@ static inline void evmcs_load(u64 phys_addr)
 	vp_ap->enlighten_vmentry = 1;
 }

-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
 #else /* !IS_ENABLED(CONFIG_HYPERV) */
 static inline void evmcs_write64(unsigned long field, u64 value) {}
 static inline void evmcs_write32(unsigned long field, u32 value) {}
@@ -194,7 +194,6 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; }
 static inline u32 evmcs_read32(unsigned long field) { return 0; }
 static inline u16 evmcs_read16(unsigned long field) { return 0; }
 static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
 static inline void evmcs_touch_msr_bitmap(void) {}
 #endif /* IS_ENABLED(CONFIG_HYPERV) */


--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2560,8 +2560,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	vmcs_conf->vmexit_ctrl         = _vmexit_control;
 	vmcs_conf->vmentry_ctrl        = _vmentry_control;

-	if (static_branch_unlikely(&enable_evmcs))
+#if IS_ENABLED(CONFIG_HYPERV)
+	if (enlightened_vmcs)
 		evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif

 	return 0;
 }
@@ -6834,7 +6836,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx;
-	unsigned long *msr_bitmap;
 	int i, cpu, err;

 	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
@@ -6894,7 +6895,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);

-	msr_bitmap = vmx->vmcs01.msr_bitmap;
 	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
 	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -265,13 +265,13 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,

 	if (ignore_msrs) {
 		if (report_ignored_msrs)
-			vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
-				    op, msr, data);
+			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
 		/* Mask the error */
 		return 0;
 	} else {
-		vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
-				       op, msr, data);
+		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
 		return -ENOENT;
 	}
 }

--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -15,6 +15,7 @@
 /x86_64/vmx_preemption_timer_test
 /x86_64/svm_vmcall_test
 /x86_64/sync_regs_test
+/x86_64/vmx_apic_access_test
 /x86_64/vmx_close_while_nested_test
 /x86_64/vmx_dirty_log_test
 /x86_64/vmx_set_nested_state_test

--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -49,6 +49,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test

--- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -573,6 +573,10 @@ struct vmx_pages {
 	void *eptp_hva;
 	uint64_t eptp_gpa;
 	void *eptp;
+
+	void *apic_access_hva;
+	uint64_t apic_access_gpa;
+	void *apic_access;
 };

 union vmx_basic {
@@ -615,5 +619,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
 			uint32_t memslot, uint32_t eptp_memslot);
 void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
 		  uint32_t eptp_memslot);
+void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
+				      uint32_t eptp_memslot);

 #endif /* SELFTEST_KVM_VMX_H */
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -14,6 +14,7 @@
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <unistd.h>
 #include <linux/kernel.h>

 #define KVM_UTIL_PGS_PER_HUGEPG 512
@@ -664,13 +665,21 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,

 	/* As needed perform madvise */
 	if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
-		ret = madvise(region->host_mem, npages * vm->page_size,
-			     src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-		TEST_ASSERT(ret == 0, "madvise failed,\n"
-			    "  addr: %p\n"
-			    "  length: 0x%lx\n"
-			    "  src_type: %x",
-			    region->host_mem, npages * vm->page_size, src_type);
+		struct stat statbuf;
+
+		ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+		TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+			    "stat /sys/kernel/mm/transparent_hugepage");
+
+		TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
+			    "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
+
+		if (ret == 0) {
+			ret = madvise(region->host_mem, npages * vm->page_size,
+				      src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+			TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
+				    region->host_mem, npages * vm->page_size, src_type);
+		}
 	}

 	region->unused_phy_pages = sparsebit_alloc();

--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -542,3 +542,12 @@ void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
 	vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp);
 	vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp);
 }
+
+void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
+				      uint32_t eptp_memslot)
+{
+	vmx->apic_access = (void *)vm_vaddr_alloc(vm, getpagesize(),
+						  0x10000, 0, 0);
+	vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access);
+	vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access);
+}
--- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_apic_access_test
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * The first subtest simply checks to see that an L2 guest can be
+ * launched with a valid APIC-access address that is backed by a
+ * page of L1 physical memory.
+ *
+ * The second subtest sets the APIC-access address to a (valid) L1
+ * physical address that is not backed by memory. KVM can't handle
+ * this situation, so resuming L2 should result in a KVM exit for
+ * internal error (emulation). This is not an architectural
+ * requirement. It is just a shortcoming of KVM. The internal error
+ * is unfortunate, but it's better than what used to happen!
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID		0
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+	/* Exit to L1 */
+	__asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa)
+{
+#define L2_GUEST_STACK_SIZE 64
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	uint32_t control;
+
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+	GUEST_ASSERT(load_vmcs(vmx_pages));
+
+	/* Prepare the VMCS for L2 execution. */
+	prepare_vmcs(vmx_pages, l2_guest_code,
+		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+	control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+	control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
+	control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
+	vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa);
+
+	/* Try to launch L2 with the memory-backed APIC-access address. */
+	GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	vmwrite(APIC_ACCESS_ADDR, high_gpa);
+
+	/* Try to resume L2 with the unbacked APIC-access address. */
+	GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+	GUEST_ASSERT(!vmresume());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long apic_access_addr = ~0ul;
+	unsigned int paddr_width;
+	unsigned int vaddr_width;
+	vm_vaddr_t vmx_pages_gva;
+	unsigned long high_gpa;
+	struct vmx_pages *vmx;
+	bool done = false;
+
+	nested_vmx_check_supported();
+
+	vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+	kvm_get_cpu_address_width(&paddr_width, &vaddr_width);
+	high_gpa = (1ul << paddr_width) - getpagesize();
+	if ((unsigned long)DEFAULT_GUEST_PHY_PAGES * getpagesize() > high_gpa) {
+		print_skip("No unbacked physical page available");
+		exit(KSFT_SKIP);
+	}
+
+	vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+	prepare_virtualize_apic_accesses(vmx, vm, 0);
+	vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa);
+
+	while (!done) {
+		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+		struct ucall uc;
+
+		vcpu_run(vm, VCPU_ID);
+		if (apic_access_addr == high_gpa) {
+			TEST_ASSERT(run->exit_reason ==
+				    KVM_EXIT_INTERNAL_ERROR,
+				    "Got exit reason other than KVM_EXIT_INTERNAL_ERROR: %u (%s)\n",
+				    run->exit_reason,
+				    exit_reason_str(run->exit_reason));
+			TEST_ASSERT(run->internal.suberror ==
+				    KVM_INTERNAL_ERROR_EMULATION,
+				    "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u\n",
+				    run->internal.suberror);
+			break;
+		}
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_ABORT:
+			TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+				  __FILE__, uc.args[1]);
+			/* NOT REACHED */
+		case UCALL_SYNC:
+			apic_access_addr = uc.args[1];
+			break;
+		case UCALL_DONE:
+			done = true;
+			break;
+		default:
+			TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd);
+		}
+	}
+	kvm_vm_free(vm);
+	return 0;
+}