Commit b9172f9e authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "More x86 fixes:

   - Logic bugs in CR0 writes and Hyper-V hypercalls

   - Don't use Enlightened MSR Bitmap for L3

   - Remove user-triggerable WARN

  Plus a few selftest fixes and a regression test for the
  user-triggerable WARN"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  selftests: KVM: Add test to verify KVM doesn't explode on "bad" I/O
  KVM: x86: Don't WARN if userspace mucks with RCX during string I/O exit
  KVM: X86: Raise #GP when clearing CR0_PG in 64 bit mode
  selftests: KVM: avoid failures due to reserved HyperTransport region
  KVM: x86: Ignore sparse banks size for an "all CPUs", non-sparse IPI req
  KVM: x86: Wait for IPIs to be delivered when handling Hyper-V TLB flush hypercall
  KVM: x86: selftests: svm_int_ctl_test: fix intercept calculation
  KVM: nVMX: Don't use Enlightened MSR Bitmap for L3
parents b8a98b6b 10e7a099
@@ -97,7 +97,7 @@
 		KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_TLB_FLUSH_CURRENT	KVM_ARCH_REQ(26)
 #define KVM_REQ_TLB_FLUSH_GUEST \
-		KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+		KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_APF_READY		KVM_ARCH_REQ(28)
 #define KVM_REQ_MSR_FILTER_CHANGED	KVM_ARCH_REQ(29)
 #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
...
@@ -1922,11 +1922,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
 		all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
 
+		if (all_cpus)
+			goto check_and_send_ipi;
+
 		if (!sparse_banks_len)
 			goto ret_success;
 
-		if (!all_cpus &&
-		    kvm_read_guest(kvm,
+		if (kvm_read_guest(kvm,
 				   hc->ingpa + offsetof(struct hv_send_ipi_ex,
							vp_set.bank_contents),
 				   sparse_banks,
@@ -1934,6 +1936,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
 	}
 
+check_and_send_ipi:
 	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
 		return HV_STATUS_INVALID_HYPERCALL_INPUT;
...
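The hunk above is easier to follow as straight-line logic. Below is a minimal, self-contained sketch of the resulting control flow; the types, the read_banks() callback, and the literal status values are stand-ins for the kernel's structures, kvm_read_guest(), and the TLFS-defined codes, not the real API:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative status values only (0 and 3 per the TLFS, assumed here). */
#define HV_STATUS_SUCCESS			0
#define HV_STATUS_INVALID_HYPERCALL_INPUT	3

/*
 * Sketch of the reordered checks: an "all CPUs" request jumps straight to
 * the vector check and IPI delivery, so a bogus sparse-bank count can no
 * longer fail the hypercall.  read_banks() stands in for reading
 * vp_set.bank_contents out of guest memory.
 */
int send_ipi_ex_flow(bool all_cpus, uint32_t sparse_banks_len,
		     int (*read_banks)(void))
{
	if (all_cpus)
		goto check_and_send_ipi;

	if (!sparse_banks_len)
		return HV_STATUS_SUCCESS;	/* empty set, nothing to do */

	if (read_banks())
		return HV_STATUS_INVALID_HYPERCALL_INPUT;

check_and_send_ipi:
	/* The vector range check and the actual IPI send would follow here. */
	return HV_STATUS_SUCCESS;
}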
@@ -2646,15 +2646,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 		if (!loaded_vmcs->msr_bitmap)
 			goto out_vmcs;
 		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
-		if (IS_ENABLED(CONFIG_HYPERV) &&
-		    static_branch_unlikely(&enable_evmcs) &&
-		    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
-			struct hv_enlightened_vmcs *evmcs =
-				(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
-			evmcs->hv_enlightenments_control.msr_bitmap = 1;
-		}
 	}
 
 	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -6842,6 +6833,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 	if (err < 0)
 		goto free_pml;
 
+	/*
+	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+	 * feature only for vmcs01, KVM currently isn't equipped to realize any
+	 * performance benefits from enabling it for vmcs02.
+	 */
+	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+		evmcs->hv_enlightenments_control.msr_bitmap = 1;
+	}
+
 	/* The MSR bitmap starts with all ones */
 	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
...
@@ -890,7 +890,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
 		return 1;
 
-	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+	if (!(cr0 & X86_CR0_PG) &&
+	    (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
 		return 1;
 
 	static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -7121,7 +7122,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
 			   unsigned short port, void *val, unsigned int count)
 {
 	if (vcpu->arch.pio.count) {
-		/* Complete previous iteration. */
+		/*
+		 * Complete a previous iteration that required userspace I/O.
+		 * Note, @count isn't guaranteed to match pio.count as userspace
+		 * can modify ECX before rerunning the vCPU. Ignore any such
+		 * shenanigans as KVM doesn't support modifying the rep count,
+		 * and the emulator ensures @count doesn't overflow the buffer.
+		 */
 	} else {
 		int r = __emulator_pio_in(vcpu, size, port, count);
 		if (!r)
@@ -7130,7 +7137,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
 		/* Results already available, fall through. */
 	}
 
-	WARN_ON(count != vcpu->arch.pio.count);
 	complete_emulator_pio_in(vcpu, val);
 	return 1;
 }
...
@@ -30,6 +30,7 @@
 /x86_64/svm_int_ctl_test
 /x86_64/sync_regs_test
 /x86_64/tsc_msrs_test
+/x86_64/userspace_io_test
 /x86_64/userspace_msr_exit_test
 /x86_64/vmx_apic_access_test
 /x86_64/vmx_close_while_nested_test
...
@@ -59,6 +59,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
+TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
 TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
...
@@ -71,6 +71,15 @@ enum vm_guest_mode {
 #endif
 
+#if defined(__x86_64__)
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
+#else
+static inline unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
+{
+	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+}
+#endif
+
 #define MIN_PAGE_SIZE		(1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE	ptes_per_page(MIN_PAGE_SIZE)
...
@@ -302,7 +302,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 		    (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
 
 	/* Limit physical addresses to PA-bits. */
-	vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+	vm->max_gfn = vm_compute_max_gfn(vm);
 
 	/* Allocate and setup memory for guest. */
 	vm->vpages_mapped = sparsebit_alloc();
...
@@ -1431,3 +1431,71 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
 	return cpuid;
 }
+
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+static inline unsigned x86_family(unsigned int eax)
+{
+	unsigned int x86;
+
+	x86 = (eax >> 8) & 0xf;
+
+	if (x86 == 0xf)
+		x86 += (eax >> 20) & 0xff;
+
+	return x86;
+}
+
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
+{
+	const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
+	unsigned long ht_gfn, max_gfn, max_pfn;
+	uint32_t eax, ebx, ecx, edx, max_ext_leaf;
+
+	max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
+
+	/* Avoid reserved HyperTransport region on AMD processors. */
+	eax = ecx = 0;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
+	    ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
+	    edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+		return max_gfn;
+
+	/* On parts with <40 physical address bits, the area is fully hidden */
+	if (vm->pa_bits < 40)
+		return max_gfn;
+
+	/* Before family 17h, the HyperTransport area is just below 1T. */
+	ht_gfn = (1 << 28) - num_ht_pages;
+	eax = 1;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (x86_family(eax) < 0x17)
+		goto done;
+
+	/*
+	 * Otherwise it's at the top of the physical address space, possibly
+	 * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use
+	 * the old conservative value if MAXPHYADDR is not enumerated.
+	 */
+	eax = 0x80000000;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	max_ext_leaf = eax;
+	if (max_ext_leaf < 0x80000008)
+		goto done;
+
+	eax = 0x80000008;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1;
+	if (max_ext_leaf >= 0x8000001f) {
+		eax = 0x8000001f;
+		cpuid(&eax, &ebx, &ecx, &edx);
+		max_pfn >>= (ebx >> 6) & 0x3f;
+	}
+
+	ht_gfn = max_pfn - num_ht_pages;
+done:
+	return min(max_gfn, ht_gfn - 1);
+}
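As a sanity check on the arithmetic above, here is a small standalone program, illustrative only: the raw CPUID.1:EAX value 0x00870F10 is just an example input (it decodes to family 17h), and the guard-GFN calculation shown is the pre-family-17h case for a 4 KiB page shift.

#include <stdio.h>

/* Same family decoding as x86_family() in the hunk above. */
static unsigned int x86_family(unsigned int eax)
{
	unsigned int x86 = (eax >> 8) & 0xf;

	if (x86 == 0xf)
		x86 += (eax >> 20) & 0xff;
	return x86;
}

int main(void)
{
	unsigned int eax = 0x00870F10;		/* example CPUID.1:EAX value */
	unsigned int page_shift = 12;		/* 4 KiB pages */

	/* 12 GiB worth of pages; the pre-Fam17h hole ends at 1 TiB (GFN 2^28). */
	unsigned long num_ht_pages = 12UL << (30 - page_shift);
	unsigned long ht_gfn = (1UL << 28) - num_ht_pages;

	printf("family = 0x%x\n", x86_family(eax));	/* prints 0x17 */
	printf("ht_gfn = %lu\n", ht_gfn);		/* 268435456 - 3145728 = 265289728 */
	return 0;
}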
@@ -75,7 +75,7 @@ static void l1_guest_code(struct svm_test_data *svm)
 	vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
 
 	/* No intercepts for real and virtual interrupts */
-	vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR);
+	vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR));
 
 	/* Make a virtual interrupt VINTR_IRQ_NUMBER pending */
 	vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT);
...
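The bug in the old expression is plain C operator precedence: << binds tighter than |, so INTERCEPT_VINTR was OR'd in as a raw bit index rather than shifted into a mask, and the VINTR intercept was never actually cleared. The snippet below demonstrates this standalone; it assumes the selftests' enum values of 0 for INTERCEPT_INTR and 4 for INTERCEPT_VINTR and spells out BIT() locally rather than pulling in the kernel headers.

#include <stdint.h>
#include <stdio.h>

/* Bit positions as defined by the selftests' svm.h enum (assumed here). */
#define INTERCEPT_INTR	0
#define INTERCEPT_VINTR	4
#define BIT(n)		(1ULL << (n))

int main(void)
{
	uint64_t intercept = ~0ULL;	/* pretend every intercept starts set */

	/* Old expression: parses as (1ULL << INTERCEPT_INTR) | INTERCEPT_VINTR = 0x5. */
	uint64_t old_mask = 1ULL << INTERCEPT_INTR | INTERCEPT_VINTR;
	/* Fixed expression: bits 0 and 4 as intended = 0x11. */
	uint64_t new_mask = BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR);

	printf("old mask 0x%llx -> VINTR still intercepted: %d\n",
	       (unsigned long long)old_mask,
	       !!((intercept & ~old_mask) & BIT(INTERCEPT_VINTR)));	/* 1 */
	printf("new mask 0x%llx -> VINTR still intercepted: %d\n",
	       (unsigned long long)new_mask,
	       !!((intercept & ~new_mask) & BIT(INTERCEPT_VINTR)));	/* 0 */
	return 0;
}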
// SPDX-License-Identifier: GPL-2.0
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"

#define VCPU_ID 1

static void guest_ins_port80(uint8_t *buffer, unsigned int count)
{
	unsigned long end;

	if (count == 2)
		end = (unsigned long)buffer + 1;
	else
		end = (unsigned long)buffer + 8192;

	asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory");
	GUEST_ASSERT_1(count == 0, count);
	GUEST_ASSERT_2((unsigned long)buffer == end, buffer, end);
}

static void guest_code(void)
{
	uint8_t buffer[8192];
	int i;

	/*
	 * Special case tests. main() will adjust RCX 2 => 1 and 3 => 8192 to
	 * test that KVM doesn't explode when userspace modifies the "count" on
	 * a userspace I/O exit. KVM isn't required to play nice with the I/O
	 * itself as KVM doesn't support manipulating the count, it just needs
	 * to not explode or overflow a buffer.
	 */
	guest_ins_port80(buffer, 2);
	guest_ins_port80(buffer, 3);

	/* Verify KVM fills the buffer correctly when not stuffing RCX. */
	memset(buffer, 0, sizeof(buffer));
	guest_ins_port80(buffer, 8192);
	for (i = 0; i < 8192; i++)
		GUEST_ASSERT_2(buffer[i] == 0xaa, i, buffer[i]);

	GUEST_DONE();
}

int main(int argc, char *argv[])
{
	struct kvm_regs regs;
	struct kvm_run *run;
	struct kvm_vm *vm;
	struct ucall uc;
	int rc;

	/* Tell stdout not to buffer its content */
	setbuf(stdout, NULL);

	/* Create VM */
	vm = vm_create_default(VCPU_ID, 0, guest_code);
	run = vcpu_state(vm, VCPU_ID);

	memset(&regs, 0, sizeof(regs));

	while (1) {
		rc = _vcpu_run(vm, VCPU_ID);

		TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
			    "Unexpected exit reason: %u (%s),\n",
			    run->exit_reason,
			    exit_reason_str(run->exit_reason));

		if (get_ucall(vm, VCPU_ID, &uc))
			break;

		TEST_ASSERT(run->io.port == 0x80,
			    "Expected I/O at port 0x80, got port 0x%x\n", run->io.port);

		/*
		 * Modify the rep string count in RCX: 2 => 1 and 3 => 8192.
		 * Note, this abuses KVM's batching of rep string I/O to avoid
		 * getting stuck in an infinite loop. That behavior isn't in
		 * scope from a testing perspective as it's not ABI in any way,
		 * i.e. it really is abusing internal KVM knowledge.
		 */
		vcpu_regs_get(vm, VCPU_ID, &regs);
		if (regs.rcx == 2)
			regs.rcx = 1;
		if (regs.rcx == 3)
			regs.rcx = 8192;
		memset((void *)run + run->io.data_offset, 0xaa, 4096);
		vcpu_regs_set(vm, VCPU_ID, &regs);
	}

	switch (uc.cmd) {
	case UCALL_DONE:
		break;
	case UCALL_ABORT:
		TEST_FAIL("%s at %s:%ld : argN+1 = 0x%lx, argN+2 = 0x%lx",
			  (const char *)uc.args[0], __FILE__, uc.args[1],
			  uc.args[2], uc.args[3]);
	default:
		TEST_FAIL("Unknown ucall %lu", uc.cmd);
	}

	kvm_vm_free(vm);
	return 0;
}
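The new test builds with the rest of the KVM selftests (make -C tools/testing/selftests/kvm) and, per the .gitignore and Makefile hunks above, is produced as x86_64/userspace_io_test; its stuffed RCX values exercise the path that previously hit the now-removed WARN_ON in emulator_pio_in(), serving as the regression test mentioned in the pull message.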