Commit 85542adb authored by Thomas Prescher's avatar Thomas Prescher Committed by Sean Christopherson

KVM: x86: Add KVM_RUN_X86_GUEST_MODE kvm_run flag

When a vCPU is interrupted by a signal while running a nested guest,
KVM will exit to userspace with L2 state. However, userspace has no
way to know whether it sees L1 or L2 state (besides calling
KVM_GET_STATS_FD, which does not have a stable ABI).

This causes multiple problems:

The simplest one is L2 state corruption when userspace marks the sregs
as dirty. See this mailing list thread [1] for a complete discussion.

Another problem is that if userspace decides to continue by emulating
instructions, it will unknowingly emulate with L2 state as if L1
doesn't exist, which can be considered a weird guest escape.

Introduce a new flag, KVM_RUN_X86_GUEST_MODE, in the kvm_run data
structure, which is set when the vCPU exited while running a nested
guest.  Also introduce a new capability, KVM_CAP_X86_GUEST_MODE, to
advertise the functionality to userspace.

[1] https://lore.kernel.org/kvm/20240416123558.212040-1-julian.stecklina@cyberus-technology.de/T/#m280aadcb2e10ae02c191a7dc4ed4b711a74b1f55Signed-off-by: default avatarThomas Prescher <thomas.prescher@cyberus-technology.de>
Signed-off-by: default avatarJulian Stecklina <julian.stecklina@cyberus-technology.de>
Link: https://lore.kernel.org/r/20240508132502.184428-1-julian.stecklina@cyberus-technology.deSigned-off-by: default avatarSean Christopherson <seanjc@google.com>
parent 508f0c7b
...@@ -6419,6 +6419,9 @@ affect the device's behavior. Current defined flags:: ...@@ -6419,6 +6419,9 @@ affect the device's behavior. Current defined flags::
#define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_SMM (1 << 0)
/* x86, set if bus lock detected in VM */ /* x86, set if bus lock detected in VM */
#define KVM_RUN_X86_BUS_LOCK (1 << 1) #define KVM_RUN_X86_BUS_LOCK (1 << 1)
/* x86, set if the VCPU is executing a nested (L2) guest */
#define KVM_RUN_X86_GUEST_MODE (1 << 2)
/* arm64, set for KVM_EXIT_DEBUG */ /* arm64, set for KVM_EXIT_DEBUG */
#define KVM_DEBUG_ARCH_HSR_HIGH_VALID (1 << 0) #define KVM_DEBUG_ARCH_HSR_HIGH_VALID (1 << 0)
...@@ -8089,6 +8092,20 @@ by KVM_CHECK_EXTENSION. ...@@ -8089,6 +8092,20 @@ by KVM_CHECK_EXTENSION.
Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the
core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest. core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest.
7.36 KVM_CAP_X86_GUEST_MODE
------------------------------
:Architectures: x86
:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
The presence of this capability indicates that KVM_RUN will update the
KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the
vCPU was executing nested guest code when it exited.
KVM exits with the register state of either the L1 or L2 guest
depending on which executed at the time of an exit. Userspace must
take care to differentiate between these cases.
8. Other capabilities. 8. Other capabilities.
====================== ======================
......
...@@ -106,6 +106,7 @@ struct kvm_ioapic_state { ...@@ -106,6 +106,7 @@ struct kvm_ioapic_state {
#define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_SMM (1 << 0)
#define KVM_RUN_X86_BUS_LOCK (1 << 1) #define KVM_RUN_X86_BUS_LOCK (1 << 1)
#define KVM_RUN_X86_GUEST_MODE (1 << 2)
/* for KVM_GET_REGS and KVM_SET_REGS */ /* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs { struct kvm_regs {
......
...@@ -4704,6 +4704,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) ...@@ -4704,6 +4704,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_MEMORY_FAULT_INFO: case KVM_CAP_MEMORY_FAULT_INFO:
case KVM_CAP_X86_GUEST_MODE:
r = 1; r = 1;
break; break;
case KVM_CAP_X86_APIC_BUS_CYCLES_NS: case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
...@@ -10277,6 +10278,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) ...@@ -10277,6 +10278,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
if (is_smm(vcpu)) if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM; kvm_run->flags |= KVM_RUN_X86_SMM;
if (is_guest_mode(vcpu))
kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
} }
static void update_cr8_intercept(struct kvm_vcpu *vcpu) static void update_cr8_intercept(struct kvm_vcpu *vcpu)
......
...@@ -918,6 +918,7 @@ struct kvm_enable_cap { ...@@ -918,6 +918,7 @@ struct kvm_enable_cap {
#define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235 #define KVM_CAP_VM_TYPES 235
#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 236 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 236
#define KVM_CAP_X86_GUEST_MODE 237
struct kvm_irq_routing_irqchip { struct kvm_irq_routing_irqchip {
__u32 irqchip; __u32 irqchip;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment