Commit 73331c5d authored by Paolo Bonzini's avatar Paolo Bonzini

Merge branch 'kvm-fixes-for-5.18-rc5' into HEAD

Fixes for (relatively) old bugs, to be merged in both the -rc and next
development trees:

* Fix potential races when walking host page table

* Fix bad user ABI for KVM_EXIT_SYSTEM_EVENT

* Fix shadow page table leak when KVM runs nested
parents 484c22df 44187235
...@@ -5986,16 +5986,16 @@ should put the acknowledged interrupt vector into the 'epr' field. ...@@ -5986,16 +5986,16 @@ should put the acknowledged interrupt vector into the 'epr' field.
#define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3 #define KVM_SYSTEM_EVENT_CRASH 3
__u32 type; __u32 type;
__u64 flags; __u32 ndata;
__u64 data[16];
} system_event; } system_event;
If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
a system-level event using some architecture specific mechanism (hypercall a system-level event using some architecture specific mechanism (hypercall
or some special instruction). In case of ARM64, this is triggered using or some special instruction). In case of ARM64, this is triggered using
HVC instruction based PSCI call from the vcpu. The 'type' field describes HVC instruction based PSCI call from the vcpu.
the system-level event type. The 'flags' field describes architecture
specific flags for the system-level event.
The 'type' field describes the system-level event type.
Valid values for 'type' are: Valid values for 'type' are:
- KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
...@@ -6010,10 +6010,20 @@ Valid values for 'type' are: ...@@ -6010,10 +6010,20 @@ Valid values for 'type' are:
to ignore the request, or to gather VM memory core dump and/or to ignore the request, or to gather VM memory core dump and/or
reset/shutdown of the VM. reset/shutdown of the VM.
Valid flags are: If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
architecture specific information for the system-level event. Only
the first `ndata` items (possibly zero) of the data array are valid.
- for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if
the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI
specification.
- for RISC-V, data[0] is set to the value of the second argument of the
``sbi_system_reset`` call.
- KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued Previous versions of Linux defined a `flags` member in this struct. The
a SYSTEM_RESET2 call according to v1.1 of the PSCI specification. field is now aliased to `data[0]`. Userspace can assume that it is only
written if ndata is greater than 0.
:: ::
......
...@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags) ...@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
vcpu->run->system_event.type = type; vcpu->run->system_event.type = type;
vcpu->run->system_event.flags = flags; vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = flags;
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
} }
......
...@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) ...@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
struct kvm_run *run, struct kvm_run *run,
u32 type, u64 flags) u32 type, u64 reason)
{ {
unsigned long i; unsigned long i;
struct kvm_vcpu *tmp; struct kvm_vcpu *tmp;
...@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, ...@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
memset(&run->system_event, 0, sizeof(run->system_event)); memset(&run->system_event, 0, sizeof(run->system_event));
run->system_event.type = type; run->system_event.type = type;
run->system_event.flags = flags; run->system_event.ndata = 1;
run->system_event.data[0] = reason;
run->exit_reason = KVM_EXIT_SYSTEM_EVENT; run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
} }
......
...@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e) ...@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s; return ((2ULL << (e - s)) - 1) << s;
} }
/*
* The number of non-reserved physical address bits irrespective of features
* that repurpose legal bits, e.g. MKTME.
*/
extern u8 __read_mostly shadow_phys_bits;
static inline gfn_t kvm_mmu_max_gfn(void)
{
/*
* Note that this uses the host MAXPHYADDR, not the guest's.
* EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
* assuming KVM is running on bare metal, guest accesses beyond
* host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
* (either EPT Violation/Misconfig or #NPF), and so KVM will never
* install a SPTE for such addresses. If KVM is running as a VM
* itself, on the other hand, it might see a MAXPHYADDR that is less
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
* disallows such SPTEs entirely and simplifies the TDP MMU.
*/
int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
......
...@@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, ...@@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
const struct kvm_memory_slot *slot) const struct kvm_memory_slot *slot)
{ {
unsigned long hva; unsigned long hva;
pte_t *pte; unsigned long flags;
int level; int level = PG_LEVEL_4K;
pgd_t pgd;
p4d_t p4d;
pud_t pud;
pmd_t pmd;
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
return PG_LEVEL_4K; return PG_LEVEL_4K;
...@@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, ...@@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
*/ */
hva = __gfn_to_hva_memslot(slot, gfn); hva = __gfn_to_hva_memslot(slot, gfn);
pte = lookup_address_in_mm(kvm->mm, hva, &level); /*
if (unlikely(!pte)) * Lookup the mapping level in the current mm. The information
return PG_LEVEL_4K; * may become stale soon, but it is safe to use as long as
* 1) mmu_notifier_retry was checked after taking mmu_lock, and
* 2) mmu_lock is taken now.
*
* We still need to disable IRQs to prevent concurrent tear down
* of page tables.
*/
local_irq_save(flags);
pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
if (pgd_none(pgd))
goto out;
p4d = READ_ONCE(*p4d_offset(&pgd, hva));
if (p4d_none(p4d) || !p4d_present(p4d))
goto out;
pud = READ_ONCE(*pud_offset(&p4d, hva));
if (pud_none(pud) || !pud_present(pud))
goto out;
if (pud_large(pud)) {
level = PG_LEVEL_1G;
goto out;
}
pmd = READ_ONCE(*pmd_offset(&pud, hva));
if (pmd_none(pmd) || !pmd_present(pmd))
goto out;
if (pmd_large(pmd))
level = PG_LEVEL_2M;
out:
local_irq_restore(flags);
return level; return level;
} }
...@@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa ...@@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
/* /*
* If MMIO caching is disabled, emulate immediately without * If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an * touching the shadow page tables as attempting to install an
* MMIO SPTE will just be an expensive nop. * MMIO SPTE will just be an expensive nop. Do not cache MMIO
*/ * whose gfn is greater than host.MAXPHYADDR, any guest that
if (unlikely(!shadow_mmio_value)) { * generates such gfns is running nested and is being tricked
* by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
* and only if L1's MAXPHYADDR is inaccurate with respect to
* the hardware's).
*/
if (unlikely(!shadow_mmio_value) ||
unlikely(fault->gfn > kvm_mmu_max_gfn())) {
*ret_val = RET_PF_EMULATE; *ret_val = RET_PF_EMULATE;
return true; return true;
} }
......
...@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte) ...@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
*/ */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
/*
* The number of non-reserved physical address bits irrespective of features
* that repurpose legal bits, e.g. MKTME.
*/
extern u8 __read_mostly shadow_phys_bits;
static inline bool is_mmio_spte(u64 spte) static inline bool is_mmio_spte(u64 spte)
{ {
return (spte & shadow_mmio_mask) == shadow_mmio_value && return (spte & shadow_mmio_mask) == shadow_mmio_value &&
......
...@@ -815,14 +815,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, ...@@ -815,14 +815,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return iter->yielded; return iter->yielded;
} }
static inline gfn_t tdp_mmu_max_gfn_host(void) static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{ {
/* /*
* Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
* will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF, * a gpa range that would exceed the max gfn, and KVM does not create
* and so KVM will never install a SPTE for such addresses. * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
* the slow emulation path every time.
*/ */
return 1ULL << (shadow_phys_bits - PAGE_SHIFT); return kvm_mmu_max_gfn() + 1;
} }
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
...@@ -830,7 +831,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, ...@@ -830,7 +831,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{ {
struct tdp_iter iter; struct tdp_iter iter;
gfn_t end = tdp_mmu_max_gfn_host(); gfn_t end = tdp_mmu_max_gfn_exclusive();
gfn_t start = 0; gfn_t start = 0;
for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
...@@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, ...@@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
{ {
struct tdp_iter iter; struct tdp_iter iter;
end = min(end, tdp_mmu_max_gfn_host()); end = min(end, tdp_mmu_max_gfn_exclusive());
lockdep_assert_held_write(&kvm->mmu_lock); lockdep_assert_held_write(&kvm->mmu_lock);
......
...@@ -10020,12 +10020,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ...@@ -10020,12 +10020,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
vcpu->run->system_event.ndata = 0;
r = 0; r = 0;
goto out; goto out;
} }
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
vcpu->run->system_event.ndata = 0;
r = 0; r = 0;
goto out; goto out;
} }
...@@ -12009,8 +12011,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, ...@@ -12009,8 +12011,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new, struct kvm_memory_slot *new,
enum kvm_mr_change change) enum kvm_mr_change change)
{ {
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
return kvm_alloc_memslot_metadata(kvm, new); return kvm_alloc_memslot_metadata(kvm, new);
}
if (change == KVM_MR_FLAGS_ONLY) if (change == KVM_MR_FLAGS_ONLY)
memcpy(&new->arch, &old->arch, sizeof(old->arch)); memcpy(&new->arch, &old->arch, sizeof(old->arch));
......
...@@ -445,7 +445,13 @@ struct kvm_run { ...@@ -445,7 +445,13 @@ struct kvm_run {
#define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3 #define KVM_SYSTEM_EVENT_CRASH 3
__u32 type; __u32 type;
__u32 ndata;
union {
#ifndef __KERNEL__
__u64 flags; __u64 flags;
#endif
__u64 data[16];
};
} system_event; } system_event;
/* KVM_EXIT_S390_STSI */ /* KVM_EXIT_S390_STSI */
struct { struct {
...@@ -1144,6 +1150,8 @@ struct kvm_ppc_resize_hpt { ...@@ -1144,6 +1150,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_MEM_OP_EXTENSION 211 #define KVM_CAP_S390_MEM_OP_EXTENSION 211
#define KVM_CAP_PMU_CAPABILITY 212 #define KVM_CAP_PMU_CAPABILITY 212
#define KVM_CAP_DISABLE_QUIRKS2 213 #define KVM_CAP_DISABLE_QUIRKS2 213
/* #define KVM_CAP_VM_TSC_CONTROL 214 */
#define KVM_CAP_SYSTEM_EVENT_DATA 215
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
......
...@@ -4354,6 +4354,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) ...@@ -4354,6 +4354,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return 0; return 0;
#endif #endif
case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_BINARY_STATS_FD:
case KVM_CAP_SYSTEM_EVENT_DATA:
return 1; return 1;
default: default:
break; break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment