Commit 71d7c575 authored by Paolo Bonzini's avatar Paolo Bonzini

Merge branch 'kvm-fixes-for-5.18-rc5' into HEAD

Fixes for (relatively) old bugs, to be merged in both the -rc and next
development trees.

The merge reconciles the ABI fixes for KVM_EXIT_SYSTEM_EVENT between
5.18 and commit c24a950e ("KVM, SEV: Add KVM_EXIT_SHUTDOWN metadata
for SEV-ES", 2022-04-13).
parents 5d6c7de6 44187235
......@@ -6089,21 +6089,18 @@ should put the acknowledged interrupt vector into the 'epr' field.
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
#define KVM_SYSTEM_EVENT_SEV_TERM 4
#define KVM_SYSTEM_EVENT_NDATA_VALID (1u << 31)
__u32 type;
__u32 ndata;
__u64 flags;
__u64 data[16];
} system_event;
If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
a system-level event using some architecture specific mechanism (hypercall
or some special instruction). In case of ARM64, this is triggered using
HVC instruction based PSCI call from the vcpu. The 'type' field describes
the system-level event type. The 'flags' field describes architecture
specific flags for the system-level event.
HVC instruction based PSCI call from the vcpu.
Valid values for bits 30:0 of 'type' are:
The 'type' field describes the system-level event type.
Valid values for 'type' are:
- KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
VM. Userspace is not obliged to honour this, and if it does honour
......@@ -6119,14 +6116,20 @@ Valid values for bits 30:0 of 'type' are:
- KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination.
The guest physical address of the guest's GHCB is stored in `data[0]`.
Valid flags are:
If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
architecture specific information for the system-level event. Only
the first `ndata` items (possibly zero) of the data array are valid.
- for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if
the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI
specification.
- KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued
a SYSTEM_RESET2 call according to v1.1 of the PSCI specification.
- for RISC-V, data[0] is set to the value of the second argument of the
``sbi_system_reset`` call.
Extra data for this event is stored in the `data[]` array, up to index
`ndata-1` included, if bit 31 is set in `type`. The data depends on the
`type` field. There is no extra data if bit 31 is clear or `ndata` is zero.
Previous versions of Linux defined a `flags` member in this struct. The
field is now aliased to `data[0]`. Userspace can assume that it is only
written if ndata is greater than 0.
::
......
......@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
vcpu->run->system_event.type = type;
vcpu->run->system_event.flags = flags;
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = flags;
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
}
......
......@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
struct kvm_run *run,
u32 type, u64 flags)
u32 type, u64 reason)
{
unsigned long i;
struct kvm_vcpu *tmp;
......@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
memset(&run->system_event, 0, sizeof(run->system_event));
run->system_event.type = type;
run->system_event.flags = flags;
run->system_event.ndata = 1;
run->system_event.data[0] = reason;
run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
}
......
......@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
/*
* The number of non-reserved physical address bits irrespective of features
* that repurpose legal bits, e.g. MKTME.
*/
extern u8 __read_mostly shadow_phys_bits;
static inline gfn_t kvm_mmu_max_gfn(void)
{
/*
* Note that this uses the host MAXPHYADDR, not the guest's.
* EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
* assuming KVM is running on bare metal, guest accesses beyond
* host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
* (either EPT Violation/Misconfig or #NPF), and so KVM will never
* install a SPTE for such addresses. If KVM is running as a VM
* itself, on the other hand, it might see a MAXPHYADDR that is less
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
* disallows such SPTEs entirely and simplifies the TDP MMU.
*/
int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
......
......@@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
const struct kvm_memory_slot *slot)
{
unsigned long hva;
pte_t *pte;
int level;
unsigned long flags;
int level = PG_LEVEL_4K;
pgd_t pgd;
p4d_t p4d;
pud_t pud;
pmd_t pmd;
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
return PG_LEVEL_4K;
......@@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);
pte = lookup_address_in_mm(kvm->mm, hva, &level);
if (unlikely(!pte))
return PG_LEVEL_4K;
/*
* Lookup the mapping level in the current mm. The information
* may become stale soon, but it is safe to use as long as
* 1) mmu_notifier_retry was checked after taking mmu_lock, and
* 2) mmu_lock is taken now.
*
* We still need to disable IRQs to prevent concurrent tear down
* of page tables.
*/
local_irq_save(flags);
pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
if (pgd_none(pgd))
goto out;
p4d = READ_ONCE(*p4d_offset(&pgd, hva));
if (p4d_none(p4d) || !p4d_present(p4d))
goto out;
pud = READ_ONCE(*pud_offset(&p4d, hva));
if (pud_none(pud) || !pud_present(pud))
goto out;
if (pud_large(pud)) {
level = PG_LEVEL_1G;
goto out;
}
pmd = READ_ONCE(*pmd_offset(&pud, hva));
if (pmd_none(pmd) || !pmd_present(pmd))
goto out;
if (pmd_large(pmd))
level = PG_LEVEL_2M;
out:
local_irq_restore(flags);
return level;
}
......@@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
* MMIO SPTE will just be an expensive nop.
* MMIO SPTE will just be an expensive nop. Do not cache MMIO
* whose gfn is greater than host.MAXPHYADDR, any guest that
* generates such gfns is running nested and is being tricked
* by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
* and only if L1's MAXPHYADDR is inaccurate with respect to
* the hardware's).
*/
if (unlikely(!shadow_mmio_value)) {
if (unlikely(!shadow_mmio_value) ||
unlikely(fault->gfn > kvm_mmu_max_gfn())) {
*ret_val = RET_PF_EMULATE;
return true;
}
......
......@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
/*
* The number of non-reserved physical address bits irrespective of features
* that repurpose legal bits, e.g. MKTME.
*/
extern u8 __read_mostly shadow_phys_bits;
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
......
......@@ -815,14 +815,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return iter->yielded;
}
static inline gfn_t tdp_mmu_max_gfn_host(void)
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
/*
* Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
* will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
* and so KVM will never install a SPTE for such addresses.
* Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
* a gpa range that would exceed the max gfn, and KVM does not create
* MMIO SPTEs for "impossible" gfns, instead sending such accesses down
* the slow emulation path every time.
*/
return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
return kvm_mmu_max_gfn() + 1;
}
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
......@@ -830,7 +831,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
gfn_t end = tdp_mmu_max_gfn_host();
gfn_t end = tdp_mmu_max_gfn_exclusive();
gfn_t start = 0;
for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
......@@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
end = min(end, tdp_mmu_max_gfn_host());
end = min(end, tdp_mmu_max_gfn_exclusive());
lockdep_assert_held_write(&kvm->mmu_lock);
......
......@@ -2739,10 +2739,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
reason_set, reason_code);
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM |
KVM_SYSTEM_EVENT_NDATA_VALID;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[1] = control->ghcb_gpa;
vcpu->run->system_event.data[0] = control->ghcb_gpa;
return 0;
}
......
......@@ -10056,12 +10056,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
......@@ -12059,8 +12061,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
return kvm_alloc_memslot_metadata(kvm, new);
}
if (change == KVM_MR_FLAGS_ONLY)
memcpy(&new->arch, &old->arch, sizeof(old->arch));
......
......@@ -445,10 +445,14 @@ struct kvm_run {
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
#define KVM_SYSTEM_EVENT_SEV_TERM 4
#define KVM_SYSTEM_EVENT_NDATA_VALID (1u << 31)
__u32 type;
__u32 ndata;
__u64 data[16];
union {
#ifndef __KERNEL__
__u64 flags;
#endif
__u64 data[16];
};
} system_event;
/* KVM_EXIT_S390_STSI */
struct {
......@@ -1148,6 +1152,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_PMU_CAPABILITY 212
#define KVM_CAP_DISABLE_QUIRKS2 213
#define KVM_CAP_VM_TSC_CONTROL 214
#define KVM_CAP_SYSTEM_EVENT_DATA 215
#ifdef KVM_CAP_IRQ_ROUTING
......
......@@ -4333,6 +4333,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return 0;
#endif
case KVM_CAP_BINARY_STATS_FD:
case KVM_CAP_SYSTEM_EVENT_DATA:
return 1;
default:
break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment