Commit 7b1b868e authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "Bugfixes for ARM, x86 and tools"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  tools/kvm_stat: Exempt time-based counters
  KVM: mmu: Fix SPTE encoding of MMIO generation upper half
  kvm: x86/mmu: Use cpuid to determine max gfn
  kvm: svm: de-allocate svm_cpu_data for all cpus in svm_cpu_uninit()
  selftests: kvm/set_memory_region_test: Fix race in move region test
  KVM: arm64: Add usage of stage 2 fault lookup level in user_mem_abort()
  KVM: arm64: Fix handling of merging tables into a block entry
  KVM: arm64: Fix memory leak on stage2 update of a valid PTE
parents b53966ff 111d0bda
......@@ -455,7 +455,7 @@ If the generation number of the spte does not equal the global generation
number, it will ignore the cached MMIO information and handle the page
fault through the slow path.
-Since only 19 bits are used to store generation-number on mmio spte, all
+Since only 18 bits are used to store generation-number on mmio spte, all
pages are zapped when there is an overflow.
Unfortunately, a single memory access might access kvm_memslots(kvm) multiple
......
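The documentation hunk above rests on a simple invariant: a cached MMIO spte is only trusted while its recorded generation matches the global memslots generation, and because the spte can hold just 18 generation bits, KVM zaps everything when that counter wraps. A minimal standalone C sketch of that check (illustrative names only, not KVM code):

#include <stdint.h>
#include <stdio.h>

#define GEN_BITS 18
#define GEN_MASK ((1u << GEN_BITS) - 1)        /* 0x3ffff */

/* A cached MMIO entry is only usable while the generation it recorded
 * still matches the current global generation. */
static int cache_hit(uint32_t cached_gen, uint32_t global_gen)
{
        return (cached_gen & GEN_MASK) == (global_gen & GEN_MASK);
}

int main(void)
{
        uint32_t global_gen = GEN_MASK;            /* counter about to overflow */
        uint32_t cached_gen = global_gen;          /* snapshot taken at fault time */

        global_gen = (global_gen + 1) & GEN_MASK;  /* wraps back to 0 */

        /* A wrapped counter starts reusing old values, so entries cached
         * before the wrap must not survive; zapping all pages on overflow
         * avoids a stale entry later colliding with a fresh generation. */
        printf("hit after wrap: %d\n", cache_hit(cached_gen, global_gen));
        return 0;
}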
......@@ -104,6 +104,7 @@
/* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */
#define ESR_ELx_FSC (0x3F)
#define ESR_ELx_FSC_TYPE (0x3C)
+#define ESR_ELx_FSC_LEVEL (0x03)
#define ESR_ELx_FSC_EXTABT (0x10)
#define ESR_ELx_FSC_SERROR (0x11)
#define ESR_ELx_FSC_ACCESS (0x08)
......
......@@ -350,6 +350,11 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vc
return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE;
}
+static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu)
+{
+return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL;
+}
static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
{
switch (kvm_vcpu_trap_get_fault(vcpu)) {
......
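The two arm64 header hunks above add the ESR_ELx_FSC_LEVEL mask and an accessor so that user_mem_abort() can see at which stage 2 lookup level the fault happened. A small standalone sketch of that field extraction, using the mask values from the hunk; the sample syndrome value is made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Field masks mirrored from the esr.h hunk above. */
#define ESR_ELx_FSC        0x3F   /* full fault status code        */
#define ESR_ELx_FSC_TYPE   0x3C   /* fault type (translation, ...) */
#define ESR_ELx_FSC_LEVEL  0x03   /* lookup level that faulted     */

int main(void)
{
        /* Hypothetical syndrome whose FSC is 0x06: a translation fault
         * (type 0x04) taken at lookup level 2. */
        uint64_t esr = 0x06;

        printf("fsc=%#llx type=%#llx level=%llu\n",
               (unsigned long long)(esr & ESR_ELx_FSC),
               (unsigned long long)(esr & ESR_ELx_FSC_TYPE),
               (unsigned long long)(esr & ESR_ELx_FSC_LEVEL));
        return 0;
}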
......@@ -470,6 +470,15 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
+/*
+* If the PTE was already valid, drop the refcount on the table
+* early, as it will be bumped-up again in stage2_map_walk_leaf().
+* This ensures that the refcount stays constant across a valid to
+* valid PTE update.
+*/
+if (kvm_pte_valid(*ptep))
+put_page(virt_to_page(ptep));
if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
goto out;
......@@ -493,7 +502,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
return 0;
kvm_set_invalid_pte(ptep);
-kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+/*
+* Invalidate the whole stage-2, as we may have numerous leaf
+* entries below us which would otherwise need invalidating
+* individually.
+*/
+kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
data->anchor = ptep;
return 0;
}
......
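The first pgtable.c hunk above is about a bookkeeping invariant: each valid leaf holds one reference on the table page that contains it, so a valid-to-valid update has to drop the old reference before the install path takes a new one, otherwise the count grows by one per remap. A toy model of that rule (plain integers, not kernel code; install_leaf() is an illustrative stand-in for the walker):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct table_page { int refcount; };

static void install_leaf(struct table_page *t, bool pte_was_valid)
{
        if (pte_was_valid)
                t->refcount--;   /* models the early put_page() in the hunk */
        t->refcount++;           /* models the get_page() on the install path */
}

int main(void)
{
        struct table_page t = { .refcount = 1 };  /* one leaf already mapped */

        install_leaf(&t, true);   /* remap the same IPA: valid -> valid */
        install_leaf(&t, true);   /* and again */

        assert(t.refcount == 1);  /* without the early drop this would be 3 */
        printf("refcount=%d\n", t.refcount);
        return 0;
}

The second hunk is a separate trade-off: when a table is about to be collapsed into a block, one VMID-wide TLB invalidation replaces per-IPA invalidations of every leaf that used to live under it.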
......@@ -754,10 +754,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
-unsigned long vma_pagesize;
+unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
+unsigned long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
+fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
......@@ -896,7 +898,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
-if (fault_status == FSC_PERM && !(logging_active && writable)) {
+/*
+* Under the premise of getting a FSC_PERM fault, we just need to relax
+* permissions only if vma_pagesize equals fault_granule. Otherwise,
+* kvm_pgtable_stage2_map() should be called to change block size.
+*/
+if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else {
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
......
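fault_granule in the hunk above is the amount of IPA space covered by one entry at the faulting lookup level. A standalone sketch of that arithmetic, assuming a 4KiB granule (PAGE_SHIFT == 12) and using a local stand-in for the kernel's level-shift macro rather than its header:

#include <stdio.h>

#define PAGE_SHIFT 12
/* Local stand-in for ARM64_HW_PGTABLE_LEVEL_SHIFT(): with 4KiB pages each
 * higher level covers 512 (2^9) times more address space per entry. */
#define LEVEL_SHIFT(n) ((PAGE_SHIFT - 3) * (4 - (n)) + 3)

int main(void)
{
        for (int level = 1; level <= 3; level++) {
                unsigned long long fault_granule = 1ULL << LEVEL_SHIFT(level);

                /* level 3 -> 4KiB, level 2 -> 2MiB, level 1 -> 1GiB */
                printf("level %d: granule = %llu KiB\n",
                       level, fault_granule >> 10);
        }
        return 0;
}

That granule is what the new FSC_PERM condition compares against: permissions are relaxed in place only when vma_pagesize matches it; otherwise kvm_pgtable_stage2_map() is called so the block size can change.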
......@@ -40,8 +40,8 @@ static u64 generation_mmio_spte_mask(u64 gen)
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
-mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
-mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
+mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
return mask;
}
......
......@@ -56,11 +56,11 @@
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
/*
-* Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+* Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
* the memslots generation and is derived as follows:
*
* Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
-* Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
+* Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
......@@ -69,18 +69,29 @@
* requires a full MMU zap). The flag is instead explicitly queried when
* checking for MMIO spte cache hits.
*/
-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0)
#define MMIO_SPTE_GEN_LOW_START 3
#define MMIO_SPTE_GEN_LOW_END 11
-#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
-MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
#define MMIO_SPTE_GEN_HIGH_END 62
+#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
+#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
+/* remember to adjust the comment above as well if you change these */
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
+#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
......@@ -228,8 +239,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
{
u64 gen;
-gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
-gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
+gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
+gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
return gen;
}
......
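The spte.c and spte.h hunks above replace the *_START shifts with explicit *_SHIFT values so that generation bits 9-17 really land in spte bits 54-62 and come back out again. A standalone round-trip sketch; GENMASK_ULL is re-implemented locally and the high-bits start is assumed to be 54 (the value of PT64_SECOND_AVAIL_BITS_SHIFT in this tree):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l)  ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

#define GEN_LOW_START   3
#define GEN_LOW_END     11
#define GEN_HIGH_START  54   /* assumed PT64_SECOND_AVAIL_BITS_SHIFT */
#define GEN_HIGH_END    62

#define GEN_LOW_MASK    GENMASK_ULL(GEN_LOW_END, GEN_LOW_START)
#define GEN_HIGH_MASK   GENMASK_ULL(GEN_HIGH_END, GEN_HIGH_START)
#define GEN_LOW_BITS    (GEN_LOW_END - GEN_LOW_START + 1) /* 9 */
#define GEN_LOW_SHIFT   (GEN_LOW_START - 0)               /* 3 */
#define GEN_HIGH_SHIFT  (GEN_HIGH_START - GEN_LOW_BITS)   /* 45: gen bit 9 -> spte bit 54 */

static uint64_t encode(uint64_t gen)
{
        uint64_t mask = (gen << GEN_LOW_SHIFT) & GEN_LOW_MASK;

        mask |= (gen << GEN_HIGH_SHIFT) & GEN_HIGH_MASK;
        return mask;
}

static uint64_t decode(uint64_t spte)
{
        return ((spte & GEN_LOW_MASK) >> GEN_LOW_SHIFT) |
               ((spte & GEN_HIGH_MASK) >> GEN_HIGH_SHIFT);
}

int main(void)
{
        /* A generation with bits set in both halves; shifting by
         * GEN_HIGH_START instead of GEN_HIGH_SHIFT would corrupt the
         * upper half, which is the bug the hunk fixes. */
        uint64_t gen = 0x2a5a5;

        assert(decode(encode(gen)) == gen);
        printf("gen %#llx round-trips\n", (unsigned long long)gen);
        return 0;
}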
......@@ -66,7 +66,7 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
-gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
+gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
lockdep_assert_held(&kvm->mmu_lock);
......@@ -456,7 +456,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
-gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
+gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
bool flush;
flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
......
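The tdp_mmu.c hunk above derives the zap range from shadow_phys_bits, the address width KVM's MMU actually uses for its SPTEs, instead of the host's raw boot_cpu_data.x86_phys_bits. The arithmetic itself is simple; a sketch with illustrative numbers (52 address bits, 4KiB pages):

#include <stdio.h>

int main(void)
{
        const unsigned int page_shift = 12;   /* 4KiB pages */
        const unsigned int phys_bits = 52;    /* stand-in for shadow_phys_bits */
        unsigned long long max_gfn = 1ULL << (phys_bits - page_shift);

        /* 2^40 guest frame numbers cover the whole 52-bit address space. */
        printf("max_gfn = 2^%u = %llu\n", phys_bits - page_shift, max_gfn);
        return 0;
}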
......@@ -530,12 +530,12 @@ static int svm_hardware_enable(void)
static void svm_cpu_uninit(int cpu)
{
-struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
+struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
if (!sd)
return;
-per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+per_cpu(svm_data, cpu) = NULL;
kfree(sd->sev_vmcbs);
__free_page(sd->save_area);
kfree(sd);
......
......@@ -742,7 +742,11 @@ class DebugfsProvider(Provider):
The fields are all available KVM debugfs files
"""
-return self.walkdir(PATH_DEBUGFS_KVM)[2]
+exempt_list = ['halt_poll_fail_ns', 'halt_poll_success_ns']
+fields = [field for field in self.walkdir(PATH_DEBUGFS_KVM)[2]
+if field not in exempt_list]
+return fields
def update_fields(self, fields_filter):
"""Refresh fields, applying fields_filter"""
......
......@@ -156,14 +156,23 @@ static void guest_code_move_memory_region(void)
GUEST_SYNC(0);
/*
-* Spin until the memory region is moved to a misaligned address. This
-* may or may not trigger MMIO, as the window where the memslot is
-* invalid is quite small.
+* Spin until the memory region starts getting moved to a
+* misaligned address.
+* Every region move may or may not trigger MMIO, as the
+* window where the memslot is invalid is usually quite small.
*/
val = guest_spin_on_val(0);
GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
-/* Spin until the memory region is realigned. */
+/* Spin until the misaligning memory region move completes. */
+val = guest_spin_on_val(MMIO_VAL);
+GUEST_ASSERT_1(val == 1 || val == 0, val);
+/* Spin until the memory region starts to get re-aligned. */
val = guest_spin_on_val(0);
GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+/* Spin until the re-aligning memory region move completes. */
val = guest_spin_on_val(MMIO_VAL);
GUEST_ASSERT_1(val == 1, val);
......