Commit 2de4085c authored by Ben Gardon's avatar Ben Gardon Committed by Paolo Bonzini

KVM: x86/MMU: Recursively zap nested TDP SPs when zapping last/only parent

Recursively zap all to-be-orphaned children, unsynced or otherwise, when
zapping a shadow page for a nested TDP MMU.  KVM currently only zaps the
unsynced child pages, but not the synced ones.  This can create problems
over time when running many nested guests because it leaves unlinked
pages which will not be freed until the page quota is hit. With the
default page quota of 20 shadow pages per 1000 guest pages, this looks
like a memory leak and can degrade MMU performance.

In a recent benchmark, substantial performance degradation was observed:
An L1 guest was booted with 64G memory.
2G nested Windows guests were booted, 10 at a time for 20
iterations. (200 total boots)
Windows was used in this benchmark because they touch all of their
memory on startup.
By the end of the benchmark, the nested guests were taking ~10% longer
to boot. With this patch there is no degradation in boot time.
Without this patch the benchmark ends with hundreds of thousands of
stale EPT02 pages cluttering up rmaps and the page hash map. As a
result, VM shutdown is also much slower: deleting memslot 0 was
observed to take over a minute. With this patch it takes just a
few miliseconds.

Cc: Peter Shier <pshier@google.com>
Signed-off-by: default avatarBen Gardon <bgardon@google.com>
Co-developed-by: default avatarSean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: default avatarSean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200923221406.16297-3-sean.j.christopherson@intel.com>
Reviewed-by: default avatarBen Gardon <bgardon@google.com>
Signed-off-by: default avatarPaolo Bonzini <pbonzini@redhat.com>
parent ace569e0
......@@ -2615,8 +2615,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
}
static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *spte)
/* Returns the number of zapped non-leaf child shadow pages. */
static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *spte, struct list_head *invalid_list)
{
u64 pte;
struct kvm_mmu_page *child;
......@@ -2630,19 +2631,34 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
} else {
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
/*
* Recursively zap nested TDP SPs, parentless SPs are
* unlikely to be used again in the near future. This
* avoids retaining a large number of stale nested SPs.
*/
if (tdp_enabled && invalid_list &&
child->role.guest_mode && !child->parent_ptes.val)
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
} else if (is_mmio_spte(pte)) {
mmu_spte_clear_no_track(spte);
}
return 0;
}
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
static int kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
int zapped = 0;
unsigned i;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
mmu_page_zap_pte(kvm, sp, sp->spt + i);
zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
return zapped;
}
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
......@@ -2688,7 +2704,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
......@@ -5396,7 +5412,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte);
mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
if (gentry &&
!((sp->role.word ^ base_role) & ~role_ign.word) &&
rmap_can_add(vcpu))
......
......@@ -929,7 +929,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
pte_gpa = FNAME(get_level1_sp_gpa)(sp);
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
mmu_page_zap_pte(vcpu->kvm, sp, sptep);
mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
if (is_shadow_present_pte(old_spte))
kvm_flush_remote_tlbs_with_address(vcpu->kvm,
sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment