Commit 6e949ddb authored by Paolo Bonzini

Merge branch 'kvm-tdpmmu-fixes' into kvm-master

Merge topic branch with fixes for both 5.14-rc6 and 5.15.
parents c5e2bf0b ce25681d
@@ -25,10 +25,10 @@ On x86:

 - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock

-- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock is
-  taken inside kvm->arch.mmu_lock, and cannot be taken without already
-  holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
-  there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
+- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock and
+  kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
+  cannot be taken without already holding kvm->arch.mmu_lock (typically with
+  ``read_lock`` for the TDP MMU, thus the need for additional spinlocks).

 Everything else is a leaf: no other lock is taken inside the critical
 sections.
......
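To make the documented ordering concrete, here is a minimal, hypothetical sketch of the nesting. It is illustrative only and not code from this commit: the function name is made up, only the lock fields come from the patch, and the rwlock itself is struct kvm's mmu_lock field.

/*
 * Hypothetical illustration of the documented hierarchy: mmu_lock is the
 * outer rwlock, and both spinlocks are leaves that may only be taken while
 * mmu_lock is already held (for read in the TDP MMU fast paths).
 */
static void lock_nesting_example(struct kvm *kvm)
{
        read_lock(&kvm->mmu_lock);                      /* outer lock, shared */

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);       /* leaf lock */
        /* ... walk/update TDP MMU page lists ... */
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

        spin_lock(&kvm->arch.mmu_unsync_pages_lock);    /* new leaf lock */
        /* ... mark shadow pages unsync ... */
        spin_unlock(&kvm->arch.mmu_unsync_pages_lock);

        read_unlock(&kvm->mmu_lock);
}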
@@ -1038,6 +1038,13 @@ struct kvm_arch {
         struct list_head lpage_disallowed_mmu_pages;
         struct kvm_page_track_notifier_node mmu_sp_tracker;
         struct kvm_page_track_notifier_head track_notifier_head;
+        /*
+         * Protects marking pages unsync during page faults, as TDP MMU page
+         * faults only take mmu_lock for read.  For simplicity, the unsync
+         * pages lock is always taken when marking pages unsync regardless of
+         * whether mmu_lock is held for read or write.
+         */
+        spinlock_t mmu_unsync_pages_lock;
         struct list_head assigned_dev_head;
         struct iommu_domain *iommu_domain;
......
@@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
 {
         struct kvm_mmu_page *sp;
+        bool locked = false;

         /*
          * Force write-protection if the page is being tracked.  Note, the page
@@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
                 if (sp->unsync)
                         continue;

+                /*
+                 * TDP MMU page faults require an additional spinlock as they
+                 * run with mmu_lock held for read, not write, and the unsync
+                 * logic is not thread safe.  Take the spinlock regardless of
+                 * the MMU type to avoid extra conditionals/parameters, there's
+                 * no meaningful penalty if mmu_lock is held for write.
+                 */
+                if (!locked) {
+                        locked = true;
+                        spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+
+                        /*
+                         * Recheck after taking the spinlock, a different vCPU
+                         * may have since marked the page unsync.  A false
+                         * positive on the unprotected check above is not
+                         * possible as clearing sp->unsync _must_ hold mmu_lock
+                         * for write, i.e. unsync cannot transition from 1->0
+                         * while this CPU holds mmu_lock for read (or write).
+                         */
+                        if (READ_ONCE(sp->unsync))
+                                continue;
+                }
+
                 WARN_ON(sp->role.level != PG_LEVEL_4K);
                 kvm_unsync_page(vcpu, sp);
         }
+        if (locked)
+                spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);

         /*
          * We need to ensure that the marking of unsync pages is visible
@@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
 {
         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

+        spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+
         if (!kvm_mmu_init_tdp_mmu(kvm))
                 /*
                  * No smp_load/store wrappers needed here as we are in
......
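The mmu_try_to_unsync_pages() hunk above is a lazy double-checked locking pattern: the spinlock is taken at most once, only when the first page actually needs to be marked, and the unsync flag is rechecked under the lock. Below is a distilled sketch of the same pattern with generic placeholder names (struct item, mark_item(), etc. are not KVM symbols), assuming every writer that sets the flag does so under the same spinlock.

struct item {
        bool marked;
        /* ... payload ... */
};

/* Sketch only: assumes all writers set ->marked while holding 'lock'. */
static void mark_unmarked_items(struct item *items, int nr, spinlock_t *lock)
{
        bool locked = false;
        int i;

        for (i = 0; i < nr; i++) {
                if (items[i].marked)            /* cheap unlocked check */
                        continue;

                if (!locked) {
                        locked = true;
                        spin_lock(lock);        /* taken lazily, at most once */

                        /*
                         * Recheck under the lock: another thread may have
                         * marked this item after the unlocked check above.
                         * Once the lock is held, no further concurrent
                         * marking can happen, so later iterations need no
                         * recheck.
                         */
                        if (READ_ONCE(items[i].marked))
                                continue;
                }
                mark_item(&items[i]);           /* placeholder: sets ->marked */
        }

        if (locked)
                spin_unlock(lock);
}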
@@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
         if (!kvm->arch.tdp_mmu_enabled)
                 return;

+        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
         WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

         /*
@@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
                           bool shared)
 {
-        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
         kvm_lockdep_assert_mmu_lock_held(kvm, shared);

         if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
@@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
         list_del_rcu(&root->link);
         spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

-        zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+        zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);

         call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
@@ -724,13 +723,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                           gfn_t start, gfn_t end, bool can_yield, bool flush,
                           bool shared)
 {
+        gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+        bool zap_all = (start == 0 && end >= max_gfn_host);
         struct tdp_iter iter;

+        /*
+         * No need to try to step down in the iterator when zapping all SPTEs,
+         * zapping the top-level non-leaf SPTEs will recurse on their children.
+         */
+        int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
+
+        /*
+         * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
+         * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
+         * and so KVM will never install a SPTE for such addresses.
+         */
+        end = min(end, max_gfn_host);
+
         kvm_lockdep_assert_mmu_lock_held(kvm, shared);

         rcu_read_lock();

-        tdp_root_for_each_pte(iter, root, start, end) {
+        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
+                                   min_level, start, end) {
 retry:
                 if (can_yield &&
                     tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
@@ -744,9 +759,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                 /*
                  * If this is a non-last-level SPTE that covers a larger range
                  * than should be zapped, continue, and zap the mappings at a
-                 * lower level.
+                 * lower level, except when zapping all SPTEs.
                  */
-                if ((iter.gfn < start ||
+                if (!zap_all &&
+                    (iter.gfn < start ||
                      iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
                     !is_last_spte(iter.old_spte, iter.level))
                         continue;
@@ -794,12 +810,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 {
-        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
         bool flush = false;
         int i;

         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-                flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+                flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
                                                   flush, false);

         if (flush)
@@ -838,7 +853,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
  */
 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
 {
-        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
         struct kvm_mmu_page *next_root;
         struct kvm_mmu_page *root;
         bool flush = false;
@@ -854,8 +868,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
                 rcu_read_unlock();

-                flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
-                                      true);
+                flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);

                 /*
                  * Put the reference acquired in
......
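The tdp_mmu.c hunks above move the shadow_phys_bits-derived bound out of the callers: they now pass -1ull for "zap everything", and zap_gfn_range() clamps the range itself and detects the zap-all case. A rough worked example of that arithmetic follows, using illustrative values that are not taken from this commit (PAGE_SHIFT == 12 and a host with shadow_phys_bits == 46).

/*
 * Illustrative values only (not from this commit): PAGE_SHIFT == 12 and a
 * 46-bit host MAXPHYADDR.
 */
gfn_t max_gfn_host = 1ULL << (46 - 12);   /* 0x400000000: first GFN the host cannot map */
gfn_t start = 0, end = -1ull;             /* callers' new "zap everything" arguments */
bool zap_all = (start == 0 && end >= max_gfn_host);   /* true */

end = min(end, max_gfn_host);             /* the walk is still bounded at host.MAXPHYADDR */

With zap_all true, min_level is the root level, so the iterator never steps down: zapping each top-level SPTE recursively tears down everything beneath it, as the in-line comment in the hunk explains.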