Commit 1b6c146d authored by Paolo Bonzini's avatar Paolo Bonzini

Merge tag 'kvm-x86-fixes-6.8-2' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes for 6.8, round 2:

 - When emulating an atomic access, mark the gfn as dirty in the memslot
   to fix a bug where KVM could fail to mark the slot as dirty during live
   migration, ultimately resulting in guest data corruption due to a dirty
   page not being re-copied from the source to the target.

 - Check for mmu_notifier invalidation events before faulting in the pfn,
   and before acquiring mmu_lock, to avoid unnecessary work and lock
   contention.  Contending mmu_lock is especially problematic on preemptible
   kernels, as KVM may yield mmu_lock in response to the contention, which
   severely degrades overall performance due to vCPUs making it difficult
   for the task that triggered invalidation to make forward progress.

   Note, due to another kernel bug, this fix isn't limited to preemtible
   kernels, as any kernel built with CONFIG_PREEMPT_DYNAMIC=y will yield
   contended rwlocks and spinlocks.

   https://lore.kernel.org/all/20240110214723.695930-1-seanjc@google.com
parents 5ef1d8c1 d02c357e
...@@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, ...@@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb(); smp_rmb();
/*
* Check for a relevant mmu_notifier invalidation event before getting
* the pfn from the primary MMU, and before acquiring mmu_lock.
*
* For mmu_lock, if there is an in-progress invalidation and the kernel
* allows preemption, the invalidation task may drop mmu_lock and yield
* in response to mmu_lock being contended, which is *very* counter-
* productive as this vCPU can't actually make forward progress until
* the invalidation completes.
*
* Retrying now can also avoid unnessary lock contention in the primary
* MMU, as the primary MMU doesn't necessarily hold a single lock for
* the duration of the invalidation, i.e. faulting in a conflicting pfn
* can cause the invalidation to take longer by holding locks that are
* needed to complete the invalidation.
*
* Do the pre-check even for non-preemtible kernels, i.e. even if KVM
* will never yield mmu_lock in response to contention, as this vCPU is
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
if (fault->slot &&
mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
ret = __kvm_faultin_pfn(vcpu, fault); ret = __kvm_faultin_pfn(vcpu, fault);
if (ret != RET_PF_CONTINUE) if (ret != RET_PF_CONTINUE)
return ret; return ret;
...@@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, ...@@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (unlikely(!fault->slot)) if (unlikely(!fault->slot))
return kvm_handle_noslot_fault(vcpu, fault, access); return kvm_handle_noslot_fault(vcpu, fault, access);
/*
* Check again for a relevant mmu_notifier invalidation event purely to
* avoid contending mmu_lock. Most invalidations will be detected by
* the previous check, but checking is extremely cheap relative to the
* overall cost of failing to detect the invalidation until after
* mmu_lock is acquired.
*/
if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
kvm_release_pfn_clean(fault->pfn);
return RET_PF_RETRY;
}
return RET_PF_CONTINUE; return RET_PF_CONTINUE;
} }
...@@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, ...@@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
return true; return true;
/*
* Check for a relevant mmu_notifier invalidation event one last time
* now that mmu_lock is held, as the "unsafe" checks performed without
* holding mmu_lock can get false negatives.
*/
return fault->slot && return fault->slot &&
mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
} }
......
...@@ -8007,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, ...@@ -8007,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (r < 0) if (r < 0)
return X86EMUL_UNHANDLEABLE; return X86EMUL_UNHANDLEABLE;
/*
* Mark the page dirty _before_ checking whether or not the CMPXCHG was
* successful, as the old value is written back on failure. Note, for
* live migration, this is unnecessarily conservative as CMPXCHG writes
* back the original value and the access is atomic, but KVM's ABI is
* that all writes are dirty logged, regardless of the value written.
*/
kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
if (r) if (r)
return X86EMUL_CMPXCHG_FAILED; return X86EMUL_CMPXCHG_FAILED;
......
...@@ -2031,6 +2031,32 @@ static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, ...@@ -2031,6 +2031,32 @@ static inline int mmu_invalidate_retry_gfn(struct kvm *kvm,
return 1; return 1;
return 0; return 0;
} }
/*
* This lockless version of the range-based retry check *must* be paired with a
* call to the locked version after acquiring mmu_lock, i.e. this is safe to
* use only as a pre-check to avoid contending mmu_lock. This version *will*
* get false negatives and false positives.
*/
static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
unsigned long mmu_seq,
gfn_t gfn)
{
/*
* Use READ_ONCE() to ensure the in-progress flag and sequence counter
* are always read from memory, e.g. so that checking for retry in a
* loop won't result in an infinite retry loop. Don't force loads for
* start+end, as the key to avoiding infinite retry loops is observing
* the 1=>0 transition of in-progress, i.e. getting false negatives
* due to stale start+end values is acceptable.
*/
if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) &&
gfn >= kvm->mmu_invalidate_range_start &&
gfn < kvm->mmu_invalidate_range_end)
return true;
return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
}
#endif #endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment