Commit b94742c9 authored by Junaid Shahid, committed by Paolo Bonzini

kvm: x86: Add multi-entry LRU cache for previous CR3s

Adds support for storing multiple previous CR3/root_hpa pairs maintained
as an LRU cache, so that the lockless CR3 switch path can be used when
switching back to any of them.
Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent faff8758
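
As a rough illustration of the swap-based LRU scheme introduced here (see cached_root_available() in the mmu.c hunk below), the following minimal userspace sketch models the cache walk. The struct, constants, and sample CR3/HPA values are simplified stand-ins rather than the kernel's types, and the page-role check performed by the real function is omitted.

/*
 * Minimal userspace sketch of the swap-based LRU walk; an illustration,
 * not the kernel implementation.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_PREV_ROOTS 3
#define INVALID_PAGE   (~0ULL)

struct root_info {
	uint64_t cr3;   /* guest CR3 this root was built for */
	uint64_t hpa;   /* host physical address of the root page */
};

static struct root_info current_root = { .cr3 = 0x1000, .hpa = 0xa000 };
static struct root_info prev_roots[NUM_PREV_ROOTS] = {
	{ .cr3 = 0x2000, .hpa = 0xb000 },   /* most recently used */
	{ .cr3 = 0x3000, .hpa = 0xc000 },
	{ .cr3 = 0x4000, .hpa = 0xd000 },   /* least recently used */
};

#define SWAP(a, b) do { struct root_info t = (a); (a) = (b); (b) = t; } while (0)

/*
 * Walk the cache looking for a root that matches new_cr3 while pushing the
 * outgoing current root into it. Each iteration swaps the candidate with a
 * cache slot, so entries in front of a hit slide down one position and the
 * hit (or, on a miss, the least recently used entry) ends up in current_root.
 */
static bool cached_root_available(uint64_t new_cr3)
{
	unsigned int i;
	struct root_info root = current_root;

	for (i = 0; i < NUM_PREV_ROOTS; i++) {
		SWAP(root, prev_roots[i]);
		if (root.cr3 == new_cr3 && root.hpa != INVALID_PAGE)
			break;
	}

	current_root = root;
	return i < NUM_PREV_ROOTS;  /* false: caller must free current_root.hpa */
}

int main(void)
{
	bool hit = cached_root_available(0x3000);  /* switch to CR3 0x3000 */
	unsigned int i;

	printf("hit: %d, current cr3: %#llx, current hpa: %#llx\n", hit,
	       (unsigned long long)current_root.cr3,
	       (unsigned long long)current_root.hpa);
	for (i = 0; i < NUM_PREV_ROOTS; i++)
		printf("prev_roots[%u]: cr3=%#llx\n", i,
		       (unsigned long long)prev_roots[i].cr3);
	return 0;
}

Running the sketch reports a hit for CR3 0x3000: the matching entry becomes the current root, the old current root becomes the most recently used cache entry, and the entries behind the hit keep their relative order.
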
@@ -335,6 +335,8 @@ struct kvm_mmu_root_info {
 #define KVM_MMU_ROOT_INFO_INVALID \
 	((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE })
 
+#define KVM_MMU_NUM_PREV_ROOTS 3
+
 /*
  * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
  * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
@@ -363,7 +365,7 @@ struct kvm_mmu {
 	u8 shadow_root_level;
 	u8 ept_ad;
 	bool direct_map;
-	struct kvm_mmu_root_info prev_root;
+	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 
 	/*
 	 * Bitmap; bit set = permission fault
@@ -1296,7 +1298,7 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
 }
 
 #define KVM_MMU_ROOT_CURRENT		BIT(0)
-#define KVM_MMU_ROOT_PREVIOUS		BIT(1)
+#define KVM_MMU_ROOT_PREVIOUS(i)	BIT(1+i)
 #define KVM_MMU_ROOTS_ALL		(~0UL)
 
 int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
......
@@ -3445,17 +3445,25 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
 	LIST_HEAD(invalid_list);
 	struct kvm_mmu *mmu = &vcpu->arch.mmu;
 	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
-	bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS;
+
+	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
 
 	/* Before acquiring the MMU lock, see if we need to do any real work. */
-	if (!(free_active_root && VALID_PAGE(mmu->root_hpa)) &&
-	    !(free_prev_root && VALID_PAGE(mmu->prev_root.hpa)))
-		return;
+	if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+			if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+			    VALID_PAGE(mmu->prev_roots[i].hpa))
+				break;
+
+		if (i == KVM_MMU_NUM_PREV_ROOTS)
+			return;
+	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 
-	if (free_prev_root)
-		mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa,
-				   &invalid_list);
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+			mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+					   &invalid_list);
 
 	if (free_active_root) {
@@ -4064,6 +4072,38 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->nx = false;
 }
 
+/*
+ * Find out if a previously cached root matching the new CR3/role is available.
+ * The current root is also inserted into the cache.
+ * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
+ * returned.
+ * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
+ * false is returned. This root should now be freed by the caller.
+ */
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+				  union kvm_mmu_page_role new_role)
+{
+	uint i;
+	struct kvm_mmu_root_info root;
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+	root.cr3 = mmu->get_cr3(vcpu);
+	root.hpa = mmu->root_hpa;
+
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+		swap(root, mmu->prev_roots[i]);
+		if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
+		    page_header(root.hpa) != NULL &&
+		    new_role.word == page_header(root.hpa)->role.word)
+			break;
+	}
+
+	mmu->root_hpa = root.hpa;
+
+	return i < KVM_MMU_NUM_PREV_ROOTS;
+}
+
 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			    union kvm_mmu_page_role new_role,
 			    bool skip_tlb_flush)
@@ -4077,18 +4117,10 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 	 */
 	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
 	    mmu->root_level >= PT64_ROOT_4LEVEL) {
-		gpa_t prev_cr3 = mmu->prev_root.cr3;
-
 		if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
 			return false;
 
-		swap(mmu->root_hpa, mmu->prev_root.hpa);
-		mmu->prev_root.cr3 = mmu->get_cr3(vcpu);
-
-		if (new_cr3 == prev_cr3 &&
-		    VALID_PAGE(mmu->root_hpa) &&
-		    page_header(mmu->root_hpa) != NULL &&
-		    new_role.word == page_header(mmu->root_hpa)->role.word) {
+		if (cached_root_available(vcpu, new_cr3, new_role)) {
 			/*
 			 * It is possible that the cached previous root page is
 			 * obsolete because of a change in the MMU
@@ -4854,8 +4886,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
 {
 	if (reset_roots) {
+		uint i;
+
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-		vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID;
+
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+			vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 	}
 
 	if (mmu_is_nested(vcpu))
@@ -5225,6 +5261,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	int i;
 
 	/* INVLPG on a * non-canonical address is a NOP according to the SDM. */
 	if (is_noncanonical_address(gva, vcpu))
@@ -5235,16 +5272,17 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 	/*
 	 * INVLPG is required to invalidate any global mappings for the VA,
 	 * irrespective of PCID. Since it would take us roughly similar amount
-	 * of work to determine whether the prev_root mapping of the VA is
-	 * marked global, or to just sync it blindly, so we might as well just
-	 * always sync it.
+	 * of work to determine whether any of the prev_root mappings of the VA
+	 * is marked global, or to just sync it blindly, so we might as well
+	 * just always sync it.
 	 *
-	 * Mappings not reachable via the current cr3 or the prev_root.cr3 will
-	 * be synced when switching to that cr3, so nothing needs to be done
-	 * here for them.
+	 * Mappings not reachable via the current cr3 or the prev_roots will be
+	 * synced when switching to that cr3, so nothing needs to be done here
+	 * for them.
 	 */
-	if (VALID_PAGE(mmu->prev_root.hpa))
-		mmu->invlpg(vcpu, gva, mmu->prev_root.hpa);
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+		if (VALID_PAGE(mmu->prev_roots[i].hpa))
+			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
 
 	kvm_x86_ops->tlb_flush_gva(vcpu, gva);
 	++vcpu->stat.invlpg;
@@ -5255,17 +5293,20 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 {
 	struct kvm_mmu *mmu = &vcpu->arch.mmu;
 	bool tlb_flush = false;
+	uint i;
 
 	if (pcid == kvm_get_active_pcid(vcpu)) {
 		mmu->invlpg(vcpu, gva, mmu->root_hpa);
 		tlb_flush = true;
 	}
 
-	if (VALID_PAGE(mmu->prev_root.hpa) &&
-	    pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) {
-		mmu->invlpg(vcpu, gva, mmu->prev_root.hpa);
-		tlb_flush = true;
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+			tlb_flush = true;
+		}
 	}
 
 	if (tlb_flush)
 		kvm_x86_ops->tlb_flush_gva(vcpu, gva);
@@ -5273,9 +5314,9 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	++vcpu->stat.invlpg;
 
 	/*
-	 * Mappings not reachable via the current cr3 or the prev_root.cr3 will
-	 * be synced when switching to that cr3, so nothing needs to be done
-	 * here for them.
+	 * Mappings not reachable via the current cr3 or the prev_roots will be
+	 * synced when switching to that cr3, so nothing needs to be done here
+	 * for them.
 	 */
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
@@ -5321,12 +5362,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
+	uint i;
+
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
 	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+		vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
 	return alloc_mmu_pages(vcpu);
 }
......
@@ -8788,6 +8788,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 	bool pcid_enabled;
 	gva_t gva;
 	struct x86_exception e;
+	unsigned i;
+	unsigned long roots_to_free = 0;
 	struct {
 		u64 pcid;
 		u64 gla;
@@ -8846,12 +8848,14 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		}
 
-		if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3)
-		    == operand.pcid)
-			kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_PREVIOUS);
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+			if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+			    == operand.pcid)
+				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+		kvm_mmu_free_roots(vcpu, roots_to_free);
 
 		/*
-		 * If neither the current cr3 nor the prev_root.cr3 use the
+		 * If neither the current cr3 nor any of the prev_roots use the
 		 * given PCID, then nothing needs to be done here because a
 		 * resync will happen anyway before switching to any other CR3.
 		 */
......