KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG

When using KVM_DIRTY_LOG_INITIALLY_SET, huge pages are not write-protected when dirty logging is enabled on the memslot. Instead they are write-protected once userspace invokes KVM_CLEAR_DIRTY_LOG for the first time and only for the specific sub-region being cleared. Enhance KVM_CLEAR_DIRTY_LOG to also try to split huge pages prior to write-protecting to avoid causing write-protection faults on vCPU threads. This also allows userspace to smear the cost of huge page splitting across multiple ioctls, rather than splitting the entire memslot as is the case when initially-all-set is not used. Signed-off-by: David Matlack <dmatlack@google.com> Message-Id: <20220119230739.2234394-17-dmatlack@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG
When using KVM_DIRTY_LOG_INITIALLY_SET, huge pages are not write-protected when dirty logging is enabled on the memslot. Instead they are write-protected once userspace invokes KVM_CLEAR_DIRTY_LOG for the first time and only for the specific sub-region being cleared. Enhance KVM_CLEAR_DIRTY_LOG to also try to split huge pages prior to write-protecting to avoid causing write-protection faults on vCPU threads. This also allows userspace to smear the cost of huge page splitting across multiple ioctls, rather than splitting the entire memslot as is the case when initially-all-set is not used. Signed-off-by: David Matlack <dmatlack@google.com> Message-Id: <20220119230739.2234394-17-dmatlack@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
cb00a70b · David Matlack · Paolo Bonzini · a3fe5dbd · cb00a70b · cb00a70b
Commit cb00a70b authored Jan 19, 2022 by David Matlack Committed by Paolo Bonzini Feb 10, 2022
7 changed files
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2356,7 +2356,9 @@
 			KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
 			disabled, all huge pages in a memslot will be eagerly
 			split when dirty logging is enabled on that memslot. If
-			enabled, huge pages will not be eagerly split.
+			enabled, eager page splitting will be performed during
+			the KVM_CLEAR_DIRTY ioctl, and only for the pages being
+			cleared.

 			Eager page splitting currently only supports splitting
 			huge pages mapped by the TDP MMU.

--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1590,6 +1590,10 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
 				       const struct kvm_memory_slot *memslot,
 				       int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+				  const struct kvm_memory_slot *memslot,
+				  u64 start, u64 end,
+				  int target_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,

--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1358,6 +1358,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
 		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);

+		if (READ_ONCE(eager_page_split))
+			kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
 		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

 		/* Cross two large pages? */
@@ -5830,6 +5833,22 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }

+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+				   const struct kvm_memory_slot *memslot,
+				   u64 start, u64 end,
+				   int target_level)
+{
+	if (is_tdp_mmu_enabled(kvm))
+		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+						 target_level, false);
+
+	/*
+	 * A TLB flush is unnecessary at this point for the same resons as in
+	 * kvm_mmu_slot_try_split_huge_pages().
+	 */
+}
+
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
 					const struct kvm_memory_slot *memslot,
 					int target_level)
@@ -5839,7 +5858,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,

 	if (is_tdp_mmu_enabled(kvm)) {
 		read_lock(&kvm->mmu_lock);
-		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
 		read_unlock(&kvm->mmu_lock);
 	}


--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -963,27 +963,33 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 }

 /*
- * tdp_mmu_link_sp_atomic - Atomically replace the given spte with an spte
- * pointing to the provided page table.
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @account_nx: True if this page table is being installed to split a
 *              non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
-static int tdp_mmu_link_sp_atomic(struct kvm *kvm, struct tdp_iter *iter,
-				  struct kvm_mmu_page *sp, bool account_nx)
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+			   struct kvm_mmu_page *sp, bool account_nx,
+			   bool shared)
 {
 	u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
-	int ret;
+	int ret = 0;

+	if (shared) {
 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
 		if (ret)
 			return ret;
+	} else {
+		tdp_mmu_set_spte(kvm, iter, spte);
+	}

 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
@@ -1051,7 +1057,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 			sp = tdp_mmu_alloc_sp(vcpu);
 			tdp_mmu_init_child_sp(sp, &iter);

-			if (tdp_mmu_link_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
+			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
 				tdp_mmu_free_sp(sp);
 				break;
 			}
@@ -1277,12 +1283,11 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
 }

 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
-						       struct tdp_iter *iter)
+						       struct tdp_iter *iter,
+						       bool shared)
 {
 	struct kvm_mmu_page *sp;

-	lockdep_assert_held_read(&kvm->mmu_lock);
-
 	/*
 	 * Since we are allocating while under the MMU lock we have to be
 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@@ -1297,20 +1302,27 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
 		return sp;

 	rcu_read_unlock();
+
+	if (shared)
 		read_unlock(&kvm->mmu_lock);
+	else
+		write_unlock(&kvm->mmu_lock);

 	iter->yielded = true;
 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

+	if (shared)
 		read_lock(&kvm->mmu_lock);
+	else
+		write_lock(&kvm->mmu_lock);
+
 	rcu_read_lock();

 	return sp;
 }

-static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
-					  struct tdp_iter *iter,
-					  struct kvm_mmu_page *sp)
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+				   struct kvm_mmu_page *sp, bool shared)
 {
 	const u64 huge_spte = iter->old_spte;
 	const int level = iter->level;
@@ -1333,7 +1345,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
 	 * correctness standpoint since the translation will be the same either
 	 * way.
 	 */
-	ret = tdp_mmu_link_sp_atomic(kvm, iter, sp, false);
+	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
 	if (ret)
 		return ret;

@@ -1350,7 +1362,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
 					 struct kvm_mmu_page *root,
 					 gfn_t start, gfn_t end,
-					 int target_level)
+					 int target_level, bool shared)
 {
 	struct kvm_mmu_page *sp = NULL;
 	struct tdp_iter iter;
@@ -1371,14 +1383,14 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
 	 */
 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
 retry:
-		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
 			continue;

 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
 			continue;

 		if (!sp) {
-			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
+			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
 			if (!sp) {
 				ret = -ENOMEM;
 				break;
@@ -1388,7 +1400,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
 				continue;
 		}

-		if (tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
+		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
 			goto retry;

 		sp = NULL;
@@ -1408,23 +1420,24 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
 	return ret;
 }

+
 /*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
 				      const struct kvm_memory_slot *slot,
 				      gfn_t start, gfn_t end,
-				      int target_level)
+				      int target_level, bool shared)
 {
 	struct kvm_mmu_page *root;
 	int r = 0;

-	lockdep_assert_held_read(&kvm->mmu_lock);
+	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

-	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
-		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
 		if (r) {
-			kvm_tdp_mmu_put_root(kvm, root, true);
+			kvm_tdp_mmu_put_root(kvm, root, shared);
 			break;
 		}
 	}

--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -70,7 +70,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
 				      const struct kvm_memory_slot *slot,
 				      gfn_t start, gfn_t end,
-				      int target_level);
+				      int target_level, bool shared);

 static inline void kvm_tdp_mmu_walk_lockless_begin(void)
 {

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -192,7 +192,7 @@ bool __read_mostly enable_pmu = true;
 EXPORT_SYMBOL_GPL(enable_pmu);
 module_param(enable_pmu, bool, 0444);

-static bool __read_mostly eager_page_split = true;
+bool __read_mostly eager_page_split = true;
 module_param(eager_page_split, bool, 0644);

 /*

--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -307,6 +307,8 @@ extern int pi_inject_timer;

 extern bool report_ignored_msrs;

+extern bool eager_page_split;
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
 	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,