KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size

THP paths can defer splitting compound pages until after the actual remap and TLB flushes to split a huge PMD/PUD. This causes radix partition scope page table mappings to get out of synch with the host qemu page table mappings. This results in random memory corruption in the guest when running with THP. The easiest way to reproduce is use KVM balloon to free up a lot of memory in the guest and then shrink the balloon to give the memory back, while some work is being done in the guest. Cc: David Gibson <david@gibson.dropbear.id.au> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> Cc: kvm-ppc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size
THP paths can defer splitting compound pages until after the actual remap and TLB flushes to split a huge PMD/PUD. This causes radix partition scope page table mappings to get out of synch with the host qemu page table mappings. This results in random memory corruption in the guest when running with THP. The easiest way to reproduce is use KVM balloon to free up a lot of memory in the guest and then shrink the balloon to give the memory back, while some work is being done in the guest. Cc: David Gibson <david@gibson.dropbear.id.au> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> Cc: kvm-ppc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
71d29f43 · Nicholas Piggin · Paul Mackerras · 425333bf · 71d29f43
Commit 71d29f43 authored Sep 11, 2018 by Nicholas Piggin Committed by Paul Mackerras Sep 12, 2018
Show whitespace changes
Inline Side-by-side

Showing with 37 additions and 54 deletions

arch/powerpc/kvm/book3s_64_mmu_radix.c arch/powerpc/kvm/book3s_64_mmu_radix.c +37 -54

No files found.
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -525,8 +525,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				   unsigned long ea, unsigned long dsisr)
 {
 	struct kvm *kvm = vcpu->kvm;
-	unsigned long mmu_seq, pte_size;
-	unsigned long gpa, gfn, hva, pfn;
+	unsigned long mmu_seq;
+	unsigned long gpa, gfn, hva;
 	struct kvm_memory_slot *memslot;
 	struct page *page = NULL;
 	long ret;
@@ -623,9 +623,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 */
 	hva = gfn_to_hva_memslot(memslot, gfn);
 	if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
-		pfn = page_to_pfn(page);
 		upgrade_write = true;
 	} else {
+		unsigned long pfn;
+
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
 					   writing, upgrade_p);
@@ -639,41 +640,16 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 	}

-	/* See if we can insert a 1GB or 2MB large PTE here */
-	level = 0;
-	if (page && PageCompound(page)) {
-		pte_size = PAGE_SIZE << compound_order(compound_head(page));
-		if (pte_size >= PUD_SIZE &&
-		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
-		    (hva & (PUD_SIZE - PAGE_SIZE))) {
-			level = 2;
-			pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
-		} else if (pte_size >= PMD_SIZE &&
-			   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
-			   (hva & (PMD_SIZE - PAGE_SIZE))) {
-			level = 1;
-			pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
-		}
-	}
-
-	/*
-	 * Compute the PTE value that we need to insert.
-	 */
-	if (page) {
-		pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
-			_PAGE_ACCESSED;
-		if (writing || upgrade_write)
-			pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
-		pte = pfn_pte(pfn, __pgprot(pgflags));
-	} else {
 	/*
 	 * Read the PTE from the process' radix tree and use that
-		 * so we get the attribute bits.
+	 * so we get the shift and attribute bits.
 	 */
 	local_irq_disable();
 	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
 	pte = *ptep;
 	local_irq_enable();
+
+	/* Get pte level from shift/size */
 	if (shift == PUD_SHIFT &&
 	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
 	    (hva & (PUD_SIZE - PAGE_SIZE))) {
@@ -682,11 +658,19 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
 		   (hva & (PMD_SIZE - PAGE_SIZE))) {
 		level = 1;
-		} else if (shift && shift != PAGE_SHIFT) {
-			/* Adjust PFN */
-			unsigned long mask = (1ul << shift) - PAGE_SIZE;
-			pte = __pte(pte_val(pte) | (hva & mask));
+	} else {
+		level = 0;
+		if (shift > PAGE_SHIFT) {
+			/*
+			 * If the pte maps more than one page, bring over
+			 * bits from the virtual address to get the real
+			 * address of the specific single page we want.
+			 */
+			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+			pte = __pte(pte_val(pte) | (hva & rpnmask));
+		}
 	}
+
 	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
 	if (writing || upgrade_write) {
 		if (pte_val(pte) & _PAGE_WRITE)
@@ -694,7 +678,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	} else {
 		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
 	}
-	}

 	/* Allocate space in the tree and write the PTE */
 	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);