Merge tag 'kvm-arm-for-4.7' of...

Merge tag 'kvm-arm-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD KVM/ARM Changes for Linux v4.7 Reworks our stage 2 page table handling to have page table manipulation macros separate from those of the host systems as the underlying hardware page tables can be configured to be noticeably different in layout from the stage 1 page tables used by the host. Adds 16K page size support based on the above. Adds a generic firmware probing layer for the timer and GIC so that KVM initializes using the same logic based on both ACPI and FDT. Finally adds support for hardware updating of the access flag.

Merge tag 'kvm-arm-for-4.7' of...
Merge tag 'kvm-arm-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD KVM/ARM Changes for Linux v4.7 Reworks our stage 2 page table handling to have page table manipulation macros separate from those of the host systems as the underlying hardware page tables can be configured to be noticeably different in layout from the stage 1 page tables used by the host. Adds 16K page size support based on the above. Adds a generic firmware probing layer for the timer and GIC so that KVM initializes using the same logic based on both ACPI and FDT. Finally adds support for hardware updating of the access flag.
bdb4094e · Paolo Bonzini · 6ac0f61f · 06485053 · bdb4094e · bdb4094e
Commit bdb4094e authored May 11, 2016 by Paolo Bonzini
25 changed files
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -47,6 +47,7 @@
 #include <linux/highmem.h>
 #include <asm/cacheflush.h>
 #include <asm/pgalloc.h>
+#include <asm/stage2_pgtable.h>

 int create_hyp_mappings(void *from, void *to);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
@@ -105,14 +106,16 @@ static inline void kvm_clean_pte(pte_t *pte)
 	clean_pte_table(pte);
 }

-static inline void kvm_set_s2pte_writable(pte_t *pte)
+static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
 {
-	pte_val(*pte) |= L_PTE_S2_RDWR;
+	pte_val(pte) |= L_PTE_S2_RDWR;
+	return pte;
 }

-static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
 {
-	pmd_val(*pmd) |= L_PMD_S2_RDWR;
+	pmd_val(pmd) |= L_PMD_S2_RDWR;
+	return pmd;
 }

 static inline void kvm_set_s2pte_readonly(pte_t *pte)
@@ -135,22 +138,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
 }

-
-/* Open coded p*d_addr_end that can deal with 64bit addresses */
-#define kvm_pgd_addr_end(addr, end)					\
-({	u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;		\
-	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
-})
-
-#define kvm_pud_addr_end(addr,end)		(end)
-
-#define kvm_pmd_addr_end(addr, end)					\
-({	u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK;		\
-	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
-})
-
-#define kvm_pgd_index(addr)			pgd_index(addr)
-
 static inline bool kvm_page_empty(void *ptr)
 {
 	struct page *ptr_page = virt_to_page(ptr);
@@ -159,19 +146,11 @@ static inline bool kvm_page_empty(void *ptr)

 #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
 #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
-#define kvm_pud_table_empty(kvm, pudp) (0)
-
-#define KVM_PREALLOC_LEVEL	0
+#define kvm_pud_table_empty(kvm, pudp) false

-static inline void *kvm_get_hwpgd(struct kvm *kvm)
-{
-	return kvm->arch.pgd;
-}
-
-static inline unsigned int kvm_get_hwpgd_size(void)
-{
-	return PTRS_PER_S2_PGD * sizeof(pgd_t);
-}
+#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
+#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
+#define hyp_pud_table_empty(pudp) false

 struct kvm;


--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
+/*
+ * Copyright (C) 2016 - ARM Ltd
+ *
+ * stage2 page table helpers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM_S2_PGTABLE_H_
+#define __ARM_S2_PGTABLE_H_
+
+#define stage2_pgd_none(pgd)			pgd_none(pgd)
+#define stage2_pgd_clear(pgd)			pgd_clear(pgd)
+#define stage2_pgd_present(pgd)			pgd_present(pgd)
+#define stage2_pgd_populate(pgd, pud)		pgd_populate(NULL, pgd, pud)
+#define stage2_pud_offset(pgd, address)		pud_offset(pgd, address)
+#define stage2_pud_free(pud)			pud_free(NULL, pud)
+
+#define stage2_pud_none(pud)			pud_none(pud)
+#define stage2_pud_clear(pud)			pud_clear(pud)
+#define stage2_pud_present(pud)			pud_present(pud)
+#define stage2_pud_populate(pud, pmd)		pud_populate(NULL, pud, pmd)
+#define stage2_pmd_offset(pud, address)		pmd_offset(pud, address)
+#define stage2_pmd_free(pmd)			pmd_free(NULL, pmd)
+
+#define stage2_pud_huge(pud)			pud_huge(pud)
+
+/* Open coded p*d_addr_end that can deal with 64bit addresses */
+static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
+
+	return (boundary - 1 < end - 1) ? boundary : end;
+}
+
+#define stage2_pud_addr_end(addr, end)		(end)
+
+static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK;
+
+	return (boundary - 1 < end - 1) ? boundary : end;
+}
+
+#define stage2_pgd_index(addr)				pgd_index(addr)
+
+#define stage2_pte_table_empty(ptep)			kvm_page_empty(ptep)
+#define stage2_pmd_table_empty(pmdp)			kvm_page_empty(pmdp)
+#define stage2_pud_table_empty(pudp)			false
+
+#endif	/* __ARM_S2_PGTABLE_H_ */
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -448,7 +448,7 @@ static void update_vttbr(struct kvm *kvm)
 	kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;

 	/* update vttbr to be used with the new vmid */
-	pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
+	pgd_phys = virt_to_phys(kvm->arch.pgd);
 	BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
 	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
 	kvm->arch.vttbr = pgd_phys | vmid;

--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -43,11 +43,9 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;

+#define S2_PGD_SIZE	(PTRS_PER_S2_PGD * sizeof(pgd_t))
 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

-#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
-#define kvm_pud_huge(_x)	pud_huge(_x)
-
 #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)

@@ -69,14 +67,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)

 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
-	/*
-	 * This function also gets called when dealing with HYP page
-	 * tables. As HYP doesn't have an associated struct kvm (and
-	 * the HYP page tables are fairly static), we don't do
-	 * anything there.
-	 */
-	if (kvm)
-		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }

 /*
@@ -115,7 +106,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
 */
 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 {
-	if (!kvm_pmd_huge(*pmd))
+	if (!pmd_thp_or_huge(*pmd))
 		return;

 	pmd_clear(pmd);
@@ -155,29 +146,29 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 	return p;
 }

-static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 {
-	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
-	pgd_clear(pgd);
+	pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
+	stage2_pgd_clear(pgd);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pud_free(NULL, pud_table);
+	stage2_pud_free(pud_table);
 	put_page(virt_to_page(pgd));
 }

-static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	VM_BUG_ON(pud_huge(*pud));
-	pud_clear(pud);
+	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
+	VM_BUG_ON(stage2_pud_huge(*pud));
+	stage2_pud_clear(pud);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pmd_free(NULL, pmd_table);
+	stage2_pmd_free(pmd_table);
 	put_page(virt_to_page(pud));
 }

-static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(kvm_pmd_huge(*pmd));
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
 	pmd_clear(pmd);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
 	pte_free_kernel(NULL, pte_table);
@@ -204,7 +195,7 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 * the IO subsystem will never hit in the cache.
 */
-static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
+static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t start_addr = addr;
@@ -226,21 +217,21 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);

-	if (kvm_pte_table_empty(kvm, start_pte))
-		clear_pmd_entry(kvm, pmd, start_addr);
+	if (stage2_pte_table_empty(start_pte))
+		clear_stage2_pmd_entry(kvm, pmd, start_addr);
 }

-static void unmap_pmds(struct kvm *kvm, pud_t *pud,
+static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t next, start_addr = addr;
 	pmd_t *pmd, *start_pmd;

-	start_pmd = pmd = pmd_offset(pud, addr);
+	start_pmd = pmd = stage2_pmd_offset(pud, addr);
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
 				pmd_t old_pmd = *pmd;

 				pmd_clear(pmd);
@@ -250,57 +241,64 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud,

 				put_page(virt_to_page(pmd));
 			} else {
-				unmap_ptes(kvm, pmd, addr, next);
+				unmap_stage2_ptes(kvm, pmd, addr, next);
 			}
 		}
 	} while (pmd++, addr = next, addr != end);

-	if (kvm_pmd_table_empty(kvm, start_pmd))
-		clear_pud_entry(kvm, pud, start_addr);
+	if (stage2_pmd_table_empty(start_pmd))
+		clear_stage2_pud_entry(kvm, pud, start_addr);
 }

-static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
+static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t next, start_addr = addr;
 	pud_t *pud, *start_pud;

-	start_pud = pud = pud_offset(pgd, addr);
+	start_pud = pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
-			if (pud_huge(*pud)) {
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
+			if (stage2_pud_huge(*pud)) {
 				pud_t old_pud = *pud;

-				pud_clear(pud);
+				stage2_pud_clear(pud);
 				kvm_tlb_flush_vmid_ipa(kvm, addr);
-
 				kvm_flush_dcache_pud(old_pud);
-
 				put_page(virt_to_page(pud));
 			} else {
-				unmap_pmds(kvm, pud, addr, next);
+				unmap_stage2_pmds(kvm, pud, addr, next);
 			}
 		}
 	} while (pud++, addr = next, addr != end);

-	if (kvm_pud_table_empty(kvm, start_pud))
-		clear_pgd_entry(kvm, pgd, start_addr);
+	if (stage2_pud_table_empty(start_pud))
+		clear_stage2_pgd_entry(kvm, pgd, start_addr);
 }

-
-static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
-			phys_addr_t start, u64 size)
+/**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm:   The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size:  The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 {
 	pgd_t *pgd;
 	phys_addr_t addr = start, end = start + size;
 	phys_addr_t next;

-	pgd = pgdp + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
-		next = kvm_pgd_addr_end(addr, end);
-		if (!pgd_none(*pgd))
-			unmap_puds(kvm, pgd, addr, next);
+		next = stage2_pgd_addr_end(addr, end);
+		if (!stage2_pgd_none(*pgd))
+			unmap_stage2_puds(kvm, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }

@@ -322,11 +320,11 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
 	pmd_t *pmd;
 	phys_addr_t next;

-	pmd = pmd_offset(pud, addr);
+	pmd = stage2_pmd_offset(pud, addr);
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd))
+			if (pmd_thp_or_huge(*pmd))
 				kvm_flush_dcache_pmd(*pmd);
 			else
 				stage2_flush_ptes(kvm, pmd, addr, next);
@@ -340,11 +338,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
 	pud_t *pud;
 	phys_addr_t next;

-	pud = pud_offset(pgd, addr);
+	pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
-			if (pud_huge(*pud))
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
+			if (stage2_pud_huge(*pud))
 				kvm_flush_dcache_pud(*pud);
 			else
 				stage2_flush_pmds(kvm, pud, addr, next);
@@ -360,9 +358,9 @@ static void stage2_flush_memslot(struct kvm *kvm,
 	phys_addr_t next;
 	pgd_t *pgd;

-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
-		next = kvm_pgd_addr_end(addr, end);
+		next = stage2_pgd_addr_end(addr, end);
 		stage2_flush_puds(kvm, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
@@ -391,6 +389,100 @@ static void stage2_flush_vm(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, idx);
 }

+static void clear_hyp_pgd_entry(pgd_t *pgd)
+{
+	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+	pgd_clear(pgd);
+	pud_free(NULL, pud_table);
+	put_page(virt_to_page(pgd));
+}
+
+static void clear_hyp_pud_entry(pud_t *pud)
+{
+	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+	VM_BUG_ON(pud_huge(*pud));
+	pud_clear(pud);
+	pmd_free(NULL, pmd_table);
+	put_page(virt_to_page(pud));
+}
+
+static void clear_hyp_pmd_entry(pmd_t *pmd)
+{
+	pte_t *pte_table = pte_offset_kernel(pmd, 0);
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
+	pmd_clear(pmd);
+	pte_free_kernel(NULL, pte_table);
+	put_page(virt_to_page(pmd));
+}
+
+static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+	pte_t *pte, *start_pte;
+
+	start_pte = pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte)) {
+			kvm_set_pte(pte, __pte(0));
+			put_page(virt_to_page(pte));
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	if (hyp_pte_table_empty(start_pte))
+		clear_hyp_pmd_entry(pmd);
+}
+
+static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pmd_t *pmd, *start_pmd;
+
+	start_pmd = pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		/* Hyp doesn't use huge pmds */
+		if (!pmd_none(*pmd))
+			unmap_hyp_ptes(pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+
+	if (hyp_pmd_table_empty(start_pmd))
+		clear_hyp_pud_entry(pud);
+}
+
+static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pud_t *pud, *start_pud;
+
+	start_pud = pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		/* Hyp doesn't use huge puds */
+		if (!pud_none(*pud))
+			unmap_hyp_pmds(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+
+	if (hyp_pud_table_empty(start_pud))
+		clear_hyp_pgd_entry(pgd);
+}
+
+static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+	pgd_t *pgd;
+	phys_addr_t addr = start, end = start + size;
+	phys_addr_t next;
+
+	/*
+	 * We don't unmap anything from HYP, except at the hyp tear down.
+	 * Hence, we don't have to invalidate the TLBs here.
+	 */
+	pgd = pgdp + pgd_index(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (!pgd_none(*pgd))
+			unmap_hyp_puds(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
 /**
 * free_boot_hyp_pgd - free HYP boot page tables
 *
@@ -401,14 +493,14 @@ void free_boot_hyp_pgd(void)
 	mutex_lock(&kvm_hyp_pgd_mutex);

 	if (boot_hyp_pgd) {
-		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+		unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
 		boot_hyp_pgd = NULL;
 	}

 	if (hyp_pgd)
-		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);

 	mutex_unlock(&kvm_hyp_pgd_mutex);
 }
@@ -433,9 +525,9 @@ void free_hyp_pgds(void)

 	if (hyp_pgd) {
 		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);

 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 		hyp_pgd = NULL;
@@ -645,20 +737,6 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 }

-/* Free the HW pgd, one page at a time */
-static void kvm_free_hwpgd(void *hwpgd)
-{
-	free_pages_exact(hwpgd, kvm_get_hwpgd_size());
-}
-
-/* Allocate the HW PGD, making sure that each page gets its own refcount */
-static void *kvm_alloc_hwpgd(void)
-{
-	unsigned int size = kvm_get_hwpgd_size();
-
-	return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
-}
-
 /**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
@@ -673,81 +751,22 @@ static void *kvm_alloc_hwpgd(void)
 int kvm_alloc_stage2_pgd(struct kvm *kvm)
 {
 	pgd_t *pgd;
-	void *hwpgd;

 	if (kvm->arch.pgd != NULL) {
 		kvm_err("kvm_arch already initialized?\n");
 		return -EINVAL;
 	}

-	hwpgd = kvm_alloc_hwpgd();
-	if (!hwpgd)
+	/* Allocate the HW PGD, making sure that each page gets its own refcount */
+	pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+	if (!pgd)
 		return -ENOMEM;

-	/* When the kernel uses more levels of page tables than the
-	 * guest, we allocate a fake PGD and pre-populate it to point
-	 * to the next-level page table, which will be the real
-	 * initial page table pointed to by the VTTBR.
-	 *
-	 * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
-	 * the PMD and the kernel will use folded pud.
-	 * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
-	 * pages.
-	 */
-	if (KVM_PREALLOC_LEVEL > 0) {
-		int i;
-
-		/*
-		 * Allocate fake pgd for the page table manipulation macros to
-		 * work.  This is not used by the hardware and we have no
-		 * alignment requirement for this allocation.
-		 */
-		pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
-				GFP_KERNEL | __GFP_ZERO);
-
-		if (!pgd) {
-			kvm_free_hwpgd(hwpgd);
-			return -ENOMEM;
-		}
-
-		/* Plug the HW PGD into the fake one. */
-		for (i = 0; i < PTRS_PER_S2_PGD; i++) {
-			if (KVM_PREALLOC_LEVEL == 1)
-				pgd_populate(NULL, pgd + i,
-					     (pud_t *)hwpgd + i * PTRS_PER_PUD);
-			else if (KVM_PREALLOC_LEVEL == 2)
-				pud_populate(NULL, pud_offset(pgd, 0) + i,
-					     (pmd_t *)hwpgd + i * PTRS_PER_PMD);
-		}
-	} else {
-		/*
-		 * Allocate actual first-level Stage-2 page table used by the
-		 * hardware for Stage-2 page table walks.
-		 */
-		pgd = (pgd_t *)hwpgd;
-	}
-
 	kvm_clean_pgd(pgd);
 	kvm->arch.pgd = pgd;
 	return 0;
 }

-/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size:  The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
-{
-	unmap_range(kvm, kvm->arch.pgd, start, size);
-}
-
 static void stage2_unmap_memslot(struct kvm *kvm,
 				 struct kvm_memory_slot *memslot)
 {
@@ -830,10 +849,8 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 		return;

 	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
-	kvm_free_hwpgd(kvm_get_hwpgd(kvm));
-	if (KVM_PREALLOC_LEVEL > 0)
-		kfree(kvm->arch.pgd);
-
+	/* Free the HW pgd, one page at a time */
+	free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
 	kvm->arch.pgd = NULL;
 }

@@ -843,16 +860,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pgd_t *pgd;
 	pud_t *pud;

-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
-	if (WARN_ON(pgd_none(*pgd))) {
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+	if (WARN_ON(stage2_pgd_none(*pgd))) {
 		if (!cache)
 			return NULL;
 		pud = mmu_memory_cache_alloc(cache);
-		pgd_populate(NULL, pgd, pud);
+		stage2_pgd_populate(pgd, pud);
 		get_page(virt_to_page(pgd));
 	}

-	return pud_offset(pgd, addr);
+	return stage2_pud_offset(pgd, addr);
 }

 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
@@ -862,15 +879,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pmd_t *pmd;

 	pud = stage2_get_pud(kvm, cache, addr);
-	if (pud_none(*pud)) {
+	if (stage2_pud_none(*pud)) {
 		if (!cache)
 			return NULL;
 		pmd = mmu_memory_cache_alloc(cache);
-		pud_populate(NULL, pud, pmd);
+		stage2_pud_populate(pud, pmd);
 		get_page(virt_to_page(pud));
 	}

-	return pmd_offset(pud, addr);
+	return stage2_pmd_offset(pud, addr);
 }

 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
@@ -893,11 +910,14 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));

 	old_pmd = *pmd;
-	kvm_set_pmd(pmd, *new_pmd);
-	if (pmd_present(old_pmd))
+	if (pmd_present(old_pmd)) {
+		pmd_clear(pmd);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	else
+	} else {
 		get_page(virt_to_page(pmd));
+	}
+
+	kvm_set_pmd(pmd, *new_pmd);
 	return 0;
 }

@@ -946,15 +966,38 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,

 	/* Create 2nd stage page table mapping - Level 3 */
 	old_pte = *pte;
-	kvm_set_pte(pte, *new_pte);
-	if (pte_present(old_pte))
+	if (pte_present(old_pte)) {
+		kvm_set_pte(pte, __pte(0));
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	else
+	} else {
 		get_page(virt_to_page(pte));
+	}

+	kvm_set_pte(pte, *new_pte);
 	return 0;
 }

+#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	if (pte_young(*pte)) {
+		*pte = pte_mkold(*pte);
+		return 1;
+	}
+	return 0;
+}
+#else
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	return __ptep_test_and_clear_young(pte);
+}
+#endif
+
+static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+{
+	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+}
+
 /**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
@@ -978,7 +1021,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);

 		if (writable)
-			kvm_set_s2pte_writable(&pte);
+			pte = kvm_s2pte_mkwrite(pte);

 		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
 						KVM_NR_MEM_OBJS);
@@ -1078,12 +1121,12 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
 	pmd_t *pmd;
 	phys_addr_t next;

-	pmd = pmd_offset(pud, addr);
+	pmd = stage2_pmd_offset(pud, addr);

 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
 				if (!kvm_s2pmd_readonly(pmd))
 					kvm_set_s2pmd_readonly(pmd);
 			} else {
@@ -1106,12 +1149,12 @@ static void  stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
 	pud_t *pud;
 	phys_addr_t next;

-	pud = pud_offset(pgd, addr);
+	pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
 			/* TODO:PUD not supported, revisit later if supported */
-			BUG_ON(kvm_pud_huge(*pud));
+			BUG_ON(stage2_pud_huge(*pud));
 			stage2_wp_pmds(pud, addr, next);
 		}
 	} while (pud++, addr = next, addr != end);
@@ -1128,7 +1171,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 	pgd_t *pgd;
 	phys_addr_t next;

-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
 		/*
 		 * Release kvm_mmu_lock periodically if the memory region is
@@ -1140,8 +1183,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
 			cond_resched_lock(&kvm->mmu_lock);

-		next = kvm_pgd_addr_end(addr, end);
-		if (pgd_present(*pgd))
+		next = stage2_pgd_addr_end(addr, end);
+		if (stage2_pgd_present(*pgd))
 			stage2_wp_puds(pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
@@ -1320,7 +1363,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
 		new_pmd = pmd_mkhuge(new_pmd);
 		if (writable) {
-			kvm_set_s2pmd_writable(&new_pmd);
+			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
@@ -1329,7 +1372,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		pte_t new_pte = pfn_pte(pfn, mem_type);

 		if (writable) {
-			kvm_set_s2pte_writable(&new_pte);
+			new_pte = kvm_s2pte_mkwrite(new_pte);
 			kvm_set_pfn_dirty(pfn);
 			mark_page_dirty(kvm, gfn);
 		}
@@ -1348,6 +1391,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 * Resolve the access fault by making the page young again.
 * Note that because the faulting entry is guaranteed not to be
 * cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
 */
 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
@@ -1364,7 +1409,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		goto out;

-	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
+	if (pmd_thp_or_huge(*pmd)) {	/* THP, HugeTLB */
 		*pmd = pmd_mkyoung(*pmd);
 		pfn = pmd_pfn(*pmd);
 		pfn_valid = true;
@@ -1588,25 +1633,14 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		return 0;

-	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
-		if (pmd_young(*pmd)) {
-			*pmd = pmd_mkold(*pmd);
-			return 1;
-		}
-
-		return 0;
-	}
+	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
+		return stage2_pmdp_test_and_clear_young(pmd);

 	pte = pte_offset_kernel(pmd, gpa);
 	if (pte_none(*pte))
 		return 0;

-	if (pte_young(*pte)) {
-		*pte = pte_mkold(*pte);	/* Just a page... */
-		return 1;
-	}
-
-	return 0;
+	return stage2_ptep_test_and_clear_young(pte);
 }

 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
@@ -1618,7 +1652,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		return 0;

-	if (kvm_pmd_huge(*pmd))		/* THP, HugeTLB */
+	if (pmd_thp_or_huge(*pmd))		/* THP, HugeTLB */
 		return pmd_young(*pmd);

 	pte = pte_offset_kernel(pmd, gpa);

--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -96,32 +96,37 @@
 			 SCTLR_EL2_SA | SCTLR_EL2_I)

 /* TCR_EL2 Registers bits */
-#define TCR_EL2_RES1	((1 << 31) | (1 << 23))
-#define TCR_EL2_TBI	(1 << 20)
-#define TCR_EL2_PS	(7 << 16)
-#define TCR_EL2_PS_40B	(2 << 16)
-#define TCR_EL2_TG0	(1 << 14)
-#define TCR_EL2_SH0	(3 << 12)
-#define TCR_EL2_ORGN0	(3 << 10)
-#define TCR_EL2_IRGN0	(3 << 8)
-#define TCR_EL2_T0SZ	0x3f
-#define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
-			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
+#define TCR_EL2_RES1		((1 << 31) | (1 << 23))
+#define TCR_EL2_TBI		(1 << 20)
+#define TCR_EL2_PS_SHIFT	16
+#define TCR_EL2_PS_MASK		(7 << TCR_EL2_PS_SHIFT)
+#define TCR_EL2_PS_40B		(2 << TCR_EL2_PS_SHIFT)
+#define TCR_EL2_TG0_MASK	TCR_TG0_MASK
+#define TCR_EL2_SH0_MASK	TCR_SH0_MASK
+#define TCR_EL2_ORGN0_MASK	TCR_ORGN0_MASK
+#define TCR_EL2_IRGN0_MASK	TCR_IRGN0_MASK
+#define TCR_EL2_T0SZ_MASK	0x3f
+#define TCR_EL2_MASK	(TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \
+			 TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK)

 /* VTCR_EL2 Registers bits */
 #define VTCR_EL2_RES1		(1 << 31)
-#define VTCR_EL2_PS_MASK	(7 << 16)
-#define VTCR_EL2_TG0_MASK	(1 << 14)
-#define VTCR_EL2_TG0_4K		(0 << 14)
-#define VTCR_EL2_TG0_64K	(1 << 14)
-#define VTCR_EL2_SH0_MASK	(3 << 12)
-#define VTCR_EL2_SH0_INNER	(3 << 12)
-#define VTCR_EL2_ORGN0_MASK	(3 << 10)
-#define VTCR_EL2_ORGN0_WBWA	(1 << 10)
-#define VTCR_EL2_IRGN0_MASK	(3 << 8)
-#define VTCR_EL2_IRGN0_WBWA	(1 << 8)
-#define VTCR_EL2_SL0_MASK	(3 << 6)
-#define VTCR_EL2_SL0_LVL1	(1 << 6)
+#define VTCR_EL2_HD		(1 << 22)
+#define VTCR_EL2_HA		(1 << 21)
+#define VTCR_EL2_PS_MASK	TCR_EL2_PS_MASK
+#define VTCR_EL2_TG0_MASK	TCR_TG0_MASK
+#define VTCR_EL2_TG0_4K		TCR_TG0_4K
+#define VTCR_EL2_TG0_16K	TCR_TG0_16K
+#define VTCR_EL2_TG0_64K	TCR_TG0_64K
+#define VTCR_EL2_SH0_MASK	TCR_SH0_MASK
+#define VTCR_EL2_SH0_INNER	TCR_SH0_INNER
+#define VTCR_EL2_ORGN0_MASK	TCR_ORGN0_MASK
+#define VTCR_EL2_ORGN0_WBWA	TCR_ORGN0_WBWA
+#define VTCR_EL2_IRGN0_MASK	TCR_IRGN0_MASK
+#define VTCR_EL2_IRGN0_WBWA	TCR_IRGN0_WBWA
+#define VTCR_EL2_SL0_SHIFT	6
+#define VTCR_EL2_SL0_MASK	(3 << VTCR_EL2_SL0_SHIFT)
+#define VTCR_EL2_SL0_LVL1	(1 << VTCR_EL2_SL0_SHIFT)
 #define VTCR_EL2_T0SZ_MASK	0x3f
 #define VTCR_EL2_T0SZ_40B	24
 #define VTCR_EL2_VS_SHIFT	19
@@ -137,35 +142,45 @@
 * (see hyp-init.S).
 *
 * Note that when using 4K pages, we concatenate two first level page tables
- * together.
+ * together. With 16K pages, we concatenate 16 first level page tables.
 *
 * The magic numbers used for VTTBR_X in this patch can be found in Tables
 * D4-23 and D4-25 in ARM DDI 0487A.b.
 */
+
+#define VTCR_EL2_T0SZ_IPA	VTCR_EL2_T0SZ_40B
+#define VTCR_EL2_COMMON_BITS	(VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
+				 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
+
 #ifdef CONFIG_ARM64_64K_PAGES
 /*
 * Stage2 translation configuration:
- * 40bits input  (T0SZ = 24)
 * 64kB pages (TG0 = 1)
 * 2 level page tables (SL = 1)
 */
-#define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
-				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
-#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
-#else
+#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
+#define VTTBR_X_TGRAN_MAGIC		38
+#elif defined(CONFIG_ARM64_16K_PAGES)
+/*
+ * Stage2 translation configuration:
+ * 16kB pages (TG0 = 2)
+ * 2 level page tables (SL = 1)
+ */
+#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
+#define VTTBR_X_TGRAN_MAGIC		42
+#else	/* 4K */
 /*
 * Stage2 translation configuration:
- * 40bits input  (T0SZ = 24)
 * 4kB pages (TG0 = 0)
 * 3 level page tables (SL = 1)
 */
-#define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
-				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1)
-#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
+#define VTCR_EL2_TGRAN_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
+#define VTTBR_X_TGRAN_MAGIC		37
 #endif

+#define VTCR_EL2_FLAGS			(VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
+#define VTTBR_X				(VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
+
 #define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
 #define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
 #define VTTBR_VMID_SHIFT  (UL(48))

--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -45,18 +45,6 @@
 */
 #define TRAMPOLINE_VA		(HYP_PAGE_OFFSET_MASK & PAGE_MASK)

-/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD and potentially the PUD which are
- * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
- * tables use one level of tables less than the kernel.
- */
-#ifdef CONFIG_ARM64_64K_PAGES
-#define KVM_MMU_CACHE_MIN_PAGES	1
-#else
-#define KVM_MMU_CACHE_MIN_PAGES	2
-#endif
-
 #ifdef __ASSEMBLY__

 #include <asm/alternative.h>
@@ -91,6 +79,8 @@ alternative_endif
 #define KVM_PHYS_SIZE	(1UL << KVM_PHYS_SHIFT)
 #define KVM_PHYS_MASK	(KVM_PHYS_SIZE - 1UL)

+#include <asm/stage2_pgtable.h>
+
 int create_hyp_mappings(void *from, void *to);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
@@ -121,19 +111,32 @@ static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
 static inline void kvm_clean_pte(pte_t *pte) {}
 static inline void kvm_clean_pte_entry(pte_t *pte) {}

-static inline void kvm_set_s2pte_writable(pte_t *pte)
+static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
 {
-	pte_val(*pte) |= PTE_S2_RDWR;
+	pte_val(pte) |= PTE_S2_RDWR;
+	return pte;
 }

-static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
+static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
 {
-	pmd_val(*pmd) |= PMD_S2_RDWR;
+	pmd_val(pmd) |= PMD_S2_RDWR;
+	return pmd;
 }

 static inline void kvm_set_s2pte_readonly(pte_t *pte)
 {
-	pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY;
+	pteval_t pteval;
+	unsigned long tmp;
+
+	asm volatile("//	kvm_set_s2pte_readonly\n"
+	"	prfm	pstl1strm, %2\n"
+	"1:	ldxr	%0, %2\n"
+	"	and	%0, %0, %3		// clear PTE_S2_RDWR\n"
+	"	orr	%0, %0, %4		// set PTE_S2_RDONLY\n"
+	"	stxr	%w1, %0, %2\n"
+	"	cbnz	%w1, 1b\n"
+	: "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte))
+	: "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY));
 }

 static inline bool kvm_s2pte_readonly(pte_t *pte)
@@ -143,69 +146,12 @@ static inline bool kvm_s2pte_readonly(pte_t *pte)

 static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
 {
-	pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY;
+	kvm_set_s2pte_readonly((pte_t *)pmd);
 }

 static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 {
-	return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY;
-}
-
-
-#define kvm_pgd_addr_end(addr, end)	pgd_addr_end(addr, end)
-#define kvm_pud_addr_end(addr, end)	pud_addr_end(addr, end)
-#define kvm_pmd_addr_end(addr, end)	pmd_addr_end(addr, end)
-
-/*
- * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address
- * the entire IPA input range with a single pgd entry, and we would only need
- * one pgd entry.  Note that in this case, the pgd is actually not used by
- * the MMU for Stage-2 translations, but is merely a fake pgd used as a data
- * structure for the kernel pgtable macros to work.
- */
-#if PGDIR_SHIFT > KVM_PHYS_SHIFT
-#define PTRS_PER_S2_PGD_SHIFT	0
-#else
-#define PTRS_PER_S2_PGD_SHIFT	(KVM_PHYS_SHIFT - PGDIR_SHIFT)
-#endif
-#define PTRS_PER_S2_PGD		(1 << PTRS_PER_S2_PGD_SHIFT)
-
-#define kvm_pgd_index(addr)	(((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
-
-/*
- * If we are concatenating first level stage-2 page tables, we would have less
- * than or equal to 16 pointers in the fake PGD, because that's what the
- * architecture allows.  In this case, (4 - CONFIG_PGTABLE_LEVELS)
- * represents the first level for the host, and we add 1 to go to the next
- * level (which uses contatenation) for the stage-2 tables.
- */
-#if PTRS_PER_S2_PGD <= 16
-#define KVM_PREALLOC_LEVEL	(4 - CONFIG_PGTABLE_LEVELS + 1)
-#else
-#define KVM_PREALLOC_LEVEL	(0)
-#endif
-
-static inline void *kvm_get_hwpgd(struct kvm *kvm)
-{
-	pgd_t *pgd = kvm->arch.pgd;
-	pud_t *pud;
-
-	if (KVM_PREALLOC_LEVEL == 0)
-		return pgd;
-
-	pud = pud_offset(pgd, 0);
-	if (KVM_PREALLOC_LEVEL == 1)
-		return pud;
-
-	BUG_ON(KVM_PREALLOC_LEVEL != 2);
-	return pmd_offset(pud, 0);
-}
-
-static inline unsigned int kvm_get_hwpgd_size(void)
-{
-	if (KVM_PREALLOC_LEVEL > 0)
-		return PTRS_PER_S2_PGD * PAGE_SIZE;
-	return PTRS_PER_S2_PGD * sizeof(pgd_t);
+	return kvm_s2pte_readonly((pte_t *)pmd);
 }

 static inline bool kvm_page_empty(void *ptr)
@@ -214,23 +160,20 @@ static inline bool kvm_page_empty(void *ptr)
 	return page_count(ptr_page) == 1;
 }

-#define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
+#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)

 #ifdef __PAGETABLE_PMD_FOLDED
-#define kvm_pmd_table_empty(kvm, pmdp) (0)
+#define hyp_pmd_table_empty(pmdp) (0)
 #else
-#define kvm_pmd_table_empty(kvm, pmdp) \
-	(kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2))
+#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
 #endif

 #ifdef __PAGETABLE_PUD_FOLDED
-#define kvm_pud_table_empty(kvm, pudp) (0)
+#define hyp_pud_table_empty(pudp) (0)
 #else
-#define kvm_pud_table_empty(kvm, pudp) \
-	(kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1))
+#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
 #endif

-
 struct kvm;

 #define kvm_flush_dcache_to_poc(a,l)	__flush_dcache_area((a), (l))

--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -208,23 +208,69 @@
 #define TCR_T1SZ(x)		((UL(64) - (x)) << TCR_T1SZ_OFFSET)
 #define TCR_TxSZ(x)		(TCR_T0SZ(x) | TCR_T1SZ(x))
 #define TCR_TxSZ_WIDTH		6
-#define TCR_IRGN_NC		((UL(0) << 8) | (UL(0) << 24))
-#define TCR_IRGN_WBWA		((UL(1) << 8) | (UL(1) << 24))
-#define TCR_IRGN_WT		((UL(2) << 8) | (UL(2) << 24))
-#define TCR_IRGN_WBnWA		((UL(3) << 8) | (UL(3) << 24))
-#define TCR_IRGN_MASK		((UL(3) << 8) | (UL(3) << 24))
-#define TCR_ORGN_NC		((UL(0) << 10) | (UL(0) << 26))
-#define TCR_ORGN_WBWA		((UL(1) << 10) | (UL(1) << 26))
-#define TCR_ORGN_WT		((UL(2) << 10) | (UL(2) << 26))
-#define TCR_ORGN_WBnWA		((UL(3) << 10) | (UL(3) << 26))
-#define TCR_ORGN_MASK		((UL(3) << 10) | (UL(3) << 26))
-#define TCR_SHARED		((UL(3) << 12) | (UL(3) << 28))
-#define TCR_TG0_4K		(UL(0) << 14)
-#define TCR_TG0_64K		(UL(1) << 14)
-#define TCR_TG0_16K		(UL(2) << 14)
-#define TCR_TG1_16K		(UL(1) << 30)
-#define TCR_TG1_4K		(UL(2) << 30)
-#define TCR_TG1_64K		(UL(3) << 30)
+
+#define TCR_IRGN0_SHIFT		8
+#define TCR_IRGN0_MASK		(UL(3) << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_NC		(UL(0) << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_WBWA		(UL(1) << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_WT		(UL(2) << TCR_IRGN0_SHIFT)
+#define TCR_IRGN0_WBnWA		(UL(3) << TCR_IRGN0_SHIFT)
+
+#define TCR_IRGN1_SHIFT		24
+#define TCR_IRGN1_MASK		(UL(3) << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_NC		(UL(0) << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_WBWA		(UL(1) << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_WT		(UL(2) << TCR_IRGN1_SHIFT)
+#define TCR_IRGN1_WBnWA		(UL(3) << TCR_IRGN1_SHIFT)
+
+#define TCR_IRGN_NC		(TCR_IRGN0_NC | TCR_IRGN1_NC)
+#define TCR_IRGN_WBWA		(TCR_IRGN0_WBWA | TCR_IRGN1_WBWA)
+#define TCR_IRGN_WT		(TCR_IRGN0_WT | TCR_IRGN1_WT)
+#define TCR_IRGN_WBnWA		(TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA)
+#define TCR_IRGN_MASK		(TCR_IRGN0_MASK | TCR_IRGN1_MASK)
+
+
+#define TCR_ORGN0_SHIFT		10
+#define TCR_ORGN0_MASK		(UL(3) << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_NC		(UL(0) << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_WBWA		(UL(1) << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_WT		(UL(2) << TCR_ORGN0_SHIFT)
+#define TCR_ORGN0_WBnWA		(UL(3) << TCR_ORGN0_SHIFT)
+
+#define TCR_ORGN1_SHIFT		26
+#define TCR_ORGN1_MASK		(UL(3) << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_NC		(UL(0) << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_WBWA		(UL(1) << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_WT		(UL(2) << TCR_ORGN1_SHIFT)
+#define TCR_ORGN1_WBnWA		(UL(3) << TCR_ORGN1_SHIFT)
+
+#define TCR_ORGN_NC		(TCR_ORGN0_NC | TCR_ORGN1_NC)
+#define TCR_ORGN_WBWA		(TCR_ORGN0_WBWA | TCR_ORGN1_WBWA)
+#define TCR_ORGN_WT		(TCR_ORGN0_WT | TCR_ORGN1_WT)
+#define TCR_ORGN_WBnWA		(TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA)
+#define TCR_ORGN_MASK		(TCR_ORGN0_MASK | TCR_ORGN1_MASK)
+
+#define TCR_SH0_SHIFT		12
+#define TCR_SH0_MASK		(UL(3) << TCR_SH0_SHIFT)
+#define TCR_SH0_INNER		(UL(3) << TCR_SH0_SHIFT)
+
+#define TCR_SH1_SHIFT		28
+#define TCR_SH1_MASK		(UL(3) << TCR_SH1_SHIFT)
+#define TCR_SH1_INNER		(UL(3) << TCR_SH1_SHIFT)
+#define TCR_SHARED		(TCR_SH0_INNER | TCR_SH1_INNER)
+
+#define TCR_TG0_SHIFT		14
+#define TCR_TG0_MASK		(UL(3) << TCR_TG0_SHIFT)
+#define TCR_TG0_4K		(UL(0) << TCR_TG0_SHIFT)
+#define TCR_TG0_64K		(UL(1) << TCR_TG0_SHIFT)
+#define TCR_TG0_16K		(UL(2) << TCR_TG0_SHIFT)
+
+#define TCR_TG1_SHIFT		30
+#define TCR_TG1_MASK		(UL(3) << TCR_TG1_SHIFT)
+#define TCR_TG1_16K		(UL(1) << TCR_TG1_SHIFT)
+#define TCR_TG1_4K		(UL(2) << TCR_TG1_SHIFT)
+#define TCR_TG1_64K		(UL(3) << TCR_TG1_SHIFT)
+
 #define TCR_ASID16		(UL(1) << 36)
 #define TCR_TBI0		(UL(1) << 37)
 #define TCR_HA			(UL(1) << 39)

--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -290,6 +290,8 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
 #define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mknotpresent(pmd)	(__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))

+#define pmd_thp_or_huge(pmd)	(pmd_huge(pmd) || pmd_trans_huge(pmd))
+
 #define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		pte_write(pmd_pte(pmd))

@@ -530,14 +532,12 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 * Atomic pte/pmd modifications.
 */
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
-					    unsigned long address,
-					    pte_t *ptep)
+static inline int __ptep_test_and_clear_young(pte_t *ptep)
 {
 	pteval_t pteval;
 	unsigned int tmp, res;

-	asm volatile("//	ptep_test_and_clear_young\n"
+	asm volatile("//	__ptep_test_and_clear_young\n"
 	"	prfm	pstl1strm, %2\n"
 	"1:	ldxr	%0, %2\n"
 	"	ubfx	%w3, %w0, %5, #1	// extract PTE_AF (young)\n"
@@ -550,6 +550,13 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	return res;
 }

+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long address,
+					    pte_t *ptep)
+{
+	return __ptep_test_and_clear_young(ptep);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,

--- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h
+++ b/arch/arm64/include/asm/stage2_pgtable-nopmd.h
+/*
+ * Copyright (C) 2016 - ARM Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM64_S2_PGTABLE_NOPMD_H_
+#define __ARM64_S2_PGTABLE_NOPMD_H_
+
+#include <asm/stage2_pgtable-nopud.h>
+
+#define __S2_PGTABLE_PMD_FOLDED
+
+#define S2_PMD_SHIFT		S2_PUD_SHIFT
+#define S2_PTRS_PER_PMD		1
+#define S2_PMD_SIZE		(1UL << S2_PMD_SHIFT)
+#define S2_PMD_MASK		(~(S2_PMD_SIZE-1))
+
+#define stage2_pud_none(pud)			(0)
+#define stage2_pud_present(pud)			(1)
+#define stage2_pud_clear(pud)			do { } while (0)
+#define stage2_pud_populate(pud, pmd)		do { } while (0)
+#define stage2_pmd_offset(pud, address)		((pmd_t *)(pud))
+
+#define stage2_pmd_free(pmd)			do { } while (0)
+
+#define stage2_pmd_addr_end(addr, end)		(end)
+
+#define stage2_pud_huge(pud)			(0)
+#define stage2_pmd_table_empty(pmdp)		(0)
+
+#endif
--- a/arch/arm64/include/asm/stage2_pgtable-nopud.h
+++ b/arch/arm64/include/asm/stage2_pgtable-nopud.h
+/*
+ * Copyright (C) 2016 - ARM Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM64_S2_PGTABLE_NOPUD_H_
+#define __ARM64_S2_PGTABLE_NOPUD_H_
+
+#define __S2_PGTABLE_PUD_FOLDED
+
+#define S2_PUD_SHIFT		S2_PGDIR_SHIFT
+#define S2_PTRS_PER_PUD		1
+#define S2_PUD_SIZE		(_AC(1, UL) << S2_PUD_SHIFT)
+#define S2_PUD_MASK		(~(S2_PUD_SIZE-1))
+
+#define stage2_pgd_none(pgd)			(0)
+#define stage2_pgd_present(pgd)			(1)
+#define stage2_pgd_clear(pgd)			do { } while (0)
+#define stage2_pgd_populate(pgd, pud)	do { } while (0)
+
+#define stage2_pud_offset(pgd, address)		((pud_t *)(pgd))
+
+#define stage2_pud_free(x)			do { } while (0)
+
+#define stage2_pud_addr_end(addr, end)		(end)
+#define stage2_pud_table_empty(pmdp)		(0)
+
+#endif
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
+/*
+ * Copyright (C) 2016 - ARM Ltd
+ *
+ * stage2 page table helpers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM64_S2_PGTABLE_H_
+#define __ARM64_S2_PGTABLE_H_
+
+#include <asm/pgtable.h>
+
+/*
+ * The hardware supports concatenation of up to 16 tables at stage2 entry level
+ * and we use the feature whenever possible.
+ *
+ * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3).
+ * On arm64, the smallest PAGE_SIZE supported is 4k, which means
+ *             (PAGE_SHIFT - 3) > 4 holds for all page sizes.
+ * This implies that the total number of page table levels at stage2 expected
+ * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4)
+ * in normal translations (e.g., stage1), since we cannot have another level in
+ * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4).
+ */
+#define STAGE2_PGTABLE_LEVELS		ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
+
+/*
+ * With all the supported VA_BITS values and a 40-bit guest IPA, the following
+ * condition is always true:
+ *
+ *       STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS
+ *
+ * We base our stage-2 page table walker helpers on this assumption and
+ * fall back to using the host version of the helper wherever possible,
+ * i.e., if a particular level (e.g., PUD) is not folded at stage2, we fall back
+ * to using the host version, since it is guaranteed not to be folded at the host.
+ *
+ * If the condition breaks in the future, we can rearrange the host level
+ * definitions and reuse them for stage2. Till then...
+ */
+#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS
+#error "Unsupported combination of guest IPA and host VA_BITS."
+#endif
+
+/* S2_PGDIR_SHIFT is the log2 of the size mapped by a top-level stage2 entry */
+#define S2_PGDIR_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS)
+#define S2_PGDIR_SIZE			(_AC(1, UL) << S2_PGDIR_SHIFT)
+#define S2_PGDIR_MASK			(~(S2_PGDIR_SIZE - 1))
+
+/*
+ * The number of PTRS across all concatenated stage2 tables is given by the
+ * number of bits resolved at the initial level.
+ */
+#define PTRS_PER_S2_PGD			(1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT))
+
+/*
+ * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
+ * levels in addition to the PGD.
+ */
+#define KVM_MMU_CACHE_MIN_PAGES		(STAGE2_PGTABLE_LEVELS - 1)
+
+
+#if STAGE2_PGTABLE_LEVELS > 3
+
+#define S2_PUD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
+#define S2_PUD_SIZE			(_AC(1, UL) << S2_PUD_SHIFT)
+#define S2_PUD_MASK			(~(S2_PUD_SIZE - 1))
+
+#define stage2_pgd_none(pgd)				pgd_none(pgd)
+#define stage2_pgd_clear(pgd)				pgd_clear(pgd)
+#define stage2_pgd_present(pgd)				pgd_present(pgd)
+#define stage2_pgd_populate(pgd, pud)			pgd_populate(NULL, pgd, pud)
+#define stage2_pud_offset(pgd, address)			pud_offset(pgd, address)
+#define stage2_pud_free(pud)				pud_free(NULL, pud)
+
+#define stage2_pud_table_empty(pudp)			kvm_page_empty(pudp)
+
+static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+
+	return (boundary - 1 < end - 1) ? boundary : end;
+}
+
+#endif		/* STAGE2_PGTABLE_LEVELS > 3 */
+
+
+#if STAGE2_PGTABLE_LEVELS > 2
+
+#define S2_PMD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
+#define S2_PMD_SIZE			(_AC(1, UL) << S2_PMD_SHIFT)
+#define S2_PMD_MASK			(~(S2_PMD_SIZE - 1))
+
+#define stage2_pud_none(pud)				pud_none(pud)
+#define stage2_pud_clear(pud)				pud_clear(pud)
+#define stage2_pud_present(pud)				pud_present(pud)
+#define stage2_pud_populate(pud, pmd)			pud_populate(NULL, pud, pmd)
+#define stage2_pmd_offset(pud, address)			pmd_offset(pud, address)
+#define stage2_pmd_free(pmd)				pmd_free(NULL, pmd)
+
+#define stage2_pud_huge(pud)				pud_huge(pud)
+#define stage2_pmd_table_empty(pmdp)			kvm_page_empty(pmdp)
+
+static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
+
+	return (boundary - 1 < end - 1) ? boundary : end;
+}
+
+#endif		/* STAGE2_PGTABLE_LEVELS > 2 */
+
+#define stage2_pte_table_empty(ptep)			kvm_page_empty(ptep)
+
+#if STAGE2_PGTABLE_LEVELS == 2
+#include <asm/stage2_pgtable-nopmd.h>
+#elif STAGE2_PGTABLE_LEVELS == 3
+#include <asm/stage2_pgtable-nopud.h>
+#endif
+
+
+#define stage2_pgd_index(addr)				(((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
+
+static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK;
+
+	return (boundary - 1 < end - 1) ? boundary : end;
+}
+
+#endif	/* __ARM64_S2_PGTABLE_H_ */
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM_ARM_VGIC_V3
 config KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
 	depends on OF
-	depends on !ARM64_16K_PAGES
 	select MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES

--- a/arch/arm64/kvm/hyp/s2-setup.c
+++ b/arch/arm64/kvm/hyp/s2-setup.c
@@ -65,6 +65,14 @@ u32 __hyp_text __init_stage2_translation(void)
 	 */
 	val |= 64 - (parange > 40 ? 40 : parange);

+	/*
+	 * Check the availability of Hardware Access Flag / Dirty Bit
+	 * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2.
+	 */
+	tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf;
+	if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp)
+		val |= VTCR_EL2_HA;
+
 	/*
 	 * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
 	 * bit in VTCR_EL2.

--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -468,11 +468,11 @@ static struct cyclecounter cyclecounter = {
 	.mask	= CLOCKSOURCE_MASK(56),
 };

-static struct timecounter timecounter;
+static struct arch_timer_kvm_info arch_timer_kvm_info;

-struct timecounter *arch_timer_get_timecounter(void)
+struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
 {
-	return &timecounter;
+	return &arch_timer_kvm_info;
 }

 static void __init arch_counter_register(unsigned type)
@@ -500,7 +500,8 @@ static void __init arch_counter_register(unsigned type)
 	clocksource_register_hz(&clocksource_counter, arch_timer_rate);
 	cyclecounter.mult = clocksource_counter.mult;
 	cyclecounter.shift = clocksource_counter.shift;
-	timecounter_init(&timecounter, &cyclecounter, start_count);
+	timecounter_init(&arch_timer_kvm_info.timecounter,
+			 &cyclecounter, start_count);

 	/* 56 bits minimum, so we assume worst case rollover */
 	sched_clock_register(arch_timer_read_counter, 56, arch_timer_rate);
@@ -744,6 +745,8 @@ static void __init arch_timer_init(void)

 	arch_timer_register();
 	arch_timer_common_init();
+
+	arch_timer_kvm_info.virtual_irq = arch_timer_ppi[VIRT_PPI];
 }

 static void __init arch_timer_of_init(struct device_node *np)

--- a/drivers/irqchip/irq-gic-common.c
+++ b/drivers/irqchip/irq-gic-common.c
@@ -21,6 +21,19 @@

 #include "irq-gic-common.h"

+static const struct gic_kvm_info *gic_kvm_info;
+
+const struct gic_kvm_info *gic_get_kvm_info(void)
+{
+	return gic_kvm_info;
+}
+
+void gic_set_kvm_info(const struct gic_kvm_info *info)
+{
+	BUG_ON(gic_kvm_info != NULL);
+	gic_kvm_info = info;
+}
+
 void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
 		void *data)
 {

--- a/drivers/irqchip/irq-gic-common.h
+++ b/drivers/irqchip/irq-gic-common.h
@@ -19,6 +19,7 @@

 #include <linux/of.h>
 #include <linux/irqdomain.h>
+#include <linux/irqchip/arm-gic-common.h>

 struct gic_quirk {
 	const char *desc;
@@ -35,4 +36,6 @@ void gic_cpu_config(void __iomem *base, void (*sync_access)(void));
 void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
 		void *data);

+void gic_set_kvm_info(const struct gic_kvm_info *info);
+
 #endif /* _IRQ_GIC_COMMON_H */
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -15,6 +15,8 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

+#define pr_fmt(fmt)	"GICv3: " fmt
+
 #include <linux/acpi.h>
 #include <linux/cpu.h>
 #include <linux/cpu_pm.h>
@@ -28,6 +30,7 @@
 #include <linux/slab.h>

 #include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic-common.h>
 #include <linux/irqchip/arm-gic-v3.h>

 #include <asm/cputype.h>
@@ -56,6 +59,8 @@ struct gic_chip_data {
 static struct gic_chip_data gic_data __read_mostly;
 static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;

+static struct gic_kvm_info gic_v3_kvm_info;
+
 #define gic_data_rdist()		(this_cpu_ptr(gic_data.rdists.rdist))
 #define gic_data_rdist_rd_base()	(gic_data_rdist()->rd_base)
 #define gic_data_rdist_sgi_base()	(gic_data_rdist_rd_base() + SZ_64K)
@@ -901,6 +906,30 @@ static int __init gic_validate_dist_version(void __iomem *dist_base)
 	return 0;
 }

+static void __init gic_of_setup_kvm_info(struct device_node *node)
+{
+	int ret;
+	struct resource r;
+	u32 gicv_idx;
+
+	gic_v3_kvm_info.type = GIC_V3;
+
+	gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+	if (!gic_v3_kvm_info.maint_irq)
+		return;
+
+	if (of_property_read_u32(node, "#redistributor-regions",
+				 &gicv_idx))
+		gicv_idx = 1;
+
+	gicv_idx += 3;	/* Also skip GICD, GICC, GICH */
+	ret = of_address_to_resource(node, gicv_idx, &r);
+	if (!ret)
+		gic_v3_kvm_info.vcpu = r;
+
+	gic_set_kvm_info(&gic_v3_kvm_info);
+}
+
 static int __init gic_of_init(struct device_node *node, struct device_node *parent)
 {
 	void __iomem *dist_base;
@@ -952,8 +981,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare

 	err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
 			     redist_stride, &node->fwnode);
-	if (!err)
+	if (!err) {
+		gic_of_setup_kvm_info(node);
 		return 0;
+	}

 out_unmap_rdist:
 	for (i = 0; i < nr_redist_regions; i++)
@@ -968,19 +999,25 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
 IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);

 #ifdef CONFIG_ACPI
-static void __iomem *dist_base;
-static struct redist_region *redist_regs __initdata;
-static u32 nr_redist_regions __initdata;
-static bool single_redist;
+static struct
+{
+	void __iomem *dist_base;
+	struct redist_region *redist_regs;
+	u32 nr_redist_regions;
+	bool single_redist;
+	u32 maint_irq;
+	int maint_irq_mode;
+	phys_addr_t vcpu_base;
+} acpi_data __initdata;

 static void __init
 gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
 {
 	static int count = 0;

-	redist_regs[count].phys_base = phys_base;
-	redist_regs[count].redist_base = redist_base;
-	redist_regs[count].single_redist = single_redist;
+	acpi_data.redist_regs[count].phys_base = phys_base;
+	acpi_data.redist_regs[count].redist_base = redist_base;
+	acpi_data.redist_regs[count].single_redist = acpi_data.single_redist;
 	count++;
 }

@@ -1008,7 +1045,7 @@ gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header,
 {
 	struct acpi_madt_generic_interrupt *gicc =
 				(struct acpi_madt_generic_interrupt *)header;
-	u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
+	u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
 	u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
 	void __iomem *redist_base;

@@ -1025,7 +1062,7 @@ static int __init gic_acpi_collect_gicr_base(void)
 	acpi_tbl_entry_handler redist_parser;
 	enum acpi_madt_type type;

-	if (single_redist) {
+	if (acpi_data.single_redist) {
 		type = ACPI_MADT_TYPE_GENERIC_INTERRUPT;
 		redist_parser = gic_acpi_parse_madt_gicc;
 	} else {
@@ -1076,14 +1113,14 @@ static int __init gic_acpi_count_gicr_regions(void)
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
 				      gic_acpi_match_gicr, 0);
 	if (count > 0) {
-		single_redist = false;
+		acpi_data.single_redist = false;
 		return count;
 	}

 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
 				      gic_acpi_match_gicc, 0);
 	if (count > 0)
-		single_redist = true;
+		acpi_data.single_redist = true;

 	return count;
 }
@@ -1103,36 +1140,117 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header,
 	if (count <= 0)
 		return false;

-	nr_redist_regions = count;
+	acpi_data.nr_redist_regions = count;
 	return true;
 }

+static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header,
+						const unsigned long end)
+{
+	struct acpi_madt_generic_interrupt *gicc =
+		(struct acpi_madt_generic_interrupt *)header;
+	int maint_irq_mode;
+	static int first_madt = true;
+
+	/* Skip unusable CPUs */
+	if (!(gicc->flags & ACPI_MADT_ENABLED))
+		return 0;
+
+	maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+		ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+
+	if (first_madt) {
+		first_madt = false;
+
+		acpi_data.maint_irq = gicc->vgic_interrupt;
+		acpi_data.maint_irq_mode = maint_irq_mode;
+		acpi_data.vcpu_base = gicc->gicv_base_address;
+
+		return 0;
+	}
+
+	/*
+	 * The maintenance interrupt and GICV should be the same for every CPU
+	 */
+	if ((acpi_data.maint_irq != gicc->vgic_interrupt) ||
+	    (acpi_data.maint_irq_mode != maint_irq_mode) ||
+	    (acpi_data.vcpu_base != gicc->gicv_base_address))
+		return -EINVAL;
+
+	return 0;
+}
+
+static bool __init gic_acpi_collect_virt_info(void)
+{
+	int count;
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+				      gic_acpi_parse_virt_madt_gicc, 0);
+
+	return (count > 0);
+}
+
 #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
+#define ACPI_GICV2_VCTRL_MEM_SIZE	(SZ_4K)
+#define ACPI_GICV2_VCPU_MEM_SIZE	(SZ_8K)
+
+static void __init gic_acpi_setup_kvm_info(void)
+{
+	int irq;
+
+	if (!gic_acpi_collect_virt_info()) {
+		pr_warn("Unable to get hardware information used for virtualization\n");
+		return;
+	}
+
+	gic_v3_kvm_info.type = GIC_V3;
+
+	irq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+				acpi_data.maint_irq_mode,
+				ACPI_ACTIVE_HIGH);
+	if (irq <= 0)
+		return;
+
+	gic_v3_kvm_info.maint_irq = irq;
+
+	if (acpi_data.vcpu_base) {
+		struct resource *vcpu = &gic_v3_kvm_info.vcpu;
+
+		vcpu->flags = IORESOURCE_MEM;
+		vcpu->start = acpi_data.vcpu_base;
+		vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+	}
+
+	gic_set_kvm_info(&gic_v3_kvm_info);
+}

 static int __init
 gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
 {
 	struct acpi_madt_generic_distributor *dist;
 	struct fwnode_handle *domain_handle;
+	size_t size;
 	int i, err;

 	/* Get distributor base address */
 	dist = (struct acpi_madt_generic_distributor *)header;
-	dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE);
-	if (!dist_base) {
+	acpi_data.dist_base = ioremap(dist->base_address,
+				      ACPI_GICV3_DIST_MEM_SIZE);
+	if (!acpi_data.dist_base) {
 		pr_err("Unable to map GICD registers\n");
 		return -ENOMEM;
 	}

-	err = gic_validate_dist_version(dist_base);
+	err = gic_validate_dist_version(acpi_data.dist_base);
 	if (err) {
-		pr_err("No distributor detected at @%p, giving up", dist_base);
+		pr_err("No distributor detected at @%p, giving up",
+		       acpi_data.dist_base);
 		goto out_dist_unmap;
 	}

-	redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions,
-			      GFP_KERNEL);
-	if (!redist_regs) {
+	size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions;
+	acpi_data.redist_regs = kzalloc(size, GFP_KERNEL);
+	if (!acpi_data.redist_regs) {
 		err = -ENOMEM;
 		goto out_dist_unmap;
 	}
@@ -1141,29 +1259,31 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
 	if (err)
 		goto out_redist_unmap;

-	domain_handle = irq_domain_alloc_fwnode(dist_base);
+	domain_handle = irq_domain_alloc_fwnode(acpi_data.dist_base);
 	if (!domain_handle) {
 		err = -ENOMEM;
 		goto out_redist_unmap;
 	}

-	err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0,
-			     domain_handle);
+	err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs,
+			     acpi_data.nr_redist_regions, 0, domain_handle);
 	if (err)
 		goto out_fwhandle_free;

 	acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
+	gic_acpi_setup_kvm_info();
+
 	return 0;

 out_fwhandle_free:
 	irq_domain_free_fwnode(domain_handle);
 out_redist_unmap:
-	for (i = 0; i < nr_redist_regions; i++)
-		if (redist_regs[i].redist_base)
-			iounmap(redist_regs[i].redist_base);
-	kfree(redist_regs);
+	for (i = 0; i < acpi_data.nr_redist_regions; i++)
+		if (acpi_data.redist_regs[i].redist_base)
+			iounmap(acpi_data.redist_regs[i].redist_base);
+	kfree(acpi_data.redist_regs);
 out_dist_unmap:
-	iounmap(dist_base);
+	iounmap(acpi_data.dist_base);
 	return err;
 }
 IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,

--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -102,6 +102,8 @@ static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;

 static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly;

+static struct gic_kvm_info gic_v2_kvm_info;
+
 #ifdef CONFIG_GIC_NON_BANKED
 static void __iomem *gic_get_percpu_base(union gic_base *base)
 {
@@ -1189,6 +1191,29 @@ static bool gic_check_eoimode(struct device_node *node, void __iomem **base)
 	return true;
 }

+/* Gather the GICv2 maintenance IRQ and GICH/GICV regions from DT for KVM. */
+static void __init gic_of_setup_kvm_info(struct device_node *node)
+{
+	struct gic_kvm_info *info = &gic_v2_kvm_info;
+
+	info->type = GIC_V2;
+
+	/* Interrupt index 0 of the node is used as the maintenance IRQ. */
+	info->maint_irq = irq_of_parse_and_map(node, 0);
+	if (!info->maint_irq)
+		return;
+
+	/* Address entries 2 and 3 are taken as the GICH and GICV regions. */
+	if (of_address_to_resource(node, 2, &info->vctrl))
+		return;
+
+	if (of_address_to_resource(node, 3, &info->vcpu))
+		return;
+
+	/* Only a fully populated description is registered. */
+	gic_set_kvm_info(info);
+}
+
 int __init
 gic_of_init(struct device_node *node, struct device_node *parent)
 {
@@ -1218,8 +1243,10 @@ gic_of_init(struct device_node *node, struct device_node *parent)

 	__gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset,
 			 &node->fwnode);
-	if (!gic_cnt)
+	if (!gic_cnt) {
 		gic_init_physaddr(node);
+		gic_of_setup_kvm_info(node);
+	}

 	if (parent) {
 		irq = irq_of_parse_and_map(node, 0);
@@ -1245,7 +1272,14 @@ IRQCHIP_DECLARE(pl390, "arm,pl390", gic_of_init);
 #endif

 #ifdef CONFIG_ACPI
-static phys_addr_t cpu_phy_base __initdata;
+/* Init-time GIC data recorded by gic_acpi_parse_madt_cpu(). */
+static struct {
+	phys_addr_t cpu_phys_base;
+	u32 maint_irq;
+	int maint_irq_mode;
+	phys_addr_t vctrl_base;
+	phys_addr_t vcpu_base;
+} acpi_data __initdata;

 static int __init
 gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
@@ -1265,10 +1299,16 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
 	 * All CPU interface addresses have to be the same.
 	 */
 	gic_cpu_base = processor->base_address;
-	if (cpu_base_assigned && gic_cpu_base != cpu_phy_base)
+	if (cpu_base_assigned && gic_cpu_base != acpi_data.cpu_phys_base)
 		return -EINVAL;

-	cpu_phy_base = gic_cpu_base;
+	acpi_data.cpu_phys_base = gic_cpu_base;
+	acpi_data.maint_irq = processor->vgic_interrupt;
+	acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
+				    ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE;
+	acpi_data.vctrl_base = processor->gich_base_address;
+	acpi_data.vcpu_base = processor->gicv_base_address;
+
 	cpu_base_assigned = 1;
 	return 0;
 }
@@ -1299,6 +1339,41 @@ static bool __init gic_validate_dist(struct acpi_subtable_header *header,

 #define ACPI_GICV2_DIST_MEM_SIZE	(SZ_4K)
 #define ACPI_GIC_CPU_IF_MEM_SIZE	(SZ_8K)
+#define ACPI_GICV2_VCTRL_MEM_SIZE	(SZ_4K)
+#define ACPI_GICV2_VCPU_MEM_SIZE	(SZ_8K)
+
+/* Describe the GICv2 virtualization resources found via ACPI to KVM. */
+static void __init gic_acpi_setup_kvm_info(void)
+{
+	struct gic_kvm_info *info = &gic_v2_kvm_info;
+	int virq;
+
+	info->type = GIC_V2;
+
+	/* No GICH base was recorded from the MADT: nothing to report. */
+	if (!acpi_data.vctrl_base)
+		return;
+
+	info->vctrl.flags = IORESOURCE_MEM;
+	info->vctrl.start = acpi_data.vctrl_base;
+	info->vctrl.end = info->vctrl.start + ACPI_GICV2_VCTRL_MEM_SIZE - 1;
+
+	if (!acpi_data.vcpu_base)
+		return;
+
+	info->vcpu.flags = IORESOURCE_MEM;
+	info->vcpu.start = acpi_data.vcpu_base;
+	info->vcpu.end = info->vcpu.start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
+
+	virq = acpi_register_gsi(NULL, acpi_data.maint_irq,
+				 acpi_data.maint_irq_mode, ACPI_ACTIVE_HIGH);
+	if (virq <= 0)
+		return;
+
+	/* Register only once both regions and the maintenance IRQ are set. */
+	info->maint_irq = virq;
+	gic_set_kvm_info(info);
+}

 static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
 				   const unsigned long end)
@@ -1316,7 +1391,7 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
 		return -EINVAL;
 	}

-	cpu_base = ioremap(cpu_phy_base, ACPI_GIC_CPU_IF_MEM_SIZE);
+	cpu_base = ioremap(acpi_data.cpu_phys_base, ACPI_GIC_CPU_IF_MEM_SIZE);
 	if (!cpu_base) {
 		pr_err("Unable to map GICC registers\n");
 		return -ENOMEM;
@@ -1356,6 +1431,8 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
 	if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
 		gicv2m_init(NULL, gic_data[0].domain);

+	gic_acpi_setup_kvm_info();
+
 	return 0;
 }
 IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,

--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -49,11 +49,16 @@ enum arch_timer_reg {

 #define ARCH_TIMER_EVT_STREAM_FREQ	10000	/* 100us */

+struct arch_timer_kvm_info {
+	struct timecounter timecounter;	/* KVM keeps a pointer to this */
+	int virtual_irq;		/* virtual timer IRQ, valid if > 0 */
+};
+
 #ifdef CONFIG_ARM_ARCH_TIMER

 extern u32 arch_timer_get_rate(void);
 extern u64 (*arch_timer_read_counter)(void);
-extern struct timecounter *arch_timer_get_timecounter(void);
+extern struct arch_timer_kvm_info *arch_timer_get_kvm_info(void);

 #else

@@ -67,11 +72,6 @@ static inline u64 arch_timer_read_counter(void)
 	return 0;
 }

-static inline struct timecounter *arch_timer_get_timecounter(void)
-{
-	return NULL;
-}
-
 #endif

 #endif
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -25,6 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <kvm/iodev.h>
+#include <linux/irqchip/arm-gic-common.h>

 #define VGIC_NR_IRQS_LEGACY	256
 #define VGIC_NR_SGIS		16
@@ -353,15 +354,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
 #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
 #define vgic_ready(k)		((k)->arch.vgic.ready)

-int vgic_v2_probe(struct device_node *vgic_node,
+int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
 		  const struct vgic_ops **ops,
 		  const struct vgic_params **params);
 #ifdef CONFIG_KVM_ARM_VGIC_V3
-int vgic_v3_probe(struct device_node *vgic_node,
+int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
 		  const struct vgic_ops **ops,
 		  const struct vgic_params **params);
 #else
-static inline int vgic_v3_probe(struct device_node *vgic_node,
+static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
 				const struct vgic_ops **ops,
 				const struct vgic_params **params)
 {

--- a/include/linux/irqchip/arm-gic-common.h
+++ b/include/linux/irqchip/arm-gic-common.h
+/*
+ * include/linux/irqchip/arm-gic-common.h
+ *
+ * Copyright (C) 2016 ARM Limited, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __LINUX_IRQCHIP_ARM_GIC_COMMON_H
+#define __LINUX_IRQCHIP_ARM_GIC_COMMON_H
+
+#include <linux/types.h>
+#include <linux/ioport.h>
+
+enum gic_type {
+	GIC_V2,		/* KVM probes with vgic_v2_probe() */
+	GIC_V3,		/* KVM probes with vgic_v3_probe() */
+};
+
+struct gic_kvm_info {
+	/* GIC type; selects which vGIC probe routine KVM runs */
+	enum gic_type	type;
+	/* Virtual CPU interface (GICV) region; start == 0 is read as absent */
+	struct resource vcpu;
+	/* Maintenance interrupt number; 0 is rejected by vgic_v2_probe() */
+	unsigned int	maint_irq;
+	/* Virtual control interface (GICH) region; start == 0 means absent */
+	struct resource vctrl;
+};
+
+const struct gic_kvm_info *gic_get_kvm_info(void);
+
+#endif /* __LINUX_IRQCHIP_ARM_GIC_COMMON_H */
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -17,7 +17,6 @@
 */

 #include <linux/cpu.h>
-#include <linux/of_irq.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
@@ -438,45 +437,29 @@ static struct notifier_block kvm_timer_cpu_nb = {
 	.notifier_call = kvm_timer_cpu_notify,
 };

-static const struct of_device_id arch_timer_of_match[] = {
-	{ .compatible	= "arm,armv7-timer",	},
-	{ .compatible	= "arm,armv8-timer",	},
-	{},
-};
-
 int kvm_timer_hyp_init(void)
 {
-	struct device_node *np;
-	unsigned int ppi;
+	struct arch_timer_kvm_info *info;
 	int err;

-	timecounter = arch_timer_get_timecounter();
-	if (!timecounter)
-		return -ENODEV;
+	info = arch_timer_get_kvm_info();
+	timecounter = &info->timecounter;

-	np = of_find_matching_node(NULL, arch_timer_of_match);
-	if (!np) {
-		kvm_err("kvm_arch_timer: can't find DT node\n");
+	if (info->virtual_irq <= 0) {
+		kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
+			info->virtual_irq);
 		return -ENODEV;
 	}
+	host_vtimer_irq = info->virtual_irq;

-	ppi = irq_of_parse_and_map(np, 2);
-	if (!ppi) {
-		kvm_err("kvm_arch_timer: no virtual timer interrupt\n");
-		err = -EINVAL;
-		goto out;
-	}
-
-	err = request_percpu_irq(ppi, kvm_arch_timer_handler,
+	err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
 				 "kvm guest timer", kvm_get_running_vcpus());
 	if (err) {
 		kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
-			ppi, err);
+			host_vtimer_irq, err);
 		goto out;
 	}

-	host_vtimer_irq = ppi;
-
 	err = __register_cpu_notifier(&kvm_timer_cpu_nb);
 	if (err) {
 		kvm_err("Cannot register timer CPU notifier\n");
@@ -489,14 +472,13 @@ int kvm_timer_hyp_init(void)
 		goto out_free;
 	}

-	kvm_info("%s IRQ%d\n", np->name, ppi);
+	kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
 	on_each_cpu(kvm_timer_init_interrupt, NULL, 1);

 	goto out;
 out_free:
-	free_percpu_irq(ppi, kvm_get_running_vcpus());
+	free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
 out:
-	of_node_put(np);
 	return err;
 }


--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -20,9 +20,6 @@
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>

 #include <linux/irqchip/arm-gic.h>

@@ -186,38 +183,39 @@ static void vgic_cpu_init_lrs(void *params)
 }

 /**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
- * @node:	pointer to the DT node
- * @ops: 	address of a pointer to the GICv2 operations
- * @params:	address of a pointer to HW-specific parameters
+ * vgic_v2_probe - probe for a GICv2 compatible interrupt controller
+ * @gic_kvm_info:	pointer to the GIC description
+ * @ops:		address of a pointer to the GICv2 operations
+ * @params:		address of a pointer to HW-specific parameters
 *
 * Returns 0 if a GICv2 has been found, with the low level operations
 * in *ops and the HW parameters in *params. Returns an error code
 * otherwise.
 */
-int vgic_v2_probe(struct device_node *vgic_node,
-		  const struct vgic_ops **ops,
-		  const struct vgic_params **params)
+int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
+		   const struct vgic_ops **ops,
+		   const struct vgic_params **params)
 {
 	int ret;
-	struct resource vctrl_res;
-	struct resource vcpu_res;
 	struct vgic_params *vgic = &vgic_v2_params;
+	const struct resource *vctrl_res = &gic_kvm_info->vctrl;
+	const struct resource *vcpu_res = &gic_kvm_info->vcpu;

-	vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0);
-	if (!vgic->maint_irq) {
-		kvm_err("error getting vgic maintenance irq from DT\n");
+	if (!gic_kvm_info->maint_irq) {
+		kvm_err("error getting vgic maintenance irq\n");
 		ret = -ENXIO;
 		goto out;
 	}
+	vgic->maint_irq = gic_kvm_info->maint_irq;

-	ret = of_address_to_resource(vgic_node, 2, &vctrl_res);
-	if (ret) {
-		kvm_err("Cannot obtain GICH resource\n");
+	if (!gic_kvm_info->vctrl.start) {
+		kvm_err("GICH not present in the firmware table\n");
+		ret = -ENXIO;
 		goto out;
 	}

-	vgic->vctrl_base = of_iomap(vgic_node, 2);
+	vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start,
+				   resource_size(&gic_kvm_info->vctrl));
 	if (!vgic->vctrl_base) {
 		kvm_err("Cannot ioremap GICH\n");
 		ret = -ENOMEM;
@@ -228,29 +226,23 @@ int vgic_v2_probe(struct device_node *vgic_node,
 	vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;

 	ret = create_hyp_io_mappings(vgic->vctrl_base,
-				     vgic->vctrl_base + resource_size(&vctrl_res),
-				     vctrl_res.start);
+				     vgic->vctrl_base + resource_size(vctrl_res),
+				     vctrl_res->start);
 	if (ret) {
 		kvm_err("Cannot map VCTRL into hyp\n");
 		goto out_unmap;
 	}

-	if (of_address_to_resource(vgic_node, 3, &vcpu_res)) {
-		kvm_err("Cannot obtain GICV resource\n");
-		ret = -ENXIO;
-		goto out_unmap;
-	}
-
-	if (!PAGE_ALIGNED(vcpu_res.start)) {
+	if (!PAGE_ALIGNED(vcpu_res->start)) {
 		kvm_err("GICV physical address 0x%llx not page aligned\n",
-			(unsigned long long)vcpu_res.start);
+			(unsigned long long)vcpu_res->start);
 		ret = -ENXIO;
 		goto out_unmap;
 	}

-	if (!PAGE_ALIGNED(resource_size(&vcpu_res))) {
+	if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
 		kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-			(unsigned long long)resource_size(&vcpu_res),
+			(unsigned long long)resource_size(vcpu_res),
 			PAGE_SIZE);
 		ret = -ENXIO;
 		goto out_unmap;
@@ -259,10 +251,10 @@ int vgic_v2_probe(struct device_node *vgic_node,
 	vgic->can_emulate_gicv2 = true;
 	kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);

-	vgic->vcpu_base = vcpu_res.start;
+	vgic->vcpu_base = vcpu_res->start;

-	kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
-		 vctrl_res.start, vgic->maint_irq);
+	kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n",
+		 gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq);

 	vgic->type = VGIC_V2;
 	vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
@@ -276,6 +268,5 @@ int vgic_v2_probe(struct device_node *vgic_node,
 out_unmap:
 	iounmap(vgic->vctrl_base);
 out:
-	of_node_put(vgic_node);
 	return ret;
 }
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -20,11 +20,9 @@
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>

 #include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irqchip/arm-gic-common.h>

 #include <asm/kvm_emulate.h>
 #include <asm/kvm_arm.h>
@@ -222,30 +220,24 @@ static void vgic_cpu_init_lrs(void *params)
 }

 /**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT
- * @node:	pointer to the DT node
- * @ops: 	address of a pointer to the GICv3 operations
- * @params:	address of a pointer to HW-specific parameters
+ * vgic_v3_probe - probe for a GICv3 compatible interrupt controller
+ * @gic_kvm_info:	pointer to the GIC description
+ * @ops:		address of a pointer to the GICv3 operations
+ * @params:		address of a pointer to HW-specific parameters
 *
 * Returns 0 if a GICv3 has been found, with the low level operations
 * in *ops and the HW parameters in *params. Returns an error code
 * otherwise.
 */
-int vgic_v3_probe(struct device_node *vgic_node,
+int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
 		  const struct vgic_ops **ops,
 		  const struct vgic_params **params)
 {
 	int ret = 0;
-	u32 gicv_idx;
-	struct resource vcpu_res;
 	struct vgic_params *vgic = &vgic_v3_params;
+	const struct resource *vcpu_res = &gic_kvm_info->vcpu;

-	vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0);
-	if (!vgic->maint_irq) {
-		kvm_err("error getting vgic maintenance irq from DT\n");
-		ret = -ENXIO;
-		goto out;
-	}
+	vgic->maint_irq = gic_kvm_info->maint_irq;

 	ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);

@@ -256,24 +248,19 @@ int vgic_v3_probe(struct device_node *vgic_node,
 	vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
 	vgic->can_emulate_gicv2 = false;

-	if (of_property_read_u32(vgic_node, "#redistributor-regions", &gicv_idx))
-		gicv_idx = 1;
-
-	gicv_idx += 3; /* Also skip GICD, GICC, GICH */
-	if (of_address_to_resource(vgic_node, gicv_idx, &vcpu_res)) {
+	if (!vcpu_res->start) {
 		kvm_info("GICv3: no GICV resource entry\n");
 		vgic->vcpu_base = 0;
-	} else if (!PAGE_ALIGNED(vcpu_res.start)) {
+	} else if (!PAGE_ALIGNED(vcpu_res->start)) {
 		pr_warn("GICV physical address 0x%llx not page aligned\n",
-			(unsigned long long)vcpu_res.start);
+			(unsigned long long)vcpu_res->start);
 		vgic->vcpu_base = 0;
-	} else if (!PAGE_ALIGNED(resource_size(&vcpu_res))) {
+	} else if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
 		pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-			(unsigned long long)resource_size(&vcpu_res),
+			(unsigned long long)resource_size(vcpu_res),
 			PAGE_SIZE);
-		vgic->vcpu_base = 0;
 	} else {
-		vgic->vcpu_base = vcpu_res.start;
+		vgic->vcpu_base = vcpu_res->start;
 		vgic->can_emulate_gicv2 = true;
 		kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
 					KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -286,15 +273,13 @@ int vgic_v3_probe(struct device_node *vgic_node,
 	vgic->type = VGIC_V3;
 	vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS;

-	kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
-		 vcpu_res.start, vgic->maint_irq);
+	kvm_info("GICV base=0x%llx, IRQ=%d\n",
+		 vgic->vcpu_base, vgic->maint_irq);

 	on_each_cpu(vgic_cpu_init_lrs, vgic, 1);

 	*ops = &vgic_v3_ops;
 	*params = vgic;

-out:
-	of_node_put(vgic_node);
 	return ret;
 }
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -21,9 +21,7 @@
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
+#include <linux/irq.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>

@@ -33,6 +31,7 @@
 #include <trace/events/kvm.h>
 #include <asm/kvm.h>
 #include <kvm/iodev.h>
+#include <linux/irqchip/arm-gic-common.h>

 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -2389,33 +2388,38 @@ static struct notifier_block vgic_cpu_nb = {
 	.notifier_call = vgic_cpu_notify,
 };

-static const struct of_device_id vgic_ids[] = {
-	{ .compatible = "arm,cortex-a15-gic",	.data = vgic_v2_probe, },
-	{ .compatible = "arm,cortex-a7-gic",	.data = vgic_v2_probe, },
-	{ .compatible = "arm,gic-400",		.data = vgic_v2_probe, },
-	{ .compatible = "arm,gic-v3",		.data = vgic_v3_probe, },
-	{},
-};
-
-int kvm_vgic_hyp_init(void)
+static int kvm_vgic_probe(void)
 {
-	const struct of_device_id *matched_id;
-	const int (*vgic_probe)(struct device_node *,const struct vgic_ops **,
-				const struct vgic_params **);
-	struct device_node *vgic_node;
+	const struct gic_kvm_info *gic_kvm_info;
 	int ret;

-	vgic_node = of_find_matching_node_and_match(NULL,
-						    vgic_ids, &matched_id);
-	if (!vgic_node) {
-		kvm_err("error: no compatible GIC node found\n");
+	gic_kvm_info = gic_get_kvm_info();
+	if (!gic_kvm_info)
 		return -ENODEV;
+
+	switch (gic_kvm_info->type) {
+	case GIC_V2:
+		ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic);
+		break;
+	case GIC_V3:
+		ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic);
+		break;
+	default:
+		ret = -ENODEV;
 	}

-	vgic_probe = matched_id->data;
-	ret = vgic_probe(vgic_node, &vgic_ops, &vgic);
-	if (ret)
+	return ret;
+}
+
+int kvm_vgic_hyp_init(void)
+{
+	int ret;
+
+	ret = kvm_vgic_probe();
+	if (ret) {
+		kvm_err("error: KVM vGIC probing failed\n");
 		return ret;
+	}

 	ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
 				 "vgic", kvm_get_running_vcpus());