Commit cf0c7125 authored by Marc Zyngier's avatar Marc Zyngier

Merge branch kvm-arm64/mmu/el2-tracking into kvmarm-master/next

* kvm-arm64/mmu/el2-tracking: (25 commits)
  : Enable tracking of page sharing between host EL1 and EL2
  KVM: arm64: Minor optimization of range_is_memory
  KVM: arm64: Make hyp_panic() more robust when protected mode is enabled
  KVM: arm64: Return -EPERM from __pkvm_host_share_hyp()
  KVM: arm64: Make __pkvm_create_mappings static
  KVM: arm64: Restrict EL2 stage-1 changes in protected mode
  KVM: arm64: Refactor protected nVHE stage-1 locking
  KVM: arm64: Remove __pkvm_mark_hyp
  KVM: arm64: Mark host bss and rodata section as shared
  KVM: arm64: Enable retrieving protections attributes of PTEs
  KVM: arm64: Introduce addr_is_memory()
  KVM: arm64: Expose pkvm_hyp_id
  KVM: arm64: Expose host stage-2 manipulation helpers
  KVM: arm64: Add helpers to tag shared pages in SW bits
  KVM: arm64: Allow populating software bits
  KVM: arm64: Enable forcing page-level stage-2 mappings
  KVM: arm64: Tolerate re-creating hyp mappings to set software bits
  KVM: arm64: Don't overwrite software bits with owner id
  KVM: arm64: Rename KVM_PTE_LEAF_ATTR_S2_IGNORED
  KVM: arm64: Optimize host memory aborts
  KVM: arm64: Expose page-table helpers
  ...
Signed-off-by: default avatarMarc Zyngier <maz@kernel.org>
parents 82f8d543 14ecf075
......@@ -59,12 +59,11 @@
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15
#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16
#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16
#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17
#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18
#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19
#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20
#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21
#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20
#ifndef __ASSEMBLY__
......
......@@ -25,6 +25,46 @@ static inline u64 kvm_get_parange(u64 mmfr0)
typedef u64 kvm_pte_t;
#define KVM_PTE_VALID BIT(0)
#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
return pte & KVM_PTE_VALID;
}
static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
{
u64 pa = pte & KVM_PTE_ADDR_MASK;
if (PAGE_SHIFT == 16)
pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
return pa;
}
static inline u64 kvm_granule_shift(u32 level)
{
/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
}
static inline u64 kvm_granule_size(u32 level)
{
return BIT(kvm_granule_shift(level));
}
static inline bool kvm_level_supports_block_mapping(u32 level)
{
/*
* Reject invalid block mappings and don't bother with 4TB mappings for
* 52-bit PAs.
*/
return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
}
/**
* struct kvm_pgtable_mm_ops - Memory management callbacks.
* @zalloc_page: Allocate a single zeroed memory page.
......@@ -75,31 +115,16 @@ enum kvm_pgtable_stage2_flags {
KVM_PGTABLE_S2_IDMAP = BIT(1),
};
/**
* struct kvm_pgtable - KVM page-table.
* @ia_bits: Maximum input address size, in bits.
* @start_level: Level at which the page-table walk starts.
* @pgd: Pointer to the first top-level entry of the page-table.
* @mm_ops: Memory management callbacks.
* @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
*/
struct kvm_pgtable {
u32 ia_bits;
u32 start_level;
kvm_pte_t *pgd;
struct kvm_pgtable_mm_ops *mm_ops;
/* Stage-2 only */
struct kvm_s2_mmu *mmu;
enum kvm_pgtable_stage2_flags flags;
};
/**
* enum kvm_pgtable_prot - Page-table permissions and attributes.
* @KVM_PGTABLE_PROT_X: Execute permission.
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
* @KVM_PGTABLE_PROT_DEVICE: Device attributes.
* @KVM_PGTABLE_PROT_SW0: Software bit 0.
* @KVM_PGTABLE_PROT_SW1: Software bit 1.
* @KVM_PGTABLE_PROT_SW2: Software bit 2.
* @KVM_PGTABLE_PROT_SW3: Software bit 3.
*/
enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_X = BIT(0),
......@@ -107,21 +132,48 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_R = BIT(2),
KVM_PGTABLE_PROT_DEVICE = BIT(3),
KVM_PGTABLE_PROT_SW0 = BIT(55),
KVM_PGTABLE_PROT_SW1 = BIT(56),
KVM_PGTABLE_PROT_SW2 = BIT(57),
KVM_PGTABLE_PROT_SW3 = BIT(58),
};
#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
#define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
#define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
#define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX
#define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW
#define PAGE_HYP KVM_PGTABLE_PROT_RW
#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
enum kvm_pgtable_prot prot);
/**
* struct kvm_mem_range - Range of Intermediate Physical Addresses
* @start: Start of the range.
* @end: End of the range.
* struct kvm_pgtable - KVM page-table.
* @ia_bits: Maximum input address size, in bits.
* @start_level: Level at which the page-table walk starts.
* @pgd: Pointer to the first top-level entry of the page-table.
* @mm_ops: Memory management callbacks.
* @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
* @flags: Stage-2 page-table flags.
* @force_pte_cb: Function that returns true if page level mappings must
* be used instead of block mappings.
*/
struct kvm_mem_range {
u64 start;
u64 end;
struct kvm_pgtable {
u32 ia_bits;
u32 start_level;
kvm_pte_t *pgd;
struct kvm_pgtable_mm_ops *mm_ops;
/* Stage-2 only */
struct kvm_s2_mmu *mmu;
enum kvm_pgtable_stage2_flags flags;
kvm_pgtable_force_pte_cb_t force_pte_cb;
};
/**
......@@ -216,21 +268,24 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
/**
* kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
* __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
* @pgt: Uninitialised page-table structure to initialise.
* @arch: Arch-specific KVM structure representing the guest virtual
* machine.
* @mm_ops: Memory management callbacks.
* @flags: Stage-2 configuration flags.
* @force_pte_cb: Function that returns true if page level mappings must
* be used instead of block mappings.
*
* Return: 0 on success, negative error code on failure.
*/
int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
struct kvm_pgtable_mm_ops *mm_ops,
enum kvm_pgtable_stage2_flags flags);
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch,
struct kvm_pgtable_mm_ops *mm_ops,
enum kvm_pgtable_stage2_flags flags,
kvm_pgtable_force_pte_cb_t force_pte_cb);
#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
__kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL)
/**
* kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
......@@ -374,7 +429,8 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
* If there is a valid, leaf page-table entry used to translate @addr, then
* relax the permissions in that entry according to the read, write and
* execute permissions specified by @prot. No permissions are removed, and
* TLB invalidation is performed after updating the entry.
* TLB invalidation is performed after updating the entry. Software bits cannot
* be set or cleared using kvm_pgtable_stage2_relax_perms().
*
* Return: 0 on success, negative error code on failure.
*/
......@@ -453,22 +509,22 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
kvm_pte_t *ptep, u32 *level);
/**
* kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
* Addresses with compatible permission
* attributes.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Address that must be covered by the range.
* @prot: Protection attributes that the range must be compatible with.
* @range: Range structure used to limit the search space at call time and
* that will hold the result.
* kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a
* stage-2 Page-Table Entry.
* @pte: Page-table entry
*
* The offset of @addr within a page is ignored. An IPA is compatible with @prot
* iff its corresponding stage-2 page-table entry has default ownership and, if
* valid, is mapped with protection attributes identical to @prot.
* Return: protection attributes of the page-table entry in the enum
* kvm_pgtable_prot format.
*/
enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte);
/**
* kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1
* Page-Table Entry.
* @pte: Page-table entry
*
* Return: 0 on success, negative error code on failure.
* Return: protection attributes of the page-table entry in the enum
* kvm_pgtable_prot format.
*/
int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot,
struct kvm_mem_range *range);
enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte);
#endif /* __ARM64_KVM_PGTABLE_H__ */
......@@ -46,6 +46,15 @@ if KVM
source "virt/kvm/Kconfig"
config NVHE_EL2_DEBUG
bool "Debug mode for non-VHE EL2 object"
help
Say Y here to enable the debug mode for the non-VHE KVM EL2 object.
Failure reports will BUG() in the hypervisor. This is intended for
local EL2 hypervisor development.
If unsure, say N.
endif # KVM
endif # VIRTUALIZATION
......@@ -91,10 +91,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.return_nisv_io_abort_to_user = true;
break;
case KVM_CAP_ARM_MTE:
if (!system_supports_mte() || kvm->created_vcpus)
return -EINVAL;
r = 0;
kvm->arch.mte_enabled = true;
mutex_lock(&kvm->lock);
if (!system_supports_mte() || kvm->created_vcpus) {
r = -EINVAL;
} else {
r = 0;
kvm->arch.mte_enabled = true;
}
mutex_unlock(&kvm->lock);
break;
default:
r = -EINVAL;
......@@ -1946,62 +1950,17 @@ static void _kvm_host_prot_finalize(void *discard)
WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
}
static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
{
return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end);
}
#define pkvm_mark_hyp_section(__section) \
pkvm_mark_hyp(__pa_symbol(__section##_start), \
__pa_symbol(__section##_end))
static int finalize_hyp_mode(void)
{
int cpu, ret;
if (!is_protected_kvm_enabled())
return 0;
ret = pkvm_mark_hyp_section(__hyp_idmap_text);
if (ret)
return ret;
ret = pkvm_mark_hyp_section(__hyp_text);
if (ret)
return ret;
ret = pkvm_mark_hyp_section(__hyp_rodata);
if (ret)
return ret;
/*
* Exclude HYP BSS from kmemleak so that it doesn't get peeked
* at, which would end badly once the section is inaccessible.
* None of other sections should ever be introspected.
*/
kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
ret = pkvm_mark_hyp_section(__hyp_bss);
if (ret)
return ret;
ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size);
if (ret)
return ret;
for_each_possible_cpu(cpu) {
phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]);
phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order());
ret = pkvm_mark_hyp(start, end);
if (ret)
return ret;
start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu));
end = start + PAGE_SIZE;
ret = pkvm_mark_hyp(start, end);
if (ret)
return ret;
}
/*
* Flip the static key upfront as that may no longer be possible
......
......@@ -292,11 +292,12 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
}
void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
u64 elr_virt, u64 elr_phys,
u64 par, uintptr_t vcpu,
u64 far, u64 hpfar) {
u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr));
u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr;
u64 elr_in_kimg = __phys_to_kimg(elr_phys);
u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
u64 mode = spsr & PSR_MODE_MASK;
/*
......@@ -309,20 +310,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
kvm_err("Invalid host exception to nVHE hyp!\n");
} else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
(esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
struct bug_entry *bug = find_bug(elr_in_kimg);
const char *file = NULL;
unsigned int line = 0;
/* All hyp bugs, including warnings, are treated as fatal. */
if (bug)
bug_get_file_line(bug, &file, &line);
if (!is_protected_kvm_enabled() ||
IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
struct bug_entry *bug = find_bug(elr_in_kimg);
if (bug)
bug_get_file_line(bug, &file, &line);
}
if (file)
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset);
kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset);
} else {
kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset);
kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset);
}
/*
......@@ -334,5 +339,5 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);
panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
spsr, elr, esr, far, hpfar, par, vcpu);
spsr, elr_virt, esr, far, hpfar, par, vcpu);
}
......@@ -12,6 +12,32 @@
#include <asm/virt.h>
#include <nvhe/spinlock.h>
/*
* SW bits 0-1 are reserved to track the memory ownership state of each page:
* 00: The page is owned exclusively by the page-table owner.
* 01: The page is owned by the page-table owner, but is shared
* with another entity.
* 10: The page is shared with, but not owned by the page-table owner.
* 11: Reserved for future use (lending).
*/
enum pkvm_page_state {
PKVM_PAGE_OWNED = 0ULL,
PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0,
PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1,
};
#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
enum pkvm_page_state state)
{
return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
}
static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
{
return prot & PKVM_PAGE_STATE_PROT_MASK;
}
struct host_kvm {
struct kvm_arch arch;
struct kvm_pgtable pgt;
......@@ -20,9 +46,14 @@ struct host_kvm {
};
extern struct host_kvm host_kvm;
extern const u8 pkvm_hyp_id;
int __pkvm_prot_finalize(void);
int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
int __pkvm_host_share_hyp(u64 pfn);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id);
int kvm_host_prepare_stage2(void *pgt_pool_base);
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
......
......@@ -23,8 +23,7 @@ int hyp_map_vectors(void);
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int __pkvm_create_mappings(unsigned long start, unsigned long size,
unsigned long phys, enum kvm_pgtable_prot prot);
int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot);
unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
enum kvm_pgtable_prot prot);
......
......@@ -15,6 +15,7 @@
#include <asm/alternative.h>
#include <asm/lse.h>
#include <asm/rwonce.h>
typedef union hyp_spinlock {
u32 __val;
......@@ -89,4 +90,28 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock)
: "memory");
}
static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock)
{
hyp_spinlock_t lockval = READ_ONCE(*lock);
return lockval.owner != lockval.next;
}
#ifdef CONFIG_NVHE_EL2_DEBUG
static inline void hyp_assert_lock_held(hyp_spinlock_t *lock)
{
/*
* The __pkvm_init() path accesses protected data-structures without
* holding locks as the other CPUs are guaranteed to not enter EL2
* concurrently at this point in time. The point by which EL2 is
* initialized on all CPUs is reflected in the pkvm static key, so
* wait until it is set before checking the lock state.
*/
if (static_branch_likely(&kvm_protected_mode_initialized))
BUG_ON(!hyp_spin_is_locked(lock));
}
#else
static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { }
#endif
#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
......@@ -7,6 +7,7 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
......@@ -85,12 +86,24 @@ SYM_FUNC_START(__hyp_do_panic)
mov x29, x0
#ifdef CONFIG_NVHE_EL2_DEBUG
/* Ensure host stage-2 is disabled */
mrs x0, hcr_el2
bic x0, x0, #HCR_VM
msr hcr_el2, x0
isb
tlbi vmalls12e1
dsb nsh
#endif
/* Load the panic arguments into x0-7 */
mrs x0, esr_el2
get_vcpu_ptr x4, x5
mrs x5, far_el2
mrs x6, hpfar_el2
mov x7, xzr // Unused argument
mov x4, x3
mov x3, x2
hyp_pa x3, x6
get_vcpu_ptr x5, x6
mrs x6, far_el2
mrs x7, hpfar_el2
/* Enter the host, conditionally restoring the host context. */
cbz x29, __host_enter_without_restoring
......
......@@ -140,14 +140,11 @@ static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
}
static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(unsigned long, start, host_ctxt, 1);
DECLARE_REG(unsigned long, size, host_ctxt, 2);
DECLARE_REG(unsigned long, phys, host_ctxt, 3);
DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
DECLARE_REG(u64, pfn, host_ctxt, 1);
cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn);
}
static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
......@@ -163,14 +160,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
{
cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
}
static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end);
}
typedef void (*hcall_t)(struct kvm_cpu_context *);
#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
......@@ -193,10 +182,9 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_restore_aprs),
HANDLE_FUNC(__pkvm_init),
HANDLE_FUNC(__pkvm_cpu_set_vector),
HANDLE_FUNC(__pkvm_create_mappings),
HANDLE_FUNC(__pkvm_host_share_hyp),
HANDLE_FUNC(__pkvm_create_private_mapping),
HANDLE_FUNC(__pkvm_prot_finalize),
HANDLE_FUNC(__pkvm_mark_hyp),
};
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
......
......@@ -31,7 +31,7 @@ static struct hyp_pool host_s2_pool;
u64 id_aa64mmfr0_el1_sys_val;
u64 id_aa64mmfr1_el1_sys_val;
static const u8 pkvm_hyp_id = 1;
const u8 pkvm_hyp_id = 1;
static void *host_s2_zalloc_pages_exact(size_t size)
{
......@@ -89,6 +89,8 @@ static void prepare_host_vtcr(void)
id_aa64mmfr1_el1_sys_val, phys_shift);
}
static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
int kvm_host_prepare_stage2(void *pgt_pool_base)
{
struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
......@@ -101,8 +103,9 @@ int kvm_host_prepare_stage2(void *pgt_pool_base)
if (ret)
return ret;
ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch,
&host_kvm.mm_ops, KVM_HOST_S2_FLAGS);
ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch,
&host_kvm.mm_ops, KVM_HOST_S2_FLAGS,
host_stage2_force_pte_cb);
if (ret)
return ret;
......@@ -159,6 +162,11 @@ static int host_stage2_unmap_dev_all(void)
return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}
struct kvm_mem_range {
u64 start;
u64 end;
};
static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
{
int cur, left = 0, right = hyp_memblock_nr;
......@@ -189,16 +197,26 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
return false;
}
bool addr_is_memory(phys_addr_t phys)
{
struct kvm_mem_range range;
return find_mem_range(phys, &range);
}
static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
{
return range->start <= addr && addr < range->end;
}
static bool range_is_memory(u64 start, u64 end)
{
struct kvm_mem_range r1, r2;
struct kvm_mem_range r;
if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2))
return false;
if (r1.start != r2.start)
if (!find_mem_range(start, &r))
return false;
return true;
return is_in_mem_range(end - 1, &r);
}
static inline int __host_stage2_idmap(u64 start, u64 end,
......@@ -208,60 +226,208 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
prot, &host_s2_pool);
}
/*
* The pool has been provided with enough pages to cover all of memory with
* page granularity, but it is difficult to know how much of the MMIO range
* we will need to cover upfront, so we may need to 'recycle' the pages if we
* run out.
*/
#define host_stage2_try(fn, ...) \
({ \
int __ret; \
hyp_assert_lock_held(&host_kvm.lock); \
__ret = fn(__VA_ARGS__); \
if (__ret == -ENOMEM) { \
__ret = host_stage2_unmap_dev_all(); \
if (!__ret) \
__ret = fn(__VA_ARGS__); \
} \
__ret; \
})
static inline bool range_included(struct kvm_mem_range *child,
struct kvm_mem_range *parent)
{
return parent->start <= child->start && child->end <= parent->end;
}
static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
{
struct kvm_mem_range cur;
kvm_pte_t pte;
u32 level;
int ret;
hyp_assert_lock_held(&host_kvm.lock);
ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level);
if (ret)
return ret;
if (kvm_pte_valid(pte))
return -EAGAIN;
if (pte)
return -EPERM;
do {
u64 granule = kvm_granule_size(level);
cur.start = ALIGN_DOWN(addr, granule);
cur.end = cur.start + granule;
level++;
} while ((level < KVM_PGTABLE_MAX_LEVELS) &&
!(kvm_level_supports_block_mapping(level) &&
range_included(&cur, range)));
*range = cur;
return 0;
}
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
enum kvm_pgtable_prot prot)
{
hyp_assert_lock_held(&host_kvm.lock);
return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
}
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
hyp_assert_lock_held(&host_kvm.lock);
return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
addr, size, &host_s2_pool, owner_id);
}
static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
{
/*
* Block mappings must be used with care in the host stage-2 as a
* kvm_pgtable_stage2_map() operation targeting a page in the range of
* an existing block will delete the block under the assumption that
* mappings in the rest of the block range can always be rebuilt lazily.
* That assumption is correct for the host stage-2 with RWX mappings
* targeting memory or RW mappings targeting MMIO ranges (see
* host_stage2_idmap() below which implements some of the host memory
* abort logic). However, this is not safe for any other mappings where
* the host stage-2 page-table is in fact the only place where this
* state is stored. In all those cases, it is safer to use page-level
* mappings, hence avoiding to lose the state because of side-effects in
* kvm_pgtable_stage2_map().
*/
if (range_is_memory(addr, end))
return prot != PKVM_HOST_MEM_PROT;
else
return prot != PKVM_HOST_MMIO_PROT;
}
static int host_stage2_idmap(u64 addr)
{
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
struct kvm_mem_range range;
bool is_memory = find_mem_range(addr, &range);
enum kvm_pgtable_prot prot;
int ret;
if (is_memory)
prot |= KVM_PGTABLE_PROT_X;
prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT;
hyp_spin_lock(&host_kvm.lock);
ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
ret = host_stage2_adjust_range(addr, &range);
if (ret)
goto unlock;
ret = __host_stage2_idmap(range.start, range.end, prot);
if (ret != -ENOMEM)
ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot);
unlock:
hyp_spin_unlock(&host_kvm.lock);
return ret;
}
static inline bool check_prot(enum kvm_pgtable_prot prot,
enum kvm_pgtable_prot required,
enum kvm_pgtable_prot denied)
{
return (prot & (required | denied)) == required;
}
int __pkvm_host_share_hyp(u64 pfn)
{
phys_addr_t addr = hyp_pfn_to_phys(pfn);
enum kvm_pgtable_prot prot, cur;
void *virt = __hyp_va(addr);
enum pkvm_page_state state;
kvm_pte_t pte;
int ret;
if (!addr_is_memory(addr))
return -EINVAL;
hyp_spin_lock(&host_kvm.lock);
hyp_spin_lock(&pkvm_pgd_lock);
ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL);
if (ret)
goto unlock;
if (!pte)
goto map_shared;
/*
* The pool has been provided with enough pages to cover all of memory
* with page granularity, but it is difficult to know how much of the
* MMIO range we will need to cover upfront, so we may need to 'recycle'
* the pages if we run out.
* Check attributes in the host stage-2 PTE. We need the page to be:
* - mapped RWX as we're sharing memory;
* - not borrowed, as that implies absence of ownership.
* Otherwise, we can't let it got through
*/
ret = host_stage2_unmap_dev_all();
if (ret)
cur = kvm_pgtable_stage2_pte_prot(pte);
prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED);
if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) {
ret = -EPERM;
goto unlock;
}
ret = __host_stage2_idmap(range.start, range.end, prot);
state = pkvm_getstate(cur);
if (state == PKVM_PAGE_OWNED)
goto map_shared;
unlock:
hyp_spin_unlock(&host_kvm.lock);
/*
* Tolerate double-sharing the same page, but this requires
* cross-checking the hypervisor stage-1.
*/
if (state != PKVM_PAGE_SHARED_OWNED) {
ret = -EPERM;
goto unlock;
}
return ret;
}
ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL);
if (ret)
goto unlock;
int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
{
int ret;
/*
* If the page has been shared with the hypervisor, it must be
* already mapped as SHARED_BORROWED in its stage-1.
*/
cur = kvm_pgtable_hyp_pte_prot(pte);
prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
if (!check_prot(cur, prot, ~prot))
ret = -EPERM;
goto unlock;
map_shared:
/*
* host_stage2_unmap_dev_all() currently relies on MMIO mappings being
* non-persistent, so don't allow changing page ownership in MMIO range.
* If the page is not yet shared, adjust mappings in both page-tables
* while both locks are held.
*/
if (!range_is_memory(start, end))
return -EINVAL;
prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot);
BUG_ON(ret);
hyp_spin_lock(&host_kvm.lock);
ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
&host_s2_pool, pkvm_hyp_id);
prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot);
BUG_ON(ret);
unlock:
hyp_spin_unlock(&pkvm_pgd_lock);
hyp_spin_unlock(&host_kvm.lock);
return ret != -EAGAIN ? ret : 0;
return ret;
}
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
......
......@@ -23,8 +23,8 @@ u64 __io_map_base;
struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
unsigned int hyp_memblock_nr;
int __pkvm_create_mappings(unsigned long start, unsigned long size,
unsigned long phys, enum kvm_pgtable_prot prot)
static int __pkvm_create_mappings(unsigned long start, unsigned long size,
unsigned long phys, enum kvm_pgtable_prot prot)
{
int err;
......@@ -67,13 +67,15 @@ unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
return addr;
}
int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot)
{
unsigned long start = (unsigned long)from;
unsigned long end = (unsigned long)to;
unsigned long virt_addr;
phys_addr_t phys;
hyp_assert_lock_held(&pkvm_pgd_lock);
start = start & PAGE_MASK;
end = PAGE_ALIGN(end);
......@@ -81,7 +83,8 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
int err;
phys = hyp_virt_to_phys((void *)virt_addr);
err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot);
err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE,
phys, prot);
if (err)
return err;
}
......@@ -89,6 +92,17 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return 0;
}
int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
int ret;
hyp_spin_lock(&pkvm_pgd_lock);
ret = pkvm_create_mappings_locked(from, to, prot);
hyp_spin_unlock(&pkvm_pgd_lock);
return ret;
}
int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
{
unsigned long start, end;
......
......@@ -58,6 +58,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
{
void *start, *end, *virt = hyp_phys_to_virt(phys);
unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
enum kvm_pgtable_prot prot;
int ret, i;
/* Recreate the hyp page-table using the early page allocator */
......@@ -83,10 +84,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
if (ret)
return ret;
ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
if (ret)
return ret;
......@@ -95,10 +92,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
if (ret)
return ret;
ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
if (ret)
return ret;
......@@ -117,6 +110,24 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
return ret;
}
/*
* Map the host's .bss and .rodata sections RO in the hypervisor, but
* transfer the ownership from the host to the hypervisor itself to
* make sure it can't be donated or shared with another entity.
*
* The ownership transition requires matching changes in the host
* stage-2. This will be done later (see finalize_host_mappings()) once
* the hyp_vmemmap is addressable.
*/
prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED);
ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot);
if (ret)
return ret;
ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot);
if (ret)
return ret;
return 0;
}
......@@ -148,6 +159,57 @@ static void hpool_put_page(void *addr)
hyp_put_page(&hpool, addr);
}
static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
enum kvm_pgtable_prot prot;
enum pkvm_page_state state;
kvm_pte_t pte = *ptep;
phys_addr_t phys;
if (!kvm_pte_valid(pte))
return 0;
if (level != (KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
phys = kvm_pte_to_phys(pte);
if (!addr_is_memory(phys))
return 0;
/*
* Adjust the host stage-2 mappings to match the ownership attributes
* configured in the hypervisor stage-1.
*/
state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
switch (state) {
case PKVM_PAGE_OWNED:
return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id);
case PKVM_PAGE_SHARED_OWNED:
prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
break;
case PKVM_PAGE_SHARED_BORROWED:
prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
break;
default:
return -EINVAL;
}
return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
}
static int finalize_host_mappings(void)
{
struct kvm_pgtable_walker walker = {
.cb = finalize_host_mappings_walker,
.flags = KVM_PGTABLE_WALK_LEAF,
};
return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker);
}
void __noreturn __pkvm_init_finalise(void)
{
struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
......@@ -167,6 +229,10 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
ret = finalize_host_mappings();
if (ret)
goto out;
pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_page = hyp_zalloc_hyp_page,
.phys_to_virt = hyp_phys_to_virt,
......
This diff is collapsed.
......@@ -260,10 +260,8 @@ static int __create_hyp_mappings(unsigned long start, unsigned long size,
{
int err;
if (!kvm_host_owns_hyp_mappings()) {
return kvm_call_hyp_nvhe(__pkvm_create_mappings,
start, size, phys, prot);
}
if (WARN_ON(!kvm_host_owns_hyp_mappings()))
return -EINVAL;
mutex_lock(&kvm_hyp_pgd_mutex);
err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
......@@ -283,6 +281,21 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
}
}
static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end)
{
phys_addr_t addr;
int ret;
for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) {
ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp,
__phys_to_pfn(addr));
if (ret)
return ret;
}
return 0;
}
/**
* create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
* @from: The virtual kernel start address of the range
......@@ -303,6 +316,13 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
if (is_kernel_in_hyp_mode())
return 0;
if (!kvm_host_owns_hyp_mappings()) {
if (WARN_ON(prot != PAGE_HYP))
return -EPERM;
return pkvm_share_hyp(kvm_kaddr_to_phys(from),
kvm_kaddr_to_phys(to));
}
start = start & PAGE_MASK;
end = PAGE_ALIGN(end);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment