Commit adde0476 authored by Marc Zyngier

Merge branch kvm-arm64/selftest/s2-faults into kvmarm-master/next

* kvm-arm64/selftest/s2-faults:
  : .
  : New KVM/arm64 selftests exercising various sorts of S2 faults, courtesy
  : of Ricardo Koller. From the cover letter:
  :
  : "This series adds a new aarch64 selftest for testing stage 2 fault handling
  : for various combinations of guest accesses (e.g., write, S1PTW), backing
  : sources (e.g., anon), and types of faults (e.g., read on hugetlbfs with a
  : hole, write on a readonly memslot). Each test tries a different combination
  : and then checks that the access results in the right behavior (e.g., uffd
  : faults with the right address and write/read flag). [...]"
  : .
  KVM: selftests: aarch64: Add mix of tests into page_fault_test
  KVM: selftests: aarch64: Add readonly memslot tests into page_fault_test
  KVM: selftests: aarch64: Add dirty logging tests into page_fault_test
  KVM: selftests: aarch64: Add userfaultfd tests into page_fault_test
  KVM: selftests: aarch64: Add aarch64/page_fault_test
  KVM: selftests: Use the right memslot for code, page-tables, and data allocations
  KVM: selftests: Fix alignment in virt_arch_pgd_alloc() and vm_vaddr_alloc()
  KVM: selftests: Add vm->memslots[] and enum kvm_mem_region_type
  KVM: selftests: Stash backing_src_type in struct userspace_mem_region
  tools: Copy bitfield.h from the kernel sources
  KVM: selftests: aarch64: Construct DEFAULT_MAIR_EL1 using sysreg.h macros
  KVM: selftests: Add missing close and munmap in __vm_mem_region_delete()
  KVM: selftests: aarch64: Add virt_get_pte_hva() library function
  KVM: selftests: Add a userfaultfd library
Signed-off-by: Marc Zyngier <maz@kernel.org>
parents 02f6fdd4 ff2b5509
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2014 Felix Fietkau <nbd@nbd.name>
* Copyright (C) 2004 - 2009 Ivo van Doorn <IvDoorn@gmail.com>
*/
#ifndef _LINUX_BITFIELD_H
#define _LINUX_BITFIELD_H
#include <linux/build_bug.h>
#include <asm/byteorder.h>
/*
* Bitfield access macros
*
* FIELD_{GET,PREP} macros take a shifted mask as their first parameter,
* from which they extract the base mask and the shift amount.
* The mask must be a compile-time constant.
*
* Example:
*
* #define REG_FIELD_A GENMASK(6, 0)
* #define REG_FIELD_B BIT(7)
* #define REG_FIELD_C GENMASK(15, 8)
* #define REG_FIELD_D GENMASK(31, 16)
*
* Get:
* a = FIELD_GET(REG_FIELD_A, reg);
* b = FIELD_GET(REG_FIELD_B, reg);
*
* Set:
* reg = FIELD_PREP(REG_FIELD_A, 1) |
* FIELD_PREP(REG_FIELD_B, 0) |
* FIELD_PREP(REG_FIELD_C, c) |
* FIELD_PREP(REG_FIELD_D, 0x40);
*
* Modify:
* reg &= ~REG_FIELD_C;
* reg |= FIELD_PREP(REG_FIELD_C, c);
*/
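/*
* A brief illustrative sketch (reusing the REG_FIELD_* masks from the example
* above; "c" and "d_max" are hypothetical variables) of the FIELD_MAX() and
* FIELD_FIT() helpers defined below:
*
* if (FIELD_FIT(REG_FIELD_C, c))
*	reg |= FIELD_PREP(REG_FIELD_C, c);
* d_max = FIELD_MAX(REG_FIELD_D);	yields 0xffff for GENMASK(31, 16)
*/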
#define __bf_shf(x) (__builtin_ffsll(x) - 1)
#define __scalar_type_to_unsigned_cases(type) \
unsigned type: (unsigned type)0, \
signed type: (unsigned type)0
#define __unsigned_scalar_typeof(x) typeof( \
_Generic((x), \
char: (unsigned char)0, \
__scalar_type_to_unsigned_cases(char), \
__scalar_type_to_unsigned_cases(short), \
__scalar_type_to_unsigned_cases(int), \
__scalar_type_to_unsigned_cases(long), \
__scalar_type_to_unsigned_cases(long long), \
default: (x)))
#define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x))
#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \
({ \
BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \
_pfx "mask is not constant"); \
BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \
BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \
~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \
_pfx "value too large for the field"); \
BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) > \
__bf_cast_unsigned(_reg, ~0ull), \
_pfx "type of reg too small for mask"); \
__BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \
(1ULL << __bf_shf(_mask))); \
})
/**
* FIELD_MAX() - produce the maximum value representable by a field
* @_mask: shifted mask defining the field's length and position
*
* FIELD_MAX() returns the maximum value that can be held in the field
* specified by @_mask.
*/
#define FIELD_MAX(_mask) \
({ \
__BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \
(typeof(_mask))((_mask) >> __bf_shf(_mask)); \
})
/**
* FIELD_FIT() - check if value fits in the field
* @_mask: shifted mask defining the field's length and position
* @_val: value to test against the field
*
* Return: true if @_val can fit inside @_mask, false if @_val is too big.
*/
#define FIELD_FIT(_mask, _val) \
({ \
__BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \
!((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \
})
/**
* FIELD_PREP() - prepare a bitfield element
* @_mask: shifted mask defining the field's length and position
* @_val: value to put in the field
*
* FIELD_PREP() masks and shifts up the value. The result should
* be combined with other fields of the bitfield using bitwise OR.
*/
#define FIELD_PREP(_mask, _val) \
({ \
__BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \
((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \
})
/**
* FIELD_GET() - extract a bitfield element
* @_mask: shifted mask defining the field's length and position
* @_reg: value of entire bitfield
*
* FIELD_GET() extracts the field specified by @_mask from the
* bitfield passed in as @_reg by masking and shifting it down.
*/
#define FIELD_GET(_mask, _reg) \
({ \
__BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \
(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \
})
extern void __compiletime_error("value doesn't fit into mask")
__field_overflow(void);
extern void __compiletime_error("bad bitfield mask")
__bad_mask(void);
static __always_inline u64 field_multiplier(u64 field)
{
if ((field | (field - 1)) & ((field | (field - 1)) + 1))
__bad_mask();
return field & -field;
}
static __always_inline u64 field_mask(u64 field)
{
return field / field_multiplier(field);
}
#define field_max(field) ((typeof(field))field_mask(field))
#define ____MAKE_OP(type,base,to,from) \
static __always_inline __##type type##_encode_bits(base v, base field) \
{ \
if (__builtin_constant_p(v) && (v & ~field_mask(field))) \
__field_overflow(); \
return to((v & field_mask(field)) * field_multiplier(field)); \
} \
static __always_inline __##type type##_replace_bits(__##type old, \
base val, base field) \
{ \
return (old & ~to(field)) | type##_encode_bits(val, field); \
} \
static __always_inline void type##p_replace_bits(__##type *p, \
base val, base field) \
{ \
*p = (*p & ~to(field)) | type##_encode_bits(val, field); \
} \
static __always_inline base type##_get_bits(__##type v, base field) \
{ \
return (from(v) & field)/field_multiplier(field); \
}
#define __MAKE_OP(size) \
____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \
____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \
____MAKE_OP(u##size,u##size,,)
____MAKE_OP(u8,u8,,)
__MAKE_OP(16)
__MAKE_OP(32)
__MAKE_OP(64)
#undef __MAKE_OP
#undef ____MAKE_OP
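/*
* A brief illustrative sketch of the helpers generated above, which apply the
* same mask-and-shift logic to fixed-endian (__le/__be) and host-endian
* values ("reg" and "val" are hypothetical variables):
*
* __le32 reg = cpu_to_le32(0x00ab1234);
* u32 val = le32_get_bits(reg, GENMASK(23, 16));	val == 0xab
* le32p_replace_bits(&reg, 0xcd, GENMASK(23, 16));	reg now holds 0x00cd1234
*/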
#endif
......@@ -4,6 +4,7 @@
/aarch64/debug-exceptions
/aarch64/get-reg-list
/aarch64/hypercalls
/aarch64/page_fault_test
/aarch64/psci_test
/aarch64/vcpu_width_config
/aarch64/vgic_init
......
......@@ -47,6 +47,7 @@ LIBKVM += lib/perf_test_util.c
LIBKVM += lib/rbtree.c
LIBKVM += lib/sparsebit.c
LIBKVM += lib/test_util.c
LIBKVM += lib/userfaultfd_util.c
LIBKVM_STRING += lib/string_override.c
......@@ -152,6 +153,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
TEST_GEN_PROGS_aarch64 += aarch64/hypercalls
TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test
TEST_GEN_PROGS_aarch64 += aarch64/psci_test
TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config
TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
......
// SPDX-License-Identifier: GPL-2.0
/*
* page_fault_test.c - Test stage 2 faults.
*
* This test tries different combinations of guest accesses (e.g., write,
* S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on
* hugetlbfs with a hole). It checks that the expected handling method is
* called (e.g., uffd faults with the right address and write/read flag).
*/
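/*
* For example, the tests[] entry
*
* TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
*	    uffd_data_write_handler, uffd_pt_write_handler, 2)
*
* punches holes in the data and page-table backing stores, has the guest
* perform a 64-bit write, and expects exactly two userfaultfd write faults:
* one for the data page and one for the stage-1 page-table page touched by
* the S1PTW.
*/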
#define _GNU_SOURCE
#include <linux/bitmap.h>
#include <fcntl.h>
#include <test_util.h>
#include <kvm_util.h>
#include <processor.h>
#include <asm/sysreg.h>
#include <linux/bitfield.h>
#include "guest_modes.h"
#include "userfaultfd_util.h"
/* Guest virtual addresses that point to the test page and its PTE. */
#define TEST_GVA 0xc0000000
#define TEST_EXEC_GVA (TEST_GVA + 0x8)
#define TEST_PTE_GVA 0xb0000000
#define TEST_DATA 0x0123456789ABCDEF
static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA;
#define CMD_NONE (0)
#define CMD_SKIP_TEST (1ULL << 1)
#define CMD_HOLE_PT (1ULL << 2)
#define CMD_HOLE_DATA (1ULL << 3)
#define CMD_CHECK_WRITE_IN_DIRTY_LOG (1ULL << 4)
#define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG (1ULL << 5)
#define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG (1ULL << 6)
#define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG (1ULL << 7)
#define CMD_SET_PTE_AF (1ULL << 8)
#define PREPARE_FN_NR 10
#define CHECK_FN_NR 10
static struct event_cnt {
int mmio_exits;
int fail_vcpu_runs;
int uffd_faults;
/* uffd_faults is incremented from multiple threads. */
pthread_mutex_t uffd_faults_mutex;
} events;
struct test_desc {
const char *name;
uint64_t mem_mark_cmd;
/* Skip the test if any prepare function returns false */
bool (*guest_prepare[PREPARE_FN_NR])(void);
void (*guest_test)(void);
void (*guest_test_check[CHECK_FN_NR])(void);
uffd_handler_t uffd_pt_handler;
uffd_handler_t uffd_data_handler;
void (*dabt_handler)(struct ex_regs *regs);
void (*iabt_handler)(struct ex_regs *regs);
void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run);
void (*fail_vcpu_run_handler)(int ret);
uint32_t pt_memslot_flags;
uint32_t data_memslot_flags;
bool skip;
struct event_cnt expected_events;
};
struct test_params {
enum vm_mem_backing_src_type src_type;
struct test_desc *test_desc;
};
static inline void flush_tlb_page(uint64_t vaddr)
{
uint64_t page = vaddr >> 12;
dsb(ishst);
asm volatile("tlbi vaae1is, %0" :: "r" (page));
dsb(ish);
isb();
}
static void guest_write64(void)
{
uint64_t val;
WRITE_ONCE(*guest_test_memory, TEST_DATA);
val = READ_ONCE(*guest_test_memory);
GUEST_ASSERT_EQ(val, TEST_DATA);
}
/* Check whether the CPU implements LSE atomic instructions (FEAT_LSE). */
static bool guest_check_lse(void)
{
uint64_t isar0 = read_sysreg(id_aa64isar0_el1);
uint64_t atomic;
atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS), isar0);
return atomic >= 2;
}
static bool guest_check_dc_zva(void)
{
uint64_t dczid = read_sysreg(dczid_el0);
uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_DZP), dczid);
return dzp == 0;
}
/* Compare and swap instruction. */
static void guest_cas(void)
{
uint64_t val;
GUEST_ASSERT(guest_check_lse());
asm volatile(".arch_extension lse\n"
"casal %0, %1, [%2]\n"
:: "r" (0), "r" (TEST_DATA), "r" (guest_test_memory));
val = READ_ONCE(*guest_test_memory);
GUEST_ASSERT_EQ(val, TEST_DATA);
}
static void guest_read64(void)
{
uint64_t val;
val = READ_ONCE(*guest_test_memory);
GUEST_ASSERT_EQ(val, 0);
}
/* Address translation instruction */
static void guest_at(void)
{
uint64_t par;
asm volatile("at s1e1r, %0" :: "r" (guest_test_memory));
isb();
par = read_sysreg(par_el1);
/* PAR_EL1.F (bit 0) is clear when the translation succeeded. */
GUEST_ASSERT_EQ(par & 1, 0);
}
/*
* The size of the block written by "dc zva" is (4 << DCZID_EL0.BS) bytes,
* i.e. between 4 bytes (BS == 0) and 2KB (BS == 9), which is safe in our
* case as we need the write to cover at least a word and not more than a page.
*/
static void guest_dc_zva(void)
{
uint16_t val;
asm volatile("dc zva, %0" :: "r" (guest_test_memory));
dsb(ish);
val = READ_ONCE(*guest_test_memory);
GUEST_ASSERT_EQ(val, 0);
}
/*
* Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0).
* KVM must take special care with those: they should still count as
* accesses for dirty logging and userfaultfd, but must be handled
* differently on MMIO.
*/
static void guest_ld_preidx(void)
{
uint64_t val;
uint64_t addr = TEST_GVA - 8;
/*
* This ends up accessing "TEST_GVA - 8 + 8" == TEST_GVA. The base address,
* "TEST_GVA - 8", is in a gap between memslots not backed by anything.
*/
asm volatile("ldr %0, [%1, #8]!"
: "=r" (val), "+r" (addr));
GUEST_ASSERT_EQ(val, 0);
GUEST_ASSERT_EQ(addr, TEST_GVA);
}
static void guest_st_preidx(void)
{
uint64_t val = TEST_DATA;
uint64_t addr = TEST_GVA - 8;
asm volatile("str %0, [%1, #8]!"
: "+r" (val), "+r" (addr));
GUEST_ASSERT_EQ(addr, TEST_GVA);
val = READ_ONCE(*guest_test_memory);
}
static bool guest_set_ha(void)
{
uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1);
uint64_t hadbs, tcr;
/* Skip if HA is not supported. */
hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS), mmfr1);
if (hadbs == 0)
return false;
tcr = read_sysreg(tcr_el1) | TCR_EL1_HA;
write_sysreg(tcr, tcr_el1);
isb();
return true;
}
static bool guest_clear_pte_af(void)
{
*((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF;
flush_tlb_page(TEST_GVA);
return true;
}
static void guest_check_pte_af(void)
{
dsb(ish);
GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF);
}
static void guest_check_write_in_dirty_log(void)
{
GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG);
}
static void guest_check_no_write_in_dirty_log(void)
{
GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG);
}
static void guest_check_s1ptw_wr_in_dirty_log(void)
{
GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG);
}
static void guest_exec(void)
{
int (*code)(void) = (int (*)(void))TEST_EXEC_GVA;
int ret;
ret = code();
GUEST_ASSERT_EQ(ret, 0x77);
}
static bool guest_prepare(struct test_desc *test)
{
bool (*prepare_fn)(void);
int i;
for (i = 0; i < PREPARE_FN_NR; i++) {
prepare_fn = test->guest_prepare[i];
if (prepare_fn && !prepare_fn())
return false;
}
return true;
}
static void guest_test_check(struct test_desc *test)
{
void (*check_fn)(void);
int i;
for (i = 0; i < CHECK_FN_NR; i++) {
check_fn = test->guest_test_check[i];
if (check_fn)
check_fn();
}
}
static void guest_code(struct test_desc *test)
{
if (!guest_prepare(test))
GUEST_SYNC(CMD_SKIP_TEST);
GUEST_SYNC(test->mem_mark_cmd);
if (test->guest_test)
test->guest_test();
guest_test_check(test);
GUEST_DONE();
}
static void no_dabt_handler(struct ex_regs *regs)
{
GUEST_ASSERT_1(false, read_sysreg(far_el1));
}
static void no_iabt_handler(struct ex_regs *regs)
{
GUEST_ASSERT_1(false, regs->pc);
}
static struct uffd_args {
char *copy;
void *hva;
uint64_t paging_size;
} pt_args, data_args;
/* Returns 0 on success, or -1 (with an error logged) if UFFDIO_COPY fails. */
static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg,
struct uffd_args *args, bool expect_write)
{
uint64_t addr = msg->arg.pagefault.address;
uint64_t flags = msg->arg.pagefault.flags;
struct uffdio_copy copy;
int ret;
TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING,
"The only expected UFFD mode is MISSING");
ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write);
ASSERT_EQ(addr, (uint64_t)args->hva);
pr_debug("uffd fault: addr=%p write=%d\n",
(void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE));
copy.src = (uint64_t)args->copy;
copy.dst = addr;
copy.len = args->paging_size;
copy.mode = 0;
ret = ioctl(uffd, UFFDIO_COPY, &copy);
if (ret == -1) {
pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n",
addr, errno);
return ret;
}
pthread_mutex_lock(&events.uffd_faults_mutex);
events.uffd_faults += 1;
pthread_mutex_unlock(&events.uffd_faults_mutex);
return 0;
}
static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg)
{
return uffd_generic_handler(mode, uffd, msg, &pt_args, true);
}
static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg)
{
return uffd_generic_handler(mode, uffd, msg, &data_args, true);
}
static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg)
{
return uffd_generic_handler(mode, uffd, msg, &data_args, false);
}
static void setup_uffd_args(struct userspace_mem_region *region,
struct uffd_args *args)
{
args->hva = (void *)region->region.userspace_addr;
args->paging_size = region->region.memory_size;
args->copy = malloc(args->paging_size);
TEST_ASSERT(args->copy, "Failed to allocate data copy.");
memcpy(args->copy, args->hva, args->paging_size);
}
static void setup_uffd(struct kvm_vm *vm, struct test_params *p,
struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd)
{
struct test_desc *test = p->test_desc;
int uffd_mode = UFFDIO_REGISTER_MODE_MISSING;
setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args);
setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args);
*pt_uffd = NULL;
if (test->uffd_pt_handler)
*pt_uffd = uffd_setup_demand_paging(uffd_mode, 0,
pt_args.hva,
pt_args.paging_size,
test->uffd_pt_handler);
*data_uffd = NULL;
if (test->uffd_data_handler)
*data_uffd = uffd_setup_demand_paging(uffd_mode, 0,
data_args.hva,
data_args.paging_size,
test->uffd_data_handler);
}
static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd,
struct uffd_desc *data_uffd)
{
if (test->uffd_pt_handler)
uffd_stop_demand_paging(pt_uffd);
if (test->uffd_data_handler)
uffd_stop_demand_paging(data_uffd);
free(pt_args.copy);
free(data_args.copy);
}
static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg)
{
TEST_FAIL("There was no UFFD fault expected.");
return -1;
}
/* Returns true so the test continues; any failure is a fatal TEST_ASSERT. */
static bool punch_hole_in_backing_store(struct kvm_vm *vm,
struct userspace_mem_region *region)
{
void *hva = (void *)region->region.userspace_addr;
uint64_t paging_size = region->region.memory_size;
int ret, fd = region->fd;
if (fd != -1) {
ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
0, paging_size);
TEST_ASSERT(ret == 0, "fallocate failed\n");
} else {
ret = madvise(hva, paging_size, MADV_DONTNEED);
TEST_ASSERT(ret == 0, "madvise failed\n");
}
return true;
}
static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run)
{
struct userspace_mem_region *region;
void *hva;
region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
hva = (void *)region->region.userspace_addr;
ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr);
memcpy(hva, run->mmio.data, run->mmio.len);
events.mmio_exits += 1;
}
static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run)
{
uint64_t data;
memcpy(&data, run->mmio.data, sizeof(data));
pr_debug("addr=%lld len=%d w=%d data=%lx\n",
run->mmio.phys_addr, run->mmio.len,
run->mmio.is_write, data);
TEST_FAIL("There was no MMIO exit expected.");
}
static bool check_write_in_dirty_log(struct kvm_vm *vm,
struct userspace_mem_region *region,
uint64_t host_pg_nr)
{
unsigned long *bmap;
bool first_page_dirty;
uint64_t size = region->region.memory_size;
/* getpagesize() is not always equal to vm->page_size */
bmap = bitmap_zalloc(size / getpagesize());
kvm_vm_get_dirty_log(vm, region->region.slot, bmap);
first_page_dirty = test_bit(host_pg_nr, bmap);
free(bmap);
return first_page_dirty;
}
/* Returns true to continue the test, and false if it should be skipped. */
static bool handle_cmd(struct kvm_vm *vm, int cmd)
{
struct userspace_mem_region *data_region, *pt_region;
bool continue_test = true;
data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
pt_region = vm_get_mem_region(vm, MEM_REGION_PT);
if (cmd == CMD_SKIP_TEST)
continue_test = false;
if (cmd & CMD_HOLE_PT)
continue_test = punch_hole_in_backing_store(vm, pt_region);
if (cmd & CMD_HOLE_DATA)
continue_test = punch_hole_in_backing_store(vm, data_region);
if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG)
TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0),
"Missing write in dirty log");
if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG)
TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0),
"Missing s1ptw write in dirty log");
if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG)
TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0),
"Unexpected write in dirty log");
if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG)
TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0),
"Unexpected s1ptw write in dirty log");
return continue_test;
}
void fail_vcpu_run_no_handler(int ret)
{
TEST_FAIL("Unexpected vcpu run failure\n");
}
void fail_vcpu_run_mmio_no_syndrome_handler(int ret)
{
TEST_ASSERT(errno == ENOSYS,
"The mmio handler should have returned not implemented.");
events.fail_vcpu_runs += 1;
}
typedef uint32_t aarch64_insn_t;
extern aarch64_insn_t __exec_test[2];
noinline void __return_0x77(void)
{
asm volatile("__exec_test: mov x0, #0x77\n"
"ret\n");
}
/*
* Note that this function runs on the host before the test VM starts: there's
* no need to sync the D$ and I$ caches.
*/
static void load_exec_code_for_test(struct kvm_vm *vm)
{
uint64_t *code;
struct userspace_mem_region *region;
void *hva;
region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
hva = (void *)region->region.userspace_addr;
assert(TEST_EXEC_GVA > TEST_GVA);
code = hva + TEST_EXEC_GVA - TEST_GVA;
memcpy(code, __exec_test, sizeof(__exec_test));
}
static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
struct test_desc *test)
{
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vcpu);
vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
ESR_EC_DABT, no_dabt_handler);
vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
ESR_EC_IABT, no_iabt_handler);
}
static void setup_gva_maps(struct kvm_vm *vm)
{
struct userspace_mem_region *region;
uint64_t pte_gpa;
region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
/* Map TEST_GVA first. This will install a new PTE. */
virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr);
/* Then map TEST_PTE_GVA to the above PTE. */
pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA));
virt_pg_map(vm, TEST_PTE_GVA, pte_gpa);
}
enum pf_test_memslots {
CODE_AND_DATA_MEMSLOT,
PAGE_TABLE_MEMSLOT,
TEST_DATA_MEMSLOT,
};
/*
* Create a memslot for code and data at pfn=0, and test-data and PT ones
* at max_gfn.
*/
static void setup_memslots(struct kvm_vm *vm, struct test_params *p)
{
uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type);
uint64_t guest_page_size = vm->page_size;
uint64_t max_gfn = vm_compute_max_gfn(vm);
/* Enough for 2M of code when using 4K guest pages. */
uint64_t code_npages = 512;
uint64_t pt_size, data_size, data_gpa;
/*
* This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using
* VM_MODE_P48V48_4K. Note that the .text takes ~1.6MB. That's 13
* pages. VM_MODE_P48V48_4K is the mode with the most PT pages; let's use
* twice that just in case.
*/
pt_size = 26 * guest_page_size;
/* Memslot sizes and GPAs must be aligned to the backing page size */
pt_size = align_up(pt_size, backing_src_pagesz);
data_size = align_up(guest_page_size, backing_src_pagesz);
data_gpa = (max_gfn * guest_page_size) - data_size;
data_gpa = align_down(data_gpa, backing_src_pagesz);
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0,
CODE_AND_DATA_MEMSLOT, code_npages, 0);
vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT;
vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT;
vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size,
PAGE_TABLE_MEMSLOT, pt_size / guest_page_size,
p->test_desc->pt_memslot_flags);
vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT;
vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT,
data_size / guest_page_size,
p->test_desc->data_memslot_flags);
vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT;
}
static void setup_default_handlers(struct test_desc *test)
{
if (!test->mmio_handler)
test->mmio_handler = mmio_no_handler;
if (!test->fail_vcpu_run_handler)
test->fail_vcpu_run_handler = fail_vcpu_run_no_handler;
}
static void check_event_counts(struct test_desc *test)
{
ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults);
ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits);
ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs);
}
static void print_test_banner(enum vm_guest_mode mode, struct test_params *p)
{
struct test_desc *test = p->test_desc;
pr_debug("Test: %s\n", test->name);
pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode));
pr_debug("Testing memory backing src type: %s\n",
vm_mem_backing_src_alias(p->src_type)->name);
}
static void reset_event_counts(void)
{
memset(&events, 0, sizeof(events));
}
/*
* This function either succeeds, skips the test (after setting test->skip), or
* fails with a TEST_FAIL that aborts all tests.
*/
static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
struct test_desc *test)
{
struct kvm_run *run;
struct ucall uc;
int ret;
run = vcpu->run;
for (;;) {
ret = _vcpu_run(vcpu);
if (ret) {
test->fail_vcpu_run_handler(ret);
goto done;
}
switch (get_ucall(vcpu, &uc)) {
case UCALL_SYNC:
if (!handle_cmd(vm, uc.args[1])) {
test->skip = true;
goto done;
}
break;
case UCALL_ABORT:
REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx");
break;
case UCALL_DONE:
goto done;
case UCALL_NONE:
if (run->exit_reason == KVM_EXIT_MMIO)
test->mmio_handler(vm, run);
break;
default:
TEST_FAIL("Unknown ucall %lu", uc.cmd);
}
}
done:
pr_debug(test->skip ? "Skipped.\n" : "Done.\n");
}
static void run_test(enum vm_guest_mode mode, void *arg)
{
struct test_params *p = (struct test_params *)arg;
struct test_desc *test = p->test_desc;
struct kvm_vm *vm;
struct kvm_vcpu *vcpu;
struct uffd_desc *pt_uffd, *data_uffd;
print_test_banner(mode, p);
vm = ____vm_create(mode);
setup_memslots(vm, p);
kvm_vm_elf_load(vm, program_invocation_name);
vcpu = vm_vcpu_add(vm, 0, guest_code);
setup_gva_maps(vm);
ucall_init(vm, NULL);
reset_event_counts();
/*
* Set some code in the data memslot for the guest to execute (only
* applicable to the EXEC tests). This has to be done before
* setup_uffd() as that function copies the memslot data for the uffd
* handler.
*/
load_exec_code_for_test(vm);
setup_uffd(vm, p, &pt_uffd, &data_uffd);
setup_abort_handlers(vm, vcpu, test);
setup_default_handlers(test);
vcpu_args_set(vcpu, 1, test);
vcpu_run_loop(vm, vcpu, test);
ucall_uninit(vm);
kvm_vm_free(vm);
free_uffd(test, pt_uffd, data_uffd);
/*
* Make sure we check the events after the uffd threads have exited,
* which means they updated their respective event counters.
*/
if (!test->skip)
check_event_counts(test);
}
static void help(char *name)
{
puts("");
printf("usage: %s [-h] [-s mem-type]\n", name);
puts("");
guest_modes_help();
backing_src_help("-s");
puts("");
}
#define SNAME(s) #s
#define SCAT2(a, b) SNAME(a ## _ ## b)
#define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c))
#define SCAT4(a, b, c, d) SCAT2(a, SCAT3(b, c, d))
#define _CHECK(_test) _CHECK_##_test
#define _PREPARE(_test) _PREPARE_##_test
#define _PREPARE_guest_read64 NULL
#define _PREPARE_guest_ld_preidx NULL
#define _PREPARE_guest_write64 NULL
#define _PREPARE_guest_st_preidx NULL
#define _PREPARE_guest_exec NULL
#define _PREPARE_guest_at NULL
#define _PREPARE_guest_dc_zva guest_check_dc_zva
#define _PREPARE_guest_cas guest_check_lse
/* With or without access flag checks */
#define _PREPARE_with_af guest_set_ha, guest_clear_pte_af
#define _PREPARE_no_af NULL
#define _CHECK_with_af guest_check_pte_af
#define _CHECK_no_af NULL
/* Performs an access and checks that no faults were triggered. */
#define TEST_ACCESS(_access, _with_af, _mark_cmd) \
{ \
.name = SCAT3(_access, _with_af, #_mark_cmd), \
.guest_prepare = { _PREPARE(_with_af), \
_PREPARE(_access) }, \
.mem_mark_cmd = _mark_cmd, \
.guest_test = _access, \
.guest_test_check = { _CHECK(_with_af) }, \
.expected_events = { 0 }, \
}
#define TEST_UFFD(_access, _with_af, _mark_cmd, \
_uffd_data_handler, _uffd_pt_handler, _uffd_faults) \
{ \
.name = SCAT4(uffd, _access, _with_af, #_mark_cmd), \
.guest_prepare = { _PREPARE(_with_af), \
_PREPARE(_access) }, \
.guest_test = _access, \
.mem_mark_cmd = _mark_cmd, \
.guest_test_check = { _CHECK(_with_af) }, \
.uffd_data_handler = _uffd_data_handler, \
.uffd_pt_handler = _uffd_pt_handler, \
.expected_events = { .uffd_faults = _uffd_faults, }, \
}
#define TEST_DIRTY_LOG(_access, _with_af, _test_check) \
{ \
.name = SCAT3(dirty_log, _access, _with_af), \
.data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
.pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
.guest_prepare = { _PREPARE(_with_af), \
_PREPARE(_access) }, \
.guest_test = _access, \
.guest_test_check = { _CHECK(_with_af), _test_check, \
guest_check_s1ptw_wr_in_dirty_log}, \
.expected_events = { 0 }, \
}
#define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \
_uffd_faults, _test_check) \
{ \
.name = SCAT3(uffd_and_dirty_log, _access, _with_af), \
.data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
.pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
.guest_prepare = { _PREPARE(_with_af), \
_PREPARE(_access) }, \
.guest_test = _access, \
.mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \
.guest_test_check = { _CHECK(_with_af), _test_check }, \
.uffd_data_handler = _uffd_data_handler, \
.uffd_pt_handler = uffd_pt_write_handler, \
.expected_events = { .uffd_faults = _uffd_faults, }, \
}
#define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \
{ \
.name = SCAT2(ro_memslot, _access), \
.data_memslot_flags = KVM_MEM_READONLY, \
.guest_prepare = { _PREPARE(_access) }, \
.guest_test = _access, \
.mmio_handler = _mmio_handler, \
.expected_events = { .mmio_exits = _mmio_exits }, \
}
#define TEST_RO_MEMSLOT_NO_SYNDROME(_access) \
{ \
.name = SCAT2(ro_memslot_no_syndrome, _access), \
.data_memslot_flags = KVM_MEM_READONLY, \
.guest_test = _access, \
.fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \
.expected_events = { .fail_vcpu_runs = 1 }, \
}
#define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \
_test_check) \
{ \
.name = SCAT2(ro_memslot_and_dlog, _access), \
.data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \
.pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
.guest_prepare = { _PREPARE(_access) }, \
.guest_test = _access, \
.guest_test_check = { _test_check }, \
.mmio_handler = _mmio_handler, \
.expected_events = { .mmio_exits = _mmio_exits}, \
}
#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check) \
{ \
.name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \
.data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \
.pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \
.guest_test = _access, \
.guest_test_check = { _test_check }, \
.fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \
.expected_events = { .fail_vcpu_runs = 1 }, \
}
#define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits, \
_uffd_data_handler, _uffd_faults) \
{ \
.name = SCAT2(ro_memslot_uffd, _access), \
.data_memslot_flags = KVM_MEM_READONLY, \
.mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \
.guest_prepare = { _PREPARE(_access) }, \
.guest_test = _access, \
.uffd_data_handler = _uffd_data_handler, \
.uffd_pt_handler = uffd_pt_write_handler, \
.mmio_handler = _mmio_handler, \
.expected_events = { .mmio_exits = _mmio_exits, \
.uffd_faults = _uffd_faults }, \
}
#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler, \
_uffd_faults) \
{ \
.name = SCAT2(ro_memslot_no_syn_and_uffd, _access), \
.data_memslot_flags = KVM_MEM_READONLY, \
.mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \
.guest_test = _access, \
.uffd_data_handler = _uffd_data_handler, \
.uffd_pt_handler = uffd_pt_write_handler, \
.fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \
.expected_events = { .fail_vcpu_runs = 1, \
.uffd_faults = _uffd_faults }, \
}
static struct test_desc tests[] = {
/* Check that HW is setting the Access Flag (AF) (sanity checks). */
TEST_ACCESS(guest_read64, with_af, CMD_NONE),
TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE),
TEST_ACCESS(guest_cas, with_af, CMD_NONE),
TEST_ACCESS(guest_write64, with_af, CMD_NONE),
TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE),
TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE),
TEST_ACCESS(guest_exec, with_af, CMD_NONE),
/*
* Punch a hole in the data backing store, and then try multiple
* accesses: reads should return zeroes, and writes should
* re-populate the page. Moreover, the test also checks that no
* exception was generated in the guest. Note that this
* reading/writing behavior is the same as reading/writing a
* punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from
* userspace.
*/
TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA),
TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA),
TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA),
TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA),
TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA),
TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA),
TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA),
/*
* Punch holes in the data and PT backing stores and mark them for
* userfaultfd handling. This should result in 2 faults: the access
* on the data backing store, and its respective S1 page table walk
* (S1PTW).
*/
TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_read_handler, uffd_pt_write_handler, 2),
/* no_af should also lead to a PT write. */
TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_read_handler, uffd_pt_write_handler, 2),
/* Note that cas invokes the read handler. */
TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_read_handler, uffd_pt_write_handler, 2),
/*
* Can't test guest_at with_af as it's IMPDEF whether the AF is set.
* The S1PTW fault should still be marked as a write.
*/
TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_read_handler, uffd_pt_write_handler, 1),
TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_read_handler, uffd_pt_write_handler, 2),
TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_write_handler, uffd_pt_write_handler, 2),
TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_write_handler, uffd_pt_write_handler, 2),
TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_write_handler, uffd_pt_write_handler, 2),
TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
uffd_data_read_handler, uffd_pt_write_handler, 2),
/*
* Try accesses when the data and PT memory regions are both
* tracked for dirty logging.
*/
TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log),
/* no_af should also lead to a PT write. */
TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log),
TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log),
TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log),
TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log),
TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log),
TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log),
TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log),
TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log),
/*
* Access when the data and PT memory regions are both marked for
* dirty logging and UFFD at the same time. The expected result is
* that writes should mark the dirty log and trigger a userfaultfd
* write fault. Reads/execs should result in a read userfaultfd
* fault, and nothing in the dirty log. Any S1PTW should result in
* a write in the dirty log and a userfaultfd write.
*/
TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2,
guest_check_no_write_in_dirty_log),
/* no_af should also lead to a PT write. */
TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2,
guest_check_no_write_in_dirty_log),
TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler,
2, guest_check_no_write_in_dirty_log),
TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1,
guest_check_no_write_in_dirty_log),
TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2,
guest_check_no_write_in_dirty_log),
TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler,
2, guest_check_write_in_dirty_log),
TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2,
guest_check_write_in_dirty_log),
TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler,
2, guest_check_write_in_dirty_log),
TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af,
uffd_data_write_handler, 2,
guest_check_write_in_dirty_log),
/*
* Try accesses when the data memory region is marked read-only
* (with KVM_MEM_READONLY). Writes with a syndrome result in an
* MMIO exit, writes with no syndrome (e.g., CAS) result in a
* failed vcpu run, and reads/execs with and without syndromes do
* not fault.
*/
TEST_RO_MEMSLOT(guest_read64, 0, 0),
TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0),
TEST_RO_MEMSLOT(guest_at, 0, 0),
TEST_RO_MEMSLOT(guest_exec, 0, 0),
TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1),
TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva),
TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas),
TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx),
/*
* Access when the data region is both read-only and marked
* for dirty logging at the same time. The expected result is that
* for writes there should be no write in the dirty log. The
* readonly handling is the same as if the memslot was not marked
* for dirty logging: writes with a syndrome result in an MMIO
* exit, and writes with no syndrome result in a failed vcpu run.
*/
TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0,
guest_check_no_write_in_dirty_log),
TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0,
guest_check_no_write_in_dirty_log),
TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0,
guest_check_no_write_in_dirty_log),
TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0,
guest_check_no_write_in_dirty_log),
TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler,
1, guest_check_no_write_in_dirty_log),
TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva,
guest_check_no_write_in_dirty_log),
TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas,
guest_check_no_write_in_dirty_log),
TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx,
guest_check_no_write_in_dirty_log),
/*
* Access when the data region is both read-only and punched with
* holes tracked with userfaultfd. The expected result is the
* union of both userfaultfd and read-only behaviors. For example,
* write accesses result in a userfaultfd write fault and an MMIO
* exit. Writes with no syndrome result in a failed vcpu run and
* no userfaultfd write fault. Reads result in userfaultfd getting
* triggered.
*/
TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0,
uffd_data_read_handler, 2),
TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0,
uffd_data_read_handler, 2),
TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0,
uffd_no_handler, 1),
TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0,
uffd_data_read_handler, 2),
TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1,
uffd_data_write_handler, 2),
TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas,
uffd_data_read_handler, 2),
TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva,
uffd_no_handler, 1),
TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx,
uffd_no_handler, 1),
{ 0 }
};
static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type)
{
struct test_desc *t;
for (t = &tests[0]; t->name; t++) {
if (t->skip)
continue;
struct test_params p = {
.src_type = src_type,
.test_desc = t,
};
for_each_guest_mode(run_test, &p);
}
}
int main(int argc, char *argv[])
{
enum vm_mem_backing_src_type src_type;
int opt;
setbuf(stdout, NULL);
src_type = DEFAULT_VM_MEM_SRC;
while ((opt = getopt(argc, argv, "hm:s:")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
break;
case 's':
src_type = parse_backing_src_type(optarg);
break;
case 'h':
default:
help(argv[0]);
exit(0);
}
}
for_each_test_and_guest_mode(src_type);
return 0;
}
......@@ -22,23 +22,13 @@
#include "test_util.h"
#include "perf_test_util.h"
#include "guest_modes.h"
#include "userfaultfd_util.h"
#ifdef __NR_userfaultfd
#ifdef PRINT_PER_PAGE_UPDATES
#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
#else
#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__)
#endif
#ifdef PRINT_PER_VCPU_UPDATES
#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__)
#else
#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
#endif
static int nr_vcpus = 1;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
static size_t demand_paging_size;
static char *guest_data_prototype;
......@@ -67,9 +57,11 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
ts_diff.tv_sec, ts_diff.tv_nsec);
}
static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
static int handle_uffd_page_request(int uffd_mode, int uffd,
struct uffd_msg *msg)
{
pid_t tid = syscall(__NR_gettid);
uint64_t addr = msg->arg.pagefault.address;
struct timespec start;
struct timespec ts_diff;
int r;
......@@ -116,174 +108,32 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
return 0;
}
bool quit_uffd_thread;
struct uffd_handler_args {
struct test_params {
int uffd_mode;
int uffd;
int pipefd;
useconds_t delay;
useconds_t uffd_delay;
enum vm_mem_backing_src_type src_type;
bool partition_vcpu_memory_access;
};
static void *uffd_handler_thread_fn(void *arg)
{
struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg;
int uffd = uffd_args->uffd;
int pipefd = uffd_args->pipefd;
useconds_t delay = uffd_args->delay;
int64_t pages = 0;
struct timespec start;
struct timespec ts_diff;
clock_gettime(CLOCK_MONOTONIC, &start);
while (!quit_uffd_thread) {
struct uffd_msg msg;
struct pollfd pollfd[2];
char tmp_chr;
int r;
uint64_t addr;
pollfd[0].fd = uffd;
pollfd[0].events = POLLIN;
pollfd[1].fd = pipefd;
pollfd[1].events = POLLIN;
r = poll(pollfd, 2, -1);
switch (r) {
case -1:
pr_info("poll err");
continue;
case 0:
continue;
case 1:
break;
default:
pr_info("Polling uffd returned %d", r);
return NULL;
}
if (pollfd[0].revents & POLLERR) {
pr_info("uffd revents has POLLERR");
return NULL;
}
if (pollfd[1].revents & POLLIN) {
r = read(pollfd[1].fd, &tmp_chr, 1);
TEST_ASSERT(r == 1,
"Error reading pipefd in UFFD thread\n");
return NULL;
}
if (!(pollfd[0].revents & POLLIN))
continue;
r = read(uffd, &msg, sizeof(msg));
if (r == -1) {
if (errno == EAGAIN)
continue;
pr_info("Read of uffd got errno %d\n", errno);
return NULL;
}
if (r != sizeof(msg)) {
pr_info("Read on uffd returned unexpected size: %d bytes", r);
return NULL;
}
if (!(msg.event & UFFD_EVENT_PAGEFAULT))
continue;
if (delay)
usleep(delay);
addr = msg.arg.pagefault.address;
r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr);
if (r < 0)
return NULL;
pages++;
}
ts_diff = timespec_elapsed(start);
PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
pages, ts_diff.tv_sec, ts_diff.tv_nsec,
pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
return NULL;
}
static void setup_demand_paging(struct kvm_vm *vm,
pthread_t *uffd_handler_thread, int pipefd,
int uffd_mode, useconds_t uffd_delay,
struct uffd_handler_args *uffd_args,
void *hva, void *alias, uint64_t len)
static void prefault_mem(void *alias, uint64_t len)
{
bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
int uffd;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
int ret;
size_t p;
PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
is_minor ? "MINOR" : "MISSING",
is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY");
/* In order to get minor faults, prefault via the alias. */
if (is_minor) {
size_t p;
expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
TEST_ASSERT(alias != NULL, "Alias required for minor faults");
for (p = 0; p < (len / demand_paging_size); ++p) {
memcpy(alias + (p * demand_paging_size),
guest_data_prototype, demand_paging_size);
}
TEST_ASSERT(alias != NULL, "Alias required for minor faults");
for (p = 0; p < (len / demand_paging_size); ++p) {
memcpy(alias + (p * demand_paging_size),
guest_data_prototype, demand_paging_size);
}
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
TEST_ASSERT(uffd >= 0, __KVM_SYSCALL_ERROR("userfaultfd()", uffd));
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
ret = ioctl(uffd, UFFDIO_API, &uffdio_api);
TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_API", ret));
uffdio_register.range.start = (uint64_t)hva;
uffdio_register.range.len = len;
uffdio_register.mode = uffd_mode;
ret = ioctl(uffd, UFFDIO_REGISTER, &uffdio_register);
TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_REGISTER", ret));
TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
expected_ioctls, "missing userfaultfd ioctls");
uffd_args->uffd_mode = uffd_mode;
uffd_args->uffd = uffd;
uffd_args->pipefd = pipefd;
uffd_args->delay = uffd_delay;
pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn,
uffd_args);
PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
hva, hva + len);
}
struct test_params {
int uffd_mode;
useconds_t uffd_delay;
enum vm_mem_backing_src_type src_type;
bool partition_vcpu_memory_access;
};
static void run_test(enum vm_guest_mode mode, void *arg)
{
struct test_params *p = arg;
pthread_t *uffd_handler_threads = NULL;
struct uffd_handler_args *uffd_args = NULL;
struct uffd_desc **uffd_descs = NULL;
struct timespec start;
struct timespec ts_diff;
int *pipefds = NULL;
struct kvm_vm *vm;
int r, i;
int i;
vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
p->src_type, p->partition_vcpu_memory_access);
......@@ -296,15 +146,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
memset(guest_data_prototype, 0xAB, demand_paging_size);
if (p->uffd_mode) {
uffd_handler_threads =
malloc(nr_vcpus * sizeof(*uffd_handler_threads));
TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
uffd_args = malloc(nr_vcpus * sizeof(*uffd_args));
TEST_ASSERT(uffd_args, "Memory allocation failed");
pipefds = malloc(sizeof(int) * nr_vcpus * 2);
TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *));
TEST_ASSERT(uffd_descs, "Memory allocation failed");
for (i = 0; i < nr_vcpus; i++) {
struct perf_test_vcpu_args *vcpu_args;
......@@ -317,19 +160,17 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa);
prefault_mem(vcpu_alias,
vcpu_args->pages * perf_test_args.guest_page_size);
/*
* Set up user fault fd to handle demand paging
* requests.
*/
r = pipe2(&pipefds[i * 2],
O_CLOEXEC | O_NONBLOCK);
TEST_ASSERT(!r, "Failed to set up pipefd");
setup_demand_paging(vm, &uffd_handler_threads[i],
pipefds[i * 2], p->uffd_mode,
p->uffd_delay, &uffd_args[i],
vcpu_hva, vcpu_alias,
vcpu_args->pages * perf_test_args.guest_page_size);
uffd_descs[i] = uffd_setup_demand_paging(
p->uffd_mode, p->uffd_delay, vcpu_hva,
vcpu_args->pages * perf_test_args.guest_page_size,
&handle_uffd_page_request);
}
}
......@@ -344,15 +185,9 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("All vCPU threads joined\n");
if (p->uffd_mode) {
char c;
/* Tell the user fault fd handler threads to quit */
for (i = 0; i < nr_vcpus; i++) {
r = write(pipefds[i * 2 + 1], &c, 1);
TEST_ASSERT(r == 1, "Unable to write to pipefd");
pthread_join(uffd_handler_threads[i], NULL);
}
for (i = 0; i < nr_vcpus; i++)
uffd_stop_demand_paging(uffd_descs[i]);
}
pr_info("Total guest execution time: %ld.%.9lds\n",
......@@ -364,11 +199,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
perf_test_destroy_vm(vm);
free(guest_data_prototype);
if (p->uffd_mode) {
free(uffd_handler_threads);
free(uffd_args);
free(pipefds);
}
if (p->uffd_mode)
free(uffd_descs);
}
static void help(char *name)
......
......@@ -38,12 +38,25 @@
* NORMAL 4 1111:1111
* NORMAL_WT 5 1011:1011
*/
#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \
(0x04ul << (1 * 8)) | \
(0x0cul << (2 * 8)) | \
(0x44ul << (3 * 8)) | \
(0xfful << (4 * 8)) | \
(0xbbul << (5 * 8)))
/* Linux doesn't use these memory types, so let's define them. */
#define MAIR_ATTR_DEVICE_GRE UL(0x0c)
#define MAIR_ATTR_NORMAL_WT UL(0xbb)
#define MT_DEVICE_nGnRnE 0
#define MT_DEVICE_nGnRE 1
#define MT_DEVICE_GRE 2
#define MT_NORMAL_NC 3
#define MT_NORMAL 4
#define MT_NORMAL_WT 5
#define DEFAULT_MAIR_EL1 \
(MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \
MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \
MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \
MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \
MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \
MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT))
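/*
* For example, MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) expands to
* (0xfful << (4 * 8)): attribute 0xff in byte 4 of MAIR_EL1, matching the
* NORMAL row of the table above and the open-coded value it replaces.
*/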
#define MPIDR_HWID_BITMASK (0xff00fffffful)
......@@ -92,11 +105,19 @@ enum {
#define ESR_EC_MASK (ESR_EC_NUM - 1)
#define ESR_EC_SVC64 0x15
#define ESR_EC_IABT 0x21
#define ESR_EC_DABT 0x25
#define ESR_EC_HW_BP_CURRENT 0x31
#define ESR_EC_SSTEP_CURRENT 0x33
#define ESR_EC_WP_CURRENT 0x35
#define ESR_EC_BRK_INS 0x3c
/* Access flag */
#define PTE_AF (1ULL << 10)
/* Access flag update enable/disable */
#define TCR_EL1_HA (1ULL << 39)
void aarch64_get_supported_page_sizes(uint32_t ipa,
bool *ps4k, bool *ps16k, bool *ps64k);
......@@ -109,6 +130,8 @@ void vm_install_exception_handler(struct kvm_vm *vm,
void vm_install_sync_handler(struct kvm_vm *vm,
int vector, int ec, handler_fn handler);
uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva);
static inline void cpu_relax(void)
{
asm volatile("yield" ::: "memory");
......
......@@ -34,6 +34,7 @@ struct userspace_mem_region {
struct sparsebit *unused_phy_pages;
int fd;
off_t offset;
enum vm_mem_backing_src_type backing_src_type;
void *host_mem;
void *host_alias;
void *mmap_start;
......@@ -64,6 +65,14 @@ struct userspace_mem_regions {
DECLARE_HASHTABLE(slot_hash, 9);
};
enum kvm_mem_region_type {
MEM_REGION_CODE,
MEM_REGION_DATA,
MEM_REGION_PT,
MEM_REGION_TEST_DATA,
NR_MEM_REGIONS,
};
struct kvm_vm {
int mode;
unsigned long type;
......@@ -92,6 +101,13 @@ struct kvm_vm {
int stats_fd;
struct kvm_stats_header stats_header;
struct kvm_stats_desc *stats_desc;
/*
* KVM region slots. These are the default memslots used by page
* allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE]
* memslot.
*/
uint32_t memslots[NR_MEM_REGIONS];
};
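/*
* For example (an illustrative sketch mirroring setup_memslots() in
* aarch64/page_fault_test.c; PT_MEMSLOT, pt_gpa, pt_size and flags are
* hypothetical), a test can redirect page-table allocations to its own
* memslot:
*
* vm_userspace_mem_region_add(vm, src_type, pt_gpa, PT_MEMSLOT,
*			       pt_size / vm->page_size, flags);
* vm->memslots[MEM_REGION_PT] = PT_MEMSLOT;
*/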
......@@ -104,6 +120,13 @@ struct kvm_vm {
struct userspace_mem_region *
memslot2region(struct kvm_vm *vm, uint32_t memslot);
static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm,
enum kvm_mem_region_type type)
{
assert(type < NR_MEM_REGIONS);
return memslot2region(vm, vm->memslots[type]);
}
/* Minimum allocated guest virtual and physical addresses */
#define KVM_UTIL_MIN_VADDR 0x2000
#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
......@@ -384,7 +407,11 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
enum kvm_mem_region_type type);
vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages);
vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm,
enum kvm_mem_region_type type);
vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm);
void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
......@@ -646,13 +673,13 @@ vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm);
* __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to
* calculate the amount of memory needed for per-vCPU data, e.g. stacks.
*/
struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages);
struct kvm_vm *____vm_create(enum vm_guest_mode mode);
struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
uint64_t nr_extra_pages);
static inline struct kvm_vm *vm_create_barebones(void)
{
return ____vm_create(VM_MODE_DEFAULT, 0);
return ____vm_create(VM_MODE_DEFAULT);
}
static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus)
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* KVM userfaultfd util
*
* Copyright (C) 2018, Red Hat, Inc.
* Copyright (C) 2019-2022 Google LLC
*/
#define _GNU_SOURCE /* for pipe2 */
#include <inttypes.h>
#include <time.h>
#include <pthread.h>
#include <linux/userfaultfd.h>
#include "test_util.h"
typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg);
struct uffd_desc {
int uffd_mode;
int uffd;
int pipefds[2];
useconds_t delay;
uffd_handler_t handler;
pthread_t thread;
};
struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
void *hva, uint64_t len,
uffd_handler_t handler);
void uffd_stop_demand_paging(struct uffd_desc *uffd);
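/*
* A brief illustrative usage sketch (hva, len and my_handler are hypothetical:
* a host mapping backing a memslot and a MISSING-mode uffd_handler_t):
*
* struct uffd_desc *desc;
*
* desc = uffd_setup_demand_paging(UFFDIO_REGISTER_MODE_MISSING, 0,
*				   hva, len, my_handler);
* ... run vCPUs; faults are resolved by my_handler on a dedicated thread ...
* uffd_stop_demand_paging(desc);
*/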
#ifdef PRINT_PER_PAGE_UPDATES
#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
#else
#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__)
#endif
#ifdef PRINT_PER_VCPU_UPDATES
#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__)
#else
#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
#endif
......@@ -77,13 +77,15 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm)
void virt_arch_pgd_alloc(struct kvm_vm *vm)
{
if (!vm->pgd_created) {
vm_paddr_t paddr = vm_phy_pages_alloc(vm,
page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size,
KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
vm->pgd = paddr;
vm->pgd_created = true;
}
size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size;
if (vm->pgd_created)
return;
vm->pgd = vm_phy_pages_alloc(vm, nr_pages,
KVM_GUEST_PAGE_TABLE_MIN_PADDR,
vm->memslots[MEM_REGION_PT]);
vm->pgd_created = true;
}
static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
......@@ -134,12 +136,12 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
{
uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */
uint64_t attr_idx = MT_NORMAL;
_virt_pg_map(vm, vaddr, paddr, attr_idx);
}
vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva)
{
uint64_t *ptep;
......@@ -170,11 +172,18 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
TEST_FAIL("Page table levels must be 2, 3, or 4");
}
return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
return ptep;
unmapped_gva:
TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
exit(1);
exit(EXIT_FAILURE);
}
vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
{
uint64_t *ptep = virt_get_pte_hva(vm, gva);
return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
}
static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level)
......@@ -319,13 +328,16 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)
struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_vcpu_init *init, void *guest_code)
{
size_t stack_size = vm->page_size == 4096 ?
DEFAULT_STACK_PGS * vm->page_size :
vm->page_size;
uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size,
DEFAULT_ARM64_GUEST_STACK_VADDR_MIN);
size_t stack_size;
uint64_t stack_vaddr;
struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id);
stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size :
vm->page_size;
stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
DEFAULT_ARM64_GUEST_STACK_VADDR_MIN,
MEM_REGION_DATA);
aarch64_vcpu_setup(vcpu, init);
vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
......@@ -429,8 +441,8 @@ void route_exception(struct ex_regs *regs, int vector)
void vm_init_descriptor_tables(struct kvm_vm *vm)
{
vm->handlers = vm_vaddr_alloc(vm, sizeof(struct handlers),
vm->page_size);
vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers),
vm->page_size, MEM_REGION_DATA);
*(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
}
......
......@@ -161,7 +161,8 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename)
seg_vend |= vm->page_size - 1;
size_t seg_size = seg_vend - seg_vstart + 1;
vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart);
vm_vaddr_t vaddr = __vm_vaddr_alloc(vm, seg_size, seg_vstart,
MEM_REGION_CODE);
TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate "
"virtual memory for segment at requested min addr,\n"
" segment idx: %u\n"
......
......@@ -185,13 +185,10 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
"Missing new mode params?");
struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages)
struct kvm_vm *____vm_create(enum vm_guest_mode mode)
{
struct kvm_vm *vm;
pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
vm_guest_mode_string(mode), nr_pages);
vm = calloc(1, sizeof(*vm));
TEST_ASSERT(vm != NULL, "Insufficient Memory");
......@@ -287,9 +284,6 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages)
/* Allocate and setup memory for guest. */
vm->vpages_mapped = sparsebit_alloc();
if (nr_pages != 0)
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
0, 0, nr_pages, 0);
return vm;
}
......@@ -335,8 +329,16 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
nr_extra_pages);
struct kvm_vm *vm;
int i;
pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
vm_guest_mode_string(mode), nr_pages);
vm = ____vm_create(mode, nr_pages);
vm = ____vm_create(mode);
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
for (i = 0; i < NR_MEM_REGIONS; i++)
vm->memslots[i] = 0;
kvm_vm_elf_load(vm, program_invocation_name);
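/*
 * Illustrative sketch only (not from this diff): every memslots[] entry
 * defaults to slot 0, so a test that does its own setup can add a dedicated
 * slot and point a region type at it, making later allocations of that type
 * land there.  The slot number 1, the 0x10000000 GPA and the 512-page size
 * below are made-up example values.
 */
static void use_dedicated_pt_memslot(struct kvm_vm *vm)
{
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0x10000000, 1, 512, 0);
	vm->memslots[MEM_REGION_PT] = 1;
}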
@@ -586,6 +588,12 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
sparsebit_free(&region->unused_phy_pages);
ret = munmap(region->mmap_start, region->mmap_size);
TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
if (region->fd >= 0) {
/* There's an extra map when using shared memory. */
ret = munmap(region->mmap_alias, region->mmap_size);
TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
close(region->fd);
}
free(region);
}
@@ -923,6 +931,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
vm_mem_backing_src_alias(src_type)->name);
}
region->backing_src_type = src_type;
region->unused_phy_pages = sparsebit_alloc();
sparsebit_set_num(region->unused_phy_pages,
guest_paddr >> vm->page_shift, npages);
@@ -1217,32 +1226,15 @@ static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
return pgidx_start * vm->page_size;
}
/*
* VM Virtual Address Allocate
*
* Input Args:
* vm - Virtual Machine
* sz - Size in bytes
* vaddr_min - Minimum starting virtual address
*
* Output Args: None
*
* Return:
* Starting guest virtual address
*
* Allocates at least sz bytes within the virtual address space of the vm
* given by vm. The allocated bytes are mapped to a virtual address >=
* the address given by vaddr_min. Note that each allocation uses
* a unique set of pages, with the minimum real allocation being at least
* a page.
*/
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
enum kvm_mem_region_type type)
{
uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
virt_pgd_alloc(vm);
vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages,
KVM_UTIL_MIN_PFN * vm->page_size, 0);
KVM_UTIL_MIN_PFN * vm->page_size,
vm->memslots[type]);
/*
* Find an unused range of virtual page addresses of at least
@@ -1263,6 +1255,30 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
return vaddr_start;
}
/*
* VM Virtual Address Allocate
*
* Input Args:
* vm - Virtual Machine
* sz - Size in bytes
* vaddr_min - Minimum starting virtual address
*
* Output Args: None
*
* Return:
* Starting guest virtual address
*
* Allocates at least sz bytes within the virtual address space of the vm
* given by vm. The allocated bytes are mapped to a virtual address >=
* the address given by vaddr_min. Note that each allocation uses
* a unique set of pages, with the minimum real allocation being at least
* a page. The allocated physical space comes from the TEST_DATA memory region.
*/
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
{
return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA);
}
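/*
 * Illustrative use only (not part of this diff): allocate one guest page via
 * the TEST_DATA-backed wrapper and fill it from the host side.
 * KVM_UTIL_MIN_VADDR is the library's usual minimum-address hint.
 */
static vm_vaddr_t alloc_poisoned_page(struct kvm_vm *vm)
{
	vm_vaddr_t gva = vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR);

	memset(addr_gva2hva(vm, gva), 0xaa, getpagesize());
	return gva;
}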
/*
* VM Virtual Address Allocate Pages
*
@@ -1282,6 +1298,11 @@ vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages)
return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR);
}
vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type)
{
return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type);
}
/*
* VM Virtual Address Allocate Page
*
@@ -1847,7 +1868,8 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
{
return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
vm->memslots[MEM_REGION_PT]);
}
/*
@@ -55,13 +55,15 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level)
void virt_arch_pgd_alloc(struct kvm_vm *vm)
{
if (!vm->pgd_created) {
vm_paddr_t paddr = vm_phy_pages_alloc(vm,
page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size,
KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
vm->pgd = paddr;
vm->pgd_created = true;
}
size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size;
if (vm->pgd_created)
return;
vm->pgd = vm_phy_pages_alloc(vm, nr_pages,
KVM_GUEST_PAGE_TABLE_MIN_PADDR,
vm->memslots[MEM_REGION_PT]);
vm->pgd_created = true;
}
void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -279,15 +281,18 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
void *guest_code)
{
int r;
size_t stack_size = vm->page_size == 4096 ?
DEFAULT_STACK_PGS * vm->page_size :
vm->page_size;
unsigned long stack_vaddr = vm_vaddr_alloc(vm, stack_size,
DEFAULT_RISCV_GUEST_STACK_VADDR_MIN);
size_t stack_size;
unsigned long stack_vaddr;
unsigned long current_gp = 0;
struct kvm_mp_state mps;
struct kvm_vcpu *vcpu;
stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size :
vm->page_size;
stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
DEFAULT_RISCV_GUEST_STACK_VADDR_MIN,
MEM_REGION_DATA);
vcpu = __vm_vcpu_add(vm, vcpu_id);
riscv_vcpu_mmu_setup(vcpu);
@@ -21,7 +21,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
return;
paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION,
KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
KVM_GUEST_PAGE_TABLE_MIN_PADDR,
vm->memslots[MEM_REGION_PT]);
memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size);
vm->pgd = paddr;
@@ -167,8 +168,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
vm->page_size);
stack_vaddr = vm_vaddr_alloc(vm, stack_size,
DEFAULT_GUEST_STACK_VADDR_MIN);
stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
DEFAULT_GUEST_STACK_VADDR_MIN,
MEM_REGION_DATA);
vcpu = __vm_vcpu_add(vm, vcpu_id);
// SPDX-License-Identifier: GPL-2.0
/*
* KVM userfaultfd util
* Adapted from demand_paging_test.c
*
* Copyright (C) 2018, Red Hat, Inc.
* Copyright (C) 2019-2022 Google LLC
*/
#define _GNU_SOURCE /* for pipe2 */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <poll.h>
#include <pthread.h>
#include <linux/userfaultfd.h>
#include <sys/syscall.h>
#include "kvm_util.h"
#include "test_util.h"
#include "perf_test_util.h"
#include "userfaultfd_util.h"
#ifdef __NR_userfaultfd
static void *uffd_handler_thread_fn(void *arg)
{
struct uffd_desc *uffd_desc = (struct uffd_desc *)arg;
int uffd = uffd_desc->uffd;
int pipefd = uffd_desc->pipefds[0];
useconds_t delay = uffd_desc->delay;
int64_t pages = 0;
struct timespec start;
struct timespec ts_diff;
clock_gettime(CLOCK_MONOTONIC, &start);
while (1) {
struct uffd_msg msg;
struct pollfd pollfd[2];
char tmp_chr;
int r;
pollfd[0].fd = uffd;
pollfd[0].events = POLLIN;
pollfd[1].fd = pipefd;
pollfd[1].events = POLLIN;
r = poll(pollfd, 2, -1);
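/*
 * poll() returns the number of fds with pending events: exactly one is the
 * expected case, -1 is a transient error that is retried, 0 should not
 * happen with an infinite timeout, and both fds firing at once makes the
 * handler thread exit.
 */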
switch (r) {
case -1:
pr_info("poll err");
continue;
case 0:
continue;
case 1:
break;
default:
pr_info("Polling uffd returned %d", r);
return NULL;
}
if (pollfd[0].revents & POLLERR) {
pr_info("uffd revents has POLLERR");
return NULL;
}
if (pollfd[1].revents & POLLIN) {
r = read(pollfd[1].fd, &tmp_chr, 1);
TEST_ASSERT(r == 1,
"Error reading pipefd in UFFD thread\n");
return NULL;
}
if (!(pollfd[0].revents & POLLIN))
continue;
r = read(uffd, &msg, sizeof(msg));
if (r == -1) {
if (errno == EAGAIN)
continue;
pr_info("Read of uffd got errno %d\n", errno);
return NULL;
}
if (r != sizeof(msg)) {
pr_info("Read on uffd returned unexpected size: %d bytes", r);
return NULL;
}
if (!(msg.event & UFFD_EVENT_PAGEFAULT))
continue;
if (delay)
usleep(delay);
r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg);
if (r < 0)
return NULL;
pages++;
}
ts_diff = timespec_elapsed(start);
PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
pages, ts_diff.tv_sec, ts_diff.tv_nsec,
pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
return NULL;
}
struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
void *hva, uint64_t len,
uffd_handler_t handler)
{
struct uffd_desc *uffd_desc;
bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
int uffd;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
int ret;
PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
is_minor ? "MINOR" : "MISSING",
is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY");
uffd_desc = malloc(sizeof(struct uffd_desc));
TEST_ASSERT(uffd_desc, "malloc failed");
/* In order to get minor faults, prefault via the alias. */
if (is_minor)
expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
"ioctl UFFDIO_API failed: %" PRIu64,
(uint64_t)uffdio_api.api);
uffdio_register.range.start = (uint64_t)hva;
uffdio_register.range.len = len;
uffdio_register.mode = uffd_mode;
TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1,
"ioctl UFFDIO_REGISTER failed");
TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
expected_ioctls, "missing userfaultfd ioctls");
ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK);
TEST_ASSERT(!ret, "Failed to set up pipefd");
uffd_desc->uffd_mode = uffd_mode;
uffd_desc->uffd = uffd;
uffd_desc->delay = delay;
uffd_desc->handler = handler;
pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn,
uffd_desc);
PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
hva, hva + len);
return uffd_desc;
}
void uffd_stop_demand_paging(struct uffd_desc *uffd)
{
char c = 0;
int ret;
ret = write(uffd->pipefds[1], &c, 1);
TEST_ASSERT(ret == 1, "Unable to write to pipefd");
ret = pthread_join(uffd->thread, NULL);
TEST_ASSERT(ret == 0, "Pthread_join failed.");
close(uffd->uffd);
close(uffd->pipefds[1]);
close(uffd->pipefds[0]);
free(uffd);
}
#endif /* __NR_userfaultfd */
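/*
 * Illustrative usage sketch only (not part of this series): "hva" and "len"
 * are assumed to describe the backing memory of an already-added memslot,
 * and "handler" is a test-supplied uffd_handler_t that resolves each fault,
 * e.g. with a UFFDIO_COPY ioctl.
 */
static void demand_page_region(void *hva, uint64_t len, uffd_handler_t handler)
{
	struct uffd_desc *desc;

	desc = uffd_setup_demand_paging(UFFDIO_REGISTER_MODE_MISSING,
					0 /* no artificial delay */,
					hva, len, handler);

	/* ... run vCPUs that fault on [hva, hva + len) here ... */

	uffd_stop_demand_paging(desc);
}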
@@ -552,7 +552,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt)
{
if (!vm->gdt)
vm->gdt = vm_vaddr_alloc_page(vm);
vm->gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
dt->base = vm->gdt;
dt->limit = getpagesize();
@@ -562,7 +562,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
int selector)
{
if (!vm->tss)
vm->tss = vm_vaddr_alloc_page(vm);
vm->tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
memset(segp, 0, sizeof(*segp));
segp->base = vm->tss;
@@ -647,8 +647,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
vm_vaddr_t stack_vaddr;
struct kvm_vcpu *vcpu;
stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
DEFAULT_GUEST_STACK_VADDR_MIN);
stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
DEFAULT_GUEST_STACK_VADDR_MIN,
MEM_REGION_DATA);
vcpu = __vm_vcpu_add(vm, vcpu_id);
vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
@@ -1145,8 +1146,8 @@ void vm_init_descriptor_tables(struct kvm_vm *vm)
extern void *idt_handlers;
int i;
vm->idt = vm_vaddr_alloc_page(vm);
vm->handlers = vm_vaddr_alloc_page(vm);
vm->idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
/* Handlers have the same address in both address spaces. */
for (i = 0; i < NUM_INTERRUPTS; i++)
set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0,