Commit 2c96136a authored by Linus Torvalds

Merge tag 'x86_cc_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 confidential computing update from Borislav Petkov:

 - Add support for unaccepted memory as specified in the UEFI spec v2.9.

   The gist of it is that Intel TDX and AMD SEV-SNP confidential
   computing guests must accept memory before using it, which prevents
   a whole class of attacks against such guests, such as memory replay.

   There are a couple of strategies for how memory can be accepted;
   the current implementation accepts memory on demand.
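
   As a quick usage sketch (the page-allocator changes in this series
   add an "accept_memory=" early parameter), the acceptance strategy
   can be chosen on the kernel command line:

       accept_memory=lazy    # default: accept memory on first use
       accept_memory=eager   # accept all memory up front during boot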

* tag 'x86_cc_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  virt: sevguest: Add CONFIG_CRYPTO dependency
  x86/efi: Safely enable unaccepted memory in UEFI
  x86/sev: Add SNP-specific unaccepted memory support
  x86/sev: Use large PSC requests if applicable
  x86/sev: Allow for use of the early boot GHCB for PSC requests
  x86/sev: Put PSC struct on the stack in prep for unaccepted memory support
  x86/sev: Fix calculation of end address based on number of pages
  x86/tdx: Add unaccepted memory support
  x86/tdx: Refactor try_accept_one()
  x86/tdx: Make _tdx_hypercall() and __tdx_module_call() available in boot stub
  efi/unaccepted: Avoid load_unaligned_zeropad() stepping into unaccepted memory
  efi: Add unaccepted memory support
  x86/boot/compressed: Handle unaccepted memory
  efi/libstub: Implement support for unaccepted memory
  efi/x86: Get full memory map in allocate_e820()
  mm: Add support for unaccepted memory
parents 3e5822e0 84b9b44b
......@@ -887,9 +887,11 @@ config INTEL_TDX_GUEST
bool "Intel TDX (Trust Domain Extensions) - Guest Support"
depends on X86_64 && CPU_SUP_INTEL
depends on X86_X2APIC
depends on EFI_STUB
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select X86_MCE
select UNACCEPTED_MEMORY
help
Support running as a guest under Intel TDX. Without this support,
the guest kernel can not boot or run under TDX.
......@@ -1544,11 +1546,13 @@ config X86_MEM_ENCRYPT
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
depends on EFI_STUB
select DMA_COHERENT_POOL
select ARCH_USE_MEMREMAP_PROT
select INSTRUCTION_DECODER
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select UNACCEPTED_MEMORY
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
......
......@@ -106,7 +106,8 @@ ifdef CONFIG_X86_64
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o
vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o
vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o
......
......@@ -16,6 +16,7 @@ typedef guid_t efi_guid_t __aligned(__alignof__(u32));
#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42)
#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31)
#define EFI32_LOADER_SIGNATURE "EL32"
#define EFI64_LOADER_SIGNATURE "EL64"
......@@ -32,6 +33,7 @@ typedef struct {
} efi_table_hdr_t;
#define EFI_CONVENTIONAL_MEMORY 7
#define EFI_UNACCEPTED_MEMORY 15
#define EFI_MEMORY_MORE_RELIABLE \
((u64)0x0000000000010000ULL) /* higher reliability */
......@@ -104,6 +106,14 @@ struct efi_setup_data {
u64 reserved[8];
};
struct efi_unaccepted_memory {
u32 version;
u32 unit_size;
u64 phys_base;
u64 size;
unsigned long bitmap[];
};
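/*
 * Illustrative sketch, not part of this patch: with the layout above, a
 * physical address maps to its bit in the bitmap as
 * (pa - phys_base) / unit_size. The helper name is hypothetical.
 */
static inline u64 efi_unaccepted_bit(const struct efi_unaccepted_memory *t, u64 pa)
{
	return (pa - t->phys_base) / t->unit_size;
}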
static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right)
{
return memcmp(&left, &right, sizeof (efi_guid_t));
......
......@@ -22,3 +22,22 @@ void error(char *m)
while (1)
asm("hlt");
}
/* EFI libstub provides vsnprintf() */
#ifdef CONFIG_EFI_STUB
void panic(const char *fmt, ...)
{
static char buf[1024];
va_list args;
int len;
va_start(args, fmt);
len = vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
error(buf);
}
#endif
......@@ -6,5 +6,6 @@
void warn(char *m);
void error(char *m) __noreturn;
void panic(const char *fmt, ...) __noreturn __cold;
#endif /* BOOT_COMPRESSED_ERROR_H */
......@@ -672,6 +672,33 @@ static bool process_mem_region(struct mem_vector *region,
}
#ifdef CONFIG_EFI
/*
* Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are
* guaranteed to be free.
*
* Pick free memory more conservatively than the EFI spec allows: according to
* the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus
* available to place the kernel image into, but in practice there's firmware
* where using that memory leads to crashes. Buggy vendor EFI code registers
* for an event that triggers on SetVirtualAddressMap(). The handler assumes
 * that EFI_BOOT_SERVICES_DATA memory has not been touched by the loader
 * yet, which is probably true for Windows.
*
* Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap().
*/
static inline bool memory_type_is_free(efi_memory_desc_t *md)
{
if (md->type == EFI_CONVENTIONAL_MEMORY)
return true;
if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
md->type == EFI_UNACCEPTED_MEMORY)
return true;
return false;
}
/*
* Returns true if we processed the EFI memmap, which we prefer over the E820
* table if it is available.
......@@ -716,18 +743,7 @@ process_efi_entries(unsigned long minimum, unsigned long image_size)
for (i = 0; i < nr_desc; i++) {
md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
/*
* Here we are more conservative in picking free memory than
* the EFI spec allows:
*
* According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
* free memory and thus available to place the kernel image into,
* but in practice there's firmware where using that memory leads
* to crashes.
*
* Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
*/
if (md->type != EFI_CONVENTIONAL_MEMORY)
if (!memory_type_is_free(md))
continue;
if (efi_soft_reserve_enabled() &&
......
// SPDX-License-Identifier: GPL-2.0-only
#include "error.h"
#include "misc.h"
#include "tdx.h"
#include "sev.h"
#include <asm/shared/tdx.h>
/*
 * accept_memory() and process_unaccepted_memory() are called from the EFI
 * stub, which runs before the decompressor and its early_tdx_detect().
*
* Enumerate TDX directly from the early users.
*/
static bool early_is_tdx_guest(void)
{
static bool once;
static bool is_tdx;
if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST))
return false;
if (!once) {
u32 eax, sig[3];
cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax,
&sig[0], &sig[2], &sig[1]);
is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig));
once = true;
}
return is_tdx;
}
void arch_accept_memory(phys_addr_t start, phys_addr_t end)
{
/* Platform-specific memory-acceptance call goes here */
if (early_is_tdx_guest()) {
if (!tdx_accept_memory(start, end))
panic("TDX: Failed to accept memory\n");
} else if (sev_snp_enabled()) {
snp_accept_memory(start, end);
} else {
error("Cannot accept memory: unknown platform\n");
}
}
bool init_unaccepted_memory(void)
{
guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID;
struct efi_unaccepted_memory *table;
unsigned long cfg_table_pa;
unsigned int cfg_table_len;
enum efi_type et;
int ret;
et = efi_get_type(boot_params);
if (et == EFI_TYPE_NONE)
return false;
ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len);
if (ret) {
warn("EFI config table not found.");
return false;
}
table = (void *)efi_find_vendor_table(boot_params, cfg_table_pa,
cfg_table_len, guid);
if (!table)
return false;
if (table->version != 1)
error("Unknown version of unaccepted memory table\n");
/*
 * In many cases unaccepted_table is already set by the EFI stub, but it
 * has to be initialized again to cover the cases where the table is not
 * allocated by the EFI stub, or where the EFI stub copied the kernel image
 * with efi_relocate_kernel() before the variable was set.
*
* It must be initialized before the first usage of accept_memory().
*/
unaccepted_table = table;
return true;
}
......@@ -455,6 +455,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#endif
debug_putstr("\nDecompressing Linux... ");
if (init_unaccepted_memory()) {
debug_putstr("Accepting memory... ");
accept_memory(__pa(output), __pa(output) + needed_size);
}
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
entry_offset = parse_elf(output);
......
......@@ -247,4 +247,14 @@ static inline unsigned long efi_find_vendor_table(struct boot_params *bp,
}
#endif /* CONFIG_EFI */
#ifdef CONFIG_UNACCEPTED_MEMORY
bool init_unaccepted_memory(void);
#else
static inline bool init_unaccepted_memory(void) { return false; }
#endif
/* Defined in EFI stub */
extern struct efi_unaccepted_memory *unaccepted_table;
void accept_memory(phys_addr_t start, phys_addr_t end);
#endif /* BOOT_COMPRESSED_MISC_H */
......@@ -115,7 +115,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
/* Include code for early handlers */
#include "../../kernel/sev-shared.c"
static inline bool sev_snp_enabled(void)
bool sev_snp_enabled(void)
{
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
}
......@@ -181,6 +181,58 @@ static bool early_setup_ghcb(void)
return true;
}
static phys_addr_t __snp_accept_memory(struct snp_psc_desc *desc,
phys_addr_t pa, phys_addr_t pa_end)
{
struct psc_hdr *hdr;
struct psc_entry *e;
unsigned int i;
hdr = &desc->hdr;
memset(hdr, 0, sizeof(*hdr));
e = desc->entries;
i = 0;
while (pa < pa_end && i < VMGEXIT_PSC_MAX_ENTRY) {
hdr->end_entry = i;
e->gfn = pa >> PAGE_SHIFT;
e->operation = SNP_PAGE_STATE_PRIVATE;
if (IS_ALIGNED(pa, PMD_SIZE) && (pa_end - pa) >= PMD_SIZE) {
e->pagesize = RMP_PG_SIZE_2M;
pa += PMD_SIZE;
} else {
e->pagesize = RMP_PG_SIZE_4K;
pa += PAGE_SIZE;
}
e++;
i++;
}
if (vmgexit_psc(boot_ghcb, desc))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
pvalidate_pages(desc);
return pa;
}
void snp_accept_memory(phys_addr_t start, phys_addr_t end)
{
struct snp_psc_desc desc = {};
unsigned int i;
phys_addr_t pa;
if (!boot_ghcb && !early_setup_ghcb())
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
pa = start;
while (pa < end)
pa = __snp_accept_memory(&desc, pa, end);
}
void sev_es_shutdown_ghcb(void)
{
if (!boot_ghcb)
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* AMD SEV header for early boot related functions.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
#ifndef BOOT_COMPRESSED_SEV_H
#define BOOT_COMPRESSED_SEV_H
#ifdef CONFIG_AMD_MEM_ENCRYPT
bool sev_snp_enabled(void);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
#else
static inline bool sev_snp_enabled(void) { return false; }
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
#endif
#endif
#include "error.h"
#include "../../coco/tdx/tdx-shared.c"
# SPDX-License-Identifier: GPL-2.0
obj-y += tdx.o tdcall.o
obj-y += tdx.o tdx-shared.o tdcall.o
#include <asm/tdx.h>
#include <asm/pgtable.h>
static unsigned long try_accept_one(phys_addr_t start, unsigned long len,
enum pg_level pg_level)
{
unsigned long accept_size = page_level_size(pg_level);
u64 tdcall_rcx;
u8 page_size;
if (!IS_ALIGNED(start, accept_size))
return 0;
if (len < accept_size)
return 0;
/*
* Pass the page physical address to the TDX module to accept the
* pending, private page.
*
* Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
*/
switch (pg_level) {
case PG_LEVEL_4K:
page_size = 0;
break;
case PG_LEVEL_2M:
page_size = 1;
break;
case PG_LEVEL_1G:
page_size = 2;
break;
default:
return 0;
}
tdcall_rcx = start | page_size;
if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
return 0;
return accept_size;
}
bool tdx_accept_memory(phys_addr_t start, phys_addr_t end)
{
/*
* For shared->private conversion, accept the page using
* TDX_ACCEPT_PAGE TDX module call.
*/
while (start < end) {
unsigned long len = end - start;
unsigned long accept_size;
/*
 * Try larger accepts first. It gives the VMM a chance to keep
 * 1G/2M Secure EPT entries where possible and speeds up the
 * process by cutting the number of hypercalls (if successful).
*/
accept_size = try_accept_one(start, len, PG_LEVEL_1G);
if (!accept_size)
accept_size = try_accept_one(start, len, PG_LEVEL_2M);
if (!accept_size)
accept_size = try_accept_one(start, len, PG_LEVEL_4K);
if (!accept_size)
return false;
start += accept_size;
}
return true;
}
......@@ -14,20 +14,6 @@
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
/* TDX module Call Leaf IDs */
#define TDX_GET_INFO 1
#define TDX_GET_VEINFO 3
#define TDX_GET_REPORT 4
#define TDX_ACCEPT_PAGE 6
#define TDX_WR 8
/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
#define TDCS_NOTIFY_ENABLES 0x9100000000000010
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
/* MMIO direction */
#define EPT_READ 0
#define EPT_WRITE 1
......@@ -51,24 +37,6 @@
#define TDREPORT_SUBTYPE_0 0
/*
* Wrapper for standard use of __tdx_hypercall with no output aside from
* return code.
*/
static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = fn,
.r12 = r12,
.r13 = r13,
.r14 = r14,
.r15 = r15,
};
return __tdx_hypercall(&args);
}
/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __tdx_hypercall_failed(void)
{
......@@ -745,47 +713,6 @@ static bool tdx_cache_flush_required(void)
return true;
}
static bool try_accept_one(phys_addr_t *start, unsigned long len,
enum pg_level pg_level)
{
unsigned long accept_size = page_level_size(pg_level);
u64 tdcall_rcx;
u8 page_size;
if (!IS_ALIGNED(*start, accept_size))
return false;
if (len < accept_size)
return false;
/*
* Pass the page physical address to the TDX module to accept the
* pending, private page.
*
* Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
*/
switch (pg_level) {
case PG_LEVEL_4K:
page_size = 0;
break;
case PG_LEVEL_2M:
page_size = 1;
break;
case PG_LEVEL_1G:
page_size = 2;
break;
default:
return false;
}
tdcall_rcx = *start | page_size;
if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
return false;
*start += accept_size;
return true;
}
/*
* Inform the VMM of the guest's intent for this physical page: shared with
* the VMM or private to the guest. The VMM is expected to change its mapping
......@@ -810,32 +737,9 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
return false;
/* private->shared conversion requires only MapGPA call */
if (!enc)
return true;
/*
* For shared->private conversion, accept the page using
* TDX_ACCEPT_PAGE TDX module call.
*/
while (start < end) {
unsigned long len = end - start;
/*
 * Try larger accepts first. It gives the VMM a chance to keep
 * 1G/2M SEPT entries where possible and speeds up the process by
 * cutting the number of hypercalls (if successful).
*/
if (try_accept_one(&start, len, PG_LEVEL_1G))
continue;
if (try_accept_one(&start, len, PG_LEVEL_2M))
continue;
if (!try_accept_one(&start, len, PG_LEVEL_4K))
return false;
}
/* shared->private conversion requires memory to be accepted before use */
if (enc)
return tdx_accept_memory(start, end);
return true;
}
......
......@@ -31,6 +31,8 @@ extern unsigned long efi_mixed_mode_stack_pa;
#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF
#define EFI_UNACCEPTED_UNIT_SIZE PMD_SIZE
/*
* The EFI services are called through variadic functions in many cases. These
* functions are implemented in assembler and support only a fixed number of
......
......@@ -106,8 +106,13 @@ enum psc_op {
#define GHCB_HV_FT_SNP BIT_ULL(0)
#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1)
/* SNP Page State Change NAE event */
#define VMGEXIT_PSC_MAX_ENTRY 253
/*
* SNP Page State Change NAE event
* The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which
* is a local stack variable in set_pages_state(). Do not increase this value
 * without evaluating the impact on stack usage.
*/
#define VMGEXIT_PSC_MAX_ENTRY 64
struct psc_hdr {
u16 cur_entry;
......
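
A rough size check of this trade-off, as a sketch assuming the kernel's 8-byte struct psc_hdr and 8-byte struct psc_entry layouts:

    sizeof(struct snp_psc_desc) ~ 8 + 64 * 8  =  520 bytes   (VMGEXIT_PSC_MAX_ENTRY = 64)
    sizeof(struct snp_psc_desc) ~ 8 + 253 * 8 = 2032 bytes   (previous value of 253)

The smaller descriptor comfortably fits in a local stack variable.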
......@@ -80,11 +80,15 @@ extern void vc_no_ghcb(void);
extern void vc_boot_ghcb(void);
extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
/* PVALIDATE return codes */
#define PVALIDATE_FAIL_SIZEMISMATCH 6
/* Software defined (when rFlags.CF = 1) */
#define PVALIDATE_FAIL_NOUPDATE 255
/* RMP page size */
#define RMP_PG_SIZE_4K 0
#define RMP_PG_SIZE_2M 1
#define RMPADJUST_VMSA_PAGE_BIT BIT(16)
......@@ -192,16 +196,17 @@ struct snp_guest_request_ioctl;
void setup_ghcb(void);
void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned int npages);
unsigned long npages);
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned int npages);
unsigned long npages);
void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
void snp_set_memory_shared(unsigned long vaddr, unsigned int npages);
void snp_set_memory_private(unsigned long vaddr, unsigned int npages);
void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
void __init __noreturn snp_abort(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
......@@ -212,12 +217,12 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
static inline void setup_ghcb(void) { }
static inline void __init
early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { }
early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
static inline void __init
early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { }
early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { }
static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { }
static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { }
static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
static inline void snp_abort(void) { }
......@@ -225,6 +230,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
{
return -ENOTTY;
}
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
#endif
#endif
......@@ -10,6 +10,20 @@
#define TDX_CPUID_LEAF_ID 0x21
#define TDX_IDENT "IntelTDX "
/* TDX module Call Leaf IDs */
#define TDX_GET_INFO 1
#define TDX_GET_VEINFO 3
#define TDX_GET_REPORT 4
#define TDX_ACCEPT_PAGE 6
#define TDX_WR 8
/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
#define TDCS_NOTIFY_ENABLES 0x9100000000000010
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
#ifndef __ASSEMBLY__
/*
......@@ -37,8 +51,47 @@ struct tdx_hypercall_args {
u64 __tdx_hypercall(struct tdx_hypercall_args *args);
u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args);
/*
* Wrapper for standard use of __tdx_hypercall with no output aside from
* return code.
*/
static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = fn,
.r12 = r12,
.r13 = r13,
.r14 = r14,
.r15 = r15,
};
return __tdx_hypercall(&args);
}
/* Called from __tdx_hypercall() for unrecoverable failure */
void __tdx_hypercall_failed(void);
/*
* Used in __tdx_module_call() to gather the output registers' values of the
* TDCALL instruction when requesting services from the TDX module. This is a
* software only structure and not part of the TDX module/VMM ABI
*/
struct tdx_module_output {
u64 rcx;
u64 rdx;
u64 r8;
u64 r9;
u64 r10;
u64 r11;
};
/* Used to communicate with the TDX module */
u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
struct tdx_module_output *out);
bool tdx_accept_memory(phys_addr_t start, phys_addr_t end);
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_SHARED_TDX_H */
......@@ -5,6 +5,8 @@
#include <linux/init.h>
#include <linux/bits.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
#include <asm/shared/tdx.h>
......@@ -20,21 +22,6 @@
#ifndef __ASSEMBLY__
/*
* Used to gather the output registers values of the TDCALL and SEAMCALL
* instructions when requesting services from the TDX module.
*
* This is a software only structure and not part of the TDX module/VMM ABI.
*/
struct tdx_module_output {
u64 rcx;
u64 rdx;
u64 r8;
u64 r9;
u64 r10;
u64 r11;
};
/*
* Used by the #VE exception handler to gather the #VE exception
* info from the TDX module. This is a software only structure
......@@ -55,10 +42,6 @@ struct ve_info {
void __init tdx_early_init(void);
/* Used to communicate with the TDX module */
u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
struct tdx_module_output *out);
void tdx_get_ve_info(struct ve_info *ve);
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
......
#ifndef _ASM_X86_UNACCEPTED_MEMORY_H
#define _ASM_X86_UNACCEPTED_MEMORY_H
#include <linux/efi.h>
#include <asm/tdx.h>
#include <asm/sev.h>
static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end)
{
/* Platform-specific memory-acceptance call goes here */
if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
if (!tdx_accept_memory(start, end))
panic("TDX: Failed to accept memory\n");
} else if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
snp_accept_memory(start, end);
} else {
panic("Cannot accept memory: unknown platform\n");
}
}
static inline struct efi_unaccepted_memory *efi_get_unaccepted_table(void)
{
if (efi.unaccepted == EFI_INVALID_TABLE_ADDR)
return NULL;
return __va(efi.unaccepted);
}
#endif
......@@ -12,6 +12,9 @@
#ifndef __BOOT_COMPRESSED
#define error(v) pr_err(v)
#define has_cpuflag(f) boot_cpu_has(f)
#else
#undef WARN
#define WARN(condition, format...) (!!(condition))
#endif
/* I/O parameters for CPUID-related helpers */
......@@ -991,3 +994,103 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
cpuid_ext_range_max = fn->eax;
}
}
static void pvalidate_pages(struct snp_psc_desc *desc)
{
struct psc_entry *e;
unsigned long vaddr;
unsigned int size;
unsigned int i;
bool validate;
int rc;
for (i = 0; i <= desc->hdr.end_entry; i++) {
e = &desc->entries[i];
vaddr = (unsigned long)pfn_to_kaddr(e->gfn);
size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
validate = e->operation == SNP_PAGE_STATE_PRIVATE;
rc = pvalidate(vaddr, size, validate);
if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) {
unsigned long vaddr_end = vaddr + PMD_SIZE;
for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
if (rc)
break;
}
}
if (rc) {
WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc);
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
}
}
}
static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
{
int cur_entry, end_entry, ret = 0;
struct snp_psc_desc *data;
struct es_em_ctxt ctxt;
vc_ghcb_invalidate(ghcb);
/* Copy the input desc into GHCB shared buffer */
data = (struct snp_psc_desc *)ghcb->shared_buffer;
memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
/*
* As per the GHCB specification, the hypervisor can resume the guest
* before processing all the entries. Check whether all the entries
* are processed. If not, then keep retrying. Note, the hypervisor
* will update the data memory directly to indicate the status, so
* reference the data->hdr everywhere.
*
* The strategy here is to wait for the hypervisor to change the page
 * state in the RMP table before the guest accesses the memory pages. If the
* page state change was not successful, then later memory access will
* result in a crash.
*/
cur_entry = data->hdr.cur_entry;
end_entry = data->hdr.end_entry;
while (data->hdr.cur_entry <= data->hdr.end_entry) {
ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
/* This will advance the entry that the shared buffer data points to. */
ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
/*
* Page State Change VMGEXIT can pass error code through
* exit_info_2.
*/
if (WARN(ret || ghcb->save.sw_exit_info_2,
"SNP: PSC failed ret=%d exit_info_2=%llx\n",
ret, ghcb->save.sw_exit_info_2)) {
ret = 1;
goto out;
}
/* Verify that reserved bit is not set */
if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
ret = 1;
goto out;
}
/*
* Sanity check that entry processing is not going backwards.
 * This will happen only if the hypervisor is tricking us.
*/
if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
ret = 1;
goto out;
}
}
out:
return ret;
}
......@@ -96,6 +96,9 @@ static const unsigned long * const efi_tables[] = {
#ifdef CONFIG_EFI_COCO_SECRET
&efi.coco_secret,
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
&efi.unaccepted,
#endif
};
u64 efi_setup; /* efi setup_data physical address */
......
......@@ -448,6 +448,9 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d ShmemPmdMapped: %8lu kB\n"
"Node %d FileHugePages: %8lu kB\n"
"Node %d FilePmdMapped: %8lu kB\n"
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
"Node %d Unaccepted: %8lu kB\n"
#endif
,
nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
......@@ -477,6 +480,10 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
nid, K(node_page_state(pgdat, NR_FILE_THPS)),
nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
,
nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED))
#endif
);
len += hugetlb_report_node_meminfo(buf, len, nid);
......
......@@ -269,6 +269,20 @@ config EFI_COCO_SECRET
virt/coco/efi_secret module to access the secrets, which in turn
allows userspace programs to access the injected secrets.
config UNACCEPTED_MEMORY
bool
depends on EFI_STUB
help
Some Virtual Machine platforms, such as Intel TDX, require
some memory to be "accepted" by the guest before it can be used.
This mechanism helps prevent malicious hosts from making changes
to guest memory.
UEFI specification v2.9 introduced the EFI_UNACCEPTED_MEMORY memory type.
This option adds support for unaccepted memory and makes such memory
usable by the kernel.
config EFI_EMBEDDED_FIRMWARE
bool
select CRYPTO_LIB_SHA256
......
......@@ -41,3 +41,4 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o
obj-$(CONFIG_EFI_EARLYCON) += earlycon.o
obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o
obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o
obj-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o
......@@ -50,6 +50,9 @@ struct efi __read_mostly efi = {
#ifdef CONFIG_EFI_COCO_SECRET
.coco_secret = EFI_INVALID_TABLE_ADDR,
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
.unaccepted = EFI_INVALID_TABLE_ADDR,
#endif
};
EXPORT_SYMBOL(efi);
......@@ -584,6 +587,9 @@ static const efi_config_table_type_t common_tables[] __initconst = {
#ifdef CONFIG_EFI_COCO_SECRET
{LINUX_EFI_COCO_SECRET_AREA_GUID, &efi.coco_secret, "CocoSecret" },
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
{LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID, &efi.unaccepted, "Unaccepted" },
#endif
#ifdef CONFIG_EFI_GENERIC_STUB
{LINUX_EFI_SCREEN_INFO_TABLE_GUID, &screen_info_table },
#endif
......@@ -738,6 +744,25 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables,
}
}
if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
efi.unaccepted != EFI_INVALID_TABLE_ADDR) {
struct efi_unaccepted_memory *unaccepted;
unaccepted = early_memremap(efi.unaccepted, sizeof(*unaccepted));
if (unaccepted) {
unsigned long size;
if (unaccepted->version == 1) {
size = sizeof(*unaccepted) + unaccepted->size;
memblock_reserve(efi.unaccepted, size);
} else {
efi.unaccepted = EFI_INVALID_TABLE_ADDR;
}
early_memunmap(unaccepted, sizeof(*unaccepted));
}
}
return 0;
}
......@@ -822,6 +847,7 @@ static __initdata char memory_type_name[][13] = {
"MMIO Port",
"PAL Code",
"Persistent",
"Unaccepted",
};
char * __init efi_md_typeattr_format(char *buf, size_t size,
......
......@@ -96,6 +96,8 @@ CFLAGS_arm32-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
zboot-obj-$(CONFIG_RISCV) := lib-clz_ctz.o lib-ashldi3.o
lib-$(CONFIG_EFI_ZBOOT) += zboot.o $(zboot-obj-y)
lib-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o bitmap.o find.o
extra-y := $(lib-y)
lib-y := $(patsubst %.o,%.stub.o,$(lib-y))
......
#include <linux/bitmap.h>
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
unsigned long *p = map + BIT_WORD(start);
const unsigned int size = start + len;
int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_set >= 0) {
*p |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_LONG;
mask_to_set = ~0UL;
p++;
}
if (len) {
mask_to_set &= BITMAP_LAST_WORD_MASK(size);
*p |= mask_to_set;
}
}
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
unsigned long *p = map + BIT_WORD(start);
const unsigned int size = start + len;
int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_clear >= 0) {
*p &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_LONG;
mask_to_clear = ~0UL;
p++;
}
if (len) {
mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
*p &= ~mask_to_clear;
}
}
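/*
 * Worked example, illustrative only (64-bit words): __bitmap_set(map, 3, 10).
 * bits_to_set = 64 - 3 = 61 and mask_to_set = ~0UL << 3. Since len (10) is
 * less than bits_to_set, the loop body never runs; the tail step intersects
 * mask_to_set with BITMAP_LAST_WORD_MASK(13) and sets bits 3..12 of word 0.
 */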
......@@ -1136,4 +1136,10 @@ void efi_remap_image(unsigned long image_base, unsigned alloc_size,
asmlinkage efi_status_t __efiapi
efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab);
efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc,
struct efi_boot_memmap *map);
void process_unaccepted_memory(u64 start, u64 end);
void accept_memory(phys_addr_t start, phys_addr_t end);
void arch_accept_memory(phys_addr_t start, phys_addr_t end);
#endif
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/bitmap.h>
#include <linux/math.h>
#include <linux/minmax.h>
/*
* Common helper for find_next_bit() function family
* @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
* @MUNGE: The expression that post-processes a word containing found bit (may be empty)
* @size: The bitmap size in bits
* @start: The bitnumber to start searching at
*/
#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \
({ \
unsigned long mask, idx, tmp, sz = (size), __start = (start); \
\
if (unlikely(__start >= sz)) \
goto out; \
\
mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \
idx = __start / BITS_PER_LONG; \
\
for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \
if ((idx + 1) * BITS_PER_LONG >= sz) \
goto out; \
idx++; \
} \
\
sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \
out: \
sz; \
})
unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start)
{
return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start);
}
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
unsigned long start)
{
return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start);
}
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/efi.h>
#include <asm/efi.h>
#include "efistub.h"
struct efi_unaccepted_memory *unaccepted_table;
efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc,
struct efi_boot_memmap *map)
{
efi_guid_t unaccepted_table_guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID;
u64 unaccepted_start = ULLONG_MAX, unaccepted_end = 0, bitmap_size;
efi_status_t status;
int i;
/* Check if the table is already installed */
unaccepted_table = get_efi_config_table(unaccepted_table_guid);
if (unaccepted_table) {
if (unaccepted_table->version != 1) {
efi_err("Unknown version of unaccepted memory table\n");
return EFI_UNSUPPORTED;
}
return EFI_SUCCESS;
}
/* Check if there's any unaccepted memory and find the max address */
for (i = 0; i < nr_desc; i++) {
efi_memory_desc_t *d;
unsigned long m = (unsigned long)map->map;
d = efi_early_memdesc_ptr(m, map->desc_size, i);
if (d->type != EFI_UNACCEPTED_MEMORY)
continue;
unaccepted_start = min(unaccepted_start, d->phys_addr);
unaccepted_end = max(unaccepted_end,
d->phys_addr + d->num_pages * PAGE_SIZE);
}
if (unaccepted_start == ULLONG_MAX)
return EFI_SUCCESS;
unaccepted_start = round_down(unaccepted_start,
EFI_UNACCEPTED_UNIT_SIZE);
unaccepted_end = round_up(unaccepted_end, EFI_UNACCEPTED_UNIT_SIZE);
/*
* If unaccepted memory is present, allocate a bitmap to track what
* memory has to be accepted before access.
*
* One bit in the bitmap represents 2MiB in the address space:
* A 4k bitmap can track 64GiB of physical address space.
*
* In the worst case scenario -- a huge hole in the middle of the
 * address space -- it needs 256MiB to handle 4PiB of the address
* space.
*
* The bitmap will be populated in setup_e820() according to the memory
* map after efi_exit_boot_services().
*/
bitmap_size = DIV_ROUND_UP(unaccepted_end - unaccepted_start,
EFI_UNACCEPTED_UNIT_SIZE * BITS_PER_BYTE);
status = efi_bs_call(allocate_pool, EFI_LOADER_DATA,
sizeof(*unaccepted_table) + bitmap_size,
(void **)&unaccepted_table);
if (status != EFI_SUCCESS) {
efi_err("Failed to allocate unaccepted memory config table\n");
return status;
}
unaccepted_table->version = 1;
unaccepted_table->unit_size = EFI_UNACCEPTED_UNIT_SIZE;
unaccepted_table->phys_base = unaccepted_start;
unaccepted_table->size = bitmap_size;
memset(unaccepted_table->bitmap, 0, bitmap_size);
status = efi_bs_call(install_configuration_table,
&unaccepted_table_guid, unaccepted_table);
if (status != EFI_SUCCESS) {
efi_bs_call(free_pool, unaccepted_table);
efi_err("Failed to install unaccepted memory config table!\n");
}
return status;
}
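/*
 * Worked example of the sizing above, illustrative only (unit_size = 2MiB):
 *   64 GiB span: 64 GiB / 2 MiB = 32768 bits =   4 KiB of bitmap
 *    4 PiB span:  4 PiB / 2 MiB = 2^31  bits = 256 MiB of bitmap
 */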
/*
* The accepted memory bitmap only works at unit_size granularity. Take
* unaligned start/end addresses and either:
 * 1. Accept the memory immediately and in its entirety
 * 2. Accept unaligned parts and mark *some* aligned part unaccepted
*
* The function will never reach the bitmap_set() with zero bits to set.
*/
void process_unaccepted_memory(u64 start, u64 end)
{
u64 unit_size = unaccepted_table->unit_size;
u64 unit_mask = unaccepted_table->unit_size - 1;
u64 bitmap_size = unaccepted_table->size;
/*
* Ensure that at least one bit will be set in the bitmap by
* immediately accepting all regions under 2*unit_size. This is
* imprecise and may immediately accept some areas that could
 * have been represented in the bitmap. But it results in simpler
 * code below.
*
* Consider case like this (assuming unit_size == 2MB):
*
* | 4k | 2044k | 2048k |
* ^ 0x0 ^ 2MB ^ 4MB
*
 * Only the first 4k has been accepted. The 0MB->2MB region cannot be
* represented in the bitmap. The 2MB->4MB region can be represented in
* the bitmap. But, the 0MB->4MB region is <2*unit_size and will be
* immediately accepted in its entirety.
*/
if (end - start < 2 * unit_size) {
arch_accept_memory(start, end);
return;
}
/*
* No matter how the start and end are aligned, at least one unaccepted
* unit_size area will remain to be marked in the bitmap.
*/
/* Immediately accept a <unit_size piece at the start: */
if (start & unit_mask) {
arch_accept_memory(start, round_up(start, unit_size));
start = round_up(start, unit_size);
}
/* Immediately accept a <unit_size piece at the end: */
if (end & unit_mask) {
arch_accept_memory(round_down(end, unit_size), end);
end = round_down(end, unit_size);
}
/*
 * Accept the part of the range that is before phys_base and cannot
 * be recorded in the bitmap.
*/
if (start < unaccepted_table->phys_base) {
arch_accept_memory(start,
min(unaccepted_table->phys_base, end));
start = unaccepted_table->phys_base;
}
/* Nothing to record */
if (end < unaccepted_table->phys_base)
return;
/* Translate to offsets from the beginning of the bitmap */
start -= unaccepted_table->phys_base;
end -= unaccepted_table->phys_base;
/* Accept memory that doesn't fit into bitmap */
if (end > bitmap_size * unit_size * BITS_PER_BYTE) {
unsigned long phys_start, phys_end;
phys_start = bitmap_size * unit_size * BITS_PER_BYTE +
unaccepted_table->phys_base;
phys_end = end + unaccepted_table->phys_base;
arch_accept_memory(phys_start, phys_end);
end = bitmap_size * unit_size * BITS_PER_BYTE;
}
/*
* 'start' and 'end' are now both unit_size-aligned.
* Record the range as being unaccepted:
*/
bitmap_set(unaccepted_table->bitmap,
start / unit_size, (end - start) / unit_size);
}
void accept_memory(phys_addr_t start, phys_addr_t end)
{
unsigned long range_start, range_end;
unsigned long bitmap_size;
u64 unit_size;
if (!unaccepted_table)
return;
unit_size = unaccepted_table->unit_size;
/*
* Only care for the part of the range that is represented
* in the bitmap.
*/
if (start < unaccepted_table->phys_base)
start = unaccepted_table->phys_base;
if (end < unaccepted_table->phys_base)
return;
/* Translate to offsets from the beginning of the bitmap */
start -= unaccepted_table->phys_base;
end -= unaccepted_table->phys_base;
/* Make sure not to overrun the bitmap */
if (end > unaccepted_table->size * unit_size * BITS_PER_BYTE)
end = unaccepted_table->size * unit_size * BITS_PER_BYTE;
range_start = start / unit_size;
bitmap_size = DIV_ROUND_UP(end, unit_size);
for_each_set_bitrange_from(range_start, range_end,
unaccepted_table->bitmap, bitmap_size) {
unsigned long phys_start, phys_end;
phys_start = range_start * unit_size + unaccepted_table->phys_base;
phys_end = range_end * unit_size + unaccepted_table->phys_base;
arch_accept_memory(phys_start, phys_end);
bitmap_clear(unaccepted_table->bitmap,
range_start, range_end - range_start);
}
}
......@@ -26,6 +26,17 @@ const efi_dxe_services_table_t *efi_dxe_table;
u32 image_offset __section(".data");
static efi_loaded_image_t *image = NULL;
typedef union sev_memory_acceptance_protocol sev_memory_acceptance_protocol_t;
union sev_memory_acceptance_protocol {
struct {
efi_status_t (__efiapi * allow_unaccepted_memory)(
sev_memory_acceptance_protocol_t *);
};
struct {
u32 allow_unaccepted_memory;
} mixed_mode;
};
static efi_status_t
preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
{
......@@ -310,6 +321,29 @@ setup_memory_protection(unsigned long image_base, unsigned long image_size)
#endif
}
static void setup_unaccepted_memory(void)
{
efi_guid_t mem_acceptance_proto = OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID;
sev_memory_acceptance_protocol_t *proto;
efi_status_t status;
if (!IS_ENABLED(CONFIG_UNACCEPTED_MEMORY))
return;
/*
 * Enable unaccepted memory before calling exit boot services so that
 * UEFI does not accept all memory on EBS.
*/
status = efi_bs_call(locate_protocol, &mem_acceptance_proto, NULL,
(void **)&proto);
if (status != EFI_SUCCESS)
return;
status = efi_call_proto(proto, allow_unaccepted_memory);
if (status != EFI_SUCCESS)
efi_err("Memory acceptance protocol failed\n");
}
static const efi_char16_t apple[] = L"Apple";
static void setup_quirks(struct boot_params *boot_params,
......@@ -613,6 +647,16 @@ setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_s
e820_type = E820_TYPE_PMEM;
break;
case EFI_UNACCEPTED_MEMORY:
if (!IS_ENABLED(CONFIG_UNACCEPTED_MEMORY)) {
efi_warn_once(
"The system has unaccepted memory, but kernel does not support it\nConsider enabling CONFIG_UNACCEPTED_MEMORY\n");
continue;
}
e820_type = E820_TYPE_RAM;
process_unaccepted_memory(d->phys_addr,
d->phys_addr + PAGE_SIZE * d->num_pages);
break;
default:
continue;
}
......@@ -681,28 +725,27 @@ static efi_status_t allocate_e820(struct boot_params *params,
struct setup_data **e820ext,
u32 *e820ext_size)
{
unsigned long map_size, desc_size, map_key;
struct efi_boot_memmap *map;
efi_status_t status;
__u32 nr_desc, desc_version;
/* Only need the size of the mem map and size of each mem descriptor */
map_size = 0;
status = efi_bs_call(get_memory_map, &map_size, NULL, &map_key,
&desc_size, &desc_version);
if (status != EFI_BUFFER_TOO_SMALL)
return (status != EFI_SUCCESS) ? status : EFI_UNSUPPORTED;
__u32 nr_desc;
nr_desc = map_size / desc_size + EFI_MMAP_NR_SLACK_SLOTS;
status = efi_get_memory_map(&map, false);
if (status != EFI_SUCCESS)
return status;
if (nr_desc > ARRAY_SIZE(params->e820_table)) {
u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table);
nr_desc = map->map_size / map->desc_size;
if (nr_desc > ARRAY_SIZE(params->e820_table) - EFI_MMAP_NR_SLACK_SLOTS) {
u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table) +
EFI_MMAP_NR_SLACK_SLOTS;
status = alloc_e820ext(nr_e820ext, e820ext, e820ext_size);
if (status != EFI_SUCCESS)
return status;
}
return EFI_SUCCESS;
if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && status == EFI_SUCCESS)
status = allocate_unaccepted_bitmap(nr_desc, map);
efi_bs_call(free_pool, map);
return status;
}
struct exit_boot_struct {
......@@ -899,6 +942,8 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start);
setup_unaccepted_memory();
status = exit_boot(boot_params, handle);
if (status != EFI_SUCCESS) {
efi_err("exit_boot() failed!\n");
......
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/efi.h>
#include <linux/memblock.h>
#include <linux/spinlock.h>
#include <asm/unaccepted_memory.h>
/* Protects unaccepted memory bitmap */
static DEFINE_SPINLOCK(unaccepted_memory_lock);
/*
* accept_memory() -- Consult bitmap and accept the memory if needed.
*
* Only memory that is explicitly marked as unaccepted in the bitmap requires
* an action. All the remaining memory is implicitly accepted and doesn't need
* acceptance.
*
* No need to accept:
* - anything if the system has no unaccepted table;
* - memory that is below phys_base;
 * - memory that is above the memory addressable by the bitmap.
*/
void accept_memory(phys_addr_t start, phys_addr_t end)
{
struct efi_unaccepted_memory *unaccepted;
unsigned long range_start, range_end;
unsigned long flags;
u64 unit_size;
unaccepted = efi_get_unaccepted_table();
if (!unaccepted)
return;
unit_size = unaccepted->unit_size;
/*
* Only care for the part of the range that is represented
* in the bitmap.
*/
if (start < unaccepted->phys_base)
start = unaccepted->phys_base;
if (end < unaccepted->phys_base)
return;
/* Translate to offsets from the beginning of the bitmap */
start -= unaccepted->phys_base;
end -= unaccepted->phys_base;
/*
* load_unaligned_zeropad() can lead to unwanted loads across page
 * boundaries. The unwanted loads are typically harmless. But they
 * might hit totally unrelated or even unmapped memory.
* load_unaligned_zeropad() relies on exception fixup (#PF, #GP and now
* #VE) to recover from these unwanted loads.
*
* But, this approach does not work for unaccepted memory. For TDX, a
* load from unaccepted memory will not lead to a recoverable exception
* within the guest. The guest will exit to the VMM where the only
* recourse is to terminate the guest.
*
* There are two parts to fix this issue and comprehensively avoid
* access to unaccepted memory. Together these ensure that an extra
* "guard" page is accepted in addition to the memory that needs to be
* used:
*
* 1. Implicitly extend the range_contains_unaccepted_memory(start, end)
* checks up to end+unit_size if 'end' is aligned on a unit_size
* boundary.
*
* 2. Implicitly extend accept_memory(start, end) to end+unit_size if
* 'end' is aligned on a unit_size boundary. (immediately following
* this comment)
*/
if (!(end % unit_size))
end += unit_size;
/* Make sure not to overrun the bitmap */
if (end > unaccepted->size * unit_size * BITS_PER_BYTE)
end = unaccepted->size * unit_size * BITS_PER_BYTE;
range_start = start / unit_size;
spin_lock_irqsave(&unaccepted_memory_lock, flags);
for_each_set_bitrange_from(range_start, range_end, unaccepted->bitmap,
DIV_ROUND_UP(end, unit_size)) {
unsigned long phys_start, phys_end;
unsigned long len = range_end - range_start;
phys_start = range_start * unit_size + unaccepted->phys_base;
phys_end = range_end * unit_size + unaccepted->phys_base;
arch_accept_memory(phys_start, phys_end);
bitmap_clear(unaccepted->bitmap, range_start, len);
}
spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
}
bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end)
{
struct efi_unaccepted_memory *unaccepted;
unsigned long flags;
bool ret = false;
u64 unit_size;
unaccepted = efi_get_unaccepted_table();
if (!unaccepted)
return false;
unit_size = unaccepted->unit_size;
/*
* Only care for the part of the range that is represented
* in the bitmap.
*/
if (start < unaccepted->phys_base)
start = unaccepted->phys_base;
if (end < unaccepted->phys_base)
return false;
/* Translate to offsets from the beginning of the bitmap */
start -= unaccepted->phys_base;
end -= unaccepted->phys_base;
/*
* Also consider the unaccepted state of the *next* page. See fix #1 in
* the comment on load_unaligned_zeropad() in accept_memory().
*/
if (!(end % unit_size))
end += unit_size;
/* Make sure not to overrun the bitmap */
if (end > unaccepted->size * unit_size * BITS_PER_BYTE)
end = unaccepted->size * unit_size * BITS_PER_BYTE;
spin_lock_irqsave(&unaccepted_memory_lock, flags);
while (start < end) {
if (test_bit(start / unit_size, unaccepted->bitmap)) {
ret = true;
break;
}
start += unit_size;
}
spin_unlock_irqrestore(&unaccepted_memory_lock, flags);
return ret;
}
......@@ -2,6 +2,7 @@ config SEV_GUEST
tristate "AMD SEV Guest driver"
default m
depends on AMD_MEM_ENCRYPT
select CRYPTO
select CRYPTO_AEAD2
select CRYPTO_GCM
help
......
......@@ -168,6 +168,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_zone_page_state(NR_FREE_CMA_PAGES));
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
show_val_kb(m, "Unaccepted: ",
global_zone_page_state(NR_UNACCEPTED));
#endif
hugetlb_report_meminfo(m);
arch_report_meminfo(m);
......
......@@ -108,7 +108,8 @@ typedef struct {
#define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12
#define EFI_PAL_CODE 13
#define EFI_PERSISTENT_MEMORY 14
#define EFI_MAX_MEMORY_TYPE 15
#define EFI_UNACCEPTED_MEMORY 15
#define EFI_MAX_MEMORY_TYPE 16
/* Attribute values: */
#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */
......@@ -417,6 +418,7 @@ void efi_native_runtime_setup(void);
#define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89)
#define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47)
#define LINUX_EFI_BOOT_MEMMAP_GUID EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4)
#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31)
#define RISCV_EFI_BOOT_PROTOCOL_GUID EFI_GUID(0xccd15fec, 0x6f73, 0x4eec, 0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf)
......@@ -435,6 +437,9 @@ void efi_native_runtime_setup(void);
#define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55)
#define AMD_SEV_MEM_ENCRYPT_GUID EFI_GUID(0x0cf29b71, 0x9e51, 0x433a, 0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75)
/* OVMF protocol GUIDs */
#define OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID EFI_GUID(0xc5a010fe, 0x38a7, 0x4531, 0x8a, 0x4a, 0x05, 0x00, 0xd2, 0xfd, 0x16, 0x49)
typedef struct {
efi_guid_t guid;
u64 table;
......@@ -534,6 +539,14 @@ struct efi_boot_memmap {
efi_memory_desc_t map[];
};
struct efi_unaccepted_memory {
u32 version;
u32 unit_size;
u64 phys_base;
u64 size;
unsigned long bitmap[];
};
/*
* Architecture independent structure for describing a memory map for the
* benefit of efi_memmap_init_early(), and for passing context between
......@@ -636,6 +649,7 @@ extern struct efi {
unsigned long tpm_final_log; /* TPM2 Final Events Log table */
unsigned long mokvar_table; /* MOK variable config table */
unsigned long coco_secret; /* Confidential computing secret table */
unsigned long unaccepted; /* Unaccepted memory table */
efi_get_time_t *get_time;
efi_set_time_t *set_time;
......
......@@ -3839,4 +3839,23 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
}
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
void accept_memory(phys_addr_t start, phys_addr_t end);
#else
static inline bool range_contains_unaccepted_memory(phys_addr_t start,
phys_addr_t end)
{
return false;
}
static inline void accept_memory(phys_addr_t start, phys_addr_t end)
{
}
#endif
#endif /* _LINUX_MM_H */
......@@ -143,6 +143,9 @@ enum zone_stat_item {
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
NR_FREE_CMA_PAGES,
#ifdef CONFIG_UNACCEPTED_MEMORY
NR_UNACCEPTED,
#endif
NR_VM_ZONE_STAT_ITEMS };
enum node_stat_item {
......@@ -910,6 +913,11 @@ struct zone {
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER + 1];
#ifdef CONFIG_UNACCEPTED_MEMORY
/* Pages to be accepted. All pages on the list are MAX_ORDER */
struct list_head unaccepted_pages;
#endif
/* zone flags, see below */
unsigned long flags;
......
......@@ -1436,6 +1436,15 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
*/
kmemleak_alloc_phys(found, size, 0);
/*
* Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
* require memory to be accepted before it can be used by the
* guest.
*
* Accept the memory of the allocated buffer.
*/
accept_memory(found, found + size);
return found;
}
......
......@@ -1375,6 +1375,10 @@ static void __meminit zone_init_free_lists(struct zone *zone)
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
#endif
}
void __meminit init_currently_empty_zone(struct zone *zone,
......@@ -1960,6 +1964,9 @@ static void __init deferred_free_range(unsigned long pfn,
return;
}
/* Accept chunks smaller than MAX_ORDER upfront */
accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
......
......@@ -387,6 +387,12 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
static bool try_to_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
......@@ -1481,6 +1487,13 @@ void __free_pages_core(struct page *page, unsigned int order)
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
if (page_contains_unaccepted(page, order)) {
if (order == MAX_ORDER && __free_unaccepted(page))
return;
accept_page(page, order);
}
/*
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
......@@ -3159,6 +3172,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
unusable_free += zone_page_state(z, NR_UNACCEPTED);
#endif
return unusable_free;
}
......@@ -3458,6 +3474,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
gfp_mask)) {
int ret;
if (has_unaccepted_memory()) {
if (try_to_accept_memory(zone, order))
goto try_this_zone;
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
......@@ -3510,6 +3531,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
return page;
} else {
if (has_unaccepted_memory()) {
if (try_to_accept_memory(zone, order))
goto try_this_zone;
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (deferred_pages_enabled()) {
......@@ -7215,3 +7241,150 @@ bool has_managed_dma(void)
return false;
}
#endif /* CONFIG_ZONE_DMA */
#ifdef CONFIG_UNACCEPTED_MEMORY
/* Counts number of zones with unaccepted pages. */
static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
static bool lazy_accept = true;
static int __init accept_memory_parse(char *p)
{
if (!strcmp(p, "lazy")) {
lazy_accept = true;
return 0;
} else if (!strcmp(p, "eager")) {
lazy_accept = false;
return 0;
} else {
return -EINVAL;
}
}
early_param("accept_memory", accept_memory_parse);
static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
phys_addr_t start = page_to_phys(page);
phys_addr_t end = start + (PAGE_SIZE << order);
return range_contains_unaccepted_memory(start, end);
}
static void accept_page(struct page *page, unsigned int order)
{
phys_addr_t start = page_to_phys(page);
accept_memory(start, start + (PAGE_SIZE << order));
}
static bool try_to_accept_memory_one(struct zone *zone)
{
unsigned long flags;
struct page *page;
bool last;
if (list_empty(&zone->unaccepted_pages))
return false;
spin_lock_irqsave(&zone->lock, flags);
page = list_first_entry_or_null(&zone->unaccepted_pages,
struct page, lru);
if (!page) {
spin_unlock_irqrestore(&zone->lock, flags);
return false;
}
list_del(&page->lru);
last = list_empty(&zone->unaccepted_pages);
__mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
accept_page(page, MAX_ORDER);
__free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
if (last)
static_branch_dec(&zones_with_unaccepted_pages);
return true;
}
static bool try_to_accept_memory(struct zone *zone, unsigned int order)
{
long to_accept;
int ret = false;
/* How much to accept to get to high watermark? */
to_accept = high_wmark_pages(zone) -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0));
/* Accept at least one page */
do {
if (!try_to_accept_memory_one(zone))
break;
ret = true;
to_accept -= MAX_ORDER_NR_PAGES;
} while (to_accept > 0);
return ret;
}
static inline bool has_unaccepted_memory(void)
{
return static_branch_unlikely(&zones_with_unaccepted_pages);
}
static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
__mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
if (first)
static_branch_inc(&zones_with_unaccepted_pages);
return true;
}
#else
static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
return false;
}
static void accept_page(struct page *page, unsigned int order)
{
}
static bool try_to_accept_memory(struct zone *zone, unsigned int order)
{
return false;
}
static inline bool has_unaccepted_memory(void)
{
return false;
}
static bool __free_unaccepted(struct page *page)
{
BUILD_BUG();
return false;
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
......@@ -1180,6 +1180,9 @@ const char * const vmstat_text[] = {
"nr_zspages",
#endif
"nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
"nr_unaccepted",
#endif
/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
......