Commit d22fff81 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:

 - Extend the memmap= boot parameter syntax to allow the redeclaration
   and dropping of existing ranges, and to support all e820 range types
   (Jan H. Schönherr)

 - Improve the W+X boot time security checks to remove false positive
   warnings on Xen (Jan Beulich)

 - Support booting as Xen PVH guest (Juergen Gross)

 - Improved 5-level paging (LA57) support, in particular it's possible
   now to have a single kernel image for both 4-level and 5-level
   hardware (Kirill A. Shutemov)

 - AMD hardware RAM encryption support (SME/SEV) fixes (Tom Lendacky)

 - Preparatory commits for hardware-encrypted RAM support on Intel CPUs.
   (Kirill A. Shutemov)

 - Improved Intel-MID support (Andy Shevchenko)

 - Show EFI page tables in page_tables debug files (Andy Lutomirski)

 - ... plus misc fixes and smaller cleanups

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (56 commits)
  x86/cpu/tme: Fix spelling: "configuation" -> "configuration"
  x86/boot: Fix SEV boot failure from change to __PHYSICAL_MASK_SHIFT
  x86/mm: Update comment in detect_tme() regarding x86_phys_bits
  x86/mm/32: Remove unused node_memmap_size_bytes() & CONFIG_NEED_NODE_MEMMAP_SIZE logic
  x86/mm: Remove pointless checks in vmalloc_fault
  x86/platform/intel-mid: Add special handling for ACPI HW reduced platforms
  ACPI, x86/boot: Introduce the ->reduced_hw_early_init() ACPI callback
  ACPI, x86/boot: Split out acpi_generic_reduce_hw_init() and export
  x86/pconfig: Provide defines and helper to run MKTME_KEY_PROG leaf
  x86/pconfig: Detect PCONFIG targets
  x86/tme: Detect if TME and MKTME is activated by BIOS
  x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G
  x86/boot/compressed/64: Use page table in trampoline memory
  x86/boot/compressed/64: Use stack from trampoline memory
  x86/boot/compressed/64: Make sure we have a 32-bit code segment
  x86/mm: Do not use paravirtualized calls in native_set_p4d()
  kdump, vmcoreinfo: Export pgtable_l5_enabled value
  x86/boot/compressed/64: Prepare new top-level page table for trampoline
  x86/boot/compressed/64: Set up trampoline memory
  x86/boot/compressed/64: Save and restore trampoline memory
  ...
parents 986b37c0 eaeb8e76
......@@ -2248,6 +2248,15 @@
The memory region may be marked as e820 type 12 (0xc)
and is NVDIMM or ADR memory.
memmap=<size>%<offset>-<oldtype>+<newtype>
[KNL,ACPI] Convert memory within the specified region
from <oldtype> to <newtype>. If "-<oldtype>" is left
out, the whole region will be marked as <newtype>,
even if previously unavailable. If "+<newtype>" is left
out, matching memory will be removed. Types are
specified as e820 types, e.g., 1 = RAM, 2 = reserved,
3 = ACPI, 12 = PRAM.
memory_corruption_check=0/1 [X86]
Some BIOSes seem to corrupt the first 64k of
memory when doing things like suspend/resume.
......
......@@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt
CONFIG_X86_5LEVEL=y enables the feature.
So far, a kernel compiled with the option enabled will be able to boot
only on machines that supports the feature -- see for 'la57' flag in
/proc/cpuinfo.
The plan is to implement boot-time switching between 4- and 5-level paging
in the future.
Kernel with CONFIG_X86_5LEVEL=y still able to boot on 4-level hardware.
In this case additional page table level -- p4d -- will be folded at
runtime.
== User-space and large virtual address space ==
......
......@@ -1461,6 +1461,8 @@ config X86_PAE
config X86_5LEVEL
bool "Enable 5-level page tables support"
select DYNAMIC_MEMORY_LAYOUT
select SPARSEMEM_VMEMMAP
depends on X86_64
---help---
5-level paging enables access to larger address space:
......@@ -1469,8 +1471,8 @@ config X86_5LEVEL
It will be supported by future Intel CPUs.
Note: a kernel with this option enabled can only be booted
on machines that support the feature.
A kernel with the option enabled can be booted on machines that
support 4- or 5-level paging.
See Documentation/x86/x86_64/5level-paging.txt for more
information.
......@@ -1595,10 +1597,6 @@ config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
config NEED_NODE_MEMMAP_SIZE
def_bool y
depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
config ARCH_FLATMEM_ENABLE
def_bool y
depends on X86_32 && !NUMA
......@@ -2174,10 +2172,17 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
config DYNAMIC_MEMORY_LAYOUT
bool
---help---
This option makes base addresses of vmalloc and vmemmap as well as
__PAGE_OFFSET movable during boot.
config RANDOMIZE_MEMORY
bool "Randomize the kernel memory sections"
depends on X86_64
depends on RANDOMIZE_BASE
select DYNAMIC_MEMORY_LAYOUT
default RANDOMIZE_BASE
---help---
Randomizes the base virtual address of kernel memory sections
......
......@@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
vmlinux-objs-y += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
endif
......
......@@ -33,6 +33,7 @@
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
#include <asm/bootparam.h>
#include "pgtable.h"
/*
* Locally defined symbols should be marked hidden:
......@@ -304,55 +305,77 @@ ENTRY(startup_64)
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
#ifdef CONFIG_X86_5LEVEL
/*
* Check if we need to enable 5-level paging.
* RSI holds real mode data and need to be preserved across
* a function call.
* At this point we are in long mode with 4-level paging enabled,
* but we might want to enable 5-level paging or vice versa.
*
* The problem is that we cannot do it directly. Setting or clearing
* CR4.LA57 in long mode would trigger #GP. So we need to switch off
* long mode and paging first.
*
* We also need a trampoline in lower memory to switch over from
* 4- to 5-level paging for cases when the bootloader puts the kernel
* above 4G, but didn't enable 5-level paging for us.
*
* The same trampoline can be used to switch from 5- to 4-level paging
* mode, like when starting 4-level paging kernel via kexec() when
* original kernel worked in 5-level paging mode.
*
* For the trampoline, we need the top page table to reside in lower
* memory as we don't have a way to load 64-bit values into CR3 in
* 32-bit mode.
*
* We go though the trampoline even if we don't have to: if we're
* already in a desired paging mode. This way the trampoline code gets
* tested on every boot.
*/
pushq %rsi
call l5_paging_required
popq %rsi
/* If l5_paging_required() returned zero, we're done here. */
cmpq $0, %rax
je lvl5
/* Make sure we have GDT with 32-bit code segment */
leaq gdt(%rip), %rax
movq %rax, gdt64+2(%rip)
lgdt gdt64(%rip)
/*
* At this point we are in long mode with 4-level paging enabled,
* but we want to enable 5-level paging.
* paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging.
*
* The problem is that we cannot do it directly. Setting LA57 in
* long mode would trigger #GP. So we need to switch off long mode
* first.
* Address of the trampoline is returned in RAX.
* Non zero RDX on return means we need to enable 5-level paging.
*
* NOTE: This is not going to work if bootloader put us above 4G
* limit.
*
* The first step is go into compatibility mode.
* RSI holds real mode data and needs to be preserved across
* this function call.
*/
pushq %rsi
call paging_prepare
popq %rsi
/* Clear additional page table */
leaq lvl5_pgtable(%rbx), %rdi
xorq %rax, %rax
movq $(PAGE_SIZE/8), %rcx
rep stosq
/* Save the trampoline address in RCX */
movq %rax, %rcx
/*
* Setup current CR3 as the first and only entry in a new top level
* page table.
* Load the address of trampoline_return() into RDI.
* It will be used by the trampoline to return to the main code.
*/
movq %cr3, %rdi
leaq 0x7 (%rdi), %rax
movq %rax, lvl5_pgtable(%rbx)
leaq trampoline_return(%rip), %rdi
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq compatible_mode(%rip), %rax
leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
pushq %rax
lretq
lvl5:
#endif
trampoline_return:
/* Restore the stack, the 32-bit trampoline uses its own stack */
leaq boot_stack_end(%rbx), %rsp
/*
* cleanup_trampoline() would restore trampoline memory.
*
* RSI holds real mode data and needs to be preserved across
* this function call.
*/
pushq %rsi
call cleanup_trampoline
popq %rsi
/* Zero EFLAGS */
pushq $0
......@@ -490,46 +513,82 @@ relocated:
jmp *%rax
.code32
#ifdef CONFIG_X86_5LEVEL
compatible_mode:
/* Setup data and stack segments */
/*
* This is the 32-bit trampoline that will be copied over to low memory.
*
* RDI contains the return address (might be above 4G).
* ECX contains the base address of the trampoline memory.
* Non zero RDX on return means we need to enable 5-level paging.
*/
ENTRY(trampoline_32bit_src)
/* Set up data and stack segments */
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %ss
/* Set up new stack */
leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
/* Point CR3 to 5-level paging */
leal lvl5_pgtable(%ebx), %eax
movl %eax, %cr3
/* Check what paging mode we want to be in after the trampoline */
cmpl $0, %edx
jz 1f
/* Enable PAE and LA57 mode */
/* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
movl %cr4, %eax
testl $X86_CR4_LA57, %eax
jnz 3f
jmp 2f
1:
/* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
movl %cr4, %eax
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
testl $X86_CR4_LA57, %eax
jz 3f
2:
/* Point CR3 to the trampoline's new top level page table */
leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
movl %eax, %cr3
3:
/* Enable PAE and LA57 (if required) paging modes */
movl $X86_CR4_PAE, %eax
cmpl $0, %edx
jz 1f
orl $X86_CR4_LA57, %eax
1:
movl %eax, %cr4
/* Calculate address we are running at */
call 1f
1: popl %edi
subl $1b, %edi
/* Calculate address of paging_enabled() once we are executing in the trampoline */
leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
/* Prepare stack for far return to Long Mode */
/* Prepare the stack for far return to Long Mode */
pushl $__KERNEL_CS
leal lvl5(%edi), %eax
push %eax
pushl %eax
/* Enable paging back */
/* Enable paging again */
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
lret
#endif
.code64
paging_enabled:
/* Return from the trampoline */
jmp *%rdi
/*
* The trampoline code has a size limit.
* Make sure we fail to compile if the trampoline code grows
* beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
*/
.org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
.code32
no_longmode:
/* This isn't an x86-64 CPU so hang */
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
hlt
jmp 1b
......@@ -537,6 +596,11 @@ no_longmode:
#include "../../kernel/verify_cpu.S"
.data
gdt64:
.word gdt_end - gdt
.long 0
.word 0
.quad 0
gdt:
.word gdt_end - gdt
.long gdt
......@@ -585,7 +649,3 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
#ifdef CONFIG_X86_5LEVEL
lvl5_pgtable:
.fill PAGE_SIZE, 1, 0
#endif
......@@ -46,6 +46,12 @@
#define STATIC
#include <linux/decompress/mm.h>
#ifdef CONFIG_X86_5LEVEL
unsigned int pgtable_l5_enabled __ro_after_init;
unsigned int pgdir_shift __ro_after_init = 39;
unsigned int ptrs_per_p4d __ro_after_init = 1;
#endif
extern unsigned long get_cmd_line_ptr(void);
/* Simplified build-specific string for starting entropy. */
......@@ -723,6 +729,14 @@ void choose_random_location(unsigned long input,
return;
}
#ifdef CONFIG_X86_5LEVEL
if (__read_cr4() & X86_CR4_LA57) {
pgtable_l5_enabled = 1;
pgdir_shift = 48;
ptrs_per_p4d = 512;
}
#endif
boot_params->hdr.loadflags |= KASLR_FLAG;
/* Prepare to add new identity pagetables on demand. */
......
......@@ -16,13 +16,6 @@
#define __pa(x) ((unsigned long)(x))
#define __va(x) ((void *)((unsigned long)(x)))
/*
* The pgtable.h and mm/ident_map.c includes make use of the SME related
* information which is not used in the compressed image support. Un-define
* the SME support to avoid any compile and link errors.
*/
#undef CONFIG_AMD_MEM_ENCRYPT
/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION
......@@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info;
/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void)
{
unsigned long sev_me_mask = get_sev_encryption_mask();
/* If running as an SEV guest, the encryption mask is required. */
set_sev_encryption_mask();
/* Init mapping_info with run-time function/buffer pointers. */
mapping_info.alloc_pgt_page = alloc_pgt_page;
mapping_info.context = &pgt_data;
mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
mapping_info.kernpg_flag = _KERNPG_TABLE;
/*
* It should be impossible for this not to already be true,
......
......@@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit)
ENDPROC(get_sev_encryption_bit)
.code64
ENTRY(get_sev_encryption_mask)
xor %rax, %rax
ENTRY(set_sev_encryption_mask)
#ifdef CONFIG_AMD_MEM_ENCRYPT
push %rbp
push %rdx
......@@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask)
testl %eax, %eax
jz .Lno_sev_mask
xor %rdx, %rdx
bts %rax, %rdx /* Create the encryption mask */
mov %rdx, %rax /* ... and return it */
bts %rax, sme_me_mask(%rip) /* Create the encryption mask */
.Lno_sev_mask:
movq %rbp, %rsp /* Restore original stack pointer */
......@@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask)
pop %rbp
#endif
xor %rax, %rax
ret
ENDPROC(get_sev_encryption_mask)
ENDPROC(set_sev_encryption_mask)
.data
enc_bit:
.int 0xffffffff
#ifdef CONFIG_AMD_MEM_ENCRYPT
.balign 8
GLOBAL(sme_me_mask)
.quad 0
#endif
......@@ -14,6 +14,7 @@
#include "misc.h"
#include "error.h"
#include "pgtable.h"
#include "../string.h"
#include "../voffset.h"
......@@ -169,16 +170,6 @@ void __puthex(unsigned long value)
}
}
static bool l5_supported(void)
{
/* Check if leaf 7 is supported. */
if (native_cpuid_eax(0) < 7)
return 0;
/* Check if la57 is supported. */
return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
}
#if CONFIG_X86_NEED_RELOCS
static void handle_relocations(void *output, unsigned long output_len,
unsigned long virt_addr)
......@@ -376,12 +367,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
console_init();
debug_putstr("early console in extract_kernel\n");
if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
error("This linux kernel as configured requires 5-level paging\n"
"This CPU does not support the required 'cr4.la57' feature\n"
"Unable to boot - please use a kernel appropriate for your CPU\n");
}
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
......@@ -392,6 +377,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
debug_putaddr(output_len);
debug_putaddr(kernel_total_size);
#ifdef CONFIG_X86_64
/* Report address of 32-bit trampoline */
debug_putaddr(trampoline_32bit);
#endif
/*
* The memory hole needed for the kernel is the larger of either
* the entire decompressed kernel plus relocation table, or the
......
......@@ -12,6 +12,11 @@
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN
#ifdef CONFIG_X86_5LEVEL
/* cpu_feature_enabled() cannot be used that early */
#define pgtable_l5_enabled __pgtable_l5_enabled
#endif
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
......@@ -109,6 +114,6 @@ static inline void console_init(void)
{ }
#endif
unsigned long get_sev_encryption_mask(void);
void set_sev_encryption_mask(void);
#endif
#ifndef BOOT_COMPRESSED_PAGETABLE_H
#define BOOT_COMPRESSED_PAGETABLE_H
#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
#define TRAMPOLINE_32BIT_CODE_SIZE 0x60
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
#ifndef __ASSEMBLER__
extern unsigned long *trampoline_32bit;
extern void trampoline_32bit_src(void *return_ptr);
#endif /* __ASSEMBLER__ */
#endif /* BOOT_COMPRESSED_PAGETABLE_H */
#include <asm/processor.h>
#include "pgtable.h"
#include "../string.h"
/*
* __force_order is used by special_insns.h asm code to force instruction
......@@ -9,20 +11,144 @@
*/
unsigned long __force_order;
int l5_paging_required(void)
#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
struct paging_config {
unsigned long trampoline_start;
unsigned long l5_required;
};
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
/*
* The page table is going to be used instead of page table in the trampoline
* memory.
*
* It must not be in BSS as BSS is cleared after cleanup_trampoline().
*/
static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
/*
* Trampoline address will be printed by extract_kernel() for debugging
* purposes.
*
* Avoid putting the pointer into .bss as it will be cleared between
* paging_prepare() and extract_kernel().
*/
unsigned long *trampoline_32bit __section(.data);
struct paging_config paging_prepare(void)
{
/* Check if leaf 7 is supported. */
struct paging_config paging_config = {};
unsigned long bios_start, ebda_start;
/*
* Check if LA57 is desired and supported.
*
* There are two parts to the check:
* - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
* - if the machine supports 5-level paging:
* + CPUID leaf 7 is supported
* + the leaf has the feature bit set
*
* That's substitute for boot_cpu_has() in early boot code.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
paging_config.l5_required = 1;
}
/*
* Find a suitable spot for the trampoline.
* This code is based on reserve_bios_regions().
*/
ebda_start = *(unsigned short *)0x40e << 4;
bios_start = *(unsigned short *)0x413 << 10;
if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
bios_start = BIOS_START_MAX;
if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
bios_start = ebda_start;
if (native_cpuid_eax(0) < 7)
return 0;
/* Place the trampoline just below the end of low memory, aligned to 4k */
paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE;
paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE);
/* Check if la57 is supported. */
if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
return 0;
trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
/* Check if 5-level paging has already been enabled. */
if (native_read_cr4() & X86_CR4_LA57)
return 0;
/* Preserve trampoline memory */
memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
/* Clear trampoline memory first */
memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
/* Copy trampoline code in place */
memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
&trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
/*
* The code below prepares page table in trampoline memory.
*
* The new page table will be used by trampoline code for switching
* from 4- to 5-level paging or vice versa.
*
* If switching is not required, the page table is unused: trampoline
* code wouldn't touch CR3.
*/
/*
* We are not going to use the page table in trampoline memory if we
* are already in the desired paging mode.
*/
if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57))
goto out;
if (paging_config.l5_required) {
/*
* For 4- to 5-level paging transition, set up current CR3 as
* the first and the only entry in a new top-level page table.
*/
trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC;
} else {
unsigned long src;
/*
* For 5- to 4-level paging transition, copy page table pointed
* by first entry in the current top-level page table as our
* new top-level page table.
*
* We cannot just point to the page table from trampoline as it
* may be above 4G.
*/
src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long),
(void *)src, PAGE_SIZE);
}
out:
return paging_config;
}
void cleanup_trampoline(void)
{
void *trampoline_pgtable;
trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET;
/*
* Move the top level page table out of trampoline memory,
* if it's there.
*/
if ((void *)__native_read_cr3() == trampoline_pgtable) {
memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE);
native_write_cr3((unsigned long)top_pgtable);
}
return 1;
/* Restore trampoline memory */
memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
}
......@@ -260,8 +260,13 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
* Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/
#ifdef CONFIG_X86_5LEVEL
ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
......
......@@ -31,6 +31,7 @@
#include <asm/mmu.h>
#include <asm/mpspec.h>
#include <asm/realmode.h>
#include <asm/x86_init.h>
#ifdef CONFIG_ACPI_APEI
# include <asm/pgtable_types.h>
......@@ -133,6 +134,14 @@ static inline bool acpi_has_cpu_in_madt(void)
return !!acpi_lapic;
}
#define ACPI_HAVE_ARCH_GET_ROOT_POINTER
static inline u64 acpi_arch_get_root_pointer(void)
{
return x86_init.acpi.get_root_pointer();
}
void acpi_generic_reduced_hw_init(void);
#else /* !CONFIG_ACPI */
#define acpi_lapic 0
......@@ -142,6 +151,8 @@ static inline void acpi_noirq_set(void) { }
static inline void acpi_disable_pci(void) { }
static inline void disable_acpi(void) { }
static inline void acpi_generic_reduced_hw_init(void) { }
#endif /* !CONFIG_ACPI */
#define ARCH_HAS_POWER_INIT 1
......
#ifndef _ASM_X86_INTEL_PCONFIG_H
#define _ASM_X86_INTEL_PCONFIG_H
#include <asm/asm.h>
#include <asm/processor.h>
enum pconfig_target {
INVALID_TARGET = 0,
MKTME_TARGET = 1,
PCONFIG_TARGET_NR
};
int pconfig_target_supported(enum pconfig_target target);
enum pconfig_leaf {
MKTME_KEY_PROGRAM = 0,
PCONFIG_LEAF_INVALID,
};
#define PCONFIG ".byte 0x0f, 0x01, 0xc5"
/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */
/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */
#define MKTME_KEYID_SET_KEY_DIRECT 0
#define MKTME_KEYID_SET_KEY_RANDOM 1
#define MKTME_KEYID_CLEAR_KEY 2
#define MKTME_KEYID_NO_ENCRYPT 3
/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */
#define MKTME_AES_XTS_128 (1 << 8)
/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */
#define MKTME_PROG_SUCCESS 0
#define MKTME_INVALID_PROG_CMD 1
#define MKTME_ENTROPY_ERROR 2
#define MKTME_INVALID_KEYID 3
#define MKTME_INVALID_ENC_ALG 4
#define MKTME_DEVICE_BUSY 5
/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */
struct mktme_key_program {
u16 keyid;
u32 keyid_ctrl;
u8 __rsvd[58];
u8 key_field_1[64];
u8 key_field_2[64];
} __packed __aligned(256);
static inline int mktme_key_program(struct mktme_key_program *key_program)
{
unsigned long rax = MKTME_KEY_PROGRAM;
if (!pconfig_target_supported(MKTME_TARGET))
return -ENXIO;
asm volatile(PCONFIG
: "=a" (rax), "=b" (key_program)
: "0" (rax), "1" (key_program)
: "memory", "cc");
return rax;
}
#endif /* _ASM_X86_INTEL_PCONFIG_H */
......@@ -5,10 +5,6 @@
unsigned long kaslr_get_random_long(const char *purpose);
#ifdef CONFIG_RANDOMIZE_MEMORY
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
void kernel_randomize_memory(void);
#else
static inline void kernel_randomize_memory(void) { }
......
......@@ -22,6 +22,7 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern u64 sme_me_mask;
extern bool sev_enabled;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
unsigned long decrypted_kernel_vaddr,
......
......@@ -11,6 +11,10 @@
extern unsigned long max_pfn;
extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
......
......@@ -37,26 +37,24 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
#ifdef CONFIG_X86_5LEVEL
#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL)
#else
#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
#endif
#define __PAGE_OFFSET_BASE_L5 _AC(0xff10000000000000, UL)
#define __PAGE_OFFSET_BASE_L4 _AC(0xffff880000000000, UL)
#ifdef CONFIG_RANDOMIZE_MEMORY
#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
#define __PAGE_OFFSET page_offset_base
#else
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */
#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#ifdef CONFIG_X86_5LEVEL
#define __PHYSICAL_MASK_SHIFT 52
#define __VIRTUAL_MASK_SHIFT 56
#ifdef CONFIG_X86_5LEVEL
#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled ? 56 : 47)
#else
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 47
#endif
......
......@@ -568,17 +568,22 @@ static inline p4dval_t p4d_val(p4d_t p4d)
return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
}
static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
pgdval_t val = native_pgd_val(pgd);
PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
}
static inline void pgd_clear(pgd_t *pgdp)
{
set_pgd(pgdp, __pgd(0));
}
#define set_pgd(pgdp, pgdval) do { \
if (pgtable_l5_enabled) \
__set_pgd(pgdp, pgdval); \
else \
set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \
} while (0)
#define pgd_clear(pgdp) do { \
if (pgtable_l5_enabled) \
set_pgd(pgdp, __pgd(0)); \
} while (0)
#endif /* CONFIG_PGTABLE_LEVELS == 5 */
......
......@@ -167,6 +167,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
if (!pgtable_l5_enabled)
return;
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}
......@@ -191,6 +193,7 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long address)
{
if (pgtable_l5_enabled)
___p4d_free_tlb(tlb, p4d);
}
......
......@@ -44,5 +44,6 @@ typedef union {
*/
#define PTRS_PER_PTE 512
#define MAX_POSSIBLE_PHYSMEM_BITS 36
#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
......@@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags;
#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
#define pgd_clear(pgd) native_pgd_clear(pgd)
#define pgd_clear(pgd) (pgtable_l5_enabled ? native_pgd_clear(pgd) : 0)
#endif
#ifndef set_p4d
......@@ -859,6 +859,8 @@ static inline unsigned long p4d_index(unsigned long address)
#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
if (!pgtable_l5_enabled)
return 1;
return pgd_flags(pgd) & _PAGE_PRESENT;
}
......@@ -876,6 +878,8 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
/* to find an entry in a page-table-directory. */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
if (!pgtable_l5_enabled)
return (p4d_t *)pgd;
return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}
......@@ -883,6 +887,9 @@ static inline int pgd_bad(pgd_t pgd)
{
unsigned long ignore_flags = _PAGE_USER;
if (!pgtable_l5_enabled)
return 0;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
......@@ -891,6 +898,8 @@ static inline int pgd_bad(pgd_t pgd)
static inline int pgd_none(pgd_t pgd)
{
if (!pgtable_l5_enabled)
return 0;
/*
* There is no need to do a workaround for the KNL stray
* A/D bit erratum here. PGDs only point to page tables
......
......@@ -34,6 +34,8 @@ static inline void check_pgt_cache(void) { }
void paging_init(void);
void sync_initial_page_table(void);
static inline int pgd_large(pgd_t pgd) { return 0; }
/*
* Define this if things work differently on an i386 and an i486:
* it will (on an i486) warn about kernel memory accesses that are
......
......@@ -15,6 +15,8 @@
# include <asm/pgtable-2level_types.h>
#endif
#define pgtable_l5_enabled 0
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
......
......@@ -218,29 +218,26 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
#else
pgd_t pgd;
if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
*p4dp = p4d;
#endif
return;
}
pgd = native_make_pgd(native_p4d_val(p4d));
pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd);
*p4dp = native_make_p4d(native_pgd_val(pgd));
}
static inline void native_p4d_clear(p4d_t *p4d)
{
#ifdef CONFIG_X86_5LEVEL
native_set_p4d(p4d, native_make_p4d(0));
#else
native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
#endif
}
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
*pgdp = pti_set_user_pgd(pgdp, pgd);
#else
*pgdp = pgd;
#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
......
......@@ -20,6 +20,18 @@ typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled;
#ifndef pgtable_l5_enabled
#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57)
#endif
#else
#define pgtable_l5_enabled 0
#endif
extern unsigned int pgdir_shift;
extern unsigned int ptrs_per_p4d;
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
......@@ -29,17 +41,20 @@ typedef struct { pteval_t pte; } pte_t;
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
#define PGDIR_SHIFT 48
#define PGDIR_SHIFT pgdir_shift
#define PTRS_PER_PGD 512
/*
* 4th level page in 5-level paging case
*/
#define P4D_SHIFT 39
#define PTRS_PER_P4D 512
#define MAX_PTRS_PER_P4D 512
#define PTRS_PER_P4D ptrs_per_p4d
#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
#define P4D_MASK (~(P4D_SIZE - 1))
#define MAX_POSSIBLE_PHYSMEM_BITS 52
#else /* CONFIG_X86_5LEVEL */
/*
......@@ -47,6 +62,7 @@ typedef struct { pteval_t pte; } pte_t;
*/
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512
#define MAX_PTRS_PER_P4D 1
#endif /* CONFIG_X86_5LEVEL */
......@@ -82,31 +98,33 @@ typedef struct { pteval_t pte; } pte_t;
* range must not overlap with anything except the KASAN shadow area, which
* is correct as KASAN disables KASLR.
*/
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#define MAXMEM (1UL << MAX_PHYSMEM_BITS)
#ifdef CONFIG_X86_5LEVEL
# define VMALLOC_SIZE_TB _AC(12800, UL)
# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
# define LDT_PGD_ENTRY _AC(-112, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else
# define VMALLOC_SIZE_TB _AC(32, UL)
# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
# define LDT_PGD_ENTRY _AC(-3, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif
#define LDT_PGD_ENTRY_L4 -3UL
#define LDT_PGD_ENTRY_L5 -112UL
#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#define __VMALLOC_BASE_L4 0xffffc90000000000
#define __VMALLOC_BASE_L5 0xffa0000000000000
#define VMALLOC_SIZE_TB_L4 32UL
#define VMALLOC_SIZE_TB_L5 12800UL
#define __VMEMMAP_BASE_L4 0xffffea0000000000
#define __VMEMMAP_BASE_L5 0xffd4000000000000
#ifdef CONFIG_RANDOMIZE_MEMORY
#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
# define VMALLOC_START vmalloc_base
# define VMALLOC_SIZE_TB (pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
# define VMEMMAP_START vmemmap_base
#else
# define VMALLOC_START __VMALLOC_BASE
# define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */
# define VMALLOC_START __VMALLOC_BASE_L4
# define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
......
......@@ -53,12 +53,6 @@
# define NEED_MOVBE 0
#endif
#ifdef CONFIG_X86_5LEVEL
# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
#else
# define NEED_LA57 0
#endif
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
......@@ -104,7 +98,7 @@
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
#define REQUIRED_MASK16 (NEED_LA57)
#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
......
......@@ -27,13 +27,8 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
# ifdef CONFIG_X86_5LEVEL
# define MAX_PHYSADDR_BITS 52
# define MAX_PHYSMEM_BITS 52
# else
# define MAX_PHYSADDR_BITS 44
# define MAX_PHYSMEM_BITS 46
# endif
# define MAX_PHYSADDR_BITS (pgtable_l5_enabled ? 52 : 44)
# define MAX_PHYSMEM_BITS (pgtable_l5_enabled ? 52 : 46)
#endif
#endif /* CONFIG_SPARSEMEM */
......
......@@ -130,6 +130,16 @@ struct x86_hyper_init {
void (*init_mem_mapping)(void);
};
/**
* struct x86_init_acpi - x86 ACPI init functions
* @get_root_pointer: get RSDP address
* @reduced_hw_early_init: hardware reduced platform early init
*/
struct x86_init_acpi {
u64 (*get_root_pointer)(void);
void (*reduced_hw_early_init)(void);
};
/**
* struct x86_init_ops - functions for platform specific setup
*
......@@ -144,6 +154,7 @@ struct x86_init_ops {
struct x86_init_iommu iommu;
struct x86_init_pci pci;
struct x86_hyper_init hyper;
struct x86_init_acpi acpi;
};
/**
......
......@@ -1376,17 +1376,21 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
*
* We initialize the Hardware-reduced ACPI model here:
*/
static void __init acpi_reduced_hw_init(void)
void __init acpi_generic_reduced_hw_init(void)
{
if (acpi_gbl_reduced_hardware) {
/*
* Override x86_init functions and bypass legacy pic
* in Hardware-reduced ACPI mode
* Override x86_init functions and bypass legacy PIC in
* hardware reduced ACPI mode.
*/
x86_init.timers.timer_init = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
legacy_pic = &null_legacy_pic;
}
}
static void __init acpi_reduced_hw_init(void)
{
if (acpi_gbl_reduced_hardware)
x86_init.acpi.reduced_hw_early_init();
}
/*
......
......@@ -28,7 +28,7 @@ obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
......
......@@ -509,6 +509,90 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
}
}
#define MSR_IA32_TME_ACTIVATE 0x982
/* Helpers to access TME_ACTIVATE MSR */
#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
#define TME_ACTIVATE_POLICY_AES_XTS_128 0
#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
/* Values for mktme_status (SW only construct) */
#define MKTME_ENABLED 0
#define MKTME_DISABLED 1
#define MKTME_UNINITIALIZED 2
static int mktme_status = MKTME_UNINITIALIZED;
static void detect_tme(struct cpuinfo_x86 *c)
{
u64 tme_activate, tme_policy, tme_crypto_algs;
int keyid_bits = 0, nr_keyids = 0;
static u64 tme_activate_cpu0 = 0;
rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
if (mktme_status != MKTME_UNINITIALIZED) {
if (tme_activate != tme_activate_cpu0) {
/* Broken BIOS? */
pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
pr_err_once("x86/tme: MKTME is not usable\n");
mktme_status = MKTME_DISABLED;
/* Proceed. We may need to exclude bits from x86_phys_bits. */
}
} else {
tme_activate_cpu0 = tme_activate;
}
if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
pr_info_once("x86/tme: not enabled by BIOS\n");
mktme_status = MKTME_DISABLED;
return;
}
if (mktme_status != MKTME_UNINITIALIZED)
goto detect_keyid_bits;
pr_info("x86/tme: enabled by BIOS\n");
tme_policy = TME_ACTIVATE_POLICY(tme_activate);
if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
tme_crypto_algs);
mktme_status = MKTME_DISABLED;
}
detect_keyid_bits:
keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
nr_keyids = (1UL << keyid_bits) - 1;
if (nr_keyids) {
pr_info_once("x86/mktme: enabled by BIOS\n");
pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
} else {
pr_info_once("x86/mktme: disabled by BIOS\n");
}
if (mktme_status == MKTME_UNINITIALIZED) {
/* MKTME is usable */
mktme_status = MKTME_ENABLED;
}
/*
* KeyID bits effectively lower the number of physical address
* bits. Update cpuinfo_x86::x86_phys_bits accordingly.
*/
c->x86_phys_bits -= keyid_bits;
}
static void init_intel_energy_perf(struct cpuinfo_x86 *c)
{
u64 epb;
......@@ -679,6 +763,9 @@ static void init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
if (cpu_has(c, X86_FEATURE_TME))
detect_tme(c);
init_intel_energy_perf(c);
init_intel_misc_features(c);
......
// SPDX-License-Identifier: GPL-2.0
/*
* Intel PCONFIG instruction support.
*
* Copyright (C) 2017 Intel Corporation
*
* Author:
* Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
*/
#include <asm/cpufeature.h>
#include <asm/intel_pconfig.h>
#define PCONFIG_CPUID 0x1b
#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1)
/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
enum {
PCONFIG_CPUID_SUBLEAF_INVALID = 0,
PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
};
/* Bitmask of supported targets */
static u64 targets_supported __read_mostly;
int pconfig_target_supported(enum pconfig_target target)
{
/*
* We would need to re-think the implementation once we get > 64
* PCONFIG targets. Spec allows up to 2^32 targets.
*/
BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
if (WARN_ON_ONCE(target >= 64))
return 0;
return targets_supported & (1ULL << target);
}
static int __init intel_pconfig_init(void)
{
int subleaf;
if (!boot_cpu_has(X86_FEATURE_PCONFIG))
return 0;
/*
* Scan subleafs of PCONFIG CPUID leaf.
*
* Subleafs of the same type need not to be consecutive.
*
* Stop on the first invalid subleaf type. All subleafs after the first
* invalid are invalid too.
*/
for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
struct cpuid_regs regs;
cpuid_count(PCONFIG_CPUID, subleaf,
&regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
case PCONFIG_CPUID_SUBLEAF_INVALID:
/* Stop on the first invalid subleaf */
goto out;
case PCONFIG_CPUID_SUBLEAF_TARGETID:
/* Mark supported PCONFIG targets */
if (regs.ebx < 64)
targets_supported |= (1ULL << regs.ebx);
if (regs.ecx < 64)
targets_supported |= (1ULL << regs.ecx);
if (regs.edx < 64)
targets_supported |= (1ULL << regs.edx);
break;
default:
/* Unknown CPUID.PCONFIG subleaf: ignore */
break;
}
}
out:
return 0;
}
arch_initcall(intel_pconfig_init);
......@@ -1095,19 +1095,7 @@ static void mce_unmap_kpfn(unsigned long pfn)
* a legal address.
*/
/*
* Build time check to see if we have a spare virtual bit. Don't want
* to leave this until run time because most developers don't have a
* system that can exercise this code path. This will only become a
* problem if/when we move beyond 5-level page tables.
*
* Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
*/
#if PGDIR_SHIFT + 9 < 63
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
#else
#error "no unused virtual bit available"
#endif
if (set_memory_np(decoy_addr, 1))
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
......@@ -2357,6 +2345,12 @@ static __init int mcheck_init_device(void)
{
int err;
/*
* Check if we have a spare virtual bit. This will only become
* a problem if/when we move beyond 5-level page tables.
*/
MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
if (!mce_available(&boot_cpu_data)) {
err = -EIO;
goto err_out;
......
......@@ -924,6 +924,24 @@ static int __init parse_memmap_one(char *p)
} else if (*p == '!') {
start_at = memparse(p+1, &p);
e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
} else if (*p == '%') {
enum e820_type from = 0, to = 0;
start_at = memparse(p + 1, &p);
if (*p == '-')
from = simple_strtoull(p + 1, &p, 0);
if (*p == '+')
to = simple_strtoull(p + 1, &p, 0);
if (*p != '\0')
return -EINVAL;
if (from && to)
e820__range_update(start_at, mem_size, from, to);
else if (to)
e820__range_add(start_at, mem_size, to);
else if (from)
e820__range_remove(start_at, mem_size, from, 1);
else
e820__range_remove(start_at, mem_size, 0, 0);
} else {
e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
}
......
......@@ -32,6 +32,11 @@
#include <asm/microcode.h>
#include <asm/kasan.h>
#ifdef CONFIG_X86_5LEVEL
#undef pgtable_l5_enabled
#define pgtable_l5_enabled __pgtable_l5_enabled
#endif
/*
* Manage page tables very early on.
*/
......@@ -39,6 +44,24 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled __ro_after_init;
EXPORT_SYMBOL(__pgtable_l5_enabled);
unsigned int pgdir_shift __ro_after_init = 39;
EXPORT_SYMBOL(pgdir_shift);
unsigned int ptrs_per_p4d __ro_after_init = 1;
EXPORT_SYMBOL(ptrs_per_p4d);
#endif
#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
EXPORT_SYMBOL(page_offset_base);
unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
EXPORT_SYMBOL(vmalloc_base);
unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
#endif
#define __head __section(.head.text)
static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
......@@ -46,6 +69,41 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
return ptr - (void *)_text + (void *)physaddr;
}
static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
{
return fixup_pointer(ptr, physaddr);
}
#ifdef CONFIG_X86_5LEVEL
static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
{
return fixup_pointer(ptr, physaddr);
}
static bool __head check_la57_support(unsigned long physaddr)
{
if (native_cpuid_eax(0) < 7)
return false;
if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
return false;
*fixup_int(&pgtable_l5_enabled, physaddr) = 1;
*fixup_int(&pgdir_shift, physaddr) = 48;
*fixup_int(&ptrs_per_p4d, physaddr) = 512;
*fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
*fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
*fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
return true;
}
#else
static bool __head check_la57_support(unsigned long physaddr)
{
return false;
}
#endif
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
......@@ -55,9 +113,12 @@ unsigned long __head __startup_64(unsigned long physaddr,
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
bool la57;
int i;
unsigned int *next_pgt_ptr;
la57 = check_la57_support(physaddr);
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
for (;;);
......@@ -81,9 +142,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */
pgd = fixup_pointer(&early_top_pgt, physaddr);
pgd[pgd_index(__START_KERNEL_map)] += load_delta;
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p = pgd + pgd_index(__START_KERNEL_map);
if (la57)
*p = (unsigned long)level4_kernel_pgt;
else
*p = (unsigned long)level3_kernel_pgt;
*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
if (la57) {
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}
......@@ -108,7 +174,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
if (la57) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
......@@ -154,8 +220,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Fixup phys_base - remove the memory encryption mask to obtain
* the true physical address.
*/
p = fixup_pointer(&phys_base, physaddr);
*p += load_delta - sme_get_me_mask();
*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);
......@@ -206,7 +271,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
if (!IS_ENABLED(CONFIG_X86_5LEVEL))
if (!pgtable_l5_enabled)
p4d_p = pgd_p;
else if (pgd)
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
......@@ -322,7 +387,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
......
......@@ -39,12 +39,12 @@
*
*/
#define l4_index(x) (((x) >> 39) & 511)
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
#endif
L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
L4_START_KERNEL = l4_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
......@@ -125,7 +125,10 @@ ENTRY(secondary_startup_64)
/* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
orl $X86_CR4_LA57, %ecx
1:
#endif
movq %rcx, %cr4
......@@ -374,12 +377,7 @@ GLOBAL(name)
__INITDATA
NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
......@@ -390,9 +388,9 @@ NEXT_PAGE(early_dynamic_pgts)
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.org init_top_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
.org init_top_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
......
......@@ -350,6 +350,7 @@ void arch_crash_save_vmcoreinfo(void)
{
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
VMCOREINFO_NUMBER(pgtable_l5_enabled);
#ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
......
......@@ -189,9 +189,7 @@ struct ist_info ist_info;
#endif
#else
struct cpuinfo_x86 boot_cpu_data __read_mostly = {
.x86_phys_bits = MAX_PHYSMEM_BITS,
};
struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
#endif
......@@ -851,6 +849,7 @@ void __init setup_arch(char **cmdline_p)
__flush_tlb_all();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
/*
......
......@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/pci.h>
#include <asm/acpi.h>
#include <asm/bios_ebda.h>
#include <asm/paravirt.h>
#include <asm/pci_x86.h>
......@@ -26,10 +27,11 @@
void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
bool __init bool_x86_init_noop(void) { return false; }
void x86_op_int_noop(int cpu) { }
static int __init iommu_init_noop(void) { return 0; }
static void iommu_shutdown_noop(void) { }
static bool __init bool_x86_init_noop(void) { return false; }
static void x86_op_int_noop(int cpu) { }
static u64 u64_x86_init_noop(void) { return 0; }
/*
* The platform setup functions are preset with the default functions
......@@ -91,6 +93,11 @@ struct x86_init_ops x86_init __initdata = {
.x2apic_available = bool_x86_init_noop,
.init_mem_mapping = x86_init_noop,
},
.acpi = {
.get_root_pointer = u64_x86_init_noop,
.reduced_hw_early_init = acpi_generic_reduced_hw_init,
},
};
struct x86_cpuinit_ops x86_cpuinit = {
......
# SPDX-License-Identifier: GPL-2.0
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
KASAN_SANITIZE_mem_encrypt.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
......@@ -16,6 +19,7 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
CFLAGS_mem_encrypt_identity.o := $(nostackp)
CFLAGS_fault.o := -I$(src)/../include/asm/trace
......@@ -47,4 +51,5 @@ obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
......@@ -72,6 +72,31 @@ static const struct file_operations ptdump_curusr_fops = {
};
#endif
#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
extern pgd_t *efi_pgd;
static struct dentry *pe_efi;
static int ptdump_show_efi(struct seq_file *m, void *v)
{
if (efi_pgd)
ptdump_walk_pgd_level_debugfs(m, efi_pgd, false);
return 0;
}
static int ptdump_open_efi(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_efi, NULL);
}
static const struct file_operations ptdump_efi_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_efi,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#endif
static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
......@@ -96,6 +121,13 @@ static int __init pt_dump_debug_init(void)
if (!pe_curusr)
goto err;
#endif
#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops);
if (!pe_efi)
goto err;
#endif
return 0;
err:
debugfs_remove_recursive(dir);
......
This diff is collapsed.
......@@ -417,11 +417,11 @@ void vmalloc_sync_all(void)
*/
static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
p4d_t *p4d, *p4d_ref;
pud_t *pud, *pud_ref;
pmd_t *pmd, *pmd_ref;
pte_t *pte, *pte_ref;
pgd_t *pgd, *pgd_k;
p4d_t *p4d, *p4d_k;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
/* Make sure we are in vmalloc area: */
if (!(address >= VMALLOC_START && address < VMALLOC_END))
......@@ -435,73 +435,51 @@ static noinline int vmalloc_fault(unsigned long address)
* case just flush:
*/
pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
pgd_k = pgd_offset_k(address);
if (pgd_none(*pgd_k))
return -1;
if (CONFIG_PGTABLE_LEVELS > 4) {
if (pgtable_l5_enabled) {
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
set_pgd(pgd, *pgd_k);
arch_flush_lazy_mmu_mode();
} else {
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
}
}
/* With 4-level paging, copying happens on the p4d level. */
p4d = p4d_offset(pgd, address);
p4d_ref = p4d_offset(pgd_ref, address);
if (p4d_none(*p4d_ref))
p4d_k = p4d_offset(pgd_k, address);
if (p4d_none(*p4d_k))
return -1;
if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) {
set_p4d(p4d, *p4d_ref);
if (p4d_none(*p4d) && !pgtable_l5_enabled) {
set_p4d(p4d, *p4d_k);
arch_flush_lazy_mmu_mode();
} else {
BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
}
/*
* Below here mismatches are bugs because these lower tables
* are shared:
*/
BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
pud = pud_offset(p4d, address);
pud_ref = pud_offset(p4d_ref, address);
if (pud_none(*pud_ref))
if (pud_none(*pud))
return -1;
if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
BUG();
if (pud_large(*pud))
return 0;
pmd = pmd_offset(pud, address);
pmd_ref = pmd_offset(pud_ref, address);
if (pmd_none(*pmd_ref))
if (pmd_none(*pmd))
return -1;
if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
BUG();
if (pmd_large(*pmd))
return 0;
pte_ref = pte_offset_kernel(pmd_ref, address);
if (!pte_present(*pte_ref))
return -1;
pte = pte_offset_kernel(pmd, address);
/*
* Don't use pte_page here, because the mappings can point
* outside mem_map, and the NUMA hash lookup cannot handle
* that:
*/
if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
BUG();
if (!pte_present(*pte))
return -1;
return 0;
}
......
......@@ -120,7 +120,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
if (pgtable_l5_enabled) {
set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
} else {
/*
......
......@@ -88,12 +88,7 @@ static int __init nonx32_setup(char *str)
}
__setup("noexec32=", nonx32_setup);
/*
* When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
#ifdef CONFIG_X86_5LEVEL
void sync_global_pgds(unsigned long start, unsigned long end)
static void sync_global_pgds_l5(unsigned long start, unsigned long end)
{
unsigned long addr;
......@@ -129,8 +124,8 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
#else
void sync_global_pgds(unsigned long start, unsigned long end)
static void sync_global_pgds_l4(unsigned long start, unsigned long end)
{
unsigned long addr;
......@@ -143,7 +138,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
* With folded p4d, pgd_none() is always false, we need to
* handle synchonization on p4d level.
*/
BUILD_BUG_ON(pgd_none(*pgd_ref));
MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);
if (p4d_none(*p4d_ref))
......@@ -173,7 +168,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
#endif
/*
* When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
void sync_global_pgds(unsigned long start, unsigned long end)
{
if (pgtable_l5_enabled)
sync_global_pgds_l5(start, end);
else
sync_global_pgds_l4(start, end);
}
/*
* NOTE: This function is marked __ref because it calls __init function
......@@ -632,7 +638,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);
if (!IS_ENABLED(CONFIG_X86_5LEVEL))
if (!pgtable_l5_enabled)
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
......@@ -712,7 +718,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
if (IS_ENABLED(CONFIG_X86_5LEVEL))
if (pgtable_l5_enabled)
pgd_populate(&init_mm, pgd, p4d);
else
p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
......@@ -1089,7 +1095,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
* 5-level case we should free them. This code will have to change
* to adapt for boot-time switching between 4 and 5 level page tables.
*/
if (CONFIG_PGTABLE_LEVELS == 5)
if (pgtable_l5_enabled)
free_pud_table(pud_base, p4d);
}
......
// SPDX-License-Identifier: GPL-2.0
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
#ifdef CONFIG_X86_5LEVEL
/* Too early to use cpu_feature_enabled() */
#define pgtable_l5_enabled __pgtable_l5_enabled
#endif
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
......@@ -19,7 +25,7 @@
extern struct range pfn_mapped[E820_MAX_ENTRIES];
static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
static __init void *early_alloc(size_t size, int nid, bool panic)
{
......@@ -176,10 +182,10 @@ static void __init clear_pgds(unsigned long start,
* With folded p4d, pgd_clear() is nop, use p4d_clear()
* instead.
*/
if (CONFIG_PGTABLE_LEVELS < 5)
p4d_clear(p4d_offset(pgd, start));
else
if (pgtable_l5_enabled)
pgd_clear(pgd);
else
p4d_clear(p4d_offset(pgd, start));
}
pgd = pgd_offset_k(start);
......@@ -191,7 +197,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
{
unsigned long p4d;
if (!IS_ENABLED(CONFIG_X86_5LEVEL))
if (!pgtable_l5_enabled)
return (p4d_t *)pgd;
p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
......@@ -272,7 +278,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
......@@ -303,7 +309,7 @@ void __init kasan_init(void)
* bunch of things like kernel code, modules, EFI mapping, etc.
* We need to take extra steps to not overwrite them.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
if (pgtable_l5_enabled) {
void *ptr;
ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
......
......@@ -34,23 +34,12 @@
#define TB_SHIFT 40
/*
* Virtual address start and end range for randomization.
*
* The end address could depend on more configuration options to make the
* highest amount of space for randomization available, but that's too hard
* to keep straight and caused issues already.
*/
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
/* Default values */
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
EXPORT_SYMBOL(page_offset_base);
unsigned long vmalloc_base = __VMALLOC_BASE;
EXPORT_SYMBOL(vmalloc_base);
unsigned long vmemmap_base = __VMEMMAP_BASE;
EXPORT_SYMBOL(vmemmap_base);
/*
* Memory regions randomized by KASLR (except modules that use a separate logic
* earlier during boot). The list is ordered based on virtual addresses. This
......@@ -60,8 +49,8 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
{ &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB },
{ &page_offset_base, 0 },
{ &vmalloc_base, 0 },
{ &vmemmap_base, 1 },
};
......@@ -84,11 +73,14 @@ static inline bool kaslr_memory_enabled(void)
void __init kernel_randomize_memory(void)
{
size_t i;
unsigned long vaddr = vaddr_start;
unsigned long vaddr_start, vaddr;
unsigned long rand, memory_tb;
struct rnd_state rand_state;
unsigned long remain_entropy;
vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
vaddr = vaddr_start;
/*
* These BUILD_BUG_ON checks ensure the memory layout is consistent
* with the vaddr_start/vaddr_end variables. These checks are very
......@@ -101,6 +93,9 @@ void __init kernel_randomize_memory(void)
if (!kaslr_memory_enabled())
return;
kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
/*
* Update Physical memory mapping to available and
* add padding if needed (especially for memory hotplug support).
......@@ -129,7 +124,7 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
if (IS_ENABLED(CONFIG_X86_5LEVEL))
if (pgtable_l5_enabled)
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
......@@ -141,7 +136,7 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
if (IS_ENABLED(CONFIG_X86_5LEVEL))
if (pgtable_l5_enabled)
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
......@@ -217,7 +212,7 @@ void __meminit init_trampoline(void)
return;
}
if (IS_ENABLED(CONFIG_X86_5LEVEL))
if (pgtable_l5_enabled)
init_trampoline_p4d();
else
init_trampoline_pud();
......
This diff is collapsed.
This diff is collapsed.
......@@ -60,17 +60,6 @@ void memory_present(int nid, unsigned long start, unsigned long end)
}
printk(KERN_CONT "\n");
}
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long nr_pages = end_pfn - start_pfn;
if (!nr_pages)
return 0;
return (nr_pages + 1) * sizeof(struct page);
}
#endif
extern unsigned long highend_pfn, highstart_pfn;
......
......@@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
unsigned long sp = current_stack_pointer;
pgd_t *pgd = pgd_offset(mm, sp);
if (CONFIG_PGTABLE_LEVELS > 4) {
if (pgtable_l5_enabled) {
if (unlikely(pgd_none(*pgd))) {
pgd_t *pgd_ref = pgd_offset_k(sp);
......@@ -613,7 +613,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
{
int cpu;
struct flush_tlb_info info = {
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
};
......
......@@ -27,6 +27,7 @@
#include <linux/ioport.h>
#include <linux/mc146818rtc.h>
#include <linux/efi.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/reboot.h>
......@@ -190,7 +191,8 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
early_code_mapping_set_exec(0);
}
static pgd_t *efi_pgd;
pgd_t *efi_pgd;
EXPORT_SYMBOL_GPL(efi_pgd);
/*
* We need our own copy of the higher levels of the page tables
......@@ -225,7 +227,7 @@ int __init efi_alloc_page_tables(void)
pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
if (!pud) {
if (CONFIG_PGTABLE_LEVELS > 4)
if (pgtable_l5_enabled)
free_page((unsigned long) pgd_page_vaddr(*pgd));
free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
return -ENOMEM;
......@@ -255,8 +257,8 @@ void efi_sync_low_kernel_mappings(void)
* only span a single PGD entry and that the entry also maps
* other important kernel regions.
*/
BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
(EFI_VA_END & PGDIR_MASK));
pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
......
......@@ -199,6 +199,12 @@ void __init x86_intel_mid_early_setup(void)
legacy_pic = &null_legacy_pic;
/*
* Do nothing for now as everything needed done in
* x86_intel_mid_early_setup() below.
*/
x86_init.acpi.reduced_hw_early_init = x86_init_noop;
pm_power_off = intel_mid_power_off;
machine_ops.emergency_restart = intel_mid_reboot;
......
......@@ -50,7 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
{
pmd_t *pmd;
pud_t *pud;
p4d_t *p4d;
p4d_t *p4d = NULL;
/*
* The new mapping only has to cover the page containing the image
......@@ -66,7 +66,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
* tables used by the image kernel.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
if (pgtable_l5_enabled) {
p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
if (!p4d)
return -ENOMEM;
......@@ -84,7 +84,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
__pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
set_pud(pud + pud_index(restore_jump_address),
__pud(__pa(pmd) | _KERNPG_TABLE));
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
if (p4d) {
set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE));
set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE));
} else {
......
......@@ -18,9 +18,6 @@ config XEN_PV
bool "Xen PV guest support"
default y
depends on XEN
# XEN_PV is not ready to work with 5-level paging.
# Changes to hypervisor are also required.
depends on !X86_5LEVEL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
help
......@@ -79,6 +76,4 @@ config XEN_DEBUG_FS
config XEN_PVH
bool "Support for running as a PVH guest"
depends on XEN && XEN_PVHVM && ACPI
# Pre-built page tables are not ready to handle 5-level paging.
depends on !X86_5LEVEL
def_bool n
......@@ -6,6 +6,7 @@
#include <asm/io_apic.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
#include <asm/x86_init.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
......@@ -16,15 +17,20 @@
/*
* PVH variables.
*
* xen_pvh and pvh_bootparams need to live in data segment since they
* are used after startup_{32|64}, which clear .bss, are invoked.
* xen_pvh pvh_bootparams and pvh_start_info need to live in data segment
* since they are used after startup_{32|64}, which clear .bss, are invoked.
*/
bool xen_pvh __attribute__((section(".data"))) = 0;
struct boot_params pvh_bootparams __attribute__((section(".data")));
struct hvm_start_info pvh_start_info __attribute__((section(".data")));
struct hvm_start_info pvh_start_info;
unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
static u64 pvh_get_root_pointer(void)
{
return pvh_start_info.rsdp_paddr;
}
static void __init init_pvh_bootparams(void)
{
struct xen_memory_map memmap;
......@@ -71,6 +77,8 @@ static void __init init_pvh_bootparams(void)
*/
pvh_bootparams.hdr.version = 0x212;
pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
x86_init.acpi.get_root_pointer = pvh_get_root_pointer;
}
/*
......
......@@ -538,6 +538,22 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#if CONFIG_PGTABLE_LEVELS >= 5
__visible p4dval_t xen_p4d_val(p4d_t p4d)
{
return pte_mfn_to_pfn(p4d.p4d);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);
__visible p4d_t xen_make_p4d(p4dval_t p4d)
{
p4d = pte_pfn_to_mfn(p4d);
return native_make_p4d(p4d);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_X86_64 */
static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
......@@ -2411,6 +2427,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
#if CONFIG_PGTABLE_LEVELS >= 5
.p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif
#endif /* CONFIG_X86_64 */
.activate_mm = xen_activate_mm,
......
......@@ -189,12 +189,15 @@ early_param("acpi_rsdp", setup_acpi_rsdp);
acpi_physical_address __init acpi_os_get_root_pointer(void)
{
acpi_physical_address pa = 0;
acpi_physical_address pa;
#ifdef CONFIG_KEXEC
if (acpi_rsdp)
return acpi_rsdp;
#endif
pa = acpi_arch_get_root_pointer();
if (pa)
return pa;
if (efi_enabled(EFI_CONFIG_TABLES)) {
if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
......
......@@ -8,6 +8,7 @@
#define P4D_SHIFT PGDIR_SHIFT
#define P4D_SIZE PGDIR_SIZE
#define P4D_MASK PGDIR_MASK
#define MAX_PTRS_PER_P4D 1
#define PTRS_PER_P4D 1
#define p4d_t pgd_t
......
......@@ -9,6 +9,7 @@
typedef struct { pgd_t pgd; } p4d_t;
#define P4D_SHIFT PGDIR_SHIFT
#define MAX_PTRS_PER_P4D 1
#define PTRS_PER_P4D 1
#define P4D_SIZE (1UL << P4D_SHIFT)
#define P4D_MASK (~(P4D_SIZE-1))
......
......@@ -623,6 +623,13 @@ bool acpi_gtdt_c3stop(int type);
int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
#endif
#ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER
static inline u64 acpi_arch_get_root_pointer(void)
{
return 0;
}
#endif
#else /* !CONFIG_ACPI */
#define acpi_disabled 1
......
......@@ -18,7 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE];
extern pte_t kasan_zero_pte[PTRS_PER_PTE];
extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
extern pud_t kasan_zero_pud[PTRS_PER_PUD];
extern p4d_t kasan_zero_p4d[PTRS_PER_P4D];
extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D];
void kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end);
......
......@@ -816,10 +816,6 @@ int local_memory_node(int node_id);
static inline int local_memory_node(int node_id) { return node_id; };
#endif
#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
#endif
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
......@@ -1289,7 +1285,6 @@ struct mminit_pfnnid_cache {
#endif
void memory_present(int nid, unsigned long start, unsigned long end);
unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
/*
* If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
......
......@@ -31,7 +31,7 @@
unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
#if CONFIG_PGTABLE_LEVELS > 4
p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss;
p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
#endif
#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
......
......@@ -235,28 +235,6 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
}
}
/*
* Only used by the i386 NUMA architecures, but relatively
* generic code.
*/
unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn;
unsigned long nr_pages = 0;
mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
if (nid != early_pfn_to_nid(pfn))
continue;
if (pfn_present(pfn))
nr_pages += PAGES_PER_SECTION;
}
return nr_pages * sizeof(struct page);
}
/*
* Subtle, we encode the real pfn into the mem_map such that
* the identity pfn - section_mem_map will return the actual
......
......@@ -84,18 +84,19 @@
* This is made more complicated by various memory models and PAE.
*/
#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
#ifndef MAX_POSSIBLE_PHYSMEM_BITS
#ifdef MAX_PHYSMEM_BITS
#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
#else
/*
* If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
* be PAGE_SHIFT
*/
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
/*
* Memory for allocating for handle keeps object position by
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment