Commit e345dcc9, authored by Hugh Dickins, committed by Greg Kroah-Hartman

kaiser: add "nokaiser" boot option, using ALTERNATIVE


Added "nokaiser" boot option: an early param like "noinvpcid".
Most places now check int kaiser_enabled (#defined 0 when not
CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
and entry_64_compat.S are using the ALTERNATIVE technique, which
patches in the preferred instructions at runtime.  That technique
is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated.

Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that,
but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when
nokaiser like when !CONFIG_KAISER, but not setting either when kaiser -
neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
won't get set in some obscure corner, or something adds PGE into CR4.
By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled,
all page table setup which uses pfn_pte() masks it out of the ptes.

It's slightly shameful that the same declaration versus definition of
kaiser_enabled appears in not one, not two, but in three header files
(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h).  I felt safer that way,
than with #including any of those in any of the others; and did not
feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
them all, so we shall hear about it if they get out of synch.

Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
from kaiser.c; removed the unused native_get_normal_pgd(); removed
the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
comments.  But more interestingly, set CR4.PSE in secondary_startup_64:
the manual is clear that it does not matter whether it's 0 or 1 when
4-level-pts are enabled, but I was distracted to find cr4 different on
BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask().
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 500943e5
...@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ...@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nojitter [IA-64] Disables jitter checking for ITC timers. nojitter [IA-64] Disables jitter checking for ITC timers.
nokaiser [X86-64] Disable KAISER isolation of kernel from user.
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
......
...@@ -1051,7 +1051,7 @@ ENTRY(paranoid_entry) ...@@ -1051,7 +1051,7 @@ ENTRY(paranoid_entry)
* unconditionally, but we need to find out whether the reverse * unconditionally, but we need to find out whether the reverse
* should be done on return (conveyed to paranoid_exit in %ebx). * should be done on return (conveyed to paranoid_exit in %ebx).
*/ */
movq %cr3, %rax ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
testl $KAISER_SHADOW_PGD_OFFSET, %eax testl $KAISER_SHADOW_PGD_OFFSET, %eax
jz 2f jz 2f
orl $2, %ebx orl $2, %ebx
...@@ -1083,6 +1083,7 @@ ENTRY(paranoid_exit) ...@@ -1083,6 +1083,7 @@ ENTRY(paranoid_exit)
TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF_DEBUG
TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ_DEBUG
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
testl $2, %ebx /* SWITCH_USER_CR3 needed? */ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
jz paranoid_exit_no_switch jz paranoid_exit_no_switch
SWITCH_USER_CR3 SWITCH_USER_CR3
...@@ -1315,13 +1316,14 @@ ENTRY(nmi) ...@@ -1315,13 +1316,14 @@ ENTRY(nmi)
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */ /* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */ /* %rax is saved above, so OK to clobber here */
movq %cr3, %rax ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
orq x86_cr3_pcid_noflush, %rax orq x86_cr3_pcid_noflush, %rax
pushq %rax pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */ /* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3 movq %rax, %cr3
2:
#endif #endif
call do_nmi call do_nmi
...@@ -1331,8 +1333,7 @@ ENTRY(nmi) ...@@ -1331,8 +1333,7 @@ ENTRY(nmi)
* kernel code that needs user CR3, but do we ever return * kernel code that needs user CR3, but do we ever return
* to "user mode" where we need the kernel CR3? * to "user mode" where we need the kernel CR3?
*/ */
popq %rax ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
mov %rax, %cr3
#endif #endif
/* /*
...@@ -1559,13 +1560,14 @@ end_repeat_nmi: ...@@ -1559,13 +1560,14 @@ end_repeat_nmi:
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */ /* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */ /* %rax is saved above, so OK to clobber here */
movq %cr3, %rax ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
orq x86_cr3_pcid_noflush, %rax orq x86_cr3_pcid_noflush, %rax
pushq %rax pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */ /* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3 movq %rax, %cr3
2:
#endif #endif
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
...@@ -1577,8 +1579,7 @@ end_repeat_nmi: ...@@ -1577,8 +1579,7 @@ end_repeat_nmi:
* kernel code that needs user CR3, like just just before * kernel code that needs user CR3, like just just before
* a sysret. * a sysret.
*/ */
popq %rax ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
mov %rax, %cr3
#endif #endif
testl %ebx, %ebx /* swapgs needed? */ testl %ebx, %ebx /* swapgs needed? */
......
...@@ -200,6 +200,9 @@ ...@@ -200,6 +200,9 @@
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
/* Virtualization flags: Linux defined, word 8 */ /* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
......
...@@ -46,28 +46,33 @@ movq \reg, %cr3 ...@@ -46,28 +46,33 @@ movq \reg, %cr3
.endm .endm
.macro SWITCH_KERNEL_CR3 .macro SWITCH_KERNEL_CR3
pushq %rax ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax _SWITCH_TO_KERNEL_CR3 %rax
popq %rax popq %rax
8:
.endm .endm
.macro SWITCH_USER_CR3 .macro SWITCH_USER_CR3
pushq %rax ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_USER_CR3 %rax %al _SWITCH_TO_USER_CR3 %rax %al
popq %rax popq %rax
8:
.endm .endm
.macro SWITCH_KERNEL_CR3_NO_STACK .macro SWITCH_KERNEL_CR3_NO_STACK
movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) ALTERNATIVE "jmp 8f", \
__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax _SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
8:
.endm .endm
#else /* CONFIG_KAISER */ #else /* CONFIG_KAISER */
.macro SWITCH_KERNEL_CR3 reg .macro SWITCH_KERNEL_CR3
.endm .endm
.macro SWITCH_USER_CR3 reg regb .macro SWITCH_USER_CR3
.endm .endm
.macro SWITCH_KERNEL_CR3_NO_STACK .macro SWITCH_KERNEL_CR3_NO_STACK
.endm .endm
...@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); ...@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
extern int kaiser_enabled;
#else
#define kaiser_enabled 0
#endif /* CONFIG_KAISER */
/*
* Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
* so as to build with tests on kaiser_enabled instead of #ifdefs.
*/
/** /**
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range * @addr: the start address of the range
...@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); ...@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
*/ */
extern void kaiser_init(void); extern void kaiser_init(void);
#endif /* CONFIG_KAISER */
#endif /* __ASSEMBLY */ #endif /* __ASSEMBLY */
#endif /* _ASM_X86_KAISER_H */ #endif /* _ASM_X86_KAISER_H */
...@@ -18,6 +18,12 @@ ...@@ -18,6 +18,12 @@
#ifndef __ASSEMBLY__ #ifndef __ASSEMBLY__
#include <asm/x86_init.h> #include <asm/x86_init.h>
#ifdef CONFIG_KAISER
extern int kaiser_enabled;
#else
#define kaiser_enabled 0
#endif
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void); void ptdump_walk_pgd_level_checkwx(void);
...@@ -660,7 +666,7 @@ static inline int pgd_bad(pgd_t pgd) ...@@ -660,7 +666,7 @@ static inline int pgd_bad(pgd_t pgd)
* page table by accident; it will fault on the first * page table by accident; it will fault on the first
* instruction it tries to run. See native_set_pgd(). * instruction it tries to run. See native_set_pgd().
*/ */
if (IS_ENABLED(CONFIG_KAISER)) if (kaiser_enabled)
ignore_flags |= _PAGE_NX; ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
...@@ -867,10 +873,12 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) ...@@ -867,10 +873,12 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{ {
memcpy(dst, src, count * sizeof(pgd_t)); memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
if (kaiser_enabled) {
/* Clone the shadow pgd part as well */ /* Clone the shadow pgd part as well */
memcpy(native_get_shadow_pgd(dst), memcpy(native_get_shadow_pgd(dst),
native_get_shadow_pgd(src), native_get_shadow_pgd(src),
count * sizeof(pgd_t)); count * sizeof(pgd_t));
}
#endif #endif
} }
......
...@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); ...@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{ {
#ifdef CONFIG_DEBUG_VM
/* linux/mmdebug.h may not have been included at this point */
BUG_ON(!kaiser_enabled);
#endif
return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
} }
static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
{
return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
}
#else #else
static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{ {
...@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) ...@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
BUILD_BUG_ON(1); BUILD_BUG_ON(1);
return NULL; return NULL;
} }
static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
{
return pgdp;
}
#endif /* CONFIG_KAISER */ #endif /* CONFIG_KAISER */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
......
...@@ -39,11 +39,7 @@ ...@@ -39,11 +39,7 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#ifdef CONFIG_KAISER
#define _PAGE_GLOBAL (_AT(pteval_t, 0))
#else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
......
...@@ -136,9 +136,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) ...@@ -136,9 +136,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
* to avoid the need for asm/kaiser.h in unexpected places. * to avoid the need for asm/kaiser.h in unexpected places.
*/ */
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
extern int kaiser_enabled;
extern void kaiser_setup_pcid(void); extern void kaiser_setup_pcid(void);
extern void kaiser_flush_tlb_on_return_to_user(void); extern void kaiser_flush_tlb_on_return_to_user(void);
#else #else
#define kaiser_enabled 0
static inline void kaiser_setup_pcid(void) static inline void kaiser_setup_pcid(void)
{ {
} }
...@@ -163,7 +165,7 @@ static inline void __native_flush_tlb(void) ...@@ -163,7 +165,7 @@ static inline void __native_flush_tlb(void)
* back: * back:
*/ */
preempt_disable(); preempt_disable();
if (this_cpu_has(X86_FEATURE_PCID)) if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
kaiser_flush_tlb_on_return_to_user(); kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3()); native_write_cr3(native_read_cr3());
preempt_enable(); preempt_enable();
...@@ -174,19 +176,29 @@ static inline void __native_flush_tlb_global_irq_disabled(void) ...@@ -174,19 +176,29 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
unsigned long cr4; unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4); cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */ if (cr4 & X86_CR4_PGE) {
/* clear PGE and flush TLB of all entries */
native_write_cr4(cr4 & ~X86_CR4_PGE); native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */ /* restore PGE as it was before */
native_write_cr4(cr4); native_write_cr4(cr4);
} else {
/*
* x86_64 microcode update comes this way when CR4.PGE is not
* enabled, and it's safer for all callers to allow this case.
*/
native_write_cr3(native_read_cr3());
}
} }
static inline void __native_flush_tlb_global(void) static inline void __native_flush_tlb_global(void)
{ {
#ifdef CONFIG_KAISER unsigned long flags;
if (kaiser_enabled) {
/* Globals are not used at all */ /* Globals are not used at all */
__native_flush_tlb(); __native_flush_tlb();
#else return;
unsigned long flags; }
if (this_cpu_has(X86_FEATURE_INVPCID)) { if (this_cpu_has(X86_FEATURE_INVPCID)) {
/* /*
...@@ -207,7 +219,6 @@ static inline void __native_flush_tlb_global(void) ...@@ -207,7 +219,6 @@ static inline void __native_flush_tlb_global(void)
raw_local_irq_save(flags); raw_local_irq_save(flags);
__native_flush_tlb_global_irq_disabled(); __native_flush_tlb_global_irq_disabled();
raw_local_irq_restore(flags); raw_local_irq_restore(flags);
#endif
} }
static inline void __native_flush_tlb_single(unsigned long addr) static inline void __native_flush_tlb_single(unsigned long addr)
...@@ -222,7 +233,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) ...@@ -222,7 +233,7 @@ static inline void __native_flush_tlb_single(unsigned long addr)
*/ */
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
if (this_cpu_has(X86_FEATURE_PCID)) if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
kaiser_flush_tlb_on_return_to_user(); kaiser_flush_tlb_on_return_to_user();
asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return; return;
...@@ -237,9 +248,9 @@ static inline void __native_flush_tlb_single(unsigned long addr) ...@@ -237,9 +248,9 @@ static inline void __native_flush_tlb_single(unsigned long addr)
* Make sure to do only a single invpcid when KAISER is * Make sure to do only a single invpcid when KAISER is
* disabled and we have only a single ASID. * disabled and we have only a single ASID.
*/ */
if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) if (kaiser_enabled)
invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
} }
static inline void __flush_tlb_all(void) static inline void __flush_tlb_all(void)
......
...@@ -178,6 +178,20 @@ static int __init x86_pcid_setup(char *s) ...@@ -178,6 +178,20 @@ static int __init x86_pcid_setup(char *s)
return 1; return 1;
} }
__setup("nopcid", x86_pcid_setup); __setup("nopcid", x86_pcid_setup);
/*
 * x86_nokaiser_setup - early-param handler for the "nokaiser" boot option.
 * @s: value supplied with the option; "nokaiser" takes none, so any
 *     non-NULL @s is rejected.
 *
 * When CONFIG_KAISER is built in, turns KAISER off for this boot by
 * clearing both the kaiser_enabled flag and the X86_FEATURE_KAISER
 * capability bit, then logs the fact.  When CONFIG_KAISER is not set
 * the option is accepted and does nothing.
 *
 * Returns 0 on success, -EINVAL if a value was passed.
 */
static int __init x86_nokaiser_setup(char *s)
{
/* nokaiser doesn't accept parameters */
if (s)
return -EINVAL;
#ifdef CONFIG_KAISER
/* C code tests this flag in place of #ifdef CONFIG_KAISER */
kaiser_enabled = 0;
/*
 * The ALTERNATIVE sites in the entry code are keyed on
 * X86_FEATURE_KAISER rather than on kaiser_enabled, so the
 * feature bit has to be cleared as well.
 */
setup_clear_cpu_cap(X86_FEATURE_KAISER);
pr_info("nokaiser: KAISER feature disabled\n");
#endif
return 0;
}
/* Registered as an early param, like "noinvpcid" */
early_param("nokaiser", x86_nokaiser_setup);
#endif #endif
static int __init x86_noinvpcid_setup(char *s) static int __init x86_noinvpcid_setup(char *s)
...@@ -324,7 +338,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) ...@@ -324,7 +338,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
static void setup_pcid(struct cpuinfo_x86 *c) static void setup_pcid(struct cpuinfo_x86 *c)
{ {
if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) { if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
cr4_set_bits(X86_CR4_PCIDE); cr4_set_bits(X86_CR4_PCIDE);
/* /*
* INVPCID has two "groups" of types: * INVPCID has two "groups" of types:
...@@ -747,6 +761,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c) ...@@ -747,6 +761,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_power = cpuid_edx(0x80000007); c->x86_power = cpuid_edx(0x80000007);
init_scattered_cpuid_features(c); init_scattered_cpuid_features(c);
#ifdef CONFIG_KAISER
if (kaiser_enabled)
set_cpu_cap(c, X86_FEATURE_KAISER);
#endif
} }
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
...@@ -1406,6 +1424,14 @@ void cpu_init(void) ...@@ -1406,6 +1424,14 @@ void cpu_init(void)
* try to read it. * try to read it.
*/ */
cr4_init_shadow(); cr4_init_shadow();
if (!kaiser_enabled) {
/*
* secondary_startup_64() deferred setting PGE in cr4:
* probe_page_size_mask() sets it on the boot cpu,
* but it needs to be set on each secondary cpu.
*/
cr4_set_bits(X86_CR4_PGE);
}
/* /*
* Load microcode on this cpu if a valid microcode is available. * Load microcode on this cpu if a valid microcode is available.
......
...@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void) ...@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void)
* area to ensure it is mapped into the shadow user page * area to ensure it is mapped into the shadow user page
* tables. * tables.
*/ */
if (IS_ENABLED(CONFIG_KAISER)) if (kaiser_enabled) {
set_pgd(native_get_shadow_pgd(pgd_p), set_pgd(native_get_shadow_pgd(pgd_p),
__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
}
/* Randomize the locations */ /* Randomize the locations */
init_espfix_random(); init_espfix_random();
......
...@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64) ...@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
movq $(init_level4_pgt - __START_KERNEL_map), %rax movq $(init_level4_pgt - __START_KERNEL_map), %rax
1: 1:
/* Enable PAE mode and PGE */ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
movq %rcx, %cr4 movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */ /* Setup early boot stage 4 level pagetables. */
......
...@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void) ...@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE); cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */ /* Enable PGE if available */
if (cpu_has_pge) { if (cpu_has_pge && !kaiser_enabled) {
cr4_set_bits_and_update_boot(X86_CR4_PGE); cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL; __supported_pte_mask |= _PAGE_GLOBAL;
} else } else
......
...@@ -395,6 +395,16 @@ void __init cleanup_highmap(void) ...@@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
continue; continue;
if (vaddr < (unsigned long) _text || vaddr > end) if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0)); set_pmd(pmd, __pmd(0));
else if (kaiser_enabled) {
/*
* level2_kernel_pgt is initialized with _PAGE_GLOBAL:
* clear that now. This is not important, so long as
* CR4.PGE remains clear, but it removes an anomaly.
* Physical mapping setup below avoids _PAGE_GLOBAL
* by use of massage_pgprot() inside pfn_pte() etc.
*/
set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
}
} }
} }
......
...@@ -17,7 +17,9 @@ ...@@ -17,7 +17,9 @@
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
#include <asm/desc.h> #include <asm/desc.h>
#ifdef CONFIG_KAISER int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
__visible __visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
...@@ -168,7 +170,7 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) ...@@ -168,7 +170,7 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
return pte_offset_kernel(pmd, address); return pte_offset_kernel(pmd, address);
} }
int kaiser_add_user_map(const void *__start_addr, unsigned long size, static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long flags) unsigned long flags)
{ {
int ret = 0; int ret = 0;
...@@ -178,6 +180,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, ...@@ -178,6 +180,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long end_addr = PAGE_ALIGN(start_addr + size); unsigned long end_addr = PAGE_ALIGN(start_addr + size);
unsigned long target_address; unsigned long target_address;
/*
* It is convenient for callers to pass in __PAGE_KERNEL etc,
* and there is no actual harm from setting _PAGE_GLOBAL, so
* long as CR4.PGE is not set. But it is nonetheless troubling
* to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
* requires that not to be #defined to 0): so mask it off here.
*/
flags &= ~_PAGE_GLOBAL;
for (; address < end_addr; address += PAGE_SIZE) { for (; address < end_addr; address += PAGE_SIZE) {
target_address = get_pa_from_mapping(address); target_address = get_pa_from_mapping(address);
if (target_address == -1) { if (target_address == -1) {
...@@ -264,6 +275,8 @@ void __init kaiser_init(void) ...@@ -264,6 +275,8 @@ void __init kaiser_init(void)
{ {
int cpu; int cpu;
if (!kaiser_enabled)
return;
kaiser_init_all_pgds(); kaiser_init_all_pgds();
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
...@@ -312,6 +325,8 @@ void __init kaiser_init(void) ...@@ -312,6 +325,8 @@ void __init kaiser_init(void)
/* Add a mapping to the shadow mapping, and synchronize the mappings */ /* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{ {
if (!kaiser_enabled)
return 0;
return kaiser_add_user_map((const void *)addr, size, flags); return kaiser_add_user_map((const void *)addr, size, flags);
} }
...@@ -323,6 +338,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) ...@@ -323,6 +338,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size)
unsigned long addr, next; unsigned long addr, next;
pgd_t *pgd; pgd_t *pgd;
if (!kaiser_enabled)
return;
pgd = native_get_shadow_pgd(pgd_offset_k(start)); pgd = native_get_shadow_pgd(pgd_offset_k(start));
for (addr = start; addr < end; pgd++, addr = next) { for (addr = start; addr < end; pgd++, addr = next) {
next = pgd_addr_end(addr, end); next = pgd_addr_end(addr, end);
...@@ -344,6 +361,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp) ...@@ -344,6 +361,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp)
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{ {
if (!kaiser_enabled)
return pgd;
/* /*
* Do we need to also populate the shadow pgd? Check _PAGE_USER to * Do we need to also populate the shadow pgd? Check _PAGE_USER to
* skip cases like kexec and EFI which make temporary low mappings. * skip cases like kexec and EFI which make temporary low mappings.
...@@ -400,4 +419,3 @@ void kaiser_flush_tlb_on_return_to_user(void) ...@@ -400,4 +419,3 @@ void kaiser_flush_tlb_on_return_to_user(void)
X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
} }
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
#endif /* CONFIG_KAISER */
...@@ -341,16 +341,12 @@ static inline void _pgd_free(pgd_t *pgd) ...@@ -341,16 +341,12 @@ static inline void _pgd_free(pgd_t *pgd)
} }
#else #else
#ifdef CONFIG_KAISER
/* /*
* Instead of one pmd, we aquire two pmds. Being order-1, it is * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12 * both 8k in size and 8k-aligned. That lets us just flip bit 12
* in a pointer to swap between the two 4k halves. * in a pointer to swap between the two 4k halves.
*/ */
#define PGD_ALLOCATION_ORDER 1 #define PGD_ALLOCATION_ORDER kaiser_enabled
#else
#define PGD_ALLOCATION_ORDER 0
#endif
static inline pgd_t *_pgd_alloc(void) static inline pgd_t *_pgd_alloc(void)
{ {
......
...@@ -39,8 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) ...@@ -39,8 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir)
{ {
unsigned long new_mm_cr3 = __pa(pgdir); unsigned long new_mm_cr3 = __pa(pgdir);
#ifdef CONFIG_KAISER if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
if (this_cpu_has(X86_FEATURE_PCID)) {
/* /*
* We reuse the same PCID for different tasks, so we must * We reuse the same PCID for different tasks, so we must
* flush all the entries for the PCID out when we change tasks. * flush all the entries for the PCID out when we change tasks.
...@@ -57,7 +56,6 @@ static void load_new_mm_cr3(pgd_t *pgdir) ...@@ -57,7 +56,6 @@ static void load_new_mm_cr3(pgd_t *pgdir)
new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
kaiser_flush_tlb_on_return_to_user(); kaiser_flush_tlb_on_return_to_user();
} }
#endif /* CONFIG_KAISER */
/* /*
* Caution: many callers of this function expect * Caution: many callers of this function expect
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment