Commit bed9bb7f authored by Dave Hansen's avatar Dave Hansen Committed by Greg Kroah-Hartman

kaiser: merged update


Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging).
Signed-off-by: default avatarDave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: default avatarHugh Dickins <hughd@google.com>
Acked-by: default avatarJiri Kosina <jkosina@suse.cz>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 8a43ddfb
...@@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath: ...@@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11 movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11 RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3 SWITCH_USER_CR3
movq RSP(%rsp), %rsp movq RSP(%rsp), %rsp
/* /*
...@@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call) ...@@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call)
syscall_return_via_sysret: syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */ /* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11 RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3 SWITCH_USER_CR3
movq RSP(%rsp), %rsp movq RSP(%rsp), %rsp
USERGS_SYSRET64 USERGS_SYSRET64
opportunistic_sysret_failed: opportunistic_sysret_failed:
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3 SWITCH_USER_CR3
SWAPGS SWAPGS
jmp restore_c_regs_and_iret jmp restore_c_regs_and_iret
...@@ -1059,6 +1080,13 @@ ENTRY(error_entry) ...@@ -1059,6 +1080,13 @@ ENTRY(error_entry)
cld cld
SAVE_C_REGS 8 SAVE_C_REGS 8
SAVE_EXTRA_REGS 8 SAVE_EXTRA_REGS 8
/*
* error_entry() always returns with a kernel gsbase and
* CR3. We must also have a kernel CR3/gsbase before
* calling TRACE_IRQS_*. Just unconditionally switch to
* the kernel CR3 here.
*/
SWITCH_KERNEL_CR3
xorl %ebx, %ebx xorl %ebx, %ebx
testb $3, CS+8(%rsp) testb $3, CS+8(%rsp)
jz .Lerror_kernelspace jz .Lerror_kernelspace
...@@ -1069,7 +1097,6 @@ ENTRY(error_entry) ...@@ -1069,7 +1097,6 @@ ENTRY(error_entry)
* from user mode due to an IRET fault. * from user mode due to an IRET fault.
*/ */
SWAPGS SWAPGS
SWITCH_KERNEL_CR3
.Lerror_entry_from_usermode_after_swapgs: .Lerror_entry_from_usermode_after_swapgs:
/* /*
...@@ -1122,7 +1149,7 @@ ENTRY(error_entry) ...@@ -1122,7 +1149,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase: * Switch to kernel gsbase:
*/ */
SWAPGS SWAPGS
SWITCH_KERNEL_CR3
/* /*
* Pretend that the exception came from user mode: set up pt_regs * Pretend that the exception came from user mode: set up pt_regs
* as if we faulted immediately after IRET and clear EBX so that * as if we faulted immediately after IRET and clear EBX so that
...@@ -1222,7 +1249,10 @@ ENTRY(nmi) ...@@ -1222,7 +1249,10 @@ ENTRY(nmi)
*/ */
SWAPGS_UNSAFE_STACK SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK /*
* percpu variables are mapped with user CR3, so no need
* to switch CR3 here.
*/
cld cld
movq %rsp, %rdx movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
...@@ -1256,14 +1286,33 @@ ENTRY(nmi) ...@@ -1256,14 +1286,33 @@ ENTRY(nmi)
movq %rsp, %rdi movq %rsp, %rdi
movq $-1, %rsi movq $-1, %rsi
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif
call do_nmi call do_nmi
/*
* Unconditionally restore CR3. I know we return to
* kernel code that needs user CR3, but do we ever return
* to "user mode" where we need the kernel CR3?
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif
/* /*
* Return back to user mode. We must *not* do the normal exit * Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts. Fortunately, * work, because we don't want to enable interrupts. Do not
* do_nmi doesn't modify pt_regs. * switch to user CR3: we might be going back to kernel code
* that had a user CR3 set.
*/ */
SWITCH_USER_CR3
SWAPGS SWAPGS
jmp restore_c_regs_and_iret jmp restore_c_regs_and_iret
...@@ -1459,23 +1508,54 @@ end_repeat_nmi: ...@@ -1459,23 +1508,54 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK ALLOC_PT_GPREGS_ON_STACK
/* /*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit * Use the same approach as paranoid_entry to handle SWAPGS, but
* as we should not be calling schedule in NMI context. * without CR3 handling since we do that differently in NMIs. No
* Even with normal interrupts enabled. An NMI should not be * need to use paranoid_exit as we should not be calling schedule
* setting NEED_RESCHED or anything that normal interrupts and * in NMI context. Even with normal interrupts enabled. An NMI
* exceptions might do. * should not be setting NEED_RESCHED or anything that normal
* interrupts and exceptions might do.
*/ */
call paranoid_entry cld
SAVE_C_REGS
SAVE_EXTRA_REGS
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1:
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi movq %rsp, %rdi
addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
movq $-1, %rsi movq $-1, %rsi
call do_nmi call do_nmi
/*
* Unconditionally restore CR3. We might be returning to
* kernel code that needs user CR3, like just just before
* a sysret.
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif
testl %ebx, %ebx /* swapgs needed? */ testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore jnz nmi_restore
nmi_swapgs: nmi_swapgs:
SWITCH_USER_CR3_NO_STACK /* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK SWAPGS_UNSAFE_STACK
nmi_restore: nmi_restore:
RESTORE_EXTRA_REGS RESTORE_EXTRA_REGS
......
...@@ -16,13 +16,17 @@ ...@@ -16,13 +16,17 @@
.macro _SWITCH_TO_KERNEL_CR3 reg .macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg andq $(~0x1000), \reg
#endif
movq \reg, %cr3 movq \reg, %cr3
.endm .endm
.macro _SWITCH_TO_USER_CR3 reg .macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg orq $(0x1000), \reg
#endif
movq \reg, %cr3 movq \reg, %cr3
.endm .endm
...@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax ...@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm .endm
#endif /* CONFIG_KAISER */ #endif /* CONFIG_KAISER */
#else /* __ASSEMBLY__ */ #else /* __ASSEMBLY__ */
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
// Upon kernel/user mode switch, it may happen that /*
// the address space has to be switched before the registers have been stored. * Upon kernel/user mode switch, it may happen that the address
// To change the address space, another register is needed. * space has to be switched before the registers have been
// A register therefore has to be stored/restored. * stored. To change the address space, another register is
// * needed. A register therefore has to be stored/restored.
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); */
#endif /* CONFIG_KAISER */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/** /**
* shadowmem_add_mapping - map a virtual memory part to the shadow mapping * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range * @addr: the start address of the range
* @size: the size of the range * @size: the size of the range
* @flags: The mapping flags of the pages * @flags: The mapping flags of the pages
* *
* the mapping is done on a global scope, so no bigger synchronization has to be done. * The mapping is done on a global scope, so no bigger
* the pages have to be manually unmapped again when they are not needed any longer. * synchronization has to be done. the pages have to be
* manually unmapped again when they are not needed any longer.
*/ */
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
/** /**
* shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
* @addr: the start address of the range * @addr: the start address of the range
* @size: the size of the range * @size: the size of the range
*/ */
extern void kaiser_remove_mapping(unsigned long start, unsigned long size); extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
/** /**
* shadowmem_initialize_mapping - Initalize the shadow mapping * kaiser_initialize_mapping - Initalize the shadow mapping
* *
* most parts of the shadow mapping can be mapped upon boot time. * Most parts of the shadow mapping can be mapped upon boot
* only the thread stacks have to be mapped on runtime. * time. Only per-process things like the thread stacks
* the mapped regions are not unmapped at all. * or a new LDT have to be mapped at runtime. These boot-
* time mappings are permanent and nevertunmapped.
*/ */
extern void kaiser_init(void); extern void kaiser_init(void);
#endif #endif /* CONFIG_KAISER */
#endif /* __ASSEMBLY */
......
...@@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) ...@@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd) static inline int pgd_bad(pgd_t pgd)
{ {
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; pgdval_t ignore_flags = _PAGE_USER;
/*
* We set NX on KAISER pgds that map userspace memory so
* that userspace can not meaningfully use the kernel
* page table by accident; it will fault on the first
* instruction it tries to run. See native_set_pgd().
*/
if (IS_ENABLED(CONFIG_KAISER))
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
} }
static inline int pgd_none(pgd_t pgd) static inline int pgd_none(pgd_t pgd)
...@@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) ...@@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{ {
memcpy(dst, src, count * sizeof(pgd_t)); memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
// clone the shadow pgd part as well /* Clone the shadow pgd part as well */
memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); memcpy(native_get_shadow_pgd(dst),
native_get_shadow_pgd(src),
count * sizeof(pgd_t));
#endif #endif
} }
......
...@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud) ...@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
} }
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
} }
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
} }
#else
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
BUILD_BUG_ON(1);
return NULL;
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return pgdp;
}
#endif /* CONFIG_KAISER */ #endif /* CONFIG_KAISER */
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
* This returns true for user pages that need to get copied into
* both the user and kernel copies of the page tables, and false
* for kernel pages that should only be in the kernel copy.
*/
static inline bool is_userspace_pgd(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;
return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
}
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{ {
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
// We know that a pgd is page aligned. pteval_t extra_kern_pgd_flags = 0;
// Therefore the lower indices have to be mapped to user space. /* Do we need to also populate the shadow pgd? */
// These pages are mapped to the shadow mapping. if (is_userspace_pgd(pgdp)) {
if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
/*
* Even if the entry is *mapping* userspace, ensure
* that userspace can not use it. This way, if we
* get out to userspace running on the kernel CR3,
* userspace will crash instead of running.
*/
extra_kern_pgd_flags = _PAGE_NX;
} }
pgdp->pgd = pgd.pgd;
pgdp->pgd = pgd.pgd & ~_PAGE_USER; pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */ #else /* CONFIG_KAISER */
*pgdp = pgd; *pgdp = pgd;
#endif #endif
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
#define _PAGE_GLOBAL (_AT(pteval_t, 0)) #define _PAGE_GLOBAL (_AT(pteval_t, 0))
#else #else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif #endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
...@@ -93,11 +93,7 @@ ...@@ -93,11 +93,7 @@
#define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_NX (_AT(pteval_t, 0))
#endif #endif
#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE (_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY) _PAGE_ACCESSED | _PAGE_DIRTY)
......
...@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) ...@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */ /* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
#ifdef CONFIG_KAISER /*
// add the esp stack pud to the shadow mapping here. * Just copy the top-level PGD that is mapping the espfix
// This can be done directly, because the fixup stack has its own pud * area to ensure it is mapped into the shadow user page
set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); * tables.
#endif */
if (IS_ENABLED(CONFIG_KAISER))
set_pgd(native_get_shadow_pgd(pgd_p),
__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
/* Randomize the locations */ /* Randomize the locations */
init_espfix_random(); init_espfix_random();
......
...@@ -442,11 +442,24 @@ early_idt_ripmsg: ...@@ -442,11 +442,24 @@ early_idt_ripmsg:
GLOBAL(name) GLOBAL(name)
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
* strictly *need* the second page, but this allows us to
* have a single set_pgd() implementation that does not
* need to worry about whether it has 4k or 8k to work
* with.
*
* This ensures PGDs are 8k long:
*/
#define KAISER_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \ #define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \ .balign 2 * PAGE_SIZE; \
GLOBAL(name) GLOBAL(name)
#else #else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) #define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define KAISER_USER_PGD_FILL 0
#endif #endif
/* Automate the creation of 1 to 1 mapping pmd entries */ /* Automate the creation of 1 to 1 mapping pmd entries */
...@@ -461,6 +474,7 @@ GLOBAL(name) ...@@ -461,6 +474,7 @@ GLOBAL(name)
NEXT_PGD_PAGE(early_level4_pgt) NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0 .fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts) NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
...@@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts) ...@@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts)
#ifndef CONFIG_XEN #ifndef CONFIG_XEN
NEXT_PGD_PAGE(init_level4_pgt) NEXT_PGD_PAGE(init_level4_pgt)
.fill 2*512,8,0 .fill 512,8,0
.fill KAISER_USER_PGD_FILL,8,0
#else #else
NEXT_PGD_PAGE(init_level4_pgt) NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
...@@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt) ...@@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
.org init_level4_pgt + L4_START_KERNEL*8, 0 .org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt) NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
...@@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt) ...@@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
*/ */
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif #endif
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_kernel_pgt) NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0 .fill L3_START_KERNEL,8,0
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <asm/ldt.h> #include <asm/ldt.h>
#include <asm/kaiser.h>
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/mmu_context.h> #include <asm/mmu_context.h>
#include <asm/syscalls.h> #include <asm/syscalls.h>
...@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) ...@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(pc->ldt->entries, pc->ldt->size); set_ldt(pc->ldt->entries, pc->ldt->size);
} }
static void __free_ldt_struct(struct ldt_struct *ldt)
{
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size) static struct ldt_struct *alloc_ldt_struct(int size)
{ {
struct ldt_struct *new_ldt; struct ldt_struct *new_ldt;
int alloc_size; int alloc_size;
int ret = 0;
if (size > LDT_ENTRIES) if (size > LDT_ENTRIES)
return NULL; return NULL;
...@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size) ...@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size)
return NULL; return NULL;
} }
// FIXME: make kaiser_add_mapping() return an error code
// when it fails
kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
__PAGE_KERNEL);
if (ret) {
__free_ldt_struct(new_ldt);
return NULL;
}
new_ldt->size = size; new_ldt->size = size;
return new_ldt; return new_ldt;
} }
...@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) ...@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt)) if (likely(!ldt))
return; return;
kaiser_remove_mapping((unsigned long)ldt->entries,
ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size); paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) __free_ldt_struct(ldt);
vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
} }
/* /*
......
...@@ -9,10 +9,12 @@ ...@@ -9,10 +9,12 @@
#include <linux/atomic.h> #include <linux/atomic.h>
atomic_t trace_idt_ctr = ATOMIC_INIT(0); atomic_t trace_idt_ctr = ATOMIC_INIT(0);
__aligned(PAGE_SIZE)
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
(unsigned long) trace_idt_table }; (unsigned long) trace_idt_table };
/* No need to be aligned, but done to keep all IDTs defined the same way. */ /* No need to be aligned, but done to keep all IDTs defined the same way. */
__aligned(PAGE_SIZE)
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
static int trace_irq_vector_refcount; static int trace_irq_vector_refcount;
......
#include <linux/bug.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/bug.h> #include <linux/bug.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <asm/kaiser.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
#include <asm/desc.h> #include <asm/desc.h>
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
* be populating the same addresses, so we only need to ensure
* that we protect between two CPUs trying to allocate and
* populate the same page table page.
*
* Only take this lock when doing a set_p[4um]d(), but it is not
* needed for doing a set_pte(). We assume that only the *owner*
* of a given allocation will be doing this for _their_
* allocation.
*
* This ensures that once a system has been running for a while
* and there have been stacks all over and these page tables
* are fully populated, there will be no further acquisitions of
* this lock.
*/
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
/** /*
* Get the real ppn from a address in kernel mapping. * Returns -1 on error.
* @param address The virtual adrress
* @return the physical address
*/ */
static inline unsigned long get_pa_from_mapping (unsigned long address) static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{ {
pgd_t *pgd; pgd_t *pgd;
pud_t *pud; pud_t *pud;
pmd_t *pmd; pmd_t *pmd;
pte_t *pte; pte_t *pte;
pgd = pgd_offset_k(address); pgd = pgd_offset_k(vaddr);
BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); /*
* We made all the kernel PGDs present in kaiser_init().
pud = pud_offset(pgd, address); * We expect them to stay that way.
BUG_ON(pud_none(*pud)); */
BUG_ON(pgd_none(*pgd));
/*
* PGDs are either 512GB or 128TB on all x86_64
* configurations. We don't handle these.
*/
BUG_ON(pgd_large(*pgd));
if (pud_large(*pud)) { pud = pud_offset(pgd, vaddr);
return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); if (pud_none(*pud)) {
WARN_ON_ONCE(1);
return -1;
} }
pmd = pmd_offset(pud, address); if (pud_large(*pud))
BUG_ON(pmd_none(*pmd)); return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
if (pmd_large(*pmd)) { pmd = pmd_offset(pud, vaddr);
return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); if (pmd_none(*pmd)) {
WARN_ON_ONCE(1);
return -1;
} }
pte = pte_offset_kernel(pmd, address); if (pmd_large(*pmd))
BUG_ON(pte_none(*pte)); return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); pte = pte_offset_kernel(pmd, vaddr);
if (pte_none(*pte)) {
WARN_ON_ONCE(1);
return -1;
}
return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
} }
void _kaiser_copy (unsigned long start_addr, unsigned long size, /*
unsigned long flags) * This is a relatively normal page table walk, except that it
* also tries to allocate page tables pages along the way.
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
{ {
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd; pmd_t *pmd;
pte_t *pte; pud_t *pud;
unsigned long address; pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
unsigned long end_addr = start_addr + size; gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
unsigned long target_address;
for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); might_sleep();
address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { if (is_atomic) {
target_address = get_pa_from_mapping(address); gfp &= ~GFP_KERNEL;
gfp |= __GFP_HIGH | __GFP_ATOMIC;
}
pgd = native_get_shadow_pgd(pgd_offset_k(address)); if (pgd_none(*pgd)) {
WARN_ONCE(1, "All shadow pgds should have been populated");
return NULL;
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); pud = pud_offset(pgd, address);
BUG_ON(pgd_large(*pgd)); /* The shadow page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pud_none(*pud))
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
else
free_page(new_pmd_page);
spin_unlock(&shadow_table_allocation_lock);
}
pud = pud_offset(pgd, address); pmd = pmd_offset(pud, address);
if (pud_none(*pud)) { /* The shadow page tables do not use large mappings: */
set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); if (pmd_large(*pmd)) {
} WARN_ON(1);
BUG_ON(pud_large(*pud)); return NULL;
}
if (pmd_none(*pmd)) {
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
spin_lock(&shadow_table_allocation_lock);
if (pmd_none(*pmd))
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
else
free_page(new_pte_page);
spin_unlock(&shadow_table_allocation_lock);
}
pmd = pmd_offset(pud, address); return pte_offset_kernel(pmd, address);
if (pmd_none(*pmd)) { }
set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
}
BUG_ON(pmd_large(*pmd));
pte = pte_offset_kernel(pmd, address); int kaiser_add_user_map(const void *__start_addr, unsigned long size,
unsigned long flags)
{
int ret = 0;
pte_t *pte;
unsigned long start_addr = (unsigned long )__start_addr;
unsigned long address = start_addr & PAGE_MASK;
unsigned long end_addr = PAGE_ALIGN(start_addr + size);
unsigned long target_address;
for (;address < end_addr; address += PAGE_SIZE) {
target_address = get_pa_from_mapping(address);
if (target_address == -1) {
ret = -EIO;
break;
}
pte = kaiser_pagetable_walk(address, false);
if (pte_none(*pte)) { if (pte_none(*pte)) {
set_pte(pte, __pte(flags | target_address)); set_pte(pte, __pte(flags | target_address));
} else { } else {
BUG_ON(__pa(pte_page(*pte)) != target_address); pte_t tmp;
set_pte(&tmp, __pte(flags | target_address));
WARN_ON_ONCE(!pte_same(*pte, tmp));
} }
} }
return ret;
}
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
unsigned long size = end - start;
return kaiser_add_user_map(start, size, flags);
} }
// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping /*
static inline void __init _kaiser_init(void) * Ensure that the top level of the (shadow) page tables are
* entirely populated. This ensures that all processes that get
* forked have the same entries. This way, we do not have to
* ever go set up new entries in older processes.
*
* Note: we never free these, so there are no updates to them
* after this.
*/
static void __init kaiser_init_all_pgds(void)
{ {
pgd_t *pgd; pgd_t *pgd;
int i = 0; int i = 0;
pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); pgd_t new_pgd;
pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
if (!pud) {
WARN_ON(1);
break;
}
new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
/*
* Make sure not to stomp on some other pgd entry.
*/
if (!pgd_none(pgd[i])) {
WARN_ON(1);
continue;
}
set_pgd(pgd + i, new_pgd);
} }
} }
#define kaiser_add_user_map_early(start, size, flags) do { \
int __ret = kaiser_add_user_map(start, size, flags); \
WARN_ON(__ret); \
} while (0)
#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
WARN_ON(__ret); \
} while (0)
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
spinlock_t shadow_table_lock; /*
* If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we
* will have most of the kernel up by then and should be able to
* get a clean warning out of it. If we BUG_ON() here, we run
* the risk of being before we have good console output.
*/
void __init kaiser_init(void) void __init kaiser_init(void)
{ {
int cpu; int cpu;
spin_lock_init(&shadow_table_lock);
spin_lock(&shadow_table_lock);
_kaiser_init(); kaiser_init_all_pgds();
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
// map the per cpu user variables void *percpu_vaddr = __per_cpu_user_mapped_start +
_kaiser_copy( per_cpu_offset(cpu);
(unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), unsigned long percpu_sz = __per_cpu_user_mapped_end -
(unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, __per_cpu_user_mapped_start;
__PAGE_KERNEL); kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
__PAGE_KERNEL);
} }
// map the entry/exit text section, which is responsible to switch between user- and kernel mode /*
_kaiser_copy( * Map the entry/exit text section, which is needed at
(unsigned long) __entry_text_start, * switches from user to and from kernel.
(unsigned long) __entry_text_end - (unsigned long) __entry_text_start, */
__PAGE_KERNEL_RX); kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
__PAGE_KERNEL_RX);
// the fixed map address of the idt_table #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
_kaiser_copy( kaiser_add_user_map_ptrs_early(__irqentry_text_start,
(unsigned long) idt_descr.address, __irqentry_text_end,
sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL_RX);
__PAGE_KERNEL_RO); #endif
kaiser_add_user_map_early((void *)idt_descr.address,
spin_unlock(&shadow_table_lock); sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
kaiser_add_user_map_early(&trace_idt_descr,
sizeof(trace_idt_descr),
__PAGE_KERNEL);
kaiser_add_user_map_early(&trace_idt_table,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL);
#endif
kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
__PAGE_KERNEL);
kaiser_add_user_map_early(&debug_idt_table,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL);
} }
extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
// add a mapping to the shadow-mapping, and synchronize the mappings // add a mapping to the shadow-mapping, and synchronize the mappings
void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{ {
spin_lock(&shadow_table_lock); return kaiser_add_user_map((const void *)addr, size, flags);
_kaiser_copy(addr, size, flags);
spin_unlock(&shadow_table_lock);
} }
extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
void kaiser_remove_mapping(unsigned long start, unsigned long size) void kaiser_remove_mapping(unsigned long start, unsigned long size)
{ {
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); unsigned long end = start + size;
spin_lock(&shadow_table_lock); unsigned long addr;
do {
unmap_pud_range(pgd, start, start + size); for (addr = start; addr < end; addr += PGDIR_SIZE) {
} while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
spin_unlock(&shadow_table_lock); /*
* unmap_p4d_range() handles > P4D_SIZE unmaps,
* so no need to trim 'end'.
*/
unmap_pud_range_nofree(pgd, addr, end);
}
} }
#endif /* CONFIG_KAISER */ #endif /* CONFIG_KAISER */
...@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); ...@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1 #define CPA_FLUSHTLB 1
#define CPA_ARRAY 2 #define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4 #define CPA_PAGES_ARRAY 4
#define CPA_FREE_PAGETABLES 8
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM]; static unsigned long direct_pages_count[PG_LEVEL_NUM];
...@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, ...@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0; return 0;
} }
static bool try_to_free_pte_page(pte_t *pte) static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{ {
int i; int i;
if (!(cpa->flags & CPA_FREE_PAGETABLES))
return false;
for (i = 0; i < PTRS_PER_PTE; i++) for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i])) if (!pte_none(pte[i]))
return false; return false;
...@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte) ...@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
return true; return true;
} }
static bool try_to_free_pmd_page(pmd_t *pmd) static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{ {
int i; int i;
if (!(cpa->flags & CPA_FREE_PAGETABLES))
return false;
for (i = 0; i < PTRS_PER_PMD; i++) for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i])) if (!pmd_none(pmd[i]))
return false; return false;
...@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud) ...@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
return true; return true;
} }
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
unsigned long start,
unsigned long end)
{ {
pte_t *pte = pte_offset_kernel(pmd, start); pte_t *pte = pte_offset_kernel(pmd, start);
...@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) ...@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
pte++; pte++;
} }
if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd); pmd_clear(pmd);
return true; return true;
} }
return false; return false;
} }
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end) unsigned long start, unsigned long end)
{ {
if (unmap_pte_range(pmd, start, end)) if (unmap_pte_range(cpa, pmd, start, end))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud); pud_clear(pud);
} }
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
unsigned long start, unsigned long end)
{ {
pmd_t *pmd = pmd_offset(pud, start); pmd_t *pmd = pmd_offset(pud, start);
...@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) ...@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page); unsigned long pre_end = min_t(unsigned long, end, next_page);
__unmap_pmd_range(pud, pmd, start, pre_end); __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
start = pre_end; start = pre_end;
pmd++; pmd++;
...@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) ...@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
if (pmd_large(*pmd)) if (pmd_large(*pmd))
pmd_clear(pmd); pmd_clear(pmd);
else else
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); __unmap_pmd_range(cpa, pud, pmd,
start, start + PMD_SIZE);
start += PMD_SIZE; start += PMD_SIZE;
pmd++; pmd++;
...@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) ...@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* 4K leftovers? * 4K leftovers?
*/ */
if (start < end) if (start < end)
return __unmap_pmd_range(pud, pmd, start, end); return __unmap_pmd_range(cpa, pud, pmd, start, end);
/* /*
* Try again to free the PMD page if haven't succeeded above. * Try again to free the PMD page if haven't succeeded above.
*/ */
if (!pud_none(*pud)) if (!pud_none(*pud))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud); pud_clear(pud);
} }
void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
unsigned long start,
unsigned long end)
{ {
pud_t *pud = pud_offset(pgd, start); pud_t *pud = pud_offset(pgd, start);
...@@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ...@@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page); unsigned long pre_end = min_t(unsigned long, end, next_page);
unmap_pmd_range(pud, start, pre_end); unmap_pmd_range(cpa, pud, start, pre_end);
start = pre_end; start = pre_end;
pud++; pud++;
...@@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ...@@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
if (pud_large(*pud)) if (pud_large(*pud))
pud_clear(pud); pud_clear(pud);
else else
unmap_pmd_range(pud, start, start + PUD_SIZE); unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
start += PUD_SIZE; start += PUD_SIZE;
pud++; pud++;
...@@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ...@@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
* 2M leftovers? * 2M leftovers?
*/ */
if (start < end) if (start < end)
unmap_pmd_range(pud, start, end); unmap_pmd_range(cpa, pud, start, end);
/* /*
* No need to try to free the PUD page because we'll free it in * No need to try to free the PUD page because we'll free it in
...@@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ...@@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/ */
} }
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
struct cpa_data cpa = {
.flags = CPA_FREE_PAGETABLES,
};
__unmap_pud_range(&cpa, pgd, start, end);
}
void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
{
struct cpa_data cpa = {
.flags = 0,
};
__unmap_pud_range(&cpa, pgd, start, end);
}
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{ {
pgd_t *pgd_entry = root + pgd_index(addr); pgd_t *pgd_entry = root + pgd_index(addr);
......
...@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd) ...@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd); kmem_cache_free(pgd_cache, pgd);
} }
#else #else
static inline pgd_t *_pgd_alloc(void)
{
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
// Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory /*
// block. Therefore, we have to allocate at least 3 pages. However, the * Instead of one pmd, we aquire two pmds. Being order-1, it is
// __get_free_pages returns us 4 pages. Hence, we store the base pointer at * both 8k in size and 8k-aligned. That lets us just flip bit 12
// the beginning of the page of our 8kb-aligned memory block in order to * in a pointer to swap between the two 4k halves.
// correctly free it afterwars. */
#define PGD_ALLOCATION_ORDER 1
unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
{
*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
return (pgd_t *) pages;
}
else
{
*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
return (pgd_t *) (pages + PAGE_SIZE);
}
#else #else
return (pgd_t *)__get_free_page(PGALLOC_GFP); #define PGD_ALLOCATION_ORDER 0
#endif #endif
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
} }
static inline void _pgd_free(pgd_t *pgd) static inline void _pgd_free(pgd_t *pgd)
{ {
#ifdef CONFIG_KAISER free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
free_pages(pages, get_order(4*PAGE_SIZE));
#else
free_page((unsigned long)pgd);
#endif
} }
#endif /* CONFIG_X86_PAE */ #endif /* CONFIG_X86_PAE */
......
#ifndef _INCLUDE_KAISER_H
#define _INCLUDE_KAISER_H
#ifdef CONFIG_KAISER
#include <asm/kaiser.h>
#else
/*
* These stubs are used whenever CONFIG_KAISER is off, which
* includes architectures that support KAISER, but have it
* disabled.
*/
static inline void kaiser_init(void)
{
}
static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
}
static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
return 0;
}
#endif /* !CONFIG_KAISER */
#endif /* _INCLUDE_KAISER_H */
...@@ -58,6 +58,7 @@ ...@@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h> #include <linux/tsacct_kern.h>
#include <linux/cn_proc.h> #include <linux/cn_proc.h>
#include <linux/freezer.h> #include <linux/freezer.h>
#include <linux/kaiser.h>
#include <linux/delayacct.h> #include <linux/delayacct.h>
#include <linux/taskstats_kern.h> #include <linux/taskstats_kern.h>
#include <linux/random.h> #include <linux/random.h>
...@@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) ...@@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */ *stackend = STACK_END_MAGIC; /* for overflow detection */
} }
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
static struct task_struct *dup_task_struct(struct task_struct *orig, int node) static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{ {
struct task_struct *tsk; struct task_struct *tsk;
...@@ -357,9 +357,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) ...@@ -357,9 +357,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
goto free_ti; goto free_ti;
tsk->stack = ti; tsk->stack = ti;
#ifdef CONFIG_KAISER
kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
#endif if (err)
goto free_ti;
#ifdef CONFIG_SECCOMP #ifdef CONFIG_SECCOMP
/* /*
* We must handle setting up seccomp filters once we're under * We must handle setting up seccomp filters once we're under
......
...@@ -32,12 +32,17 @@ config SECURITY ...@@ -32,12 +32,17 @@ config SECURITY
If you are unsure how to answer this question, answer N. If you are unsure how to answer this question, answer N.
config KAISER config KAISER
bool "Remove the kernel mapping in user mode" bool "Remove the kernel mapping in user mode"
default y
depends on X86_64 depends on X86_64
depends on !PARAVIRT depends on !PARAVIRT
help help
This enforces a strict kernel and user space isolation in order to close This enforces a strict kernel and user space isolation in order to close
hardware side channels on kernel address information. hardware side channels on kernel address information.
config KAISER_REAL_SWITCH
bool "KAISER: actually switch page tables"
default y
config SECURITYFS config SECURITYFS
bool "Enable the securityfs filesystem" bool "Enable the securityfs filesystem"
help help
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment