Commit bed9bb7f authored by Dave Hansen, committed by Greg Kroah-Hartman

kaiser: merged update


Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging).
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 8a43ddfb
......@@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
/*
......@@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call)
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
......@@ -1059,6 +1080,13 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
/*
* error_entry() always returns with a kernel gsbase and
* CR3. We must also have a kernel CR3/gsbase before
* calling TRACE_IRQS_*. Just unconditionally switch to
* the kernel CR3 here.
*/
SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
......@@ -1069,7 +1097,6 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
SWITCH_KERNEL_CR3
.Lerror_entry_from_usermode_after_swapgs:
/*
......@@ -1122,7 +1149,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
SWITCH_KERNEL_CR3
/*
* Pretend that the exception came from user mode: set up pt_regs
* as if we faulted immediately after IRET and clear EBX so that
......@@ -1222,7 +1249,10 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
SWITCH_KERNEL_CR3_NO_STACK
/*
* percpu variables are mapped with user CR3, so no need
* to switch CR3 here.
*/
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
......@@ -1256,14 +1286,33 @@ ENTRY(nmi)
movq %rsp, %rdi
movq $-1, %rsi
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif
call do_nmi
/*
* Unconditionally restore CR3. I know we return to
* kernel code that needs user CR3, but do we ever return
* to "user mode" where we need the kernel CR3?
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif
/*
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts. Fortunately,
* do_nmi doesn't modify pt_regs.
* work, because we don't want to enable interrupts. Do not
* switch to user CR3: we might be going back to kernel code
* that had a user CR3 set.
*/
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
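For reference, the CR3 discipline in the NMI path above boils down to the following C-like sketch (illustrative only, not part of the patch; read_cr3()/write_cr3() stand in for the inline mov instructions, and the 0x1000 mask is only meaningful when CONFIG_KAISER_REAL_SWITCH is enabled):

static void demo_nmi_cr3_discipline(struct pt_regs *regs)
{
	unsigned long saved_cr3 = read_cr3();	/* kernel *or* user copy, whatever the NMI hit */

	write_cr3(saved_cr3 & ~0x1000UL);	/* force the kernel half for do_nmi() */
	do_nmi(regs, -1);
	write_cr3(saved_cr3);			/* put back whichever CR3 was interrupted */
}

The point is that the NMI must restore the interrupted CR3 rather than blindly switching to the user copy, because an NMI can land on kernel code that legitimately runs with the user CR3 (for example, just before SYSRET).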
......@@ -1459,23 +1508,54 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK
/*
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
* Use the same approach as paranoid_entry to handle SWAPGS, but
* without CR3 handling since we do that differently in NMIs. No
* need to use paranoid_exit as we should not be calling schedule
* in NMI context, even with normal interrupts enabled: an NMI
* should not be setting NEED_RESCHED or anything that normal
* interrupts and exceptions might do.
*/
call paranoid_entry
cld
SAVE_C_REGS
SAVE_EXTRA_REGS
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1:
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), %rax
#endif
movq %rax, %cr3
#endif
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
movq $-1, %rsi
call do_nmi
/*
* Unconditionally restore CR3. We might be returning to
* kernel code that needs user CR3, like just before
* a sysret.
*/
#ifdef CONFIG_KAISER
popq %rax
mov %rax, %cr3
#endif
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
SWITCH_USER_CR3_NO_STACK
/* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
......
......@@ -16,13 +16,17 @@
.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg
#endif
movq \reg, %cr3
.endm
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg
#endif
movq \reg, %cr3
.endm
......@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm
#endif /* CONFIG_KAISER */
#else /* __ASSEMBLY__ */
#ifdef CONFIG_KAISER
// Upon kernel/user mode switch, it may happen that
// the address space has to be switched before the registers have been stored.
// To change the address space, another register is needed.
// A register therefore has to be stored/restored.
//
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* Upon kernel/user mode switch, it may happen that the address
* space has to be switched before the registers have been
* stored. To change the address space, another register is
* needed. A register therefore has to be stored/restored.
*/
#endif /* CONFIG_KAISER */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/**
* shadowmem_add_mapping - map a virtual memory part to the shadow mapping
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
* @size: the size of the range
* @flags: The mapping flags of the pages
*
* the mapping is done on a global scope, so no bigger synchronization has to be done.
* the pages have to be manually unmapped again when they are not needed any longer.
* The mapping is done on a global scope, so no further
* synchronization has to be done. The pages have to be
* manually unmapped again when they are no longer needed.
*/
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
/**
* shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
* kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
* @addr: the start address of the range
* @size: the size of the range
*/
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
/**
* shadowmem_initialize_mapping - Initalize the shadow mapping
* kaiser_initialize_mapping - Initialize the shadow mapping
*
* most parts of the shadow mapping can be mapped upon boot time.
* only the thread stacks have to be mapped on runtime.
* the mapped regions are not unmapped at all.
* Most parts of the shadow mapping can be mapped at boot
* time. Only per-process things like the thread stacks
* or a new LDT have to be mapped at runtime. These boot-
* time mappings are permanent and never unmapped.
*/
extern void kaiser_init(void);
#endif
#endif /* CONFIG_KAISER */
#endif /* __ASSEMBLY */
......
......@@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
pgdval_t ignore_flags = _PAGE_USER;
/*
* We set NX on KAISER pgds that map userspace memory so
* that userspace can not meaningfully use the kernel
* page table by accident; it will fault on the first
* instruction it tries to run. See native_set_pgd().
*/
if (IS_ENABLED(CONFIG_KAISER))
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
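A worked example of why the extra ignore bit is needed (illustrative only; macro values come from pgtable_types.h, and native_set_pgd() in the pgtable_64.h hunk further down is what writes the NX bit):

static inline int demo_kaiser_pgd_bad_example(void)
{
	/* Roughly what native_set_pgd() stores in the kernel copy for a user area. */
	pgdval_t entry = _PAGE_TABLE | _PAGE_NX;

	/* Old check: the NX bit alone makes a perfectly valid entry look "bad". */
	int old_bad = (entry & ~_PAGE_USER) != _KERNPG_TABLE;			/* 1 */

	/* New check: NX is part of ignore_flags under KAISER, so the entry passes. */
	int new_bad = (entry & ~(_PAGE_USER | _PAGE_NX)) != _KERNPG_TABLE;	/* 0 */

	return old_bad && !new_bad;
}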
static inline int pgd_none(pgd_t pgd)
......@@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
// clone the shadow pgd part as well
memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
/* Clone the shadow pgd part as well */
memcpy(native_get_shadow_pgd(dst),
native_get_shadow_pgd(src),
count * sizeof(pgd_t));
#endif
}
......
......@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
}
#ifdef CONFIG_KAISER
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
}
#else
static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
{
BUILD_BUG_ON(1);
return NULL;
}
static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
{
return pgdp;
}
#endif /* CONFIG_KAISER */
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
* This returns true for user pages that need to get copied into
* both the user and kernel copies of the page tables, and false
* for kernel pages that should only be in the kernel copy.
*/
static inline bool is_userspace_pgd(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;
return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
}
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
// We know that a pgd is page aligned.
// Therefore the lower indices have to be mapped to user space.
// These pages are mapped to the shadow mapping.
if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
pteval_t extra_kern_pgd_flags = 0;
/* Do we need to also populate the shadow pgd? */
if (is_userspace_pgd(pgdp)) {
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
/*
* Even if the entry is *mapping* userspace, ensure
* that userspace can not use it. This way, if we
* get out to userspace running on the kernel CR3,
* userspace will crash instead of running.
*/
extra_kern_pgd_flags = _PAGE_NX;
}
pgdp->pgd = pgd.pgd & ~_PAGE_USER;
pgdp->pgd = pgd.pgd;
pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */
*pgdp = pgd;
#endif
......
......@@ -42,7 +42,7 @@
#ifdef CONFIG_KAISER
#define _PAGE_GLOBAL (_AT(pteval_t, 0))
#else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
......@@ -93,11 +93,7 @@
#define _PAGE_NX (_AT(pteval_t, 0))
#endif
#ifdef CONFIG_KAISER
#define _PAGE_PROTNONE (_AT(pteval_t, 0))
#else
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#endif
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
......
......@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
#ifdef CONFIG_KAISER
// add the esp stack pud to the shadow mapping here.
// This can be done directly, because the fixup stack has its own pud
set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
#endif
/*
* Just copy the top-level PGD that is mapping the espfix
* area to ensure it is mapped into the shadow user page
* tables.
*/
if (IS_ENABLED(CONFIG_KAISER))
set_pgd(native_get_shadow_pgd(pgd_p),
__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
/* Randomize the locations */
init_espfix_random();
......
......@@ -442,11 +442,24 @@ early_idt_ripmsg:
GLOBAL(name)
#ifdef CONFIG_KAISER
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
* strictly *need* the second page, but this allows us to
* have a single set_pgd() implementation that does not
* need to worry about whether it has 4k or 8k to work
* with.
*
* This ensures PGDs are 8k long:
*/
#define KAISER_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define KAISER_USER_PGD_FILL 0
#endif
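As a quick sanity check on the fill value (not part of the patch): 512 entries of 8 bytes each is exactly 4096 bytes, i.e. the one extra 4k page that holds the shadow half of each 8k PGD.

/* Illustrative compile-time check of the arithmetic above. */
_Static_assert(512 * 8 == 4096,
	       "KAISER_USER_PGD_FILL pads each PGD out to a full second page");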
/* Automate the creation of 1 to 1 mapping pmd entries */
......@@ -461,6 +474,7 @@ GLOBAL(name)
NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
......@@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts)
#ifndef CONFIG_XEN
NEXT_PGD_PAGE(init_level4_pgt)
.fill 2*512,8,0
.fill 512,8,0
.fill KAISER_USER_PGD_FILL,8,0
#else
NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
......@@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
.org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
......@@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
.fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
......
......@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
#include <asm/kaiser.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
......@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(pc->ldt->entries, pc->ldt->size);
}
static void __free_ldt_struct(struct ldt_struct *ldt)
{
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
int ret = 0;
if (size > LDT_ENTRIES)
return NULL;
......@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size)
return NULL;
}
// FIXME: make kaiser_add_mapping() return an error code
// when it fails
ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
__PAGE_KERNEL);
if (ret) {
__free_ldt_struct(new_ldt);
return NULL;
}
new_ldt->size = size;
return new_ldt;
}
......@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
kaiser_remove_mapping((unsigned long)ldt->entries,
ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
__free_ldt_struct(ldt);
}
/*
......
......@@ -9,10 +9,12 @@
#include <linux/atomic.h>
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
__aligned(PAGE_SIZE)
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
(unsigned long) trace_idt_table };
/* No need to be aligned, but done to keep all IDTs defined the same way. */
__aligned(PAGE_SIZE)
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
static int trace_irq_vector_refcount;
......
This diff is collapsed.
......@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_FREE_PAGETABLES 8
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
......@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}
static bool try_to_free_pte_page(pte_t *pte)
static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{
int i;
if (!(cpa->flags & CPA_FREE_PAGETABLES))
return false;
for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i]))
return false;
......@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
return true;
}
static bool try_to_free_pmd_page(pmd_t *pmd)
static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{
int i;
if (!(cpa->flags & CPA_FREE_PAGETABLES))
return false;
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i]))
return false;
......@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
return true;
}
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
unsigned long start,
unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
......@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
pte++;
}
if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd);
return true;
}
return false;
}
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
if (unmap_pte_range(pmd, start, end))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
if (unmap_pte_range(cpa, pmd, start, end))
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
unsigned long start, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, start);
......@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
__unmap_pmd_range(pud, pmd, start, pre_end);
__unmap_pmd_range(cpa, pud, pmd, start, pre_end);
start = pre_end;
pmd++;
......@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
if (pmd_large(*pmd))
pmd_clear(pmd);
else
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
__unmap_pmd_range(cpa, pud, pmd,
start, start + PMD_SIZE);
start += PMD_SIZE;
pmd++;
......@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* 4K leftovers?
*/
if (start < end)
return __unmap_pmd_range(pud, pmd, start, end);
return __unmap_pmd_range(cpa, pud, pmd, start, end);
/*
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
unsigned long start,
unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
......@@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
unmap_pmd_range(pud, start, pre_end);
unmap_pmd_range(cpa, pud, start, pre_end);
start = pre_end;
pud++;
......@@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
if (pud_large(*pud))
pud_clear(pud);
else
unmap_pmd_range(pud, start, start + PUD_SIZE);
unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
start += PUD_SIZE;
pud++;
......@@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
* 2M leftovers?
*/
if (start < end)
unmap_pmd_range(pud, start, end);
unmap_pmd_range(cpa, pud, start, end);
/*
* No need to try to free the PUD page because we'll free it in
......@@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
struct cpa_data cpa = {
.flags = CPA_FREE_PAGETABLES,
};
__unmap_pud_range(&cpa, pgd, start, end);
}
void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
{
struct cpa_data cpa = {
.flags = 0,
};
__unmap_pud_range(&cpa, pgd, start, end);
}
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{
pgd_t *pgd_entry = root + pgd_index(addr);
......
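For orientation, a hypothetical caller of the new no-free variant (the real call sites are in the collapsed portion of this diff, so the function below is purely illustrative):

static void demo_shadow_unmap(pgd_t *shadow_pgd, unsigned long start,
			      unsigned long end)
{
	/*
	 * Clear the shadow mappings but keep the page-table pages
	 * themselves: only CPA_FREE_PAGETABLES distinguishes this
	 * from unmap_pud_range().
	 */
	unmap_pud_range_nofree(shadow_pgd, start, end);
}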
......@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
static inline pgd_t *_pgd_alloc(void)
{
#ifdef CONFIG_KAISER
// Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
// block. Therefore, we have to allocate at least 3 pages. However, the
// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
// the beginning of the page of our 8kb-aligned memory block in order to
// correctly free it afterwars.
unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
{
*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
return (pgd_t *) pages;
}
else
{
*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
return (pgd_t *) (pages + PAGE_SIZE);
}
/*
* Instead of one pgd, we acquire two pgds. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12
* in a pointer to swap between the two 4k halves.
*/
#define PGD_ALLOCATION_ORDER 1
#else
return (pgd_t *)__get_free_page(PGALLOC_GFP);
#define PGD_ALLOCATION_ORDER 0
#endif
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
#ifdef CONFIG_KAISER
unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
free_pages(pages, get_order(4*PAGE_SIZE));
#else
free_page((unsigned long)pgd);
#endif
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
......
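The order-1 trick works because the page allocator returns blocks that are naturally aligned to their own size: an 8k allocation is 8k-aligned, so the kernel and shadow PGDs differ only in bit 12 of their addresses, which is what native_get_shadow_pgd() exploits. A minimal sketch under that assumption (illustrative, not part of the patch):

static void demo_pgd_layout(void)
{
	pgd_t *pgd    = _pgd_alloc();			/* kernel copy: lower 4k half */
	pgd_t *shadow = native_get_shadow_pgd(pgd);	/* same block, bit 12 set: upper 4k half */

	/* ... populate both copies here ... */
	(void)shadow;

	_pgd_free(pgd);					/* releases both halves at once */
}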
#ifndef _INCLUDE_KAISER_H
#define _INCLUDE_KAISER_H
#ifdef CONFIG_KAISER
#include <asm/kaiser.h>
#else
/*
* These stubs are used whenever CONFIG_KAISER is off, which
* includes architectures that support KAISER, but have it
* disabled.
*/
static inline void kaiser_init(void)
{
}
static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
}
static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
return 0;
}
#endif /* !CONFIG_KAISER */
#endif /* _INCLUDE_KAISER_H */
......@@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
......@@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
......@@ -357,9 +357,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
goto free_ti;
tsk->stack = ti;
#ifdef CONFIG_KAISER
kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
#endif
err = kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
if (err)
goto free_ti;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
......
......@@ -32,12 +32,17 @@ config SECURITY
If you are unsure how to answer this question, answer N.
config KAISER
bool "Remove the kernel mapping in user mode"
default y
depends on X86_64
depends on !PARAVIRT
help
This enforces a strict kernel and user space isolation in order to close
hardware side channels on kernel address information.
config KAISER_REAL_SWITCH
bool "KAISER: actually switch page tables"
default y
config SECURITYFS
bool "Enable the securityfs filesystem"
help
......