Commit eac34119 authored by Linus Torvalds

Merge branch 'x86/pti' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 PTI updates from Thomas Gleixner:
 "The Speck brigade sadly provides yet another large set of patches
  destroying the performance which we carefully built and preserved

   - PTI support for 32bit PAE. The missing counterpart to the 64bit
     PTI code implemented by Joerg.

   - A set of fixes for the Global Bit mechanics for non-PCID CPUs which
     were setting the Global Bit too widely and therefore possibly
     exposing interesting memory needlessly.

   - Protection against userspace-userspace SpectreRSB

   - Support for the upcoming Enhanced IBRS mode, which is preferred
     over IBRS. Unfortunately we don't know the performance impact of
     this, but it's expected to be less horrible than the IBRS
     hammering.

   - Cleanups and simplifications"

* 'x86/pti' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  x86/mm/pti: Move user W+X check into pti_finalize()
  x86/relocs: Add __end_rodata_aligned to S_REL
  x86/mm/pti: Clone kernel-image on PTE level for 32 bit
  x86/mm/pti: Don't clear permissions in pti_clone_pmd()
  x86/mm/pti: Fix 32 bit PCID check
  x86/mm/init: Remove freed kernel image areas from alias mapping
  x86/mm/init: Add helper for freeing kernel image pages
  x86/mm/init: Pass unconverted symbol addresses to free_init_pages()
  mm: Allow non-direct-map arguments to free_reserved_area()
  x86/mm/pti: Clear Global bit more aggressively
  x86/speculation: Support Enhanced IBRS on future CPUs
  x86/speculation: Protect against userspace-userspace spectreRSB
  x86/kexec: Allocate 8k PGDs for PTI
  Revert "perf/core: Make sure the ring-buffer is mapped in all page-tables"
  x86/mm: Remove in_nmi() warning from vmalloc_fault()
  x86/entry/32: Check for VM86 mode in slow-path check
  perf/core: Make sure the ring-buffer is mapped in all page-tables
  x86/pti: Check the return value of pti_user_pagetable_walk_pmd()
  x86/pti: Check the return value of pti_user_pagetable_walk_p4d()
  x86/entry/32: Add debug code to check entry/exit CR3
  ...
parents d191c82d d878efce
......@@ -219,6 +219,7 @@
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
#define X86_FEATURE_IBRS_ENHANCED ( 7*32+29) /* Enhanced IBRS */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
......
......@@ -71,12 +71,7 @@ struct ldt_struct {
static inline void *ldt_slot_va(int slot)
{
#ifdef CONFIG_X86_64
return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
#else
BUG();
return (void *)fix_to_virt(FIX_HOLE);
#endif
}
/*
......
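
With 32-bit PTI the LDT remap exists on both word sizes, so ldt_slot_va() above drops its 32-bit BUG() branch and always computes LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot. A minimal sketch of that arithmetic; both constants below are hypothetical stand-ins, not the kernel's real values:

#include <stdint.h>
#include <stdio.h>

#define LDT_BASE_ADDR   0xff800000UL /* hypothetical remap base */
#define LDT_SLOT_STRIDE 0x00200000UL /* hypothetical per-slot stride */

static void *ldt_slot_va(int slot)
{
	/* Slots 0 and 1 alternate as a new LDT is installed. */
	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * (uintptr_t)slot);
}

int main(void)
{
	printf("slot 0 -> %p\n", ldt_slot_va(0));
	printf("slot 1 -> %p\n", ldt_slot_va(1));
	return 0;
}
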
......@@ -214,7 +214,7 @@ enum spectre_v2_mitigation {
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
SPECTRE_V2_IBRS,
SPECTRE_V2_IBRS_ENHANCED,
};
/* The Speculative Store Bypass disable variants */
......
......@@ -19,6 +19,9 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd);
#endif
*pmdp = pmd;
}
......@@ -58,6 +61,9 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0));
#endif
return __pmd(xchg((pmdval_t *)xp, 0));
}
#else
......@@ -67,6 +73,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
#ifdef CONFIG_SMP
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0));
#endif
return __pud(xchg((pudval_t *)xp, 0));
}
#else
......
......@@ -35,4 +35,7 @@ typedef union {
#define PTRS_PER_PTE 1024
/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
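
For orientation, PGD_KERNEL_START is the first top-level index that maps kernel space, and the arithmetic checks out by hand. A worked sketch, assuming the common VMSPLIT_3G split (CONFIG_PAGE_OFFSET = 0xC0000000):

#include <stdio.h>

#define PAGE_OFFSET 0xC0000000UL /* assumed VMSPLIT_3G value */

int main(void)
{
	/* 2-level paging: PGDIR_SHIFT == 22, 1024 PGD entries. */
	printf("2-level PGD_KERNEL_START: %lu of 1024\n", PAGE_OFFSET >> 22); /* 768 */
	/* 3-level PAE: PGDIR_SHIFT == 30, 4 PGD entries. */
	printf("3-level PGD_KERNEL_START: %lu of 4\n", PAGE_OFFSET >> 30);    /* 3 */
	return 0;
}
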
......@@ -98,6 +98,9 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
#endif
set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
}
......@@ -229,6 +232,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
{
union split_pud res, *orig = (union split_pud *)pudp;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
#endif
/* xchg acts as a barrier before setting of the high bits */
res.pud_low = xchg(&orig->pud_low, 0);
res.pud_high = orig->pud_high;
......
......@@ -21,9 +21,10 @@ typedef union {
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_PARAVIRT
#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \
(pv_info.shared_kernel_pmd)))
#else
#define SHARED_KERNEL_PMD 1
#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
#endif
/*
......@@ -45,5 +46,6 @@ typedef union {
#define PTRS_PER_PTE 512
#define MAX_POSSIBLE_PHYSMEM_BITS 36
#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
......@@ -30,11 +30,14 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
void ptdump_walk_pgd_level_checkwx(void);
void ptdump_walk_user_pgd_level_checkwx(void);
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
#else
#define debug_checkwx() do { } while (0)
#define debug_checkwx_user() do { } while (0)
#endif
/*
......@@ -640,8 +643,31 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);
/*
* Take a PGD location (pgdp) and a pgd value that needs to be set there.
* Populates the user and returns the resulting PGD that must be set in
* the kernel copy of the page tables.
*/
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return pgd;
return __pti_set_user_pgtbl(pgdp, pgd);
}
#else /* CONFIG_PAGE_TABLE_ISOLATION */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
#endif /* __ASSEMBLY__ */
#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
......@@ -1154,6 +1180,70 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
}
}
#endif
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
*
* Returns true for parts of the PGD that map userspace and
* false for the parts that map the kernel.
*/
static inline bool pgdp_maps_userspace(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;
return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
}
static inline int pgd_large(pgd_t pgd) { return 0; }
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
* (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
* the user one is in the last 4k. To switch between them, you
* just need to flip the 12th bit in their addresses.
*/
#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
/*
* This generates better code than the inline assembly in
* __set_bit().
*/
static inline void *ptr_set_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr |= BIT(bit);
return (void *)__ptr;
}
static inline void *ptr_clear_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr &= ~BIT(bit);
return (void *)__ptr;
}
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
......
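
Two ideas carry this hunk: pgdp_maps_userspace() classifies a PGD slot purely by its byte offset within the page, and the kernel/user table pair of an order-1 (8k-aligned) PGD differ only in bit 12 of their addresses. A user-space sketch of the bit flip, assuming 4k pages; aligned_alloc() stands in for the kernel's order-1 page allocation:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12
#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
#define BIT(n) (1UL << (n))

static void *kernel_to_user(void *pgdp)
{
	return (void *)((uintptr_t)pgdp | BIT(PTI_PGTABLE_SWITCH_BIT));
}

static void *user_to_kernel(void *pgdp)
{
	return (void *)((uintptr_t)pgdp & ~BIT(PTI_PGTABLE_SWITCH_BIT));
}

int main(void)
{
	/* 8k-aligned, 8k-sized block: kernel half at +0, user half at +4k. */
	void *pgd = aligned_alloc(2 * 4096, 2 * 4096);

	printf("kernel PGD: %p\n", pgd);
	printf("user   PGD: %p\n", kernel_to_user(pgd));
	printf("round trip: %p\n", user_to_kernel(kernel_to_user(pgd)));
	free(pgd);
	return 0;
}
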
......@@ -34,8 +34,6 @@ static inline void check_pgt_cache(void) { }
void paging_init(void);
void sync_initial_page_table(void);
static inline int pgd_large(pgd_t pgd) { return 0; }
/*
* Define this if things work differently on an i386 and an i486:
* it will (on an i486) warn about kernel memory accesses that are
......
......@@ -50,13 +50,18 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
& PMD_MASK)
#define PKMAP_BASE \
#define LDT_BASE_ADDR \
((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE)
#define PKMAP_BASE \
((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK)
#ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else
# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE)
#endif
#define MODULES_VADDR VMALLOC_START
......
......@@ -132,90 +132,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
* (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
* the user one is in the last 4k. To switch between them, you
* just need to flip the 12th bit in their addresses.
*/
#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
/*
* This generates better code than the inline assembly in
* __set_bit().
*/
static inline void *ptr_set_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr |= BIT(bit);
return (void *)__ptr;
}
static inline void *ptr_clear_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr &= ~BIT(bit);
return (void *)__ptr;
}
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
*
* Returns true for parts of the PGD that map userspace and
* false for the parts that map the kernel.
*/
static inline bool pgdp_maps_userspace(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;
return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
/*
* Take a PGD location (pgdp) and a pgd value that needs to be set there.
* Populates the user and returns the resulting PGD that must be set in
* the kernel copy of the page tables.
*/
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return pgd;
return __pti_set_user_pgd(pgdp, pgd);
}
#else
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
#endif
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
pgd_t pgd;
......@@ -226,7 +142,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
}
pgd = native_make_pgd(native_p4d_val(p4d));
pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd);
pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
*p4dp = native_make_p4d(native_pgd_val(pgd));
}
......@@ -237,7 +153,7 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
*pgdp = pti_set_user_pgd(pgdp, pgd);
*pgdp = pti_set_user_pgtbl(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
......@@ -255,7 +171,6 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
/*
* Level 4 access.
*/
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
......
......@@ -115,6 +115,7 @@ extern unsigned int ptrs_per_p4d;
#define LDT_PGD_ENTRY_L5 -112UL
#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE)
#define __VMALLOC_BASE_L4 0xffffc90000000000UL
#define __VMALLOC_BASE_L5 0xffa0000000000000UL
......@@ -153,4 +154,6 @@ extern unsigned int ptrs_per_p4d;
#define EARLY_DYNAMIC_PAGE_TABLES 64
#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
......@@ -50,6 +50,7 @@
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
......@@ -266,14 +267,37 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
typedef struct { pgdval_t pgd; } pgd_t;
#ifdef CONFIG_X86_PAE
/*
* PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
* use it here.
*/
#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK)
#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)
/*
* PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
* All other bits are Reserved MBZ
*/
#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
_PAGE_PWT | _PAGE_PCD | \
_PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)
#else
/* No need to mask any bits for !PAE */
#define PGD_ALLOWED_BITS (~0ULL)
#endif
static inline pgd_t native_make_pgd(pgdval_t val)
{
return (pgd_t) { val };
return (pgd_t) { val & PGD_ALLOWED_BITS };
}
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
return pgd.pgd;
return pgd.pgd & PGD_ALLOWED_BITS;
}
static inline pgdval_t pgd_flags(pgd_t pgd)
......
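
The effect of PGD_ALLOWED_BITS is easiest to see with a concrete value: anything outside the physical-address bits and the architecturally permitted flags is stripped before a PAE top-level entry is formed. A standalone sketch with an assumed, abbreviated mask (the real one also admits the three software bits):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pgdval_t;

/* Abbreviated, assumed subset of the mask from the patch. */
#define _PAGE_PRESENT      0x001ULL
#define _PAGE_PWT          0x008ULL
#define _PAGE_PCD          0x010ULL
#define PGD_PAE_PHYS_MASK  0x0000000FFFFFF000ULL /* assumed 36-bit physical */
#define PGD_ALLOWED_BITS   (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
                            _PAGE_PWT | _PAGE_PCD)

int main(void)
{
	/* 0x60 sets Accessed/Dirty, which are reserved-MBZ in a PAE PGD. */
	pgdval_t raw = 0x12345000ULL | _PAGE_PRESENT | 0x60ULL;

	printf("raw:    %#llx\n", (unsigned long long)raw);
	printf("masked: %#llx\n", (unsigned long long)(raw & PGD_ALLOWED_BITS));
	return 0;
}
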
......@@ -39,10 +39,6 @@
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define X86_CR3_PTI_PCID_USER_BIT 11
#endif
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
......@@ -53,4 +49,8 @@
#define CR3_NOFLUSH 0
#endif
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define X86_CR3_PTI_PCID_USER_BIT 11
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
......@@ -966,6 +966,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
extern unsigned long arch_align_stack(unsigned long sp);
extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
extern void free_kernel_image_pages(void *begin, void *end);
void default_idle(void);
#ifdef CONFIG_XEN
......
......@@ -6,10 +6,9 @@
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern void pti_init(void);
extern void pti_check_boottime_disable(void);
extern void pti_clone_kernel_text(void);
extern void pti_finalize(void);
#else
static inline void pti_check_boottime_disable(void) { }
static inline void pti_clone_kernel_text(void) { }
#endif
#endif /* __ASSEMBLY__ */
......
......@@ -7,6 +7,7 @@
extern char __brk_base[], __brk_limit[];
extern struct exception_table_entry __stop___ex_table[];
extern char __end_rodata_aligned[];
#if defined(CONFIG_X86_64)
extern char __end_rodata_hpage_align[];
......
......@@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_array_uc(unsigned long *addr, int addrinarray);
int set_memory_array_wc(unsigned long *addr, int addrinarray);
......
......@@ -87,15 +87,25 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
#endif
/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task)
static inline void update_task_stack(struct task_struct *task)
{
/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
/* sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
load_sp0(task->thread.sp0);
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task->thread.sp0);
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
/*
* x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
* doesn't work on x86-32 because sp1 and
* cpu_current_top_of_stack have different values (because of
* the non-zero stack padding on 32-bit).
*/
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
}
#endif /* _ASM_X86_SWITCH_TO_H */
......@@ -103,4 +103,9 @@ void common(void) {
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
/* Offset for sp0 and sp1 into the tss_struct */
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
}
......@@ -46,8 +46,14 @@ void foo(void)
OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
/*
* Offset from the entry stack to task stack stored in TSS. Kernel entry
* happens on the per-cpu entry-stack, and the asm code switches to the
* task-stack pointer stored in x86_tss.sp1, which is a copy of
* task->thread.sp0 where entry code can find it.
*/
DEFINE(TSS_entry2task_stack,
offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
offsetofend(struct cpu_entry_area, entry_stack_page.stack));
#ifdef CONFIG_STACKPROTECTOR
......
......@@ -65,8 +65,6 @@ int main(void)
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_STACKPROTECTOR
......
......@@ -130,6 +130,7 @@ static const char *spectre_v2_strings[] = {
[SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
[SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
[SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
[SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
};
#undef pr_fmt
......@@ -313,23 +314,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
/* Check for Skylake-like CPUs (for RSB handling) */
static bool __init is_skylake_era(void)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 6) {
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_SKYLAKE_MOBILE:
case INTEL_FAM6_SKYLAKE_DESKTOP:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_KABYLAKE_MOBILE:
case INTEL_FAM6_KABYLAKE_DESKTOP:
return true;
}
}
return false;
}
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
......@@ -349,6 +333,13 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
mode = SPECTRE_V2_IBRS_ENHANCED;
/* Force it so VMEXIT will restore correctly */
x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
goto specv2_set_mode;
}
if (IS_ENABLED(CONFIG_RETPOLINE))
goto retpoline_auto;
break;
......@@ -386,26 +377,20 @@ static void __init spectre_v2_select_mitigation(void)
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
}
specv2_set_mode:
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
/*
* If neither SMEP nor PTI are available, there is a risk of
* hitting userspace addresses in the RSB after a context switch
* from a shallow call stack to a deeper one. To prevent this fill
* the entire RSB, even when using IBRS.
* If spectre v2 protection has been enabled, unconditionally fill
* RSB during a context switch; this protects against two independent
* issues:
*
* Skylake era CPUs have a separate issue with *underflow* of the
* RSB, when they will predict 'ret' targets from the generic BTB.
* The proper mitigation for this is IBRS. If IBRS is not supported
* or deactivated in favour of retpolines the RSB fill on context
* switch is required.
* - RSB underflow (and switch to BTB) on Skylake+
* - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
*/
if ((!boot_cpu_has(X86_FEATURE_PTI) &&
!boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
}
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
/* Initialize Indirect Branch Prediction Barrier if supported */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
......@@ -415,9 +400,16 @@ static void __init spectre_v2_select_mitigation(void)
/*
* Retpoline means the kernel is safe because it has no indirect
* branches. But firmware isn't, so use IBRS to protect that.
* branches. Enhanced IBRS protects firmware too, so enable restricted
* speculation around firmware calls only when Enhanced IBRS isn't
* supported.
*
* Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
* the user might select retpoline on the kernel command line and if
* the CPU supports Enhanced IBRS, the kernel might unintentionally not
* enable IBRS around firmware calls.
*/
if (boot_cpu_has(X86_FEATURE_IBRS)) {
if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
......
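
The spectre_v2_strings entry added above is what the kernel eventually reports through sysfs. A tiny user-space check, assuming only the standard vulnerabilities interface path:

#include <stdio.h>

int main(void)
{
	/* Standard sysfs file reporting the active Spectre v2 mitigation */
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/spectre_v2", "r");
	char line[128];

	if (f && fgets(line, sizeof(line), f))
		fputs(line, stdout); /* e.g. "Mitigation: Enhanced IBRS" */
	if (f)
		fclose(f);
	return 0;
}
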
......@@ -1005,6 +1005,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (ia32_cap & ARCH_CAP_IBRS_ALL)
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
if (x86_match_cpu(cpu_no_meltdown))
return;
......@@ -1804,11 +1807,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, curr);
/*
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
* Initialize the TSS. sp0 points to the entry trampoline stack
* regardless of what task is running.
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
......
......@@ -512,11 +512,18 @@ ENTRY(initial_code)
ENTRY(setup_once_ref)
.long setup_once
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#define PGD_ALIGN (2 * PAGE_SIZE)
#define PTI_USER_PGD_FILL 1024
#else
#define PGD_ALIGN (PAGE_SIZE)
#define PTI_USER_PGD_FILL 0
#endif
/*
* BSS section
*/
__PAGE_ALIGNED_BSS
.align PAGE_SIZE
.align PGD_ALIGN
#ifdef CONFIG_X86_PAE
.globl initial_pg_pmd
initial_pg_pmd:
......@@ -526,14 +533,17 @@ initial_pg_pmd:
initial_page_table:
.fill 1024,4,0
#endif
.align PGD_ALIGN
initial_pg_fixmap:
.fill 1024,4,0
.globl empty_zero_page
empty_zero_page:
.fill 4096,1,0
.globl swapper_pg_dir
.align PGD_ALIGN
swapper_pg_dir:
.fill 1024,4,0
.fill PTI_USER_PGD_FILL,4,0
.globl empty_zero_page
empty_zero_page:
.fill 4096,1,0
EXPORT_SYMBOL(empty_zero_page)
/*
......@@ -542,7 +552,7 @@ EXPORT_SYMBOL(empty_zero_page)
#ifdef CONFIG_X86_PAE
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE
.align PGD_ALIGN
ENTRY(initial_page_table)
.long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
......
......@@ -100,6 +100,102 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return new_ldt;
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static void do_sanity_check(struct mm_struct *mm,
bool had_kernel_mapping,
bool had_user_mapping)
{
if (mm->context.ldt) {
/*
* We already had an LDT. The top-level entry should already
* have been allocated and synchronized with the usermode
* tables.
*/
WARN_ON(!had_kernel_mapping);
if (static_cpu_has(X86_FEATURE_PTI))
WARN_ON(!had_user_mapping);
} else {
/*
* This is the first time we're mapping an LDT for this process.
* Sync the pgd to the usermode tables.
*/
WARN_ON(had_kernel_mapping);
if (static_cpu_has(X86_FEATURE_PTI))
WARN_ON(had_user_mapping);
}
}
#ifdef CONFIG_X86_PAE
static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
{
p4d_t *p4d;
pud_t *pud;
if (pgd->pgd == 0)
return NULL;
p4d = p4d_offset(pgd, va);
if (p4d_none(*p4d))
return NULL;
pud = pud_offset(p4d, va);
if (pud_none(*pud))
return NULL;
return pmd_offset(pud, va);
}
static void map_ldt_struct_to_user(struct mm_struct *mm)
{
pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
pmd_t *k_pmd, *u_pmd;
k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
set_pmd(u_pmd, *k_pmd);
}
static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
bool had_kernel, had_user;
pmd_t *k_pmd, *u_pmd;
k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
had_kernel = (k_pmd->pmd != 0);
had_user = (u_pmd->pmd != 0);
do_sanity_check(mm, had_kernel, had_user);
}
#else /* !CONFIG_X86_PAE */
static void map_ldt_struct_to_user(struct mm_struct *mm)
{
pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
set_pgd(kernel_to_user_pgdp(pgd), *pgd);
}
static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
bool had_kernel = (pgd->pgd != 0);
bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0);
do_sanity_check(mm, had_kernel, had_user);
}
#endif /* CONFIG_X86_PAE */
/*
* If PTI is enabled, this maps the LDT into the kernelmode and
* usermode tables for the given mm.
......@@ -115,9 +211,8 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
bool is_vmalloc, had_top_level_entry;
unsigned long va;
bool is_vmalloc;
spinlock_t *ptl;
pgd_t *pgd;
int i;
......@@ -131,13 +226,15 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
*/
WARN_ON(ldt->slot != -1);
/* Check if the current mappings are sane */
sanity_check_ldt_mapping(mm);
/*
* Did we already have the top level entry allocated? We can't
* use pgd_none() for this because it doesn't do anything on
* 4-level page table kernels.
*/
pgd = pgd_offset(mm, LDT_BASE_ADDR);
had_top_level_entry = (pgd->pgd != 0);
is_vmalloc = is_vmalloc_addr(ldt->entries);
......@@ -172,41 +269,31 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
pte_unmap_unlock(ptep, ptl);
}
if (mm->context.ldt) {
/*
* We already had an LDT. The top-level entry should already
* have been allocated and synchronized with the usermode
* tables.
*/
WARN_ON(!had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI))
WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
} else {
/*
* This is the first time we're mapping an LDT for this process.
* Sync the pgd to the usermode tables.
*/
WARN_ON(had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
set_pgd(kernel_to_user_pgdp(pgd), *pgd);
}
}
/* Propagate LDT mapping to the user page-table */
map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
ldt->slot = slot;
#endif
return 0;
}
#else /* !CONFIG_PAGE_TABLE_ISOLATION */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
return 0;
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
unsigned long end = start + (1UL << PGDIR_SHIFT);
unsigned long end = LDT_END_ADDR;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
......
......@@ -56,7 +56,7 @@ static void load_segments(void)
static void machine_kexec_free_page_tables(struct kimage *image)
{
free_page((unsigned long)image->arch.pgd);
free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
image->arch.pgd = NULL;
#ifdef CONFIG_X86_PAE
free_page((unsigned long)image->arch.pmd0);
......@@ -72,7 +72,8 @@ static void machine_kexec_free_page_tables(struct kimage *image)
static int machine_kexec_alloc_page_tables(struct kimage *image)
{
image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
PGD_ALLOCATION_ORDER);
#ifdef CONFIG_X86_PAE
image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
......
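
This kexec fix is a direct consequence of the 8k PGDs: with PTI enabled a top-level page table is an order-1 allocation holding the kernel half and the user half, which is what PGD_ALLOCATION_ORDER expresses. A minimal sketch of the size logic, with the macro values assumed to mirror the PTI headers:

#include <stdio.h>

/* Assumed to mirror the kernel's PGD_ALLOCATION_ORDER definition. */
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#define PGD_ALLOCATION_ORDER 1 /* 8k: kernel + user page-table halves */
#else
#define PGD_ALLOCATION_ORDER 0 /* 4k: kernel page table only */
#endif

int main(void)
{
	/* An order-n allocation is (PAGE_SIZE << n) bytes. */
	printf("PGD allocation: %d bytes\n", 4096 << PGD_ALLOCATION_ORDER);
	return 0;
}
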
......@@ -57,14 +57,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_64
/*
* .sp1 is cpu_current_top_of_stack. The init task never
* runs user code, but cpu_current_top_of_stack should still
* be well defined before the first context switch.
*/
.sp1 = TOP_OF_INIT_STACK,
#endif
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
......
......@@ -285,7 +285,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
update_sp0(next_p);
update_task_stack(next_p);
refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
......
......@@ -478,7 +478,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
update_sp0(next_p);
update_task_stack(next_p);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
......
......@@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
update_sp0(tsk);
update_task_stack(tsk);
refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
preempt_enable();
......@@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
refresh_sysenter_cs(&tsk->thread);
}
update_sp0(tsk);
update_task_stack(tsk);
preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
......
......@@ -55,19 +55,22 @@ jiffies_64 = jiffies;
* so we can enable protection checks as well as retain 2MB large page
* mappings for kernel text.
*/
#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
#define X64_ALIGN_RODATA_END \
#define X86_ALIGN_RODATA_END \
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
__end_rodata_hpage_align = .; \
__end_rodata_aligned = .;
#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
#else
#define X64_ALIGN_RODATA_BEGIN
#define X64_ALIGN_RODATA_END
#define X86_ALIGN_RODATA_BEGIN
#define X86_ALIGN_RODATA_END \
. = ALIGN(PAGE_SIZE); \
__end_rodata_aligned = .;
#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END
......@@ -141,9 +144,9 @@ SECTIONS
/* .text should occupy whole number of pages */
. = ALIGN(PAGE_SIZE);
X64_ALIGN_RODATA_BEGIN
X86_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
X64_ALIGN_RODATA_END
X86_ALIGN_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
......
......@@ -111,6 +111,8 @@ static struct addr_marker address_markers[] = {
[END_OF_SPACE_NR] = { -1, NULL }
};
#define INIT_PGD ((pgd_t *) &init_top_pgt)
#else /* CONFIG_X86_64 */
enum address_markers_idx {
......@@ -120,6 +122,9 @@ enum address_markers_idx {
VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
LDT_NR,
#endif
CPU_ENTRY_AREA_NR,
FIXADDR_START_NR,
......@@ -133,12 +138,17 @@ static struct addr_marker address_markers[] = {
[VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
#ifdef CONFIG_HIGHMEM
[PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
[LDT_NR] = { 0UL, "LDT remap" },
#endif
[CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
[FIXADDR_START_NR] = { 0UL, "Fixmap area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
#define INIT_PGD (swapper_pg_dir)
#endif /* !CONFIG_X86_64 */
/* Multipliers for offsets within the PTEs */
......@@ -496,11 +506,7 @@ static inline bool is_hypervisor_range(int idx)
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
#else
pgd_t *start = swapper_pg_dir;
#endif
pgd_t *start = INIT_PGD;
pgprotval_t prot, eff;
int i;
struct pg_state st = {};
......@@ -563,12 +569,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
static void ptdump_walk_user_pgd_level_checkwx(void)
void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t *pgd = (pgd_t *) &init_top_pgt;
pgd_t *pgd = INIT_PGD;
if (!static_cpu_has(X86_FEATURE_PTI))
if (!(__supported_pte_mask & _PAGE_NX) ||
!static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("x86/mm: Checking user space page tables\n");
......@@ -580,7 +587,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void)
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
......@@ -609,6 +615,9 @@ static int __init pt_dump_init(void)
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
# ifdef CONFIG_MODIFY_LDT_SYSCALL
address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
# endif
#endif
return 0;
}
......
......@@ -317,8 +317,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
WARN_ON_ONCE(in_nmi());
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
......
......@@ -773,13 +773,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
}
}
/*
* begin/end can be in the direct map or the "high kernel mapping"
* used for the kernel image only. free_init_pages() will do the
* right thing for either kind of address.
*/
void free_kernel_image_pages(void *begin, void *end)
{
unsigned long begin_ul = (unsigned long)begin;
unsigned long end_ul = (unsigned long)end;
unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
free_init_pages("unused kernel image", begin_ul, end_ul);
/*
* PTI maps some of the kernel into userspace. For performance,
* this includes some kernel areas that do not contain secrets.
* Those areas might be adjacent to the parts of the kernel image
* being freed, which may contain secrets. Remove the "high kernel
* image mapping" for these freed areas, ensuring they are not even
* potentially vulnerable to Meltdown regardless of the specific
* optimizations PTI is currently using.
*
* The "noalias" prevents unmapping the direct map alias which is
* needed to access the freed pages.
*
* This is only valid for 64bit kernels. 32bit has only one mapping
* which can't be treated in this way for obvious reasons.
*/
if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
set_memory_np_noalias(begin_ul, len_pages);
}
void __ref free_initmem(void)
{
e820__reallocate_tables();
free_init_pages("unused kernel",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
free_kernel_image_pages(&__init_begin, &__init_end);
}
#ifdef CONFIG_BLK_DEV_INITRD
......
......@@ -1283,20 +1283,10 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
free_init_pages("unused kernel",
(unsigned long) __va(__pa_symbol(text_end)),
(unsigned long) __va(__pa_symbol(rodata_start)));
free_init_pages("unused kernel",
(unsigned long) __va(__pa_symbol(rodata_end)),
(unsigned long) __va(__pa_symbol(_sdata)));
free_kernel_image_pages((void *)text_end, (void *)rodata_start);
free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
debug_checkwx();
/*
* Do this after all of the manipulation of the
* kernel text page tables are complete.
*/
pti_clone_kernel_text();
}
int kern_addr_valid(unsigned long addr)
......
......@@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
......@@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
/* Has caller explicitly disabled alias checking? */
if (in_flag & CPA_NO_CHECK_ALIAS)
checkalias = 0;
ret = __change_page_attr_set_clr(&cpa, checkalias);
......@@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
int set_memory_np_noalias(unsigned long addr, int numpages)
{
int cpa_flags = CPA_NO_CHECK_ALIAS;
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
__pgprot(_PAGE_PRESENT), 0,
cpa_flags, NULL);
}
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
......@@ -1784,6 +1797,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages)
__pgprot(_PAGE_GLOBAL), 0);
}
int set_memory_global(unsigned long addr, int numpages)
{
return change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_GLOBAL), 0);
}
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
struct cpa_data cpa;
......
......@@ -182,6 +182,14 @@ static void pgd_dtor(pgd_t *pgd)
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
/*
* We allocate separate PMDs for the kernel part of the user page-table
* when PTI is enabled. We need them to map the per-process LDT into the
* user-space page-table.
*/
#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \
KERNEL_PGD_PTRS : 0)
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
......@@ -202,14 +210,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
#define PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
for (i = 0; i < count; i++)
if (pmds[i]) {
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
......@@ -217,7 +225,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
}
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
bool failed = false;
......@@ -226,7 +234,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
for (i = 0; i < count; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
if (!pmd)
failed = true;
......@@ -241,7 +249,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
if (failed) {
free_pmds(mm, pmds);
free_pmds(mm, pmds, count);
return -ENOMEM;
}
......@@ -254,23 +262,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
pgd_t pgd = *pgdp;
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
*pgdp = native_make_pgd(0);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
mm_dec_nr_pmds(mm);
}
}
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pgd_t pgd = pgdp[i];
for (i = 0; i < PREALLOCATED_PMDS; i++)
mop_up_one_pmd(mm, &pgdp[i]);
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgdp[i] = native_make_pgd(0);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
mm_dec_nr_pmds(mm);
}
}
pgdp = kernel_to_user_pgdp(pgdp);
for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
......@@ -296,6 +319,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
pgd_t *k_pgd, pmd_t *pmds[])
{
pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
p4d_t *u_p4d;
pud_t *u_pud;
int i;
u_p4d = p4d_offset(u_pgd, 0);
u_pud = pud_offset(u_p4d, 0);
s_pgd += KERNEL_PGD_BOUNDARY;
u_pud += KERNEL_PGD_BOUNDARY;
for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
pmd_t *pmd = pmds[i];
memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
sizeof(pmd_t) * PTRS_PER_PMD);
pud_populate(mm, u_pud, pmd);
}
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
* Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
* assumes that pgd should be in one page.
......@@ -340,7 +395,8 @@ static inline pgd_t *_pgd_alloc(void)
* We allocate one page for pgd.
*/
if (!SHARED_KERNEL_PMD)
return (pgd_t *)__get_free_page(PGALLOC_GFP);
return (pgd_t *)__get_free_pages(PGALLOC_GFP,
PGD_ALLOCATION_ORDER);
/*
* Now PAE kernel is not running as a Xen domain. We can allocate
......@@ -352,7 +408,7 @@ static inline pgd_t *_pgd_alloc(void)
static inline void _pgd_free(pgd_t *pgd)
{
if (!SHARED_KERNEL_PMD)
free_page((unsigned long)pgd);
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
else
kmem_cache_free(pgd_cache, pgd);
}
......@@ -372,6 +428,7 @@ static inline void _pgd_free(pgd_t *pgd)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *u_pmds[PREALLOCATED_USER_PMDS];
pmd_t *pmds[PREALLOCATED_PMDS];
pgd = _pgd_alloc();
......@@ -381,12 +438,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
mm->pgd = pgd;
if (preallocate_pmds(mm, pmds) != 0)
if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
goto out_free_pgd;
if (paravirt_pgd_alloc(mm) != 0)
if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
goto out_free_pmds;
if (paravirt_pgd_alloc(mm) != 0)
goto out_free_user_pmds;
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
......@@ -396,13 +456,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
spin_unlock(&pgd_lock);
return pgd;
out_free_user_pmds:
free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
free_pmds(mm, pmds);
free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
_pgd_free(pgd);
out:
......
......@@ -67,6 +67,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"__tracedata_(start|end)|"
"__(start|stop)_notes|"
"__end_rodata|"
"__end_rodata_aligned|"
"__initramfs_start|"
"(jiffies|jiffies_64)|"
#if ELF_BITS == 64
......
......@@ -6,6 +6,7 @@
#include <asm/pti.h>
#else
static inline void pti_init(void) { }
static inline void pti_finalize(void) { }
#endif
#endif
......@@ -1065,6 +1065,13 @@ static int __ref kernel_init(void *unused)
jump_label_invalidate_initmem();
free_initmem();
mark_readonly();
/*
* Kernel mappings are now finalized - update the userspace page-table
* to finalize PTI.
*/
pti_finalize();
system_state = SYSTEM_RUNNING;
numa_default_policy();
......
......@@ -6939,9 +6939,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
start = (void *)PAGE_ALIGN((unsigned long)start);
end = (void *)((unsigned long)end & PAGE_MASK);
for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
struct page *page = virt_to_page(pos);
void *direct_map_addr;
/*
* 'direct_map_addr' might be different from 'pos'
* because some architectures' virt_to_page()
* work with aliases. Getting the direct map
* address ensures that we get a _writeable_
* alias for the memset().
*/
direct_map_addr = page_address(page);
if ((unsigned int)poison <= 0xFF)
memset(pos, poison, PAGE_SIZE);
free_reserved_page(virt_to_page(pos));
memset(direct_map_addr, poison, PAGE_SIZE);
free_reserved_page(page);
}
if (pages && s)
......
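
One subtlety in free_reserved_area() worth spelling out is the poison check: the memset only happens when the poison value fits in a byte, so callers pass a negative value (which wraps to a huge unsigned number) to opt out of poisoning. A small sketch of just that check:

#include <stdio.h>
#include <string.h>

/* Sketch of the poison-fits-a-byte test used by free_reserved_area(). */
static void maybe_poison(void *page, int poison, size_t size)
{
	if ((unsigned int)poison <= 0xFF)
		memset(page, poison, size);
}

int main(void)
{
	char page[8] = "data";

	maybe_poison(page, -1, sizeof(page)); /* (unsigned)-1 > 0xFF: skipped */
	printf("after -1: %s\n", page);
	maybe_poison(page, 0, sizeof(page));  /* poisons (zeroes) the page */
	printf("after  0: byte[0] == %d\n", page[0]);
	return 0;
}
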
......@@ -57,7 +57,7 @@ config SECURITY_NETWORK
config PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
default y
depends on X86_64 && !UML
depends on X86 && !UML
help
This feature reduces the number of hardware side channels by
ensuring that the majority of kernel addresses are not mapped
......