Commit dc4e0021 authored by Andy Lutomirski, committed by Ingo Molnar

x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area

There are three problems with the current layout of the doublefault
stack and TSS.  First, the TSS is only cacheline-aligned, which is
not enough -- if the hardware portion of the TSS (struct x86_hw_tss)
crosses a page boundary, horrible things happen [0].  Second, the
stack and TSS are global, so simultaneous double faults on different
CPUs will cause massive corruption.  Third, the whole mechanism
won't work if user CR3 is loaded, resulting in a triple fault [1].
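
To see the first problem concretely: __cacheline_aligned only guarantees
64-byte granularity, so a TSS allocated near the end of a page can straddle
the boundary.  A minimal standalone sketch (the 0x68-byte hardware TSS size
is an assumption used for illustration, not taken from the patch):

    #include <stdio.h>

    #define PAGE_SIZE   4096UL
    #define CACHELINE     64UL
    #define HW_TSS_SIZE 0x68UL  /* assumed sizeof(struct x86_hw_tss) */

    int main(void)
    {
            /* Worst case: a cacheline-aligned TSS starting one cacheline
             * below a page boundary spills onto the next page. */
            unsigned long base = PAGE_SIZE - CACHELINE;   /* 0xfc0  */
            unsigned long last = base + HW_TSS_SIZE - 1;  /* 0x1027 */

            printf("TSS spans pages: %s\n",
                   base / PAGE_SIZE != last / PAGE_SIZE ? "yes" : "no");
            return 0;
    }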

Let the doublefault stack and TSS share a page (which prevents the
TSS from spanning a page boundary), make it percpu, and move it into
cpu_entry_area.  Teach the stack dump code about the doublefault
stack.
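
The containment guarantee follows from the layout alone: the container is
exactly one page, page-aligned, with the hardware TSS as its last member.
A hedged, compile-and-run sketch of that invariant (struct hw_tss is a
104-byte stand-in for struct x86_hw_tss; names are illustrative):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096UL

    struct hw_tss { uint32_t regs[26]; };  /* 104-byte stand-in */

    /* Mirrors the patch below: stack first, TSS last, one page total. */
    struct df_page {
            unsigned long stack[(PAGE_SIZE - sizeof(struct hw_tss)) /
                                sizeof(unsigned long)];
            struct hw_tss tss;
    } __attribute__((aligned(PAGE_SIZE)));

    int main(void)
    {
            /* The whole object is one page, so the tail TSS cannot
             * cross a page boundary. */
            assert(sizeof(struct df_page) == PAGE_SIZE);
            return 0;
    }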

[0] Real hardware will read past the end of the page onto the next
    *physical* page if a task switch happens.  Virtual machines may
    have any number of bugs, and I would consider it reasonable for
    a VM to summarily kill the guest if it tries to task-switch to
    a page-spanning TSS.

[1] Real hardware triple faults.  At least some VMs seem to hang.
    I'm not sure what's going on.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent e99b6f46
arch/x86/include/asm/cpu_entry_area.h
@@ -65,6 +65,13 @@ enum exception_stack_ordering {
 #endif

+#ifdef CONFIG_X86_32
+struct doublefault_stack {
+	unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
+	struct x86_hw_tss tss;
+} __aligned(PAGE_SIZE);
+#endif
+
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
  * and early entry/exit code.  Real types aren't used for all fields here
@@ -86,6 +93,11 @@ struct cpu_entry_area {
 #endif

 	struct entry_stack_page entry_stack_page;

+#ifdef CONFIG_X86_32
+	char guard_doublefault_stack[PAGE_SIZE];
+	struct doublefault_stack doublefault_stack;
+#endif
+
 	/*
 	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
 	 * we need task switches to work, and task switches write to the TSS.
...
arch/x86/include/asm/doublefault.h (new file)
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DOUBLEFAULT_H
+#define _ASM_X86_DOUBLEFAULT_H
+
+#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
+extern void doublefault_init_cpu_tss(void);
+#else
+static inline void doublefault_init_cpu_tss(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_DOUBLEFAULT_H */
arch/x86/include/asm/pgtable_32_types.h
@@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
 #endif

 /*
- * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
- * to avoid include recursion hell
+ * This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE.
+ * Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c
+ * to avoid include recursion hell.
  */
-#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 41)
+#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 43)

 /* The +1 is for the readonly IDT page: */
 #define CPU_ENTRY_AREA_BASE	\
...
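
A quick sanity check on the constant bump: on 32-bit, the patch grows
struct cpu_entry_area by exactly two pages per CPU, one guard page plus one
page for the doublefault stack/TSS, so 41 + 1 + 1 = 43.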
arch/x86/include/asm/processor.h
@@ -166,7 +166,6 @@ enum cpuid_regs_idx {
 extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
-extern struct x86_hw_tss	doublefault_tss;
 extern __u32			cpu_caps_cleared[NCAPINTS + NBUGINTS];
 extern __u32			cpu_caps_set[NCAPINTS + NBUGINTS];
...
arch/x86/kernel/cpu/common.c
@@ -24,6 +24,7 @@
 #include <asm/stackprotector.h>
 #include <asm/perf_event.h>
 #include <asm/mmu_context.h>
+#include <asm/doublefault.h>
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
@@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
 	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
 }

-static inline void gdt_setup_doublefault_tss(int cpu) { }
-
 #else /* CONFIG_X86_64 */

 static inline void setup_getcpu(int cpu) { }
@@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu)
 static inline void tss_setup_ist(struct tss_struct *tss) { }

-static inline void gdt_setup_doublefault_tss(int cpu)
-{
-#ifdef CONFIG_DOUBLEFAULT
-	/* Set up the doublefault TSS pointer in the GDT */
-	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-}
-
 #endif /* !CONFIG_X86_64 */

 static inline void tss_setup_io_bitmap(struct tss_struct *tss)
@@ -1923,7 +1915,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();

-	gdt_setup_doublefault_tss(cpu);
+	doublefault_init_cpu_tss();

 	fpu__init_cpu();
...
arch/x86/kernel/doublefault.c
@@ -10,10 +10,6 @@
 #include <asm/processor.h>
 #include <asm/desc.h>

-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-
 #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)

 static void doublefault_fn(void)
@@ -21,6 +17,8 @@ static void doublefault_fn(void)
 	struct desc_ptr gdt_desc = {0, 0};
 	unsigned long gdt, tss;

+	BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+
 	native_store_gdt(&gdt_desc);
 	gdt = gdt_desc.address;

@@ -48,24 +46,46 @@ static void doublefault_fn(void)
 		cpu_relax();
 }

-struct x86_hw_tss doublefault_tss __cacheline_aligned = {
-	.sp0		= STACK_START,
-	.ss0		= __KERNEL_DS,
-	.ldt		= 0,
+DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
+	.tss = {
+		/*
+		 * No sp0 or ss0 -- we never run CPL != 0 with this TSS
+		 * active.  sp is filled in later.
+		 */
+		.ldt		= 0,
 	.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,

 	.ip		= (unsigned long) doublefault_fn,
 	/* 0x2 bit is always set */
 	.flags		= X86_EFLAGS_SF | 0x2,
-	.sp		= STACK_START,
 	.es		= __USER_DS,
 	.cs		= __KERNEL_CS,
 	.ss		= __KERNEL_DS,
 	.ds		= __USER_DS,
 	.fs		= __KERNEL_PERCPU,
 #ifndef CONFIG_X86_32_LAZY_GS
 	.gs		= __KERNEL_STACK_CANARY,
 #endif

 	.__cr3		= __pa_nodebug(swapper_pg_dir),
+	},
 };
+
+void doublefault_init_cpu_tss(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+	/*
+	 * The linker isn't smart enough to initialize percpu variables that
+	 * point to other places in percpu space.
+	 */
+	this_cpu_write(doublefault_stack.tss.sp,
+		       (unsigned long)&cea->doublefault_stack.stack +
+		       sizeof(doublefault_stack.stack));
+
+	/* Set up doublefault TSS pointer in the GDT */
+	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+		       &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+}
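
Why the sp fixup above happens at runtime rather than in the initializer: a
static initializer can only name the percpu template image, while each CPU's
TSS must point sp at the top of that CPU's own cpu_entry_area alias.  A toy
model of the pattern (all names hypothetical, not kernel APIs):

    #include <stdio.h>

    /* Stand-in for the per-CPU doublefault page as each CPU sees it. */
    struct df { unsigned long stack[8]; unsigned long sp; };

    static struct df aliases[2];  /* models two CPUs' cpu_entry_area views */

    static void init_cpu(int cpu)
    {
            /* Each CPU's sp is the top of ITS alias, known only at
             * runtime; no single compile-time constant fits both CPUs. */
            aliases[cpu].sp = (unsigned long)&aliases[cpu].stack[8];
    }

    int main(void)
    {
            init_cpu(0);
            init_cpu(1);
            printf("cpu0 sp=%#lx, cpu1 sp=%#lx\n",
                   aliases[0].sp, aliases[1].sp);
            return 0;
    }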
arch/x86/kernel/dumpstack_32.c
@@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_ENTRY)
 		return "ENTRY_TRAMPOLINE";

+	if (type == STACK_TYPE_EXCEPTION)
+		return "#DF";
+
 	return NULL;
 }
@@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
 	return true;
 }

+static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
+{
+#ifdef CONFIG_DOUBLEFAULT
+	struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
+	struct doublefault_stack *ss = &cea->doublefault_stack;
+
+	void *begin = ss->stack;
+	void *end = begin + sizeof(ss->stack);
+
+	if ((void *)stack < begin || (void *)stack >= end)
+		return false;
+
+	info->type	= STACK_TYPE_EXCEPTION;
+	info->begin	= begin;
+	info->end	= end;
+	info->next_sp	= (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
+
+	return true;
+#else
+	return false;
+#endif
+}
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
 		   struct stack_info *info, unsigned long *visit_mask)
 {
@@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (in_softirq_stack(stack, info))
 		goto recursion_check;

+	if (in_doublefault_stack(stack, info))
+		goto recursion_check;
+
 	goto unknown;

 recursion_check:
...
arch/x86/mm/cpu_entry_area.c
@@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
 DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
 #endif

+#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
+DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
+#endif
+
 struct cpu_entry_area *get_cpu_entry_area(int cpu)
 {
 	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
@@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
 	cea_map_stack(MCE);
 }
 #else
-static inline void percpu_setup_exception_stacks(unsigned int cpu) {}
+static inline void percpu_setup_exception_stacks(unsigned int cpu)
+{
+#ifdef CONFIG_DOUBLEFAULT
+	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+	cea_map_percpu_pages(&cea->doublefault_stack,
+			     &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
+#endif
+}
 #endif

 /* Setup the fixmap mappings only once per-processor */
...