Commit caf9a826 authored by Linus Torvalds

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 PTI preparatory patches from Thomas Gleixner:
 "Todays Advent calendar window contains twentyfour easy to digest
  patches. The original plan was to have twenty three matching the date,
  but a late fixup made that moot.

   - Move the cpu_entry_area mapping out of the fixmap into a separate
     address space. That's necessary because the fixmap becomes too big
     with NR_CPUS=8192, and this already caused subtle and hard-to-diagnose
     failures.

     The topmost patch is fresh from today and cures a brain slip of
     that tall grumpy German greybeard, who ignored the intricacies of
     32-bit wraparounds.

   - Limit the number of CPUs on 32-bit to 64. That's insanely big
     already, but at least it's small enough to prevent address space
     issues with the cpu_entry_area map, which had been observed and
     debugged with the fixmap code.

   - A few TLB flush fixes in various places, plus documentation of which
     TLB functions should be used for what.

   - Rename the SYSENTER stack to CPU_ENTRY_AREA stack, as it is used for
     more than SYSENTER now, and keeping the old name makes backtraces
     confusing.

   - Prevent LDT inheritance on exec() by moving the LDT duplication to
     arch_dup_mmap(), which is only invoked on fork().

   - Make vsyscall more robust.

   - A few fixes and cleanups of the debug_pagetables code: check
     PAGE_PRESENT instead of checking the PTE for 0, and clean up the
     C89 initialization of the address hint array, which was already out
     of sync with the index enums.

   - Move the ESPFIX init to a different place to prepare for PTI.

   - Several code moves with no functional change to make PTI
     integration simpler and header files less convoluted.

   - Documentation fixes and clarifications"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  x86/cpu_entry_area: Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit
  init: Invoke init_espfix_bsp() from mm_init()
  x86/cpu_entry_area: Move it out of the fixmap
  x86/cpu_entry_area: Move it to a separate unit
  x86/mm: Create asm/invpcid.h
  x86/mm: Put MMU to hardware ASID translation in one place
  x86/mm: Remove hard-coded ASID limit checks
  x86/mm: Move the CR3 construction functions to tlbflush.h
  x86/mm: Add comments to clarify which TLB-flush functions are supposed to flush what
  x86/mm: Remove superfluous barriers
  x86/mm: Use __flush_tlb_one() for kernel memory
  x86/microcode: Dont abuse the TLB-flush interface
  x86/uv: Use the right TLB-flush API
  x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack
  x86/doc: Remove obvious weirdnesses from the x86 MM layout documentation
  x86/mm/64: Improve the memory map documentation
  x86/ldt: Prevent LDT inheritance on exec
  x86/ldt: Rework locking
  arch, mm: Allow arch_dup_mmap() to fail
  x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE mode
  ...
parents 9c294ec0 f6c4fd50
-<previous description obsolete, deleted>
 Virtual memory map with 4 level page tables:
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
@@ -14,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 Virtual memory map with 5 level page tables:
@@ -36,19 +36,22 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes hole between user space and kernel addresses if you interpret them
+as unsigned.
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
@@ -58,9 +61,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
 the processes using the page fault handler, with init_top_pgt as
 reference.
-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
 We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
 memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
@@ -72,5 +72,3 @@ following fixmap section.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
--Andi Kleen, Jul 2004
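As a quick cross-check of the new cpu_entry_area and %esp-fixup lines above against the PGD slot constants added later in this series (CPU_ENTRY_AREA_PGD = -3, ESPFIX_PGD_ENTRY = -2), here is a small standalone sketch of the address arithmetic. This is an editor's illustration, not part of the patch, and assumes a 64-bit unsigned long:

#include <stdio.h>

int main(void)
{
        /* With 4-level paging each top-level (PGD/P4D) slot spans 1UL << 39 bytes. */
        unsigned long p4d_shift = 39;

        /* CPU_ENTRY_AREA_PGD = -3  ->  fffffe8000000000 (cpu_entry_area mapping) */
        printf("%lx\n", (unsigned long)-3 << p4d_shift);

        /* ESPFIX_PGD_ENTRY = -2    ->  ffffff0000000000 (%esp fixup stacks) */
        printf("%lx\n", (unsigned long)-2 << p4d_shift);

        return 0;
}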
...@@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, ...@@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
#endif #endif
} }
static inline void arch_dup_mmap(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm) struct mm_struct *mm)
{ {
return 0;
} }
#ifndef CONFIG_PPC_BOOK3S_64 #ifndef CONFIG_PPC_BOOK3S_64
......
...@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm); ...@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
/* /*
* Needed since we do not use the asm-generic/mm_hooks.h: * Needed since we do not use the asm-generic/mm_hooks.h:
*/ */
static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{ {
uml_setup_stubs(mm); uml_setup_stubs(mm);
return 0;
} }
extern void arch_exit_mmap(struct mm_struct *mm); extern void arch_exit_mmap(struct mm_struct *mm);
static inline void arch_unmap(struct mm_struct *mm, static inline void arch_unmap(struct mm_struct *mm,
......
...@@ -81,9 +81,10 @@ do { \ ...@@ -81,9 +81,10 @@ do { \
} \ } \
} while (0) } while (0)
static inline void arch_dup_mmap(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm) struct mm_struct *mm)
{ {
return 0;
} }
static inline void arch_unmap(struct mm_struct *mm, static inline void arch_unmap(struct mm_struct *mm,
......
...@@ -926,7 +926,8 @@ config MAXSMP ...@@ -926,7 +926,8 @@ config MAXSMP
config NR_CPUS config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP int "Maximum number of CPUs" if SMP && !MAXSMP
range 2 8 if SMP && X86_32 && !X86_BIGSMP range 2 8 if SMP && X86_32 && !X86_BIGSMP
range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK range 2 64 if SMP && X86_32 && X86_BIGSMP
range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
default "1" if !SMP default "1" if !SMP
default "8192" if MAXSMP default "8192" if MAXSMP
......
...@@ -942,9 +942,9 @@ ENTRY(debug) ...@@ -942,9 +942,9 @@ ENTRY(debug)
/* Are we currently on the SYSENTER stack? */ /* Are we currently on the SYSENTER stack? */
movl PER_CPU_VAR(cpu_entry_area), %ecx movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx cmpl $SIZEOF_entry_stack, %ecx
jb .Ldebug_from_sysenter_stack jb .Ldebug_from_sysenter_stack
TRACE_IRQS_OFF TRACE_IRQS_OFF
...@@ -986,9 +986,9 @@ ENTRY(nmi) ...@@ -986,9 +986,9 @@ ENTRY(nmi)
/* Are we currently on the SYSENTER stack? */ /* Are we currently on the SYSENTER stack? */
movl PER_CPU_VAR(cpu_entry_area), %ecx movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx cmpl $SIZEOF_entry_stack, %ecx
jb .Lnmi_from_sysenter_stack jb .Lnmi_from_sysenter_stack
/* Not on SYSENTER stack. */ /* Not on SYSENTER stack. */
......
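Both hunks above use the same trick: a single unsigned comparison of (stack end - %esp) against the stack size decides whether we are currently on the entry stack. A C rendering of that check, as an editor's sketch only (not part of the patch):

/*
 * If sp is above the stack end, (end - sp) wraps to a huge unsigned value;
 * if sp is below the stack start, the difference exceeds the stack size.
 * Either way the single unsigned compare rejects it, matching the
 * cmpl/jb sequence in the assembly above.
 */
static inline int on_entry_stack(unsigned long sp, unsigned long stack_end,
                                 unsigned long stack_size)
{
        return (stack_end - sp) < stack_size;
}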
...@@ -158,8 +158,8 @@ END(native_usergs_sysret64) ...@@ -158,8 +158,8 @@ END(native_usergs_sysret64)
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ #define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
ENTRY(entry_SYSCALL_64_trampoline) ENTRY(entry_SYSCALL_64_trampoline)
UNWIND_HINT_EMPTY UNWIND_HINT_EMPTY
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include <asm/unistd.h> #include <asm/unistd.h>
#include <asm/fixmap.h> #include <asm/fixmap.h>
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/paravirt.h>
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include "vsyscall_trace.h" #include "vsyscall_trace.h"
...@@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ...@@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
WARN_ON_ONCE(address != regs->ip); WARN_ON_ONCE(address != regs->ip);
/* This should be unreachable in NATIVE mode. */
if (WARN_ON(vsyscall_mode == NATIVE))
return false;
if (vsyscall_mode == NONE) { if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs, warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none"); "vsyscall attempted with vsyscall=none");
...@@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr) ...@@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
} }
/*
* The VSYSCALL page is the only user-accessible page in the kernel address
* range. Normally, the kernel page tables can have _PAGE_USER clear, but
* the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
* are enabled.
*
* Some day we may create a "minimal" vsyscall mode in which we emulate
* vsyscalls but leave the page not present. If so, we skip calling
* this.
*/
static void __init set_vsyscall_pgtable_user_bits(void)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset_k(VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
#if CONFIG_PGTABLE_LEVELS >= 5
p4d->p4d |= _PAGE_USER;
#endif
pud = pud_offset(p4d, VSYSCALL_ADDR);
set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
pmd = pmd_offset(pud, VSYSCALL_ADDR);
set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}
void __init map_vsyscall(void) void __init map_vsyscall(void)
{ {
extern char __vsyscall_page; extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
if (vsyscall_mode != NONE) if (vsyscall_mode != NONE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL ? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR); : PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits();
}
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR); (unsigned long)VSYSCALL_ADDR);
......
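Since this hunk hardens the legacy vsyscall path (and the documentation above now lists only the single 4 kB legacy vsyscall ABI page), a tiny userspace probe of that page can be handy when testing vsyscall=emulate versus vsyscall=none. Editor's sketch, not part of the series; the fixed address is the architectural legacy vsyscall gettimeofday entry:

#include <stdio.h>
#include <sys/time.h>

typedef int (*vgtod_t)(struct timeval *tv, struct timezone *tz);

int main(void)
{
        vgtod_t vgtod = (vgtod_t)0xffffffffff600000UL;  /* legacy vsyscall ABI page */
        struct timeval tv;

        /* Faults with vsyscall=none, succeeds with emulate (or native). */
        if (vgtod(&tv, NULL) == 0)
                printf("vsyscall gettimeofday: %ld.%06ld\n",
                       (long)tv.tv_sec, (long)tv.tv_usec);
        return 0;
}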
// SPDX-License-Identifier: GPL-2.0
#ifndef _ASM_X86_CPU_ENTRY_AREA_H
#define _ASM_X86_CPU_ENTRY_AREA_H
#include <linux/percpu-defs.h>
#include <asm/processor.h>
/*
* cpu_entry_area is a percpu region that contains things needed by the CPU
* and early entry/exit code. Real types aren't used for all fields here
* to avoid circular header dependencies.
*
* Every field is a virtual alias of some other allocated backing store.
* There is no direct allocation of a struct cpu_entry_area.
*/
struct cpu_entry_area {
char gdt[PAGE_SIZE];
/*
* The GDT is just below entry_stack and thus serves (on x86_64) as
	 * a read-only guard page.
*/
struct entry_stack_page entry_stack_page;
/*
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
* we need task switches to work, and task switches write to the TSS.
*/
struct tss_struct tss;
char entry_trampoline[PAGE_SIZE];
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
*
* In the future, this should have a separate slot for each stack
* with guard pages between them.
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
};
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
extern void setup_cpu_entry_areas(void);
extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
#define CPU_ENTRY_AREA_MAP_SIZE \
(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
static inline struct entry_stack *cpu_entry_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
}
#endif
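get_cpu_entry_area() itself lives in the new arch/x86/mm/cpu_entry_area.c, which is not shown in this excerpt. From the constants declared above it is expected to boil down to roughly the following; this is an editor's assumption about its shape, not the verbatim implementation:

/* Per-CPU areas laid out back to back, one CPU_ENTRY_AREA_SIZE stride per CPU. */
struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
        unsigned long va = CPU_ENTRY_AREA_PER_CPU +
                           (unsigned long)cpu * CPU_ENTRY_AREA_SIZE;

        BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);

        return (struct cpu_entry_area *)va;
}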
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <asm/mmu.h> #include <asm/mmu.h>
#include <asm/fixmap.h> #include <asm/fixmap.h>
#include <asm/irq_vectors.h> #include <asm/irq_vectors.h>
#include <asm/cpu_entry_area.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/percpu.h> #include <linux/percpu.h>
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#ifndef _ASM_X86_ESPFIX_H #ifndef _ASM_X86_ESPFIX_H
#define _ASM_X86_ESPFIX_H #define _ASM_X86_ESPFIX_H
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_ESPFIX64
#include <asm/percpu.h> #include <asm/percpu.h>
...@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); ...@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void); extern void init_espfix_bsp(void);
extern void init_espfix_ap(int cpu); extern void init_espfix_ap(int cpu);
#else
#endif /* CONFIG_X86_64 */ static inline void init_espfix_ap(int cpu) { }
#endif
#endif /* _ASM_X86_ESPFIX_H */ #endif /* _ASM_X86_ESPFIX_H */
...@@ -44,46 +44,6 @@ extern unsigned long __FIXADDR_TOP; ...@@ -44,46 +44,6 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE) PAGE_SIZE)
#endif #endif
/*
* cpu_entry_area is a percpu region in the fixmap that contains things
* needed by the CPU and early entry/exit code. Real types aren't used
* for all fields here to avoid circular header dependencies.
*
* Every field is a virtual alias of some other allocated backing store.
* There is no direct allocation of a struct cpu_entry_area.
*/
struct cpu_entry_area {
char gdt[PAGE_SIZE];
/*
* The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
* a a read-only guard page.
*/
struct SYSENTER_stack_page SYSENTER_stack_page;
/*
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
* we need task switches to work, and task switches write to the TSS.
*/
struct tss_struct tss;
char entry_trampoline[PAGE_SIZE];
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
*
* In the future, this should have a separate slot for each stack
* with guard pages between them.
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
};
#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
extern void setup_cpu_entry_areas(void);
/* /*
* Here we define all the compile-time 'special' virtual * Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at * addresses. The point is to have a constant address at
...@@ -123,7 +83,6 @@ enum fixed_addresses { ...@@ -123,7 +83,6 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif #endif
FIX_RO_IDT, /* Virtual mapping for read-only IDT */
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
...@@ -139,9 +98,6 @@ enum fixed_addresses { ...@@ -139,9 +98,6 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID #ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC, FIX_LNW_VRTC,
#endif #endif
/* Fixmap entries to remap the GDTs, one per processor. */
FIX_CPU_ENTRY_AREA_TOP,
FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
#ifdef CONFIG_ACPI_APEI_GHES #ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */ /* Used for GHES mapping from assorted contexts */
...@@ -182,7 +138,7 @@ enum fixed_addresses { ...@@ -182,7 +138,7 @@ enum fixed_addresses {
extern void reserve_top_address(unsigned long reserve); extern void reserve_top_address(unsigned long reserve);
#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
extern int fixmaps_set; extern int fixmaps_set;
...@@ -230,30 +186,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, ...@@ -230,30 +186,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx, void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags); phys_addr_t phys, pgprot_t flags);
static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
{
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
}
#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
})
#define get_cpu_entry_area_index(cpu, field) \
__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
}
static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
}
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */ #endif /* _ASM_X86_FIXMAP_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_INVPCID
#define _ASM_X86_INVPCID
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
{
struct { u64 d[2]; } desc = { { pcid, addr } };
/*
* The memory clobber is because the whole point is to invalidate
* stale TLB entries and, especially if we're flushing global
* mappings, we don't want the compiler to reorder any subsequent
* memory accesses before the TLB flush.
*
* The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
* invpcid (%rcx), %rax in long mode.
*/
asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
: : "m" (desc), "a" (type), "c" (&desc) : "memory");
}
#define INVPCID_TYPE_INDIV_ADDR 0
#define INVPCID_TYPE_SINGLE_CTXT 1
#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
#define INVPCID_TYPE_ALL_NON_GLOBAL 3
/* Flush all mappings for a given pcid and addr, not including globals. */
static inline void invpcid_flush_one(unsigned long pcid,
unsigned long addr)
{
__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
}
/* Flush all mappings for a given PCID, not including globals. */
static inline void invpcid_flush_single_context(unsigned long pcid)
{
__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
}
/* Flush all mappings, including globals, for all PCIDs. */
static inline void invpcid_flush_all(void)
{
__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
}
/* Flush all mappings for all PCIDs except globals. */
static inline void invpcid_flush_all_nonglobals(void)
{
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
}
#endif /* _ASM_X86_INVPCID */
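For context, the helpers in this new header are meant to be used behind a CPU feature check, mirroring what __native_flush_tlb_global() does in the tlbflush.h hunk further down. A hedged usage sketch (editor's illustration, not from the patch; flush_all_example is a made-up name):

/* Flush everything, preferring INVPCID when the CPU supports it. */
static inline void flush_all_example(void)
{
        if (static_cpu_has(X86_FEATURE_INVPCID)) {
                /* One instruction, no need to toggle CR4.PGE. */
                invpcid_flush_all();
        } else {
                /* Fall back to the CR4.PGE toggle in __native_flush_tlb_global(). */
                __flush_tlb_global();
        }
}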
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#define _ASM_X86_MMU_H #define _ASM_X86_MMU_H
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/atomic.h> #include <linux/atomic.h>
...@@ -27,7 +28,8 @@ typedef struct { ...@@ -27,7 +28,8 @@ typedef struct {
atomic64_t tlb_gen; atomic64_t tlb_gen;
#ifdef CONFIG_MODIFY_LDT_SYSCALL #ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt; struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;
#endif #endif
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
......
...@@ -57,11 +57,17 @@ struct ldt_struct { ...@@ -57,11 +57,17 @@ struct ldt_struct {
/* /*
* Used for LDT copy/destruction. * Used for LDT copy/destruction.
*/ */
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); static inline void init_new_context_ldt(struct mm_struct *mm)
{
mm->context.ldt = NULL;
init_rwsem(&mm->context.ldt_usr_sem);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */ #else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context_ldt(struct task_struct *tsk, static inline void init_new_context_ldt(struct mm_struct *mm) { }
struct mm_struct *mm) static inline int ldt_dup_context(struct mm_struct *oldmm,
struct mm_struct *mm)
{ {
return 0; return 0;
} }
...@@ -132,18 +138,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); ...@@ -132,18 +138,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk, static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm) struct mm_struct *mm)
{ {
mutex_init(&mm->context.lock);
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0); atomic64_set(&mm->context.tlb_gen, 0);
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */ /* pkey 0 is the default and always allocated */
mm->context.pkey_allocation_map = 0x1; mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */ /* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1; mm->context.execute_only_pkey = -1;
} }
#endif #endif
return init_new_context_ldt(tsk, mm); init_new_context_ldt(mm);
return 0;
} }
static inline void destroy_context(struct mm_struct *mm) static inline void destroy_context(struct mm_struct *mm)
{ {
...@@ -176,10 +185,10 @@ do { \ ...@@ -176,10 +185,10 @@ do { \
} while (0) } while (0)
#endif #endif
static inline void arch_dup_mmap(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
struct mm_struct *mm)
{ {
paravirt_arch_dup_mmap(oldmm, mm); paravirt_arch_dup_mmap(oldmm, mm);
return ldt_dup_context(oldmm, mm);
} }
static inline void arch_exit_mmap(struct mm_struct *mm) static inline void arch_exit_mmap(struct mm_struct *mm)
...@@ -281,33 +290,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, ...@@ -281,33 +290,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write); return __pkru_allows_pkey(vma_pkey(vma), write);
} }
/*
* If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
* bits. This serves two purposes. It prevents a nasty situation in
* which PCID-unaware code saves CR3, loads some other value (with PCID
* == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
* the saved ASID was nonzero. It also means that any bugs involving
* loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
* deterministically.
*/
static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
{
if (static_cpu_has(X86_FEATURE_PCID)) {
VM_WARN_ON_ONCE(asid > 4094);
return __sme_pa(mm->pgd) | (asid + 1);
} else {
VM_WARN_ON_ONCE(asid != 0);
return __sme_pa(mm->pgd);
}
}
static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
{
VM_WARN_ON_ONCE(asid > 4094);
return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
}
/* /*
* This can be used from process context to figure out what the value of * This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3(). * CR3 is without needing to do a (slow) __read_cr3().
...@@ -317,7 +299,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) ...@@ -317,7 +299,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
*/ */
static inline unsigned long __get_current_cr3_fast(void) static inline unsigned long __get_current_cr3_fast(void)
{ {
unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid)); this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */ /* For now, be very restrictive about when this can be called. */
......
...@@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ ...@@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
#define LAST_PKMAP 1024 #define LAST_PKMAP 1024
#endif #endif
#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ /*
& PMD_MASK) * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
* to avoid include recursion hell
*/
#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
#define CPU_ENTRY_AREA_BASE \
((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
#define PKMAP_BASE \
((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
#ifdef CONFIG_HIGHMEM #ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else #else
# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) # define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
#endif #endif
#define MODULES_VADDR VMALLOC_START #define MODULES_VADDR VMALLOC_START
......
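A quick arithmetic check of the 32-bit reservation above, tying it to the new NR_CPUS limit from the Kconfig hunk earlier in this diff (editor's note, not part of the patch):

#include <stdio.h>

int main(void)
{
        const unsigned long nr_cpus = 64;             /* new X86_32 BIGSMP limit */
        const unsigned long pages   = nr_cpus * 40;   /* CPU_ENTRY_AREA_PAGES    */
        const unsigned long bytes   = pages * 4096;   /* 4 kB pages on x86       */

        /* 2560 pages, i.e. 10 MB of address space carved out below FIXADDR_START. */
        printf("%lu pages = %lu MB\n", pages, bytes >> 20);
        return 0;
}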
...@@ -76,32 +76,41 @@ typedef struct { pteval_t pte; } pte_t; ...@@ -76,32 +76,41 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_MASK (~(PGDIR_SIZE - 1)) #define PGDIR_MASK (~(PGDIR_SIZE - 1))
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#ifdef CONFIG_X86_5LEVEL #ifdef CONFIG_X86_5LEVEL
#define VMALLOC_SIZE_TB _AC(16384, UL) # define VMALLOC_SIZE_TB _AC(16384, UL)
#define __VMALLOC_BASE _AC(0xff92000000000000, UL) # define __VMALLOC_BASE _AC(0xff92000000000000, UL)
#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
#else #else
#define VMALLOC_SIZE_TB _AC(32, UL) # define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
#endif #endif
#ifdef CONFIG_RANDOMIZE_MEMORY #ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base # define VMALLOC_START vmalloc_base
#define VMEMMAP_START vmemmap_base # define VMEMMAP_START vmemmap_base
#else #else
#define VMALLOC_START __VMALLOC_BASE # define VMALLOC_START __VMALLOC_BASE
#define VMEMMAP_START __VMEMMAP_BASE # define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */ #endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */ /* The module sections ends with the start of the fixmap */
#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) #define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_LEN (MODULES_END - MODULES_VADDR) #define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) #define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
#define CPU_ENTRY_AREA_PGD _AC(-3, UL)
#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
#define EARLY_DYNAMIC_PAGE_TABLES 64 #define EARLY_DYNAMIC_PAGE_TABLES 64
......
...@@ -337,12 +337,12 @@ struct x86_hw_tss { ...@@ -337,12 +337,12 @@ struct x86_hw_tss {
#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000 #define INVALID_IO_BITMAP_OFFSET 0x8000
struct SYSENTER_stack { struct entry_stack {
unsigned long words[64]; unsigned long words[64];
}; };
struct SYSENTER_stack_page { struct entry_stack_page {
struct SYSENTER_stack stack; struct entry_stack stack;
} __aligned(PAGE_SIZE); } __aligned(PAGE_SIZE);
struct tss_struct { struct tss_struct {
......
...@@ -16,7 +16,7 @@ enum stack_type { ...@@ -16,7 +16,7 @@ enum stack_type {
STACK_TYPE_TASK, STACK_TYPE_TASK,
STACK_TYPE_IRQ, STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ, STACK_TYPE_SOFTIRQ,
STACK_TYPE_SYSENTER, STACK_TYPE_ENTRY,
STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
}; };
...@@ -29,7 +29,7 @@ struct stack_info { ...@@ -29,7 +29,7 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info); struct stack_info *info);
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask); struct stack_info *info, unsigned long *visit_mask);
......
...@@ -9,70 +9,66 @@ ...@@ -9,70 +9,66 @@
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/special_insns.h> #include <asm/special_insns.h>
#include <asm/smp.h> #include <asm/smp.h>
#include <asm/invpcid.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr, static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
unsigned long type)
{ {
struct { u64 d[2]; } desc = { { pcid, addr } };
/* /*
* The memory clobber is because the whole point is to invalidate * Bump the generation count. This also serves as a full barrier
* stale TLB entries and, especially if we're flushing global * that synchronizes with switch_mm(): callers are required to order
* mappings, we don't want the compiler to reorder any subsequent * their read of mm_cpumask after their writes to the paging
* memory accesses before the TLB flush. * structures.
*
* The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
* invpcid (%rcx), %rax in long mode.
*/ */
asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" return atomic64_inc_return(&mm->context.tlb_gen);
: : "m" (desc), "a" (type), "c" (&desc) : "memory");
} }
#define INVPCID_TYPE_INDIV_ADDR 0 /* There are 12 bits of space for ASIDS in CR3 */
#define INVPCID_TYPE_SINGLE_CTXT 1 #define CR3_HW_ASID_BITS 12
#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 /*
#define INVPCID_TYPE_ALL_NON_GLOBAL 3 * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
#define PTI_CONSUMED_ASID_BITS 0
/* Flush all mappings for a given pcid and addr, not including globals. */ #define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
static inline void invpcid_flush_one(unsigned long pcid, /*
unsigned long addr) * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
{ * for them being zero-based. Another -1 is because ASID 0 is reserved for
__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); * use by non-PCID-aware users.
} */
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
/* Flush all mappings for a given PCID, not including globals. */ static inline u16 kern_pcid(u16 asid)
static inline void invpcid_flush_single_context(unsigned long pcid)
{ {
__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
* If PCID is on, ASID-aware code paths put the ASID+1 into the
* PCID bits. This serves two purposes. It prevents a nasty
* situation in which PCID-unaware code saves CR3, loads some other
* value (with PCID == 0), and then restores CR3, thus corrupting
* the TLB for ASID 0 if the saved ASID was nonzero. It also means
* that any bugs involving loading a PCID-enabled CR3 with
* CR4.PCIDE off will trigger deterministically.
*/
return asid + 1;
} }
/* Flush all mappings, including globals, for all PCIDs. */ struct pgd_t;
static inline void invpcid_flush_all(void) static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{ {
__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); if (static_cpu_has(X86_FEATURE_PCID)) {
return __sme_pa(pgd) | kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
return __sme_pa(pgd);
}
} }
/* Flush all mappings for all PCIDs except globals. */ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
static inline void invpcid_flush_all_nonglobals(void)
{ {
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
} VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
u64 new_tlb_gen;
/*
* Bump the generation count. This also serves as a full barrier
* that synchronizes with switch_mm(): callers are required to order
* their read of mm_cpumask after their writes to the paging
* structures.
*/
smp_mb__before_atomic();
new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
smp_mb__after_atomic();
return new_tlb_gen;
} }
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
...@@ -237,6 +233,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) ...@@ -237,6 +233,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
extern void initialize_tlbstate_and_flush(void); extern void initialize_tlbstate_and_flush(void);
/*
* flush the entire current user mapping
*/
static inline void __native_flush_tlb(void) static inline void __native_flush_tlb(void)
{ {
/* /*
...@@ -249,20 +248,12 @@ static inline void __native_flush_tlb(void) ...@@ -249,20 +248,12 @@ static inline void __native_flush_tlb(void)
preempt_enable(); preempt_enable();
} }
static inline void __native_flush_tlb_global_irq_disabled(void) /*
{ * flush everything
unsigned long cr4; */
cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
}
static inline void __native_flush_tlb_global(void) static inline void __native_flush_tlb_global(void)
{ {
unsigned long flags; unsigned long cr4, flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) { if (static_cpu_has(X86_FEATURE_INVPCID)) {
/* /*
...@@ -280,22 +271,36 @@ static inline void __native_flush_tlb_global(void) ...@@ -280,22 +271,36 @@ static inline void __native_flush_tlb_global(void)
*/ */
raw_local_irq_save(flags); raw_local_irq_save(flags);
__native_flush_tlb_global_irq_disabled(); cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* toggle PGE */
native_write_cr4(cr4 ^ X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
raw_local_irq_restore(flags); raw_local_irq_restore(flags);
} }
/*
* flush one page in the user mapping
*/
static inline void __native_flush_tlb_single(unsigned long addr) static inline void __native_flush_tlb_single(unsigned long addr)
{ {
asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
} }
/*
* flush everything
*/
static inline void __flush_tlb_all(void) static inline void __flush_tlb_all(void)
{ {
if (boot_cpu_has(X86_FEATURE_PGE)) if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global(); __flush_tlb_global();
else } else {
/*
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
*/
__flush_tlb(); __flush_tlb();
}
/* /*
* Note: if we somehow had PCID but not PGE, then this wouldn't work -- * Note: if we somehow had PCID but not PGE, then this wouldn't work --
...@@ -306,6 +311,9 @@ static inline void __flush_tlb_all(void) ...@@ -306,6 +311,9 @@ static inline void __flush_tlb_all(void)
*/ */
} }
/*
* flush one page in the kernel mapping
*/
static inline void __flush_tlb_one(unsigned long addr) static inline void __flush_tlb_one(unsigned long addr)
{ {
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
......
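To make the kern_pcid()/build_cr3() scheme above concrete, here is a standalone worked example with made-up values (editor's sketch; the physical address and ASID are hypothetical, and CR3_NOFLUSH is the bit-63 "do not flush" hint):

#include <stdio.h>
#include <stdint.h>

#define CR3_NOFLUSH (1ULL << 63)        /* bit 63: reuse cached PCID entries */

/* As introduced above: hardware PCID = ASID + 1, so ASID 0 maps to PCID 1. */
static uint64_t kern_pcid(uint16_t asid) { return asid + 1; }

int main(void)
{
        uint64_t pgd_pa = 0x1234000;    /* hypothetical page-table physical address */
        uint16_t asid   = 5;            /* hypothetical kernel-side ASID            */

        printf("build_cr3:         0x%llx\n",
               (unsigned long long)(pgd_pa | kern_pcid(asid)));
        printf("build_cr3_noflush: 0x%llx\n",
               (unsigned long long)(pgd_pa | kern_pcid(asid) | CR3_NOFLUSH));
        return 0;
}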
...@@ -97,6 +97,6 @@ void common(void) { ...@@ -97,6 +97,6 @@ void common(void) {
/* Layout info for cpu_entry_area */ /* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
} }
...@@ -48,7 +48,7 @@ void foo(void) ...@@ -48,7 +48,7 @@ void foo(void)
/* Offset from the sysenter stack to tss.sp0 */ /* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); offsetofend(struct cpu_entry_area, entry_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR #ifdef CONFIG_CC_STACKPROTECTOR
BLANK(); BLANK();
......
...@@ -506,102 +506,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { ...@@ -506,102 +506,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ [DEBUG_STACK - 1] = DEBUG_STKSZ
}; };
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
SYSENTER_stack_storage);
static void __init
set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
{
for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
}
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
extern char _entry_trampoline[];
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
*
* On Xen PV, the GDT must be read-only because the hypervisor
* requires it.
*/
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif
__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
PAGE_KERNEL);
/*
* The Intel SDM says (Volume 3, 7.2.1):
*
* Avoid placing a page boundary in the part of the TSS that the
* processor reads during a task switch (the first 104 bytes). The
* processor may not correctly perform address translations if a
* boundary occurs in this area. During a task switch, the processor
* reads and writes into the first 104 bytes of each TSS (using
* contiguous physical addresses beginning with the physical address
* of the first byte of the TSS). So, after TSS access begins, if
* part of the 104 bytes is not physically contiguous, the processor
* will access incorrect information without generating a page-fault
* exception.
*
* There are also a lot of errata involving the TSS spanning a page
* boundary. Assert that we're not doing that.
*/
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE,
tss_prot);
#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif #endif
#ifdef CONFIG_X86_64
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE,
PAGE_KERNEL);
__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
}
void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
for_each_possible_cpu(cpu)
setup_cpu_entry_area(cpu);
}
/* Load the original GDT from the per-cpu structure */ /* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu) void load_direct_gdt(int cpu)
{ {
...@@ -1348,7 +1254,7 @@ void enable_sep_cpu(void) ...@@ -1348,7 +1254,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS; tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu(); put_cpu();
...@@ -1465,7 +1371,7 @@ void syscall_init(void) ...@@ -1465,7 +1371,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/ */
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else #else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
...@@ -1680,7 +1586,7 @@ void cpu_init(void) ...@@ -1680,7 +1586,7 @@ void cpu_init(void)
*/ */
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc(); load_TR_desc();
load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm); load_mm_ldt(&init_mm);
......
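The setup code removed here reappears in the new arch/x86/mm/cpu_entry_area.c (not shown in this excerpt), where the __set_fixmap() calls are replaced by a small helper that installs PTEs at cpu_entry_area virtual addresses. Roughly, as an editor's assumption about that helper's shape:

/* Sketch of the cea_set_pte() helper declared in asm/cpu_entry_area.h. */
void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
{
        unsigned long va = (unsigned long)cea_vaddr;

        set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
}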
...@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci) ...@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
} }
#else #else
/*
* Flush global tlb. We only do this in x86_64 where paging has been enabled
* already and PGE should be enabled as well.
*/
static inline void flush_tlb_early(void)
{
__native_flush_tlb_global_irq_disabled();
}
static inline void print_ucode(struct ucode_cpu_info *uci) static inline void print_ucode(struct ucode_cpu_info *uci)
{ {
struct microcode_intel *mc; struct microcode_intel *mc;
...@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) ...@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
if (rev != mc->hdr.rev) if (rev != mc->hdr.rev)
return -1; return -1;
#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
uci->cpu_sig.rev = rev; uci->cpu_sig.rev = rev;
if (early) if (early)
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <linux/nmi.h> #include <linux/nmi.h>
#include <linux/sysfs.h> #include <linux/sysfs.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
#include <asm/unwind.h> #include <asm/unwind.h>
...@@ -43,9 +44,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, ...@@ -43,9 +44,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true; return true;
} }
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) bool in_entry_stack(unsigned long *stack, struct stack_info *info)
{ {
struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
void *begin = ss; void *begin = ss;
void *end = ss + 1; void *end = ss + 1;
...@@ -53,7 +54,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) ...@@ -53,7 +54,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
if ((void *)stack < begin || (void *)stack >= end) if ((void *)stack < begin || (void *)stack >= end)
return false; return false;
info->type = STACK_TYPE_SYSENTER; info->type = STACK_TYPE_ENTRY;
info->begin = begin; info->begin = begin;
info->end = end; info->end = end;
info->next_sp = NULL; info->next_sp = NULL;
...@@ -111,13 +112,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, ...@@ -111,13 +112,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack * - task stack
* - interrupt stack * - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce) * - HW exception stacks (double fault, nmi, debug, mce)
* - SYSENTER stack * - entry stack
* *
* x86-32 can have up to four stacks: * x86-32 can have up to four stacks:
* - task stack * - task stack
* - softirq stack * - softirq stack
* - hardirq stack * - hardirq stack
* - SYSENTER stack * - entry stack
*/ */
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name; const char *stack_name;
......
...@@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type) ...@@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ) if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ"; return "SOFTIRQ";
if (type == STACK_TYPE_SYSENTER) if (type == STACK_TYPE_ENTRY)
return "SYSENTER"; return "ENTRY_TRAMPOLINE";
return NULL; return NULL;
} }
...@@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, ...@@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current) if (task != current)
goto unknown; goto unknown;
if (in_sysenter_stack(stack, info)) if (in_entry_stack(stack, info))
goto recursion_check; goto recursion_check;
if (in_hardirq_stack(stack, info)) if (in_hardirq_stack(stack, info))
......
...@@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type) ...@@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ) if (type == STACK_TYPE_IRQ)
return "IRQ"; return "IRQ";
if (type == STACK_TYPE_SYSENTER) if (type == STACK_TYPE_ENTRY) {
return "SYSENTER"; /*
* On 64-bit, we have a generic entry stack that we
* use for all the kernel entry points, including
* SYSENTER.
*/
return "ENTRY_TRAMPOLINE";
}
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION]; return exception_stack_names[type - STACK_TYPE_EXCEPTION];
...@@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, ...@@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info)) if (in_irq_stack(stack, info))
goto recursion_check; goto recursion_check;
if (in_sysenter_stack(stack, info)) if (in_entry_stack(stack, info))
goto recursion_check; goto recursion_check;
goto unknown; goto unknown;
......
...@@ -5,6 +5,11 @@ ...@@ -5,6 +5,11 @@
* Copyright (C) 2002 Andi Kleen * Copyright (C) 2002 Andi Kleen
* *
* This handles calls from both 32bit and 64bit mode. * This handles calls from both 32bit and 64bit mode.
*
* Lock order:
 * context.ldt_usr_sem
* mmap_sem
* context.lock
*/ */
#include <linux/errno.h> #include <linux/errno.h>
...@@ -42,7 +47,7 @@ static void refresh_ldt_segments(void) ...@@ -42,7 +47,7 @@ static void refresh_ldt_segments(void)
#endif #endif
} }
/* context.lock is held for us, so we don't need any locking. */ /* context.lock is held by the task which issued the smp function call */
static void flush_ldt(void *__mm) static void flush_ldt(void *__mm)
{ {
struct mm_struct *mm = __mm; struct mm_struct *mm = __mm;
...@@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) ...@@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
} }
/* context.lock is held */ static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
static void install_ldt(struct mm_struct *current_mm,
struct ldt_struct *ldt)
{ {
mutex_lock(&mm->context.lock);
/* Synchronizes with READ_ONCE in load_mm_ldt. */ /* Synchronizes with READ_ONCE in load_mm_ldt. */
smp_store_release(&current_mm->context.ldt, ldt); smp_store_release(&mm->context.ldt, ldt);
/* Activate the LDT for all CPUs using current_mm. */ /* Activate the LDT for all CPUs using current's mm. */
on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
mutex_unlock(&mm->context.lock);
} }
static void free_ldt_struct(struct ldt_struct *ldt) static void free_ldt_struct(struct ldt_struct *ldt)
...@@ -124,27 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt) ...@@ -124,27 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
} }
/* /*
* we do not have to muck with descriptors here, that is * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
* done in switch_mm() as needed. * the new task is not running, so nothing can be installed.
*/ */
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{ {
struct ldt_struct *new_ldt; struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
int retval = 0; int retval = 0;
mutex_init(&mm->context.lock); if (!old_mm)
old_mm = current->mm;
if (!old_mm) {
mm->context.ldt = NULL;
return 0; return 0;
}
mutex_lock(&old_mm->context.lock); mutex_lock(&old_mm->context.lock);
if (!old_mm->context.ldt) { if (!old_mm->context.ldt)
mm->context.ldt = NULL;
goto out_unlock; goto out_unlock;
}
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) { if (!new_ldt) {
...@@ -180,7 +180,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) ...@@ -180,7 +180,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
unsigned long entries_size; unsigned long entries_size;
int retval; int retval;
mutex_lock(&mm->context.lock); down_read(&mm->context.ldt_usr_sem);
if (!mm->context.ldt) { if (!mm->context.ldt) {
retval = 0; retval = 0;
...@@ -209,7 +209,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) ...@@ -209,7 +209,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
retval = bytecount; retval = bytecount;
out_unlock: out_unlock:
mutex_unlock(&mm->context.lock); up_read(&mm->context.ldt_usr_sem);
return retval; return retval;
} }
...@@ -269,7 +269,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ...@@ -269,7 +269,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
ldt.avl = 0; ldt.avl = 0;
} }
mutex_lock(&mm->context.lock); if (down_write_killable(&mm->context.ldt_usr_sem))
return -EINTR;
old_ldt = mm->context.ldt; old_ldt = mm->context.ldt;
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
...@@ -291,7 +292,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ...@@ -291,7 +292,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
error = 0; error = 0;
out_unlock: out_unlock:
mutex_unlock(&mm->context.lock); up_write(&mm->context.ldt_usr_sem);
out: out:
return error; return error;
} }
......
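The ldt.c rework above splits the locking in two: the modify_ldt() syscall paths now serialize against each other with the read/write semaphore context.ldt_usr_sem, while install_ldt() takes context.lock itself around publishing the new LDT and flushing it on all CPUs. A rough user-space analogy of that nesting, not kernel code; the names below are placeholders mirroring the two mm_context_t fields:

/* Rough user-space analogy of the new LDT lock nesting (not kernel code). */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t ldt_usr_sem = PTHREAD_RWLOCK_INITIALIZER; /* stands in for context.ldt_usr_sem */
static pthread_mutex_t context_lock = PTHREAD_MUTEX_INITIALIZER;  /* stands in for context.lock */

static void install_ldt_sketch(void)
{
        /* install_ldt() now takes context.lock itself around publish + flush */
        pthread_mutex_lock(&context_lock);
        /* ...publish the new LDT, then flush it on all CPUs using this mm... */
        pthread_mutex_unlock(&context_lock);
}

static void write_ldt_sketch(void)
{
        /* write_ldt() serializes user-visible LDT updates with the outer semaphore */
        pthread_rwlock_wrlock(&ldt_usr_sem);
        /* (in the kernel, mmap_sem may nest here, between the two locks) */
        install_ldt_sketch();             /* context.lock nests innermost */
        pthread_rwlock_unlock(&ldt_usr_sem);
}

static void read_ldt_sketch(void)
{
        /* read_ldt() only needs the semaphore in read mode */
        pthread_rwlock_rdlock(&ldt_usr_sem);
        pthread_rwlock_unlock(&ldt_usr_sem);
}

int main(void)
{
        write_ldt_sketch();
        read_ldt_sketch();
        puts("lock order: ldt_usr_sem -> mmap_sem -> context.lock");
        return 0;
}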
...@@ -932,12 +932,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, ...@@ -932,12 +932,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
initial_code = (unsigned long)start_secondary; initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp; initial_stack = idle->thread.sp;
/* /* Enable the espfix hack for this CPU */
* Enable the espfix hack for this CPU
*/
#ifdef CONFIG_X86_ESPFIX64
init_espfix_ap(cpu); init_espfix_ap(cpu);
#endif
/* So we see what's up */ /* So we see what's up */
announce_cpu(cpu, apicid); announce_cpu(cpu, apicid);
......
...@@ -51,6 +51,7 @@ ...@@ -51,6 +51,7 @@
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/fpu/internal.h> #include <asm/fpu/internal.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h> #include <asm/mce.h>
#include <asm/fixmap.h> #include <asm/fixmap.h>
#include <asm/mach_traps.h> #include <asm/mach_traps.h>
...@@ -951,8 +952,9 @@ void __init trap_init(void) ...@@ -951,8 +952,9 @@ void __init trap_init(void)
* "sidt" instruction will not leak the location of the kernel, and * "sidt" instruction will not leak the location of the kernel, and
* to defend the IDT against arbitrary memory write vulnerabilities. * to defend the IDT against arbitrary memory write vulnerabilities.
* It will be reloaded in cpu_init() */ * It will be reloaded in cpu_init() */
__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
idt_descr.address = fix_to_virt(FIX_RO_IDT); PAGE_KERNEL_RO);
idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
/* /*
* Should be a barrier for any external CPU state: * Should be a barrier for any external CPU state:
......
...@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg ...@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
endif endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
# Make sure __phys_addr has no stackprotector # Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector) nostackp := $(call cc-option, -fno-stack-protector)
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/desc.h>
static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return (struct cpu_entry_area *) va;
}
EXPORT_SYMBOL(get_cpu_entry_area);
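get_cpu_entry_area() is plain index arithmetic: every CPU owns a fixed-size, page-aligned slot at CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE inside the new dedicated address-space region. A minimal stand-alone sketch of that indexing; the base and slot-size constants below are made-up placeholders, not the real kernel values:

/* Stand-alone sketch of the per-CPU entry area indexing (placeholder constants). */
#include <stdio.h>

#define CEA_BASE_SKETCH 0xff000000UL  /* placeholder, not the real CPU_ENTRY_AREA_PER_CPU */
#define CEA_SIZE_SKETCH 0x40000UL     /* placeholder per-CPU slot size, page aligned */

static unsigned long cea_for_cpu(int cpu)
{
        return CEA_BASE_SKETCH + (unsigned long)cpu * CEA_SIZE_SKETCH;
}

int main(void)
{
        for (int cpu = 0; cpu < 4; cpu++)
                printf("cpu %d -> entry area slot at 0x%lx\n", cpu, cea_for_cpu(cpu));
        return 0;
}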
void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
{
unsigned long va = (unsigned long) cea_vaddr;
set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
}
static void __init
cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
{
for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}
/* Set up the CPU entry area mappings only once per processor */
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
extern char _entry_trampoline[];
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
*
* On Xen PV, the GDT must be read-only because the hypervisor
* requires it.
*/
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif
cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
gdt_prot);
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
per_cpu_ptr(&entry_stack_storage, cpu), 1,
PAGE_KERNEL);
/*
* The Intel SDM says (Volume 3, 7.2.1):
*
* Avoid placing a page boundary in the part of the TSS that the
* processor reads during a task switch (the first 104 bytes). The
* processor may not correctly perform address translations if a
* boundary occurs in this area. During a task switch, the processor
* reads and writes into the first 104 bytes of each TSS (using
* contiguous physical addresses beginning with the physical address
* of the first byte of the TSS). So, after TSS access begins, if
* part of the 104 bytes is not physically contiguous, the processor
* will access incorrect information without generating a page-fault
* exception.
*
* There are also a lot of errata involving the TSS spanning a page
* boundary. Assert that we're not doing that.
*/
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif
#ifdef CONFIG_X86_64
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
}
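The first BUILD_BUG_ON() in setup_cpu_entry_area() above encodes the Intel SDM requirement with an XOR trick: if the offsets of the first byte and of one-past-the-last byte of x86_tss differ in any bit selected by PAGE_MASK, the hardware-accessed part of the TSS would not sit inside a single page. A small stand-alone demonstration of the same check on arbitrary offsets, with a placeholder page size:

/* Stand-alone demo of the "same page" check behind the first BUILD_BUG_ON above. */
#include <stdio.h>

#define PAGE_SIZE_SKETCH 4096UL
#define PAGE_MASK_SKETCH (~(PAGE_SIZE_SKETCH - 1))

/* Non-zero iff the start and (exclusive) end offsets fall into different pages. */
static unsigned long crosses_page(unsigned long start, unsigned long end)
{
        return (start ^ end) & PAGE_MASK_SKETCH;
}

int main(void)
{
        /* 104 hardware-read TSS bytes placed safely inside one page */
        printf("0x000..0x068:  %s\n", crosses_page(0x000, 0x068) ? "crosses" : "ok");
        /* the same 104 bytes straddling a page boundary would trip the assertion */
        printf("0xfe0..0x1048: %s\n", crosses_page(0xfe0, 0x1048) ? "crosses" : "ok");
        return 0;
}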
static __init void setup_cpu_entry_area_ptes(void)
{
#ifdef CONFIG_X86_32
unsigned long start, end;
BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
start = CPU_ENTRY_AREA_BASE;
end = start + CPU_ENTRY_AREA_MAP_SIZE;
/* Careful here: start + PMD_SIZE might wrap around */
for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
populate_extra_pte(start);
#endif
}
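The "Careful here" comment in setup_cpu_entry_area_ptes() is about unsigned overflow: on 32-bit the entry area lives near the top of the address space, so start += PMD_SIZE can wrap past zero, and a plain start < end test would then keep walking low addresses that were never part of the map. The extra start >= CPU_ENTRY_AREA_BASE condition stops the loop at the wrap. A tiny sketch of the guard with 32-bit arithmetic and placeholder constants:

/* Sketch of the 32-bit wraparound guard, with placeholder constants. */
#include <stdio.h>
#include <stdint.h>

#define BASE_SKETCH     0xffc00000u  /* placeholder base near the top of a 32-bit space */
#define MAP_SIZE_SKETCH 0x00200000u  /* placeholder map size */
#define PMD_SIZE_SKETCH 0x00400000u  /* placeholder PMD size (4M) */

int main(void)
{
        uint32_t start = BASE_SKETCH;
        uint32_t end   = BASE_SKETCH + MAP_SIZE_SKETCH;

        /*
         * After the first iteration start += PMD_SIZE wraps to 0.  With only
         * "start < end" the loop would then walk low addresses it was never
         * meant to touch; "start >= BASE" stops it right at the wrap.
         */
        for (; start < end && start >= BASE_SKETCH; start += PMD_SIZE_SKETCH)
                printf("populate PMD at 0x%08x\n", (unsigned)start);

        return 0;
}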
void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
setup_cpu_entry_area_ptes();
for_each_possible_cpu(cpu)
setup_cpu_entry_area(cpu);
}
...@@ -44,10 +44,12 @@ struct addr_marker { ...@@ -44,10 +44,12 @@ struct addr_marker {
unsigned long max_lines; unsigned long max_lines;
}; };
/* indices for address_markers; keep sync'd w/ address_markers below */ /* Address space marker hints */
#ifdef CONFIG_X86_64
enum address_markers_idx { enum address_markers_idx {
USER_SPACE_NR = 0, USER_SPACE_NR = 0,
#ifdef CONFIG_X86_64
KERNEL_SPACE_NR, KERNEL_SPACE_NR,
LOW_KERNEL_NR, LOW_KERNEL_NR,
VMALLOC_START_NR, VMALLOC_START_NR,
...@@ -56,56 +58,74 @@ enum address_markers_idx { ...@@ -56,56 +58,74 @@ enum address_markers_idx {
KASAN_SHADOW_START_NR, KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR, KASAN_SHADOW_END_NR,
#endif #endif
# ifdef CONFIG_X86_ESPFIX64 CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR, ESPFIX_START_NR,
# endif #endif
#ifdef CONFIG_EFI
EFI_END_NR,
#endif
HIGH_KERNEL_NR, HIGH_KERNEL_NR,
MODULES_VADDR_NR, MODULES_VADDR_NR,
MODULES_END_NR, MODULES_END_NR,
#else FIXADDR_START_NR,
END_OF_SPACE_NR,
};
static struct addr_marker address_markers[] = {
[USER_SPACE_NR] = { 0, "User Space" },
[KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
[LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
[VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
[VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE, "CPU entry area" },
#ifdef CONFIG_X86_ESPFIX64
[ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
[EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
#endif
[HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
[MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
[MODULES_END_NR] = { MODULES_END, "End Modules" },
[FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
#else /* CONFIG_X86_64 */
enum address_markers_idx {
USER_SPACE_NR = 0,
KERNEL_SPACE_NR, KERNEL_SPACE_NR,
VMALLOC_START_NR, VMALLOC_START_NR,
VMALLOC_END_NR, VMALLOC_END_NR,
# ifdef CONFIG_HIGHMEM #ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR, PKMAP_BASE_NR,
# endif
FIXADDR_START_NR,
#endif #endif
CPU_ENTRY_AREA_NR,
FIXADDR_START_NR,
END_OF_SPACE_NR,
}; };
/* Address space marker hints */
static struct addr_marker address_markers[] = { static struct addr_marker address_markers[] = {
{ 0, "User Space" }, [USER_SPACE_NR] = { 0, "User Space" },
#ifdef CONFIG_X86_64 [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
{ 0x8000000000000000UL, "Kernel Space" }, [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
{ 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, [VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
{ 0/* VMALLOC_START */, "vmalloc() Area" }, #ifdef CONFIG_HIGHMEM
{ 0/* VMEMMAP_START */, "Vmemmap" }, [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#ifdef CONFIG_KASAN
{ KASAN_SHADOW_START, "KASAN shadow" },
{ KASAN_SHADOW_END, "KASAN shadow end" },
#endif #endif
# ifdef CONFIG_X86_ESPFIX64 [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
# endif [END_OF_SPACE_NR] = { -1, NULL }
# ifdef CONFIG_EFI
{ EFI_VA_END, "EFI Runtime Services" },
# endif
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
#else
{ PAGE_OFFSET, "Kernel Mapping" },
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
{ 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
{ -1, NULL } /* End of list */
}; };
#endif /* !CONFIG_X86_64 */
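The debug_pagetables rework replaces the positional C89-style initializers, which had already drifted out of sync with the index enum, with C99 designated initializers, so every marker is bound to its enum index no matter which #ifdef blocks are compiled in. A stand-alone illustration of the pattern with made-up marker names:

/* Stand-alone illustration of enum-indexed designated initializers (made-up names). */
#include <stdio.h>

enum marker_idx {
        MARKER_A = 0,
#ifdef EXAMPLE_OPTIONAL_MARKER
        MARKER_OPTIONAL,
#endif
        MARKER_B,
        MARKER_END,
};

static const char *markers[] = {
        [MARKER_A]        = "first area",
#ifdef EXAMPLE_OPTIONAL_MARKER
        [MARKER_OPTIONAL] = "optional area",
#endif
        [MARKER_B]        = "second area",  /* always lands at index MARKER_B, ifdefs or not */
        [MARKER_END]      = NULL,
};

int main(void)
{
        for (int i = 0; markers[i]; i++)
                printf("%d: %s\n", i, markers[i]);
        return 0;
}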
/* Multipliers for offsets within the PTEs */ /* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE) #define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
...@@ -140,7 +160,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) ...@@ -140,7 +160,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
static const char * const level_name[] = static const char * const level_name[] =
{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
if (!pgprot_val(prot)) { if (!(pr & _PAGE_PRESENT)) {
/* Not present */ /* Not present */
pt_dump_cont_printf(m, dmsg, " "); pt_dump_cont_printf(m, dmsg, " ");
} else { } else {
...@@ -525,8 +545,8 @@ static int __init pt_dump_init(void) ...@@ -525,8 +545,8 @@ static int __init pt_dump_init(void)
address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif # endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif #endif
return 0; return 0;
} }
__initcall(pt_dump_init); __initcall(pt_dump_init);
...@@ -50,6 +50,7 @@ ...@@ -50,6 +50,7 @@
#include <asm/setup.h> #include <asm/setup.h>
#include <asm/set_memory.h> #include <asm/set_memory.h>
#include <asm/page_types.h> #include <asm/page_types.h>
#include <asm/cpu_entry_area.h>
#include <asm/init.h> #include <asm/init.h>
#include "mm_internal.h" #include "mm_internal.h"
...@@ -766,6 +767,7 @@ void __init mem_init(void) ...@@ -766,6 +767,7 @@ void __init mem_init(void)
mem_init_print_info(NULL); mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n" printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
" cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM #ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif #endif
...@@ -777,6 +779,10 @@ void __init mem_init(void) ...@@ -777,6 +779,10 @@ void __init mem_init(void)
FIXADDR_START, FIXADDR_TOP, FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10, (FIXADDR_TOP - FIXADDR_START) >> 10,
CPU_ENTRY_AREA_BASE,
CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
CPU_ENTRY_AREA_MAP_SIZE >> 10,
#ifdef CONFIG_HIGHMEM #ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10, (LAST_PKMAP*PAGE_SIZE) >> 10,
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/sections.h> #include <asm/sections.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/cpu_entry_area.h>
extern struct range pfn_mapped[E820_MAX_ENTRIES]; extern struct range pfn_mapped[E820_MAX_ENTRIES];
...@@ -322,31 +323,33 @@ void __init kasan_init(void) ...@@ -322,31 +323,33 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]); map_range(&pfn_mapped[i]);
} }
kasan_populate_zero_shadow( shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
PAGE_SIZE); PAGE_SIZE);
shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
CPU_ENTRY_AREA_MAP_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
PAGE_SIZE); PAGE_SIZE);
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), kasan_populate_zero_shadow(
shadow_cpu_entry_begin); kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0); (unsigned long)shadow_cpu_entry_end, 0);
kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); kasan_populate_zero_shadow(shadow_cpu_entry_end,
kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt); load_cr3(init_top_pgt);
__flush_tlb_all(); __flush_tlb_all();
......
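The KASAN hunk only changes which virtual ranges get real shadow memory versus the zero shadow, now bracketing the CPU entry area explicitly; the address-to-shadow translation itself is untouched and is roughly shadow = (addr >> 3) + KASAN_SHADOW_OFFSET, one shadow byte per eight bytes of memory. A sketch of that mapping; both constants below are placeholders rather than the exact kernel values:

/* Sketch of generic KASAN address-to-shadow translation (placeholder constants). */
#include <stdio.h>
#include <stdint.h>

#define KASAN_SHADOW_SCALE_SHIFT 3                        /* 1 shadow byte covers 8 bytes */
#define KASAN_SHADOW_OFFSET_SKETCH 0xdffffc0000000000ULL  /* placeholder shadow offset */

static uint64_t mem_to_shadow_sketch(uint64_t addr)
{
        return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET_SKETCH;
}

int main(void)
{
        uint64_t cea_base = 0xfffffe0000000000ULL;  /* placeholder CPU entry area base */

        printf("shadow of 0x%llx is 0x%llx\n",
               (unsigned long long)cea_base,
               (unsigned long long)mem_to_shadow_sketch(cea_base));
        return 0;
}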
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
#include <asm/fixmap.h> #include <asm/fixmap.h>
......
...@@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free. * isn't free.
*/ */
#ifdef CONFIG_DEBUG_VM #ifdef CONFIG_DEBUG_VM
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
/* /*
* If we were to BUG here, we'd be very likely to kill * If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace. * the system so hard that we don't see the call trace.
...@@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) { if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next, new_asid)); write_cr3(build_cr3(next->pgd, new_asid));
/* /*
* NB: This gets called via leave_mm() in the idle path * NB: This gets called via leave_mm() in the idle path
...@@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else { } else {
/* The new ASID is already up to date. */ /* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next, new_asid)); write_cr3(build_cr3_noflush(next->pgd, new_asid));
/* See above wrt _rcuidle. */ /* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
...@@ -288,7 +288,7 @@ void initialize_tlbstate_and_flush(void) ...@@ -288,7 +288,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE)); !(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */ /* Force ASID 0 and force a TLB flush. */
write_cr3(build_cr3(mm, 0)); write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */ /* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
...@@ -551,7 +551,7 @@ static void do_kernel_range_flush(void *info) ...@@ -551,7 +551,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */ /* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE) for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
__flush_tlb_single(addr); __flush_tlb_one(addr);
} }
void flush_tlb_kernel_range(unsigned long start, unsigned long end) void flush_tlb_kernel_range(unsigned long start, unsigned long end)
......
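build_cr3() and build_cr3_noflush() now take the PGD pointer instead of the mm. Conceptually the CR3 value is just the physical address of the page-table root with the hardware ASID (PCID) folded into the low bits, and the noflush variant additionally sets bit 63 so a PCID-capable CPU skips the implicit TLB flush. A simplified stand-alone sketch of that packing; the +1 kernel-PCID offset and the constants are assumptions mirroring the tlbflush.h helpers, not a copy of them:

/* Simplified sketch of CR3 packing: page-table root physical address + PCID bits. */
#include <stdio.h>
#include <stdint.h>

#define CR3_NOFLUSH_SKETCH (1ULL << 63)  /* "don't flush" hint bit on PCID-capable CPUs */

/* Hardware PCID 0 stays reserved in this scheme, so ASIDs are shifted by one. */
static uint64_t kern_pcid_sketch(uint16_t asid)
{
        return (uint64_t)asid + 1;
}

static uint64_t build_cr3_sketch(uint64_t pgd_pa, uint16_t asid)
{
        return pgd_pa | kern_pcid_sketch(asid);
}

static uint64_t build_cr3_noflush_sketch(uint64_t pgd_pa, uint16_t asid)
{
        return build_cr3_sketch(pgd_pa, asid) | CR3_NOFLUSH_SKETCH;
}

int main(void)
{
        uint64_t pgd_pa = 0x1234000;  /* placeholder physical address of a PGD page */

        printf("cr3         = 0x%016llx\n", (unsigned long long)build_cr3_sketch(pgd_pa, 3));
        printf("cr3 noflush = 0x%016llx\n", (unsigned long long)build_cr3_noflush_sketch(pgd_pa, 3));
        return 0;
}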
...@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, ...@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
local_flush_tlb(); local_flush_tlb();
stat->d_alltlb++; stat->d_alltlb++;
} else { } else {
__flush_tlb_one(msg->address); __flush_tlb_single(msg->address);
stat->d_onetlb++; stat->d_onetlb++;
} }
stat->d_requestee++; stat->d_requestee++;
......
...@@ -2273,7 +2273,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) ...@@ -2273,7 +2273,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
switch (idx) { switch (idx) {
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
case FIX_RO_IDT:
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
case FIX_WP_TEST: case FIX_WP_TEST:
# ifdef CONFIG_HIGHMEM # ifdef CONFIG_HIGHMEM
...@@ -2284,7 +2283,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) ...@@ -2284,7 +2283,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif #endif
case FIX_TEXT_POKE0: case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1: case FIX_TEXT_POKE1:
case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
/* All local page mappings */ /* All local page mappings */
pte = pfn_pte(phys, prot); pte = pfn_pte(phys, prot);
break; break;
......
...@@ -7,9 +7,10 @@ ...@@ -7,9 +7,10 @@
#ifndef _ASM_GENERIC_MM_HOOKS_H #ifndef _ASM_GENERIC_MM_HOOKS_H
#define _ASM_GENERIC_MM_HOOKS_H #define _ASM_GENERIC_MM_HOOKS_H
static inline void arch_dup_mmap(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm) struct mm_struct *mm)
{ {
return 0;
} }
static inline void arch_exit_mmap(struct mm_struct *mm) static inline void arch_exit_mmap(struct mm_struct *mm)
......
...@@ -1025,6 +1025,11 @@ static inline int pmd_clear_huge(pmd_t *pmd) ...@@ -1025,6 +1025,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
struct file; struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot); unsigned long size, pgprot_t *vma_prot);
#ifndef CONFIG_X86_ESPFIX64
static inline void init_espfix_bsp(void) { }
#endif
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range #ifndef io_remap_pfn_range
......
...@@ -504,6 +504,8 @@ static void __init mm_init(void) ...@@ -504,6 +504,8 @@ static void __init mm_init(void)
pgtable_init(); pgtable_init();
vmalloc_init(); vmalloc_init();
ioremap_huge_init(); ioremap_huge_init();
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
} }
asmlinkage __visible void __init start_kernel(void) asmlinkage __visible void __init start_kernel(void)
...@@ -678,10 +680,6 @@ asmlinkage __visible void __init start_kernel(void) ...@@ -678,10 +680,6 @@ asmlinkage __visible void __init start_kernel(void)
#ifdef CONFIG_X86 #ifdef CONFIG_X86
if (efi_enabled(EFI_RUNTIME_SERVICES)) if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode(); efi_enter_virtual_mode();
#endif
#ifdef CONFIG_X86_ESPFIX64
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
#endif #endif
thread_stack_cache_init(); thread_stack_cache_init();
cred_init(); cred_init();
......
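Moving init_espfix_bsp() into mm_init() only works because the pgtable.h hunk above supplies an empty inline stub when CONFIG_X86_ESPFIX64 is off, so the call site needs no #ifdef. The same stub-over-ifdef pattern in stand-alone form, with made-up names:

/* Stand-alone form of the "empty stub instead of #ifdef at the call site" pattern. */
#include <stdio.h>

#ifdef EXAMPLE_FEATURE
static void example_feature_init(void)
{
        puts("feature initialized");
}
#else
static inline void example_feature_init(void) { }  /* compiles away when disabled */
#endif

int main(void)
{
        example_feature_init();  /* the call site stays #ifdef-free either way */
        return 0;
}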
...@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, ...@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out; goto out;
} }
/* a new mm has just been created */ /* a new mm has just been created */
arch_dup_mmap(oldmm, mm); retval = arch_dup_mmap(oldmm, mm);
retval = 0;
out: out:
up_write(&mm->mmap_sem); up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm); flush_tlb_mm(oldmm);
......
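With arch_dup_mmap() returning an int, dup_mmap() forwards whatever the architecture hook reports instead of unconditionally returning success; on x86 this lets a failed LDT copy abort fork() with an error. A minimal sketch of the propagation; the names and the -ENOMEM choice are illustrative assumptions:

/* Minimal sketch of propagating an arch hook failure out of a fork-style copy. */
#include <stdio.h>
#include <errno.h>

static int arch_dup_mmap_sketch(int fail)
{
        return fail ? -ENOMEM : 0;  /* e.g. the LDT copy ran out of memory */
}

static int dup_mmap_sketch(int fail)
{
        /* ...copy VMAs... */
        return arch_dup_mmap_sketch(fail);  /* no longer forced to 0 */
}

int main(void)
{
        printf("ok path:   %d\n", dup_mmap_sketch(0));
        printf("fail path: %d\n", dup_mmap_sketch(1));
        return 0;
}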
...@@ -627,13 +627,10 @@ static void do_multicpu_tests(void) ...@@ -627,13 +627,10 @@ static void do_multicpu_tests(void)
static int finish_exec_test(void) static int finish_exec_test(void)
{ {
/* /*
* In a sensible world, this would be check_invalid_segment(0, 1); * Older kernel versions did inherit the LDT on exec() which is
* For better or for worse, though, the LDT is inherited across exec. * wrong because exec() starts from a clean state.
* We can probably change this safely, but for now we test it.
*/ */
check_valid_segment(0, 1, check_invalid_segment(0, 1);
AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB,
42, true);
return nerrs ? 1 : 0; return nerrs ? 1 : 0;
} }
......