Commit 7a69f9c6 authored by Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Continued work to add support for 5-level paging provided by future
     Intel CPUs. In particular we switch the x86 GUP code to the generic
     implementation. (Kirill A. Shutemov)

   - Continued work to add PCID CPU support to native kernels as well.
     In this round most of the focus is on reworking/refreshing the TLB
     flush infrastructure for the upcoming PCID changes. (Andy
     Lutomirski)"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  x86/mm: Delete a big outdated comment about TLB flushing
  x86/mm: Don't reenter flush_tlb_func_common()
  x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
  x86/ftrace: Exclude functions in head64.c from function-tracing
  x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
  x86/mm: Remove reset_lazy_tlbstate()
  x86/ldt: Simplify the LDT switching logic
  x86/boot/64: Put __startup_64() into .head.text
  x86/mm: Add support for 5-level paging for KASLR
  x86/mm: Make kernel_physical_mapping_init() support 5-level paging
  x86/mm: Add sync_global_pgds() for configuration with 5-level paging
  x86/boot/64: Add support of additional page table level during early boot
  x86/boot/64: Rename init_level4_pgt and early_level4_pgt
  x86/boot/64: Rewrite startup_64() in C
  x86/boot/compressed: Enable 5-level paging during decompression stage
  x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
  x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
  x86/boot/efi: Cleanup initialization of GDT entries
  x86/asm: Fix comment in return_from_SYSCALL_64()
  x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
  ...
parents 9bc088ab 8781fb7e
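
For orientation, the address-space arithmetic behind the "5-level paging" work summarized above, as a small standalone C snippet (these are well-known architectural numbers, not taken from this diff):

	/*
	 * 4-level paging: 4 x 9 index bits + 12 offset bits = 48-bit virtual addresses
	 * 5-level paging: 5 x 9 index bits + 12 offset bits = 57-bit virtual addresses
	 */
	#include <stdio.h>

	int main(void)
	{
		for (int levels = 4; levels <= 5; levels++) {
			int va_bits = levels * 9 + 12;
			/* 2^va_bits bytes of virtual address space, reported in TiB */
			printf("%d-level paging: %d VA bits, %llu TiB\n",
			       levels, va_bits, (1ULL << va_bits) >> 40);
		}
		return 0;
	}
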
...@@ -1638,7 +1638,7 @@ config ARCH_SELECT_MEMORY_MODEL ...@@ -1638,7 +1638,7 @@ config ARCH_SELECT_MEMORY_MODEL
config HAVE_ARCH_PFN_VALID config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
config HAVE_GENERIC_RCU_GUP config HAVE_GENERIC_GUP
def_bool y def_bool y
depends on ARM_LPAE depends on ARM_LPAE
......
...@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY ...@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
config ZONE_DMA config ZONE_DMA
def_bool y def_bool y
config HAVE_GENERIC_RCU_GUP config HAVE_GENERIC_GUP
def_bool y def_bool y
config ARCH_DMA_ADDR_T_64BIT config ARCH_DMA_ADDR_T_64BIT
......
...@@ -184,7 +184,7 @@ config PPC ...@@ -184,7 +184,7 @@ config PPC
select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS select HAVE_GCC_PLUGINS
select HAVE_GENERIC_RCU_GUP select HAVE_GENERIC_GUP
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IDE select HAVE_IDE
select HAVE_IOREMAP_PROT select HAVE_IOREMAP_PROT
......
...@@ -69,7 +69,7 @@ config X86 ...@@ -69,7 +69,7 @@ config X86
select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT select BUILDTIME_EXTABLE_SORT
...@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP ...@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
bool bool
depends on STA2X11 depends on STA2X11
config HAVE_GENERIC_GUP
def_bool y
source "net/Kconfig" source "net/Kconfig"
source "drivers/Kconfig" source "drivers/Kconfig"
......
...@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c, ...@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
memset((char *)gdt->address, 0x0, gdt->size); memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address; desc = (struct desc_struct *)gdt->address;
/* The first GDT is a dummy and the second is unused. */ /* The first GDT is a dummy. */
desc += 2; desc++;
if (IS_ENABLED(CONFIG_X86_64)) {
/* __KERNEL32_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
desc++;
} else {
/* Second entry is unused on 32-bit */
desc++;
}
/* __KERNEL_CS */
desc->limit0 = 0xffff; desc->limit0 = 0xffff;
desc->base0 = 0x0000; desc->base0 = 0x0000;
desc->base1 = 0x0000; desc->base1 = 0x0000;
...@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c, ...@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
desc->p = 1; desc->p = 1;
desc->limit = 0xf; desc->limit = 0xf;
desc->avl = 0; desc->avl = 0;
desc->l = 0; if (IS_ENABLED(CONFIG_X86_64)) {
desc->d = SEG_OP_SIZE_32BIT; desc->l = 1;
desc->d = 0;
} else {
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
}
desc->g = SEG_GRANULARITY_4KB; desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00; desc->base2 = 0x00;
desc++; desc++;
/* __KERNEL_DS */
desc->limit0 = 0xffff; desc->limit0 = 0xffff;
desc->base0 = 0x0000; desc->base0 = 0x0000;
desc->base1 = 0x0000; desc->base1 = 0x0000;
...@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c, ...@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
desc->d = SEG_OP_SIZE_32BIT; desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB; desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00; desc->base2 = 0x00;
-#ifdef CONFIG_X86_64
-	/* Task segment value */
-	desc++;
-	desc->limit0 = 0x0000;
-	desc->base0 = 0x0000;
-	desc->base1 = 0x0000;
-	desc->type = SEG_TYPE_TSS;
-	desc->s = 0;
-	desc->dpl = 0;
-	desc->p = 1;
-	desc->limit = 0x0;
-	desc->avl = 0;
-	desc->l = 0;
-	desc->d = 0;
-	desc->g = SEG_GRANULARITY_4KB;
-	desc->base2 = 0x00;
-#endif /* CONFIG_X86_64 */
+	desc++;
+
+	if (IS_ENABLED(CONFIG_X86_64)) {
+		/* Task segment value */
+		desc->limit0 = 0x0000;
+		desc->base0 = 0x0000;
+		desc->base1 = 0x0000;
+		desc->type = SEG_TYPE_TSS;
+		desc->s = 0;
+		desc->dpl = 0;
+		desc->p = 1;
+		desc->limit = 0x0;
+		desc->avl = 0;
+		desc->l = 0;
+		desc->d = 0;
+		desc->g = SEG_GRANULARITY_4KB;
+		desc->base2 = 0x00;
+		desc++;
+	}
asm volatile("cli"); asm volatile("cli");
asm volatile ("lgdt %0" : : "m" (*gdt)); asm volatile ("lgdt %0" : : "m" (*gdt));
......
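
For readers cross-checking the field-by-field GDT setup above against the raw .quad descriptors in the compressed-boot head_64.S hunk further down (0x00cf9a000000ffff and friends), here is a small standalone C sketch of the architectural descriptor packing. The field names mirror the eboot.c code, but the snippet itself is illustrative, not kernel code:

	#include <stdint.h>
	#include <stdio.h>

	/* Pack one GDT descriptor from the fields efi_main() fills in above
	 * (architectural layout; illustrative, not a kernel API). */
	static uint64_t pack_desc(uint32_t base, uint32_t limit, unsigned int type,
				  unsigned int s, unsigned int dpl, unsigned int p,
				  unsigned int avl, unsigned int l, unsigned int d,
				  unsigned int g)
	{
		uint64_t desc = 0;

		desc |= limit & 0xffff;				/* limit0        */
		desc |= (uint64_t)(base & 0xffffff) << 16;	/* base0, base1  */
		desc |= (uint64_t)(type & 0xf) << 40;
		desc |= (uint64_t)(s & 1) << 44;
		desc |= (uint64_t)(dpl & 3) << 45;
		desc |= (uint64_t)(p & 1) << 47;
		desc |= (uint64_t)((limit >> 16) & 0xf) << 48;	/* limit (19:16) */
		desc |= (uint64_t)(avl & 1) << 52;
		desc |= (uint64_t)(l & 1) << 53;
		desc |= (uint64_t)(d & 1) << 54;
		desc |= (uint64_t)(g & 1) << 55;
		desc |= (uint64_t)((base >> 24) & 0xff) << 56;	/* base2         */
		return desc;
	}

	int main(void)
	{
		/* Flat 32-bit code segment as set up for __KERNEL32_CS above:
		 * prints 00cf9a000000ffff, matching the .quad in head_64.S. */
		printf("%016llx\n", (unsigned long long)
		       pack_desc(0x0, 0xfffff, 0xa /* code, exec/read */,
				 1, 0, 1, 0, 0 /* l */, 1 /* d */, 1 /* g */));

		/* Same segment with L=1, D=0 (__KERNEL_CS, 64-bit code):
		 * prints 00af9a000000ffff. */
		printf("%016llx\n", (unsigned long long)
		       pack_desc(0x0, 0xfffff, 0xa, 1, 0, 1, 0, 1, 0, 1));
		return 0;
	}
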
...@@ -346,6 +346,48 @@ preferred_addr: ...@@ -346,6 +346,48 @@ preferred_addr:
/* Set up the stack */ /* Set up the stack */
leaq boot_stack_end(%rbx), %rsp leaq boot_stack_end(%rbx), %rsp
#ifdef CONFIG_X86_5LEVEL
/* Check if 5-level paging has already been enabled */
movq %cr4, %rax
testl $X86_CR4_LA57, %eax
jnz lvl5
/*
* At this point we are in long mode with 4-level paging enabled,
* but we want to enable 5-level paging.
*
* The problem is that we cannot do it directly. Setting LA57 in
* long mode would trigger #GP. So we need to switch off long mode
* first.
*
* NOTE: This is not going to work if the bootloader put us above the
* 4G limit.
*
* The first step is to go into compatibility mode.
*/
/* Clear additional page table */
leaq lvl5_pgtable(%rbx), %rdi
xorq %rax, %rax
movq $(PAGE_SIZE/8), %rcx
rep stosq
/*
* Setup current CR3 as the first and only entry in a new top level
* page table.
*/
movq %cr3, %rdi
leaq 0x7 (%rdi), %rax
movq %rax, lvl5_pgtable(%rbx)
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq compatible_mode(%rip), %rax
pushq %rax
lretq
lvl5:
#endif
/* Zero EFLAGS */ /* Zero EFLAGS */
pushq $0 pushq $0
popfq popfq
...@@ -429,6 +471,44 @@ relocated: ...@@ -429,6 +471,44 @@ relocated:
jmp *%rax jmp *%rax
.code32 .code32
#ifdef CONFIG_X86_5LEVEL
compatible_mode:
/* Setup data and stack segments */
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %ss
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
/* Point CR3 to 5-level paging */
leal lvl5_pgtable(%ebx), %eax
movl %eax, %cr3
/* Enable PAE and LA57 mode */
movl %cr4, %eax
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
movl %eax, %cr4
/* Calculate address we are running at */
call 1f
1: popl %edi
subl $1b, %edi
/* Prepare stack for far return to Long Mode */
pushl $__KERNEL_CS
leal lvl5(%edi), %eax
push %eax
/* Enable paging back */
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
lret
#endif
no_longmode: no_longmode:
/* This isn't an x86-64 CPU so hang */ /* This isn't an x86-64 CPU so hang */
1: 1:
...@@ -442,7 +522,7 @@ gdt: ...@@ -442,7 +522,7 @@ gdt:
.word gdt_end - gdt .word gdt_end - gdt
.long gdt .long gdt
.word 0 .word 0
.quad 0x0000000000000000 /* NULL descriptor */ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */ .quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */ .quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */ .quad 0x0080890000000000 /* TS descriptor */
...@@ -486,3 +566,7 @@ boot_stack_end: ...@@ -486,3 +566,7 @@ boot_stack_end:
.balign 4096 .balign 4096
pgtable: pgtable:
.fill BOOT_PGT_SIZE, 1, 0 .fill BOOT_PGT_SIZE, 1, 0
#ifdef CONFIG_X86_5LEVEL
lvl5_pgtable:
.fill PAGE_SIZE, 1, 0
#endif
...@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context) ...@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
static struct alloc_pgt_data pgt_data; static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */ /* The top level page table entry pointer. */
static unsigned long level4p; static unsigned long top_level_pgt;
/* /*
* Mapping information structure passed to kernel_ident_mapping_init(). * Mapping information structure passed to kernel_ident_mapping_init().
...@@ -91,9 +91,15 @@ void initialize_identity_maps(void) ...@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
* If we came here via startup_32(), cr3 will be _pgtable already * If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely * and we must append to the existing area instead of entirely
* overwriting it. * overwriting it.
*
* With 5-level paging, we use '_pgtable' to allocate the p4d page table,
* the top-level page table is allocated separately.
*
* p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
* cases. On 4-level paging it's equal to 'top_level_pgt'.
*/ */
level4p = read_cr3(); top_level_pgt = read_cr3_pa();
if (level4p == (unsigned long)_pgtable) { if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
debug_putstr("booted via startup_32()\n"); debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
...@@ -103,7 +109,7 @@ void initialize_identity_maps(void) ...@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
pgt_data.pgt_buf = _pgtable; pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE; pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
level4p = (unsigned long)alloc_pgt_page(&pgt_data); top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
} }
} }
...@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size) ...@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
return; return;
/* Build the mapping. */ /* Build the mapping. */
kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
start, end); start, end);
} }
...@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size) ...@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
*/ */
void finalize_identity_maps(void) void finalize_identity_maps(void)
{ {
write_cr3(level4p); write_cr3(top_level_pgt);
} }
...@@ -265,7 +265,8 @@ return_from_SYSCALL_64: ...@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
* If width of "canonical tail" ever becomes variable, this will need * If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs. * to be updated to remain correct on both old and new CPUs.
* *
* Change top 16 bits to be the sign-extension of 47th bit * Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/ */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
......
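
The comment above describes the canonical-address fix-up; below is a minimal standalone C illustration of what the shl/sar pair computes, assuming __VIRTUAL_MASK_SHIFT is 47 with 4-level paging and 56 with 5-level paging (as used elsewhere in this series):

	#include <stdint.h>
	#include <stdio.h>

	/* Same idea as the shl/sar pair: move the most significant implemented
	 * address bit up to bit 63, then arithmetic-shift back so it is copied
	 * into all higher bits (relies on gcc/clang's arithmetic >> of signed). */
	static uint64_t canonicalize(uint64_t addr, int virtual_mask_shift)
	{
		int shift = 64 - (virtual_mask_shift + 1);

		return (uint64_t)((int64_t)(addr << shift) >> shift);
	}

	int main(void)
	{
		/* Bit 47 set: non-canonical as-is under 4-level paging. */
		printf("%016llx\n", (unsigned long long)
		       canonicalize(0x0000800000000000ULL, 47));	/* ffff800000000000 */
		printf("%016llx\n", (unsigned long long)
		       canonicalize(0x0000800000000000ULL, 56));	/* 0000800000000000 */
		return 0;
	}
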
...@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event) ...@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
static void refresh_pce(void *ignored) static void refresh_pce(void *ignored)
{ {
if (current->active_mm) load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
load_mm_cr4(current->active_mm);
} }
static void x86_pmu_event_mapped(struct perf_event *event) static void x86_pmu_event_mapped(struct perf_event *event)
...@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment) ...@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
/* IRQs are off, so this synchronizes with smp_store_release */ /* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt); ldt = lockless_dereference(current->active_mm->context.ldt);
if (!ldt || idx > ldt->size) if (!ldt || idx > ldt->nr_entries)
return 0; return 0;
desc = &ldt->entries[idx]; desc = &ldt->entries[idx];
......
...@@ -74,7 +74,7 @@ struct efi_scratch { ...@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \ __kernel_fpu_begin(); \
\ \
if (efi_scratch.use_pgd) { \ if (efi_scratch.use_pgd) { \
efi_scratch.prev_cr3 = read_cr3(); \ efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \ write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \ __flush_tlb_all(); \
} \ } \
......
...@@ -22,8 +22,8 @@ typedef struct { ...@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
unsigned int irq_resched_count; unsigned int irq_resched_count;
unsigned int irq_call_count; unsigned int irq_call_count;
unsigned int irq_tlb_count;
#endif #endif
unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR #ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count; unsigned int irq_thermal_count;
#endif #endif
......
...@@ -37,12 +37,6 @@ typedef struct { ...@@ -37,12 +37,6 @@ typedef struct {
#endif #endif
} mm_context_t; } mm_context_t;
#ifdef CONFIG_SMP
void leave_mm(int cpu); void leave_mm(int cpu);
#else
static inline void leave_mm(int cpu)
{
}
#endif
#endif /* _ASM_X86_MMU_H */ #endif /* _ASM_X86_MMU_H */
...@@ -47,7 +47,7 @@ struct ldt_struct { ...@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize. * allocations, but it's not worth trying to optimize.
*/ */
struct desc_struct *entries; struct desc_struct *entries;
unsigned int size; unsigned int nr_entries;
}; };
/* /*
...@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm) ...@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
*/ */
if (unlikely(ldt)) if (unlikely(ldt))
set_ldt(ldt->entries, ldt->size); set_ldt(ldt->entries, ldt->nr_entries);
else else
clear_LDT(); clear_LDT();
#else #else
clear_LDT(); clear_LDT();
#endif #endif
}
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* Load the LDT if either the old or new mm had an LDT.
*
* An mm will never go from having an LDT to not having an LDT. Two
* mms never share an LDT, so we don't gain anything by checking to
* see whether the LDT changed. There's also no guarantee that
* prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
* then prev->context.ldt will also be non-NULL.
*
* If we really cared, we could optimize the case where prev == next
* and we're exiting lazy mode. Most of the time, if this happens,
* we don't actually need to reload LDTR, but modify_ldt() is mostly
* used by legacy code and emulators where we don't need this level of
* performance.
*
* This uses | instead of || because it generates better code.
*/
if (unlikely((unsigned long)prev->context.ldt |
(unsigned long)next->context.ldt))
load_mm_ldt(next);
#endif
DEBUG_LOCKS_WARN_ON(preemptible()); DEBUG_LOCKS_WARN_ON(preemptible());
} }
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{ {
#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
} }
static inline int init_new_context(struct task_struct *tsk, static inline int init_new_context(struct task_struct *tsk,
...@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma) ...@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
} }
#endif #endif
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
u32 pkru = read_pkru();
if (!__pkru_allows_read(pkru, pkey))
return false;
if (write && !__pkru_allows_write(pkru, pkey))
return false;
return true;
}
/* /*
* We only want to enforce protection keys on the current process * We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other * because we effectively have no access to PKRU for other
...@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, ...@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write); return __pkru_allows_pkey(vma_pkey(vma), write);
} }
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
* It's intended to be used for code like KVM that sneakily changes CR3
* and needs to restore it. It needs to be used very carefully.
*/
static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || !in_atomic());
VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}
#endif /* _ASM_X86_MMU_CONTEXT_H */ #endif /* _ASM_X86_MMU_CONTEXT_H */
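
A minimal usage sketch for the __get_current_cr3_fast() helper added above, in the save/switch/restore pattern its comment describes; the caller and its page-table argument are hypothetical, not part of this merge:

	/* Hypothetical caller: must run with preemption off, matching the
	 * VM_WARN_ON(in_nmi() || !in_atomic()) check in the helper above. */
	static void run_on_other_pgd(unsigned long tmp_pgd_pa)
	{
		unsigned long saved_cr3 = __get_current_cr3_fast();

		write_cr3(tmp_pgd_pa);		/* temporarily use another top-level table */
		/* ... work that needs the temporary mappings ... */
		write_cr3(saved_cr3);		/* back to the CR3 that loaded_mm expects */
	}
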
...@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x) ...@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x); PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
} }
static inline unsigned long read_cr3(void) static inline unsigned long __read_cr3(void)
{ {
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
} }
...@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr) ...@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
} }
static inline void flush_tlb_others(const struct cpumask *cpumask, static inline void flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, const struct flush_tlb_info *info)
unsigned long start,
unsigned long end)
{ {
PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end); PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
} }
static inline int paravirt_pgd_alloc(struct mm_struct *mm) static inline int paravirt_pgd_alloc(struct mm_struct *mm)
......
...@@ -51,6 +51,7 @@ struct mm_struct; ...@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct; struct desc_struct;
struct task_struct; struct task_struct;
struct cpumask; struct cpumask;
struct flush_tlb_info;
/* /*
* Wrapper type for pointers to code which uses the non-standard * Wrapper type for pointers to code which uses the non-standard
...@@ -223,9 +224,7 @@ struct pv_mmu_ops { ...@@ -223,9 +224,7 @@ struct pv_mmu_ops {
void (*flush_tlb_kernel)(void); void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr); void (*flush_tlb_single)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus, void (*flush_tlb_others)(const struct cpumask *cpus,
struct mm_struct *mm, const struct flush_tlb_info *info);
unsigned long start,
unsigned long end);
/* Hooks for allocating and freeing a pagetable top-level */ /* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm); int (*pgd_alloc)(struct mm_struct *mm);
......
...@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) ...@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
#define gup_get_pte gup_get_pte
/*
* WARNING: only to be used in the get_user_pages_fast() implementation.
*
* With get_user_pages_fast(), we walk down the pagetables without taking
* any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a PTE will only either go from not
* present to present, or present to not present or both -- it will not
* switch to a completely different present page without a TLB flush in
* between; something that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
*
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
*
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees 'l' iff pte_high
* sees 'h'. We load pte_high *after* loading pte_low, which ensures we
* don't see an older value of pte_high. *Then* we recheck pte_low,
* which ensures that we haven't picked up a changed pte high. We might
* have gotten rubbish values from pte_low and pte_high, but we are
* guaranteed that pte_low will not have the present bit set *unless*
* it is 'l'. Because get_user_pages_fast() only operates on present ptes
* we're safe.
*/
static inline pte_t gup_get_pte(pte_t *ptep)
{
pte_t pte;
do {
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
} while (unlikely(pte.pte_low != ptep->pte_low));
return pte;
}
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ #endif /* _ASM_X86_PGTABLE_3LEVEL_H */
...@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud) ...@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
return 0; return 0;
} }
#endif #endif
static inline int pgd_devmap(pgd_t pgd)
{
return 0;
}
#endif #endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
...@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry; ...@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void) static inline void __meminit init_trampoline_default(void)
{ {
/* Default trampoline pgd value */ /* Default trampoline pgd value */
trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
} }
# ifdef CONFIG_RANDOMIZE_MEMORY # ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void); void __meminit init_trampoline(void);
...@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags) ...@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif #endif
} }
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
u32 pkru = read_pkru();
if (!__pkru_allows_read(pkru, pkey))
return false;
if (write && !__pkru_allows_write(pkru, pkey))
return false;
return true;
}
/*
* 'pteval' can come from a PTE, PMD or PUD. We only check
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
* same value on all 3 types.
*/
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
if (write)
need_pte_bits |= _PAGE_RW;
if ((pteval & need_pte_bits) != need_pte_bits)
return 0;
return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}
#define pte_access_permitted pte_access_permitted
static inline bool pte_access_permitted(pte_t pte, bool write)
{
return __pte_access_permitted(pte_val(pte), write);
}
#define pmd_access_permitted pmd_access_permitted
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
return __pte_access_permitted(pmd_val(pmd), write);
}
#define pud_access_permitted pud_access_permitted
static inline bool pud_access_permitted(pud_t pud, bool write)
{
return __pte_access_permitted(pud_val(pud), write);
}
#include <asm-generic/pgtable.h> #include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
......
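
The pte/pmd/pud_access_permitted() hooks above feed the generic GUP fast path mentioned in the summary. A hedged sketch of the kind of check that path performs (shape only; gup_pte_ok() is a made-up name, not the exact mm/gup.c code):

	/* Lockless fast-path test: present + user + (RW if writing) + allowed by
	 * protection keys, i.e. exactly what __pte_access_permitted() encodes. */
	static int gup_pte_ok(pte_t pte, bool write)
	{
		if (!pte_access_permitted(pte, write))
			return 0;	/* caller falls back to the slow, locked path */
		return 1;
	}
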
...@@ -14,15 +14,17 @@ ...@@ -14,15 +14,17 @@
#include <linux/bitops.h> #include <linux/bitops.h>
#include <linux/threads.h> #include <linux/threads.h>
extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512]; extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512]; extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512]; extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512]; extern pte_t level1_fixmap_pgt[512];
extern pgd_t init_level4_pgt[]; extern pgd_t init_top_pgt[];
#define swapper_pg_dir init_level4_pgt #define swapper_pg_dir init_top_pgt
extern void paging_init(void); extern void paging_init(void);
...@@ -227,6 +229,20 @@ extern void cleanup_highmap(void); ...@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
#endif /* !__ASSEMBLY__ */ #define gup_fast_permitted gup_fast_permitted
static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
int write)
{
unsigned long len, end;
len = (unsigned long)nr_pages << PAGE_SHIFT;
end = start + len;
if (end < start)
return false;
if (end >> __VIRTUAL_MASK_SHIFT)
return false;
return true;
}
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */ #endif /* _ASM_X86_PGTABLE_64_H */
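
gup_fast_permitted() above is the x86-side gate for the generic get_user_pages_fast() that the GUP commits switch to. A small illustrative caller follows; the helper name and buffer size are made up, while the get_user_pages_fast() signature is the one from this kernel generation:

	/* Pin up to 16 pages of a user buffer for writing without taking mmap_sem;
	 * the fast path falls back to the slow path internally when it cannot finish. */
	static int pin_user_buffer(unsigned long user_addr)
	{
		struct page *pages[16];
		int i, pinned;

		pinned = get_user_pages_fast(user_addr, 16, 1 /* write */, pages);

		for (i = 0; i < pinned; i++)
			put_page(pages[i]);	/* drop the references once done */

		return pinned;
	}
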
...@@ -8,4 +8,40 @@ ...@@ -8,4 +8,40 @@
#else #else
#define X86_VM_MASK 0 /* No VM86 support */ #define X86_VM_MASK 0 /* No VM86 support */
#endif #endif
/*
* CR3's layout varies depending on several things.
*
* If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
* If PAE is enabled, then CR3[11:5] is part of the PDPT address
* (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
* Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
* CR3[2:0] and CR3[11:5] are ignored.
*
* In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
*
* CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
* written as 1 to prevent the write to CR3 from flushing the TLB.
*
* On systems with SME, one bit (in a variable position!) is stolen to indicate
* that the top-level paging structure is encrypted.
*
* All of the remaining bits indicate the physical address of the top-level
* paging structure.
*
* CR3_ADDR_MASK is the mask used by read_cr3_pa().
*/
#ifdef CONFIG_X86_64
/* Mask off the address space ID bits. */
#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
#define CR3_PCID_MASK 0xFFFull
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
* a tiny bit of code size by setting all the bits.
*/
#define CR3_ADDR_MASK 0xFFFFFFFFull
#define CR3_PCID_MASK 0ull
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */ #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
...@@ -231,6 +231,14 @@ native_cpuid_reg(ebx) ...@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
native_cpuid_reg(ecx) native_cpuid_reg(ecx)
native_cpuid_reg(edx) native_cpuid_reg(edx)
/*
* Friendlier CR3 helpers.
*/
static inline unsigned long read_cr3_pa(void)
{
return __read_cr3() & CR3_ADDR_MASK;
}
static inline void load_cr3(pgd_t *pgdir) static inline void load_cr3(pgd_t *pgdir)
{ {
write_cr3(__pa(pgdir)); write_cr3(__pa(pgdir));
......
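
Tying the processor-flags.h masks and the new read_cr3_pa() helper together, a minimal sketch of decoding a raw CR3 value on a 64-bit kernel (decode_cr3() is a made-up helper, for illustration only):

	static void decode_cr3(void)
	{
		unsigned long cr3  = __read_cr3();		/* full register contents */
		unsigned long pa   = cr3 & CR3_ADDR_MASK;	/* what read_cr3_pa() returns */
		unsigned long pcid = cr3 & CR3_PCID_MASK;	/* address space ID, if CR4.PCIDE=1 */

		pr_info("CR3: page table at %#lx, PCID %#lx\n", pa, pcid);
	}
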
...@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val) ...@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
} }
static inline unsigned long native_read_cr3(void) static inline unsigned long __native_read_cr3(void)
{ {
unsigned long val; unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
...@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x) ...@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
native_write_cr2(x); native_write_cr2(x);
} }
static inline unsigned long read_cr3(void) /*
* Careful! CR3 contains more than just an address. You probably want
* read_cr3_pa() instead.
*/
static inline unsigned long __read_cr3(void)
{ {
return native_read_cr3(); return __native_read_cr3();
} }
static inline void write_cr3(unsigned long x) static inline void write_cr3(unsigned long x)
......
#ifndef _ARCH_X86_TLBBATCH_H
#define _ARCH_X86_TLBBATCH_H
#include <linux/cpumask.h>
struct arch_tlbflush_unmap_batch {
/*
* Each bit set is a CPU that potentially has a TLB entry for one of
* the PFNs being flushed..
*/
struct cpumask cpumask;
};
#endif /* _ARCH_X86_TLBBATCH_H */
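
The batch structure above is driven by the arch_tlbbatch_add_mm() and arch_tlbbatch_flush() hooks added in the tlbflush.h hunk below. A hedged sketch of the intended pattern (flush_two_mms() is a made-up helper; in reality the batch lives in the reclaim code's per-task state, not on a stack):

	static void flush_two_mms(struct mm_struct *mm_a, struct mm_struct *mm_b)
	{
		struct arch_tlbflush_unmap_batch batch;

		cpumask_clear(&batch.cpumask);
		arch_tlbbatch_add_mm(&batch, mm_a);	/* record CPUs that may cache mm_a */
		arch_tlbbatch_add_mm(&batch, mm_b);
		arch_tlbbatch_flush(&batch);		/* one flush round instead of one per page */
	}
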
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/special_insns.h> #include <asm/special_insns.h>
#include <asm/smp.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr, static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type) unsigned long type)
...@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void) ...@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
#endif #endif
struct tlb_state {
-#ifdef CONFIG_SMP
-	struct mm_struct *active_mm;
+	/*
+	 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
+	 * are on. This means that it may not match current->active_mm,
+	 * which will contain the previous user mm when we're in lazy TLB
+	 * mode even if we've already switched back to swapper_pg_dir.
+	 */
+	struct mm_struct *loaded_mm;
	int state;
-#endif
/* /*
* Access to this CR4 shadow and to H/W CR4 is protected by * Access to this CR4 shadow and to H/W CR4 is protected by
...@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void) ...@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
* back: * back:
*/ */
preempt_disable(); preempt_disable();
native_write_cr3(native_read_cr3()); native_write_cr3(__native_read_cr3());
preempt_enable(); preempt_enable();
} }
...@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr) ...@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
* - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
* - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
* *
* ..but the i386 has somewhat limited tlb flushing capabilities, * ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up. * and page-granular flushes are available only on i486 and up.
*/ */
+struct flush_tlb_info {
+	struct mm_struct *mm;
+	unsigned long start;
+	unsigned long end;
+};
+
-#ifndef CONFIG_SMP
-
-/* "_up" is for UniProcessor.
- *
* This is a helper for other header functions. *Not* intended to be called
* directly. All global TLB flushes need to either call this, or to bump the
* vm statistics themselves.
*/
static inline void __flush_tlb_up(void)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
}
static inline void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb_all();
}
static inline void local_flush_tlb(void)
{
__flush_tlb_up();
}
static inline void flush_tlb_mm(struct mm_struct *mm)
{
if (mm == current->active_mm)
__flush_tlb_up();
}
static inline void flush_tlb_page(struct vm_area_struct *vma,
unsigned long addr)
{
if (vma->vm_mm == current->active_mm)
__flush_tlb_one(addr);
}
static inline void flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
if (vma->vm_mm == current->active_mm)
__flush_tlb_up();
}
static inline void flush_tlb_mm_range(struct mm_struct *mm,
unsigned long start, unsigned long end, unsigned long vmflag)
{
if (mm == current->active_mm)
__flush_tlb_up();
}
static inline void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
}
static inline void reset_lazy_tlbstate(void)
{
}
static inline void flush_tlb_kernel_range(unsigned long start,
unsigned long end)
{
flush_tlb_all();
}
#else /* SMP */
#include <asm/smp.h>
#define local_flush_tlb() __flush_tlb() #define local_flush_tlb() __flush_tlb()
...@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start, ...@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void); extern void flush_tlb_all(void);
extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag); unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
}
void native_flush_tlb_others(const struct cpumask *cpumask, void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, const struct flush_tlb_info *info);
unsigned long start, unsigned long end);
#define TLBSTATE_OK 1 #define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2 #define TLBSTATE_LAZY 2
static inline void reset_lazy_tlbstate(void) static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{ {
this_cpu_write(cpu_tlbstate.state, 0); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
} }
#endif /* SMP */ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT #ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \ #define flush_tlb_others(mask, info) \
native_flush_tlb_others(mask, mm, start, end) native_flush_tlb_others(mask, info)
#endif #endif
#endif /* _ASM_X86_TLBFLUSH_H */ #endif /* _ASM_X86_TLBFLUSH_H */
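
With this hunk, flush_tlb_others() takes a single struct flush_tlb_info instead of (mm, start, end). A minimal sketch of the new calling convention; the real construction lives in arch/x86/mm/tlb.c, so treat the wrapper and its checks as illustrative:

	static void flush_range_on_other_cpus(struct mm_struct *mm,
					      unsigned long start, unsigned long end)
	{
		struct flush_tlb_info info = {
			.mm	= mm,
			.start	= start,
			.end	= end,
		};

		/* Ask the other CPUs that may hold entries for 'mm' to flush them. */
		if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
			flush_tlb_others(mm_cpumask(mm), &info);
	}
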
#ifndef _ASM_X86_UV_UV_H #ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H #define _ASM_X86_UV_UV_H
#include <asm/tlbflush.h>
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
struct cpumask; struct cpumask;
...@@ -15,10 +17,7 @@ extern void uv_cpu_init(void); ...@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
extern void uv_nmi_init(void); extern void uv_nmi_init(void);
extern void uv_system_init(void); extern void uv_system_init(void);
extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, const struct flush_tlb_info *info);
unsigned long start,
unsigned long end,
unsigned int cpu);
#else /* X86_UV */ #else /* X86_UV */
...@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; } ...@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { } static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { } static inline void uv_system_init(void) { }
static inline const struct cpumask * static inline const struct cpumask *
uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, uv_flush_tlb_others(const struct cpumask *cpumask,
unsigned long start, unsigned long end, unsigned int cpu) const struct flush_tlb_info *info)
{ return cpumask; } { return cpumask; }
#endif /* X86_UV */ #endif /* X86_UV */
......
...@@ -104,6 +104,8 @@ ...@@ -104,6 +104,8 @@
#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) #define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ #define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
......
...@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg ...@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
endif endif
KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_head$(BITS).o := n
......
...@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void) ...@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
p4d_t *p4d; p4d_t *p4d;
/* Install the espfix pud into the kernel page directory */ /* Install the espfix pud into the kernel page directory */
pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page); p4d_populate(&init_mm, p4d, espfix_pud_page);
......
...@@ -33,17 +33,120 @@ ...@@ -33,17 +33,120 @@
/* /*
* Manage page tables very early on. * Manage page tables very early on.
*/ */
extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt = 2; static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
#define __head __section(.head.text)
static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
{
return ptr - (void *)_text + (void *)physaddr;
}
void __head __startup_64(unsigned long physaddr)
{
unsigned long load_delta, *p;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
int i;
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
for (;;);
/*
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_PAGE_MASK)
for (;;);
/* Fixup the physical addresses in the page table */
pgd = fixup_pointer(&early_top_pgt, physaddr);
pgd[pgd_index(__START_KERNEL_map)] += load_delta;
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}
pud = fixup_pointer(&level3_kernel_pgt, physaddr);
pud[510] += load_delta;
pud[511] += load_delta;
pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
pmd[506] += load_delta;
/*
* Set up the identity mapping for the switchover. These
* entries should *NOT* have the global bit set! This also
* creates a bunch of nonsense entries but that is fine --
* it avoids problems around wraparound.
*/
pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
} else {
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
}
i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
pmd_entry += physaddr;
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
pmd[idx] = pmd_entry + i * PMD_SIZE;
}
/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
pmd = fixup_pointer(level2_kernel_pgt, physaddr);
for (i = 0; i < PTRS_PER_PMD; i++) {
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
}
/* Fixup phys_base */
p = fixup_pointer(&phys_base, physaddr);
*p += load_delta;
}
/* Wipe all early page tables except for the kernel symbol map */ /* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void) static void __init reset_early_page_tables(void)
{ {
memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0; next_early_pgt = 0;
write_cr3(__pa_nodebug(early_level4_pgt)); write_cr3(__pa_nodebug(early_top_pgt));
} }
/* Create a new PMD entry */ /* Create a new PMD entry */
...@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address) ...@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
{ {
unsigned long physaddr = address - __PAGE_OFFSET; unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p; pgdval_t pgd, *pgd_p;
p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p; pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p; pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */ /* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1; return -1;
again: again:
pgd_p = &early_level4_pgt[pgd_index(address)].pgd; pgd_p = &early_top_pgt[pgd_index(address)].pgd;
pgd = *pgd_p; pgd = *pgd_p;
/* /*
...@@ -67,8 +171,25 @@ int __init early_make_pgtable(unsigned long address) ...@@ -67,8 +171,25 @@ int __init early_make_pgtable(unsigned long address)
* critical -- __PAGE_OFFSET would point us back into the dynamic * critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever... * range and we might end up looping forever...
*/ */
if (pgd) if (!IS_ENABLED(CONFIG_X86_5LEVEL))
pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); p4d_p = pgd_p;
else if (pgd)
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
goto again;
}
p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
*pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
p4d_p += p4d_index(address);
p4d = *p4d_p;
if (p4d)
pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else { else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables(); reset_early_page_tables();
...@@ -77,7 +198,7 @@ int __init early_make_pgtable(unsigned long address) ...@@ -77,7 +198,7 @@ int __init early_make_pgtable(unsigned long address)
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
} }
pud_p += pud_index(address); pud_p += pud_index(address);
pud = *pud_p; pud = *pud_p;
...@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) ...@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss(); clear_bss();
clear_page(init_level4_pgt); clear_page(init_top_pgt);
kasan_early_init(); kasan_early_init();
...@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) ...@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/ */
load_ucode_bsp(); load_ucode_bsp();
/* set init_level4_pgt kernel high mapping*/ /* set init_top_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511]; init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data); x86_64_start_reservations(real_mode_data);
} }
......
...@@ -37,10 +37,11 @@ ...@@ -37,10 +37,11 @@
* *
*/ */
#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
L4_START_KERNEL = pgd_index(__START_KERNEL_map) PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text .text
...@@ -72,101 +73,12 @@ startup_64: ...@@ -72,101 +73,12 @@ startup_64:
/* Sanitize CPU configuration */ /* Sanitize CPU configuration */
call verify_cpu call verify_cpu
/*
* Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
subq $_text - __START_KERNEL_map, %rbp
/* Is the address not 2M aligned? */
testl $~PMD_PAGE_MASK, %ebp
jnz bad_address
/*
* Is the address too large?
*/
leaq _text(%rip), %rax
shrq $MAX_PHYSMEM_BITS, %rax
jnz bad_address
/*
* Fixup the physical addresses in the page table
*/
addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)
addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
/*
* Set up the identity mapping for the switchover. These
* entries should *NOT* have the global bit set! This also
* creates a bunch of nonsense entries but that is fine --
* it avoids problems around wraparound.
*/
leaq _text(%rip), %rdi leaq _text(%rip), %rdi
leaq early_level4_pgt(%rip), %rbx pushq %rsi
call __startup_64
movq %rdi, %rax popq %rsi
shrq $PGDIR_SHIFT, %rax
leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
movq %rdx, 0(%rbx,%rax,8)
movq %rdx, 8(%rbx,%rax,8)
addq $PAGE_SIZE, %rdx
movq %rdi, %rax
shrq $PUD_SHIFT, %rax
andl $(PTRS_PER_PUD-1), %eax
movq %rdx, PAGE_SIZE(%rbx,%rax,8)
incl %eax
andl $(PTRS_PER_PUD-1), %eax
movq %rdx, PAGE_SIZE(%rbx,%rax,8)
addq $PAGE_SIZE * 2, %rbx
movq %rdi, %rax
shrq $PMD_SHIFT, %rdi
addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
leaq (_end - 1)(%rip), %rcx
shrq $PMD_SHIFT, %rcx
subq %rdi, %rcx
incl %ecx
1:
andq $(PTRS_PER_PMD - 1), %rdi
movq %rax, (%rbx,%rdi,8)
incq %rdi
addq $PMD_SIZE, %rax
decl %ecx
jnz 1b
test %rbp, %rbp
jz .Lskip_fixup
/* movq $(early_top_pgt - __START_KERNEL_map), %rax
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
leaq level2_kernel_pgt(%rip), %rdi
leaq PAGE_SIZE(%rdi), %r8
/* See if it is a valid page table entry */
1: testb $_PAGE_PRESENT, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
2: addq $8, %rdi
cmp %r8, %rdi
jne 1b
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
.Lskip_fixup:
movq $(early_level4_pgt - __START_KERNEL_map), %rax
jmp 1f jmp 1f
ENTRY(secondary_startup_64) ENTRY(secondary_startup_64)
/* /*
...@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64) ...@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */ /* Sanitize CPU configuration */
call verify_cpu call verify_cpu
movq $(init_level4_pgt - __START_KERNEL_map), %rax movq $(init_top_pgt - __START_KERNEL_map), %rax
1: 1:
/* Enable PAE mode and PGE */ /* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
orl $X86_CR4_LA57, %ecx
#endif
movq %rcx, %cr4 movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */ /* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax addq phys_base(%rip), %rax
movq %rax, %cr3 movq %rax, %cr3
...@@ -417,9 +332,13 @@ GLOBAL(name) ...@@ -417,9 +332,13 @@ GLOBAL(name)
.endr .endr
__INITDATA __INITDATA
NEXT_PAGE(early_level4_pgt) NEXT_PAGE(early_top_pgt)
.fill 511,8,0 .fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
#endif
NEXT_PAGE(early_dynamic_pgts) NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
...@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts) ...@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data .data
#ifndef CONFIG_XEN #ifndef CONFIG_XEN
NEXT_PAGE(init_level4_pgt) NEXT_PAGE(init_top_pgt)
.fill 512,8,0 .fill 512,8,0
#else #else
NEXT_PAGE(init_level4_pgt) NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_START_KERNEL*8, 0 .org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
...@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt) ...@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif #endif
#ifdef CONFIG_X86_5LEVEL
NEXT_PAGE(level4_kernel_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
#endif
NEXT_PAGE(level3_kernel_pgt) NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0 .fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
......
...@@ -22,24 +22,25 @@ ...@@ -22,24 +22,25 @@
#include <asm/syscalls.h> #include <asm/syscalls.h>
/* context.lock is held for us, so we don't need any locking. */ /* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *current_mm) static void flush_ldt(void *__mm)
{ {
struct mm_struct *mm = __mm;
mm_context_t *pc; mm_context_t *pc;
if (current->active_mm != current_mm) if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return; return;
pc = &current->active_mm->context; pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->size); set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
} }
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(unsigned int size) static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{ {
struct ldt_struct *new_ldt; struct ldt_struct *new_ldt;
unsigned int alloc_size; unsigned int alloc_size;
if (size > LDT_ENTRIES) if (num_entries > LDT_ENTRIES)
return NULL; return NULL;
new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
...@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) ...@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL; return NULL;
BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
alloc_size = size * LDT_ENTRY_SIZE; alloc_size = num_entries * LDT_ENTRY_SIZE;
/* /*
* Xen is very picky: it requires a page-aligned LDT that has no * Xen is very picky: it requires a page-aligned LDT that has no
...@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) ...@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL; return NULL;
} }
new_ldt->size = size; new_ldt->nr_entries = num_entries;
return new_ldt; return new_ldt;
} }
/* After calling this, the LDT is immutable. */ /* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt) static void finalize_ldt_struct(struct ldt_struct *ldt)
{ {
paravirt_alloc_ldt(ldt->entries, ldt->size); paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
} }
/* context.lock is held */ /* context.lock is held */
...@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt) ...@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt)) if (likely(!ldt))
return; return;
paravirt_free_ldt(ldt->entries, ldt->size); paravirt_free_ldt(ldt->entries, ldt->nr_entries);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree_atomic(ldt->entries); vfree_atomic(ldt->entries);
else else
free_page((unsigned long)ldt->entries); free_page((unsigned long)ldt->entries);
...@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) ...@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
goto out_unlock; goto out_unlock;
} }
new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) { if (!new_ldt) {
retval = -ENOMEM; retval = -ENOMEM;
goto out_unlock; goto out_unlock;
} }
memcpy(new_ldt->entries, old_mm->context.ldt->entries, memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->size * LDT_ENTRY_SIZE); new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt); finalize_ldt_struct(new_ldt);
mm->context.ldt = new_ldt; mm->context.ldt = new_ldt;
...@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm) ...@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
static int read_ldt(void __user *ptr, unsigned long bytecount) static int read_ldt(void __user *ptr, unsigned long bytecount)
{ {
int retval;
unsigned long size;
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
unsigned long entries_size;
int retval;
mutex_lock(&mm->context.lock); mutex_lock(&mm->context.lock);
...@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) ...@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
size = mm->context.ldt->size * LDT_ENTRY_SIZE; entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
if (size > bytecount) if (entries_size > bytecount)
size = bytecount; entries_size = bytecount;
if (copy_to_user(ptr, mm->context.ldt->entries, size)) { if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT; retval = -EFAULT;
goto out_unlock; goto out_unlock;
} }
if (size != bytecount) { if (entries_size != bytecount) {
/* Zero-fill the rest and pretend we read bytecount bytes. */ /* Zero-fill the rest and pretend we read bytecount bytes. */
if (clear_user(ptr + size, bytecount - size)) { if (clear_user(ptr + entries_size, bytecount - entries_size)) {
retval = -EFAULT; retval = -EFAULT;
goto out_unlock; goto out_unlock;
} }
...@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ...@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{ {
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct ldt_struct *new_ldt, *old_ldt; struct ldt_struct *new_ldt, *old_ldt;
unsigned int oldsize, newsize; unsigned int old_nr_entries, new_nr_entries;
struct user_desc ldt_info; struct user_desc ldt_info;
struct desc_struct ldt; struct desc_struct ldt;
int error; int error;
...@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ...@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
mutex_lock(&mm->context.lock); mutex_lock(&mm->context.lock);
old_ldt = mm->context.ldt; old_ldt = mm->context.ldt;
oldsize = old_ldt ? old_ldt->size : 0; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
newsize = max(ldt_info.entry_number + 1, oldsize); new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
error = -ENOMEM; error = -ENOMEM;
new_ldt = alloc_ldt_struct(newsize); new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt) if (!new_ldt)
goto out_unlock; goto out_unlock;
if (old_ldt) if (old_ldt)
memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
new_ldt->entries[ldt_info.entry_number] = ldt; new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt); finalize_ldt_struct(new_ldt);
......
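The LDT rework above renames the size field of struct ldt_struct to nr_entries and derives every byte count as nr_entries * LDT_ENTRY_SIZE. Below is a minimal user-space model of that sizing logic, not the kernel code: the constants mirror the usual x86 UAPI values (LDT_ENTRIES 8192, LDT_ENTRY_SIZE 8), and plain malloc/calloc stand in for the kernel's page/vmalloc allocation split.

#include <stdlib.h>
#include <stdint.h>

#define LDT_ENTRIES     8192
#define LDT_ENTRY_SIZE  8

struct desc_struct { uint64_t raw; };          /* 8-byte stand-in for the real descriptor */

struct ldt_struct {
	struct desc_struct *entries;
	unsigned int nr_entries;               /* was "size": number of descriptors */
};

/* Allocate an LDT with room for num_entries descriptors; starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
	struct ldt_struct *ldt;

	if (num_entries > LDT_ENTRIES)
		return NULL;

	ldt = malloc(sizeof(*ldt));
	if (!ldt)
		return NULL;

	ldt->entries = calloc(num_entries, LDT_ENTRY_SIZE);
	if (!ldt->entries) {
		free(ldt);
		return NULL;
	}
	ldt->nr_entries = num_entries;
	return ldt;
}

int main(void)
{
	struct ldt_struct *ldt = alloc_ldt_struct(16);

	if (ldt) {
		free(ldt->entries);
		free(ldt);
	}
	return 0;
}

In write_ldt() above the table only ever grows: new_nr_entries = max(entry_number + 1, old_nr_entries), so the existing descriptors are copied over before the new one is stored.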
...@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image) ...@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void) void arch_crash_save_vmcoreinfo(void)
{ {
VMCOREINFO_NUMBER(phys_base); VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_level4_pgt); VMCOREINFO_SYMBOL(init_top_pgt);
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data); VMCOREINFO_SYMBOL(node_data);
......
...@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { ...@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2, .read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2, .write_cr2 = native_write_cr2,
.read_cr3 = native_read_cr3, .read_cr3 = __native_read_cr3,
.write_cr3 = native_write_cr3, .write_cr3 = native_write_cr3,
.flush_tlb_user = native_flush_tlb, .flush_tlb_user = native_flush_tlb,
......
...@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all) ...@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0(); cr0 = read_cr0();
cr2 = read_cr2(); cr2 = read_cr2();
cr3 = read_cr3(); cr3 = __read_cr3();
cr4 = __read_cr4(); cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4); cr0, cr2, cr3, cr4);
......
...@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all) ...@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0(); cr0 = read_cr0();
cr2 = read_cr2(); cr2 = read_cr2();
cr3 = read_cr3(); cr3 = __read_cr3();
cr4 = __read_cr4(); cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
...@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task) ...@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm, dead_task->comm,
dead_task->mm->context.ldt->entries, dead_task->mm->context.ldt->entries,
dead_task->mm->context.ldt->size); dead_task->mm->context.ldt->nr_entries);
BUG(); BUG();
} }
#endif #endif
......
...@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu) ...@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
void play_dead_common(void) void play_dead_common(void)
{ {
idle_task_exit(); idle_task_exit();
reset_lazy_tlbstate();
/* Ack it */ /* Ack it */
(void)cpu_report_death(); (void)cpu_report_death();
......
...@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re ...@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
mutex_lock(&child->mm->context.lock); mutex_lock(&child->mm->context.lock);
if (unlikely(!child->mm->context.ldt || if (unlikely(!child->mm->context.ldt ||
seg >= child->mm->context.ldt->size)) seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */ addr = -1L; /* bogus selector, access would fault */
else { else {
desc = &child->mm->context.ldt->entries[seg]; desc = &child->mm->context.ldt->entries[seg];
......
...@@ -49,6 +49,7 @@ ...@@ -49,6 +49,7 @@
#include <asm/kexec.h> #include <asm/kexec.h>
#include <asm/apic.h> #include <asm/apic.h>
#include <asm/irq_remapping.h> #include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
#include "trace.h" #include "trace.h"
#include "pmu.h" #include "pmu.h"
...@@ -597,6 +598,7 @@ struct vcpu_vmx { ...@@ -597,6 +598,7 @@ struct vcpu_vmx {
int gs_ldt_reload_needed; int gs_ldt_reload_needed;
int fs_reload_needed; int fs_reload_needed;
u64 msr_host_bndcfgs; u64 msr_host_bndcfgs;
unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */ unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state; } host_state;
struct { struct {
...@@ -5013,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) ...@@ -5013,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32; u32 low32, high32;
unsigned long tmpl; unsigned long tmpl;
struct desc_ptr dt; struct desc_ptr dt;
unsigned long cr0, cr4; unsigned long cr0, cr3, cr4;
cr0 = read_cr0(); cr0 = read_cr0();
WARN_ON(cr0 & X86_CR0_TS); WARN_ON(cr0 & X86_CR0_TS);
vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
/*
* Save the most likely value for this task's CR3 in the VMCS.
* We can't use __get_current_cr3_fast() because we're not atomic.
*/
cr3 = __read_cr3();
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
vmx->host_state.vmcs_host_cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */ /* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow(); cr4 = cr4_read_shadow();
...@@ -8822,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) ...@@ -8822,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long debugctlmsr, cr4; unsigned long debugctlmsr, cr3, cr4;
/* Don't enter VMX if guest state is invalid, let the exit handler /* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */ start emulation until we arrive back to a valid state */
...@@ -8844,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ...@@ -8844,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
cr3 = __get_current_cr3_fast();
if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
vmcs_writel(HOST_CR3, cr3);
vmx->host_state.vmcs_host_cr3 = cr3;
}
cr4 = cr4_read_shadow(); cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4); vmcs_writel(HOST_CR4, cr4);
......
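The vmx.c hunks above add a cached copy of the CR3 value last written to the VMCS HOST_CR3 field (vmcs_host_cr3) and, on each vmentry, rewrite the field only when the current CR3 differs, mirroring the existing CR4 shadow handling. Here is a small stand-alone sketch of that write-avoidance pattern; expensive_field_write() is a made-up stand-in for vmcs_writel(), not a KVM function.

#include <stdio.h>

/* Cached shadow of the value last written to the costly field. */
static unsigned long vmcs_host_cr3_shadow;

static void expensive_field_write(unsigned long val)
{
	/* models vmcs_writel(HOST_CR3, val) */
	printf("VMWRITE HOST_CR3 = %#lx\n", val);
}

/* Called on every "vmentry"; only touches the field when the value changed. */
static void sync_host_cr3(unsigned long current_cr3)
{
	if (current_cr3 != vmcs_host_cr3_shadow) {
		expensive_field_write(current_cr3);
		vmcs_host_cr3_shadow = current_cr3;
	}
}

int main(void)
{
	sync_host_cr3(0x1000);   /* first entry: write */
	sync_host_cr3(0x1000);   /* same CR3: write skipped */
	sync_host_cr3(0x2000);   /* CR3 changed since setup: write again */
	return 0;
}

The per-entry check is needed because, as the new field comment says, the value recorded at setup time "may not match real cr3" by the time the vCPU actually enters the guest.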
...@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) ...@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
#ifdef CONFIG_MODIFY_LDT_SYSCALL #ifdef CONFIG_MODIFY_LDT_SYSCALL
seg >>= 3; seg >>= 3;
mutex_lock(&current->mm->context.lock); mutex_lock(&current->mm->context.lock);
if (current->mm->context.ldt && seg < current->mm->context.ldt->size) if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
ret = current->mm->context.ldt->entries[seg]; ret = current->mm->context.ldt->entries[seg];
mutex_unlock(&current->mm->context.lock); mutex_unlock(&current->mm->context.lock);
#endif #endif
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_tlb.o := n
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o pat.o pgtable.o physaddr.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector # Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector) nostackp := $(call cc-option, -fno-stack-protector)
......
...@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, ...@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx) bool checkwx)
{ {
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt; pgd_t *start = (pgd_t *) &init_top_pgt;
#else #else
pgd_t *start = swapper_pg_dir; pgd_t *start = swapper_pg_dir;
#endif #endif
......
...@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address) ...@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
* Do _not_ use "current" here. We might be inside * Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch.. * an interrupt in the middle of a task switch..
*/ */
pgd_paddr = read_cr3(); pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k) if (!pmd_k)
return -1; return -1;
...@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn) ...@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
static void dump_pagetable(unsigned long address) static void dump_pagetable(unsigned long address)
{ {
pgd_t *base = __va(read_cr3()); pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(address)]; pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d; p4d_t *p4d;
pud_t *pud; pud_t *pud;
...@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address) ...@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
* happen within a race in page table update. In the later * happen within a race in page table update. In the later
* case just flush: * case just flush:
*/ */
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
pgd_ref = pgd_offset_k(address); pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref)) if (pgd_none(*pgd_ref))
return -1; return -1;
...@@ -555,7 +555,7 @@ static int bad_address(void *p) ...@@ -555,7 +555,7 @@ static int bad_address(void *p)
static void dump_pagetable(unsigned long address) static void dump_pagetable(unsigned long address)
{ {
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = base + pgd_index(address); pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d; p4d_t *p4d;
pud_t *pud; pud_t *pud;
...@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, ...@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
pgd_t *pgd; pgd_t *pgd;
pte_t *pte; pte_t *pte;
pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd = __va(read_cr3_pa());
pgd += pgd_index(address); pgd += pgd_index(address);
pte = lookup_address_in_pgd(pgd, address, &level); pte = lookup_address_in_pgd(pgd, address, &level);
......
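The fault.c hunks above replace open-coded read_cr3() and read_cr3() & PHYSICAL_PAGE_MASK with read_cr3_pa(), while callers that really want the raw register value move to __read_cr3(). The point of the split is that, with PCID on the way, CR3 carries more than the page-table base. Below is a self-contained sketch of the distinction; the mask values and the fake_* helpers are illustrative assumptions, not the kernel definitions.

#include <stdio.h>
#include <stdint.h>

/* Illustrative layout: low 12 bits of CR3 hold PCID/flags, the rest is an address. */
#define CR3_ADDR_MASK   0x7ffffffffffff000ull
#define CR3_PCID_MASK   0x0000000000000fffull

/* Model of __read_cr3(): the raw register value (here just a sample). */
static uint64_t fake_read_cr3_raw(void)
{
	return 0x000000012345f000ull | 0x005ull;   /* page-table base plus PCID 5 */
}

/* Model of read_cr3_pa(): strip everything that is not the page-table address. */
static uint64_t fake_read_cr3_pa(void)
{
	return fake_read_cr3_raw() & CR3_ADDR_MASK;
}

int main(void)
{
	printf("raw CR3:         %#llx\n", (unsigned long long)fake_read_cr3_raw());
	printf("page-table base: %#llx\n", (unsigned long long)fake_read_cr3_pa());
	return 0;
}

Code that feeds the result to __va() or a page-table walk wants the masked form; code that saves and restores the whole register (the EFI and suspend hunks further down) wants the raw form.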
...@@ -811,10 +811,8 @@ void __init zone_sizes_init(void) ...@@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
} }
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
#ifdef CONFIG_SMP .loaded_mm = &init_mm,
.active_mm = &init_mm,
.state = 0, .state = 0,
#endif
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
}; };
EXPORT_SYMBOL_GPL(cpu_tlbstate); EXPORT_SYMBOL_GPL(cpu_tlbstate);
......
...@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup); ...@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup);
* When memory was added make sure all the processes MM have * When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page. * suitable PGD entries in the local PGD level page.
*/ */
#ifdef CONFIG_X86_5LEVEL
void sync_global_pgds(unsigned long start, unsigned long end)
{
unsigned long addr;
for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
const pgd_t *pgd_ref = pgd_offset_k(addr);
struct page *page;
/* Check for overflow */
if (addr < start)
break;
if (pgd_none(*pgd_ref))
continue;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
spinlock_t *pgt_lock;
pgd = (pgd_t *)page_address(page) + pgd_index(addr);
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
spin_unlock(pgt_lock);
}
spin_unlock(&pgd_lock);
}
}
#else
void sync_global_pgds(unsigned long start, unsigned long end) void sync_global_pgds(unsigned long start, unsigned long end)
{ {
unsigned long addr; unsigned long addr;
...@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) ...@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock); spin_unlock(&pgd_lock);
} }
} }
#endif
/* /*
* NOTE: This function is marked __ref because it calls __init function * NOTE: This function is marked __ref because it calls __init function
...@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, ...@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
return paddr_last; return paddr_last;
} }
static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
unsigned long page_size_mask)
{
unsigned long paddr_next, paddr_last = paddr_end;
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);
if (!IS_ENABLED(CONFIG_X86_5LEVEL))
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
p4d_t *p4d;
pud_t *pud;
vaddr = (unsigned long)__va(paddr);
p4d = p4d_page + p4d_index(vaddr);
paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
if (paddr >= paddr_end) {
if (!after_bootmem &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
set_p4d(p4d, __p4d(0));
continue;
}
if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
paddr_last = phys_pud_init(pud, paddr,
paddr_end,
page_size_mask);
__flush_tlb_all();
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, paddr_end,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
p4d_populate(&init_mm, p4d, pud);
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
return paddr_last;
}
/* /*
* Create page table mapping for the physical memory for specific physical * Create page table mapping for the physical memory for specific physical
* addresses. The virtual and physical addresses have to be aligned on PMD level * addresses. The virtual and physical addresses have to be aligned on PMD level
...@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start, ...@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) { for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr); pgd_t *pgd = pgd_offset_k(vaddr);
p4d_t *p4d; p4d_t *p4d;
pud_t *pud;
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
BUILD_BUG_ON(pgd_none(*pgd)); if (pgd_val(*pgd)) {
p4d = p4d_offset(pgd, vaddr); p4d = (p4d_t *)pgd_page_vaddr(*pgd);
if (p4d_val(*p4d)) { paddr_last = phys_p4d_init(p4d, __pa(vaddr),
pud = (pud_t *)p4d_page_vaddr(*p4d);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end), __pa(vaddr_end),
page_size_mask); page_size_mask);
continue; continue;
} }
pud = alloc_low_page(); p4d = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
page_size_mask); page_size_mask);
spin_lock(&init_mm.page_table_lock); spin_lock(&init_mm.page_table_lock);
p4d_populate(&init_mm, p4d, pud); if (IS_ENABLED(CONFIG_X86_5LEVEL))
pgd_populate(&init_mm, pgd, p4d);
else
p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
spin_unlock(&init_mm.page_table_lock); spin_unlock(&init_mm.page_table_lock);
pgd_changed = true; pgd_changed = true;
} }
......
...@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; ...@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{ {
/* Don't assume we're using swapper_pg_dir at this point */ /* Don't assume we're using swapper_pg_dir at this point */
pgd_t *base = __va(read_cr3()); pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(addr)]; pgd_t *pgd = &base[pgd_index(addr)];
p4d_t *p4d = p4d_offset(pgd, addr); p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr); pud_t *pud = pud_offset(p4d, addr);
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/sections.h> #include <asm/sections.h>
extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_MAX_ENTRIES]; extern struct range pfn_mapped[E820_MAX_ENTRIES];
static int __init map_range(struct range *range) static int __init map_range(struct range *range)
...@@ -109,8 +109,8 @@ void __init kasan_early_init(void) ...@@ -109,8 +109,8 @@ void __init kasan_early_init(void)
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val); kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_level4_pgt); kasan_map_early_shadow(early_top_pgt);
kasan_map_early_shadow(init_level4_pgt); kasan_map_early_shadow(init_top_pgt);
} }
void __init kasan_init(void) void __init kasan_init(void)
...@@ -121,8 +121,8 @@ void __init kasan_init(void) ...@@ -121,8 +121,8 @@ void __init kasan_init(void)
register_die_notifier(&kasan_die_notifier); register_die_notifier(&kasan_die_notifier);
#endif #endif
memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
load_cr3(early_level4_pgt); load_cr3(early_top_pgt);
__flush_tlb_all(); __flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
...@@ -148,7 +148,7 @@ void __init kasan_init(void) ...@@ -148,7 +148,7 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END); (void *)KASAN_SHADOW_END);
load_cr3(init_level4_pgt); load_cr3(init_top_pgt);
__flush_tlb_all(); __flush_tlb_all();
/* /*
......
...@@ -6,12 +6,12 @@ ...@@ -6,12 +6,12 @@
* *
* Entropy is generated using the KASLR early boot functions now shared in * Entropy is generated using the KASLR early boot functions now shared in
* the lib directory (originally written by Kees Cook). Randomization is * the lib directory (originally written by Kees Cook). Randomization is
* done on PGD & PUD page table levels to increase possible addresses. The * done on PGD & P4D/PUD page table levels to increase possible addresses.
* physical memory mapping code was adapted to support PUD level virtual * The physical memory mapping code was adapted to support P4D/PUD level
* addresses. This implementation on the best configuration provides 30,000 * virtual addresses. This implementation on the best configuration provides
* possible virtual addresses in average for each memory region. An additional * 30,000 possible virtual addresses in average for each memory region.
* low memory page is used to ensure each CPU can start with a PGD aligned * An additional low memory page is used to ensure each CPU can start with
* virtual address (for realmode). * a PGD aligned virtual address (for realmode).
* *
* The order of each memory region is not changed. The feature looks at * The order of each memory region is not changed. The feature looks at
* the available space for the regions based on different configuration * the available space for the regions based on different configuration
...@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region { ...@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
unsigned long *base; unsigned long *base;
unsigned long size_tb; unsigned long size_tb;
} kaslr_regions[] = { } kaslr_regions[] = {
{ &page_offset_base, 64/* Maximum */ }, { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB }, { &vmalloc_base, VMALLOC_SIZE_TB },
{ &vmemmap_base, 1 }, { &vmemmap_base, 1 },
}; };
...@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void) ...@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
*/ */
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand)); prandom_bytes_state(&rand_state, &rand, sizeof(rand));
entropy = (rand % (entropy + 1)) & PUD_MASK; if (IS_ENABLED(CONFIG_X86_5LEVEL))
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy; vaddr += entropy;
*kaslr_regions[i].base = vaddr; *kaslr_regions[i].base = vaddr;
...@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void) ...@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
* randomization alignment. * randomization alignment.
*/ */
vaddr += get_padding(&kaslr_regions[i]); vaddr += get_padding(&kaslr_regions[i]);
vaddr = round_up(vaddr + 1, PUD_SIZE); if (IS_ENABLED(CONFIG_X86_5LEVEL))
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy; remain_entropy -= entropy;
} }
} }
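The kaslr.c changes above adjust both the region-size bookkeeping and the rounding granularity once 5-level paging is enabled: the direct-mapping budget becomes 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) terabytes instead of a hard-coded 64, and randomized bases are aligned on P4D rather than PUD boundaries. A worked-arithmetic sketch follows; the shift values are the conventional x86 ones and are stated here as assumptions.

#include <stdio.h>

#define TB_SHIFT   40
#define PUD_SHIFT  30   /* 1 GiB granularity, 4-level alignment  */
#define P4D_SHIFT  39   /* 512 GiB granularity, 5-level alignment */

int main(void)
{
	/* Assumed physical-mask shifts: 46 bits with 4-level, 52 bits with 5-level. */
	unsigned int pms_4level = 46, pms_5level = 52;

	/* Direct-map budget in TiB: 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT). */
	printf("direct map max, 4-level: %u TiB\n", 1u << (pms_4level - TB_SHIFT));
	printf("direct map max, 5-level: %u TiB\n", 1u << (pms_5level - TB_SHIFT));

	/* Alignment applied to the randomized entropy and region bases. */
	printf("PUD alignment: %llu GiB\n", (1ull << PUD_SHIFT) >> 30);
	printf("P4D alignment: %llu GiB\n", (1ull << P4D_SHIFT) >> 30);
	return 0;
}

With the 4-level shift value this reproduces the old 64 TiB maximum, so the rewrite only changes behaviour once the larger physical address space is actually configured.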
/* static void __meminit init_trampoline_pud(void)
* Create PGD aligned trampoline table to allow real mode initialization
* of additional CPUs. Consume only 1 low memory page.
*/
void __meminit init_trampoline(void)
{ {
unsigned long paddr, paddr_next; unsigned long paddr, paddr_next;
pgd_t *pgd; pgd_t *pgd;
pud_t *pud_page, *pud_page_tramp; pud_t *pud_page, *pud_page_tramp;
int i; int i;
if (!kaslr_memory_enabled()) {
init_trampoline_default();
return;
}
pud_page_tramp = alloc_low_page(); pud_page_tramp = alloc_low_page();
paddr = 0; paddr = 0;
...@@ -192,3 +189,49 @@ void __meminit init_trampoline(void) ...@@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
set_pgd(&trampoline_pgd_entry, set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
} }
static void __meminit init_trampoline_p4d(void)
{
unsigned long paddr, paddr_next;
pgd_t *pgd;
p4d_t *p4d_page, *p4d_page_tramp;
int i;
p4d_page_tramp = alloc_low_page();
paddr = 0;
pgd = pgd_offset_k((unsigned long)__va(paddr));
p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
p4d_t *p4d, *p4d_tramp;
unsigned long vaddr = (unsigned long)__va(paddr);
p4d_tramp = p4d_page_tramp + p4d_index(paddr);
p4d = p4d_page + p4d_index(vaddr);
paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
*p4d_tramp = *p4d;
}
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
}
/*
* Create PGD aligned trampoline table to allow real mode initialization
* of additional CPUs. Consume only 1 low memory page.
*/
void __meminit init_trampoline(void)
{
if (!kaslr_memory_enabled()) {
init_trampoline_default();
return;
}
if (IS_ENABLED(CONFIG_X86_5LEVEL))
init_trampoline_p4d();
else
init_trampoline_pud();
}
...@@ -74,9 +74,6 @@ static int mmap_is_legacy(void) ...@@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
if (current->personality & ADDR_COMPAT_LAYOUT) if (current->personality & ADDR_COMPAT_LAYOUT)
return 1; return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout; return sysctl_legacy_va_layout;
} }
......
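The mmap.c hunk above drops the rule that an unlimited stack rlimit forces the legacy mmap layout; afterwards only the ADDR_COMPAT_LAYOUT personality bit or the legacy_va_layout sysctl select it. A trivial model of the remaining decision, with both inputs reduced to booleans:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for current->personality & ADDR_COMPAT_LAYOUT and sysctl_legacy_va_layout. */
static bool addr_compat_layout;
static bool sysctl_legacy_va_layout;

/* After this change, RLIMIT_STACK == RLIM_INFINITY no longer enters the picture. */
static bool mmap_is_legacy(void)
{
	return addr_compat_layout || sysctl_legacy_va_layout;
}

int main(void)
{
	printf("legacy layout: %d\n", mmap_is_legacy());
	return 0;
}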
...@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void) ...@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
int n_pgds, i, j; int n_pgds, i, j;
if (!efi_enabled(EFI_OLD_MEMMAP)) { if (!efi_enabled(EFI_OLD_MEMMAP)) {
save_pgd = (pgd_t *)read_cr3(); save_pgd = (pgd_t *)__read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt); write_cr3((unsigned long)efi_scratch.efi_pgt);
goto out; goto out;
} }
...@@ -649,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map( ...@@ -649,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
efi_sync_low_kernel_mappings(); efi_sync_low_kernel_mappings();
local_irq_save(flags); local_irq_save(flags);
efi_scratch.prev_cr3 = read_cr3(); efi_scratch.prev_cr3 = __read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt); write_cr3((unsigned long)efi_scratch.efi_pgt);
__flush_tlb_all(); __flush_tlb_all();
......
...@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) ...@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
asmlinkage __visible int xo1_do_sleep(u8 sleep_state) asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
{ {
void *pgd_addr = __va(read_cr3()); void *pgd_addr = __va(read_cr3_pa());
/* Program wakeup mask (using dword access to CS5536_PM1_EN) */ /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
......
...@@ -1123,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, ...@@ -1123,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* done. The returned pointer is valid till preemption is re-enabled. * done. The returned pointer is valid till preemption is re-enabled.
*/ */
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, const struct flush_tlb_info *info)
unsigned long start,
unsigned long end,
unsigned int cpu)
{ {
unsigned int cpu = smp_processor_id();
int locals = 0, remotes = 0, hubs = 0; int locals = 0, remotes = 0, hubs = 0;
struct bau_desc *bau_desc; struct bau_desc *bau_desc;
struct cpumask *flush_mask; struct cpumask *flush_mask;
...@@ -1181,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, ...@@ -1181,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
record_send_statistics(stat, locals, hubs, remotes, bau_desc); record_send_statistics(stat, locals, hubs, remotes, bau_desc);
if (!end || (end - start) <= PAGE_SIZE) if (!info->end || (info->end - info->start) <= PAGE_SIZE)
address = start; address = info->start;
else else
address = TLB_FLUSH_ALL; address = TLB_FLUSH_ALL;
......
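The UV hunk above (and the Xen one further down) change the flush_tlb_others() style hooks to take a single const struct flush_tlb_info * instead of separate mm/start/end/cpu arguments, with the CPU now taken from smp_processor_id() inside the callee. A sketch of the consolidated interface follows; the struct layout shown is only the subset visible in this diff and may omit fields.

#include <stdio.h>

struct mm_struct;                      /* opaque in this sketch */

/* Illustrative shape of the descriptor handed to the flush hooks. */
struct flush_tlb_info {
	struct mm_struct *mm;
	unsigned long     start;
	unsigned long     end;
};

#define TLB_FLUSH_ALL  (-1ul)
#define PAGE_SIZE      4096ul

/* New style: one descriptor instead of four loose parameters. */
static void flush_tlb_others(const struct flush_tlb_info *info)
{
	unsigned long address;

	if (!info->end || (info->end - info->start) <= PAGE_SIZE)
		address = info->start;            /* single-page flush */
	else
		address = TLB_FLUSH_ALL;          /* ranged request collapses to "all" here */

	printf("flush at %#lx\n", address);
}

int main(void)
{
	struct flush_tlb_info info = { .mm = NULL, .start = 0x400000, .end = 0x401000 };

	flush_tlb_others(&info);
	return 0;
}

Packing the parameters into one descriptor means the flush request can later grow extra fields without touching every backend signature.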
...@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt) ...@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
*/ */
ctxt->cr0 = read_cr0(); ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2(); ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3(); ctxt->cr3 = __read_cr3();
ctxt->cr4 = __read_cr4(); ctxt->cr4 = __read_cr4();
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8(); ctxt->cr8 = read_cr8();
......
...@@ -150,7 +150,8 @@ static int relocate_restore_code(void) ...@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */ /* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); pgd = (pgd_t *)__va(read_cr3_pa()) +
pgd_index(relocated_restore_code);
p4d = p4d_offset(pgd, relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code);
if (p4d_large(*p4d)) { if (p4d_large(*p4d)) {
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
......
...@@ -102,7 +102,7 @@ static void __init setup_real_mode(void) ...@@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd; trampoline_pgd[0] = trampoline_pgd_entry.pgd;
trampoline_pgd[511] = init_level4_pgt[511].pgd; trampoline_pgd[511] = init_top_pgt[511].pgd;
#endif #endif
} }
......
...@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) ...@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
} }
static void drop_mm_ref_this_cpu(void *info)
#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{ {
struct mm_struct *mm = info; struct mm_struct *mm = info;
struct mm_struct *active_mm;
active_mm = this_cpu_read(cpu_tlbstate.active_mm);
if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
leave_mm(smp_processor_id()); leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure /*
it has been flushed. */ * If this cpu still has a stale cr3 reference, then make sure
* it has been flushed.
*/
if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir); xen_mc_flush();
} }
#ifdef CONFIG_SMP
/*
* Another cpu may still have their %cr3 pointing at the pagetable, so
* we need to repoint it somewhere else before we can unpin it.
*/
static void xen_drop_mm_ref(struct mm_struct *mm) static void xen_drop_mm_ref(struct mm_struct *mm)
{ {
cpumask_var_t mask; cpumask_var_t mask;
unsigned cpu; unsigned cpu;
if (current->active_mm == mm) { drop_mm_ref_this_cpu(mm);
if (current->mm == mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
}
/* Get the "official" set of cpus referring to our pagetable. */ /* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
...@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm) ...@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
&& per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue; continue;
smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
} }
return; return;
} }
cpumask_copy(mask, mm_cpumask(mm)); cpumask_copy(mask, mm_cpumask(mm));
/* It's possible that a vcpu may have a stale reference to our /*
cr3, because its in lazy mode, and it hasn't yet flushed * It's possible that a vcpu may have a stale reference to our
its set of pending hypercalls yet. In this case, we can * cr3, because its in lazy mode, and it hasn't yet flushed
look at its actual current cr3 value, and force it to flush * its set of pending hypercalls yet. In this case, we can
if needed. */ * look at its actual current cr3 value, and force it to flush
* if needed.
*/
for_each_online_cpu(cpu) { for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
cpumask_set_cpu(cpu, mask); cpumask_set_cpu(cpu, mask);
} }
if (!cpumask_empty(mask)) smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
free_cpumask_var(mask); free_cpumask_var(mask);
} }
#else #else
static void xen_drop_mm_ref(struct mm_struct *mm) static void xen_drop_mm_ref(struct mm_struct *mm)
{ {
if (current->active_mm == mm) drop_mm_ref_this_cpu(mm);
load_cr3(swapper_pg_dir);
} }
#endif #endif
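The Xen mmu hunks above fold the old drop_other_mm_ref() and the open-coded local case into one helper, drop_mm_ref_this_cpu(), which runs locally and, on SMP, is sent to every CPU that might still reference the mm. The single-threaded model below only mimics the per-CPU bookkeeping (the loaded mm and a possibly stale cached cr3); leave_mm(), the multicall flush and the IPIs are reduced to prints and array updates, and every name here is a stand-in.

#include <stdio.h>

#define NR_CPUS 4

struct mm { unsigned long pgd_pa; };

/* Per-CPU state the sketch models: the mm loaded in cr3 and Xen's cached cr3. */
static const struct mm *loaded_mm[NR_CPUS];
static unsigned long xen_current_cr3[NR_CPUS];

static const struct mm init_mm = { .pgd_pa = 0x1000 };

/* Consolidated helper: run on any CPU that might still reference "mm". */
static void drop_mm_ref_this_cpu(int cpu, const struct mm *mm)
{
	if (loaded_mm[cpu] == mm) {
		loaded_mm[cpu] = &init_mm;                 /* models leave_mm() */
		printf("cpu %d: left mm\n", cpu);
	}
	if (xen_current_cr3[cpu] == mm->pgd_pa) {
		xen_current_cr3[cpu] = init_mm.pgd_pa;     /* models flushing pending hypercalls */
		printf("cpu %d: flushed stale cr3\n", cpu);
	}
}

int main(void)
{
	struct mm mm = { .pgd_pa = 0x2000 };

	loaded_mm[0] = &mm;      xen_current_cr3[0] = mm.pgd_pa;   /* actively using mm  */
	loaded_mm[1] = &init_mm; xen_current_cr3[1] = mm.pgd_pa;   /* lazy, stale cr3 only */

	for (int cpu = 0; cpu < NR_CPUS; cpu++)            /* IPIs in the real code */
		drop_mm_ref_this_cpu(cpu, &mm);
	return 0;
}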
...@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr) ...@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
} }
static void xen_flush_tlb_others(const struct cpumask *cpus, static void xen_flush_tlb_others(const struct cpumask *cpus,
struct mm_struct *mm, unsigned long start, const struct flush_tlb_info *info)
unsigned long end)
{ {
struct { struct {
struct mmuext_op op; struct mmuext_op op;
...@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, ...@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
} *args; } *args;
struct multicall_space mcs; struct multicall_space mcs;
trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
if (cpumask_empty(cpus)) if (cpumask_empty(cpus))
return; /* nothing to do */ return; /* nothing to do */
...@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, ...@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { if (info->end != TLB_FLUSH_ALL &&
(info->end - info->start) <= PAGE_SIZE) {
args->op.cmd = MMUEXT_INVLPG_MULTI; args->op.cmd = MMUEXT_INVLPG_MULTI;
args->op.arg1.linear_addr = start; args->op.arg1.linear_addr = info->start;
} }
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
...@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3) ...@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
* At the start of the day - when Xen launches a guest, it has already * At the start of the day - when Xen launches a guest, it has already
* built pagetables for the guest. We diligently look over them * built pagetables for the guest. We diligently look over them
* in xen_setup_kernel_pagetable and graft as appropriate them in the * in xen_setup_kernel_pagetable and graft as appropriate them in the
* init_level4_pgt and its friends. Then when we are happy we load * init_top_pgt and its friends. Then when we are happy we load
* the new init_level4_pgt - and continue on. * the new init_top_pgt - and continue on.
* *
* The generic code starts (start_kernel) and 'init_mem_mapping' sets * The generic code starts (start_kernel) and 'init_mem_mapping' sets
* up the rest of the pagetables. When it has completed it loads the cr3. * up the rest of the pagetables. When it has completed it loads the cr3.
...@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
pt_end = pt_base + xen_start_info->nr_pt_frames; pt_end = pt_base + xen_start_info->nr_pt_frames;
/* Zap identity mapping */ /* Zap identity mapping */
init_level4_pgt[0] = __pgd(0); init_top_pgt[0] = __pgd(0);
/* Pre-constructed entries are in pfn, so convert to mfn */ /* Pre-constructed entries are in pfn, so convert to mfn */
/* L4[272] -> level3_ident_pgt */ /* L4[272] -> level3_ident_pgt */
/* L4[511] -> level3_kernel_pgt */ /* L4[511] -> level3_kernel_pgt */
convert_pfn_mfn(init_level4_pgt); convert_pfn_mfn(init_top_pgt);
/* L3_i[0] -> level2_ident_pgt */ /* L3_i[0] -> level2_ident_pgt */
convert_pfn_mfn(level3_ident_pgt); convert_pfn_mfn(level3_ident_pgt);
...@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Copy the initial P->M table mappings if necessary. */ /* Copy the initial P->M table mappings if necessary. */
i = pgd_index(xen_start_info->mfn_list); i = pgd_index(xen_start_info->mfn_list);
if (i && i < pgd_index(__START_KERNEL_map)) if (i && i < pgd_index(__START_KERNEL_map))
init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
/* Make pagetable pieces RO */ /* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
...@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Pin down new L4 */ /* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
PFN_DOWN(__pa_symbol(init_level4_pgt))); PFN_DOWN(__pa_symbol(init_top_pgt)));
/* Unpin Xen-provided one */ /* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
...@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* attach it to, so make sure we just set kernel pgd. * attach it to, so make sure we just set kernel pgd.
*/ */
xen_mc_batch(); xen_mc_batch();
__xen_write_cr3(true, __pa(init_level4_pgt)); __xen_write_cr3(true, __pa(init_top_pgt));
xen_mc_issue(PARAVIRT_LAZY_CPU); xen_mc_issue(PARAVIRT_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are /* We can't that easily rip out L3 and L2, as the Xen pagetables are
...@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) ...@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
pmd_t pmd; pmd_t pmd;
pte_t pte; pte_t pte;
pa = read_cr3(); pa = read_cr3_pa();
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
sizeof(pgd))); sizeof(pgd)));
if (!pgd_present(pgd)) if (!pgd_present(pgd))
...@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void) ...@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void)
pt_phys = pmd_phys + PFN_PHYS(n_pmd); pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt; p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3()); pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE); new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
idx_p4d = 0; idx_p4d = 0;
save_pud = n_pud; save_pud = n_pud;
...@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) ...@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
{ {
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
BUG_ON(read_cr3() != __pa(initial_page_table)); BUG_ON(read_cr3_pa() != __pa(initial_page_table));
BUG_ON(cr3 != __pa(swapper_pg_dir)); BUG_ON(cr3 != __pa(swapper_pg_dir));
/* /*
......
...@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen) ...@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
wrmsr wrmsr
/* Enable pre-constructed page tables. */ /* Enable pre-constructed page tables. */
mov $_pa(init_level4_pgt), %eax mov $_pa(init_top_pgt), %eax
mov %eax, %cr3 mov %eax, %cr3
mov $(X86_CR0_PG | X86_CR0_PE), %eax mov $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0 mov %eax, %cr0
......
...@@ -14,6 +14,10 @@ ...@@ -14,6 +14,10 @@
#include <asm/page.h> #include <asm/page.h>
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
#include <asm/tlbbatch.h>
#endif
#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
...@@ -67,12 +71,15 @@ struct page_frag { ...@@ -67,12 +71,15 @@ struct page_frag {
struct tlbflush_unmap_batch { struct tlbflush_unmap_batch {
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* /*
* Each bit set is a CPU that potentially has a TLB entry for one of * The arch code makes the following promise: generic code can modify a
* the PFNs being flushed. See set_tlb_ubc_flush_pending(). * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
* needed barriers), then call arch_tlbbatch_flush(), and the entries
* will be flushed on all CPUs by the time that arch_tlbbatch_flush()
* returns.
*/ */
struct cpumask cpumask; struct arch_tlbflush_unmap_batch arch;
/* True if any bit in cpumask is set */ /* True if a flush is needed. */
bool flush_required; bool flush_required;
/* /*
......
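The mm_types.h comment above states the new contract: generic code may clear PTEs, record each affected mm with arch_tlbbatch_add_mm(), and then call arch_tlbbatch_flush(); when the flush returns, the stale entries are gone on every CPU. The rmap.c hunk at the end of this diff is the caller of that pair. Below is a single-threaded model of the bookkeeping half of the contract, with a plain bitmask standing in for the cpumask and a printout standing in for the IPI-based flush; it is a sketch of the idea, not the x86 implementation.

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 8

/* Which CPUs are currently running each mm; stand-in for mm_cpumask(mm). */
struct mm { uint8_t cpumask; };

/* Stand-in for struct arch_tlbflush_unmap_batch: CPUs that may cache stale PTEs. */
struct tlb_batch { uint8_t cpumask; };

static void arch_tlbbatch_add_mm(struct tlb_batch *batch, const struct mm *mm)
{
	batch->cpumask |= mm->cpumask;            /* accumulate, do not flush yet */
}

static void arch_tlbbatch_flush(struct tlb_batch *batch)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (batch->cpumask & (1u << cpu))
			printf("flush TLB on cpu %d\n", cpu);   /* an IPI in the real code */
	batch->cpumask = 0;
}

int main(void)
{
	struct mm a = { .cpumask = 0x03 }, b = { .cpumask = 0x0c };
	struct tlb_batch batch = { 0 };

	arch_tlbbatch_add_mm(&batch, &a);         /* unmap pages belonging to mm "a" ... */
	arch_tlbbatch_add_mm(&batch, &b);         /* ... and to mm "b" */
	arch_tlbbatch_flush(&batch);              /* one pass flushes CPUs 0-3 */
	return 0;
}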
...@@ -93,10 +93,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, ...@@ -93,10 +93,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#endif #endif
#endif #endif
#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_DEBUG_TLBFLUSH
#ifdef CONFIG_SMP
NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
#endif /* CONFIG_SMP */
NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE, NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */ #endif /* CONFIG_DEBUG_TLBFLUSH */
......
...@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP ...@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP config HAVE_MEMBLOCK_PHYS_MAP
bool bool
config HAVE_GENERIC_RCU_GUP config HAVE_GENERIC_GUP
bool bool
config ARCH_DISCARD_MEMBLOCK config ARCH_DISCARD_MEMBLOCK
......
...@@ -1146,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr) ...@@ -1146,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */ #endif /* CONFIG_ELF_CORE */
/* /*
* Generic RCU Fast GUP * Generic Fast GUP
* *
* get_user_pages_fast attempts to pin user pages by walking the page * get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be * tables directly and avoids taking locks. Thus the walker needs to be
...@@ -1167,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr) ...@@ -1167,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr)
* Before activating this code, please be aware that the following assumptions * Before activating this code, please be aware that the following assumptions
* are currently made: * are currently made:
* *
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
* pages containing page tables. * free pages containing page tables or TLB flushing requires IPI broadcast.
* *
* *) ptes can be read atomically by the architecture. * *) ptes can be read atomically by the architecture.
* *
...@@ -1178,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr) ...@@ -1178,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr)
* *
* This code is based heavily on the PowerPC implementation by Nick Piggin. * This code is based heavily on the PowerPC implementation by Nick Piggin.
*/ */
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP #ifdef CONFIG_HAVE_GENERIC_GUP
#ifndef gup_get_pte #ifndef gup_get_pte
/* /*
...@@ -1668,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, ...@@ -1668,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
return ret; return ret;
} }
#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ #endif /* CONFIG_HAVE_GENERIC_GUP */
...@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) ...@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void) void try_to_unmap_flush(void)
{ {
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int cpu;
if (!tlb_ubc->flush_required) if (!tlb_ubc->flush_required)
return; return;
cpu = get_cpu(); arch_tlbbatch_flush(&tlb_ubc->arch);
if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
}
if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
cpumask_clear(&tlb_ubc->cpumask);
tlb_ubc->flush_required = false; tlb_ubc->flush_required = false;
tlb_ubc->writable = false; tlb_ubc->writable = false;
put_cpu();
} }
/* Flush iff there are potentially writable TLB entries that can race with IO */ /* Flush iff there are potentially writable TLB entries that can race with IO */
...@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) ...@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{ {
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true; tlb_ubc->flush_required = true;
/* /*
......