Commit 0a024896, authored by Hugh Dickins, committed by Kleber Sacilotto de Souza

kaiser: enhanced by kernel and user PCIDs

Merged performance improvements to Kaiser, using distinct kernel
and user Process Context Identifiers to minimize the TLB flushing.
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

CVE-2017-5754
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
parent 42339271
...@@ -1291,7 +1291,10 @@ ENTRY(nmi) ...@@ -1291,7 +1291,10 @@ ENTRY(nmi)
/* %rax is saved above, so OK to clobber here */ /* %rax is saved above, so OK to clobber here */
movq %cr3, %rax movq %cr3, %rax
pushq %rax pushq %rax
andq $(~KAISER_SHADOW_PGD_OFFSET), %rax /* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
/* Add back kernel PCID and "no flush" bit */
orq X86_CR3_PCID_KERN_VAR, %rax
movq %rax, %cr3 movq %rax, %cr3
#endif #endif
call do_nmi call do_nmi
...@@ -1532,7 +1535,10 @@ end_repeat_nmi: ...@@ -1532,7 +1535,10 @@ end_repeat_nmi:
/* %rax is saved above, so OK to clobber here */ /* %rax is saved above, so OK to clobber here */
movq %cr3, %rax movq %cr3, %rax
pushq %rax pushq %rax
andq $(~KAISER_SHADOW_PGD_OFFSET), %rax /* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
/* Add back kernel PCID and "no flush" bit */
orq X86_CR3_PCID_KERN_VAR, %rax
movq %rax, %cr3 movq %rax, %cr3
#endif #endif
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <asm/irqflags.h> #include <asm/irqflags.h>
#include <asm/asm.h> #include <asm/asm.h>
#include <asm/smap.h> #include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/kaiser.h> #include <asm/kaiser.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <linux/err.h> #include <linux/err.h>
......
...@@ -188,6 +188,7 @@ ...@@ -188,6 +188,7 @@
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
......
#ifndef _ASM_X86_KAISER_H #ifndef _ASM_X86_KAISER_H
#define _ASM_X86_KAISER_H #define _ASM_X86_KAISER_H
#include <uapi/asm/processor-flags.h> /* For PCID constants */
/* /*
* This file includes the definitions for the KAISER feature. * This file includes the definitions for the KAISER feature.
* KAISER is a counter measure against x86_64 side channel attacks on * KAISER is a counter measure against x86_64 side channel attacks on
...@@ -21,13 +24,21 @@ ...@@ -21,13 +24,21 @@
.macro _SWITCH_TO_KERNEL_CR3 reg .macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg movq %cr3, \reg
andq $(~KAISER_SHADOW_PGD_OFFSET), \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
orq X86_CR3_PCID_KERN_VAR, \reg
movq \reg, %cr3 movq \reg, %cr3
.endm .endm
.macro _SWITCH_TO_USER_CR3 reg .macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg movq %cr3, \reg
orq $(KAISER_SHADOW_PGD_OFFSET), \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
/*
* This can obviously be one instruction by putting the
* KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
* But, just leave it now for simplicity.
*/
orq X86_CR3_PCID_USER_VAR, \reg
orq $(KAISER_SHADOW_PGD_OFFSET), \reg
movq \reg, %cr3 movq \reg, %cr3
.endm .endm
......
...@@ -106,6 +106,32 @@ ...@@ -106,6 +106,32 @@
_PAGE_SOFT_DIRTY) _PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
 * CR3 PCID layout helpers.
 *
 * The ASID (the PCID value itself) is the lower 12 bits of CR3.
 */
#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
/* Mask for all the PCID-related bits in CR3 (the ASID plus the NOFLUSH bit): */
#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
/*
 * With KAISER on 64-bit, the kernel and user address spaces get
 * distinct, hard-coded ASIDs.  Each comes in two forms:
 *   *_FLUSH   - the bare ASID, X86_CR3_PCID_NOFLUSH clear
 *   *_NOFLUSH - the ASID with X86_CR3_PCID_NOFLUSH set
 */
#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL))
#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL))
#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
#else
/*
 * This branch covers 32-bit builds and any build without
 * CONFIG_KAISER: kernel and user share ASID 0.
 */
#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
/*
 * PCIDs are unsupported on 32-bit and none of these bits can be
 * set in CR3, so every variant collapses to 0 here:
 */
#define X86_CR3_PCID_KERN_FLUSH (0)
#define X86_CR3_PCID_USER_FLUSH (0)
#define X86_CR3_PCID_KERN_NOFLUSH (0)
#define X86_CR3_PCID_USER_NOFLUSH (0)
#endif
/* /*
* The cache modes defined here are used to translate between pure SW usage * The cache modes defined here are used to translate between pure SW usage
* and the HW defined cache mode bits and/or PAT entries. * and the HW defined cache mode bits and/or PAT entries.
......
...@@ -135,14 +135,25 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) ...@@ -135,14 +135,25 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
static inline void __native_flush_tlb(void) static inline void __native_flush_tlb(void)
{ {
if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
/*
* If current->mm == NULL then we borrow a mm which may change during a
* task switch and therefore we must not be preempted while we write CR3
* back:
*/
preempt_disable();
native_write_cr3(native_read_cr3());
preempt_enable();
return;
}
/* /*
* If current->mm == NULL then we borrow a mm which may change during a * We are no longer using globals with KAISER, so a
* task switch and therefore we must not be preempted while we write CR3 * "nonglobals" flush would work too. But, this is more
* back: * conservative.
*
* Note, this works with CR4.PCIDE=0 or 1.
*/ */
preempt_disable(); invpcid_flush_all();
native_write_cr3(native_read_cr3());
preempt_enable();
} }
static inline void __native_flush_tlb_global_irq_disabled(void) static inline void __native_flush_tlb_global_irq_disabled(void)
...@@ -183,7 +194,31 @@ static inline void __native_flush_tlb_global(void) ...@@ -183,7 +194,31 @@ static inline void __native_flush_tlb_global(void)
static inline void __native_flush_tlb_single(unsigned long addr) static inline void __native_flush_tlb_single(unsigned long addr)
{ {
asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); /*
* SIMICS #GP's if you run INVPCID with type 2/3
* and X86_CR4_PCIDE clear. Shame!
*
* The ASIDs used below are hard-coded. But, we must not
* call invpcid(type=1/2) before CR4.PCIDE=1. Just call
* invlpg in the case we are called early.
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return;
}
/* Flush the address out of both PCIDs. */
/*
* An optimization here might be to determine addresses
* that are only kernel-mapped and only flush the kernel
* ASID. But, userspace flushes are probably much more
* important performance-wise.
*
* Make sure to do only a single invpcid when KAISER is
* disabled and we have only a single ASID.
*/
if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
} }
static inline void __flush_tlb_all(void) static inline void __flush_tlb_all(void)
......
...@@ -77,7 +77,8 @@ ...@@ -77,7 +77,8 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ #define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/* /*
* Intel CPU features in CR4 * Intel CPU features in CR4
......
...@@ -287,11 +287,45 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) ...@@ -287,11 +287,45 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
} }
} }
/*
* These can have bit 63 set, so we can not just use a plain "or"
* instruction to get their value or'd into CR3. It would take
* another register. So, we use a memory reference to these
* instead.
*
* This is also handy because systems that do not support
* PCIDs just end up or'ing a 0 into their CR3, which does
* no harm.
*/
__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
static void setup_pcid(struct cpuinfo_x86 *c) static void setup_pcid(struct cpuinfo_x86 *c)
{ {
if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) { if (cpu_has(c, X86_FEATURE_PGE)) {
cr4_set_bits(X86_CR4_PCIDE); cr4_set_bits(X86_CR4_PCIDE);
/*
* These variables are used by the entry/exit
* code to change PCIDs.
*/
#ifdef CONFIG_KAISER
X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
#endif
/*
* INVPCID has two "groups" of types:
* 1/2: Invalidate an individual address
* 3/4: Invalidate all contexts
*
* 1/2 take a PCID, but 3/4 do not. So, 3/4
* ignore the PCID argument in the descriptor.
* But, we have to be careful not to call 1/2
* with an actual non-zero PCID in them before
* we do the above cr4_set_bits().
*/
if (cpu_has(c, X86_FEATURE_INVPCID))
set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
} else { } else {
/* /*
* flush_tlb_all(), as currently implemented, won't * flush_tlb_all(), as currently implemented, won't
......
...@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) ...@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1; return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
!is_long_mode(vcpu))
return 1; return 1;
} }
......
...@@ -240,6 +240,8 @@ static void __init kaiser_init_all_pgds(void) ...@@ -240,6 +240,8 @@ static void __init kaiser_init_all_pgds(void)
} while (0) } while (0)
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
extern unsigned long X86_CR3_PCID_KERN_VAR;
extern unsigned long X86_CR3_PCID_USER_VAR;
/* /*
* If anything in here fails, we will likely die on one of the * If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we * first kernel->user transitions and init will die. But, we
...@@ -290,6 +292,11 @@ void __init kaiser_init(void) ...@@ -290,6 +292,11 @@ void __init kaiser_init(void)
kaiser_add_user_map_early(&debug_idt_table, kaiser_add_user_map_early(&debug_idt_table,
sizeof(gate_desc) * NR_VECTORS, sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL); __PAGE_KERNEL);
kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
__PAGE_KERNEL);
kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
__PAGE_KERNEL);
} }
/* Add a mapping to the shadow mapping, and synchronize the mappings */ /* Add a mapping to the shadow mapping, and synchronize the mappings */
......
...@@ -36,6 +36,46 @@ struct flush_tlb_info { ...@@ -36,6 +36,46 @@ struct flush_tlb_info {
unsigned long flush_end; unsigned long flush_end;
}; };
/*
 * Load CR3 with the physical address of @pgdir when switching to a new mm.
 *
 * Without CONFIG_KAISER, or when the CPU feature X86_FEATURE_PCID is not
 * enabled, this is a plain write_cr3() of the page-directory address.
 * Otherwise the kernel "flush" PCID is or'd into the CR3 value and the
 * user PCID is invalidated separately before CR3 is written.
 */
static void load_new_mm_cr3(pgd_t *pgdir)
{
unsigned long new_mm_cr3 = __pa(pgdir);
/*
 * KAISER plus PCIDs needs some extra work here.  But,
 * if either of these features is not present, we need no
 * PCIDs here and just do a normal, full TLB flush with
 * the write_cr3().
 */
if (!IS_ENABLED(CONFIG_KAISER) ||
!cpu_feature_enabled(X86_FEATURE_PCID))
goto out_set_cr3;
/*
 * We reuse the same PCID for different tasks, so we must
 * flush all the entries for the PCID out when we change
 * tasks.
 */
new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
/*
 * The flush from the CR3 write below may leave old TLB entries
 * for userspace in place.  We must flush that context
 * separately.  We can theoretically delay doing this
 * until we actually load up the userspace CR3, but
 * that's a bit tricky.  We have to have the "need to
 * flush userspace PCID" bit per-cpu and check it in the
 * exit-to-userspace paths.
 *
 * NOTE(review): the guard above tests only X86_FEATURE_PCID, while
 * this helper presumably executes INVPCID.  Confirm that CPUs which
 * have PCID but lack INVPCID are handled safely.
 */
invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
out_set_cr3:
/*
 * Caution: many callers of this function expect
 * that load_cr3() is serializing and orders TLB
 * fills with respect to the mm_cpumask writes.
 * The write_cr3() here takes load_cr3()'s place.
 */
write_cr3(new_mm_cr3);
}
/* /*
* We cannot call mmdrop() because we are in interrupt context, * We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask. * instead update mm->cpu_vm_mask.
...@@ -47,7 +87,7 @@ void leave_mm(int cpu) ...@@ -47,7 +87,7 @@ void leave_mm(int cpu)
BUG(); BUG();
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
load_cr3(swapper_pg_dir); load_new_mm_cr3(swapper_pg_dir);
/* /*
* This gets called in the idle path where RCU * This gets called in the idle path where RCU
* functions differently. Tracing normally * functions differently. Tracing normally
...@@ -101,7 +141,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, ...@@ -101,7 +141,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* ordering guarantee we need. * ordering guarantee we need.
* *
*/ */
load_cr3(next->pgd); load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
...@@ -150,7 +190,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, ...@@ -150,7 +190,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* As above, load_cr3() is serializing and orders TLB * As above, load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask write. * fills with respect to the mm_cpumask write.
*/ */
load_cr3(next->pgd); load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_mm_cr4(next); load_mm_cr4(next);
load_mm_ldt(next); load_mm_ldt(next);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment