Commit 404e1095, authored by Hugh Dickins, committed by Kleber Sacilotto de Souza

kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user

We have many machines (Westmere, Sandybridge, Ivybridge) supporting
PCID but not INVPCID: on these load_new_mm_cr3() simply crashed.

Flushing user context inside load_new_mm_cr3() without the use of
invpcid is difficult: momentarily switch from kernel to user context
and back to do so?  I'm not sure whether that can be safely done at
all, and would risk polluting user context with kernel internals,
and kernel context with stale user externals.

Instead, follow the hint in the comment that was there: change
X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3()
can leave a note in it, for SWITCH_USER_CR3 on return to userspace to
flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH.

Which works well enough that there's no need to do it this way only
when invpcid is unsupported: it's a good alternative to invpcid here.
But there's a couple of inlines in asm/tlbflush.h that need to do the
same trick, so it's best to localize all this per-cpu business in
mm/kaiser.c: moving that part of the initialization from setup_pcid()
to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the
function for noting an X86_CR3_PCID_USER_FLUSH.  And let's keep a
KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit.

I did try to make the feature tests in asm/tlbflush.h more consistent
with each other: there seem to be far too many ways of performing such
tests, and I don't have a good grasp of their differences.  At first
I converted them all to be static_cpu_has(): but that proved to be a
mistake, as the comment in __native_flush_tlb_single() hints; so then
I reversed and made them all this_cpu_has().  Probably all gratuitous
change, but that's the way it's working at present.

I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR
gets re-initialized by each cpu (before and after these changes):
no problem when (as usual) all cpus on a machine have the same
features, but in principle incorrect.  However, my experiment
to per-cpu-ify that one did not end well...
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

CVE-2017-5754
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
parent 0a024896
...@@ -32,13 +32,12 @@ movq \reg, %cr3 ...@@ -32,13 +32,12 @@ movq \reg, %cr3
.macro _SWITCH_TO_USER_CR3 reg .macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg movq %cr3, \reg
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
/* orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg
* This can obviously be one instruction by putting the js 9f
* KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. // FLUSH this time, reset to NOFLUSH for next time
* But, just leave it now for simplicity. // But if nopcid? Consider using 0x80 for user pcid?
*/ movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
orq X86_CR3_PCID_USER_VAR, \reg 9:
orq $(KAISER_SHADOW_PGD_OFFSET), \reg
movq \reg, %cr3 movq \reg, %cr3
.endm .endm
...@@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax ...@@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
*/ */
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/* Kernel-side CR3 PCID bits: one global, written by kaiser_setup_pcid(). */
extern unsigned long X86_CR3_PCID_KERN_VAR;
/*
 * User-side CR3 bits, one copy per cpu: kaiser_setup_pcid() initialises it
 * and kaiser_flush_tlb_on_return_to_user() rewrites it to request a flush.
 */
DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
/* NOTE(review): presumably linker-provided bounds of the user-mapped per-cpu section - confirm. */
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
/** /**
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range * @addr: the start address of the range
......
...@@ -133,27 +133,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) ...@@ -133,27 +133,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
cr4_set_bits(mask); cr4_set_bits(mask);
} }
/*
 * Declare a couple of kaiser interfaces here for convenience,
 * to avoid the need for asm/kaiser.h in unexpected places.
 *
 * The TLB flush inlines in this header call
 * kaiser_flush_tlb_on_return_to_user(), and setup_pcid() calls
 * kaiser_setup_pcid(), so both must be visible with or without CONFIG_KAISER.
 */
#ifdef CONFIG_KAISER
/* Out-of-line implementations, built only when CONFIG_KAISER is set. */
extern void kaiser_setup_pcid(void);
extern void kaiser_flush_tlb_on_return_to_user(void);
#else
/* Without CONFIG_KAISER both calls are empty stubs, so callers need no #ifdef. */
static inline void kaiser_setup_pcid(void)
{
}
static inline void kaiser_flush_tlb_on_return_to_user(void)
{
}
#endif
static inline void __native_flush_tlb(void) static inline void __native_flush_tlb(void)
{ {
if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { if (this_cpu_has(X86_FEATURE_INVPCID)) {
/* /*
* If current->mm == NULL then we borrow a mm which may change during a * Note, this works with CR4.PCIDE=0 or 1.
* task switch and therefore we must not be preempted while we write CR3
* back:
*/ */
preempt_disable(); invpcid_flush_all_nonglobals();
native_write_cr3(native_read_cr3());
preempt_enable();
return; return;
} }
/* /*
* We are no longer using globals with KAISER, so a * If current->mm == NULL then we borrow a mm which may change during a
* "nonglobals" flush would work too. But, this is more * task switch and therefore we must not be preempted while we write CR3
* conservative. * back:
*
* Note, this works with CR4.PCIDE=0 or 1.
*/ */
invpcid_flush_all(); preempt_disable();
if (this_cpu_has(X86_FEATURE_PCID))
kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
preempt_enable();
} }
static inline void __native_flush_tlb_global_irq_disabled(void) static inline void __native_flush_tlb_global_irq_disabled(void)
...@@ -169,9 +184,13 @@ static inline void __native_flush_tlb_global_irq_disabled(void) ...@@ -169,9 +184,13 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
static inline void __native_flush_tlb_global(void) static inline void __native_flush_tlb_global(void)
{ {
#ifdef CONFIG_KAISER
/* Globals are not used at all */
__native_flush_tlb();
#else
unsigned long flags; unsigned long flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) { if (this_cpu_has(X86_FEATURE_INVPCID)) {
/* /*
* Using INVPCID is considerably faster than a pair of writes * Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore. * to CR4 sandwiched inside an IRQ flag save/restore.
...@@ -186,10 +205,9 @@ static inline void __native_flush_tlb_global(void) ...@@ -186,10 +205,9 @@ static inline void __native_flush_tlb_global(void)
* be called from deep inside debugging code.) * be called from deep inside debugging code.)
*/ */
raw_local_irq_save(flags); raw_local_irq_save(flags);
__native_flush_tlb_global_irq_disabled(); __native_flush_tlb_global_irq_disabled();
raw_local_irq_restore(flags); raw_local_irq_restore(flags);
#endif
} }
static inline void __native_flush_tlb_single(unsigned long addr) static inline void __native_flush_tlb_single(unsigned long addr)
...@@ -200,9 +218,12 @@ static inline void __native_flush_tlb_single(unsigned long addr) ...@@ -200,9 +218,12 @@ static inline void __native_flush_tlb_single(unsigned long addr)
* *
* The ASIDs used below are hard-coded. But, we must not * The ASIDs used below are hard-coded. But, we must not
* call invpcid(type=1/2) before CR4.PCIDE=1. Just call * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
* invpcid in the case we are called early. * invlpg in the case we are called early.
*/ */
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
if (this_cpu_has(X86_FEATURE_PCID))
kaiser_flush_tlb_on_return_to_user();
asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return; return;
} }
......
...@@ -287,32 +287,11 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) ...@@ -287,32 +287,11 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
} }
} }
/*
* These can have bit 63 set, so we can not just use a plain "or"
* instruction to get their value or'd into CR3. It would take
* another register. So, we use a memory reference to these
* instead.
*
* This is also handy because systems that do not support
* PCIDs just end up or'ing a 0 into their CR3, which does
* no harm.
*/
__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
static void setup_pcid(struct cpuinfo_x86 *c) static void setup_pcid(struct cpuinfo_x86 *c)
{ {
if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) { if (cpu_has(c, X86_FEATURE_PGE)) {
cr4_set_bits(X86_CR4_PCIDE); cr4_set_bits(X86_CR4_PCIDE);
/*
* These variables are used by the entry/exit
* code to change PCIDs.
*/
#ifdef CONFIG_KAISER
X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
#endif
/* /*
* INVPCID has two "groups" of types: * INVPCID has two "groups" of types:
* 1/2: Invalidate an individual address * 1/2: Invalidate an individual address
...@@ -338,6 +317,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) ...@@ -338,6 +317,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_PCID); clear_cpu_cap(c, X86_FEATURE_PCID);
} }
} }
kaiser_setup_pcid();
} }
/* /*
......
...@@ -12,12 +12,26 @@ ...@@ -12,12 +12,26 @@
#include <linux/ftrace.h> #include <linux/ftrace.h>
#include <asm/kaiser.h> #include <asm/kaiser.h>
#include <asm/tlbflush.h> /* to verify its kaiser declarations */
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
#include <asm/desc.h> #include <asm/desc.h>
#ifdef CONFIG_KAISER #ifdef CONFIG_KAISER
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* These can have bit 63 set, so we can not just use a plain "or"
* instruction to get their value or'd into CR3. It would take
* another register. So, we use a memory reference to these instead.
*
* This is also handy because systems that do not support PCIDs
* just end up or'ing a 0 into their CR3, which does no harm.
*/
__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR;
DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/* /*
* At runtime, the only things we map are some things for CPU * At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever * hotplug, and stacks for new processes. No two CPUs will ever
...@@ -239,9 +253,6 @@ static void __init kaiser_init_all_pgds(void) ...@@ -239,9 +253,6 @@ static void __init kaiser_init_all_pgds(void)
WARN_ON(__ret); \ WARN_ON(__ret); \
} while (0) } while (0)
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
extern unsigned long X86_CR3_PCID_KERN_VAR;
extern unsigned long X86_CR3_PCID_USER_VAR;
/* /*
* If anything in here fails, we will likely die on one of the * If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we * first kernel->user transitions and init will die. But, we
...@@ -295,8 +306,6 @@ void __init kaiser_init(void) ...@@ -295,8 +306,6 @@ void __init kaiser_init(void)
kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
__PAGE_KERNEL); __PAGE_KERNEL);
kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
__PAGE_KERNEL);
} }
/* Add a mapping to the shadow mapping, and synchronize the mappings */ /* Add a mapping to the shadow mapping, and synchronize the mappings */
...@@ -361,4 +370,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) ...@@ -361,4 +370,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
} }
return pgd; return pgd;
} }
/*
 * Initialise the values that the entry/exit code uses to change PCID,
 * pgd and TLB flushing behaviour.
 *
 * The user value always carries KAISER_SHADOW_PGD_OFFSET.  The NOFLUSH
 * PCID values are added to both only when this cpu has X86_FEATURE_PCID,
 * so a cpu without PCID ends up with 0 for the kernel value and just the
 * shadow pgd offset for the user value.
 *
 * X86_CR3_PCID_KERN_VAR is a plain global, so every cpu that runs this
 * overwrites the same variable; X86_CR3_PCID_USER_VAR is per-cpu.
 */
void kaiser_setup_pcid(void)
{
	unsigned long kern_pcid = 0;
	unsigned long user_pcid = 0;

	if (this_cpu_has(X86_FEATURE_PCID)) {
		kern_pcid = X86_CR3_PCID_KERN_NOFLUSH;
		user_pcid = X86_CR3_PCID_USER_NOFLUSH;
	}

	X86_CR3_PCID_KERN_VAR = kern_pcid;
	this_cpu_write(X86_CR3_PCID_USER_VAR,
		       user_pcid | KAISER_SHADOW_PGD_OFFSET);
}
/*
 * Record that this cpu has to flush its USER TLB context on the next
 * return to userspace, by replacing the per-cpu user CR3 value with the
 * FLUSH variant (shadow pgd offset included).
 *
 * The caller is expected to have checked this_cpu_has(X86_FEATURE_PCID):
 * on a cpu without PCID the NOFLUSH bit will never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	unsigned long flush_cr3 = KAISER_SHADOW_PGD_OFFSET;

	flush_cr3 |= X86_CR3_PCID_USER_FLUSH;
	this_cpu_write(X86_CR3_PCID_USER_VAR, flush_cr3);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
#endif /* CONFIG_KAISER */ #endif /* CONFIG_KAISER */
...@@ -6,13 +6,14 @@ ...@@ -6,13 +6,14 @@
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/debugfs.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/mmu_context.h> #include <asm/mmu_context.h>
#include <asm/cache.h> #include <asm/cache.h>
#include <asm/apic.h> #include <asm/apic.h>
#include <asm/uv/uv.h> #include <asm/uv/uv.h>
#include <linux/debugfs.h> #include <asm/kaiser.h>
/* /*
* Smarter SMP flushing macros. * Smarter SMP flushing macros.
...@@ -40,34 +41,23 @@ static void load_new_mm_cr3(pgd_t *pgdir) ...@@ -40,34 +41,23 @@ static void load_new_mm_cr3(pgd_t *pgdir)
{ {
unsigned long new_mm_cr3 = __pa(pgdir); unsigned long new_mm_cr3 = __pa(pgdir);
/* #ifdef CONFIG_KAISER
* KAISER, plus PCIDs needs some extra work here. But, if (this_cpu_has(X86_FEATURE_PCID)) {
* if either of features is not present, we need no /*
* PCIDs here and just do a normal, full TLB flush with * We reuse the same PCID for different tasks, so we must
* the write_cr3() * flush all the entries for the PCID out when we change tasks.
*/ * Flush KERN below, flush USER when returning to userspace in
if (!IS_ENABLED(CONFIG_KAISER) || * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
!cpu_feature_enabled(X86_FEATURE_PCID)) *
goto out_set_cr3; * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
/* * do it here, but can only be used if X86_FEATURE_INVPCID is
* We reuse the same PCID for different tasks, so we must * available - and many machines support pcid without invpcid.
* flush all the entires for the PCID out when we change */
* tasks. new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
*/ kaiser_flush_tlb_on_return_to_user();
new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); }
#endif /* CONFIG_KAISER */
/*
* The flush from load_cr3() may leave old TLB entries
* for userspace in place. We must flush that context
* separately. We can theoretically delay doing this
* until we actually load up the userspace CR3, but
* that's a bit tricky. We have to have the "need to
* flush userspace PCID" bit per-cpu and check it in the
* exit-to-userspace paths.
*/
invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
out_set_cr3:
/* /*
* Caution: many callers of this function expect * Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB * that load_cr3() is serializing and orders TLB
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment