Commit 4b783176 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull STIBP fallout fixes from Thomas Gleixner:
 "The performance destruction department finally got it's act together
  and came up with a cure for the STIPB regression:

   - Provide a command line option to control the spectre v2 user space
     mitigations. Default is either seccomp or prctl (if seccomp is
     disabled in Kconfig). prctl allows mitigation opt-in, seccomp
     enables the migitation for sandboxed processes.

   - Rework the code to handle the conditional STIBP/IBPB control and
     remove the now unused ptrace_may_access_sched() optimization
     attempt

   - Disable STIBP automatically when SMT is disabled

   - Optimize the switch_to() logic to avoid MSR writes and invocations
     of __switch_to_xtra().

   - Make the asynchronous speculation TIF updates synchronous to
     prevent stale mitigation state.

  As a general cleanup this also makes retpoline directly depend on
  compiler support and removes the 'minimal retpoline' option which just
  pretended to provide some form of security while providing none"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (31 commits)
  x86/speculation: Provide IBPB always command line options
  x86/speculation: Add seccomp Spectre v2 user space protection mode
  x86/speculation: Enable prctl mode for spectre_v2_user
  x86/speculation: Add prctl() control for indirect branch speculation
  x86/speculation: Prepare arch_smt_update() for PRCTL mode
  x86/speculation: Prevent stale SPEC_CTRL msr content
  x86/speculation: Split out TIF update
  ptrace: Remove unused ptrace_may_access_sched() and MODE_IBRS
  x86/speculation: Prepare for conditional IBPB in switch_mm()
  x86/speculation: Avoid __switch_to_xtra() calls
  x86/process: Consolidate and simplify switch_to_xtra() code
  x86/speculation: Prepare for per task indirect branch speculation control
  x86/speculation: Add command line control for indirect branch speculation
  x86/speculation: Unify conditional spectre v2 print functions
  x86/speculataion: Mark command line parser data __initdata
  x86/speculation: Mark string arrays const correctly
  x86/speculation: Reorder the spec_v2 code
  x86/l1tf: Show actual SMT state
  x86/speculation: Rework SMT state change
  sched/smt: Expose sched_smt_present static key
  ...
parents 88058417 55a97402
......@@ -4199,9 +4199,13 @@
spectre_v2= [X86] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability.
The default operation protects the kernel from
user space attacks.
on - unconditionally enable
off - unconditionally disable
on - unconditionally enable, implies
spectre_v2_user=on
off - unconditionally disable, implies
spectre_v2_user=off
auto - kernel detects whether your CPU model is
vulnerable
......@@ -4211,6 +4215,12 @@
CONFIG_RETPOLINE configuration option, and the
compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
Selecting 'off' will disable both the kernel and
the user space protections.
Specific mitigations can also be selected manually:
retpoline - replace indirect branches
......@@ -4220,6 +4230,48 @@
Not specifying this option is equivalent to
spectre_v2=auto.
spectre_v2_user=
[X86] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability between
user space tasks
on - Unconditionally enable mitigations. Is
enforced by spectre_v2=on
off - Unconditionally disable mitigations. Is
enforced by spectre_v2=off
prctl - Indirect branch speculation is enabled,
but mitigation can be enabled via prctl
per thread. The mitigation control state
is inherited on fork.
prctl,ibpb
- Like "prctl" above, but only STIBP is
controlled per thread. IBPB is issued
always when switching between different user
space processes.
seccomp
- Same as "prctl" above, but all seccomp
threads will enable the mitigation unless
they explicitly opt out.
seccomp,ibpb
- Like "seccomp" above, but only STIBP is
controlled per thread. IBPB is issued
always when switching between different
user space processes.
auto - Kernel selects the mitigation depending on
the available CPU features and vulnerability.
Default mitigation:
If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
Not specifying this option is equivalent to
spectre_v2_user=auto.
spec_store_bypass_disable=
[HW] Control Speculative Store Bypass (SSB) Disable mitigation
(Speculative Store Bypass vulnerability)
......
......@@ -92,3 +92,12 @@ Speculation misfeature controls
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
- PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes
(Mitigate Spectre V2 style attacks against user processes)
Invocations:
* prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);
......@@ -444,10 +444,6 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
Without compiler support, at least indirect branches in assembler
code are eliminated. Since this includes the syscall entry path,
it is not entirely pointless.
config INTEL_RDT
bool "Intel Resource Director Technology support"
depends on X86 && CPU_SUP_INTEL
......@@ -1004,13 +1000,7 @@ config NR_CPUS
to the kernel image.
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on SMP
---help---
SMT scheduler support improves the CPU scheduler's decision making
when dealing with Intel Pentium 4 chips with HyperThreading at a
cost of slightly increased overhead in some places. If unsure say
N here.
def_bool y if SMP
config SCHED_MC
def_bool y
......
......@@ -220,9 +220,10 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# Avoid indirect branches in kernel to deal with Spectre
ifdef CONFIG_RETPOLINE
ifneq ($(RETPOLINE_CFLAGS),)
KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
ifeq ($(RETPOLINE_CFLAGS),)
$(error You are building kernel with non-retpoline compiler, please update your compiler.)
endif
KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
endif
archscripts: scripts_basic
......
......@@ -41,9 +41,10 @@
#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
......
......@@ -3,6 +3,8 @@
#ifndef _ASM_X86_NOSPEC_BRANCH_H_
#define _ASM_X86_NOSPEC_BRANCH_H_
#include <linux/static_key.h>
#include <asm/alternative.h>
#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
......@@ -162,11 +164,12 @@
_ASM_PTR " 999b\n\t" \
".popsection\n\t"
#if defined(CONFIG_X86_64) && defined(RETPOLINE)
#ifdef CONFIG_RETPOLINE
#ifdef CONFIG_X86_64
/*
* Since the inline asm uses the %V modifier which is only in newer GCC,
* the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE.
* Inline asm uses the %V modifier which is only in newer GCC
* which is ensured when CONFIG_RETPOLINE is defined.
*/
# define CALL_NOSPEC \
ANNOTATE_NOSPEC_ALTERNATIVE \
......@@ -181,7 +184,7 @@
X86_FEATURE_RETPOLINE_AMD)
# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
#else /* CONFIG_X86_32 */
/*
* For i386 we use the original ret-equivalent retpoline, because
* otherwise we'll run out of registers. We don't care about CET
......@@ -211,6 +214,7 @@
X86_FEATURE_RETPOLINE_AMD)
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif
#else /* No retpoline for C / inline asm */
# define CALL_NOSPEC "call *%[thunk_target]\n"
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
......@@ -219,13 +223,19 @@
/* The Spectre V2 mitigation variants */
enum spectre_v2_mitigation {
SPECTRE_V2_NONE,
SPECTRE_V2_RETPOLINE_MINIMAL,
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
SPECTRE_V2_IBRS_ENHANCED,
};
/* The indirect branch speculation control variants */
enum spectre_v2_user_mitigation {
SPECTRE_V2_USER_NONE,
SPECTRE_V2_USER_STRICT,
SPECTRE_V2_USER_PRCTL,
SPECTRE_V2_USER_SECCOMP,
};
/* The Speculative Store Bypass disable variants */
enum ssb_mitigation {
SPEC_STORE_BYPASS_NONE,
......@@ -303,6 +313,10 @@ do { \
preempt_enable(); \
} while (0)
DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
#endif /* __ASSEMBLY__ */
/*
......
......@@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
{
BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
}
static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
{
BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
{
BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
}
static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
{
return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
......@@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void);
static inline void speculative_store_bypass_ht_init(void) { }
#endif
extern void speculative_store_bypass_update(unsigned long tif);
static inline void speculative_store_bypass_update_current(void)
{
speculative_store_bypass_update(current_thread_info()->flags);
}
extern void speculation_ctrl_update(unsigned long tif);
extern void speculation_ctrl_update_current(void);
#endif
......@@ -11,9 +11,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
/* This runs runs on the previous thread's stack. */
static inline void prepare_switch_to(struct task_struct *next)
......
......@@ -79,10 +79,12 @@ struct thread_info {
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
#define TIF_SSBD 5 /* Reduced data speculation */
#define TIF_SSBD 5 /* Speculative store bypass disable */
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
......@@ -110,6 +112,8 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
......@@ -145,8 +149,18 @@ struct thread_info {
_TIF_FSCHECK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
#define _TIF_WORK_CTXSW_BASE \
(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \
_TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
/*
* Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
*/
#ifdef CONFIG_SMP
# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
#else
# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
#endif
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
......
......@@ -169,10 +169,14 @@ struct tlb_state {
#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
unsigned long last_user_mm_ibpb;
};
u16 loaded_mm_asid;
u16 next_asid;
/* last user mm's ctx id */
u64 last_ctx_id;
/*
* We can be in one of several states:
......
This diff is collapsed.
......@@ -40,6 +40,8 @@
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
#include "process.h"
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
......@@ -252,11 +254,12 @@ void arch_setup_new_exec(void)
enable_cpuid();
}
static inline void switch_to_bitmap(struct tss_struct *tss,
struct thread_struct *prev,
static inline void switch_to_bitmap(struct thread_struct *prev,
struct thread_struct *next,
unsigned long tifp, unsigned long tifn)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
if (tifn & _TIF_IO_BITMAP) {
/*
* Copy the relevant range of the IO bitmap.
......@@ -395,32 +398,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}
static __always_inline void intel_set_ssb_state(unsigned long tifn)
/*
* Update the MSRs managing speculation control, during context switch.
*
* tifp: Previous task's thread flags
* tifn: Next task's thread flags
*/
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
unsigned long tifn)
{
u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
unsigned long tif_diff = tifp ^ tifn;
u64 msr = x86_spec_ctrl_base;
bool updmsr = false;
/*
* If TIF_SSBD is different, select the proper mitigation
* method. Note that if SSBD mitigation is disabled or permanentely
* enabled this branch can't be taken because nothing can set
* TIF_SSBD.
*/
if (tif_diff & _TIF_SSBD) {
if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
amd_set_ssb_virt_state(tifn);
} else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
amd_set_core_ssb_state(tifn);
} else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
static_cpu_has(X86_FEATURE_AMD_SSBD)) {
msr |= ssbd_tif_to_spec_ctrl(tifn);
updmsr = true;
}
}
/*
* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
* otherwise avoid the MSR write.
*/
if (IS_ENABLED(CONFIG_SMP) &&
static_branch_unlikely(&switch_to_cond_stibp)) {
updmsr |= !!(tif_diff & _TIF_SPEC_IB);
msr |= stibp_tif_to_spec_ctrl(tifn);
}
wrmsrl(MSR_IA32_SPEC_CTRL, msr);
if (updmsr)
wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}
static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
amd_set_ssb_virt_state(tifn);
else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
amd_set_core_ssb_state(tifn);
else
intel_set_ssb_state(tifn);
if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
if (task_spec_ssb_disable(tsk))
set_tsk_thread_flag(tsk, TIF_SSBD);
else
clear_tsk_thread_flag(tsk, TIF_SSBD);
if (task_spec_ib_disable(tsk))
set_tsk_thread_flag(tsk, TIF_SPEC_IB);
else
clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
}
/* Return the updated threadinfo flags*/
return task_thread_info(tsk)->flags;
}
void speculative_store_bypass_update(unsigned long tif)
void speculation_ctrl_update(unsigned long tif)
{
/* Forced update. Make sure all relevant TIF flags are different */
preempt_disable();
__speculative_store_bypass_update(tif);
__speculation_ctrl_update(~tif, tif);
preempt_enable();
}
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
preempt_disable();
speculation_ctrl_update(speculation_ctrl_update_tif(current));
preempt_enable();
}
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev, *next;
unsigned long tifp, tifn;
......@@ -430,7 +486,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
tifn = READ_ONCE(task_thread_info(next_p)->flags);
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
switch_to_bitmap(tss, prev, next, tifp, tifn);
switch_to_bitmap(prev, next, tifp, tifn);
propagate_user_return_notify(prev_p, next_p);
......@@ -451,8 +507,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
if ((tifp ^ tifn) & _TIF_NOCPUID)
set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
if ((tifp ^ tifn) & _TIF_SSBD)
__speculative_store_bypass_update(tifn);
if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
__speculation_ctrl_update(tifp, tifn);
} else {
speculation_ctrl_update_tif(prev_p);
tifn = speculation_ctrl_update_tif(next_p);
/* Enforce MSR update to ensure consistent state */
__speculation_ctrl_update(~tifn, tifn);
}
}
/*
......
// SPDX-License-Identifier: GPL-2.0
//
// Code shared between 32 and 64 bit
#include <asm/spec-ctrl.h>
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
/*
* This needs to be inline to optimize for the common case where no extra
* work needs to be done.
*/
static inline void switch_to_extra(struct task_struct *prev,
struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long prev_tif = task_thread_info(prev)->flags;
if (IS_ENABLED(CONFIG_SMP)) {
/*
* Avoid __switch_to_xtra() invocation when conditional
* STIPB is disabled and the only different bit is
* TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
* in the TIF_WORK_CTXSW masks.
*/
if (!static_branch_likely(&switch_to_cond_stibp)) {
prev_tif &= ~_TIF_SPEC_IB;
next_tif &= ~_TIF_SPEC_IB;
}
}
/*
* __switch_to_xtra() handles debug registers, i/o bitmaps,
* speculation mitigations etc.
*/
if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
prev_tif & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev, next);
}
......@@ -59,6 +59,8 @@
#include <asm/intel_rdt_sched.h>
#include <asm/proto.h>
#include "process.h"
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
......@@ -232,7 +234,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
......@@ -264,12 +265,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl);
/*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
__switch_to_xtra(prev_p, next_p, tss);
switch_to_extra(prev_p, next_p);
/*
* Leave lazy mode, flushing any hypercalls made here.
......
......@@ -60,6 +60,8 @@
#include <asm/unistd_32_ia32.h>
#endif
#include "process.h"
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
......@@ -553,7 +555,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
......@@ -617,12 +618,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload sp0. */
update_task_stack(next_p);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
*/
if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
switch_to_extra(prev_p, next_p);
#ifdef CONFIG_XEN_PV
/*
......
......@@ -7,7 +7,6 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/ptrace.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
......@@ -30,6 +29,12 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
/*
* Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
* stored in cpu_tlb_state.last_user_mm_ibpb.
*/
#define LAST_USER_MM_IBPB 0x1UL
/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
......@@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
}
static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
return (unsigned long)next->mm | ibpb;
}
static void cond_ibpb(struct task_struct *next)
{
if (!next || !next->mm)
return;
/*
* Check if the current (previous) task has access to the memory
* of the @tsk (next) task. If access is denied, make sure to
* issue a IBPB to stop user->user Spectre-v2 attacks.
*
* Note: __ptrace_may_access() returns 0 or -ERRNO.
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
* same process. Using the mm pointer instead of mm->context.ctx_id
* opens a hypothetical hole vs. mm_struct reuse, which is more or
* less impossible to control by an attacker. Aside of that it
* would only affect the first schedule so the theoretically
* exposed data is not really interesting.
*/
return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
if (static_branch_likely(&switch_mm_cond_ibpb)) {
unsigned long prev_mm, next_mm;
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
*
* 1) Switch from a user space task (potential attacker)
* which has TIF_SPEC_IB set to a user space task
* (potential victim) which has TIF_SPEC_IB not set.
*
* 2) Switch from a user space task (potential attacker)
* which has TIF_SPEC_IB not set to a user space task
* (potential victim) which has TIF_SPEC_IB set.
*
* This could be done by unconditionally issuing IBPB when
* a task which has TIF_SPEC_IB set is either scheduled in
* or out. Though that results in two flushes when:
*
* - the same user space task is scheduled out and later
* scheduled in again and only a kernel thread ran in
* between.
*
* - a user space task belonging to the same process is
* scheduled in after a kernel thread ran in between
*
* - a user space task belonging to the same process is
* scheduled in immediately.
*
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
* cpu_tlbstate.last_user_mm_ibpb for comparison.
*/
next_mm = mm_mangle_tif_spec_ib(next);
prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
/*
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}
if (static_branch_unlikely(&switch_mm_always_ibpb)) {
/*
* Only flush when switching to a user space task with a
* different context than the user space task which ran
* last on this CPU.
*/
if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
indirect_branch_prediction_barrier();
this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
}
}
}
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
......@@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
new_asid = prev_asid;
need_flush = true;
} else {
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
*
* As an optimization, flush indirect branches only when
* switching into a processes that can't be ptrace by the
* current one (as in such case, attacker has much more
* convenient way how to tamper with the next process than
* branch buffer poisoning).
*/
if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
ibpb_needed(tsk, last_ctx_id))
indirect_branch_prediction_barrier();
cond_ibpb(tsk);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
......@@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
......@@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
......
......@@ -64,15 +64,12 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
#define PTRACE_MODE_NOAUDIT 0x04
#define PTRACE_MODE_FSCREDS 0x08
#define PTRACE_MODE_REALCREDS 0x10
#define PTRACE_MODE_SCHED 0x20
#define PTRACE_MODE_IBPB 0x40
/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB)
/**
* ptrace_may_access - check whether the caller is permitted to access
......@@ -90,20 +87,6 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
*/
extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
/**
* ptrace_may_access - check whether the caller is permitted to access
* a target task.
* @task: target task
* @mode: selects type of access and caller credentials
*
* Returns true on success, false on denial.
*
* Similar to ptrace_may_access(). Only to be called from context switch
* code. Does not call into audit and the regular LSM hooks due to locking
* constraints.
*/
extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode);
static inline int ptrace_reparented(struct task_struct *child)
{
return !same_thread_group(child->real_parent, child->parent);
......
......@@ -1454,6 +1454,8 @@ static inline bool is_percpu_thread(void)
#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/
#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */
#define TASK_PFA_TEST(name, func) \
static inline bool task_##func(struct task_struct *p) \
......@@ -1485,6 +1487,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SMT_H
#define _LINUX_SCHED_SMT_H
#include <linux/static_key.h>
#ifdef CONFIG_SCHED_SMT
extern struct static_key_false sched_smt_present;
static __always_inline bool sched_smt_active(void)
{
return static_branch_likely(&sched_smt_present);
}
#else
static inline bool sched_smt_active(void) { return false; }
#endif
void arch_smt_update(void);
#endif
......@@ -212,6 +212,7 @@ struct prctl_mm_map {
#define PR_SET_SPECULATION_CTRL 53
/* Speculation control variants */
# define PR_SPEC_STORE_BYPASS 0
# define PR_SPEC_INDIRECT_BRANCH 1
/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
# define PR_SPEC_NOT_AFFECTED 0
# define PR_SPEC_PRCTL (1UL << 0)
......
......@@ -10,6 +10,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
......@@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void)
#endif /* CONFIG_HOTPLUG_CPU */
/*
* Architectures that need SMT-specific errata handling during SMT hotplug
* should override this.
*/
void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
EXPORT_SYMBOL_GPL(cpu_smt_control);
......@@ -1011,6 +1018,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* concurrent CPU hotplug via cpu_add_remove_lock.
*/
lockup_detector_cleanup();
arch_smt_update();
return ret;
}
......@@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpus_write_unlock();
arch_smt_update();
return ret;
}
......@@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
/*
* Architectures that need SMT-specific errata handling during SMT hotplug
* should override this.
*/
void __weak arch_smt_update(void) { };
static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
......
......@@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
if (mode & PTRACE_MODE_SCHED)
return false;
if (mode & PTRACE_MODE_NOAUDIT)
return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
else
......@@ -331,16 +328,9 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
!ptrace_has_cap(mm->user_ns, mode)))
return -EPERM;
if (mode & PTRACE_MODE_SCHED)
return 0;
return security_ptrace_access_check(task, mode);
}
bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode)
{
return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED);
}
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
{
int err;
......
......@@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu)
#ifdef CONFIG_SCHED_SMT
/*
* The sched_smt_present static key needs to be evaluated on every
* hotplug event because at boot time SMT might be disabled when
* the number of booted CPUs is limited.
*
* If then later a sibling gets hotplugged, then the key would stay
* off and SMT scheduling would never be functional.
* When going up, increment the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
static_branch_enable_cpuslocked(&sched_smt_present);
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_inc_cpuslocked(&sched_smt_present);
#endif
set_cpu_active(cpu, true);
......@@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu_mult(call_rcu, call_rcu_sched);
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
#endif
if (!sched_smp_initialized)
return 0;
......
......@@ -23,6 +23,7 @@
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
......@@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq)
#ifdef CONFIG_SCHED_SMT
extern struct static_key_false sched_smt_present;
extern void __update_idle_core(struct rq *rq);
static inline void update_idle_core(struct rq *rq)
......
......@@ -236,10 +236,8 @@ ifdef CONFIG_GCOV_KERNEL
objtool_args += --no-unreachable
endif
ifdef CONFIG_RETPOLINE
ifneq ($(RETPOLINE_CFLAGS),)
objtool_args += --retpoline
endif
endif
ifdef CONFIG_MODVERSIONS
......
......@@ -212,6 +212,7 @@ struct prctl_mm_map {
#define PR_SET_SPECULATION_CTRL 53
/* Speculation control variants */
# define PR_SPEC_STORE_BYPASS 0
# define PR_SPEC_INDIRECT_BRANCH 1
/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
# define PR_SPEC_NOT_AFFECTED 0
# define PR_SPEC_PRCTL (1UL << 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment