Commit 0d37dde7 authored by Linus Torvalds

Merge branch 'x86-entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 vsyscall updates from Thomas Gleixner:
 "Further hardening of the legacy vsyscall by providing support for
  execute only mode and switching the default to it.

  This prevents a certain class of attacks which rely on the vsyscall
  page being accessible at a fixed address in the canonical kernel
  address space"

* 'x86-entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  selftests/x86: Add a test for process_vm_readv() on the vsyscall page
  x86/vsyscall: Add __ro_after_init to global variables
  x86/vsyscall: Change the default vsyscall mode to xonly
  selftests/x86/vsyscall: Verify that vsyscall=none blocks execution
  x86/vsyscall: Document odd SIGSEGV error code for vsyscalls
  x86/vsyscall: Show something useful on a read fault
  x86/vsyscall: Add a new vsyscall=xonly mode
  Documentation/admin: Remove the vsyscall=native documentation
parents 0902d501 7f0a5e07
...@@ -5100,13 +5100,12 @@ ...@@ -5100,13 +5100,12 @@
targets for exploits that can control RIP. targets for exploits that can control RIP.
emulate [default] Vsyscalls turn into traps and are emulate [default] Vsyscalls turn into traps and are
emulated reasonably safely. emulated reasonably safely. The vsyscall
page is readable.
native Vsyscalls are native syscall instructions. xonly Vsyscalls turn into traps and are
This is a little bit faster than trapping emulated reasonably safely. The vsyscall
and makes a few dynamic recompilers work page is not readable.
better than they would in emulation mode.
It also makes exploits much easier to write.
none Vsyscalls don't work at all. This makes none Vsyscalls don't work at all. This makes
them quite hard to use for exploits but them quite hard to use for exploits but
......
...@@ -2288,7 +2288,7 @@ config COMPAT_VDSO ...@@ -2288,7 +2288,7 @@ config COMPAT_VDSO
choice choice
prompt "vsyscall table for legacy applications" prompt "vsyscall table for legacy applications"
depends on X86_64 depends on X86_64
default LEGACY_VSYSCALL_EMULATE default LEGACY_VSYSCALL_XONLY
help help
Legacy user code that does not know how to find the vDSO expects Legacy user code that does not know how to find the vDSO expects
to be able to issue three syscalls by calling fixed addresses in to be able to issue three syscalls by calling fixed addresses in
...@@ -2296,23 +2296,38 @@ choice ...@@ -2296,23 +2296,38 @@ choice
it can be used to assist security vulnerability exploitation. it can be used to assist security vulnerability exploitation.
This setting can be changed at boot time via the kernel command This setting can be changed at boot time via the kernel command
line parameter vsyscall=[emulate|none]. line parameter vsyscall=[emulate|xonly|none].
On a system with recent enough glibc (2.14 or newer) and no On a system with recent enough glibc (2.14 or newer) and no
static binaries, you can say None without a performance penalty static binaries, you can say None without a performance penalty
to improve security. to improve security.
If unsure, select "Emulate". If unsure, select "Emulate execution only".
config LEGACY_VSYSCALL_EMULATE config LEGACY_VSYSCALL_EMULATE
bool "Emulate" bool "Full emulation"
help help
The kernel traps and emulates calls into the fixed The kernel traps and emulates calls into the fixed vsyscall
vsyscall address mapping. This makes the mapping address mapping. This makes the mapping non-executable, but
non-executable, but it still contains known contents, it still contains readable known contents, which could be
which could be used in certain rare security vulnerability used in certain rare security vulnerability exploits. This
exploits. This configuration is recommended when userspace configuration is recommended when using legacy userspace
still uses the vsyscall area. that still uses vsyscalls along with legacy binary
instrumentation tools that require code to be readable.
An example of this type of legacy userspace is running
Pin on an old binary that still uses vsyscalls.
config LEGACY_VSYSCALL_XONLY
bool "Emulate execution only"
help
The kernel traps and emulates calls into the fixed vsyscall
address mapping and does not allow reads. This
configuration is recommended when userspace might use the
legacy vsyscall area but support for legacy binary
instrumentation of legacy code is not needed. It mitigates
certain uses of the vsyscall area as an ASLR-bypassing
buffer.
config LEGACY_VSYSCALL_NONE config LEGACY_VSYSCALL_NONE
bool "None" bool "None"
......
...@@ -42,9 +42,11 @@ ...@@ -42,9 +42,11 @@
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include "vsyscall_trace.h" #include "vsyscall_trace.h"
static enum { EMULATE, NONE } vsyscall_mode = static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE #ifdef CONFIG_LEGACY_VSYSCALL_NONE
NONE; NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
XONLY;
#else #else
EMULATE; EMULATE;
#endif #endif
...@@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str) ...@@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str)
if (str) { if (str) {
if (!strcmp("emulate", str)) if (!strcmp("emulate", str))
vsyscall_mode = EMULATE; vsyscall_mode = EMULATE;
else if (!strcmp("xonly", str))
vsyscall_mode = XONLY;
else if (!strcmp("none", str)) else if (!strcmp("none", str))
vsyscall_mode = NONE; vsyscall_mode = NONE;
else else
...@@ -113,7 +117,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) ...@@ -113,7 +117,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
} }
} }
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{ {
struct task_struct *tsk; struct task_struct *tsk;
unsigned long caller; unsigned long caller;
...@@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ...@@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
long ret; long ret;
unsigned long orig_dx; unsigned long orig_dx;
/* Write faults or kernel-privilege faults never get fixed up. */
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
return false;
if (!(error_code & X86_PF_INSTR)) {
/* Failed vsyscall read */
if (vsyscall_mode == EMULATE)
return false;
/*
* User code tried and failed to read the vsyscall page.
*/
warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
return false;
}
/* /*
* No point in checking CS -- the only way to get here is a user mode * No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code. * trap to a high address, which means that we're in 64-bit user code.
...@@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) ...@@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
static const struct vm_operations_struct gate_vma_ops = { static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name, .name = gate_vma_name,
}; };
static struct vm_area_struct gate_vma = { static struct vm_area_struct gate_vma __ro_after_init = {
.vm_start = VSYSCALL_ADDR, .vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC, .vm_page_prot = PAGE_READONLY_EXEC,
...@@ -357,12 +378,20 @@ void __init map_vsyscall(void) ...@@ -357,12 +378,20 @@ void __init map_vsyscall(void)
extern char __vsyscall_page; extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
if (vsyscall_mode != NONE) { /*
* For full emulation, the page needs to exist for real. In
* execute-only mode, there is no PTE at all backing the vsyscall
* page.
*/
if (vsyscall_mode == EMULATE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
PAGE_KERNEL_VVAR); PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir); set_vsyscall_pgtable_user_bits(swapper_pg_dir);
} }
if (vsyscall_mode == XONLY)
gate_vma.vm_flags = VM_EXEC;
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR); (unsigned long)VSYSCALL_ADDR);
} }
...@@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root); ...@@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
* Called on instruction fetch fault in vsyscall page. * Called on instruction fetch fault in vsyscall page.
* Returns true if handled. * Returns true if handled.
*/ */
extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address);
#else #else
static inline void map_vsyscall(void) {} static inline void map_vsyscall(void) {}
static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) static inline bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{ {
return false; return false;
} }
......
...@@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address, ...@@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address,
* To avoid leaking information about the kernel page * To avoid leaking information about the kernel page
* table layout, pretend that user-mode accesses to * table layout, pretend that user-mode accesses to
* kernel addresses are always protection faults. * kernel addresses are always protection faults.
*
* NB: This means that failed vsyscalls with vsyscall=none
* will have the PROT bit. This doesn't leak any
* information and does not appear to cause any problems.
*/ */
if (address >= TASK_SIZE_MAX) if (address >= TASK_SIZE_MAX)
error_code |= X86_PF_PROT; error_code |= X86_PF_PROT;
...@@ -1369,16 +1373,18 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1369,16 +1373,18 @@ void do_user_addr_fault(struct pt_regs *regs,
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* /*
* Instruction fetch faults in the vsyscall page might need * Faults in the vsyscall page might need emulation. The
* emulation. The vsyscall page is at a high address * vsyscall page is at a high address (>PAGE_OFFSET), but is
* (>PAGE_OFFSET), but is considered to be part of the user * considered to be part of the user address space.
* address space.
* *
* The vsyscall page does not have a "real" VMA, so do this * The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs. * emulation before we go searching for VMAs.
*
* PKRU never rejects instruction fetches, so we don't need
* to consider the PF_PK bit.
*/ */
if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { if (is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address)) if (emulate_vsyscall(hw_error_code, regs, address))
return; return;
} }
#endif #endif
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <sched.h> #include <sched.h>
#include <stdbool.h> #include <stdbool.h>
#include <setjmp.h> #include <setjmp.h>
#include <sys/uio.h>
#ifdef __x86_64__ #ifdef __x86_64__
# define VSYS(x) (x) # define VSYS(x) (x)
...@@ -49,21 +50,21 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), ...@@ -49,21 +50,21 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
} }
/* vsyscalls and vDSO */ /* vsyscalls and vDSO */
bool should_read_vsyscall = false; bool vsyscall_map_r = false, vsyscall_map_x = false;
typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000); const gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
gtod_t vdso_gtod; gtod_t vdso_gtod;
typedef int (*vgettime_t)(clockid_t, struct timespec *); typedef int (*vgettime_t)(clockid_t, struct timespec *);
vgettime_t vdso_gettime; vgettime_t vdso_gettime;
typedef long (*time_func_t)(time_t *t); typedef long (*time_func_t)(time_t *t);
time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400); const time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
time_func_t vdso_time; time_func_t vdso_time;
typedef long (*getcpu_t)(unsigned *, unsigned *, void *); typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800); const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
getcpu_t vdso_getcpu; getcpu_t vdso_getcpu;
static void init_vdso(void) static void init_vdso(void)
...@@ -107,7 +108,7 @@ static int init_vsys(void) ...@@ -107,7 +108,7 @@ static int init_vsys(void)
maps = fopen("/proc/self/maps", "r"); maps = fopen("/proc/self/maps", "r");
if (!maps) { if (!maps) {
printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
should_read_vsyscall = true; vsyscall_map_r = true;
return 0; return 0;
} }
...@@ -133,12 +134,8 @@ static int init_vsys(void) ...@@ -133,12 +134,8 @@ static int init_vsys(void)
} }
printf("\tvsyscall permissions are %c-%c\n", r, x); printf("\tvsyscall permissions are %c-%c\n", r, x);
should_read_vsyscall = (r == 'r'); vsyscall_map_r = (r == 'r');
if (x != 'x') { vsyscall_map_x = (x == 'x');
vgtod = NULL;
vtime = NULL;
vgetcpu = NULL;
}
found = true; found = true;
break; break;
...@@ -148,10 +145,8 @@ static int init_vsys(void) ...@@ -148,10 +145,8 @@ static int init_vsys(void)
if (!found) { if (!found) {
printf("\tno vsyscall map in /proc/self/maps\n"); printf("\tno vsyscall map in /proc/self/maps\n");
should_read_vsyscall = false; vsyscall_map_r = false;
vgtod = NULL; vsyscall_map_x = false;
vtime = NULL;
vgetcpu = NULL;
} }
return nerrs; return nerrs;
...@@ -183,9 +178,13 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node, ...@@ -183,9 +178,13 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node,
} }
static jmp_buf jmpbuf; static jmp_buf jmpbuf;
static volatile unsigned long segv_err;
static void sigsegv(int sig, siginfo_t *info, void *ctx_void) static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{ {
ucontext_t *ctx = (ucontext_t *)ctx_void;
segv_err = ctx->uc_mcontext.gregs[REG_ERR];
siglongjmp(jmpbuf, 1); siglongjmp(jmpbuf, 1);
} }
...@@ -238,7 +237,7 @@ static int test_gtod(void) ...@@ -238,7 +237,7 @@ static int test_gtod(void)
err(1, "syscall gettimeofday"); err(1, "syscall gettimeofday");
if (vdso_gtod) if (vdso_gtod)
ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso); ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
if (vgtod) if (vsyscall_map_x)
ret_vsys = vgtod(&tv_vsys, &tz_vsys); ret_vsys = vgtod(&tv_vsys, &tz_vsys);
if (sys_gtod(&tv_sys2, &tz_sys) != 0) if (sys_gtod(&tv_sys2, &tz_sys) != 0)
err(1, "syscall gettimeofday"); err(1, "syscall gettimeofday");
...@@ -252,7 +251,7 @@ static int test_gtod(void) ...@@ -252,7 +251,7 @@ static int test_gtod(void)
} }
} }
if (vgtod) { if (vsyscall_map_x) {
if (ret_vsys == 0) { if (ret_vsys == 0) {
nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
} else { } else {
...@@ -273,7 +272,7 @@ static int test_time(void) { ...@@ -273,7 +272,7 @@ static int test_time(void) {
t_sys1 = sys_time(&t2_sys1); t_sys1 = sys_time(&t2_sys1);
if (vdso_time) if (vdso_time)
t_vdso = vdso_time(&t2_vdso); t_vdso = vdso_time(&t2_vdso);
if (vtime) if (vsyscall_map_x)
t_vsys = vtime(&t2_vsys); t_vsys = vtime(&t2_vsys);
t_sys2 = sys_time(&t2_sys2); t_sys2 = sys_time(&t2_sys2);
if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
...@@ -294,7 +293,7 @@ static int test_time(void) { ...@@ -294,7 +293,7 @@ static int test_time(void) {
} }
} }
if (vtime) { if (vsyscall_map_x) {
if (t_vsys < 0 || t_vsys != t2_vsys) { if (t_vsys < 0 || t_vsys != t2_vsys) {
printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
nerrs++; nerrs++;
...@@ -330,7 +329,7 @@ static int test_getcpu(int cpu) ...@@ -330,7 +329,7 @@ static int test_getcpu(int cpu)
ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
if (vdso_getcpu) if (vdso_getcpu)
ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
if (vgetcpu) if (vsyscall_map_x)
ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
if (ret_sys == 0) { if (ret_sys == 0) {
...@@ -369,7 +368,7 @@ static int test_getcpu(int cpu) ...@@ -369,7 +368,7 @@ static int test_getcpu(int cpu)
} }
} }
if (vgetcpu) { if (vsyscall_map_x) {
if (ret_vsys) { if (ret_vsys) {
printf("[FAIL]\tvsyscall getcpu() failed\n"); printf("[FAIL]\tvsyscall getcpu() failed\n");
nerrs++; nerrs++;
...@@ -410,20 +409,88 @@ static int test_vsys_r(void) ...@@ -410,20 +409,88 @@ static int test_vsys_r(void)
can_read = false; can_read = false;
} }
if (can_read && !should_read_vsyscall) { if (can_read && !vsyscall_map_r) {
printf("[FAIL]\tWe have read access, but we shouldn't\n"); printf("[FAIL]\tWe have read access, but we shouldn't\n");
return 1; return 1;
} else if (!can_read && should_read_vsyscall) { } else if (!can_read && vsyscall_map_r) {
printf("[FAIL]\tWe don't have read access, but we should\n"); printf("[FAIL]\tWe don't have read access, but we should\n");
return 1; return 1;
} else if (can_read) {
printf("[OK]\tWe have read access\n");
} else { } else {
printf("[OK]\tgot expected result\n"); printf("[OK]\tWe do not have read access: #PF(0x%lx)\n",
segv_err);
} }
#endif #endif
return 0; return 0;
} }
/*
 * Check that calling into the vsyscall page really faults when the page
 * is not mapped executable.
 *
 * When vsyscall_map_x is set there is nothing to do here: the executable
 * case is covered by the tests that actually call the vsyscalls.
 * Otherwise call vgtod() and expect to land in the SIGSEGV handler, which
 * stores the page-fault error code in segv_err and siglongjmp()s back to
 * jmpbuf.  The recorded error code must have the instruction-fetch bit
 * set.
 *
 * Returns the number of failures (0 or 1).  A no-op on non-x86_64 builds.
 */
static int test_vsys_x(void)
{
#ifdef __x86_64__
	/* Instruction-fetch bit of the x86 page-fault error code. */
	enum { PF_ERR_INSTR = 1 << 4 };
	bool can_exec;

	if (vsyscall_map_x) {
		/* We already tested this adequately. */
		return 0;
	}

	printf("[RUN]\tMake sure that vsyscalls really page fault\n");

	/* sigsetjmp() returns non-zero only when the SIGSEGV handler jumps back. */
	if (sigsetjmp(jmpbuf, 1) == 0) {
		vgtod(NULL, NULL);
		can_exec = true;
	} else {
		can_exec = false;
	}

	if (can_exec) {
		printf("[FAIL]\tExecuting the vsyscall did not page fault\n");
		return 1;
	}

	if (!(segv_err & PF_ERR_INSTR)) {
		/* We faulted, but not on an instruction fetch. */
		printf("[FAIL]\tExecution failed with the wrong error: #PF(0x%lx)\n",
		       segv_err);
		return 1;
	}

	printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n",
	       segv_err);
#endif

	return 0;
}
/*
 * Read the vsyscall page of our own process through process_vm_readv().
 *
 * The read is expected to succeed if and only if the vsyscall page is
 * mapped readable (vsyscall_map_r):
 *   - short read / error while readable      -> FAIL
 *   - short read / error while not readable  -> OK
 *   - full read while readable               -> OK only if the data matches
 *                                               a direct read of the page
 *   - full read while not readable           -> FAIL
 *
 * Returns the number of failures (0 or 1).  A no-op on non-x86_64 builds.
 */
static int test_process_vm_readv(void)
{
#ifdef __x86_64__
	char buf[4096];
	struct iovec local, remote;
	int ret;

	printf("[RUN]\tprocess_vm_readv() from vsyscall page\n");

	local.iov_base = buf;
	local.iov_len = sizeof(buf);
	remote.iov_base = (void *)0xffffffffff600000;
	remote.iov_len = sizeof(buf);

	ret = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
	if (ret != (int)sizeof(buf)) {
		/* Failing is only acceptable when the page is not readable. */
		printf("[%s]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n",
		       vsyscall_map_r ? "FAIL" : "OK", ret, errno);
		return vsyscall_map_r ? 1 : 0;
	}

	if (!vsyscall_map_r) {
		printf("[FAIL]\tprocess_vm_readv() succeeded, but it should have failed in this configuration\n");
		return 1;
	}

	/* The page is readable, so a direct read gives the reference data. */
	if (memcmp(buf, remote.iov_base, sizeof(buf)) != 0) {
		printf("[FAIL]\tIt worked but returned incorrect data\n");
		return 1;
	}

	printf("[OK]\tIt worked and read correct data\n");
#endif

	return 0;
}
#ifdef __x86_64__ #ifdef __x86_64__
#define X86_EFLAGS_TF (1UL << 8) #define X86_EFLAGS_TF (1UL << 8)
...@@ -455,7 +522,7 @@ static int test_emulation(void) ...@@ -455,7 +522,7 @@ static int test_emulation(void)
time_t tmp; time_t tmp;
bool is_native; bool is_native;
if (!vtime) if (!vsyscall_map_x)
return 0; return 0;
printf("[RUN]\tchecking that vsyscalls are emulated\n"); printf("[RUN]\tchecking that vsyscalls are emulated\n");
...@@ -497,6 +564,9 @@ int main(int argc, char **argv) ...@@ -497,6 +564,9 @@ int main(int argc, char **argv)
sethandler(SIGSEGV, sigsegv, 0); sethandler(SIGSEGV, sigsegv, 0);
nerrs += test_vsys_r(); nerrs += test_vsys_r();
nerrs += test_vsys_x();
nerrs += test_process_vm_readv();
#ifdef __x86_64__ #ifdef __x86_64__
nerrs += test_emulation(); nerrs += test_emulation();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment