Commit 0d37dde7 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 vsyscall updates from Thomas Gleixner:
 "Further hardening of the legacy vsyscall by providing support for
  execute only mode and switching the default to it.

  This prevents a certain class of attacks which rely on the vsyscall
  page being accessible at a fixed address in the canonical kernel
  address space"

* 'x86-entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  selftests/x86: Add a test for process_vm_readv() on the vsyscall page
  x86/vsyscall: Add __ro_after_init to global variables
  x86/vsyscall: Change the default vsyscall mode to xonly
  selftests/x86/vsyscall: Verify that vsyscall=none blocks execution
  x86/vsyscall: Document odd SIGSEGV error code for vsyscalls
  x86/vsyscall: Show something useful on a read fault
  x86/vsyscall: Add a new vsyscall=xonly mode
  Documentation/admin: Remove the vsyscall=native documentation
parents 0902d501 7f0a5e07
......@@ -5100,13 +5100,12 @@
targets for exploits that can control RIP.
emulate [default] Vsyscalls turn into traps and are
emulated reasonably safely.
emulated reasonably safely. The vsyscall
page is readable.
native Vsyscalls are native syscall instructions.
This is a little bit faster than trapping
and makes a few dynamic recompilers work
better than they would in emulation mode.
It also makes exploits much easier to write.
xonly Vsyscalls turn into traps and are
emulated reasonably safely. The vsyscall
page is not readable.
none Vsyscalls don't work at all. This makes
them quite hard to use for exploits but
......
......@@ -2288,7 +2288,7 @@ config COMPAT_VDSO
choice
prompt "vsyscall table for legacy applications"
depends on X86_64
default LEGACY_VSYSCALL_EMULATE
default LEGACY_VSYSCALL_XONLY
help
Legacy user code that does not know how to find the vDSO expects
to be able to issue three syscalls by calling fixed addresses in
......@@ -2296,23 +2296,38 @@ choice
it can be used to assist security vulnerability exploitation.
This setting can be changed at boot time via the kernel command
line parameter vsyscall=[emulate|none].
line parameter vsyscall=[emulate|xonly|none].
On a system with recent enough glibc (2.14 or newer) and no
static binaries, you can say None without a performance penalty
to improve security.
If unsure, select "Emulate".
If unsure, select "Emulate execution only".
config LEGACY_VSYSCALL_EMULATE
bool "Emulate"
bool "Full emulation"
help
The kernel traps and emulates calls into the fixed
vsyscall address mapping. This makes the mapping
non-executable, but it still contains known contents,
which could be used in certain rare security vulnerability
exploits. This configuration is recommended when userspace
still uses the vsyscall area.
The kernel traps and emulates calls into the fixed vsyscall
address mapping. This makes the mapping non-executable, but
it still contains readable known contents, which could be
used in certain rare security vulnerability exploits. This
configuration is recommended when using legacy userspace
that still uses vsyscalls along with legacy binary
instrumentation tools that require code to be readable.
An example of this type of legacy userspace is running
Pin on an old binary that still uses vsyscalls.
config LEGACY_VSYSCALL_XONLY
bool "Emulate execution only"
help
The kernel traps and emulates calls into the fixed vsyscall
address mapping and does not allow reads. This
configuration is recommended when userspace might use the
legacy vsyscall area but support for legacy binary
instrumentation of legacy code is not needed. It mitigates
certain uses of the vsyscall area as an ASLR-bypassing
buffer.
config LEGACY_VSYSCALL_NONE
bool "None"
......
......@@ -42,9 +42,11 @@
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
static enum { EMULATE, NONE } vsyscall_mode =
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
XONLY;
#else
EMULATE;
#endif
......@@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str)
if (str) {
if (!strcmp("emulate", str))
vsyscall_mode = EMULATE;
else if (!strcmp("xonly", str))
vsyscall_mode = XONLY;
else if (!strcmp("none", str))
vsyscall_mode = NONE;
else
......@@ -113,7 +117,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
}
}
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
......@@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
long ret;
unsigned long orig_dx;
/* Write faults or kernel-privilege faults never get fixed up. */
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
return false;
if (!(error_code & X86_PF_INSTR)) {
/* Failed vsyscall read */
if (vsyscall_mode == EMULATE)
return false;
/*
* User code tried and failed to read the vsyscall page.
*/
warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
return false;
}
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
......@@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
static struct vm_area_struct gate_vma = {
static struct vm_area_struct gate_vma __ro_after_init = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
......@@ -357,12 +378,20 @@ void __init map_vsyscall(void)
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
if (vsyscall_mode != NONE) {
/*
* For full emulation, the page needs to exist for real. In
* execute-only mode, there is no PTE at all backing the vsyscall
* page.
*/
if (vsyscall_mode == EMULATE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
if (vsyscall_mode == XONLY)
gate_vma.vm_flags = VM_EXEC;
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}
......@@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
* Called on instruction fetch fault in vsyscall page.
* Returns true if handled.
*/
extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
extern bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address);
#else
static inline void map_vsyscall(void) {}
static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
static inline bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
return false;
}
......
......@@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address,
* To avoid leaking information about the kernel page
* table layout, pretend that user-mode accesses to
* kernel addresses are always protection faults.
*
* NB: This means that failed vsyscalls with vsyscall=none
* will have the PROT bit. This doesn't leak any
* information and does not appear to cause any problems.
*/
if (address >= TASK_SIZE_MAX)
error_code |= X86_PF_PROT;
......@@ -1369,16 +1373,18 @@ void do_user_addr_fault(struct pt_regs *regs,
#ifdef CONFIG_X86_64
/*
* Instruction fetch faults in the vsyscall page might need
* emulation. The vsyscall page is at a high address
* (>PAGE_OFFSET), but is considered to be part of the user
* address space.
* Faults in the vsyscall page might need emulation. The
* vsyscall page is at a high address (>PAGE_OFFSET), but is
* considered to be part of the user address space.
*
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*
* PKRU never rejects instruction fetches, so we don't need
* to consider the PF_PK bit.
*/
if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
if (is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(hw_error_code, regs, address))
return;
}
#endif
......
......@@ -18,6 +18,7 @@
#include <sched.h>
#include <stdbool.h>
#include <setjmp.h>
#include <sys/uio.h>
#ifdef __x86_64__
# define VSYS(x) (x)
......@@ -49,21 +50,21 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
}
/* vsyscalls and vDSO */
bool should_read_vsyscall = false;
bool vsyscall_map_r = false, vsyscall_map_x = false;
typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
const gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
gtod_t vdso_gtod;
typedef int (*vgettime_t)(clockid_t, struct timespec *);
vgettime_t vdso_gettime;
typedef long (*time_func_t)(time_t *t);
time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
const time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
time_func_t vdso_time;
typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
getcpu_t vdso_getcpu;
static void init_vdso(void)
......@@ -107,7 +108,7 @@ static int init_vsys(void)
maps = fopen("/proc/self/maps", "r");
if (!maps) {
printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
should_read_vsyscall = true;
vsyscall_map_r = true;
return 0;
}
......@@ -133,12 +134,8 @@ static int init_vsys(void)
}
printf("\tvsyscall permissions are %c-%c\n", r, x);
should_read_vsyscall = (r == 'r');
if (x != 'x') {
vgtod = NULL;
vtime = NULL;
vgetcpu = NULL;
}
vsyscall_map_r = (r == 'r');
vsyscall_map_x = (x == 'x');
found = true;
break;
......@@ -148,10 +145,8 @@ static int init_vsys(void)
if (!found) {
printf("\tno vsyscall map in /proc/self/maps\n");
should_read_vsyscall = false;
vgtod = NULL;
vtime = NULL;
vgetcpu = NULL;
vsyscall_map_r = false;
vsyscall_map_x = false;
}
return nerrs;
......@@ -183,9 +178,13 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node,
}
static jmp_buf jmpbuf;
static volatile unsigned long segv_err;
static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{
ucontext_t *ctx = (ucontext_t *)ctx_void;
segv_err = ctx->uc_mcontext.gregs[REG_ERR];
siglongjmp(jmpbuf, 1);
}
......@@ -238,7 +237,7 @@ static int test_gtod(void)
err(1, "syscall gettimeofday");
if (vdso_gtod)
ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
if (vgtod)
if (vsyscall_map_x)
ret_vsys = vgtod(&tv_vsys, &tz_vsys);
if (sys_gtod(&tv_sys2, &tz_sys) != 0)
err(1, "syscall gettimeofday");
......@@ -252,7 +251,7 @@ static int test_gtod(void)
}
}
if (vgtod) {
if (vsyscall_map_x) {
if (ret_vsys == 0) {
nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
} else {
......@@ -273,7 +272,7 @@ static int test_time(void) {
t_sys1 = sys_time(&t2_sys1);
if (vdso_time)
t_vdso = vdso_time(&t2_vdso);
if (vtime)
if (vsyscall_map_x)
t_vsys = vtime(&t2_vsys);
t_sys2 = sys_time(&t2_sys2);
if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
......@@ -294,7 +293,7 @@ static int test_time(void) {
}
}
if (vtime) {
if (vsyscall_map_x) {
if (t_vsys < 0 || t_vsys != t2_vsys) {
printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
nerrs++;
......@@ -330,7 +329,7 @@ static int test_getcpu(int cpu)
ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
if (vdso_getcpu)
ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
if (vgetcpu)
if (vsyscall_map_x)
ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
if (ret_sys == 0) {
......@@ -369,7 +368,7 @@ static int test_getcpu(int cpu)
}
}
if (vgetcpu) {
if (vsyscall_map_x) {
if (ret_vsys) {
printf("[FAIL]\tvsyscall getcpu() failed\n");
nerrs++;
......@@ -410,20 +409,88 @@ static int test_vsys_r(void)
can_read = false;
}
if (can_read && !should_read_vsyscall) {
if (can_read && !vsyscall_map_r) {
printf("[FAIL]\tWe have read access, but we shouldn't\n");
return 1;
} else if (!can_read && should_read_vsyscall) {
} else if (!can_read && vsyscall_map_r) {
printf("[FAIL]\tWe don't have read access, but we should\n");
return 1;
} else if (can_read) {
printf("[OK]\tWe have read access\n");
} else {
printf("[OK]\tgot expected result\n");
printf("[OK]\tWe do not have read access: #PF(0x%lx)\n",
segv_err);
}
#endif
return 0;
}
static int test_vsys_x(void)
{
#ifdef __x86_64__
if (vsyscall_map_x) {
/* We already tested this adequately. */
return 0;
}
printf("[RUN]\tMake sure that vsyscalls really page fault\n");
bool can_exec;
if (sigsetjmp(jmpbuf, 1) == 0) {
vgtod(NULL, NULL);
can_exec = true;
} else {
can_exec = false;
}
if (can_exec) {
printf("[FAIL]\tExecuting the vsyscall did not page fault\n");
return 1;
} else if (segv_err & (1 << 4)) { /* INSTR */
printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n",
segv_err);
} else {
printf("[FAILT]\tExecution failed with the wrong error: #PF(0x%lx)\n",
segv_err);
return 1;
}
#endif
return 0;
}
static int test_process_vm_readv(void)
{
#ifdef __x86_64__
char buf[4096];
struct iovec local, remote;
int ret;
printf("[RUN]\tprocess_vm_readv() from vsyscall page\n");
local.iov_base = buf;
local.iov_len = 4096;
remote.iov_base = (void *)0xffffffffff600000;
remote.iov_len = 4096;
ret = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
if (ret != 4096) {
printf("[OK]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno);
return 0;
}
if (vsyscall_map_r) {
if (!memcmp(buf, (const void *)0xffffffffff600000, 4096)) {
printf("[OK]\tIt worked and read correct data\n");
} else {
printf("[FAIL]\tIt worked but returned incorrect data\n");
return 1;
}
}
#endif
return 0;
}
#ifdef __x86_64__
#define X86_EFLAGS_TF (1UL << 8)
......@@ -455,7 +522,7 @@ static int test_emulation(void)
time_t tmp;
bool is_native;
if (!vtime)
if (!vsyscall_map_x)
return 0;
printf("[RUN]\tchecking that vsyscalls are emulated\n");
......@@ -497,6 +564,9 @@ int main(int argc, char **argv)
sethandler(SIGSEGV, sigsegv, 0);
nerrs += test_vsys_r();
nerrs += test_vsys_x();
nerrs += test_process_vm_readv();
#ifdef __x86_64__
nerrs += test_emulation();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment