Commit e994cc24 authored by Linus Torvalds

Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull seccomp updates from Kees Cook:
 "The major change here is finally gaining seccomp constant-action
  bitmaps, which internally reduces the seccomp overhead for many
  real-world syscall filters to O(1), as discussed at Plumbers this
  year.

   - Improve seccomp performance via constant-action bitmaps (YiFei Zhu
     & Kees Cook)

   - Fix bogus __user annotations (Jann Horn)

   - Add missed CONFIG for improved selftest coverage (Mickaël Salaün)"

* tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
  selftests/seccomp: Update kernel config
  seccomp: Remove bogus __user annotations
  seccomp/cache: Report cache data through /proc/pid/seccomp_cache
  xtensa: Enable seccomp architecture tracking
  sh: Enable seccomp architecture tracking
  s390: Enable seccomp architecture tracking
  riscv: Enable seccomp architecture tracking
  powerpc: Enable seccomp architecture tracking
  parisc: Enable seccomp architecture tracking
  csky: Enable seccomp architecture tracking
  arm: Enable seccomp architecture tracking
  arm64: Enable seccomp architecture tracking
  selftests/seccomp: Compare bitmap vs filter overhead
  x86: Enable seccomp architecture tracking
  seccomp/cache: Add "emulator" to check if filter is constant allow
  seccomp/cache: Lookup syscall allowlist bitmap for fast path
parents ba1d41a5 2c07343a
......@@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER
- secure_computing return value is checked and a return value of -1
results in the system call being skipped immediately.
- seccomp syscall wired up
- if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE,
SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If
COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too.
config SECCOMP
prompt "Enable seccomp to safely execute untrusted bytecode"
......@@ -514,6 +517,20 @@ config SECCOMP_FILTER
See Documentation/userspace-api/seccomp_filter.rst for details.
config SECCOMP_CACHE_DEBUG
bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR
depends on PROC_FS
help
This enables the /proc/pid/seccomp_cache interface to monitor
seccomp cache data. The file format is subject to change. Reading
the file requires CAP_SYS_ADMIN.
This option is for debugging only. Enabling presents the risk that
an adversary may be able to infer the seccomp filter logic.
If unsure, say N.
config HAVE_ARCH_STACKLEAK
bool
help
......
......@@ -4,7 +4,6 @@ generic-y += extable.h
generic-y += flat.h
generic-y += local64.h
generic-y += parport.h
generic-y += seccomp.h
generated-y += mach-types.h
generated-y += unistd-nr.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Description of the native syscall ABI for the seccomp action cache:
 *  - SECCOMP_ARCH_NATIVE:      value compared against seccomp_data.arch
 *  - SECCOMP_ARCH_NATIVE_NR:   number of bits in the per-filter allow bitmap
 *  - SECCOMP_ARCH_NATIVE_NAME: label printed in /proc/pid/seccomp_cache
 * No SECCOMP_ARCH_COMPAT* values are defined here.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_ARM
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "arm"
#endif /* _ASM_SECCOMP_H */
......@@ -19,4 +19,13 @@
#include <asm-generic/seccomp.h>
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_AARCH64
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "aarch64"
#ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_ARM
# define SECCOMP_ARCH_COMPAT_NR __NR_compat_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "arm"
#endif
#endif /* _ASM_SECCOMP_H */
......@@ -4,6 +4,5 @@ generic-y += gpio.h
generic-y += kvm_para.h
generic-y += local64.h
generic-y += qrwlock.h
generic-y += seccomp.h
generic-y += user.h
generic-y += vmlinux.lds.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Description of the native syscall ABI for the seccomp action cache:
 *  - SECCOMP_ARCH_NATIVE:      value compared against seccomp_data.arch
 *  - SECCOMP_ARCH_NATIVE_NR:   number of bits in the per-filter allow bitmap
 *  - SECCOMP_ARCH_NATIVE_NAME: label printed in /proc/pid/seccomp_cache
 * No SECCOMP_ARCH_COMPAT* values are defined here.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_CSKY
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "csky"
#endif /* _ASM_SECCOMP_H */
......@@ -5,5 +5,4 @@ generated-y += syscall_table_c32.h
generic-y += kvm_para.h
generic-y += local64.h
generic-y += mcs_spinlock.h
generic-y += seccomp.h
generic-y += user.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Description of the syscall ABIs for the seccomp action cache. For each
 * ABI: the value compared against seccomp_data.arch, the number of bits
 * in the per-filter allow bitmap, and the label printed in
 * /proc/pid/seccomp_cache.
 */
#ifdef CONFIG_64BIT
/* 64-bit kernel: the native ABI is parisc64. */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC64
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "parisc64"
# ifdef CONFIG_COMPAT
/* With CONFIG_COMPAT, the 32-bit parisc ABI gets its own compat bitmap. */
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_PARISC
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "parisc"
# endif
#else /* !CONFIG_64BIT */
/* 32-bit kernel: parisc is the native ABI and no compat ABI is defined. */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "parisc"
#endif
#endif /* _ASM_SECCOMP_H */
......@@ -8,4 +8,27 @@
#include <asm-generic/seccomp.h>
#ifdef __LITTLE_ENDIAN__
#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
#define __SECCOMP_ARCH_LE_NAME "le"
#else
#define __SECCOMP_ARCH_LE 0
#define __SECCOMP_ARCH_LE_NAME
#endif
#ifdef CONFIG_PPC64
# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC64 | __SECCOMP_ARCH_LE)
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "ppc64" __SECCOMP_ARCH_LE_NAME
# ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "ppc" __SECCOMP_ARCH_LE_NAME
# endif
#else /* !CONFIG_PPC64 */
# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "ppc" __SECCOMP_ARCH_LE_NAME
#endif
#endif /* _ASM_POWERPC_SECCOMP_H */
......@@ -7,4 +7,14 @@
#include <asm-generic/seccomp.h>
#ifdef CONFIG_64BIT
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV64
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "riscv64"
#else /* !CONFIG_64BIT */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV32
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "riscv32"
#endif
#endif /* _ASM_SECCOMP_H */
......@@ -16,4 +16,13 @@
#include <asm-generic/seccomp.h>
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "s390x"
#ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "s390"
#endif
#endif /* _ASM_S390_SECCOMP_H */
......@@ -8,4 +8,14 @@
#define __NR_seccomp_exit __NR_exit
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
#else
#define __SECCOMP_ARCH_LE 0
#endif
#define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_SH | __SECCOMP_ARCH_LE)
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "sh"
#endif /* __ASM_SECCOMP_H */
......@@ -16,6 +16,26 @@
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
#endif
#ifdef CONFIG_X86_64
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "x86_64"
# ifdef CONFIG_COMPAT
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386
# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls
# define SECCOMP_ARCH_COMPAT_NAME "ia32"
# endif
/*
* x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
* caching them and they are treated as out of range syscalls, which will
* always pass through the BPF filter.
*/
#else /* !CONFIG_X86_64 */
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
# define SECCOMP_ARCH_NATIVE_NAME "ia32"
#endif
#include <asm-generic/seccomp.h>
#endif /* _ASM_X86_SECCOMP_H */
......@@ -7,5 +7,4 @@ generic-y += mcs_spinlock.h
generic-y += param.h
generic-y += qrwlock.h
generic-y += qspinlock.h
generic-y += seccomp.h
generic-y += user.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_SECCOMP_H
#define _ASM_SECCOMP_H
#include <asm-generic/seccomp.h>
/*
 * Description of the native syscall ABI for the seccomp action cache:
 *  - SECCOMP_ARCH_NATIVE:      value compared against seccomp_data.arch
 *  - SECCOMP_ARCH_NATIVE_NR:   number of bits in the per-filter allow bitmap
 *  - SECCOMP_ARCH_NATIVE_NAME: label printed in /proc/pid/seccomp_cache
 * No SECCOMP_ARCH_COMPAT* values are defined here.
 */
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_XTENSA
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME "xtensa"
#endif /* _ASM_SECCOMP_H */
......@@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
......@@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
......
......@@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task,
return -EINVAL;
}
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
struct seq_file;
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
#endif
#endif /* _LINUX_SECCOMP_H */
......@@ -143,6 +143,38 @@ struct notification {
struct list_head notifications;
};
#ifdef SECCOMP_ARCH_NATIVE
/**
 * struct action_cache - per-filter cache of seccomp actions per
 *			 arch/syscall pair
 *
 * Bit N of a bitmap describes syscall number N. A set bit means the
 * filter always allows that syscall, so the lookup can answer "allow"
 * without running the BPF program; a clear bit means the filters must
 * be run.
 *
 * @allow_native: A bitmap where each bit represents whether the
 *		  filter will always allow the syscall, for the
 *		  native architecture. Holds SECCOMP_ARCH_NATIVE_NR bits.
 * @allow_compat: A bitmap where each bit represents whether the
 *		  filter will always allow the syscall, for the
 *		  compat architecture. Holds SECCOMP_ARCH_COMPAT_NR bits
 *		  and only exists when SECCOMP_ARCH_COMPAT is defined.
 */
struct action_cache {
DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
#ifdef SECCOMP_ARCH_COMPAT
DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
#endif
};
#else
/*
 * Fallbacks for architectures that do not define SECCOMP_ARCH_NATIVE:
 * the cache carries no state and the helpers below do nothing.
 */
struct action_cache { };
/*
 * Without a cache nothing is ever known to be allowed, so this always
 * reports a miss and the caller goes on to run the filters.
 */
static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
const struct seccomp_data *sd)
{
return false;
}
/* No bitmaps exist in this configuration, so there is nothing to precompute. */
static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
{
}
#endif /* SECCOMP_ARCH_NATIVE */
/**
* struct seccomp_filter - container for seccomp BPF programs
*
......@@ -159,6 +191,7 @@ struct notification {
* this filter after reaching 0. The @users count is always smaller
* or equal to @refs. Hence, reaching 0 for @users does not mean
* the filter can be freed.
* @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
......@@ -180,6 +213,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
......@@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
#ifdef SECCOMP_ARCH_NATIVE
/*
 * Test whether @syscall_nr is marked "always allow" in @bitmap, which
 * holds @bitmap_size bits. Syscall numbers outside [0, @bitmap_size)
 * are never treated as cached.
 */
static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
						    size_t bitmap_size,
						    int syscall_nr)
{
	int idx;

	/* Negative numbers are rejected first, before the unsigned compare. */
	if (unlikely(syscall_nr < 0))
		return false;
	if (unlikely(syscall_nr >= bitmap_size))
		return false;

	/* Sanitize the index so it cannot be used out of bounds speculatively. */
	idx = array_index_nospec(syscall_nr, bitmap_size);

	return test_bit(idx, bitmap);
}
/**
 * seccomp_cache_check_allow - consult a filter's constant-allow cache
 * @sfilter: The seccomp filter whose cache is consulted
 * @sd: The seccomp data (syscall number and arch) to look up
 *
 * Picks the bitmap matching @sd->arch and tests the bit for @sd->nr.
 *
 * Returns true if the seccomp_data is cached and allowed.
 */
static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
					     const struct seccomp_data *sd)
{
	const struct action_cache *cache = &sfilter->cache;
	int syscall_nr = sd->nr;

#ifdef SECCOMP_ARCH_COMPAT
	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
		return seccomp_cache_check_allow_bitmap(cache->allow_native,
							SECCOMP_ARCH_NATIVE_NR,
							syscall_nr);
	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
		return seccomp_cache_check_allow_bitmap(cache->allow_compat,
							SECCOMP_ARCH_COMPAT_NR,
							syscall_nr);

	/* Neither native nor compat: unexpected, report it once and miss. */
	WARN_ON_ONCE(true);
	return false;
#else
	/* A native-only architecture doesn't need to check sd->arch. */
	return seccomp_cache_check_allow_bitmap(cache->allow_native,
						SECCOMP_ARCH_NATIVE_NR,
						syscall_nr);
#endif /* SECCOMP_ARCH_COMPAT */
}
#endif /* SECCOMP_ARCH_NATIVE */
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
......@@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
if (seccomp_cache_check_allow(f, sd))
return SECCOMP_RET_ALLOW;
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
......@@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
/* We are effectively holding the siglock by not having any sighand. */
WARN_ON(tsk->sighand != NULL);
/* Detach task from its filter tree. */
tsk->seccomp.filter = NULL;
__seccomp_filter_release(orig);
......@@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
const bool save_orig =
#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
true;
#else
false;
#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
......@@ -609,6 +700,148 @@ seccomp_prepare_user_filter(const char __user *user_filter)
return filter;
}
#ifdef SECCOMP_ARCH_NATIVE
/**
* seccomp_is_const_allow - check if filter is constant allow with given data
* @fprog: The BPF programs
* @sd: The seccomp data to check against, only syscall number and arch
* number are considered constant.
*/
static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
struct seccomp_data *sd)
{
unsigned int reg_value = 0;
unsigned int pc;
bool op_res;
if (WARN_ON_ONCE(!fprog))
return false;
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
u16 code = insn->code;
u32 k = insn->k;
switch (code) {
case BPF_LD | BPF_W | BPF_ABS:
switch (k) {
case offsetof(struct seccomp_data, nr):
reg_value = sd->nr;
break;
case offsetof(struct seccomp_data, arch):
reg_value = sd->arch;
break;
default:
/* can't optimize (non-constant value load) */
return false;
}
break;
case BPF_RET | BPF_K:
/* reached return with constant values only, check allow */
return k == SECCOMP_RET_ALLOW;
case BPF_JMP | BPF_JA:
pc += insn->k;
break;
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JGE | BPF_K:
case BPF_JMP | BPF_JGT | BPF_K:
case BPF_JMP | BPF_JSET | BPF_K:
switch (BPF_OP(code)) {
case BPF_JEQ:
op_res = reg_value == k;
break;
case BPF_JGE:
op_res = reg_value >= k;
break;
case BPF_JGT:
op_res = reg_value > k;
break;
case BPF_JSET:
op_res = !!(reg_value & k);
break;
default:
/* can't optimize (unknown jump) */
return false;
}
pc += op_res ? insn->jt : insn->jf;
break;
case BPF_ALU | BPF_AND | BPF_K:
reg_value &= k;
break;
default:
/* can't optimize (unknown insn) */
return false;
}
}
/* ran off the end of the filter?! */
WARN_ON(1);
return false;
}
/*
 * Compute one allow bitmap for @sfilter.
 *
 * @bitmap:      destination bitmap of @bitmap_size bits
 * @bitmap_prev: matching bitmap of the previous filter, or NULL if none
 * @bitmap_size: number of syscalls (bits) covered
 * @arch:        arch value fed to the emulator for every syscall
 *
 * Starts from the previous filter's bitmap (or all-ones when there is
 * none) and clears every bit whose syscall the new filter's program is
 * not proven to constantly allow.
 */
static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
					 void *bitmap, const void *bitmap_prev,
					 size_t bitmap_size, int arch)
{
	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
	struct seccomp_data sd;
	int nr;

	if (!bitmap_prev) {
		/* Before any filters, all syscalls are always allowed. */
		bitmap_fill(bitmap, bitmap_size);
	} else {
		/* The new filter must be as restrictive as the last. */
		bitmap_copy(bitmap, bitmap_prev, bitmap_size);
	}

	/* The arch is the same for every probe; only the syscall nr varies. */
	sd.arch = arch;

	for (nr = 0; nr < bitmap_size; nr++) {
		/* Already not cacheable in an earlier filter: leave it clear. */
		if (!test_bit(nr, bitmap))
			continue;

		sd.nr = nr;

		/*
		 * Keep the bit only when the program constantly allows this
		 * syscall; otherwise the filters must always run.
		 * atomic clear_bit() not needed, filter not visible yet.
		 */
		if (!seccomp_is_const_allow(fprog, &sd))
			__clear_bit(nr, bitmap);
	}
}
/**
* seccomp_cache_prepare - emulate the filter to find cachable syscalls
* @sfilter: The seccomp filter
*
* Returns 0 if successful or -errno if error occurred.
*/
static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
{
struct action_cache *cache = &sfilter->cache;
const struct action_cache *cache_prev =
sfilter->prev ? &sfilter->prev->cache : NULL;
seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
cache_prev ? cache_prev->allow_native : NULL,
SECCOMP_ARCH_NATIVE_NR,
SECCOMP_ARCH_NATIVE);
#ifdef SECCOMP_ARCH_COMPAT
seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
cache_prev ? cache_prev->allow_compat : NULL,
SECCOMP_ARCH_COMPAT_NR,
SECCOMP_ARCH_COMPAT);
#endif /* SECCOMP_ARCH_COMPAT */
}
#endif /* SECCOMP_ARCH_NATIVE */
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
......@@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);
......@@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos)
{
char names[sizeof(seccomp_actions_avail)];
......@@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
return proc_dostring(&table, 0, buffer, lenp, ppos);
}
static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
......@@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
device_initcall(seccomp_sysctl_init)
#endif /* CONFIG_SYSCTL */
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
/*
 * Emit one "<name> <nr> <ALLOW|FILTER>" line per syscall number in
 * [0, @bitmap_size): ALLOW when the bit is set in @bitmap, FILTER
 * otherwise.
 */
static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
					const void *bitmap, size_t bitmap_size)
{
	int nr = 0;

	while (nr < bitmap_size) {
		seq_printf(m, "%s %d %s\n", name, nr,
			   test_bit(nr, bitmap) ? "ALLOW" : "FILTER");
		nr++;
	}
}
/*
 * Show handler for /proc/pid/seccomp_cache: dumps the action cache of
 * @task's current seccomp filter into @m.
 *
 * Returns 0 on success (including when the task has no filter),
 * -EACCES if the opener lacks CAP_SYS_ADMIN in the initial user
 * namespace, or -ESRCH if the task's sighand could not be locked.
 */
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
			   struct pid *pid, struct task_struct *task)
{
	struct seccomp_filter *filter;
	unsigned long irqflags;

	/*
	 * We don't want some sandboxed process to know what their seccomp
	 * filters consist of.
	 */
	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
		return -EACCES;

	if (!lock_task_sighand(task, &irqflags))
		return -ESRCH;

	filter = READ_ONCE(task->seccomp.filter);
	/* prevent filter from being freed while we are printing it */
	if (filter)
		__get_seccomp_filter(filter);

	unlock_task_sighand(task, &irqflags);

	/* No filter installed: nothing to print. */
	if (!filter)
		return 0;

	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
				    filter->cache.allow_native,
				    SECCOMP_ARCH_NATIVE_NR);

#ifdef SECCOMP_ARCH_COMPAT
	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
				    filter->cache.allow_compat,
				    SECCOMP_ARCH_COMPAT_NR);
#endif /* SECCOMP_ARCH_COMPAT */

	__put_seccomp_filter(filter);
	return 0;
}
#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
CONFIG_PID_NS=y
CONFIG_SECCOMP=y
CONFIG_SECCOMP_FILTER=y
CONFIG_USER_NS=y
......@@ -4,12 +4,16 @@
*/
#define _GNU_SOURCE
#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
......@@ -70,18 +74,74 @@ unsigned long long calibrate(void)
return samples * seconds;
}
/*
 * Return true when the two measurements are approximately equal: the
 * same, or the larger one exceeds the smaller by no more than 1% of the
 * smaller value or 2 units, whichever tolerance is greater.
 */
bool approx(int i_one, int i_two)
{
	double a = i_one;
	double b = i_two;
	/* Upper bound each value may be exceeded by and still count as close. */
	double a_limit = a + MAX(a * 0.01, 2.0);
	double b_limit = b + MAX(b * 0.01, 2.0);

	if (a == b)
		return true;
	if (a > b)
		return a <= b_limit;
	return b <= a_limit;
}
/* Return true when the first measurement is less than or equal to the second. */
bool le(int i_one, int i_two)
{
	return i_one <= i_two;
}
/*
 * Print and evaluate one benchmark expectation.
 *
 * Prints "<name_one> <name_eval> <name_two> (<one> <name_eval> <two>): "
 * followed by a pass/fail mark from eval(one, two). A value above
 * INT_MAX is reported as a miscalculation, without calling eval.
 *
 * Returns 0 when the expectation holds, 1 otherwise.
 */
long compare(const char *name_one, const char *name_eval, const char *name_two,
	     unsigned long long one, bool (*eval)(int, int), unsigned long long two)
{
	const unsigned long long operands[] = { one, two };
	unsigned int i;
	bool good;

	printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two,
	       (long long)one, name_eval, (long long)two);

	/* Values beyond INT_MAX would not survive the conversion to int below. */
	for (i = 0; i < 2; i++) {
		if (operands[i] > INT_MAX) {
			printf("Miscalculation! Measurement went negative: %lld\n",
			       (long long)operands[i]);
			return 1;
		}
	}

	good = eval(one, two);
	printf("%s\n", good ? "✔️" : "❌");

	return !good;
}
int main(int argc, char *argv[])
{
struct sock_filter bitmap_filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog bitmap_prog = {
.len = (unsigned short)ARRAY_SIZE(bitmap_filter),
.filter = bitmap_filter,
};
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
.len = (unsigned short)ARRAY_SIZE(filter),
.filter = filter,
};
long ret;
unsigned long long samples;
unsigned long long native, filter1, filter2;
long ret, bits;
unsigned long long samples, calc;
unsigned long long native, filter1, filter2, bitmap1, bitmap2;
unsigned long long entry, per_filter1, per_filter2;
printf("Current BPF sysctl settings:\n");
system("sysctl net.core.bpf_jit_enable");
......@@ -101,35 +161,82 @@ int main(int argc, char *argv[])
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
assert(ret == 0);
/* One filter */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
/* One filter resulting in a bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1);
bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1);
/* Second filter resulting in a bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
if (filter1 == native)
printf("No overhead measured!? Try running again with more samples.\n");
bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2);
/* Two filters */
/* Third filter, can no longer be converted to bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
assert(ret == 0);
filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2);
/* Calculations */
printf("Estimated total seccomp overhead for 1 filter: %llu ns\n",
filter1 - native);
filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1);
printf("Estimated total seccomp overhead for 2 filters: %llu ns\n",
filter2 - native);
/* Fourth filter, can not be converted to bitmap because of filter 3 */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
printf("Estimated seccomp per-filter overhead: %llu ns\n",
filter2 - filter1);
filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2);
/* Estimations */
#define ESTIMATE(fmt, var, what) do { \
var = (what); \
printf("Estimated " fmt ": %llu ns\n", var); \
if (var > INT_MAX) \
goto more_samples; \
} while (0)
ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc,
bitmap1 - native);
ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc,
bitmap2 - native);
ESTIMATE("total seccomp overhead for 3 full filters", calc,
filter1 - native);
ESTIMATE("total seccomp overhead for 4 full filters", calc,
filter2 - native);
ESTIMATE("seccomp entry overhead", entry,
bitmap1 - native - (bitmap2 - bitmap1));
ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1,
filter2 - filter1);
ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2,
(filter2 - native - entry) / 4);
printf("Expectations:\n");
ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1);
bits = compare("native", "≤", "1 filter", native, le, filter1);
if (bits)
goto more_samples;
ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)",
per_filter1, approx, per_filter2);
bits = compare("1 bitmapped", "≈", "2 bitmapped",
bitmap1 - native, approx, bitmap2 - native);
if (bits) {
printf("Skipping constant action bitmap expectations: they appear unsupported.\n");
goto out;
}
printf("Estimated seccomp entry overhead: %llu ns\n",
filter1 - native - (filter2 - filter1));
ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native);
ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native);
ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total",
entry + (per_filter1 * 4) + native, approx, filter2);
if (ret == 0)
goto out;
more_samples:
printf("Saw unexpected benchmark result. Try running again with more samples?\n");
out:
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment