Commit dc8af1ff authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'seccomp-v5.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull seccomp updates from Kees Cook:

 - Rework USER_NOTIF notification ordering and kill logic (Sargun
   Dhillon)

 - Improved PTRACE_O_SUSPEND_SECCOMP selftest (Jann Horn)

 - Gracefully handle failed unshare() in selftests (Yang Guang)

 - Spelling fix (Colin Ian King)

* tag 'seccomp-v5.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
  selftests/seccomp: Fix spelling mistake "Coud" -> "Could"
  selftests/seccomp: Add test for wait killable notifier
  selftests/seccomp: Refactor get_proc_stat to split out file reading code
  seccomp: Add wait_killable semantic to seccomp user notifier
  selftests/seccomp: Ensure that notifications come in FIFO order
  seccomp: Use FIFO semantics to order notifications
  selftests/seccomp: Add SKIP for failed unshare()
  selftests/seccomp: Test PTRACE_O_SUSPEND_SECCOMP without CAP_SYS_ADMIN
parents 0bf13a84 5e91d2a4
......@@ -271,6 +271,16 @@ notifying process it will be replaced. The supervisor can also add an FD, and
respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
value will be the injected file descriptor number.
The notifying process can be preempted, resulting in the notification being
aborted. This can be problematic when trying to take actions on behalf of the
notifying process that are long-running and typically retryable (mounting a
filesystem). Alternatively, at filter installation time, the
``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
such that when a user notification is received by the supervisor, the notifying
process will ignore non-fatal signals until the response is sent. Signals that
are sent prior to the notification being received by userspace are handled
normally.
It is worth noting that ``struct seccomp_data`` contains the values of register
arguments to the syscall, but does not contain pointers to memory. The task's
memory is accessible to suitably privileged tracers via ``ptrace()`` or
......
......@@ -8,7 +8,8 @@
SECCOMP_FILTER_FLAG_LOG | \
SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
SECCOMP_FILTER_FLAG_NEW_LISTENER | \
SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
/* sizeof() the first published struct seccomp_notif_addfd */
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
......
......@@ -23,6 +23,8 @@
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
/* Received notifications wait in killable state (only respond to fatal signals) */
#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
/*
* All BPF programs must return a 32-bit value.
......
......@@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
* the filter can be freed.
* @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @wait_killable_recv: Put notifying process in killable state once the
* notification is received by the userspace listener.
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
......@@ -220,6 +222,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
bool wait_killable_recv;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
......@@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags,
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
/* Set wait killable flag, if present. */
if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
filter->wait_killable_recv = true;
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
......@@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
complete(&addfd->completion);
}
/*
 * Report whether the notifying task should wait in killable state: true only
 * when the matched filter has wait_killable_recv set and the notification is
 * currently in the SECCOMP_NOTIFY_SENT state.
 */
static bool should_sleep_killable(struct seccomp_filter *match,
				  struct seccomp_knotif *n)
{
	if (!match->wait_killable_recv)
		return false;

	return n->state == SECCOMP_NOTIFY_SENT;
}
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
......@@ -1100,7 +1113,7 @@ static int seccomp_do_user_notification(int this_syscall,
n.data = sd;
n.id = seccomp_next_notify_id(match);
init_completion(&n.ready);
list_add(&n.list, &match->notif->notifications);
list_add_tail(&n.list, &match->notif->notifications);
INIT_LIST_HEAD(&n.addfd);
up(&match->notif->request);
......@@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall,
* This is where we wait for a reply from userspace.
*/
do {
bool wait_killable = should_sleep_killable(match, &n);
mutex_unlock(&match->notify_lock);
err = wait_for_completion_interruptible(&n.ready);
if (wait_killable)
err = wait_for_completion_killable(&n.ready);
else
err = wait_for_completion_interruptible(&n.ready);
mutex_lock(&match->notify_lock);
if (err != 0)
if (err != 0) {
/*
* Check to see if the notification got picked up and
* whether we should switch to wait killable.
*/
if (!wait_killable && should_sleep_killable(match, &n))
continue;
goto interrupted;
}
addfd = list_first_entry_or_null(&n.addfd,
struct seccomp_kaddfd, list);
......@@ -1484,6 +1511,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
mutex_lock(&filter->notify_lock);
knotif = find_notification(filter, unotif.id);
if (knotif) {
/* Reset the process to make sure it's not stuck */
if (should_sleep_killable(filter, knotif))
complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);
}
......@@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
return -EINVAL;
/*
* The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV flag doesn't make sense
* without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
*/
if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
return -EINVAL;
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
......
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -Wl,-no-as-needed -Wall -isystem ../../../../usr/include/
LDFLAGS += -lpthread
LDLIBS += -lcap
TEST_GEN_PROGS := seccomp_bpf seccomp_benchmark
include ../lib.mk
......@@ -46,6 +46,7 @@
#include <sys/ioctl.h>
#include <linux/kcmp.h>
#include <sys/resource.h>
#include <sys/capability.h>
#include <unistd.h>
#include <sys/syscall.h>
......@@ -59,6 +60,8 @@
#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
#endif
/*
 * MIN - evaluate to the smaller of two values.
 *
 * Guarded so that a MIN already provided by an included system header
 * (e.g. <sys/param.h>) is reused instead of being redefined.
 * Each argument may be evaluated twice: do not pass expressions with side
 * effects.
 */
#ifndef MIN
#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
#endif
/*
 * Fallback for build environments whose headers do not define
 * PR_SET_PTRACER. The value's bytes spell "Yama" in ASCII.
 */
#ifndef PR_SET_PTRACER
# define PR_SET_PTRACER 0x59616d61
#endif
......@@ -268,6 +271,10 @@ struct seccomp_notif_addfd_big {
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
#endif
/* Fallback definition for builds against headers that lack this flag. */
#if !defined(SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
#endif
#ifndef seccomp
int seccomp(unsigned int op, unsigned int flags, void *args)
{
......@@ -3742,7 +3749,10 @@ TEST(user_notification_fault_recv)
struct seccomp_notif req = {};
struct seccomp_notif_resp resp = {};
ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
ASSERT_EQ(unshare(CLONE_NEWUSER), 0) {
if (errno == EINVAL)
SKIP(return, "kernel missing CLONE_NEWUSER support");
}
listener = user_notif_syscall(__NR_getppid,
SECCOMP_FILTER_FLAG_NEW_LISTENER);
......@@ -4231,6 +4241,421 @@ TEST(user_notification_addfd_rlimit)
close(memfd);
}
/*
 * Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN.
 *
 * Fixture state shared by the O_SUSPEND_SECCOMP tests: the pid of the child
 * forked in FIXTURE_SETUP that the tests ptrace.
 */
FIXTURE(O_SUSPEND_SECCOMP) {
pid_t pid; /* 0 until FIXTURE_SETUP has called fork() */
};
/*
 * Fixture setup: clear CAP_SYS_ADMIN from the effective capability set,
 * install a seccomp filter, then fork a child that sleeps in pause() forever
 * so the tests have a target to ptrace. The order matters: the capability is
 * dropped and the filter installed before the child is forked.
 */
FIXTURE_SETUP(O_SUSPEND_SECCOMP)
{
/*
 * NOTE(review): ERRNO_FILTER is defined outside this view; presumably it
 * declares the prog_block_read program passed to prctl() below - confirm.
 */
ERRNO_FILTER(block_read, E2BIG);
cap_value_t cap_list[] = { CAP_SYS_ADMIN };
cap_t caps;
/* Start at 0 so teardown has nothing to kill if setup bails before fork() */
self->pid = 0;
/* make sure we don't have CAP_SYS_ADMIN */
caps = cap_get_proc();
ASSERT_NE(NULL, caps);
ASSERT_EQ(0, cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
ASSERT_EQ(0, cap_set_proc(caps));
cap_free(caps);
/* Without CAP_SYS_ADMIN, no_new_privs is set before installing the filter */
ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_block_read));
self->pid = fork();
ASSERT_GE(self->pid, 0);
if (self->pid == 0) {
/* Child: sleep until killed by the fixture teardown */
while (1)
pause();
/* Not reached: the loop above never exits */
_exit(127);
}
}
/*
 * Fixture teardown: kill the child forked by the setup, if there is one.
 *
 * Only a strictly positive pid is signalled. self->pid is 0 when setup never
 * reached fork() and -1 when fork() failed; kill(-1, SIGKILL) would signal
 * every process the caller is permitted to signal, so it must never be
 * issued from here.
 */
FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)
{
	if (self->pid > 0)
		kill(self->pid, SIGKILL);
}
/*
 * Attach to the fixture's child and verify that setting
 * PTRACE_O_SUSPEND_SECCOMP through PTRACE_SETOPTIONS is refused with EPERM.
 * The test is skipped when the kernel answers EINVAL.
 */
TEST_F(O_SUSPEND_SECCOMP, setoptions)
{
	int wstatus;
	long rc;

	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, self->pid, NULL, 0));
	/* Wait for the attach stop before issuing further ptrace requests */
	ASSERT_EQ(self->pid, wait(&wstatus));

	rc = ptrace(PTRACE_SETOPTIONS, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
	ASSERT_EQ(-1, rc);
	if (errno == EINVAL)
		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
	ASSERT_EQ(EPERM, errno);
}
/*
 * Verify that PTRACE_SEIZE with PTRACE_O_SUSPEND_SECCOMP on the fixture's
 * child is refused with EPERM. The test is skipped when the kernel answers
 * EINVAL.
 */
TEST_F(O_SUSPEND_SECCOMP, seize)
{
	int rc = ptrace(PTRACE_SEIZE, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);

	ASSERT_EQ(-1, rc);
	if (errno == EINVAL)
		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
	ASSERT_EQ(EPERM, errno);
}
/*
 * get_nth - Get the nth, space separated entry in a file.
 *
 * @path: file to read
 * @position: 1-based index of the entry to fetch
 * @entry: out parameter; receives the buffer allocated by getdelim() that
 *         holds the entry (trailing delimiter included, if one was read).
 *         The caller must free() it.
 *
 * Returns the number of bytes read for the entry minus one, i.e. the field
 * length without its trailing delimiter.
 * Fails the test if the file cannot be opened, if an entry cannot be read, or
 * if nothing was read (which includes position == 0).
 */
static ssize_t get_nth(struct __test_metadata *_metadata, const char *path,
		       const unsigned int position, char **entry)
{
	char *line = NULL;
	unsigned int i;
	/* Initialised so position == 0 trips the zero-length check below */
	ssize_t nread = 0;
	size_t len = 0;
	FILE *f;

	f = fopen(path, "r");
	ASSERT_NE(f, NULL) {
		TH_LOG("Could not open %s: %s", path, strerror(errno));
	}

	/* Each pass overwrites line with the next space-delimited entry */
	for (i = 0; i < position; i++) {
		nread = getdelim(&line, &len, ' ', f);
		ASSERT_GE(nread, 0) {
			TH_LOG("Failed to read %u entry in file %s", i, path);
		}
	}
	fclose(f);

	ASSERT_GT(nread, 0) {
		TH_LOG("Entry in file %s had zero length", path);
	}

	*entry = line;
	return nread - 1;
}
/*
 * For a given PID, get the task state (D, R, etc...).
 *
 * Reads the third space-separated field of /proc/<pid>/stat and fails the
 * test unless that field is exactly one character long.
 */
static char get_proc_stat(struct __test_metadata *_metadata, pid_t pid)
{
	char stat_path[100] = {0};
	char *field;
	char state;

	snprintf(stat_path, sizeof(stat_path), "/proc/%d/stat", pid);
	ASSERT_EQ(get_nth(_metadata, stat_path, 3, &field), 1);

	state = field[0];
	free(field);

	return state;
}
/*
 * Verify that user notifications are delivered to the listener in FIFO
 * order: several children block in a notifying syscall, and the listener
 * must then receive notification ids in consecutive, ascending order.
 */
TEST(user_notification_fifo)
{
	struct seccomp_notif_resp resp = {};
	struct seccomp_notif req = {};
	int i, status, listener;
	pid_t pid, pids[3];
	__u64 baseid;
	long ret;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Setup a listener */
	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	/*
	 * Handle one notification first; the ids of the following
	 * notifications are expected to continue from this one.
	 */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	baseid = req.id + 1;

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	/* Answer the first notification so that the first child can exit */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/* Start children, and generate notifications */
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		pid = fork();
		/* A failed fork() must not leave -1 in pids[] */
		ASSERT_GE(pid, 0);
		if (pid == 0) {
			ret = syscall(__NR_getppid);
			exit(ret != USER_NOTIF_MAGIC);
		}
		pids[i] = pid;
	}

	/* This spins until all of the children are sleeping */
restart_wait:
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		if (get_proc_stat(_metadata, pids[i]) != 'S') {
			nanosleep(&delay, NULL);
			goto restart_wait;
		}
	}

	/* Read the notifications in order (and respond) */
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		memset(&req, 0, sizeof(req));
		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
		EXPECT_EQ(req.id, baseid + i);
		resp.id = req.id;
		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	}

	/* Make sure notifications were received */
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		EXPECT_EQ(waitpid(pids[i], &status, 0), pids[i]);
		EXPECT_EQ(true, WIFEXITED(status));
		EXPECT_EQ(0, WEXITSTATUS(status));
	}
}
/* get_proc_syscall - Get the syscall in progress for a given pid
 *
 * Reads the first space-separated field of /proc/<pid>/syscall.
 *
 * Returns the current syscall number for a given process
 * Returns -1 if not in syscall (running or blocked)
 */
static long get_proc_syscall(struct __test_metadata *_metadata, int pid)
{
	char proc_path[100] = {0};
	long ret = -1;
	ssize_t nread;
	char *line;

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/syscall", pid);
	nread = get_nth(_metadata, proc_path, 1, &line);
	ASSERT_GT(nread, 0);

	/*
	 * The file holds just "running" when the task is not blocked; in that
	 * case keep the -1 default. Otherwise the first field is the syscall
	 * number in decimal (itself -1 when blocked outside of a syscall), so
	 * parse it only when the field is NOT "running", and parse it base 10.
	 */
	if (strncmp("running", line, MIN(7, nread)) != 0)
		ret = strtol(line, NULL, 10);

	free(line);
	return ret;
}
/*
 * Ensure non-fatal signals prior to receive are unmodified
 *
 * The filter is installed with SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV, but the
 * listener never receives the notification, so a SIGUSR1 sent to the child is
 * expected to interrupt the notifying syscall with EINTR.
 */
TEST(user_notification_wait_killable_pre_notification)
{
struct sigaction new_action = {
.sa_handler = signal_handler,
};
int listener, status, sk_pair[2];
pid_t pid;
long ret;
char c;
/* 100 ms */
struct timespec delay = { .tv_nsec = 100000000 };
ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
ASSERT_EQ(0, ret)
{
TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
}
/* Socket pair used to observe, from the parent, that the handler ran */
ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
listener = user_notif_syscall(
__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
ASSERT_GE(listener, 0);
/*
 * Check that we can interrupt the process with SIGUSR1 prior to the
 * notification being received. SIGUSR1 is wired up to a custom signal
 * handler; make sure it gets called.
 */
pid = fork();
ASSERT_GE(pid, 0);
if (pid == 0) {
close(sk_pair[0]);
/*
 * NOTE(review): handled and signal_handler are defined outside this
 * view; presumably the handler writes a byte to this fd - confirm.
 */
handled = sk_pair[1];
/* Setup the non-fatal sigaction without SA_RESTART */
if (sigaction(SIGUSR1, &new_action, NULL)) {
perror("sigaction");
exit(1);
}
ret = syscall(__NR_getppid);
/* Make sure we got a return from a signal interruption */
exit(ret != -1 || errno != EINTR);
}
/*
 * Make sure we've gotten to the seccomp user notification wait
 * from getppid prior to sending any signals
 *
 * NOTE(review): the two conditions are joined with &&, so the loop
 * stops as soon as either one holds, not only when both do.
 */
while (get_proc_syscall(_metadata, pid) != __NR_getppid &&
get_proc_stat(_metadata, pid) != 'S')
nanosleep(&delay, NULL);
/* Send non-fatal kill signal */
EXPECT_EQ(kill(pid, SIGUSR1), 0);
/* wait for process to exit (exit checks for EINTR) */
EXPECT_EQ(waitpid(pid, &status, 0), pid);
EXPECT_EQ(true, WIFEXITED(status));
EXPECT_EQ(0, WEXITSTATUS(status));
/* Expect one byte on the socket pair from the child's side */
EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
}
/*
 * Ensure non-fatal signals after receive are blocked
 *
 * Once the listener has received the notification, a SIGUSR1 sent to the
 * child must not interrupt the notifying syscall: the child is expected to
 * get the listener's reply (USER_NOTIF_MAGIC) rather than EINTR.
 */
TEST(user_notification_wait_killable)
{
struct sigaction new_action = {
.sa_handler = signal_handler,
};
struct seccomp_notif_resp resp = {};
struct seccomp_notif req = {};
int listener, status, sk_pair[2];
pid_t pid;
long ret;
char c;
/* 100 ms */
struct timespec delay = { .tv_nsec = 100000000 };
ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
ASSERT_EQ(0, ret)
{
TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
}
/* Socket pair used to observe, from the parent, that the handler ran */
ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
listener = user_notif_syscall(
__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
ASSERT_GE(listener, 0);
pid = fork();
ASSERT_GE(pid, 0);
if (pid == 0) {
close(sk_pair[0]);
/*
 * NOTE(review): handled and signal_handler are defined outside this
 * view; presumably the handler writes a byte to this fd - confirm.
 */
handled = sk_pair[1];
/* Setup the sigaction without SA_RESTART */
if (sigaction(SIGUSR1, &new_action, NULL)) {
perror("sigaction");
exit(1);
}
/* Make sure that the syscall is completed (no EINTR) */
ret = syscall(__NR_getppid);
exit(ret != USER_NOTIF_MAGIC);
}
/*
 * Get the notification, to move the notifying process into a
 * non-preemptible (TASK_KILLABLE) state.
 */
EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
/* Send non-fatal kill signal */
EXPECT_EQ(kill(pid, SIGUSR1), 0);
/*
 * Make sure the task moves to TASK_KILLABLE by waiting for
 * D (Disk Sleep) state after receiving non-fatal signal.
 */
while (get_proc_stat(_metadata, pid) != 'D')
nanosleep(&delay, NULL);
resp.id = req.id;
resp.val = USER_NOTIF_MAGIC;
/* Make sure the notification is found and able to be replied to */
EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
/*
 * Make sure that the signal handler does get called once we're back in
 * userspace.
 */
EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
/* wait for process to exit (exit checks for USER_NOTIF_MAGIC) */
EXPECT_EQ(waitpid(pid, &status, 0), pid);
EXPECT_EQ(true, WIFEXITED(status));
EXPECT_EQ(0, WEXITSTATUS(status));
}
/*
 * Ensure fatal signals after receive are not blocked
 *
 * After the listener has received the notification, a SIGTERM (left at its
 * default disposition in the child) must still terminate the notifying
 * process.
 */
TEST(user_notification_wait_killable_fatal)
{
struct seccomp_notif req = {};
int listener, status;
pid_t pid;
long ret;
/* 100 ms */
struct timespec delay = { .tv_nsec = 100000000 };
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
ASSERT_EQ(0, ret)
{
TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
}
listener = user_notif_syscall(
__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
ASSERT_GE(listener, 0);
pid = fork();
ASSERT_GE(pid, 0);
if (pid == 0) {
/* This should never complete as it should get a SIGTERM */
syscall(__NR_getppid);
exit(1);
}
/* Wait until the child is sleeping before receiving its notification */
while (get_proc_stat(_metadata, pid) != 'S')
nanosleep(&delay, NULL);
/*
 * Get the notification, to move the notifying process into a
 * non-preemptible (TASK_KILLABLE) state.
 */
EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
/* Kill the process with a fatal signal */
EXPECT_EQ(kill(pid, SIGTERM), 0);
/*
 * Wait for the process to exit, and make sure the process terminated
 * due to the SIGTERM signal.
 */
EXPECT_EQ(waitpid(pid, &status, 0), pid);
EXPECT_EQ(true, WIFSIGNALED(status));
EXPECT_EQ(SIGTERM, WTERMSIG(status));
}
/*
* TODO:
* - expand NNP testing
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment