Commit 8c1684bb authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-2020-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull thread fix from Christian Brauner:
 "This contains a single fix for all exported legacy fork helpers to
  block accidental access to clone3() features in the upper 32 bits of
  their respective flags arguments.

  I got Cced on a glibc issue where someone reported consistent failures
  for the legacy clone() syscall on ppc64le when sign extension was
  performed (since the clone() syscall in glibc exposes the flags
  argument as an int whereas the kernel uses unsigned long).

  The legacy clone() syscall is odd in a bunch of ways and here two
  things interact:

   - First, legacy clone's flag argument is word-size dependent, i.e.
     it's an unsigned long whereas most system calls with flag arguments
     use int or unsigned int.

   - Second, legacy clone() ignores unknown and deprecated flags.

  The two of them taken together means that users on 64bit systems can
  pass garbage for the upper 32bit of the clone() syscall since forever
  and things would just work fine.

  The following program compiled on a 64bit kernel prior to v5.7-rc1
  will succeed and will fail post v5.7-rc1 with EBADF:

    int main(int argc, char *argv[])
    {
          pid_t pid;

          /* Note that legacy clone() has different argument ordering on
           * different architectures so this won't work everywhere.
           *
           * Only set the upper 32 bits.
           */
          pid = syscall(__NR_clone, 0xffffffff00000000 | SIGCHLD,
                        NULL, NULL, NULL, NULL);
          if (pid < 0)
                  exit(EXIT_FAILURE);
          if (pid == 0)
                  exit(EXIT_SUCCESS);
          if (wait(NULL) != pid)
                  exit(EXIT_FAILURE);

          exit(EXIT_SUCCESS);
    }

  Since legacy clone() couldn't be extended this was not a problem so
  far and nobody really noticed or cared since nothing in the kernel
  ever bothered to look at the upper 32 bits.

  But once we introduced clone3() and expanded the flag argument in
  struct clone_args to 64 bit we opened this can of worms. With the
  first flag-based extension to clone3() making use of the upper 32 bits
  of the flag argument we've effectively made it possible for the legacy
  clone() syscall to reach clone3() only flags on accident. The sign
  extension scenario is just the odd corner-case that we needed to
  figure this out.

  The reason we just realized this now and not already when we
  introduced CLONE_CLEAR_SIGHAND was that CLONE_INTO_CGROUP assumes that
  a valid cgroup file descriptor has been given - whereas
  CLONE_CLEAR_SIGHAND doesn't need to verify anything. It just silently
  resets the signal handlers to SIG_DFL.

  So the sign extension (or the user accidently passing garbage for the
  upper 32 bits) caused the CLONE_INTO_CGROUP bit to be raised and the
  kernel to error out when it didn't find a valid cgroup file
  descriptor.

  Note, I'm also capping kernel_thread()'s flag argument mainly because
  none of the new features make sense for kernel_thread() and we
  shouldn't risk them being accidently activated however unlikely. If we
  wanted to, we could even make kernel_thread() yell when an unknown
  flag has been set which it doesn't do right now. But it's not worth
  risking this in a bugfix imho"

* tag 'for-linus-2020-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  fork: prevent accidental access to clone3 features
parents f44d5c48 3f2c788a
...@@ -2486,11 +2486,11 @@ long do_fork(unsigned long clone_flags, ...@@ -2486,11 +2486,11 @@ long do_fork(unsigned long clone_flags,
int __user *child_tidptr) int __user *child_tidptr)
{ {
struct kernel_clone_args args = { struct kernel_clone_args args = {
.flags = (clone_flags & ~CSIGNAL), .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
.pidfd = parent_tidptr, .pidfd = parent_tidptr,
.child_tid = child_tidptr, .child_tid = child_tidptr,
.parent_tid = parent_tidptr, .parent_tid = parent_tidptr,
.exit_signal = (clone_flags & CSIGNAL), .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
.stack = stack_start, .stack = stack_start,
.stack_size = stack_size, .stack_size = stack_size,
}; };
...@@ -2508,8 +2508,9 @@ long do_fork(unsigned long clone_flags, ...@@ -2508,8 +2508,9 @@ long do_fork(unsigned long clone_flags,
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{ {
struct kernel_clone_args args = { struct kernel_clone_args args = {
.flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .flags = ((lower_32_bits(flags) | CLONE_VM |
.exit_signal = (flags & CSIGNAL), CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.stack = (unsigned long)fn, .stack = (unsigned long)fn,
.stack_size = (unsigned long)arg, .stack_size = (unsigned long)arg,
}; };
...@@ -2570,11 +2571,11 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, ...@@ -2570,11 +2571,11 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
#endif #endif
{ {
struct kernel_clone_args args = { struct kernel_clone_args args = {
.flags = (clone_flags & ~CSIGNAL), .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
.pidfd = parent_tidptr, .pidfd = parent_tidptr,
.child_tid = child_tidptr, .child_tid = child_tidptr,
.parent_tid = parent_tidptr, .parent_tid = parent_tidptr,
.exit_signal = (clone_flags & CSIGNAL), .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
.stack = newsp, .stack = newsp,
.tls = tls, .tls = tls,
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment