Commit 75a1a607, authored by Daniel Borkmann, committed by Alexei Starovoitov

uaccess: Add strict non-pagefault kernel-space read function

Add two new helpers, probe_kernel_read_strict() and
strncpy_from_unsafe_strict(), which by default alias to __probe_kernel_read()
and __strncpy_from_unsafe(), respectively, but can be overridden by archs
which have non-overlapping address ranges for kernel space and user
space in order to bail out with -EFAULT when attempting to probe user
memory including non-canonical user access addresses [0]:

  4-level page tables:
    user-space mem: 0x0000000000000000 - 0x00007fffffffffff
    non-canonical:  0x0000800000000000 - 0xffff7fffffffffff

  5-level page tables:
    user-space mem: 0x0000000000000000 - 0x00ffffffffffffff
    non-canonical:  0x0100000000000000 - 0xfeffffffffffffff

The idea is that these helpers are complementary to the probe_user_read()
and strncpy_from_unsafe_user() which probe user-only memory. Both added
helpers here do the same, but for kernel-only addresses.

Both sets of helpers are going to be used for BPF tracing. They also
explicitly avoid throwing the splat for non-canonical user addresses from
00c42373 ("x86-64: add warning for non-canonical user access address
dereferences").

For compat, the current probe_kernel_read() and strncpy_from_unsafe() are
left as-is.

  [0] Documentation/x86/x86_64/mm.txt
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: x86@kernel.org
Link: https://lore.kernel.org/bpf/eefeefd769aa5a013531f491a71f0936779e916b.1572649915.git.daniel@iogearbox.net
parent 1d1585ca
...@@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg ...@@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
# Make sure __phys_addr has no stackprotector # Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector) nostackp := $(call cc-option, -fno-stack-protector)
......
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/uaccess.h>
#include <linux/kernel.h>
#ifdef CONFIG_X86_64
/*
 * Return @vaddr with bit (@vaddr_bits - 1) replicated into every higher bit,
 * i.e. the sign-extended form of the low @vaddr_bits bits of the address.
 *
 * The left shift is performed on the unsigned value: left-shifting a signed
 * integer that is negative (any address with bit 63 set) is undefined
 * behaviour in ISO C. Only the right shift is done on the signed type, where
 * it is relied upon to propagate the sign bit.
 *
 * @vaddr_bits must be in the range 1..64, otherwise the shift count is out
 * of range for a 64-bit operand.
 */
static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits)
{
	const unsigned int shift = 64 - vaddr_bits;

	return (u64)((s64)(vaddr << shift) >> shift);
}
/*
 * Tell whether @vaddr lies in a range the strict probe helpers must refuse.
 *
 * Two ranges are rejected:
 *  - everything below TASK_SIZE_MAX + PAGE_SIZE, which covers the highest
 *    possible canonical userspace address plus the userspace guard page;
 *  - every address that is not in canonical form for the number of virtual
 *    address bits reported in boot_cpu_data.x86_virt_bits.
 *
 * Returns true when the address must be rejected, false otherwise.
 */
static __always_inline bool invalid_probe_range(u64 vaddr)
{
	/* Userspace range, guard page included. */
	if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
		return true;

	/* Non-canonical: sign-extending the address changes its value. */
	return vaddr != canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
#else
/*
 * Variant used when CONFIG_X86_64 is not set: an address is refused by the
 * strict probe helpers only when it lies below TASK_SIZE_MAX.
 *
 * Returns true when the address must be rejected, false otherwise.
 */
static __always_inline bool invalid_probe_range(u64 vaddr)
{
	if (vaddr < TASK_SIZE_MAX)
		return true;

	return false;
}
#endif
/*
 * Strict variant of the kernel-space probe read.
 *
 * @dst:  destination buffer, handed unchanged to __probe_kernel_read()
 * @src:  address to read from; only this start address is range-checked
 * @size: number of bytes, handed unchanged to __probe_kernel_read()
 *
 * Returns -EFAULT without touching memory when invalid_probe_range()
 * rejects @src; otherwise returns the result of __probe_kernel_read().
 */
long probe_kernel_read_strict(void *dst, const void *src, size_t size)
{
	const bool refused = invalid_probe_range((unsigned long)src);

	if (unlikely(refused))
		return -EFAULT;

	return __probe_kernel_read(dst, src, size);
}
/*
 * Strict variant of the unsafe-address string copy.
 *
 * @dst:         destination buffer, handed unchanged to
 *               __strncpy_from_unsafe()
 * @unsafe_addr: address to copy from; only this start address is
 *               range-checked
 * @count:       handed unchanged to __strncpy_from_unsafe()
 *
 * Returns -EFAULT without touching memory when invalid_probe_range()
 * rejects @unsafe_addr; otherwise returns the result of
 * __strncpy_from_unsafe().
 */
long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count)
{
	const unsigned long start = (unsigned long)unsafe_addr;

	return unlikely(invalid_probe_range(start)) ?
		-EFAULT : __strncpy_from_unsafe(dst, unsafe_addr, count);
}
...@@ -311,6 +311,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src, ...@@ -311,6 +311,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
* happens, handle that and return -EFAULT. * happens, handle that and return -EFAULT.
*/ */
extern long probe_kernel_read(void *dst, const void *src, size_t size); extern long probe_kernel_read(void *dst, const void *src, size_t size);
extern long probe_kernel_read_strict(void *dst, const void *src, size_t size);
extern long __probe_kernel_read(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size);
/* /*
...@@ -350,6 +351,9 @@ extern long notrace probe_user_write(void __user *dst, const void *src, size_t s ...@@ -350,6 +351,9 @@ extern long notrace probe_user_write(void __user *dst, const void *src, size_t s
extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size); extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size);
extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
extern long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
long count);
extern long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
long count); long count);
extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count);
......
...@@ -43,11 +43,20 @@ probe_write_common(void __user *dst, const void *src, size_t size) ...@@ -43,11 +43,20 @@ probe_write_common(void __user *dst, const void *src, size_t size)
* do_page_fault() doesn't attempt to take mmap_sem. This makes * do_page_fault() doesn't attempt to take mmap_sem. This makes
* probe_kernel_read() suitable for use within regions where the caller * probe_kernel_read() suitable for use within regions where the caller
* already holds mmap_sem, or other locks which nest inside mmap_sem. * already holds mmap_sem, or other locks which nest inside mmap_sem.
*
* probe_kernel_read_strict() is the same as probe_kernel_read() except for
* the case where architectures have non-overlapping user and kernel address
* ranges: probe_kernel_read_strict() will additionally return -EFAULT for
* probing memory on a user address range where probe_user_read() is supposed
* to be used instead.
*/ */
long __weak probe_kernel_read(void *dst, const void *src, size_t size) long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read"))); __attribute__((alias("__probe_kernel_read")));
long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));
long __probe_kernel_read(void *dst, const void *src, size_t size) long __probe_kernel_read(void *dst, const void *src, size_t size)
{ {
long ret; long ret;
...@@ -157,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_user_write); ...@@ -157,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_user_write);
* *
* If @count is smaller than the length of the string, copies @count-1 bytes, * If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count. * sets the last byte of @dst buffer to NUL and returns @count.
*
* strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
* for the case where architectures have non-overlapping user and kernel address
* ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
* probing memory on a user address range where strncpy_from_unsafe_user() is
* supposed to be used instead.
*/ */
long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
__attribute__((alias("__strncpy_from_unsafe")));
long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
long count)
__attribute__((alias("__strncpy_from_unsafe")));
long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
{ {
mm_segment_t old_fs = get_fs(); mm_segment_t old_fs = get_fs();
const void *src = unsafe_addr; const void *src = unsafe_addr;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment