Commit 53683e40 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'trace-ringbuffer-v6.10' of...

Merge tag 'trace-ringbuffer-v6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull tracing ring buffer updates from Steven Rostedt:
 "Add ring_buffer memory mappings.

  The tracing ring buffer was created based on being mostly used with
  the splice system call. It is broken up into page ordered sub-buffers
  and the reader swaps a new sub-buffer with an existing sub-buffer
  that's part of the write buffer. It then has total access to the
  swapped out sub-buffer and can do copyless movements of the memory
  into other mediums (file system, network, etc).

  The buffer is great for passing around the ring buffer contents in the
  kernel, but is not so good for when the consumer is the user space
  task itself.

  A new interface is added that allows user space to memory map the ring
  buffer. It will get all the write sub-buffers as well as reader
  sub-buffer (that is not written to). It can send an ioctl to change
  which sub-buffer is the new reader sub-buffer.

  The ring buffer is read only to user space. It only needs to call the
  ioctl when it is finished with a sub-buffer and needs a new sub-buffer
  that the writer will not write over.

  A self test program was also created for testing and can be used as an
  example for the interface to user space. The libtracefs (external to
  the kernel) also has code that interacts with this, although it is
  disabled until the interface is in an official release. It can be
  enabled by compiling the library with a special flag. This was used
  for testing applications that perform better with the buffer being
  mapped.

  Memory mapped buffers have limitations. The main one is that it can
  not be used with the snapshot logic. If the buffer is mapped,
  snapshots will be disabled. If any logic is set to trigger snapshots
  on a buffer, that buffer will not be allowed to be mapped"

* tag 'trace-ringbuffer-v6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  ring-buffer: Add cast to unsigned long addr passed to virt_to_page()
  ring-buffer: Have mmapped ring buffer keep track of missed events
  ring-buffer/selftest: Add ring-buffer mapping test
  Documentation: tracing: Add ring-buffer mapping
  tracing: Allow user-space mapping of the ring-buffer
  ring-buffer: Introducing ring-buffer mapping functions
  ring-buffer: Allocate sub-buffers with __GFP_COMP
parents 594d2815 b9c6820f
...@@ -29,6 +29,7 @@ Linux Tracing Technologies ...@@ -29,6 +29,7 @@ Linux Tracing Technologies
timerlat-tracer timerlat-tracer
intel_th intel_th
ring-buffer-design ring-buffer-design
ring-buffer-map
stm stm
sys-t sys-t
coresight/index coresight/index
......
.. SPDX-License-Identifier: GPL-2.0
==================================
Tracefs ring-buffer memory mapping
==================================
:Author: Vincent Donnefort <vdonnefort@google.com>
Overview
========
Tracefs ring-buffer memory map provides an efficient method to stream data
as no memory copy is necessary. The application mapping the ring-buffer then
becomes a consumer for that ring-buffer, in a similar fashion to trace_pipe.
Memory mapping setup
====================
The mapping works with a mmap() of the trace_pipe_raw interface.
The first system page of the mapping contains ring-buffer statistics and
description. It is referred to as the meta-page. One of the most important
fields of the meta-page is the reader. It contains the sub-buffer ID which can
be safely read by the mapper (see ring-buffer-design.rst).
The meta-page is followed by all the sub-buffers, ordered by ascending ID. It is
therefore effortless to know where the reader starts in the mapping:
.. code-block:: c
reader_id = meta->reader.id;
reader_offset = meta->meta_page_size + reader_id * meta->subbuf_size;
When the application is done with the current reader, it can get a new one using
the trace_pipe_raw ioctl() TRACE_MMAP_IOCTL_GET_READER. This ioctl also updates
the meta-page fields.
Limitations
===========
When a mapping is in place on a Tracefs ring-buffer, it is not possible to
resize it (either by increasing the entire size of the ring-buffer or the size
of each subbuf). It is also not possible to use snapshot, and splice will copy
the ring buffer data instead of using the copyless swap from the ring buffer.
Concurrent readers (either another application mapping that ring-buffer or the
kernel with trace_pipe) are allowed but not recommended. They will compete for
the ring-buffer and the output is unpredictable, just like concurrent readers on
trace_pipe would be.
Example
=======
.. code-block:: c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/trace_mmap.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#define TRACE_PIPE_RAW "/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw"
/*
 * Map the meta-page and the sub-buffers of the CPU 0 ring-buffer, ask for a
 * reader sub-buffer and print its address within the mapping.
 *
 * Exits with EXIT_FAILURE as soon as any step fails; returns 0 otherwise.
 */
int main(void)
{
int page_size = getpagesize(), fd, reader_id;
unsigned long meta_len, data_len;
struct trace_buffer_meta *meta;
void *map, *reader, *data;
fd = open(TRACE_PIPE_RAW, O_RDONLY | O_NONBLOCK);
if (fd < 0)
exit(EXIT_FAILURE);
/* The first system page of the mapping is the meta-page. */
map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
if (map == MAP_FAILED)
exit(EXIT_FAILURE);
meta = (struct trace_buffer_meta *)map;
meta_len = meta->meta_page_size;
printf("entries: %llu\n", meta->entries);
printf("overrun: %llu\n", meta->overrun);
printf("read: %llu\n", meta->read);
printf("nr_subbufs: %u\n", meta->nr_subbufs);
/* The sub-buffers follow the meta-page, hence the meta_len file offset. */
data_len = meta->subbuf_size * meta->nr_subbufs;
data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, fd, meta_len);
if (data == MAP_FAILED)
exit(EXIT_FAILURE);
/* Get a reader sub-buffer; this ioctl also updates the meta-page fields. */
if (ioctl(fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
exit(EXIT_FAILURE);
/* Sub-buffers are laid out by ascending ID, so the ID gives the offset. */
reader_id = meta->reader.id;
reader = data + meta->subbuf_size * reader_id;
printf("Current reader address: %p\n", reader);
munmap(data, data_len);
munmap(meta, meta_len);
close (fd);
return 0;
}
...@@ -6,6 +6,8 @@ ...@@ -6,6 +6,8 @@
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/poll.h> #include <linux/poll.h>
#include <uapi/linux/trace_mmap.h>
struct trace_buffer; struct trace_buffer;
struct ring_buffer_iter; struct ring_buffer_iter;
...@@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); ...@@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
#define trace_rb_cpu_prepare NULL #define trace_rb_cpu_prepare NULL
#endif #endif
int ring_buffer_map(struct trace_buffer *buffer, int cpu,
struct vm_area_struct *vma);
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
#endif /* _LINUX_RING_BUFFER_H */ #endif /* _LINUX_RING_BUFFER_H */
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _TRACE_MMAP_H_
#define _TRACE_MMAP_H_
#include <linux/types.h>
/**
 * struct trace_buffer_meta - Ring-buffer Meta-page description
 * @meta_page_size:	Size of this meta-page.
 * @meta_struct_len:	Size of this structure.
 * @subbuf_size:	Size of each sub-buffer.
 * @nr_subbufs:		Number of subbufs in the ring-buffer, including the reader.
 * @reader.lost_events:	Number of events lost at the time of the reader swap.
 * @reader.id:		subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1]
 * @reader.read:	Number of bytes read on the reader subbuf.
 * @flags:		Placeholder for now, 0 until new features are supported.
 * @entries:		Number of entries in the ring-buffer.
 * @overrun:		Number of entries lost in the ring-buffer.
 * @read:		Number of entries that have been read.
 * @Reserved1:		Internal use only.
 * @Reserved2:		Internal use only.
 */
struct trace_buffer_meta {
__u32 meta_page_size;
__u32 meta_struct_len;
__u32 subbuf_size;
__u32 nr_subbufs;
struct {
__u64 lost_events;
__u32 id;
__u32 read;
} reader;
__u64 flags;
__u64 entries;
__u64 overrun;
__u64 read;
__u64 Reserved1;
__u64 Reserved2;
};
#define TRACE_MMAP_IOCTL_GET_READER _IO('T', 0x1)
#endif /* _TRACE_MMAP_H_ */
This diff is collapsed.
...@@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, ...@@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
return; return;
} }
if (tr->mapped) {
trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
local_irq_save(flags); local_irq_save(flags);
update_max_tr(tr, current, smp_processor_id(), cond_data); update_max_tr(tr, current, smp_processor_id(), cond_data);
local_irq_restore(flags); local_irq_restore(flags);
...@@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) ...@@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr)
lockdep_assert_held(&trace_types_lock); lockdep_assert_held(&trace_types_lock);
spin_lock(&tr->snapshot_trigger_lock); spin_lock(&tr->snapshot_trigger_lock);
if (tr->snapshot == UINT_MAX) { if (tr->snapshot == UINT_MAX || tr->mapped) {
spin_unlock(&tr->snapshot_trigger_lock); spin_unlock(&tr->snapshot_trigger_lock);
return -EBUSY; return -EBUSY;
} }
...@@ -8194,15 +8200,32 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ...@@ -8194,15 +8200,32 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return ret; return ret;
} }
/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{ {
struct ftrace_buffer_info *info = file->private_data; struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter; struct trace_iterator *iter = &info->iter;
int err;
if (cmd) if (cmd == TRACE_MMAP_IOCTL_GET_READER) {
return -ENOIOCTLCMD; if (!(file->f_flags & O_NONBLOCK)) {
err = ring_buffer_wait(iter->array_buffer->buffer,
iter->cpu_file,
iter->tr->buffer_percent,
NULL, NULL);
if (err)
return err;
}
return ring_buffer_map_get_reader(iter->array_buffer->buffer,
iter->cpu_file);
} else if (cmd) {
return -ENOTTY;
}
/*
* An ioctl call with cmd 0 to the ring buffer file will wake up all
* waiters
*/
mutex_lock(&trace_types_lock); mutex_lock(&trace_types_lock);
/* Make sure the waiters see the new wait_index */ /* Make sure the waiters see the new wait_index */
...@@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned ...@@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0; return 0;
} }
#ifdef CONFIG_TRACER_MAX_TRACE
/*
 * Account for one more user-space mapping of @tr's ring-buffer.
 *
 * Mapping and snapshot users exclude each other: the request is refused with
 * -EBUSY while tr->snapshot is non-zero or when the mapping counter cannot be
 * incremented any further. Returns 0 on success.
 */
static int get_snapshot_map(struct trace_array *tr)
{
	int err = 0;

	/*
	 * Called with mmap_lock held. lockdep would be unhappy if we would now
	 * take trace_types_lock. Instead use the specific
	 * snapshot_trigger_lock.
	 */
	spin_lock(&tr->snapshot_trigger_lock);
	if (!tr->snapshot && tr->mapped != UINT_MAX)
		tr->mapped++;
	else
		err = -EBUSY;
	spin_unlock(&tr->snapshot_trigger_lock);

	/*
	 * Wait for update_max_tr() to observe iter->tr->mapped.
	 *
	 * NOTE(review): tr->mapped is sampled here after the lock has been
	 * dropped -- confirm a concurrent mapper cannot make this check miss
	 * the 0 -> 1 transition.
	 */
	if (tr->mapped == 1)
		synchronize_rcu();

	return err;
}

/*
 * Drop one mapping reference taken by get_snapshot_map(). Warns, and leaves
 * the counter untouched, if there is no reference to drop.
 */
static void put_snapshot_map(struct trace_array *tr)
{
	spin_lock(&tr->snapshot_trigger_lock);
	if (WARN_ON(!tr->mapped))
		goto unlock;
	tr->mapped--;
unlock:
	spin_unlock(&tr->snapshot_trigger_lock);
}
#else
/* Without CONFIG_TRACER_MAX_TRACE there is nothing to account for. */
static inline int get_snapshot_map(struct trace_array *tr)
{
	return 0;
}

static inline void put_snapshot_map(struct trace_array *tr)
{
}
#endif
/*
 * vm_operations_struct::close handler for a mapped trace_pipe_raw file:
 * unmap the per-CPU ring-buffer (warning if that fails) and drop the mapping
 * reference held on the trace array.
 */
static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
{
	struct ftrace_buffer_info *info = vma->vm_file->private_data;
	struct trace_iterator *iter = &info->iter;
	int err;

	err = ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file);
	WARN_ON(err);

	put_snapshot_map(iter->tr);
}
/* VMA callbacks for a mapped ring-buffer: only a close handler is provided. */
static const struct vm_operations_struct tracing_buffers_vmops = {
.close = tracing_buffers_mmap_close,
};
/*
 * mmap() handler for trace_pipe_raw.
 *
 * Takes a mapping reference on the trace array (refused with -EBUSY while
 * snapshots are in use), then maps the per-CPU ring-buffer into @vma.
 *
 * Returns 0 on success or a negative error code. On failure the mapping
 * reference is dropped again and @vma->vm_ops is left untouched, so the
 * close handler can never run for a mapping that was not established (it
 * would otherwise unmap and drop a reference a second time).
 */
static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct ftrace_buffer_info *info = filp->private_data;
	struct trace_iterator *iter = &info->iter;
	int ret;

	ret = get_snapshot_map(iter->tr);
	if (ret)
		return ret;

	ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma);
	if (ret) {
		put_snapshot_map(iter->tr);
		return ret;
	}

	/* Only a successfully established mapping gets the close handler. */
	vma->vm_ops = &tracing_buffers_vmops;

	return 0;
}
static const struct file_operations tracing_buffers_fops = { static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open, .open = tracing_buffers_open,
.read = tracing_buffers_read, .read = tracing_buffers_read,
...@@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = { ...@@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = {
.splice_read = tracing_buffers_splice_read, .splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl, .unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek, .llseek = no_llseek,
.mmap = tracing_buffers_mmap,
}; };
static ssize_t static ssize_t
......
...@@ -336,6 +336,7 @@ struct trace_array { ...@@ -336,6 +336,7 @@ struct trace_array {
bool allocated_snapshot; bool allocated_snapshot;
spinlock_t snapshot_trigger_lock; spinlock_t snapshot_trigger_lock;
unsigned int snapshot; unsigned int snapshot;
unsigned int mapped;
unsigned long max_latency; unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY #ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency; struct dentry *d_max_latency;
......
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -Wl,-no-as-needed -Wall
CFLAGS += $(KHDR_INCLUDES)
CFLAGS += -D_GNU_SOURCE
TEST_GEN_PROGS = map_test
include ../lib.mk
CONFIG_FTRACE=y
CONFIG_TRACER_SNAPSHOT=y
// SPDX-License-Identifier: GPL-2.0
/*
* Ring-buffer memory mapping tests
*
* Copyright (c) 2024 Vincent Donnefort <vdonnefort@google.com>
*/
#include <fcntl.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/trace_mmap.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include "../user_events/user_events_selftests.h" /* share tracefs setup */
#include "../kselftest_harness.h"
#define TRACEFS_ROOT "/sys/kernel/tracing"
/*
 * Write the string @value to the file at @path (opened write-only and
 * truncated).
 *
 * Returns 0 on success, or -errno of the failing open() or write().
 */
static int __tracefs_write(const char *path, const char *value)
{
	int fd, err = 0;

	fd = open(path, O_WRONLY | O_TRUNC);
	if (fd < 0)
		return -errno;

	/* Capture errno right away: close() below is allowed to change it. */
	if (write(fd, value, strlen(value)) == -1)
		err = -errno;

	close(fd);

	return err;
}
/*
 * Write @value, formatted as a decimal string, to the file at @path.
 *
 * Returns -1 if the string cannot be formatted, otherwise whatever
 * __tracefs_write() returns.
 */
static int __tracefs_write_int(const char *path, int value)
{
	char *text;
	int status;

	if (asprintf(&text, "%d", value) < 0)
		return -1;

	status = __tracefs_write(path, text);
	free(text);

	return status;
}
#define tracefs_write_int(path, value) \
ASSERT_EQ(__tracefs_write_int((path), (value)), 0)
#define tracefs_write(path, value) \
ASSERT_EQ(__tracefs_write((path), (value)), 0)
/*
 * Bring tracefs back to a quiet state: tracing off, trace buffer cleared,
 * no event enabled and the "nop" tracer selected.
 *
 * The writes are issued in that order and stop at the first failure.
 * Returns 0 when all of them succeed, -1 otherwise.
 */
static int tracefs_reset(void)
{
	int failed;

	failed = __tracefs_write_int(TRACEFS_ROOT"/tracing_on", 0) ||
		 __tracefs_write(TRACEFS_ROOT"/trace", "") ||
		 __tracefs_write(TRACEFS_ROOT"/set_event", "") ||
		 __tracefs_write(TRACEFS_ROOT"/current_tracer", "nop");

	return failed ? -1 : 0;
}
/* State of one per-CPU ring-buffer mapping, filled in by tracefs_cpu_map(). */
struct tracefs_cpu_map_desc {
/* Mapped meta-page of the ring-buffer */
struct trace_buffer_meta *meta;
/* File descriptor of the CPU's trace_pipe_raw file */
int cpu_fd;
};
/*
 * Open the trace_pipe_raw file of @cpu and map its meta-page (one system
 * page, read-only, shared).
 *
 * On success @desc->cpu_fd and @desc->meta are set and 0 is returned; release
 * them with tracefs_cpu_unmap().
 *
 * On failure nothing is left open: returns -ENOMEM if the path cannot be
 * built, -ENODEV if the file cannot be opened, or -errno of the failed
 * mmap() (in which case the file descriptor is closed again and
 * @desc->cpu_fd is set to -1).
 */
int tracefs_cpu_map(struct tracefs_cpu_map_desc *desc, int cpu)
{
	int page_size = getpagesize();
	char *cpu_path;
	void *map;
	int err;

	if (asprintf(&cpu_path,
		     TRACEFS_ROOT"/per_cpu/cpu%d/trace_pipe_raw",
		     cpu) < 0)
		return -ENOMEM;

	desc->cpu_fd = open(cpu_path, O_RDONLY | O_NONBLOCK);
	free(cpu_path);
	if (desc->cpu_fd < 0)
		return -ENODEV;

	map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, desc->cpu_fd, 0);
	if (map == MAP_FAILED) {
		/* Save errno before close() gets a chance to overwrite it. */
		err = -errno;
		close(desc->cpu_fd);
		desc->cpu_fd = -1;
		return err;
	}

	desc->meta = (struct trace_buffer_meta *)map;

	return 0;
}
void tracefs_cpu_unmap(struct tracefs_cpu_map_desc *desc)
{
munmap(desc->meta, desc->meta->meta_page_size);
close(desc->cpu_fd);
}
/* Per-test state of the "map" fixture. */
FIXTURE(map) {
/* Mapping of the ring-buffer under test */
struct tracefs_cpu_map_desc map_desc;
/* Value reported by tracefs_enabled(); teardown unmounts tracefs when set */
bool umount;
};
/* Each "map" test runs once per variant below. */
FIXTURE_VARIANT(map) {
/* Value written to buffer_subbuf_size_kb during setup */
int subbuf_size;
};
FIXTURE_VARIANT_ADD(map, subbuf_size_4k) {
.subbuf_size = 4,
};
FIXTURE_VARIANT_ADD(map, subbuf_size_8k) {
.subbuf_size = 8,
};
/*
 * Prepare a "map" test: requires root and a usable tracefs (the test is
 * skipped otherwise), resets tracing, applies the variant's sub-buffer size,
 * maps the ring-buffer of the current CPU and pins the task to that CPU.
 */
FIXTURE_SETUP(map)
{
int cpu = sched_getcpu();
cpu_set_t cpu_mask;
bool fail, umount;
char *message;
if (getuid() != 0)
SKIP(return, "Skipping: %s", "Please run the test as root");
/* A tracefs setup that reports "fail" is a test failure, anything else a skip. */
if (!tracefs_enabled(&message, &fail, &umount)) {
if (fail) {
TH_LOG("Tracefs setup failed: %s", message);
ASSERT_FALSE(fail);
}
SKIP(return, "Skipping: %s", message);
}
self->umount = umount;
ASSERT_GE(cpu, 0);
ASSERT_EQ(tracefs_reset(), 0);
/* The sub-buffer size is set before the ring-buffer gets mapped. */
tracefs_write_int(TRACEFS_ROOT"/buffer_subbuf_size_kb", variant->subbuf_size);
ASSERT_EQ(tracefs_cpu_map(&self->map_desc, cpu), 0);
/*
 * Ensure generated events will be found on this very same ring-buffer.
 */
CPU_ZERO(&cpu_mask);
CPU_SET(cpu, &cpu_mask);
ASSERT_EQ(sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask), 0);
}
/*
 * Tear down a "map" test: reset tracing, release the ring-buffer mapping and
 * its file descriptor, and finally unmount tracefs if setup asked for it.
 */
FIXTURE_TEARDOWN(map)
{
	tracefs_reset();

	/*
	 * Release the mapping and the trace_pipe_raw descriptor before
	 * unmounting: presumably tracefs cannot be unmounted while one of its
	 * files is still open and mapped -- TODO confirm against
	 * tracefs_unmount().
	 */
	tracefs_cpu_unmap(&self->map_desc);

	if (self->umount)
		tracefs_unmount();
}
/*
 * Check the meta-page counters: everything is zero on a freshly reset
 * buffer, and after 16 trace_marker writes the reader swap reports 16
 * entries, all read, on sub-buffer 1. The swap is requested twice and must
 * report the same values both times.
 */
TEST_F(map, meta_page_check)
{
	struct tracefs_cpu_map_desc *desc = &self->map_desc;
	int marker, pass;

	ASSERT_EQ(desc->meta->entries, 0);
	ASSERT_EQ(desc->meta->overrun, 0);
	ASSERT_EQ(desc->meta->read, 0);

	ASSERT_EQ(desc->meta->reader.id, 0);
	ASSERT_EQ(desc->meta->reader.read, 0);

	/* Nothing was written yet: the reader must stay where it is. */
	ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0);
	ASSERT_EQ(desc->meta->reader.id, 0);

	tracefs_write_int(TRACEFS_ROOT"/tracing_on", 1);
	for (marker = 0; marker < 16; marker++)
		tracefs_write_int(TRACEFS_ROOT"/trace_marker", marker);

	for (pass = 0; pass < 2; pass++) {
		ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0);

		ASSERT_EQ(desc->meta->entries, 16);
		ASSERT_EQ(desc->meta->overrun, 0);
		ASSERT_EQ(desc->meta->read, 16);

		ASSERT_EQ(desc->meta->reader.id, 1);
	}
}
/*
 * Check which sub-buffer ranges can be mapped: all of them and all but the
 * last one must succeed, a range reaching one sub-buffer past the end must
 * fail.
 */
TEST_F(map, data_mmap)
{
	struct tracefs_cpu_map_desc *desc = &self->map_desc;
	unsigned long offset, length;
	void *subbufs;

	/* The sub-buffers start right after the meta-page. */
	offset = desc->meta->meta_page_size;
	length = desc->meta->subbuf_size * desc->meta->nr_subbufs;

	/* Map all the available subbufs */
	subbufs = mmap(NULL, length, PROT_READ, MAP_SHARED,
		       desc->cpu_fd, offset);
	ASSERT_NE(subbufs, MAP_FAILED);
	munmap(subbufs, length);

	/* Map all the available subbufs - 1 */
	length -= desc->meta->subbuf_size;
	subbufs = mmap(NULL, length, PROT_READ, MAP_SHARED,
		       desc->cpu_fd, offset);
	ASSERT_NE(subbufs, MAP_FAILED);
	munmap(subbufs, length);

	/* Overflow the available subbufs by 1 */
	offset += desc->meta->subbuf_size * 2;
	subbufs = mmap(NULL, length, PROT_READ, MAP_SHARED,
		       desc->cpu_fd, offset);
	ASSERT_EQ(subbufs, MAP_FAILED);
}
/* Per-test state of the "snapshot" fixture. */
FIXTURE(snapshot) {
/* Value reported by tracefs_enabled(); teardown unmounts tracefs when set */
bool umount;
};
/*
 * Prepare a "snapshot" test: requires root, an existing snapshot file and a
 * usable tracefs; the test is skipped when any of them is missing.
 */
FIXTURE_SETUP(snapshot)
{
bool fail, umount;
struct stat sb;
char *message;
if (getuid() != 0)
SKIP(return, "Skipping: %s", "Please run the test as root");
/*
 * NOTE(review): this check runs before tracefs_enabled(); if that helper is
 * what makes tracefs available, the snapshot file cannot be found yet and
 * the test is skipped -- confirm the intended order.
 */
if (stat(TRACEFS_ROOT"/snapshot", &sb))
SKIP(return, "Skipping: %s", "snapshot not available");
/* A tracefs setup that reports "fail" is a test failure, anything else a skip. */
if (!tracefs_enabled(&message, &fail, &umount)) {
if (fail) {
TH_LOG("Tracefs setup failed: %s", message);
ASSERT_FALSE(fail);
}
SKIP(return, "Skipping: %s", message);
}
self->umount = umount;
}
FIXTURE_TEARDOWN(snapshot)
{
__tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger",
"!snapshot");
tracefs_reset();
if (self->umount)
tracefs_unmount();
}
TEST_F(snapshot, excludes_map)
{
struct tracefs_cpu_map_desc map_desc;
int cpu = sched_getcpu();
ASSERT_GE(cpu, 0);
tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger",
"snapshot");
ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), -EBUSY);
}
/*
 * With the ring-buffer mapped, arming a snapshot trigger and taking a
 * snapshot must both be refused with -EBUSY.
 */
TEST_F(snapshot, excluded_by_map)
{
	struct tracefs_cpu_map_desc map_desc;
	int cpu = sched_getcpu();

	/* Same sanity check as in excludes_map: a negative CPU is not usable. */
	ASSERT_GE(cpu, 0);
	ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), 0);

	ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger",
				  "snapshot"), -EBUSY);
	ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/snapshot",
				  "1"), -EBUSY);

	/* Release the mapping and the descriptor taken above. */
	tracefs_cpu_unmap(&map_desc);
}
TEST_HARNESS_MAIN
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment