Commit 43a21ea8, authored by Peter Zijlstra, committed by Ingo Molnar

perf_counter: Add event overflow handling

Alternative method of mmap() data output handling that provides
better overflow management and a more reliable data stream.

Unlike the previous method, which didn't have any user->kernel
feedback and relied on userspace keeping up, this method relies on
userspace writing its last read position into the control page.

It will ensure new output doesn't overwrite not-yet-read events;
new events for which there is no space left are lost and the
overflow counter is incremented, providing exact event-loss
numbers.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent d3a9262e
...@@ -236,10 +236,16 @@ struct perf_counter_mmap_page { ...@@ -236,10 +236,16 @@ struct perf_counter_mmap_page {
/* /*
* Control data for the mmap() data buffer. * Control data for the mmap() data buffer.
* *
* User-space reading this value should issue an rmb(), on SMP capable * User-space reading the @data_head value should issue an rmb(), on
* platforms, after reading this value -- see perf_counter_wakeup(). * SMP capable platforms, after reading this value -- see
* perf_counter_wakeup().
*
* When the mapping is PROT_WRITE the @data_tail value should be
* written by userspace to reflect the last read data. In this case
* the kernel will not over-write unread data.
*/ */
__u64 data_head; /* head in the data section */ __u64 data_head; /* head in the data section */
__u64 data_tail; /* user-space written tail */
}; };
#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) #define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0)
...@@ -273,6 +279,15 @@ enum perf_event_type { ...@@ -273,6 +279,15 @@ enum perf_event_type {
*/ */
PERF_EVENT_MMAP = 1, PERF_EVENT_MMAP = 1,
/*
* struct {
* struct perf_event_header header;
* u64 id;
* u64 lost;
* };
*/
PERF_EVENT_LOST = 2,
/* /*
* struct { * struct {
* struct perf_event_header header; * struct perf_event_header header;
...@@ -313,26 +328,26 @@ enum perf_event_type { ...@@ -313,26 +328,26 @@ enum perf_event_type {
/* /*
* When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
* will be PERF_RECORD_* * will be PERF_SAMPLE_*
* *
* struct { * struct {
* struct perf_event_header header; * struct perf_event_header header;
* *
* { u64 ip; } && PERF_RECORD_IP * { u64 ip; } && PERF_SAMPLE_IP
* { u32 pid, tid; } && PERF_RECORD_TID * { u32 pid, tid; } && PERF_SAMPLE_TID
* { u64 time; } && PERF_RECORD_TIME * { u64 time; } && PERF_SAMPLE_TIME
* { u64 addr; } && PERF_RECORD_ADDR * { u64 addr; } && PERF_SAMPLE_ADDR
* { u64 config; } && PERF_RECORD_CONFIG * { u64 config; } && PERF_SAMPLE_CONFIG
* { u32 cpu, res; } && PERF_RECORD_CPU * { u32 cpu, res; } && PERF_SAMPLE_CPU
* *
* { u64 nr; * { u64 nr;
* { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP
* *
* { u16 nr, * { u16 nr,
* hv, * hv,
* kernel, * kernel,
* user; * user;
* u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
* }; * };
*/ */
}; };
...@@ -424,6 +439,7 @@ struct file; ...@@ -424,6 +439,7 @@ struct file;
struct perf_mmap_data { struct perf_mmap_data {
struct rcu_head rcu_head; struct rcu_head rcu_head;
int nr_pages; /* nr of data pages */ int nr_pages; /* nr of data pages */
int writable; /* are we writable */
int nr_locked; /* nr pages mlocked */ int nr_locked; /* nr pages mlocked */
atomic_t poll; /* POLL_ for wakeups */ atomic_t poll; /* POLL_ for wakeups */
...@@ -433,8 +449,8 @@ struct perf_mmap_data { ...@@ -433,8 +449,8 @@ struct perf_mmap_data {
atomic_long_t done_head; /* completed head */ atomic_long_t done_head; /* completed head */
atomic_t lock; /* concurrent writes */ atomic_t lock; /* concurrent writes */
atomic_t wakeup; /* needs a wakeup */ atomic_t wakeup; /* needs a wakeup */
atomic_t lost; /* nr records lost */
struct perf_counter_mmap_page *user_page; struct perf_counter_mmap_page *user_page;
void *data_pages[0]; void *data_pages[0];
......
...@@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct perf_mmap_data *data; struct perf_mmap_data *data;
int ret = VM_FAULT_SIGBUS; int ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
if (vmf->pgoff == 0)
ret = 0;
return ret;
}
rcu_read_lock(); rcu_read_lock();
data = rcu_dereference(counter->data); data = rcu_dereference(counter->data);
if (!data) if (!data)
...@@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if ((unsigned)nr > data->nr_pages) if ((unsigned)nr > data->nr_pages)
goto unlock; goto unlock;
if (vmf->flags & FAULT_FLAG_WRITE)
goto unlock;
vmf->page = virt_to_page(data->data_pages[nr]); vmf->page = virt_to_page(data->data_pages[nr]);
} }
get_page(vmf->page); get_page(vmf->page);
vmf->page->mapping = vma->vm_file->f_mapping;
vmf->page->index = vmf->pgoff;
ret = 0; ret = 0;
unlock: unlock:
rcu_read_unlock(); rcu_read_unlock();
...@@ -1862,6 +1875,14 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) ...@@ -1862,6 +1875,14 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
return -ENOMEM; return -ENOMEM;
} }
/*
 * Free one page of the mmap() buffer, given its kernel virtual address.
 *
 * perf_mmap_fault() stores the file's address_space in page->mapping when
 * it hands a buffer page to a user mapping, so the field is reset here
 * before the page is released.
 */
static void perf_mmap_free_page(unsigned long addr)
{
struct page *page = virt_to_page(addr);
/* Undo the ->mapping assignment made in the fault handler. */
page->mapping = NULL;
__free_page(page);
}
static void __perf_mmap_data_free(struct rcu_head *rcu_head) static void __perf_mmap_data_free(struct rcu_head *rcu_head)
{ {
struct perf_mmap_data *data; struct perf_mmap_data *data;
...@@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head) ...@@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head)
data = container_of(rcu_head, struct perf_mmap_data, rcu_head); data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
free_page((unsigned long)data->user_page); perf_mmap_free_page((unsigned long)data->user_page);
for (i = 0; i < data->nr_pages; i++) for (i = 0; i < data->nr_pages; i++)
free_page((unsigned long)data->data_pages[i]); perf_mmap_free_page((unsigned long)data->data_pages[i]);
kfree(data); kfree(data);
} }
...@@ -1911,6 +1933,7 @@ static struct vm_operations_struct perf_mmap_vmops = { ...@@ -1911,6 +1933,7 @@ static struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open, .open = perf_mmap_open,
.close = perf_mmap_close, .close = perf_mmap_close,
.fault = perf_mmap_fault, .fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
}; };
static int perf_mmap(struct file *file, struct vm_area_struct *vma) static int perf_mmap(struct file *file, struct vm_area_struct *vma)
...@@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ...@@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
long user_extra, extra; long user_extra, extra;
int ret = 0; int ret = 0;
if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) if (!(vma->vm_flags & VM_SHARED))
return -EINVAL; return -EINVAL;
vma_size = vma->vm_end - vma->vm_start; vma_size = vma->vm_end - vma->vm_start;
...@@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ...@@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
atomic_long_add(user_extra, &user->locked_vm); atomic_long_add(user_extra, &user->locked_vm);
vma->vm_mm->locked_vm += extra; vma->vm_mm->locked_vm += extra;
counter->data->nr_locked = extra; counter->data->nr_locked = extra;
if (vma->vm_flags & VM_WRITE)
counter->data->writable = 1;
unlock: unlock:
mutex_unlock(&counter->mmap_mutex); mutex_unlock(&counter->mmap_mutex);
vma->vm_flags &= ~VM_MAYWRITE;
vma->vm_flags |= VM_RESERVED; vma->vm_flags |= VM_RESERVED;
vma->vm_ops = &perf_mmap_vmops; vma->vm_ops = &perf_mmap_vmops;
...@@ -2163,11 +2188,38 @@ struct perf_output_handle { ...@@ -2163,11 +2188,38 @@ struct perf_output_handle {
unsigned long head; unsigned long head;
unsigned long offset; unsigned long offset;
int nmi; int nmi;
int overflow; int sample;
int locked; int locked;
unsigned long flags; unsigned long flags;
}; };
/*
 * Decide whether a reservation may be made in the data buffer without
 * running over data that userspace has not consumed yet.
 *
 * @data:   the mmap buffer being written to
 * @offset: value of the head before the reservation
 * @head:   value the head would have after the reservation
 *
 * Returns true when the reservation is allowed, false when it would pass
 * the tail position published by userspace in the control page.
 *
 * When the buffer is not marked writable there is no tail feedback from
 * userspace, so the reservation is always allowed.
 */
static bool perf_output_space(struct perf_mmap_data *data,
unsigned int offset, unsigned int head)
{
unsigned long tail;
unsigned long mask;
if (!data->writable)
return true;
/*
 * Byte size of the data area minus one, used as a wrap mask.
 * NOTE(review): this is only a valid mask if nr_pages is a power of
 * two -- confirm against the mmap() size checks.
 */
mask = (data->nr_pages << PAGE_SHIFT) - 1;
/*
 * Userspace could choose to issue a mb() before updating the tail
 * pointer. So that all reads will be completed before the write is
 * issued.
 */
/* Read the user-written tail exactly once; userspace may change it at any time. */
tail = ACCESS_ONCE(data->user_page->data_tail);
smp_rmb();
/*
 * Re-express both positions as distances from the tail, modulo the
 * buffer size. If the new head ends up at a smaller distance than the
 * old one, the reservation wrapped around past the tail.
 */
offset = (offset - tail) & mask;
head = (head - tail) & mask;
if ((int)(head - offset) < 0)
return false;
return true;
}
static void perf_output_wakeup(struct perf_output_handle *handle) static void perf_output_wakeup(struct perf_output_handle *handle)
{ {
atomic_set(&handle->data->poll, POLL_IN); atomic_set(&handle->data->poll, POLL_IN);
...@@ -2258,12 +2310,57 @@ static void perf_output_unlock(struct perf_output_handle *handle) ...@@ -2258,12 +2310,57 @@ static void perf_output_unlock(struct perf_output_handle *handle)
local_irq_restore(handle->flags); local_irq_restore(handle->flags);
} }
/*
 * Copy @len bytes from @buf into the data pages, starting at
 * handle->offset, and advance handle->offset past the copied bytes.
 *
 * The data area is an array of separate pages, so the copy is done in
 * pieces that never cross a page boundary; the page index wraps around
 * via pages_mask.
 */
static void perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
unsigned int pages_mask;
unsigned int offset;
unsigned int size;
void **pages;
offset = handle->offset;
/* NOTE(review): used as a bit mask, so nr_pages is presumably a power of two. */
pages_mask = handle->data->nr_pages - 1;
pages = handle->data->data_pages;
/* The body runs at least once, so len == 0 still performs a zero-byte memcpy. */
do {
unsigned int page_offset;
int nr;
/* Which page, and where inside that page, the current offset falls. */
nr = (offset >> PAGE_SHIFT) & pages_mask;
page_offset = offset & (PAGE_SIZE - 1);
/* Copy up to the end of this page, or the remaining length if shorter. */
size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
memcpy(pages[nr] + page_offset, buf, size);
len -= size;
buf += size;
offset += size;
} while (len);
handle->offset = offset;
/*
 * Check we didn't copy past our reservation window, taking the
 * possible unsigned int wrap into account.
 */
WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}
/*
 * Copy the object @x into the output buffer through perf_output_copy(),
 * using sizeof(x) as the length. @x has its address taken, so it must be
 * an lvalue.
 */
#define perf_output_put(handle, x) \
perf_output_copy((handle), &(x), sizeof(x))
static int perf_output_begin(struct perf_output_handle *handle, static int perf_output_begin(struct perf_output_handle *handle,
struct perf_counter *counter, unsigned int size, struct perf_counter *counter, unsigned int size,
int nmi, int overflow) int nmi, int sample)
{ {
struct perf_mmap_data *data; struct perf_mmap_data *data;
unsigned int offset, head; unsigned int offset, head;
int have_lost;
struct {
struct perf_event_header header;
u64 id;
u64 lost;
} lost_event;
/* /*
* For inherited counters we send all the output towards the parent. * For inherited counters we send all the output towards the parent.
...@@ -2279,16 +2376,22 @@ static int perf_output_begin(struct perf_output_handle *handle, ...@@ -2279,16 +2376,22 @@ static int perf_output_begin(struct perf_output_handle *handle,
handle->data = data; handle->data = data;
handle->counter = counter; handle->counter = counter;
handle->nmi = nmi; handle->nmi = nmi;
handle->overflow = overflow; handle->sample = sample;
if (!data->nr_pages) if (!data->nr_pages)
goto fail; goto fail;
have_lost = atomic_read(&data->lost);
if (have_lost)
size += sizeof(lost_event);
perf_output_lock(handle); perf_output_lock(handle);
do { do {
offset = head = atomic_long_read(&data->head); offset = head = atomic_long_read(&data->head);
head += size; head += size;
if (unlikely(!perf_output_space(data, offset, head)))
goto fail;
} while (atomic_long_cmpxchg(&data->head, offset, head) != offset); } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
handle->offset = offset; handle->offset = offset;
...@@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle, ...@@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle,
if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
atomic_set(&data->wakeup, 1); atomic_set(&data->wakeup, 1);
if (have_lost) {
lost_event.header.type = PERF_EVENT_LOST;
lost_event.header.misc = 0;
lost_event.header.size = sizeof(lost_event);
lost_event.id = counter->id;
lost_event.lost = atomic_xchg(&data->lost, 0);
perf_output_put(handle, lost_event);
}
return 0; return 0;
fail: fail:
perf_output_wakeup(handle); atomic_inc(&data->lost);
perf_output_unlock(handle);
out: out:
rcu_read_unlock(); rcu_read_unlock();
return -ENOSPC; return -ENOSPC;
} }
static void perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
unsigned int pages_mask;
unsigned int offset;
unsigned int size;
void **pages;
offset = handle->offset;
pages_mask = handle->data->nr_pages - 1;
pages = handle->data->data_pages;
do {
unsigned int page_offset;
int nr;
nr = (offset >> PAGE_SHIFT) & pages_mask;
page_offset = offset & (PAGE_SIZE - 1);
size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
memcpy(pages[nr] + page_offset, buf, size);
len -= size;
buf += size;
offset += size;
} while (len);
handle->offset = offset;
/*
* Check we didn't copy past our reservation window, taking the
* possible unsigned int wrap into account.
*/
WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}
#define perf_output_put(handle, x) \
perf_output_copy((handle), &(x), sizeof(x))
static void perf_output_end(struct perf_output_handle *handle) static void perf_output_end(struct perf_output_handle *handle)
{ {
struct perf_counter *counter = handle->counter; struct perf_counter *counter = handle->counter;
...@@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle) ...@@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle)
int wakeup_events = counter->attr.wakeup_events; int wakeup_events = counter->attr.wakeup_events;
if (handle->overflow && wakeup_events) { if (handle->sample && wakeup_events) {
int events = atomic_inc_return(&data->events); int events = atomic_inc_return(&data->events);
if (events >= wakeup_events) { if (events >= wakeup_events) {
atomic_sub(wakeup_events, &data->events); atomic_sub(wakeup_events, &data->events);
...@@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) ...@@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
} }
/* /*
* Generic counter overflow handling. * Generic counter overflow handling, sampling.
*/ */
int perf_counter_overflow(struct perf_counter *counter, int nmi, int perf_counter_overflow(struct perf_counter *counter, int nmi,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment