Commit 814dd948 authored by Linus Torvalds

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
 "x86 PMU driver fixes plus a core code race fix"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86/intel: Fix incorrect lbr_sel_mask value
  perf/x86/intel/pt: Don't die on VMXON
  perf/core: Fix perf_event_open() vs. execve() race
  perf/x86/amd: Set the size of event map array to PERF_COUNT_HW_MAX
  perf/core: Make sysctl_perf_cpu_time_max_percent conform to documentation
  perf/x86/intel/rapl: Add missing Haswell model
  perf/x86/intel: Add model number for Skylake Server to perf
parents 2113caed cf3beb7c
@@ -115,7 +115,7 @@ static __initconst const u64 amd_hw_cache_event_ids
 /*
  * AMD Performance Monitor K7 and later.
  */
-static const u64 amd_perfmon_event_map[] =
+static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
 {
 	[PERF_COUNT_HW_CPU_CYCLES]	= 0x0076,
 	[PERF_COUNT_HW_INSTRUCTIONS]	= 0x00c0,
...
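Why the explicit size matters: the map is indexed with generic event ids up to PERF_COUNT_HW_MAX, but a designated-initializer array with no declared size only extends to the last initialized entry, so the higher ids read past the end. A stand-alone sketch of the difference, using made-up event ids rather than the kernel's table:

    /* Illustrative user-space sketch, not kernel code: an initializer
     * list with no declared size stops at the highest initialized
     * index, so indexing with a larger id is out of bounds; declaring
     * the array [MAX] makes the unlisted entries well-defined zeroes. */
    #include <stdio.h>

    enum { EV_CYCLES, EV_INSTRUCTIONS, EV_REF_CYCLES, EV_MAX };  /* hypothetical ids */

    static const unsigned long long map_short[] = {
            [EV_CYCLES]       = 0x0076,
            [EV_INSTRUCTIONS] = 0x00c0,
    };      /* 2 entries: map_short[EV_REF_CYCLES] is out of bounds */

    static const unsigned long long map_full[EV_MAX] = {
            [EV_CYCLES]       = 0x0076,
            [EV_INSTRUCTIONS] = 0x00c0,
    };      /* EV_MAX entries: map_full[EV_REF_CYCLES] reads as 0 */

    int main(void)
    {
            printf("%zu vs %zu entries\n",
                   sizeof(map_short) / sizeof(map_short[0]),
                   sizeof(map_full) / sizeof(map_full[0]));
            return 0;
    }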
@@ -3639,6 +3639,7 @@ __init int intel_pmu_init(void)
 	case 78: /* 14nm Skylake Mobile */
 	case 94: /* 14nm Skylake Desktop */
+	case 85: /* 14nm Skylake Server */
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
...
@@ -63,7 +63,7 @@ static enum {
 #define LBR_PLM		(LBR_KERNEL | LBR_USER)
-#define LBR_SEL_MASK	0x1ff	/* valid bits in LBR_SELECT */
+#define LBR_SEL_MASK	0x3ff	/* valid bits in LBR_SELECT */
 #define LBR_NOT_SUPP	-1	/* LBR filter not supported */
 #define LBR_IGN		0	/* ignored */
@@ -610,8 +610,10 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
 	 * in suppress mode. So LBR_SELECT should be set to
 	 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+	 * But the 10th bit LBR_CALL_STACK does not operate
+	 * in suppress mode.
 	 */
-	reg->config = mask ^ x86_pmu.lbr_sel_mask;
+	reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK);
 	if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
 	    (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
...
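The XOR above is easy to misread, so here is a minimal sketch of the new computation outside the driver; LBR_CALL_STACK is assumed to be bit 9, consistent with the mask growing from 0x1ff to 0x3ff, and the filter mask passed in is whatever intel_pmu_setup_hw_lbr_filter() built from the branch sample type:

    /* Sketch of the new MSR_LBR_SELECT value computation.  Bits 0-8
     * suppress the branch types they name, so a filter that wants a
     * type must leave its bit clear: invert those bits.  Bit 9
     * (call-stack mode) is a plain enable bit and is passed through
     * unchanged, hence it is excluded from the XOR. */
    #define LBR_SEL_MASK    0x3ffULL        /* all valid LBR_SELECT bits */
    #define LBR_CALL_STACK  (1ULL << 9)     /* assumed bit position */

    static unsigned long long lbr_select_value(unsigned long long mask)
    {
            return mask ^ (LBR_SEL_MASK & ~LBR_CALL_STACK);
    }

XORing with only the low nine bits flips the suppress bits while leaving the call-stack enable bit exactly as the filter requested it.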
@@ -136,9 +136,21 @@ static int __init pt_pmu_hw_init(void)
 	struct dev_ext_attribute *de_attrs;
 	struct attribute **attrs;
 	size_t size;
+	u64 reg;
 	int ret;
 	long i;
+	if (boot_cpu_has(X86_FEATURE_VMX)) {
+		/*
+		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
+		 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
+		 * post-VMXON.
+		 */
+		rdmsrl(MSR_IA32_VMX_MISC, reg);
+		if (reg & BIT(14))
+			pt_pmu.vmx = true;
+	}
 	attrs = NULL;
 	for (i = 0; i < PT_CPUID_LEAVES; i++) {
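The same capability can be checked from user space, which helps when figuring out why PT events go quiet under a hypervisor. A sketch using the msr driver; it assumes the msr module is loaded, /dev/cpu/0/msr is readable (root), and that 0x485 is IA32_VMX_MISC:

    /* User-space sketch: read IA32_VMX_MISC and test bit 14, the same
     * "PT can trace post-VMXON" bit that pt_pmu_hw_init() now latches
     * into pt_pmu.vmx. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            uint64_t misc;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0 || pread(fd, &misc, sizeof(misc), 0x485) != sizeof(misc)) {
                    perror("IA32_VMX_MISC");
                    return 1;
            }
            printf("PT can trace post-VMXON: %s\n",
                   (misc & (1ULL << 14)) ? "yes" : "no");
            close(fd);
            return 0;
    }

On CPUs where the bit reads as 0, the driver falls back to the behaviour added below: tracing is simply left untouched while VMX is on.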
@@ -269,20 +281,23 @@ static void pt_config(struct perf_event *event)
 	reg |= (event->attr.config & PT_CONFIG_MASK);
+	event->hw.config = reg;
 	wrmsrl(MSR_IA32_RTIT_CTL, reg);
 }
-static void pt_config_start(bool start)
+static void pt_config_stop(struct perf_event *event)
 {
-	u64 ctl;
+	u64 ctl = READ_ONCE(event->hw.config);
+	/* may be already stopped by a PMI */
+	if (!(ctl & RTIT_CTL_TRACEEN))
+		return;
-	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-	if (start)
-		ctl |= RTIT_CTL_TRACEEN;
-	else
-		ctl &= ~RTIT_CTL_TRACEEN;
+	ctl &= ~RTIT_CTL_TRACEEN;
 	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+	WRITE_ONCE(event->hw.config, ctl);
 	/*
 	 * A wrmsr that disables trace generation serializes other PT
 	 * registers and causes all data packets to be written to memory,
@@ -291,8 +306,7 @@ static void pt_config_start(bool start)
 	 * The below WMB, separating data store and aux_head store matches
 	 * the consumer's RMB that separates aux_head load and data load.
 	 */
-	if (!start)
-		wmb();
+	wmb();
 }
 static void pt_config_buffer(void *buf, unsigned int topa_idx,
@@ -942,11 +956,17 @@ void intel_pt_interrupt(void)
 	if (!ACCESS_ONCE(pt->handle_nmi))
 		return;
-	pt_config_start(false);
+	/*
+	 * If VMX is on and PT does not support it, don't touch anything.
+	 */
+	if (READ_ONCE(pt->vmx_on))
+		return;
 	if (!event)
 		return;
+	pt_config_stop(event);
 	buf = perf_get_aux(&pt->handle);
 	if (!buf)
 		return;
@@ -983,6 +1003,35 @@ void intel_pt_interrupt(void)
 	}
 }
+void intel_pt_handle_vmx(int on)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct perf_event *event;
+	unsigned long flags;
+	/* PT plays nice with VMX, do nothing */
+	if (pt_pmu.vmx)
+		return;
+	/*
+	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
+	 * sure to not try to set it while VMX is on. Disable
+	 * interrupts to avoid racing with pmu callbacks;
+	 * concurrent PMI should be handled fine.
+	 */
+	local_irq_save(flags);
+	WRITE_ONCE(pt->vmx_on, on);
+	if (on) {
+		/* prevent pt_config_stop() from writing RTIT_CTL */
+		event = pt->handle.event;
+		if (event)
+			event->hw.config = 0;
+	}
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
 /*
  * PMU callbacks
  */
@@ -992,6 +1041,9 @@ static void pt_event_start(struct perf_event *event, int mode)
 	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	if (READ_ONCE(pt->vmx_on))
+		return;
 	if (!buf || pt_buffer_is_full(buf, pt)) {
 		event->hw.state = PERF_HES_STOPPED;
 		return;
@@ -1014,7 +1066,8 @@ static void pt_event_stop(struct perf_event *event, int mode)
 	 * see comment in intel_pt_interrupt().
 	 */
 	ACCESS_ONCE(pt->handle_nmi) = 0;
-	pt_config_start(false);
+	pt_config_stop(event);
 	if (event->hw.state == PERF_HES_STOPPED)
 		return;
...
@@ -65,6 +65,7 @@ enum pt_capabilities {
 struct pt_pmu {
 	struct pmu		pmu;
 	u32			caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+	bool			vmx;
 };
 /**
@@ -107,10 +108,12 @@ struct pt_buffer {
  * struct pt - per-cpu pt context
  * @handle:	perf output handle
  * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
+ * @vmx_on:	1 if VMX is ON on this cpu
  */
 struct pt {
 	struct perf_output_handle handle;
 	int			handle_nmi;
+	int			vmx_on;
 };
 #endif /* __INTEL_PT_H__ */
@@ -718,6 +718,7 @@ static int __init rapl_pmu_init(void)
 		break;
 	case 60: /* Haswell */
 	case 69: /* Haswell-Celeron */
+	case 70: /* Haswell GT3e */
 	case 61: /* Broadwell */
 	case 71: /* Broadwell-H */
 		rapl_cntr_mask = RAPL_IDX_HSW;
...
@@ -285,6 +285,10 @@ static inline void perf_events_lapic_init(void)	{ }
 static inline void perf_check_microcode(void) { }
 #endif
+#ifdef CONFIG_CPU_SUP_INTEL
+ extern void intel_pt_handle_vmx(int on);
+#endif
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
 extern void amd_pmu_enable_virt(void);
 extern void amd_pmu_disable_virt(void);
...
@@ -3103,6 +3103,8 @@ static __init int vmx_disabled_by_bios(void)
 static void kvm_cpu_vmxon(u64 addr)
 {
+	intel_pt_handle_vmx(1);
 	asm volatile (ASM_VMX_VMXON_RAX
 			: : "a"(&addr), "m"(addr)
 			: "memory", "cc");
@@ -3172,6 +3174,8 @@ static void vmclear_local_loaded_vmcss(void)
 static void kvm_cpu_vmxoff(void)
 {
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+	intel_pt_handle_vmx(0);
 }
 static void hardware_disable(void)
...
@@ -412,7 +412,8 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 	if (ret || !write)
 		return ret;
-	if (sysctl_perf_cpu_time_max_percent == 100) {
+	if (sysctl_perf_cpu_time_max_percent == 100 ||
+	    sysctl_perf_cpu_time_max_percent == 0) {
 		printk(KERN_WARNING
 		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 		WRITE_ONCE(perf_sample_allowed_ns, 0);
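For context, a zero perf_sample_allowed_ns is the sentinel the sampling path treats as "throttling off", which is why writing 0 for both the 0% and the 100% settings is what actually disables the mechanism, as the documentation promises for those values. Roughly what the consumer side does with it (simplified, not the exact kernel code):

    /* Simplified sketch: the PMI-time accounting bails out early when
     * the budget is zero, so the handler above disables throttling for
     * both sysctl values by writing 0. */
    void perf_sample_event_took(u64 sample_len_ns)
    {
            u64 allowed_ns = READ_ONCE(perf_sample_allowed_ns);

            if (allowed_ns == 0)            /* mechanism disabled */
                    return;

            /* otherwise compare time spent in perf interrupts against
             * the budget and lower the max sample rate if exceeded */
    }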
@@ -1105,6 +1106,7 @@ static void put_ctx(struct perf_event_context *ctx)
  * function.
  *
  * Lock order:
+ *	  cred_guard_mutex
  *	    task_struct::perf_event_mutex
  *	      perf_event_context::mutex
  *	        perf_event::child_mutex;
@@ -3420,7 +3422,6 @@ static struct task_struct *
 find_lively_task_by_vpid(pid_t vpid)
 {
 	struct task_struct *task;
-	int err;
 	rcu_read_lock();
 	if (!vpid)
@@ -3434,16 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid)
 	if (!task)
 		return ERR_PTR(-ESRCH);
-	/* Reuse ptrace permission checks for now. */
-	err = -EACCES;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
-		goto errout;
 	return task;
-errout:
-	put_task_struct(task);
-	return ERR_PTR(err);
 }
 /*
@@ -8413,6 +8405,24 @@ SYSCALL_DEFINE5(perf_event_open,
 	get_online_cpus();
+	if (task) {
+		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+		if (err)
+			goto err_cpus;
+		/*
+		 * Reuse ptrace permission checks for now.
+		 *
+		 * We must hold cred_guard_mutex across this and any potential
+		 * perf_install_in_context() call for this new event to
+		 * serialize against exec() altering our credentials (and the
+		 * perf_event_exit_task() that could imply).
+		 */
+		err = -EACCES;
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+			goto err_cred;
+	}
 	if (flags & PERF_FLAG_PID_CGROUP)
 		cgroup_fd = pid;
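To make the race concrete: the window being closed lies between the permission check against the target's current credentials and the moment the new event is installed in the target's context, during which the target may exec() a privileged binary. A purely hypothetical user-space illustration of that timing (the setuid target path and the retry loop are invented for the sketch; this is not a reproducer from the commit):

    /* Hypothetical illustration: the parent races perf_event_open()
     * against the child exec()ing a setuid binary.  With this patch
     * the kernel holds cred_guard_mutex from ptrace_may_access() until
     * the event is installed, so the open either observes the pre-exec
     * task or fails with -EACCES after the credentials change. */
    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
            struct perf_event_attr attr;
            pid_t child = fork();
            int i;

            if (child == 0) {
                    execl("/usr/bin/passwd", "passwd", (char *)NULL); /* any setuid target */
                    _exit(1);
            }

            memset(&attr, 0, sizeof(attr));
            attr.type = PERF_TYPE_HARDWARE;
            attr.size = sizeof(attr);
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;

            for (i = 0; i < 1000000; i++)   /* hammer the window */
                    if (perf_event_open(&attr, child, -1, -1, 0) >= 0)
                            break;
            return 0;
    }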
@@ -8420,7 +8430,7 @@ SYSCALL_DEFINE5(perf_event_open,
 				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
-		goto err_cpus;
+		goto err_cred;
 	}
 	if (is_sampling_event(event)) {
@@ -8479,11 +8489,6 @@ SYSCALL_DEFINE5(perf_event_open,
 			goto err_context;
 	}
-	if (task) {
-		put_task_struct(task);
-		task = NULL;
-	}
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
@@ -8581,6 +8586,11 @@ SYSCALL_DEFINE5(perf_event_open,
 	WARN_ON_ONCE(ctx->parent_ctx);
+	/*
+	 * This is the point on no return; we cannot fail hereafter. This is
+	 * where we start modifying current state.
+	 */
 	if (move_group) {
 		/*
 		 * See perf_event_ctx_lock() for comments on the details
@@ -8652,6 +8662,11 @@ SYSCALL_DEFINE5(perf_event_open,
 	mutex_unlock(&gctx->mutex);
 	mutex_unlock(&ctx->mutex);
+	if (task) {
+		mutex_unlock(&task->signal->cred_guard_mutex);
+		put_task_struct(task);
+	}
 	put_online_cpus();
 	mutex_lock(&current->perf_event_mutex);
@@ -8684,6 +8699,9 @@ SYSCALL_DEFINE5(perf_event_open,
 	 */
 	if (!event_file)
 		free_event(event);
+err_cred:
+	if (task)
+		mutex_unlock(&task->signal->cred_guard_mutex);
 err_cpus:
 	put_online_cpus();
 err_task:
@@ -8968,6 +8986,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 /*
  * When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
  */
 void perf_event_exit_task(struct task_struct *child)
 {
...