Commit be4bdbfb authored by Ingo Molnar

Merge branch 'tracing/core-v3' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing into tracing/urgent

parents fc537766 20ab4425
@@ -4,6 +4,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/trace_seq.h>
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 struct trace_array;
 struct tracer;
@@ -130,10 +131,15 @@ struct ftrace_event_call {
 	void			*data;
 
 	atomic_t		profile_count;
-	int			(*profile_enable)(struct ftrace_event_call *);
-	void			(*profile_disable)(struct ftrace_event_call *);
+	int			(*profile_enable)(void);
+	void			(*profile_disable)(void);
 };
 
+#define FTRACE_MAX_PROFILE_SIZE	2048
+
+extern char *trace_profile_buf;
+extern char *trace_profile_buf_nmi;
+
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
......
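The per-event refcount and the two buffer pointers added here are consumed by the generated profile handlers further down in this commit. Condensed into one place, the consumer pattern looks roughly like the sketch below; the function name is made up, the real callers are the macro-generated ftrace_profile_##call handlers and prof_syscall_enter/exit:

#include <linux/types.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/hardirq.h>		/* in_nmi() */
#include <linux/ftrace_event.h>		/* FTRACE_MAX_PROFILE_SIZE, trace_profile_buf* */

/*
 * Sketch only: "size" is the aligned sample size, already checked
 * against FTRACE_MAX_PROFILE_SIZE by the caller.
 */
static void example_profile_hit(int size)
{
	unsigned long flags;
	char *raw_data;

	/* irqs off also pins the sched-RCU read side used at teardown */
	local_irq_save(flags);

	if (in_nmi())
		raw_data = rcu_dereference(trace_profile_buf_nmi);
	else
		raw_data = rcu_dereference(trace_profile_buf);

	if (!raw_data)
		goto out;		/* profiling was just disabled */

	/* each CPU owns one FTRACE_MAX_PROFILE_SIZE slot */
	raw_data = per_cpu_ptr(raw_data, smp_processor_id());

	/* zero the alignment padding so stale bytes never reach userspace */
	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;

	/* ... build the sample in raw_data and hand it to perf_tpcounter_event() ... */
out:
	local_irq_restore(flags);
}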
@@ -100,32 +100,24 @@ struct perf_counter_attr;
 #ifdef CONFIG_EVENT_PROFILE
 #define TRACE_SYS_ENTER_PROFILE(sname)					\
-static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call) \
+static int prof_sysenter_enable_##sname(void)				\
 {									\
-	int ret = 0;							\
-	if (!atomic_inc_return(&event_enter_##sname.profile_count))	\
-		ret = reg_prof_syscall_enter("sys"#sname);		\
-	return ret;							\
+	return reg_prof_syscall_enter("sys"#sname);			\
 }									\
 									\
-static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\
+static void prof_sysenter_disable_##sname(void)			\
 {									\
-	if (atomic_add_negative(-1, &event_enter_##sname.profile_count))\
 	unreg_prof_syscall_enter("sys"#sname);				\
 }
 
 #define TRACE_SYS_EXIT_PROFILE(sname)					\
-static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call) \
+static int prof_sysexit_enable_##sname(void)				\
 {									\
-	int ret = 0;							\
-	if (!atomic_inc_return(&event_exit_##sname.profile_count))	\
-		ret = reg_prof_syscall_exit("sys"#sname);		\
-	return ret;							\
+	return reg_prof_syscall_exit("sys"#sname);			\
 }									\
 									\
-static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
+static void prof_sysexit_disable_##sname(void)				\
 {									\
-	if (atomic_add_negative(-1, &event_exit_##sname.profile_count))\
 	unreg_prof_syscall_exit("sys"#sname);				\
 }
......
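For a concrete view of what the slimmed-down macro now generates, here is roughly the expansion of TRACE_SYS_ENTER_PROFILE for a hypothetical sname of _getpid (the name is purely illustrative); the per-event atomic bookkeeping that used to live here is now done centrally in ftrace_profile_enable_event()/ftrace_profile_disable_event(), added further down in this commit:

static int prof_sysenter_enable__getpid(void)
{
	/* no profile_count juggling here anymore */
	return reg_prof_syscall_enter("sys_getpid");
}

static void prof_sysenter_disable__getpid(void)
{
	unreg_prof_syscall_enter("sys_getpid");
}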
@@ -382,19 +382,13 @@ static inline int ftrace_get_offsets_##call(		\
  *
  * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
- * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
+ * static int ftrace_profile_enable_<call>(void)
  * {
- *	int ret = 0;
- *
- *	if (!atomic_inc_return(&event_call->profile_count))
- *		ret = register_trace_<call>(ftrace_profile_<call>);
- *
- *	return ret;
+ *	return register_trace_<call>(ftrace_profile_<call>);
  * }
  *
- * static void ftrace_profile_disable_<call>(struct ftrace_event_call *event_call)
+ * static void ftrace_profile_disable_<call>(void)
  * {
- *	if (atomic_add_negative(-1, &event->call->profile_count))
  *	unregister_trace_<call>(ftrace_profile_<call>);
  * }
  *
@@ -405,19 +399,13 @@ static inline int ftrace_get_offsets_##call(		\
 									\
 static void ftrace_profile_##call(proto);				\
 									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
+static int ftrace_profile_enable_##call(void)				\
 {									\
-	int ret = 0;							\
-									\
-	if (!atomic_inc_return(&event_call->profile_count))		\
-		ret = register_trace_##call(ftrace_profile_##call);	\
-									\
-	return ret;							\
+	return register_trace_##call(ftrace_profile_##call);		\
 }									\
 									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
+static void ftrace_profile_disable_##call(void)			\
 {									\
-	if (atomic_add_negative(-1, &event_call->profile_count))	\
 	unregister_trace_##call(ftrace_profile_##call);			\
 }
@@ -660,11 +648,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
  *	struct ftrace_raw_##call *entry;
  *	u64 __addr = 0, __count = 1;
  *	unsigned long irq_flags;
+ *	struct trace_entry *ent;
  *	int __entry_size;
  *	int __data_size;
+ *	int __cpu
  *	int pc;
  *
- *	local_save_flags(irq_flags);
  *	pc = preempt_count();
  *
  *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
@@ -675,12 +664,22 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
  *			     sizeof(u64));
  *	__entry_size -= sizeof(u32);
  *
- *	do {
- *		char raw_data[__entry_size]; <- allocate our sample in the stack
- *		struct trace_entry *ent;
+ *	// Protect the non nmi buffer
+ *	// This also protects the rcu read side
+ *	local_irq_save(irq_flags);
+ *	__cpu = smp_processor_id();
+ *
+ *	if (in_nmi())
+ *		raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *	else
+ *		raw_data = rcu_dereference(trace_profile_buf);
  *
- *		zero dead bytes from alignment to avoid stack leak to userspace:
+ *	if (!raw_data)
+ *		goto end;
  *
+ *	raw_data = per_cpu_ptr(raw_data, __cpu);
+ *
+ *	//zero dead bytes from alignment to avoid stack leak to userspace:
  *	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
  *	entry = (struct ftrace_raw_<call> *)raw_data;
  *	ent = &entry->ent;
@@ -693,7 +692,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
  *
  *	perf_tpcounter_event(event_call->id, __addr, __count, entry,
  *		     __entry_size);  <- submit them to perf counter
- *	} while (0);
  *
  * }
  */
@@ -716,11 +714,13 @@ static void ftrace_profile_##call(proto)		\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
+	struct trace_entry *ent;					\
 	int __entry_size;						\
 	int __data_size;						\
+	char *raw_data;							\
+	int __cpu;							\
 	int pc;								\
 									\
-	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
@@ -728,9 +728,22 @@ static void ftrace_profile_##call(proto)		\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	do {								\
-		char raw_data[__entry_size];				\
-		struct trace_entry *ent;				\
+	if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE,		\
+		      "profile buffer not large enough"))		\
+		return;							\
+									\
+	local_irq_save(irq_flags);					\
+	__cpu = smp_processor_id();					\
+									\
+	if (in_nmi())							\
+		raw_data = rcu_dereference(trace_profile_buf_nmi);	\
+	else								\
+		raw_data = rcu_dereference(trace_profile_buf);		\
+									\
+	if (!raw_data)							\
+		goto end;						\
+									\
+	raw_data = per_cpu_ptr(raw_data, __cpu);			\
 									\
 	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;	\
 	entry = (struct ftrace_raw_##call *)raw_data;			\
@@ -742,9 +755,11 @@ static void ftrace_profile_##call(proto)		\
 									\
 	{ assign; }							\
 									\
-	perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+	perf_tpcounter_event(event_call->id, __addr, __count, entry,	\
 			     __entry_size);				\
-	} while (0);							\
+									\
+end:									\
+	local_irq_restore(irq_flags);					\
 									\
 }
......
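A note on the __entry_size arithmetic in the handler above (it reappears in the syscall handlers below): perf's raw samples carry a u32 size field in front of the payload, so the payload is sized so that the u32 header plus the payload ends on a u64 boundary, and the last u64 of the payload is zeroed because the rounding can leave a few padding bytes that must not leak kernel memory to userspace. A small userspace illustration of the same arithmetic (the numbers are made up):

#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	size_t data_size = 34;	/* pretend sizeof(*entry) + dynamic payload */

	/* align including the u32 size header perf prepends... */
	size_t entry_size = ALIGN(data_size + sizeof(uint32_t), sizeof(uint64_t));
	/* ...then drop that header again: this is the part we fill in */
	entry_size -= sizeof(uint32_t);

	/*
	 * 34 + 4 = 38 -> rounded up to 40 -> 36 bytes of payload,
	 * i.e. 2 padding bytes, which is why the real code does
	 * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
	 */
	printf("data=%zu payload=%zu pad=%zu\n",
	       data_size, entry_size, entry_size - data_size);
	return 0;
}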
@@ -8,6 +8,54 @@
 #include <linux/module.h>
 #include "trace.h"
 
+/*
+ * We can't use a size but a type in alloc_percpu()
+ * So let's create a dummy type that matches the desired size
+ */
+typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
+
+char *trace_profile_buf;
+char *trace_profile_buf_nmi;
+
+/* Count the events in use (per event id, not per instance) */
+static int	total_profile_count;
+
+static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+{
+	char *buf;
+	int ret = -ENOMEM;
+
+	if (atomic_inc_return(&event->profile_count))
+		return 0;
+
+	if (!total_profile_count++) {
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf;
+
+		rcu_assign_pointer(trace_profile_buf, buf);
+
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf_nmi;
+
+		rcu_assign_pointer(trace_profile_buf_nmi, buf);
+	}
+
+	ret = event->profile_enable();
+	if (!ret)
+		return 0;
+
+	kfree(trace_profile_buf_nmi);
+fail_buf_nmi:
+	kfree(trace_profile_buf);
+fail_buf:
+	total_profile_count--;
+	atomic_dec(&event->profile_count);
+
+	return ret;
+}
+
 int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
@@ -17,7 +65,7 @@ int ftrace_profile_enable(int event_id)
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id && event->profile_enable &&
 		    try_module_get(event->mod)) {
-			ret = event->profile_enable(event);
+			ret = ftrace_profile_enable_event(event);
 			break;
 		}
 	}
@@ -26,6 +74,33 @@ int ftrace_profile_enable(int event_id)
 	return ret;
 }
 
+static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+{
+	char *buf, *nmi_buf;
+
+	if (!atomic_add_negative(-1, &event->profile_count))
+		return;
+
+	event->profile_disable();
+
+	if (!--total_profile_count) {
+		buf = trace_profile_buf;
+		rcu_assign_pointer(trace_profile_buf, NULL);
+
+		nmi_buf = trace_profile_buf_nmi;
+		rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+
+		/*
+		 * Ensure every events in profiling have finished before
+		 * releasing the buffers
+		 */
+		synchronize_sched();
+
+		free_percpu(buf);
+		free_percpu(nmi_buf);
+	}
+}
+
 void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
@@ -33,7 +108,7 @@ void ftrace_profile_disable(int event_id)
 	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id) {
-			event->profile_disable(event);
+			ftrace_profile_disable_event(event);
 			module_put(event->mod);
 			break;
 		}
......
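A note on the bookkeeping above: event->profile_count starts at -1 in the event definitions (not visible in these hunks), so the first atomic_inc_return() yields 0 and actually enables the event while later enables only bump the count, and total_profile_count makes the two per-cpu buffers get allocated on the first enabled event and freed after the last one. Teardown relies on all readers dereferencing the buffers with interrupts disabled, which is why one sched-RCU grace period is enough before freeing. A minimal standalone sketch of that publish/unpublish pattern (hypothetical names, not code from this commit):

#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>

/* dummy type: alloc_percpu() wants a type, not a size */
typedef struct { char buf[2048]; } scratch_buf_t;

static char *scratch_buf;	/* published to readers via RCU */

static int scratch_publish(void)
{
	char *buf = (char *)alloc_percpu(scratch_buf_t);

	if (!buf)
		return -ENOMEM;

	/* pairs with rcu_dereference() on the (irqs-off) reader side */
	rcu_assign_pointer(scratch_buf, buf);
	return 0;
}

static void scratch_unpublish(void)
{
	char *buf = scratch_buf;

	rcu_assign_pointer(scratch_buf, NULL);

	/*
	 * Readers run with irqs (and thus preemption) disabled, so a
	 * sched-RCU grace period guarantees they are all done before
	 * the memory is handed back.
	 */
	synchronize_sched();
	free_percpu(buf);
}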
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
 
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
-	struct syscall_trace_enter *rec;
 	struct syscall_metadata *sys_data;
+	struct syscall_trace_enter *rec;
+	unsigned long flags;
+	char *raw_data;
 	int syscall_nr;
 	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,8 +405,24 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	do {
-		char raw_data[size];
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		      "profile buffer not large enough"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
+
+	raw_data = per_cpu_ptr(raw_data, cpu);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -415,7 +434,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
 
 	perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
-	} while(0);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
 static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
-	struct syscall_trace_exit rec;
+	struct syscall_trace_exit *rec;
+	unsigned long flags;
 	int syscall_nr;
+	char *raw_data;
+	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	tracing_generic_entry_update(&rec.ent, 0, 0);
-	rec.ent.type = sys_data->exit_id;
-	rec.nr = syscall_nr;
-	rec.ret = syscall_get_return_value(current, regs);
+	/* We can probably do that at build time */
+	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
 
-	perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+	/*
+	 * Impossible, but be paranoid with the future
+	 * How to put this check outside runtime?
+	 */
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		      "exit event has grown above profile buffer size"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
+
+	raw_data = per_cpu_ptr(raw_data, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	rec = (struct syscall_trace_exit *)raw_data;
+
+	tracing_generic_entry_update(&rec->ent, 0, 0);
+	rec->ent.type = sys_data->exit_id;
+	rec->nr = syscall_nr;
+	rec->ret = syscall_get_return_value(current, regs);
+
+	perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_exit(char *name)
......