Merge branch 'improve_perf_barriers'

Daniel Borkmann says: ==================== This set first adds smp_* barrier variants to tools infrastructure and updates perf and libbpf to make use of them. For details, please see individual patches, thanks! Arnaldo, if there are no objections, could this be routed via bpf-next with Acked-by's due to later dependencies in libbpf? Alternatively, I could also get the 2nd patch out during merge window, but perhaps it's okay to do in one go as there shouldn't be much conflict in perf itself. Thanks! v1 -> v2: - add common helper and switch to acquire/release variants when possible, thanks Peter! ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Merge branch 'improve_perf_barriers'
Daniel Borkmann says: ==================== This set first adds smp_* barrier variants to tools infrastructure and updates perf and libbpf to make use of them. For details, please see individual patches, thanks! Arnaldo, if there are no objections, could this be routed via bpf-next with Acked-by's due to later dependencies in libbpf? Alternatively, I could also get the 2nd patch out during merge window, but perhaps it's okay to do in one go as there shouldn't be much conflict in perf itself. Thanks! v1 -> v2: - add common helper and switch to acquire/release variants when possible, thanks Peter! ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2929ad29 · Alexei Starovoitov · 78de3546 · a64af0ef · 2929ad29 · 2929ad29
Commit 2929ad29 authored Oct 19, 2018 by Alexei Starovoitov
10 changed files
--- a/tools/arch/arm64/include/asm/barrier.h
+++ b/tools/arch/arm64/include/asm/barrier.h
@@ -14,4 +14,74 @@
 #define wmb()		asm volatile("dmb ishst" ::: "memory")
 #define rmb()		asm volatile("dmb ishld" ::: "memory")

+#define smp_store_release(p, v)					\
+do {								\
+	union { typeof(*p) __val; char __c[1]; } __u =		\
+		{ .__val = (__force typeof(*p)) (v) }; 		\
+								\
+	switch (sizeof(*p)) {					\
+	case 1:							\
+		asm volatile ("stlrb %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u8 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 2:							\
+		asm volatile ("stlrh %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u16 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 4:							\
+		asm volatile ("stlr %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u32 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 8:							\
+		asm volatile ("stlr %1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u64 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	default:						\
+		/* Only to shut up gcc ... */			\
+		mb();						\
+		break;						\
+	}							\
+} while (0)
+
+#define smp_load_acquire(p)					\
+({								\
+	union { typeof(*p) __val; char __c[1]; } __u;		\
+								\
+	switch (sizeof(*p)) {					\
+	case 1:							\
+		asm volatile ("ldarb %w0, %1"			\
+			: "=r" (*(__u8 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 2:							\
+		asm volatile ("ldarh %w0, %1"			\
+			: "=r" (*(__u16 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 4:							\
+		asm volatile ("ldar %w0, %1"			\
+			: "=r" (*(__u32 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 8:							\
+		asm volatile ("ldar %0, %1"			\
+			: "=r" (*(__u64 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	default:						\
+		/* Only to shut up gcc ... */			\
+		mb();						\
+		break;						\
+	}							\
+	__u.__val;						\
+})
+
 #endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */
--- a/tools/arch/ia64/include/asm/barrier.h
+++ b/tools/arch/ia64/include/asm/barrier.h
@@ -46,4 +46,17 @@
 #define rmb()		mb()
 #define wmb()		mb()

+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */
--- a/tools/arch/powerpc/include/asm/barrier.h
+++ b/tools/arch/powerpc/include/asm/barrier.h
@@ -27,4 +27,20 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")

+#if defined(__powerpc64__)
+#define smp_lwsync()	__asm__ __volatile__ ("lwsync" : : : "memory")
+
+#define smp_store_release(p, v)			\
+do {						\
+	smp_lwsync();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	smp_lwsync();				\
+	___p1;					\
+})
+#endif /* defined(__powerpc64__) */
 #endif /* _TOOLS_LINUX_ASM_POWERPC_BARRIER_H */
--- a/tools/arch/s390/include/asm/barrier.h
+++ b/tools/arch/s390/include/asm/barrier.h
@@ -28,4 +28,17 @@
 #define rmb()				mb()
 #define wmb()				mb()

+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* __TOOLS_LIB_ASM_BARRIER_H */
--- a/tools/arch/sparc/include/asm/barrier_64.h
+++ b/tools/arch/sparc/include/asm/barrier_64.h
@@ -40,4 +40,17 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define rmb()	__asm__ __volatile__("":::"memory")
 #define wmb()	__asm__ __volatile__("":::"memory")

+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* !(__TOOLS_LINUX_SPARC64_BARRIER_H) */
--- a/tools/arch/x86/include/asm/barrier.h
+++ b/tools/arch/x86/include/asm/barrier.h
@@ -26,4 +26,18 @@
 #define wmb()	asm volatile("sfence" ::: "memory")
 #endif

+#if defined(__x86_64__)
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+#endif /* defined(__x86_64__) */
 #endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */
--- a/tools/include/asm/barrier.h
+++ b/tools/include/asm/barrier.h
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/compiler.h>
 #if defined(__i386__) || defined(__x86_64__)
 #include "../../arch/x86/include/asm/barrier.h"
 #elif defined(__arm__)
@@ -26,3 +27,37 @@
 #else
 #include <asm-generic/barrier.h>
 #endif
+
+/*
+ * Generic fallback smp_*() definitions for archs that haven't
+ * been updated yet.
+ */
+
+#ifndef smp_rmb
+# define smp_rmb()	rmb()
+#endif
+
+#ifndef smp_wmb
+# define smp_wmb()	wmb()
+#endif
+
+#ifndef smp_mb
+# define smp_mb()	mb()
+#endif
+
+#ifndef smp_store_release
+# define smp_store_release(p, v)		\
+do {						\
+	smp_mb();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+#endif
+
+#ifndef smp_load_acquire
+# define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	smp_mb();				\
+	___p1;					\
+})
+#endif
--- a/tools/include/linux/ring_buffer.h
+++ b/tools/include/linux/ring_buffer.h
+#ifndef _TOOLS_LINUX_RING_BUFFER_H_
+#define _TOOLS_LINUX_RING_BUFFER_H_
+
+#include <asm/barrier.h>
+
+/*
+ * Contract with kernel for walking the perf ring buffer from
+ * user space requires the following barrier pairing (quote
+ * from kernel/events/ring_buffer.c):
+ *
+ *   Since the mmap() consumer (userspace) can run on a
+ *   different CPU:
+ *
+ *   kernel                             user
+ *
+ *   if (LOAD ->data_tail) {            LOAD ->data_head
+ *                      (A)             smp_rmb()       (C)
+ *      STORE $data                     LOAD $data
+ *      smp_wmb()       (B)             smp_mb()        (D)
+ *      STORE ->data_head               STORE ->data_tail
+ *   }
+ *
+ *   Where A pairs with D, and B pairs with C.
+ *
+ *   In our case A is a control dependency that separates the
+ *   load of the ->data_tail and the stores of $data. In case
+ *   ->data_tail indicates there is no room in the buffer to
+ *   store $data we do not.
+ *
+ *   D needs to be a full barrier since it separates the data
+ *   READ from the tail WRITE.
+ *
+ *   For B a WMB is sufficient since it separates two WRITEs,
+ *   and for C an RMB is sufficient since it separates two READs.
+ *
+ * Note, instead of B, C, D we could also use smp_store_release()
+ * in B and D as well as smp_load_acquire() in C.
+ *
+ * However, this optimization does not make sense for all kernel
+ * supported architectures since for a fair number it would
+ * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
+ * and smp_mb() + WRITE_ONCE() pair for smp_store_release().
+ *
+ * Thus for those smp_wmb() in B and smp_rmb() in C would still
+ * be less expensive. For the case of D this has either the same
+ * cost or is less expensive, for example, due to TSO x86 can
+ * avoid the CPU barrier entirely.
+ */
+
+static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
+{
+/*
+ * Architectures where smp_load_acquire() does not fallback to
+ * READ_ONCE() + smp_mb() pair.
+ */
+#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
+    defined(__ia64__) || defined(__sparc__) && defined(__arch64__)
+	return smp_load_acquire(&base->data_head);
+#else
+	u64 head = READ_ONCE(base->data_head);
+
+	smp_rmb();
+	return head;
+#endif
+}
+
+static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
+					  u64 tail)
+{
+	smp_store_release(&base->data_tail, tail);
+}
+
+#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/limits.h>
 #include <linux/perf_event.h>
+#include <linux/ring_buffer.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
@@ -2418,13 +2419,12 @@ bpf_perf_event_read_simple(void *mem, unsigned long size,
 			   unsigned long page_size, void **buf, size_t *buf_len,
 			   bpf_perf_event_print_t fn, void *priv)
 {
-	volatile struct perf_event_mmap_page *header = mem;
+	struct perf_event_mmap_page *header = mem;
+	__u64 data_head = ring_buffer_read_head(header);
 	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
 	int ret = LIBBPF_PERF_EVENT_ERROR;
 	void *base, *begin, *end;

-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
 	if (data_head == data_tail)
 		return LIBBPF_PERF_EVENT_CONT;

@@ -2467,8 +2467,6 @@ bpf_perf_event_read_simple(void *mem, unsigned long size,
 		data_tail += ehdr->size;
 	}

-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_tail;
-
+	ring_buffer_write_tail(header, data_tail);
 	return ret;
 }
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -4,7 +4,7 @@
 #include <linux/compiler.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
-#include <asm/barrier.h>
+#include <linux/ring_buffer.h>
 #include <stdbool.h>
 #include "auxtrace.h"
 #include "event.h"
@@ -71,21 +71,12 @@ void perf_mmap__consume(struct perf_mmap *map);

 static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
 {
-	struct perf_event_mmap_page *pc = mm->base;
-	u64 head = READ_ONCE(pc->data_head);
-	rmb();
-	return head;
+	return ring_buffer_read_head(mm->base);
 }

 static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
 {
-	struct perf_event_mmap_page *pc = md->base;
-
-	/*
-	 * ensure all reads are done before we write the tail out.
-	 */
-	mb();
-	pc->data_tail = tail;
+	ring_buffer_write_tail(md->base, tail);
 }

 union perf_event *perf_mmap__read_forward(struct perf_mmap *map);