Commit 1be4b121 authored by Vladislav Vaintroub

Small Windows-specific performance fixes:

- Use the native memcmp() supplied with the C runtime instead of the
  hand-unrolled ptr_compare_N loops.
  Prior to the fix, ptr_compare_0() accounted for 3.7% of samples in the
  in-memory OLTP-RO benchmark; the fix brings this down to 1.8% (all of
  them now memcmp samples).

- InnoDB: define UT_RELAX_CPU() as YieldProcessor(), as was originally
  intended (the intention got lost in the #ifdef maze).
  This reduces the number of ut_delay() samples in the profile from 1.5%
  to 0.5%.
parent 99aa3d46
@@ -21,17 +21,23 @@
 #include "mysys_priv.h"
 #include <myisampack.h>
-#ifdef __sun
 /*
- * On Solaris, memcmp() is normally faster than the unrolled ptr_compare_N
+ * On some platforms, memcmp() is faster than the unrolled ptr_compare_N
  * functions, as memcmp() is usually a platform-specific implementation
- * written in assembler, provided in /usr/lib/libc/libc_hwcap*.so.1.
- * This implementation is also usually faster than the built-in memcmp
- * supplied by GCC, so it is recommended to build with "-fno-builtin-memcmp"
- * in CFLAGS if building with GCC on Solaris.
+ * written in assembler, for example the one in /usr/lib/libc/libc_hwcap*.so.1
+ * on Solaris, or the one in the C runtime library on Windows.
+ *
+ * On Solaris, the native implementation is also usually faster than the
+ * built-in memcmp supplied by GCC, so it is recommended to build
+ * with "-fno-builtin-memcmp" in CFLAGS if building with GCC on Solaris.
  */
+#if defined (__sun) || defined (_WIN32)
+#define USE_NATIVE_MEMCMP 1
+#endif
+#ifdef USE_NATIVE_MEMCMP
 #include <string.h>
 static int native_compare(size_t *length, unsigned char **a, unsigned char **b)

@@ -39,7 +45,7 @@ static int native_compare(size_t *length, unsigned char **a, unsigned char **b)
   return memcmp(*a, *b, *length);
 }
-#else /* __sun */
+#else /* USE_NATIVE_MEMCMP */
 static int ptr_compare(size_t *compare_length, uchar **a, uchar **b);
 static int ptr_compare_0(size_t *compare_length, uchar **a, uchar **b);

@@ -50,7 +56,7 @@ static int ptr_compare_3(size_t *compare_length, uchar **a, uchar **b);
 /* Get a pointer to a optimal byte-compare function for a given size */
-#ifdef __sun
+#ifdef USE_NATIVE_MEMCMP
 qsort2_cmp get_ptr_compare (size_t size __attribute__((unused)))
 {
   return (qsort2_cmp) native_compare;

@@ -68,7 +74,7 @@ qsort2_cmp get_ptr_compare (size_t size)
   }
   return 0; /* Impossible */
 }
-#endif /* __sun */
+#endif /* USE_NATIVE_MEMCMP */
 /*

@@ -78,7 +84,7 @@ qsort2_cmp get_ptr_compare (size_t size)
 #define cmp(N) if (first[N] != last[N]) return (int) first[N] - (int) last[N]
-#ifndef __sun
+#ifndef USE_NATIVE_MEMCMP
 static int ptr_compare(size_t *compare_length, uchar **a, uchar **b)
 {
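To make the change above easier to follow outside the MariaDB tree, here is a small standalone C sketch (illustration only, not project code: native_compare() mirrors the wrapper in the diff, while unrolled_compare_4() is a hypothetical, fixed-size stand-in for the ptr_compare_N family). It shows the two comparator shapes involved: the memcmp() wrapper that get_ptr_compare() now returns on Solaris and Windows, and the kind of hand-unrolled byte comparison it replaces.

#include <stdio.h>
#include <string.h>

/* memcmp-based comparator, same shape as native_compare() in the diff above */
static int native_compare(size_t *length, unsigned char **a, unsigned char **b)
{
  return memcmp(*a, *b, *length);
}

/* Hand-unrolled comparison for a fixed 4-byte key, in the spirit of the
   ptr_compare_N functions (the real ones handle arbitrary lengths). */
static int unrolled_compare_4(size_t *length, unsigned char **a, unsigned char **b)
{
  unsigned char *first= *a, *last= *b;
  (void) length;                          /* size is hard-coded in this toy */
  if (first[0] != last[0]) return (int) first[0] - (int) last[0];
  if (first[1] != last[1]) return (int) first[1] - (int) last[1];
  if (first[2] != last[2]) return (int) first[2] - (int) last[2];
  return (int) first[3] - (int) last[3];
}

int main(void)
{
  unsigned char k1[4]= {1, 2, 3, 4}, k2[4]= {1, 2, 3, 5};
  unsigned char *p1= k1, *p2= k2;
  size_t len= sizeof(k1);

  /* Both comparators agree on the sign of the result; the patch merely
     picks the faster way of getting it on Solaris and Windows. */
  printf("memcmp-based : %d\n", native_compare(&len, &p1, &p2) < 0);
  printf("hand-unrolled: %d\n", unrolled_compare_4(&len, &p1, &p2) < 0);
  return 0;
}

Both functions produce the same ordering; the measured win comes purely from the C runtime's optimised memcmp(), which is what the drop from 3.7% to 1.8% of OLTP-RO samples reflects.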
@@ -63,16 +63,16 @@ typedef time_t ib_time_t;
 # define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
 #elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
 # define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop")
-#elif defined(HAVE_ATOMIC_BUILTINS)
-# define UT_RELAX_CPU() do { \
-    volatile lint volatile_var; \
-    os_compare_and_swap_lint(&volatile_var, 0, 1); \
-  } while (0)
 #elif defined(HAVE_WINDOWS_ATOMICS)
 /* In the Win32 API, the x86 PAUSE instruction is executed by calling
 the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
 independent way by using YieldProcessor. */
 # define UT_RELAX_CPU() YieldProcessor()
+#elif defined(HAVE_ATOMIC_BUILTINS)
+# define UT_RELAX_CPU() do { \
+    volatile lint volatile_var; \
+    os_compare_and_swap_lint(&volatile_var, 0, 1); \
+  } while (0)
 #else
 # define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
 #endif
The same change is applied again to a second copy of this header in the tree:

@@ -63,16 +63,16 @@ typedef time_t ib_time_t;
 # define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
 #elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
 # define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop")
-#elif defined(HAVE_ATOMIC_BUILTINS)
-# define UT_RELAX_CPU() do { \
-    volatile lint volatile_var; \
-    os_compare_and_swap_lint(&volatile_var, 0, 1); \
-  } while (0)
 #elif defined(HAVE_WINDOWS_ATOMICS)
 /* In the Win32 API, the x86 PAUSE instruction is executed by calling
 the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
 independent way by using YieldProcessor. */
 # define UT_RELAX_CPU() YieldProcessor()
+#elif defined(HAVE_ATOMIC_BUILTINS)
+# define UT_RELAX_CPU() do { \
+    volatile lint volatile_var; \
+    os_compare_and_swap_lint(&volatile_var, 0, 1); \
+  } while (0)
 #else
 # define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
 #endif
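For context, the sketch below (illustration only, not InnoDB code: RELAX_CPU() and toy_delay() merely imitate UT_RELAX_CPU() and ut_delay()) shows the pattern the reordered #elif cascade restores on Windows: a spin-wait loop that issues the x86 PAUSE hint on every iteration, via YieldProcessor() on Windows or the "pause" instruction with GCC on x86. With the old ordering, the Windows branch could never win whenever HAVE_ATOMIC_BUILTINS was also defined, so the loop spun on a compare-and-swap instead of PAUSE; that is the "#ifdef maze" the commit message refers to.

#ifdef _WIN32
# include <windows.h>                     /* YieldProcessor() */
# define RELAX_CPU() YieldProcessor()
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
# define RELAX_CPU() __asm__ __volatile__ ("pause")
#else
# define RELAX_CPU() ((void) 0)           /* no spin hint available */
#endif

/* Busy-wait roughly proportional to `delay`, telling the CPU on every
   iteration that this is a spin-wait loop (imitates InnoDB's ut_delay()). */
static unsigned long toy_delay(unsigned long delay)
{
  unsigned long i, j= 0;
  for (i= 0; i < delay * 50; i++)
  {
    j+= i;                                /* keep the loop body non-trivial */
    RELAX_CPU();
  }
  return j;
}

int main(void)
{
  /* e.g. back off briefly before re-checking a contended lock word */
  unsigned long sink= toy_delay(10);
  return (int) (sink % 2);                /* use the result so the loop is kept */
}

The PAUSE hint tells the processor the loop is a spin-wait, reducing the mispredicted-exit penalty and the power drawn while spinning, whereas the compare-and-swap fallback executes a locked instruction with a full memory barrier on every spin, which is presumably what showed up as the extra ut_delay() samples in the profile.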