Commit 4b209dbf authored by Keith Randall's avatar Keith Randall

runtime: don't use REP;MOVSB if CPUID doesn't say it is fast

Only use REP;MOVSB if:
 1) The CPUID flag says it is fast, and
 2) The pointers are unaligned
Otherwise, use REP;MOVSQ.

Update #14630

Change-Id: I946b28b87880c08e5eed1ce2945016466c89db66
Reviewed-on: https://go-review.googlesource.com/21300Reviewed-by: default avatarNigel Tao <nigeltao@golang.org>
parent 1a9373bc
...@@ -54,6 +54,7 @@ bad_proc: // show that the program requires MMX. ...@@ -54,6 +54,7 @@ bad_proc: // show that the program requires MMX.
has_cpuid: has_cpuid:
MOVL $0, AX MOVL $0, AX
CPUID CPUID
MOVL AX, SI
CMPL AX, $0 CMPL AX, $0
JE nocpuinfo JE nocpuinfo
...@@ -69,6 +70,7 @@ has_cpuid: ...@@ -69,6 +70,7 @@ has_cpuid:
MOVB $1, runtime·lfenceBeforeRdtsc(SB) MOVB $1, runtime·lfenceBeforeRdtsc(SB)
notintel: notintel:
// Load EAX=1 cpuid flags
MOVL $1, AX MOVL $1, AX
CPUID CPUID
MOVL CX, AX // Move to global variable clobbers CX when generating PIC MOVL CX, AX // Move to global variable clobbers CX when generating PIC
...@@ -79,6 +81,14 @@ notintel: ...@@ -79,6 +81,14 @@ notintel:
TESTL $(1<<23), DX // MMX TESTL $(1<<23), DX // MMX
JZ bad_proc JZ bad_proc
// Load EAX=7/ECX=0 cpuid flags
CMPL SI, $7
JLT nocpuinfo
MOVL $7, AX
MOVL $0, CX
CPUID
MOVL BX, runtime·cpuid_ebx7(SB)
nocpuinfo: nocpuinfo:
// if there is an _cgo_init, call it to let it // if there is an _cgo_init, call it to let it
......
...@@ -28,6 +28,7 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 ...@@ -28,6 +28,7 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
// find out information about the processor we're on // find out information about the processor we're on
MOVQ $0, AX MOVQ $0, AX
CPUID CPUID
MOVQ AX, SI
CMPQ AX, $0 CMPQ AX, $0
JE nocpuinfo JE nocpuinfo
...@@ -42,15 +43,25 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 ...@@ -42,15 +43,25 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
JNE notintel JNE notintel
MOVB $1, runtime·lfenceBeforeRdtsc(SB) MOVB $1, runtime·lfenceBeforeRdtsc(SB)
notintel: notintel:
// Do nothing.
// Load EAX=1 cpuid flags
MOVQ $1, AX MOVQ $1, AX
CPUID CPUID
MOVL CX, runtime·cpuid_ecx(SB) MOVL CX, runtime·cpuid_ecx(SB)
MOVL DX, runtime·cpuid_edx(SB) MOVL DX, runtime·cpuid_edx(SB)
// Load EAX=7/ECX=0 cpuid flags
CMPQ SI, $7
JLT no7
MOVL $7, AX
MOVL $0, CX
CPUID
MOVL BX, runtime·cpuid_ebx7(SB)
no7:
// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1] // Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
MOVL runtime·cpuid_ecx(SB), CX
ANDL $0x18000000, CX // check for OSXSAVE and AVX bits ANDL $0x18000000, CX // check for OSXSAVE and AVX bits
CMPL CX, $0x18000000 CMPL CX, $0x18000000
JNE noavx JNE noavx
...@@ -61,12 +72,8 @@ notintel: ...@@ -61,12 +72,8 @@ notintel:
CMPL AX, $6 // Check for OS support of YMM registers CMPL AX, $6 // Check for OS support of YMM registers
JNE noavx JNE noavx
MOVB $1, runtime·support_avx(SB) MOVB $1, runtime·support_avx(SB)
MOVL $7, AX TESTL $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
MOVL $0, CX JEQ noavx2
CPUID
ANDL $0x20, BX // check for AVX2 bit
CMPL BX, $0x20
JNE noavx2
MOVB $1, runtime·support_avx2(SB) MOVB $1, runtime·support_avx2(SB)
JMP nocpuinfo JMP nocpuinfo
noavx: noavx:
......
...@@ -70,24 +70,29 @@ nosse2: ...@@ -70,24 +70,29 @@ nosse2:
* forward copy loop * forward copy loop
*/ */
forward: forward:
// If REP MOVSB isn't fast, don't use it
TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
JEQ fwdBy4
// Check alignment // Check alignment
MOVL SI, AX MOVL SI, AX
ORL DI, AX ORL DI, AX
TESTL $3, AX TESTL $3, AX
JNE unaligned_fwd JEQ fwdBy4
// Do 1 byte at a time
MOVL BX, CX
REP; MOVSB
RET
fwdBy4:
// Do 4 bytes at a time
MOVL BX, CX MOVL BX, CX
SHRL $2, CX SHRL $2, CX
ANDL $3, BX ANDL $3, BX
REP; MOVSL REP; MOVSL
JMP tail JMP tail
unaligned_fwd:
MOVL BX, CX
REP; MOVSB
RET
/* /*
* check overlap * check overlap
*/ */
......
...@@ -77,25 +77,29 @@ forward: ...@@ -77,25 +77,29 @@ forward:
CMPQ BX, $2048 CMPQ BX, $2048
JLS move_256through2048 JLS move_256through2048
// If REP MOVSB isn't fast, don't use it
TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
JEQ fwdBy8
// Check alignment // Check alignment
MOVQ SI, AX MOVL SI, AX
ORQ DI, AX ORL DI, AX
TESTL $7, AX TESTL $7, AX
JNE unaligned_fwd JEQ fwdBy8
// Do 1 byte at a time
MOVQ BX, CX
REP; MOVSB
RET
// Aligned - do 8 bytes at a time fwdBy8:
// Do 8 bytes at a time
MOVQ BX, CX MOVQ BX, CX
SHRQ $3, CX SHRQ $3, CX
ANDQ $7, BX ANDQ $7, BX
REP; MOVSQ REP; MOVSQ
JMP tail JMP tail
unaligned_fwd:
// Unaligned - do 1 byte at a time
MOVQ BX, CX
REP; MOVSB
RET
back: back:
/* /*
* check overlap * check overlap
......
...@@ -701,6 +701,7 @@ var ( ...@@ -701,6 +701,7 @@ var (
// Set on startup in asm_{x86,amd64}.s. // Set on startup in asm_{x86,amd64}.s.
cpuid_ecx uint32 cpuid_ecx uint32
cpuid_edx uint32 cpuid_edx uint32
cpuid_ebx7 uint32
lfenceBeforeRdtsc bool lfenceBeforeRdtsc bool
support_avx bool support_avx bool
support_avx2 bool support_avx2 bool
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment