Commit 59daa706 authored by Ma Ling, committed by H. Peter Anvin

x86, mem: Optimize memcpy by avoiding memory false dependence

All read operations after the allocation stage can run speculatively, while
all write operations run in program order; a read may execute ahead of an
older write if their addresses differ, otherwise it must wait for the write
to commit. However, the CPU does not compare every address bit, so a read
can fail to recognize that an address is different even when the two
accesses fall in different pages. For example, if %rsi is 0xf004 and %rdi
is 0xe008, the following sequence suffers a large latency penalty:
1. movq (%rsi),	%rax
2. movq %rax,	(%rdi)
3. movq 8(%rsi), %rax
4. movq %rax,	8(%rdi)

If %rsi and %rdi really were in the same memory page, there would be a true
read-after-write dependence, because instruction 2 writes offset 0x008 and
instruction 3 reads offset 0x00c, so the two accesses partially overlap.
Here they are actually in different pages and there is no real dependence,
but because the CPU does not compare every address bit it may assume they
are in the same page; instruction 3 then has to wait for instruction 2 to
drain its data from the write buffer into the cache before it can load,
and the time the read spends waiting is comparable to an mfence. We can
avoid this by reordering the operations as follows:

1. movq 8(%rsi), %rax
2. movq %rax,	8(%rdi)
3. movq (%rsi),	%rax
4. movq %rax,	(%rdi)

Now instruction 3 reads offset 0x004 while instruction 2 writes offset
0x010, so there is no dependence at all. On Core2 this gives a 1.83x
speedup over the original instruction sequence. In this patch we first
handle small sizes (less than 0x20 bytes), then jump to the appropriate
copy mode. Based on our micro-benchmark we see up to 2x improvement for
small copies of 1 to 127 bytes, and up to 1.5x improvement for 1024 bytes,
on Core i7. (We used our own micro-benchmark and will run further tests
according to your requirements.)
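
For reference, the false-dependence effect described above can be observed
from user space with a small timing harness along the following lines. This
is only an illustrative sketch, not part of the patch: the buffer offsets,
copy length, and iteration count are arbitrary choices, and it times
whatever memcpy the C library provides rather than the kernel routine.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define LEN   1024
#define ITERS (1L << 20)

static unsigned char pool[1 << 16];

int main(void)
{
	/*
	 * Place src and dst so their low 12 bits partially overlap
	 * (low offsets 0x004 vs 0x008), mimicking the
	 * %rsi = 0xf004 / %rdi = 0xe008 example from the changelog.
	 */
	unsigned char *src = pool + 0x1004;
	unsigned char *dst = pool + 0x4008;
	struct timespec t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (long i = 0; i < ITERS; i++) {
		/* Touch the source so the copy cannot be hoisted. */
		src[i & (LEN - 1)] = (unsigned char)i;
		memcpy(dst, src, LEN);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%.1f ns per copy\n",
	       ((t1.tv_sec - t0.tv_sec) * 1e9 +
	        (t1.tv_nsec - t0.tv_nsec)) / ITERS);
	return 0;
}

Shifting one of the two offsets so the low bits no longer overlap should
change the measured time if the false dependence is being hit.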
Signed-off-by: Ma Ling <ling.ma@intel.com>
LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
parent fdf42896
@@ -36,11 +36,9 @@ void *memmove(void *dest, const void *src, size_t n)
 			 "1" (src),
 			 "2" (dest)
 			 :"memory");
 	} else {
-		if((src + count) < dest)
-			return memcpy(dest, src, count);
+		if((src + n) < dest)
+			return memcpy(dest, src, n);
 		else
 			__asm__ __volatile__(
 				"std\n\t"
...
@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax

 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail

-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependece could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx

 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi

 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
+
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq

 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4

-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes

+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1

 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
...
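
Two details of the rewritten routine are worth spelling out. It dispatches
between a forward and a backward 32-byte loop by comparing the low bytes of
the two pointers (cmp %dil, %sil), jumping to the backward loop when that
comparison indicates the forward direction would hit the false dependence
described in the changelog. It also finishes the tail by copying the first
and last words of the remainder with possibly overlapping accesses instead
of looping byte by byte. A rough C model of the 8-to-15 byte tail case,
illustrative only and not the kernel source, looks like this:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Model of the .Lless_16bytes path above: read the first and last
 * 8-byte words of the region (they may overlap in the middle), then
 * write both, so any length from 8 to 15 is covered without a loop.
 */
static void copy_8_to_15(void *dst, const void *src, size_t n)
{
	uint64_t head, tail;

	memcpy(&head, src, 8);                        /* movq 0*8(%rsi), %r8        */
	memcpy(&tail, (const char *)src + n - 8, 8);  /* movq -1*8(%rsi, %rdx), %r9 */
	memcpy(dst, &head, 8);                        /* movq %r8, 0*8(%rdi)        */
	memcpy((char *)dst + n - 8, &tail, 8);        /* movq %r9, -1*8(%rdi, %rdx) */
}

The same overlapping-ends pattern covers the 16-to-31 byte case with two
pairs of 8-byte words and the 4-to-7 byte case with 4-byte words.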