Commit a6faa972, authored by Nicolas Pitre, committed by Russell King

[ARM PATCH] 1363/1: memcpy with preload support and other optimisations

Patch from Nicolas Pitre

This improves on what I did with patch #1362/1 by adding preloads for
architectures that support them.  On an XScale PXA255 this provides a 20%
performance gain.

Tested with all combinations of sizes and alignments.
parent 49744e17
...@@ -27,15 +27,15 @@ ...@@ -27,15 +27,15 @@
/* /*
* Prototype: void memcpy(void *to,const void *from,unsigned long n); * Prototype: void memcpy(void *to,const void *from,unsigned long n);
* ARM3: cant use memcopy here!!!
*/ */
ENTRY(memcpy) ENTRY(memcpy)
ENTRY(memmove) ENTRY(memmove)
ENTER ENTER
cmp r1, r0 cmp r1, r0
bcc 19f bcc 23f
subs r2, r2, #4 subs r2, r2, #4
blt 6f blt 6f
PLD( pld [r1, #0] )
ands ip, r0, #3 ands ip, r0, #3
bne 7f bne 7f
ands ip, r1, #3 ands ip, r1, #3
...@@ -43,29 +43,42 @@ ENTRY(memmove) ...@@ -43,29 +43,42 @@ ENTRY(memmove)
1: subs r2, r2, #8 1: subs r2, r2, #8
blt 5f blt 5f
subs r2, r2, #0x14 subs r2, r2, #20
blt 3f blt 4f
2: ldmia r1!,{r3 - r9, ip} PLD( pld [r1, #28] )
stmia r0!,{r3 - r9, ip} PLD( subs r2, r2, #64 )
PLD( blt 3f )
2: PLD( pld [r1, #60] )
PLD( pld [r1, #92] )
ldmia r1!, {r3 - r9, ip}
subs r2, r2, #32 subs r2, r2, #32
stmgeia r0!, {r3 - r9, ip}
ldmgeia r1!, {r3 - r9, ip}
subges r2, r2, #32
stmia r0!, {r3 - r9, ip}
bge 2b bge 2b
cmn r2, #16 3: PLD( ldmia r1!, {r3 - r9, ip} )
PLD( adds r2, r2, #32 )
PLD( stmgeia r0!, {r3 - r9, ip} )
PLD( ldmgeia r1!, {r3 - r9, ip} )
PLD( subges r2, r2, #32 )
PLD( stmia r0!, {r3 - r9, ip} )
4: cmn r2, #16
ldmgeia r1!, {r3 - r6} ldmgeia r1!, {r3 - r6}
subge r2, r2, #16
stmgeia r0!, {r3 - r6} stmgeia r0!, {r3 - r6}
subge r2, r2, #0x10 adds r2, r2, #20
3: adds r2, r2, #0x14 ldmgeia r1!, {r3 - r5}
4: ldmgeia r1!, {r3 - r5} subge r2, r2, #12
stmgeia r0!, {r3 - r5} stmgeia r0!, {r3 - r5}
subges r2, r2, #12
bge 4b
5: adds r2, r2, #8 5: adds r2, r2, #8
blt 6f blt 6f
subs r2, r2, #4 subs r2, r2, #4
ldrlt r3, [r1], #4 ldrlt r3, [r1], #4
ldmgeia r1!, {r4, r5} ldmgeia r1!, {r4, r5}
subge r2, r2, #4
strlt r3, [r0], #4 strlt r3, [r0], #4
stmgeia r0!, {r4, r5} stmgeia r0!, {r4, r5}
subge r2, r2, #4
6: adds r2, r2, #4 6: adds r2, r2, #4
EXITEQ EXITEQ
...@@ -94,13 +107,19 @@ ENTRY(memmove) ...@@ -94,13 +107,19 @@ ENTRY(memmove)
8: bic r1, r1, #3 8: bic r1, r1, #3
ldr r7, [r1], #4 ldr r7, [r1], #4
cmp ip, #2 cmp ip, #2
bgt 15f bgt 18f
beq 11f beq 13f
cmp r2, #12 cmp r2, #12
blt 10f blt 11f
PLD( pld [r1, #12] )
sub r2, r2, #12 sub r2, r2, #12
9: mov r3, r7, pull #8 PLD( subs r2, r2, #32 )
PLD( blt 10f )
PLD( pld [r1, #28] )
9: PLD( pld [r1, #44] )
10: mov r3, r7, pull #8
ldmia r1!, {r4 - r7} ldmia r1!, {r4 - r7}
subs r2, r2, #16
orr r3, r3, r4, push #24 orr r3, r3, r4, push #24
mov r4, r4, pull #8 mov r4, r4, pull #8
orr r4, r4, r5, push #24 orr r4, r4, r5, push #24
...@@ -109,24 +128,32 @@ ENTRY(memmove) ...@@ -109,24 +128,32 @@ ENTRY(memmove)
mov r6, r6, pull #8 mov r6, r6, pull #8
orr r6, r6, r7, push #24 orr r6, r6, r7, push #24
stmia r0!, {r3 - r6} stmia r0!, {r3 - r6}
subs r2, r2, #16
bge 9b bge 9b
PLD( cmn r2, #32 )
PLD( bge 10b )
PLD( add r2, r2, #32 )
adds r2, r2, #12 adds r2, r2, #12
blt 100f blt 12f
10: mov r3, r7, pull #8 11: mov r3, r7, pull #8
ldr r7, [r1], #4 ldr r7, [r1], #4
subs r2, r2, #4 subs r2, r2, #4
orr r3, r3, r7, push #24 orr r3, r3, r7, push #24
str r3, [r0], #4 str r3, [r0], #4
bge 10b bge 11b
100: sub r1, r1, #3 12: sub r1, r1, #3
b 6b b 6b
11: cmp r2, #12 13: cmp r2, #12
blt 13f /* */ blt 16f
PLD( pld [r1, #12] )
sub r2, r2, #12 sub r2, r2, #12
12: mov r3, r7, pull #16 PLD( subs r2, r2, #32 )
PLD( blt 15f )
PLD( pld [r1, #28] )
14: PLD( pld [r1, #44] )
15: mov r3, r7, pull #16
ldmia r1!, {r4 - r7} ldmia r1!, {r4 - r7}
subs r2, r2, #16
orr r3, r3, r4, push #16 orr r3, r3, r4, push #16
mov r4, r4, pull #16 mov r4, r4, pull #16
orr r4, r4, r5, push #16 orr r4, r4, r5, push #16
...@@ -135,24 +162,32 @@ ENTRY(memmove) ...@@ -135,24 +162,32 @@ ENTRY(memmove)
mov r6, r6, pull #16 mov r6, r6, pull #16
orr r6, r6, r7, push #16 orr r6, r6, r7, push #16
stmia r0!, {r3 - r6} stmia r0!, {r3 - r6}
subs r2, r2, #16 bge 14b
bge 12b PLD( cmn r2, #32 )
PLD( bge 15b )
PLD( add r2, r2, #32 )
adds r2, r2, #12 adds r2, r2, #12
blt 14f blt 17f
13: mov r3, r7, pull #16 16: mov r3, r7, pull #16
ldr r7, [r1], #4 ldr r7, [r1], #4
subs r2, r2, #4 subs r2, r2, #4
orr r3, r3, r7, push #16 orr r3, r3, r7, push #16
str r3, [r0], #4 str r3, [r0], #4
bge 13b bge 16b
14: sub r1, r1, #2 17: sub r1, r1, #2
b 6b b 6b
15: cmp r2, #12 18: cmp r2, #12
blt 17f blt 21f
PLD( pld [r1, #12] )
sub r2, r2, #12 sub r2, r2, #12
16: mov r3, r7, pull #24 PLD( subs r2, r2, #32 )
PLD( blt 20f )
PLD( pld [r1, #28] )
19: PLD( pld [r1, #44] )
20: mov r3, r7, pull #24
ldmia r1!, {r4 - r7} ldmia r1!, {r4 - r7}
subs r2, r2, #16
orr r3, r3, r4, push #8 orr r3, r3, r4, push #8
mov r4, r4, pull #24 mov r4, r4, pull #24
orr r4, r4, r5, push #8 orr r4, r4, r5, push #8
...@@ -161,55 +196,72 @@ ENTRY(memmove) ...@@ -161,55 +196,72 @@ ENTRY(memmove)
mov r6, r6, pull #24 mov r6, r6, pull #24
orr r6, r6, r7, push #8 orr r6, r6, r7, push #8
stmia r0!, {r3 - r6} stmia r0!, {r3 - r6}
subs r2, r2, #16 bge 19b
bge 16b PLD( cmn r2, #32 )
PLD( bge 20b )
PLD( add r2, r2, #32 )
adds r2, r2, #12 adds r2, r2, #12
blt 18f blt 22f
17: mov r3, r7, pull #24 21: mov r3, r7, pull #24
ldr r7, [r1], #4 ldr r7, [r1], #4
subs r2, r2, #4 subs r2, r2, #4
orr r3, r3, r7, push #8 orr r3, r3, r7, push #8
str r3, [r0], #4 str r3, [r0], #4
bge 17b bge 21b
18: sub r1, r1, #1 22: sub r1, r1, #1
b 6b b 6b
19: add r1, r1, r2 23: add r1, r1, r2
add r0, r0, r2 add r0, r0, r2
subs r2, r2, #4 subs r2, r2, #4
blt 24f blt 29f
PLD( pld [r1, #-4] )
ands ip, r0, #3 ands ip, r0, #3
bne 25f bne 30f
ands ip, r1, #3 ands ip, r1, #3
bne 26f bne 31f
20: subs r2, r2, #8 24: subs r2, r2, #8
blt 23f blt 28f
subs r2, r2, #0x14 subs r2, r2, #20
blt 22f blt 27f
21: ldmdb r1!, {r3 - r9, ip} PLD( pld [r1, #-32] )
stmdb r0!, {r3 - r9, ip} PLD( subs r2, r2, #64 )
PLD( blt 26f )
25: PLD( pld [r1, #-64] )
PLD( pld [r1, #-96] )
ldmdb r1!, {r3 - r9, ip}
subs r2, r2, #32 subs r2, r2, #32
bge 21b stmgedb r0!, {r3 - r9, ip}
22: cmn r2, #16 ldmgedb r1!, {r3 - r9, ip}
subges r2, r2, #32
stmdb r0!, {r3 - r9, ip}
bge 25b
26: PLD( ldmdb r1!, {r3 - r9, ip} )
PLD( adds r2, r2, #32 )
PLD( stmgedb r0!, {r3 - r9, ip} )
PLD( ldmgedb r1!, {r3 - r9, ip} )
PLD( subges r2, r2, #32 )
PLD( stmdb r0!, {r3 - r9, ip} )
27: cmn r2, #16
ldmgedb r1!, {r3 - r6} ldmgedb r1!, {r3 - r6}
stmgedb r0!, {r3 - r6}
subge r2, r2, #16 subge r2, r2, #16
stmgedb r0!, {r3 - r6}
adds r2, r2, #20 adds r2, r2, #20
ldmgedb r1!, {r3 - r5} ldmgedb r1!, {r3 - r5}
stmgedb r0!, {r3 - r5}
subge r2, r2, #12 subge r2, r2, #12
23: adds r2, r2, #8 stmgedb r0!, {r3 - r5}
blt 24f 28: adds r2, r2, #8
blt 29f
subs r2, r2, #4 subs r2, r2, #4
ldrlt r3, [r1, #-4]! ldrlt r3, [r1, #-4]!
ldmgedb r1!, {r4, r5} ldmgedb r1!, {r4, r5}
subge r2, r2, #4
strlt r3, [r0, #-4]! strlt r3, [r0, #-4]!
stmgedb r0!, {r4, r5} stmgedb r0!, {r4, r5}
subge r2, r2, #4
24: adds r2, r2, #4 29: adds r2, r2, #4
EXITEQ EXITEQ
cmp r2, #2 cmp r2, #2
ldrb r3, [r1, #-1]! ldrb r3, [r1, #-1]!
...@@ -220,7 +272,7 @@ ENTRY(memmove) ...@@ -220,7 +272,7 @@ ENTRY(memmove)
strgtb r5, [r0, #-1]! strgtb r5, [r0, #-1]!
EXIT EXIT
25: cmp ip, #2 30: cmp ip, #2
ldrb r3, [r1, #-1]! ldrb r3, [r1, #-1]!
ldrgeb r4, [r1, #-1]! ldrgeb r4, [r1, #-1]!
ldrgtb r5, [r1, #-1]! ldrgtb r5, [r1, #-1]!
...@@ -228,20 +280,26 @@ ENTRY(memmove) ...@@ -228,20 +280,26 @@ ENTRY(memmove)
strgeb r4, [r0, #-1]! strgeb r4, [r0, #-1]!
strgtb r5, [r0, #-1]! strgtb r5, [r0, #-1]!
subs r2, r2, ip subs r2, r2, ip
blt 24b blt 29b
ands ip, r1, #3 ands ip, r1, #3
beq 20b beq 24b
26: bic r1, r1, #3 31: bic r1, r1, #3
ldr r3, [r1], #0 ldr r3, [r1], #0
cmp ip, #2 cmp ip, #2
blt 34f blt 41f
beq 30f beq 36f
cmp r2, #12 cmp r2, #12
blt 28f blt 34f
PLD( pld [r1, #-16] )
sub r2, r2, #12 sub r2, r2, #12
27: mov r7, r3, push #8 PLD( subs r2, r2, #32 )
PLD( blt 33f )
PLD( pld [r1, #-32] )
32: PLD( pld [r1, #-48] )
33: mov r7, r3, push #8
ldmdb r1!, {r3, r4, r5, r6} ldmdb r1!, {r3, r4, r5, r6}
subs r2, r2, #16
orr r7, r7, r6, pull #24 orr r7, r7, r6, pull #24
mov r6, r6, push #8 mov r6, r6, push #8
orr r6, r6, r5, pull #24 orr r6, r6, r5, pull #24
...@@ -250,24 +308,32 @@ ENTRY(memmove) ...@@ -250,24 +308,32 @@ ENTRY(memmove)
mov r4, r4, push #8 mov r4, r4, push #8
orr r4, r4, r3, pull #24 orr r4, r4, r3, pull #24
stmdb r0!, {r4, r5, r6, r7} stmdb r0!, {r4, r5, r6, r7}
subs r2, r2, #16 bge 32b
bge 27b PLD( cmn r2, #32 )
PLD( bge 33b )
PLD( add r2, r2, #32 )
adds r2, r2, #12 adds r2, r2, #12
blt 29f blt 35f
28: mov ip, r3, push #8 34: mov ip, r3, push #8
ldr r3, [r1, #-4]! ldr r3, [r1, #-4]!
subs r2, r2, #4 subs r2, r2, #4
orr ip, ip, r3, pull #24 orr ip, ip, r3, pull #24
str ip, [r0, #-4]! str ip, [r0, #-4]!
bge 28b bge 34b
29: add r1, r1, #3 35: add r1, r1, #3
b 24b b 29b
30: cmp r2, #12 36: cmp r2, #12
blt 32f blt 39f
PLD( pld [r1, #-16] )
sub r2, r2, #12 sub r2, r2, #12
31: mov r7, r3, push #16 PLD( subs r2, r2, #32 )
PLD( blt 38f )
PLD( pld [r1, #-32] )
37: PLD( pld [r1, #-48] )
38: mov r7, r3, push #16
ldmdb r1!, {r3, r4, r5, r6} ldmdb r1!, {r3, r4, r5, r6}
subs r2, r2, #16
orr r7, r7, r6, pull #16 orr r7, r7, r6, pull #16
mov r6, r6, push #16 mov r6, r6, push #16
orr r6, r6, r5, pull #16 orr r6, r6, r5, pull #16
...@@ -276,24 +342,32 @@ ENTRY(memmove) ...@@ -276,24 +342,32 @@ ENTRY(memmove)
mov r4, r4, push #16 mov r4, r4, push #16
orr r4, r4, r3, pull #16 orr r4, r4, r3, pull #16
stmdb r0!, {r4, r5, r6, r7} stmdb r0!, {r4, r5, r6, r7}
subs r2, r2, #16 bge 37b
bge 31b PLD( cmn r2, #32 )
PLD( bge 38b )
PLD( add r2, r2, #32 )
adds r2, r2, #12 adds r2, r2, #12
blt 33f blt 40f
32: mov ip, r3, push #16 39: mov ip, r3, push #16
ldr r3, [r1, #-4]! ldr r3, [r1, #-4]!
subs r2, r2, #4 subs r2, r2, #4
orr ip, ip, r3, pull #16 orr ip, ip, r3, pull #16
str ip, [r0, #-4]! str ip, [r0, #-4]!
bge 32b bge 39b
33: add r1, r1, #2 40: add r1, r1, #2
b 24b b 29b
34: cmp r2, #12 41: cmp r2, #12
blt 36f blt 44f
PLD( pld [r1, #-16] )
sub r2, r2, #12 sub r2, r2, #12
35: mov r7, r3, push #24 PLD( subs r2, r2, #32 )
PLD( blt 43f )
PLD( pld [r1, #-32] )
42: PLD( pld [r1, #-48] )
43: mov r7, r3, push #24
ldmdb r1!, {r3, r4, r5, r6} ldmdb r1!, {r3, r4, r5, r6}
subs r2, r2, #16
orr r7, r7, r6, pull #8 orr r7, r7, r6, pull #8
mov r6, r6, push #24 mov r6, r6, push #24
orr r6, r6, r5, pull #8 orr r6, r6, r5, pull #8
...@@ -302,17 +376,18 @@ ENTRY(memmove) ...@@ -302,17 +376,18 @@ ENTRY(memmove)
mov r4, r4, push #24 mov r4, r4, push #24
orr r4, r4, r3, pull #8 orr r4, r4, r3, pull #8
stmdb r0!, {r4, r5, r6, r7} stmdb r0!, {r4, r5, r6, r7}
subs r2, r2, #16 bge 42b
bge 35b PLD( cmn r2, #32 )
PLD( bge 43b )
PLD( add r2, r2, #32 )
adds r2, r2, #12 adds r2, r2, #12
blt 37f blt 45f
36: mov ip, r3, push #24 44: mov ip, r3, push #24
ldr r3, [r1, #-4]! ldr r3, [r1, #-4]!
subs r2, r2, #4 subs r2, r2, #4
orr ip, ip, r3, pull #8 orr ip, ip, r3, pull #8
str ip, [r0, #-4]! str ip, [r0, #-4]!
bge 36b bge 44b
37: add r1, r1, #1 45: add r1, r1, #1
b 24b b 29b
.align
...@@ -26,3 +26,13 @@ ...@@ -26,3 +26,13 @@
#define push lsr #define push lsr
#define byte(x) ((3-x)*8) #define byte(x) ((3-x)*8)
#endif #endif
/*
* Data preload for architectures that support it
*
* PLD(code...) wraps a fragment of code that should only be assembled
* when data preloading is available.  The availability test is made at
* preprocessing time on __LINUX_ARM_ARCH__:
*
*   - __LINUX_ARM_ARCH__ >= 5: PLD(x) expands to x unchanged, so the
*     wrapped code is emitted.
*   - otherwise: PLD(x) expands to nothing, so the wrapped code
*     disappears from the output entirely.
*
* The parameter is declared as "code..." (a named variadic parameter),
* so an argument that itself contains commas, such as
* PLD( pld [r1, #0] ), is accepted as a single fragment.
*/
#if __LINUX_ARM_ARCH__ >= 5
#define PLD(code...) code
#else
#define PLD(code...)
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment