Commit 3cb41be8 authored by Carlos Eduardo Seo's avatar Carlos Eduardo Seo Committed by Lynn Boger

math/big: improve performance for AddMulVVW and mulAddVWW for ppc64x

This change adds a better implementation in asm for AddMulVVW and
mulAddVWW for ppc64x, with speedups up to 1.54x.

benchmark                       old ns/op     new ns/op     delta
BenchmarkAddMulVVW/1-8          6.58          6.29          -4.41%
BenchmarkAddMulVVW/2-8          7.43          7.25          -2.42%
BenchmarkAddMulVVW/3-8          8.95          8.15          -8.94%
BenchmarkAddMulVVW/4-8          10.1          9.37          -7.23%
BenchmarkAddMulVVW/5-8          12.0          10.7          -10.83%
BenchmarkAddMulVVW/10-8         22.1          20.1          -9.05%
BenchmarkAddMulVVW/100-8        211           154           -27.01%
BenchmarkAddMulVVW/1000-8       2046          1450          -29.13%
BenchmarkAddMulVVW/10000-8      20407         14793         -27.51%
BenchmarkAddMulVVW/100000-8     223857        145548        -34.98%

benchmark                       old MB/s     new MB/s     speedup
BenchmarkAddMulVVW/1-8          9719.88      10175.79     1.05x
BenchmarkAddMulVVW/2-8          17233.97     17657.54     1.02x
BenchmarkAddMulVVW/3-8          21446.05     23550.49     1.10x
BenchmarkAddMulVVW/4-8          25375.70     27334.33     1.08x
BenchmarkAddMulVVW/5-8          26650.52     30029.34     1.13x
BenchmarkAddMulVVW/10-8         28984.29     31833.68     1.10x
BenchmarkAddMulVVW/100-8        30249.41     41531.69     1.37x
BenchmarkAddMulVVW/1000-8       31273.35     44108.54     1.41x
BenchmarkAddMulVVW/10000-8      31360.47     43263.54     1.38x
BenchmarkAddMulVVW/100000-8     28589.58     43971.66     1.54x

Change-Id: I8a8105d4da3592afdef3125757a99f378a0254bb
Reviewed-on: https://go-review.googlesource.com/53931
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarLynn Boger <laboger@linux.vnet.ibm.com>
parent 92cfd07a
...@@ -98,103 +98,59 @@ TEXT ·shrVU(SB), NOSPLIT, $0 ...@@ -98,103 +98,59 @@ TEXT ·shrVU(SB), NOSPLIT, $0
// func mulAddVWW(z, x []Word, y, r Word) (c Word) // func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 MOVD y+48(FP), R9 // R9 = y
MOVD r+56(FP), R4 // c = r MOVD r+56(FP), R4 // R4 = r = c
MOVD z_len+8(FP), R11 MOVD z_len+8(FP), R11 // R11 = z_len
MOVD $0, R3 // i = 0
MOVD $8, R18
MOVD $1, R19
JMP e5
l5:
MULLD R18, R3, R5
MOVD (R8)(R5), R20
MULLD R9, R20, R6
MULHDU R9, R20, R7
ADDC R4, R6
ADDZE R7
MOVD R6, (R10)(R5)
MOVD R7, R4
ADD R19, R3
e5: MOVD R0, R3 // R3 will be the index register
CMP R3, R11 CMP R0, R11
BLT l5 MOVD R11, CTR // Initialize loop counter
BEQ done
loop:
MOVD (R8)(R3), R20 // x[i]
MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
ADDC R4, R6 // Compute sum for z1 and z0
ADDZE R7
MOVD R6, (R10)(R3) // z[i]
MOVD R7, R4 // c
ADD $8, R3
BC 16, 0, loop // bdnz
done:
MOVD R4, c+64(FP) MOVD R4, c+64(FP)
RET RET
// func addMulVVW(z, x []Word, y Word) (c Word) // func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB), NOSPLIT, $0 TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 MOVD y+48(FP), R9 // R9 = y
MOVD z_len+8(FP), R22 MOVD z_len+8(FP), R22 // R22 = z_len
MOVD $0, R5 // i = 0
MOVD $0, R4 // c = 0
MOVD $8, R28
MOVD $-2, R23
AND R22, R23 // mask the last bit of z.len
MOVD $2, R24
CMP R23, R24
BGE unrolled
JMP end
unrolled:
MOVD $8, R19 // no (RA)(RB*8) on power
MULLD R5, R19
MOVD (R10)(R19), R11 // R11 = z[i]
MOVD (R8)(R19), R16 // R16 = x[i]
ADD R28, R19, R25
MOVD (R10)(R25), R17
MOVD (R8)(R25), R18
MULLD R9, R16, R12
MULHDU R9, R16, R14
MULLD R9, R18, R6
MULHDU R9, R18, R7
ADDC R4, R12
ADDZE R14
ADDC R11, R12 // z[i] = (x[i]*y) + z[i] + carry
ADDZE R14 // carry = high order bits + add carry
MOVD R12, (R10)(R19)
ADDC R14, R6
ADDZE R7
ADDC R17, R6
ADDZE R7
MOVD R6, (R10)(R25)
MOVD R7, R4
ADD R24, R5 MOVD R0, R3 // R3 will be the index register
CMP R5, R23 CMP R0, R22
BLT unrolled MOVD R0, R4 // R4 = c = 0
JMP end MOVD R22, CTR // Initialize loop counter
BEQ done
loop: loop:
MOVD $8, R19 MOVD (R8)(R3), R20 // Load x[i]
MULLD R5, R19 MOVD (R10)(R3), R21 // Load z[i]
MOVD (R10)(R19), R11 MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
MOVD (R8)(R19), R16 MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
MULLD R9, R16, R12 ADDC R21, R6 // R6 = z0
MULHDU R9, R16, R14 ADDZE R7 // R7 = z1
ADDC R4, R12 ADDC R4, R6 // R6 = z0 + c + 0
ADDZE R14 ADDZE R7, R4 // c += z1
ADDC R11, R12 MOVD R6, (R10)(R3) // Store z[i]
ADDZE R14 ADD $8, R3
MOVD R12, (R10)(R19) BC 16, 0, loop // bdnz
MOVD R14, R4
MOVD $1, R15
ADD R15, R5
end:
CMP R5, R22
BLT loop
done:
MOVD R4, c+56(FP) MOVD R4, c+56(FP)
RET RET
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment