Commit 316f81bb authored by David G. Andersen's avatar David G. Andersen Committed by Robert Griesemer

math/big: assembly versions of bitLen for x86-64, 386, and ARM.

Roughly 2x speedup for the internal bitLen function in arith.go.  Added TestWordBitLen test.

Performance differences against the new version of
bitLen generic:

x86-64 Macbook pro (current tip):

benchmark                old ns/op    new ns/op    delta
big.BenchmarkBitLen0             6            4  -37.40%
big.BenchmarkBitLen1             6            2  -51.79%
big.BenchmarkBitLen2             6            2  -65.04%
big.BenchmarkBitLen3             6            2  -66.10%
big.BenchmarkBitLen4             6            2  -60.96%
big.BenchmarkBitLen5             6            2  -55.80%
big.BenchmarkBitLen8             6            2  -56.19%
big.BenchmarkBitLen9             6            2  -64.73%
big.BenchmarkBitLen16            7            2  -68.84%
big.BenchmarkBitLen17            6            2  -67.11%
big.BenchmarkBitLen31            7            2  -61.57%

386 Intel Atom (current tip):
benchmark                old ns/op    new ns/op    delta
big.BenchmarkBitLen0            23           20  -13.04%
big.BenchmarkBitLen1            23           20  -14.77%
big.BenchmarkBitLen2            24           20  -19.28%
big.BenchmarkBitLen3            25           20  -21.57%
big.BenchmarkBitLen4            24           20  -16.94%
big.BenchmarkBitLen5            25           20  -20.78%
big.BenchmarkBitLen8            24           20  -19.28%
big.BenchmarkBitLen9            25           20  -20.47%
big.BenchmarkBitLen16           26           20  -23.37%
big.BenchmarkBitLen17           26           20  -25.09%
big.BenchmarkBitLen31           32           20  -35.51%

ARM v5 SheevaPlug, previous weekly patched with bitLen:
benchmark                old ns/op    new ns/op    delta
big.BenchmarkBitLen0            50           29  -41.73%
big.BenchmarkBitLen1            51           29  -42.75%
big.BenchmarkBitLen2            59           29  -50.08%
big.BenchmarkBitLen3            60           29  -50.75%
big.BenchmarkBitLen4            59           29  -50.08%
big.BenchmarkBitLen5            60           29  -50.75%
big.BenchmarkBitLen8            59           29  -50.08%
big.BenchmarkBitLen9            60           29  -50.75%
big.BenchmarkBitLen16           69           29  -57.35%
big.BenchmarkBitLen17           70           29  -57.89%
big.BenchmarkBitLen31           95           29  -69.07%

R=golang-dev, minux.ma, gri
CC=golang-dev
https://golang.org/cl/5574054
parent d645adc3
......@@ -79,7 +79,7 @@ func mulAddWWW_g(x, y, c Word) (z1, z0 Word) {
}
// Length of x in bits.
func bitLen(x Word) (n int) {
func bitLen_g(x Word) (n int) {
for ; x >= 0x8000; x >>= 16 {
n += 16
}
......
......@@ -245,7 +245,7 @@ E6: CMPL BX, $0 // i < 0
RET
// divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),7,$0
MOVL z+0(FP), DI
MOVL xn+12(FP), DX // r = xn
......@@ -263,3 +263,14 @@ E7: SUBL $1, BX // i--
MOVL DX, r+32(FP)
RET
// func bitLen(x Word) (n int)
TEXT ·bitLen(SB),7,$0
BSRL x+0(FP), AX
JZ Z1
INCL AX
MOVL AX, n+4(FP)
RET
Z1: MOVL $0, n+4(FP)
RET
......@@ -243,7 +243,7 @@ E6: CMPQ BX, R11 // i < n
RET
// divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),7,$0
MOVQ z+0(FP), R10
MOVQ xn+16(FP), DX // r = xn
......@@ -261,3 +261,14 @@ E7: SUBL $1, BX // i--
MOVQ DX, r+48(FP)
RET
// func bitLen(x Word) (n int)
TEXT ·bitLen(SB),7,$0
BSRQ x+0(FP), AX
JZ Z1
INCQ AX
MOVQ AX, n+8(FP)
RET
Z1: MOVQ $0, n+8(FP)
RET
......@@ -290,7 +290,7 @@ E9:
RET
// divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word)
TEXT ·divWVW(SB),7,$0
// ARM has no multiword division, so use portable code.
B ·divWVW_g(SB)
......@@ -310,3 +310,12 @@ TEXT ·mulWW(SB),7,$0
MOVW R4, z1+8(FP)
MOVW R3, z0+12(FP)
RET
// func bitLen(x Word) (n int)
TEXT ·bitLen(SB),7,$0
MOVW x+0(FP), R0
WORD $0xe16f0f10 // CLZ R0, R0 (count leading zeros)
MOVW $32, R1
SUB.S R0, R1
MOVW R1, n+4(FP)
RET
......@@ -16,3 +16,4 @@ func shrVU(z, x []Word, s uint) (c Word)
func mulAddVWW(z, x []Word, y, r Word) (c Word)
func addMulVVW(z, x []Word, y Word) (c Word)
func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
func bitLen(x Word) (n int)
......@@ -334,6 +334,29 @@ func TestMulAddWWW(t *testing.T) {
}
}
func TestWordBitLen(t *testing.T) {
// Test every possible output of bitLen with the high bit set
// and then with all bits below max set
z := bitLen(0)
if z != 0 {
t.Errorf("0 got %d want 0", z)
}
x := Word(1) // Will be ...00010000...
y := Word(1) // Will be ...00011111...
for i := 1; i <= _W; i++ {
z = bitLen(x)
if z != i {
t.Errorf("%x got %d want %d", x, z, i)
}
z = bitLen(y)
if z != i {
t.Errorf("%x got %d want %d", y, z, i)
}
x <<= 1
y = (y << 1) | 0x1
}
}
// runs b.N iterations of bitLen called on a Word containing (1 << nbits)-1.
func benchmarkBitLenN(b *testing.B, nbits uint) {
testword := Word((uint64(1) << nbits) - 1)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment