crypto/rc4: faster amd64 implementation

XOR key into data 128 bits at a time instead of 64 bits and pipeline half of state loads. Rotate loop to allow single-register indexing for state[i]. On a MacBookPro10,2 (Core i5): benchmark old ns/op new ns/op delta BenchmarkRC4_128 412 224 -45.63% BenchmarkRC4_1K 3179 1613 -49.26% BenchmarkRC4_8K 25223 12545 -50.26% benchmark old MB/s new MB/s speedup BenchmarkRC4_128 310.51 570.42 1.84x BenchmarkRC4_1K 322.09 634.48 1.97x BenchmarkRC4_8K 320.97 645.32 2.01x For comparison, on the same machine, openssl 0.9.8r reports its rc4 speed as somewhat under 350 MB/s for both 1K and 8K (it is operating 64 bits at a time). On an Intel Xeon E5520: benchmark old ns/op new ns/op delta BenchmarkRC4_128 418 259 -38.04% BenchmarkRC4_1K 3200 1884 -41.12% BenchmarkRC4_8K 25173 14529 -42.28% benchmark old MB/s new MB/s speedup BenchmarkRC4_128 306.04 492.48 1.61x BenchmarkRC4_1K 319.93 543.26 1.70x BenchmarkRC4_8K 321.61 557.20 1.73x For comparison, on the same machine, openssl 1.0.1 reports its rc4 speed as 587 MB/s for 1K and 601 MB/s for 8K. R=agl CC=golang-dev https://golang.org/cl/7865046

crypto/rc4: faster amd64 implementation
XOR key into data 128 bits at a time instead of 64 bits and pipeline half of state loads. Rotate loop to allow single-register indexing for state[i]. On a MacBookPro10,2 (Core i5): benchmark old ns/op new ns/op delta BenchmarkRC4_128 412 224 -45.63% BenchmarkRC4_1K 3179 1613 -49.26% BenchmarkRC4_8K 25223 12545 -50.26% benchmark old MB/s new MB/s speedup BenchmarkRC4_128 310.51 570.42 1.84x BenchmarkRC4_1K 322.09 634.48 1.97x BenchmarkRC4_8K 320.97 645.32 2.01x For comparison, on the same machine, openssl 0.9.8r reports its rc4 speed as somewhat under 350 MB/s for both 1K and 8K (it is operating 64 bits at a time). On an Intel Xeon E5520: benchmark old ns/op new ns/op delta BenchmarkRC4_128 418 259 -38.04% BenchmarkRC4_1K 3200 1884 -41.12% BenchmarkRC4_8K 25173 14529 -42.28% benchmark old MB/s new MB/s speedup BenchmarkRC4_128 306.04 492.48 1.61x BenchmarkRC4_1K 319.93 543.26 1.70x BenchmarkRC4_8K 321.61 557.20 1.73x For comparison, on the same machine, openssl 1.0.1 reports its rc4 speed as 587 MB/s for 1K and 601 MB/s for 8K. R=agl CC=golang-dev https://golang.org/cl/7865046
b505ff62 · Russ Cox · d04ac4b0 · b505ff62 · b505ff62 · b505ff62
Commit b505ff62 authored Mar 21, 2013 by Russ Cox
7 changed files
--- a/src/cmd/6l/optab.c
+++ b/src/cmd/6l/optab.c
@@ -537,6 +537,11 @@ uchar	yextrw[] =
 	Yxr,	Yrl,	Zibm_r,	2,
 	0
 };
+uchar	yinsrw[] =
+{
+	Yml,	Yxr,	Zibm_r,	2,
+	0
+};
 uchar	yinsr[] =
 {
 	Ymm,	Yxr,	Zibm_r,	3,
@@ -992,7 +997,7 @@ Optab optab[] =
 	{ APFRSQRT,	ymfp,	Px, 0x97 },
 	{ APFSUB,	ymfp,	Px, 0x9a },
 	{ APFSUBR,	ymfp,	Px, 0xaa },
-	{ APINSRW,	yextrw,	Pq, 0xc4,(00) },
+	{ APINSRW,	yinsrw,	Pq, 0xc4,(00) },
 	{ APINSRD,	yinsr,	Pq, 0x3a, 0x22, (00) },
 	{ APINSRQ,	yinsr,	Pq3, 0x3a, 0x22, (00) },
 	{ APMADDWL,	ymm,	Py, 0xf5,Pe,0xf5 },

--- a/src/pkg/crypto/rc4/rc4.go
+++ b/src/pkg/crypto/rc4/rc4.go
@@ -13,7 +13,7 @@ import "strconv"

 // A Cipher is an instance of RC4 using a particular key.
 type Cipher struct {
-	s    [256]byte
+	s    [256]uint32
 	i, j uint8
 }

@@ -32,11 +32,11 @@ func NewCipher(key []byte) (*Cipher, error) {
 	}
 	var c Cipher
 	for i := 0; i < 256; i++ {
-		c.s[i] = uint8(i)
+		c.s[i] = uint32(i)
 	}
 	var j uint8 = 0
 	for i := 0; i < 256; i++ {
-		j += c.s[i] + key[i%k]
+		j += uint8(c.s[i]) + key[i%k]
 		c.s[i], c.s[j] = c.s[j], c.s[i]
 	}
 	return &c, nil

--- a/src/pkg/crypto/rc4/rc4_386.s
+++ b/src/pkg/crypto/rc4/rc4_386.s
@@ -20,19 +20,19 @@ loop:
 	INCB AX

 	// j += c.s[i]
-	MOVBLZX (BP)(AX*1), DX
+	MOVBLZX (BP)(AX*4), DX
 	ADDB DX, BX
 	MOVBLZX BX, BX

 	// c.s[i], c.s[j] = c.s[j], c.s[i]
-	MOVBLZX (BP)(BX*1), CX
-	MOVB CX, (BP)(AX*1)
-	MOVB DX, (BP)(BX*1)
+	MOVBLZX (BP)(BX*4), CX
+	MOVB CX, (BP)(AX*4)
+	MOVB DX, (BP)(BX*4)

 	// *dst = *src ^ c.s[c.s[i]+c.s[j]]
 	ADDB DX, CX
 	MOVBLZX CX, CX
-	MOVB (BP)(CX*1), CX
+	MOVB (BP)(CX*4), CX
 	XORB (SI), CX
 	MOVBLZX CX, CX
 	MOVB CX, (DI)

--- a/src/pkg/crypto/rc4/rc4_amd64.s
+++ b/src/pkg/crypto/rc4/rc4_amd64.s
 // Original source:
 //	http://www.zorinaq.com/papers/rc4-amd64.html
 //	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
+
+// Local modifications:
 //
 // Transliterated from GNU to 6a assembly syntax by the Go authors.
 // The comments and spacing are from the original.
-
+//
 // The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
+//
+// The original code accumulated 64 bits of key stream in an integer
+// register and then XOR'ed the key stream into the data 8 bytes at a time.
+// Modified to accumulate 128 bits of key stream into an XMM register
+// and then XOR the key stream into the data 16 bytes at a time.
+// Approximately doubles throughput.

 // NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
 // but makes the code run 2.0x slower on Xeon.
@@ -38,59 +46,123 @@ TEXT ·xorKeyStream(SB),7,$0
 	MOVQ	yp+40(FP),	AX
 	MOVBQZX	0(AX),		DX		// y = *yp

-	INCQ	CX				// x++
-	ANDQ	$255,		CX		// x &= 0xff
-	LEAQ	-8(BX)(SI*1),	BX		// rbx = in+len-8
-	MOVQ	BX,		R9		// tmp = in+len-8
-	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x]
-	CMPQ	BX,		SI		// cmp in with in+len-8
-	JLT	end				// jump if (in+len-8 < in)
+	LEAQ	(SI)(BX*1),	R9		// limit = in+len

-start:
-	ADDQ	$8,		SI		// increment in
-	ADDQ	$8,		DI		// increment out
-	
-	// generate the next 8 bytes of the rc4 stream into R8
-	MOVQ	$8,		R11		// byte counter
-l1:	ADDB	AX,		DX
+l1:	CMPQ	SI,		R9		// cmp in with in+len
+	JGE	finished			// jump if (in >= in+len)
+
+	INCB	CX
+	EXTEND(CX)
+	TESTL	$15,		CX
+	JZ	wordloop
+
+	MOVBLZX	(BP)(CX*4),	AX
+
+	ADDB	AX,		DX		// y += tx
 	EXTEND(DX)
-	MOVBLZX	(BP)(DX*1),	BX		// ty = d[y]
-	MOVB	BX,		(BP)(CX*1)	// d[x] = ty
-	ADDB	AX,		BX		// val = ty + tx
+	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
+	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
+	ADDB	AX,		BX		// val = ty+tx
 	EXTEND(BX)
-	MOVB	AX,		(BP)(DX*1)	// d[y] = tx
-	INCB	CX				// x++ (NEXT ROUND)
-	EXTEND(CX)
-	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x] (NEXT ROUND)
-	SHLQ	$8,		R8
-	MOVB	(BP)(BX*1),	R8		// val = d[val]
-	DECQ	R11
-	JNZ	l1
-
-	// xor 8 bytes
-	BSWAPQ	R8
-	XORQ	-8(SI),		R8
-	CMPQ	SI,		R9		// cmp in+len-8 with in XXX
-	MOVQ	R8,		-8(DI)
-	JLE	start				// jump if (in <= in+len-8)
+	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
+	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
+	XORB	(SI),		R8		// xor 1 byte
+	MOVB	R8,		(DI)
+	INCQ	SI				// in++
+	INCQ	DI				// out++
+	JMP l1
+
+wordloop:
+	SUBQ	$16,		R9
+	CMPQ	SI,		R9
+	JGT	end
+
+start:
+	ADDQ	$16,		SI		// increment in
+	ADDQ	$16,		DI		// increment out
+
+	// Each KEYROUND generates one byte of key and
+	// inserts it into an XMM register at the given 16-bit index.
+	// The key state array is uint32 words only using the bottom
+	// byte of each word, so the 16-bit OR only copies 8 useful bits.
+	// We accumulate alternating bytes into X0 and X1, and then at
+	// the end we OR X1<<8 into X0 to produce the actual key.
+	//
+	// At the beginning of the loop, CX%16 == 0, so the 16 loads
+	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
+	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
+	// without fear of the byte computation CX+15 wrapping around.
+	//
+	// The first round needs R12[0], the second needs R12[1], and so on.
+	// We can avoid memory stalls by starting the load for round n+1
+	// before the end of round n, using the LOAD macro.
+	LEAQ	(BP)(CX*4),	R12
+
+#define KEYROUND(xmm, load, off, r1, r2, index) \
+	MOVBLZX	(BP)(DX*4),	R8; \
+	MOVB	r1,		(BP)(DX*4); \
+	load((off+1), r2); \
+	MOVB	R8,		(off*4)(R12); \
+	ADDB	r1,		R8; \
+	EXTEND(R8); \
+	PINSRW	$index, (BP)(R8*4), xmm
+
+#define LOAD(off, reg) \
+	MOVBLZX	(off*4)(R12),	reg; \
+	ADDB	reg,		DX; \
+	EXTEND(DX)
+
+#define SKIP(off, reg)
+
+	LOAD(0, AX)
+	KEYROUND(X0, LOAD, 0, AX, BX, 0)
+	KEYROUND(X1, LOAD, 1, BX, AX, 0)
+	KEYROUND(X0, LOAD, 2, AX, BX, 1)
+	KEYROUND(X1, LOAD, 3, BX, AX, 1)
+	KEYROUND(X0, LOAD, 4, AX, BX, 2)
+	KEYROUND(X1, LOAD, 5, BX, AX, 2)
+	KEYROUND(X0, LOAD, 6, AX, BX, 3)
+	KEYROUND(X1, LOAD, 7, BX, AX, 3)
+	KEYROUND(X0, LOAD, 8, AX, BX, 4)
+	KEYROUND(X1, LOAD, 9, BX, AX, 4)
+	KEYROUND(X0, LOAD, 10, AX, BX, 5)
+	KEYROUND(X1, LOAD, 11, BX, AX, 5)
+	KEYROUND(X0, LOAD, 12, AX, BX, 6)
+	KEYROUND(X1, LOAD, 13, BX, AX, 6)
+	KEYROUND(X0, LOAD, 14, AX, BX, 7)
+	KEYROUND(X1, SKIP, 15, BX, AX, 7)
+	
+	ADDB	$16,		CX
+
+	PSLLQ	$8,		X1
+	PXOR	X1,		X0
+	MOVOU	-16(SI),	X2
+	PXOR	X0,		X2
+	MOVOU	X2,		-16(DI)
+
+	CMPQ	SI,		R9		// cmp in with in+len-16
+	JLE	start				// jump if (in <= in+len-16)

 end:
-	ADDQ	$8,		R9		// tmp = in+len
+	DECB	CX
+	ADDQ	$16,		R9		// tmp = in+len

 	// handle the last bytes, one by one
-l2:	CMPQ	R9,		SI		// cmp in with in+len
-	JLE	finished			// jump if (in+len <= in)
+l2:	CMPQ	SI,		R9		// cmp in with in+len
+	JGE	finished			// jump if (in >= in+len)
+
+	INCB	CX
+	EXTEND(CX)
+	MOVBLZX	(BP)(CX*4),	AX
+
 	ADDB	AX,		DX		// y += tx
 	EXTEND(DX)
-	MOVBLZX	(BP)(DX*1),	BX		// ty = d[y]
-	MOVB	BX,		(BP)(CX*1)	// d[x] = ty
+	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
+	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
 	ADDB	AX,		BX		// val = ty+tx
 	EXTEND(BX)
-	MOVB	AX,		(BP)(DX*1)	// d[y] = tx
-	INCB	CX				// x++ (NEXT ROUND)
-	EXTEND(CX)
-	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x] (NEXT ROUND)
-	MOVBLZX	(BP)(BX*1),	R8		// val = d[val]
+	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
+	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
 	XORB	(SI),		R8		// xor 1 byte
 	MOVB	R8,		(DI)
 	INCQ	SI				// in++
@@ -98,7 +170,6 @@ l2:	CMPQ	R9,		SI		// cmp in with in+len
 	JMP l2

 finished:
-	DECQ	CX				// x--
 	MOVQ	yp+40(FP),	BX
 	MOVB	DX, 0(BX)
 	MOVQ	xp+32(FP),	AX

--- a/src/pkg/crypto/rc4/rc4_arm.s
+++ b/src/pkg/crypto/rc4/rc4_arm.s
@@ -31,19 +31,19 @@ loop:
 	// i += 1; j += state[i]
 	ADD $1, R(i)
 	AND $0xff, R(i)
-	MOVBU R(i)<<0(R(state)), R(t)
+	MOVBU R(i)<<2(R(state)), R(t)
 	ADD R(t), R(j)
 	AND $0xff, R(j)

 	// swap state[i] <-> state[j]
-	MOVBU R(j)<<0(R(state)), R(t2)
-	MOVB R(t2), R(i)<<0(R(state))
-	MOVB R(t), R(j)<<0(R(state))
+	MOVBU R(j)<<2(R(state)), R(t2)
+	MOVB R(t2), R(i)<<2(R(state))
+	MOVB R(t), R(j)<<2(R(state))

 	// dst[k] = src[k] ^ state[state[i] + state[j]]
 	ADD R(t2), R(t)
 	AND $0xff, R(t)
-	MOVBU R(t)<<0(R(state)), R(t)
+	MOVBU R(t)<<2(R(state)), R(t)
 	MOVBU R(k)<<0(R(src)), R(t2)
 	EOR R(t), R(t2)
 	MOVB R(t2), R(k)<<0(R(dst))

--- a/src/pkg/crypto/rc4/rc4_asm.go
+++ b/src/pkg/crypto/rc4/rc4_asm.go
@@ -6,7 +6,7 @@

 package rc4

-func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
+func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)

 // XORKeyStream sets dst to the result of XORing src with the key stream.
 // Dst and src may be the same slice but otherwise should not overlap.

--- a/src/pkg/crypto/rc4/rc4_test.go
+++ b/src/pkg/crypto/rc4/rc4_test.go
@@ -5,6 +5,7 @@
 package rc4

 import (
+	"bytes"
 	"fmt"
 	"testing"
 )
@@ -115,6 +116,26 @@ func TestGolden(t *testing.T) {
 	}
 }

+func TestBlock(t *testing.T) {
+	c1a, _ := NewCipher(golden[0].key)
+	c1b, _ := NewCipher(golden[1].key)
+	data1 := make([]byte, 1<<20)
+	for i := range data1 {
+		c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
+		c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
+	}
+
+	c2a, _ := NewCipher(golden[0].key)
+	c2b, _ := NewCipher(golden[1].key)
+	data2 := make([]byte, 1<<20)
+	c2a.XORKeyStream(data2, data2)
+	c2b.XORKeyStream(data2, data2)
+
+	if !bytes.Equal(data1, data2) {
+		t.Fatalf("bad block")
+	}
+}
+
 func benchmark(b *testing.B, size int64) {
 	buf := make([]byte, size)
 	c, err := NewCipher(golden[0].key)