Commit 5168fcf6 authored by templexxx's avatar templexxx Committed by Brad Fitzpatrick

crypto/cipher: use SIMD for xor on amd64

cpu: Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz

Benchmark: xor

name                   old time/op    new time/op     delta
XORBytes/8Bytes-8        8.21ns ± 1%     6.35ns ± 3%   -22.66%  (p=0.008 n=5+5)
XORBytes/128Bytes-8      17.9ns ± 1%     10.4ns ± 1%   -41.68%  (p=0.008 n=5+5)
XORBytes/2048Bytes-8      187ns ± 1%       78ns ± 0%   -58.44%  (p=0.008 n=5+5)
XORBytes/32768Bytes-8    2.87µs ± 1%     1.38µs ± 0%   -52.05%  (p=0.008 n=5+5)

name                   old speed      new speed       delta
XORBytes/8Bytes-8       974MB/s ± 1%   1260MB/s ± 2%   +29.33%  (p=0.008 n=5+5)
XORBytes/128Bytes-8    7.15GB/s ± 0%  12.25GB/s ± 1%   +71.17%  (p=0.008 n=5+5)
XORBytes/2048Bytes-8   10.9GB/s ± 1%   26.4GB/s ± 0%  +140.99%  (p=0.008 n=5+5)
XORBytes/32768Bytes-8  11.4GB/s ± 1%   23.8GB/s ± 0%  +108.52%  (p=0.008 n=5+5)

Benchmark: cipher

name               old time/op    new time/op    delta
AESGCMSeal1K-8        269ns ± 6%     261ns ± 2%     ~     (p=0.246 n=5+5)
AESGCMOpen1K-8        242ns ± 1%     240ns ± 2%     ~     (p=0.190 n=5+5)
AESGCMSign8K-8        869ns ± 0%     870ns ± 1%     ~     (p=0.683 n=5+5)
AESGCMSeal8K-8       1.64µs ± 6%    1.59µs ± 7%     ~     (p=0.151 n=5+5)
AESGCMOpen8K-8       1.48µs ± 2%    1.46µs ± 0%   -1.39%  (p=0.008 n=5+5)
AESCFBEncrypt1K-8    1.88µs ± 5%    1.62µs ± 1%  -13.52%  (p=0.008 n=5+5)
AESCFBDecrypt1K-8    1.76µs ± 1%    1.58µs ± 1%  -10.24%  (p=0.016 n=4+5)
AESOFB1K-8           1.10µs ± 4%    1.03µs ± 2%   -6.36%  (p=0.008 n=5+5)
AESCTR1K-8           1.24µs ± 1%    1.17µs ± 0%   -5.96%  (p=0.008 n=5+5)
AESCBCEncrypt1K-8    1.74µs ± 0%    1.14µs ± 1%  -34.36%  (p=0.008 n=5+5)
AESCBCDecrypt1K-8    1.28µs ± 1%    1.10µs ± 1%  -14.04%  (p=0.008 n=5+5)

name               old speed      new speed      delta
AESGCMSeal1K-8     3.81GB/s ± 6%  3.91GB/s ± 2%     ~     (p=0.310 n=5+5)
AESGCMOpen1K-8     4.23GB/s ± 1%  4.27GB/s ± 2%     ~     (p=0.222 n=5+5)
AESGCMSign8K-8     9.43GB/s ± 0%  9.41GB/s ± 1%     ~     (p=0.841 n=5+5)
AESGCMSeal8K-8     5.01GB/s ± 6%  5.16GB/s ± 6%     ~     (p=0.151 n=5+5)
AESGCMOpen8K-8     5.54GB/s ± 2%  5.62GB/s ± 0%   +1.41%  (p=0.008 n=5+5)
AESCFBEncrypt1K-8   543MB/s ± 5%   627MB/s ± 1%  +15.55%  (p=0.008 n=5+5)
AESCFBDecrypt1K-8   580MB/s ± 1%   646MB/s ± 1%  +11.40%  (p=0.016 n=4+5)
AESOFB1K-8          925MB/s ± 4%   988MB/s ± 2%   +6.73%  (p=0.008 n=5+5)
AESCTR1K-8          821MB/s ± 1%   873MB/s ± 1%   +6.34%  (p=0.008 n=5+5)
AESCBCEncrypt1K-8   588MB/s ± 1%   897MB/s ± 1%  +52.36%  (p=0.008 n=5+5)
AESCBCDecrypt1K-8   799MB/s ± 1%   929MB/s ± 1%  +16.32%  (p=0.008 n=5+5)

Change-Id: I42e6ba66c23dad853d33c924fca7b0ed805cefdd
Reviewed-on: https://go-review.googlesource.com/c/125316Reviewed-by: default avatarIlya Tocar <ilya.tocar@intel.com>
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 0e0f7986
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cipher
// Export internal functions for testing.
var XorBytes = xorBytes
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cipher
// xorBytes xors the bytes in a and b. The destination should have enough
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
func xorBytes(dst, a, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
if n == 0 {
return 0
}
_ = dst[n-1]
xorBytesSSE2(&dst[0], &a[0], &b[0], n) // amd64 must have SSE2
return n
}
func xorWords(dst, a, b []byte) {
xorBytes(dst, a, b)
}
//go:noescape
func xorBytesSSE2(dst, a, b *byte, n int)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func xorBytesSSE2(dst, a, b *byte, n int)
TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
MOVQ dst+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVQ n+24(FP), DX
TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
JNZ not_aligned
aligned:
MOVQ $0, AX // position in slices
loop16b:
MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
MOVOU (CX)(AX*1), X1
PXOR X1, X0
MOVOU X0, (BX)(AX*1)
ADDQ $16, AX
CMPQ DX, AX
JNE loop16b
RET
loop_1b:
SUBQ $1, DX // XOR 1byte backwards.
MOVB (SI)(DX*1), DI
MOVB (CX)(DX*1), AX
XORB AX, DI
MOVB DI, (BX)(DX*1)
TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
JNZ loop_1b
CMPQ DX, $0 // if len is 0, ret.
JE ret
TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
JZ aligned
not_aligned:
TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
JNE loop_1b
SUBQ $8, DX // XOR 8bytes backwards.
MOVQ (SI)(DX*1), DI
MOVQ (CX)(DX*1), AX
XORQ AX, DI
MOVQ DI, (BX)(DX*1)
CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
JGE aligned
ret:
RET
......@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64
package cipher
import (
......@@ -9,12 +11,9 @@ import (
"unsafe"
)
const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
// fastXORBytes xors in bulk. It only works on architectures that
// support unaligned read/writes.
func fastXORBytes(dst, a, b []byte) int {
// xorBytes xors the bytes in a and b. The destination should have enough
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
func xorBytes(dst, a, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
......@@ -22,6 +21,28 @@ func fastXORBytes(dst, a, b []byte) int {
if n == 0 {
return 0
}
switch {
case supportsUnaligned:
fastXORBytes(dst, a, b, n)
default:
// TODO(hanwen): if (dst, a, b) have common alignment
// we could still try fastXORBytes. It is not clear
// how often this happens, and it's only worth it if
// the block encryption itself is hardware
// accelerated.
safeXORBytes(dst, a, b, n)
}
return n
}
const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
// fastXORBytes xors in bulk. It only works on architectures that
// support unaligned read/writes.
// n needs to be smaller or equal than the length of a and b.
func fastXORBytes(dst, a, b []byte, n int) {
// Assert dst has enough space
_ = dst[n-1]
......@@ -38,34 +59,13 @@ func fastXORBytes(dst, a, b []byte) int {
for i := (n - n%wordSize); i < n; i++ {
dst[i] = a[i] ^ b[i]
}
return n
}
func safeXORBytes(dst, a, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
// n needs to be smaller or equal than the length of a and b.
func safeXORBytes(dst, a, b []byte, n int) {
for i := 0; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
return n
}
// xorBytes xors the bytes in a and b. The destination should have enough
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
func xorBytes(dst, a, b []byte) int {
if supportsUnaligned {
return fastXORBytes(dst, a, b)
} else {
// TODO(hanwen): if (dst, a, b) have common alignment
// we could still try fastXORBytes. It is not clear
// how often this happens, and it's only worth it if
// the block encryption itself is hardware
// accelerated.
return safeXORBytes(dst, a, b)
}
}
// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
......@@ -80,10 +80,12 @@ func fastXORWords(dst, a, b []byte) {
}
}
// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
// The slice arguments a and b are assumed to be of equal length.
func xorWords(dst, a, b []byte) {
if supportsUnaligned {
fastXORWords(dst, a, b)
} else {
safeXORBytes(dst, a, b)
safeXORBytes(dst, a, b, len(b))
}
}
......@@ -2,27 +2,71 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cipher
package cipher_test
import (
"bytes"
"crypto/cipher"
"crypto/rand"
"fmt"
"io"
"testing"
)
func TestXOR(t *testing.T) {
for alignP := 0; alignP < 2; alignP++ {
for alignQ := 0; alignQ < 2; alignQ++ {
for alignD := 0; alignD < 2; alignD++ {
p := make([]byte, 1024)[alignP:]
q := make([]byte, 1024)[alignQ:]
d1 := make([]byte, 1024+alignD)[alignD:]
d2 := make([]byte, 1024+alignD)[alignD:]
xorBytes(d1, p, q)
safeXORBytes(d2, p, q)
if !bytes.Equal(d1, d2) {
t.Error("not equal")
for j := 1; j <= 1024; j++ {
for alignP := 0; alignP < 2; alignP++ {
for alignQ := 0; alignQ < 2; alignQ++ {
for alignD := 0; alignD < 2; alignD++ {
p := make([]byte, j)[alignP:]
q := make([]byte, j)[alignQ:]
d1 := make([]byte, j+alignD)[alignD:]
d2 := make([]byte, j+alignD)[alignD:]
if _, err := io.ReadFull(rand.Reader, p); err != nil {
t.Fatal(err)
}
if _, err := io.ReadFull(rand.Reader, q); err != nil {
t.Fatal(err)
}
cipher.XorBytes(d1, p, q)
n := min(p, q)
for i := 0; i < n; i++ {
d2[i] = p[i] ^ q[i]
}
if !bytes.Equal(d1, d2) {
t.Logf("p: %#v", p)
t.Logf("q: %#v", q)
t.Logf("expect: %#v", d2)
t.Logf("result: %#v", d1)
t.Fatal("not equal")
}
}
}
}
}
}
func min(a, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
return n
}
func BenchmarkXORBytes(b *testing.B) {
dst := make([]byte, 1<<15)
data0 := make([]byte, 1<<15)
data1 := make([]byte, 1<<15)
sizes := []int64{1 << 3, 1 << 7, 1 << 11, 1 << 15}
for _, size := range sizes {
b.Run(fmt.Sprintf("%dBytes", size), func(b *testing.B) {
s0 := data0[:size]
s1 := data1[:size]
b.SetBytes(int64(size))
for i := 0; i < b.N; i++ {
cipher.XorBytes(dst, s0, s1)
}
})
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment