Commit b46d3988 authored by Balaram Makam's avatar Balaram Makam Committed by Cherry Zhang

runtime: improve arm64 memclr implementation

Improve runtime memclr_arm64.s using ZVA feature to zero out memory when n
is at least 64 bytes.

Also add DCZID_EL0 system register to use in MRS instruction.

    Benchmark results of runtime/Memclr on Amberwing:
name          old time/op    new time/op    delta
Memclr/5        12.7ns ± 0%    12.7ns ± 0%      ~     (all equal)
Memclr/16       12.7ns ± 0%    12.2ns ± 1%    -4.13%  (p=0.000 n=7+8)
Memclr/64       14.0ns ± 0%    14.6ns ± 1%    +4.29%  (p=0.000 n=7+8)
Memclr/256      23.7ns ± 0%    25.7ns ± 0%    +8.44%  (p=0.000 n=8+7)
Memclr/4096      204ns ± 0%      74ns ± 0%   -63.71%  (p=0.000 n=8+8)
Memclr/65536    2.89µs ± 0%    0.84µs ± 0%   -70.91%  (p=0.000 n=8+8)
Memclr/1M       45.9µs ± 0%    17.0µs ± 0%   -62.88%  (p=0.000 n=8+8)
Memclr/4M        184µs ± 0%      77µs ± 4%   -57.94%  (p=0.001 n=6+8)
Memclr/8M        367µs ± 0%     144µs ± 1%   -60.72%  (p=0.000 n=7+8)
Memclr/16M       734µs ± 0%     293µs ± 1%   -60.09%  (p=0.000 n=8+8)
Memclr/64M      2.94ms ± 0%    1.23ms ± 0%   -58.06%  (p=0.000 n=7+8)
GoMemclr/5      8.00ns ± 0%    8.79ns ± 0%    +9.83%  (p=0.000 n=8+8)
GoMemclr/16     8.00ns ± 0%    7.60ns ± 0%    -5.00%  (p=0.000 n=8+8)
GoMemclr/64     10.8ns ± 0%    10.4ns ± 0%    -3.70%  (p=0.000 n=8+8)
GoMemclr/256    20.4ns ± 0%    21.2ns ± 0%    +3.92%  (p=0.000 n=8+8)

name          old speed      new speed      delta
Memclr/5       394MB/s ± 0%   393MB/s ± 0%    -0.28%  (p=0.006 n=8+8)
Memclr/16     1.26GB/s ± 0%  1.31GB/s ± 1%    +4.07%  (p=0.000 n=7+8)
Memclr/64     4.57GB/s ± 0%  4.39GB/s ± 2%    -3.91%  (p=0.000 n=7+8)
Memclr/256    10.8GB/s ± 0%  10.0GB/s ± 0%    -7.95%  (p=0.001 n=7+6)
Memclr/4096   20.1GB/s ± 0%  55.3GB/s ± 0%  +175.46%  (p=0.000 n=8+8)
Memclr/65536  22.6GB/s ± 0%  77.8GB/s ± 0%  +243.63%  (p=0.000 n=7+8)
Memclr/1M     22.8GB/s ± 0%  61.5GB/s ± 0%  +169.38%  (p=0.000 n=8+8)
Memclr/4M     22.8GB/s ± 0%  54.3GB/s ± 4%  +137.85%  (p=0.001 n=6+8)
Memclr/8M     22.8GB/s ± 0%  58.1GB/s ± 1%  +154.56%  (p=0.000 n=7+8)
Memclr/16M    22.8GB/s ± 0%  57.2GB/s ± 1%  +150.54%  (p=0.000 n=8+8)
Memclr/64M    22.8GB/s ± 0%  54.4GB/s ± 0%  +138.42%  (p=0.000 n=7+8)
GoMemclr/5     625MB/s ± 0%   569MB/s ± 0%    -8.90%  (p=0.000 n=7+8)
GoMemclr/16   2.00GB/s ± 0%  2.10GB/s ± 0%    +5.26%  (p=0.000 n=8+8)
GoMemclr/64   5.92GB/s ± 0%  6.15GB/s ± 0%    +3.83%  (p=0.000 n=7+8)
GoMemclr/256  12.5GB/s ± 0%  12.1GB/s ± 0%    -3.77%  (p=0.000 n=8+7)

    Benchmark results of runtime/Memclr on Amberwing without ZVA:
name          old time/op    new time/op    delta
Memclr/5        12.7ns ± 0%    12.8ns ± 0%   +0.79%  (p=0.008 n=5+5)
Memclr/16       12.7ns ± 0%    12.7ns ± 0%     ~     (p=0.444 n=5+5)
Memclr/64       14.0ns ± 0%    14.4ns ± 0%   +2.86%  (p=0.008 n=5+5)
Memclr/256      23.7ns ± 1%    19.2ns ± 0%  -19.06%  (p=0.008 n=5+5)
Memclr/4096      203ns ± 0%     119ns ± 0%  -41.38%  (p=0.008 n=5+5)
Memclr/65536    2.89µs ± 0%    1.66µs ± 0%  -42.76%  (p=0.008 n=5+5)
Memclr/1M       45.9µs ± 0%    26.2µs ± 0%  -42.82%  (p=0.008 n=5+5)
Memclr/4M        184µs ± 0%     105µs ± 0%  -42.81%  (p=0.008 n=5+5)
Memclr/8M        367µs ± 0%     210µs ± 0%  -42.76%  (p=0.008 n=5+5)
Memclr/16M       734µs ± 0%     420µs ± 0%  -42.74%  (p=0.008 n=5+5)
Memclr/64M      2.94ms ± 0%    1.69ms ± 0%  -42.46%  (p=0.008 n=5+5)
GoMemclr/5      8.00ns ± 0%    8.40ns ± 0%   +5.00%  (p=0.008 n=5+5)
GoMemclr/16     8.00ns ± 0%    8.40ns ± 0%   +5.00%  (p=0.008 n=5+5)
GoMemclr/64     10.8ns ± 0%     9.6ns ± 0%  -11.02%  (p=0.008 n=5+5)
GoMemclr/256    20.4ns ± 0%    17.2ns ± 0%  -15.69%  (p=0.008 n=5+5)

name          old speed      new speed      delta
Memclr/5       393MB/s ± 0%   391MB/s ± 0%   -0.64%  (p=0.008 n=5+5)
Memclr/16     1.26GB/s ± 0%  1.26GB/s ± 0%   -0.55%  (p=0.008 n=5+5)
Memclr/64     4.57GB/s ± 0%  4.44GB/s ± 0%   -2.79%  (p=0.008 n=5+5)
Memclr/256    10.8GB/s ± 0%  13.3GB/s ± 0%  +23.07%  (p=0.016 n=4+5)
Memclr/4096   20.1GB/s ± 0%  34.3GB/s ± 0%  +70.91%  (p=0.008 n=5+5)
Memclr/65536  22.7GB/s ± 0%  39.6GB/s ± 0%  +74.65%  (p=0.008 n=5+5)
Memclr/1M     22.8GB/s ± 0%  40.0GB/s ± 0%  +74.88%  (p=0.008 n=5+5)
Memclr/4M     22.8GB/s ± 0%  39.9GB/s ± 0%  +74.84%  (p=0.008 n=5+5)
Memclr/8M     22.9GB/s ± 0%  39.9GB/s ± 0%  +74.71%  (p=0.008 n=5+5)
Memclr/16M    22.9GB/s ± 0%  39.9GB/s ± 0%  +74.64%  (p=0.008 n=5+5)
Memclr/64M    22.8GB/s ± 0%  39.7GB/s ± 0%  +73.79%  (p=0.008 n=5+5)
GoMemclr/5     625MB/s ± 0%   595MB/s ± 0%   -4.77%  (p=0.000 n=4+5)
GoMemclr/16   2.00GB/s ± 0%  1.90GB/s ± 0%   -4.77%  (p=0.008 n=5+5)
GoMemclr/64   5.92GB/s ± 0%  6.66GB/s ± 0%  +12.48%  (p=0.016 n=4+5)
GoMemclr/256  12.5GB/s ± 0%  14.9GB/s ± 0%  +18.95%  (p=0.008 n=5+5)

Fixes #22948

Change-Id: Iaae4e22391e25b54d299821bb7f8a81ac3986b93
Reviewed-on: https://go-review.googlesource.com/82055
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
parent e65d6a6a
......@@ -260,6 +260,7 @@ func archArm64() *Arch {
register["SPSel"] = arm64.REG_SPSel
register["DAIFSet"] = arm64.REG_DAIFSet
register["DAIFClr"] = arm64.REG_DAIFClr
register["DCZID_EL0"] = arm64.REG_DCZID_EL0
register["PLDL1KEEP"] = arm64.REG_PLDL1KEEP
register["PLDL1STRM"] = arm64.REG_PLDL1STRM
register["PLDL2KEEP"] = arm64.REG_PLDL2KEEP
......
......@@ -251,6 +251,7 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$-8
MSR $6, DAIFClr // ff4603d5
MRS ELR_EL1, R8 // 284038d5
MSR R16, ELR_EL1 // 304018d5
MRS DCZID_EL0, R3 // e3003bd5
MSUBW R1, R1, R12, R5 // 8585011b
MSUB R19, R16, R26, R2 // 42c3139b
MULW R26, R5, R22 // b67c1a1b
......
......@@ -208,6 +208,7 @@ const (
REG_SPSel
REG_DAIFSet
REG_DAIFClr
REG_DCZID_EL0
REG_PLDL1KEEP
REG_PLDL1STRM
REG_PLDL2KEEP
......
......@@ -634,6 +634,7 @@ var systemreg = []struct {
enc uint32
}{
{REG_ELR_EL1, 8<<16 | 4<<12 | 1<<5},
{REG_DCZID_EL0, 3<<19 | 3<<16 | 7<<5},
}
var prfopfield = []struct {
......
......@@ -134,6 +134,8 @@ func rconv(r int) string {
return "DAIFSet"
case r == REG_DAIFClr:
return "DAIFClr"
case r == REG_DCZID_EL0:
return "DCZID_EL0"
case r == REG_PLDL1KEEP:
return "PLDL1KEEP"
case r == REG_PLDL1STRM:
......
......@@ -8,52 +8,175 @@
TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
MOVD ptr+0(FP), R0
MOVD n+8(FP), R1
// If size is less than 16 bytes, use tail_zero to zero what remains
CMP $16, R1
BLT tail_zero
// Get buffer offset into 16 byte aligned address for better performance
ANDS $15, R0, ZR
BNE unaligned_to_16
aligned_to_16:
LSR $4, R1, R2
// If n is equal to 16 bytes, use zero_exact_16 to zero
BEQ zero_exact_16
// If n is greater than 16 bytes, use zero_by_16 to zero
BHI zero_by_16
// n is less than 16 bytes
ADD R1, R0, R7
TBZ $3, R1, less_than_8
MOVD ZR, (R0)
MOVD ZR, -8(R7)
RET
less_than_8:
TBZ $2, R1, less_than_4
MOVW ZR, (R0)
MOVW ZR, -4(R7)
RET
less_than_4:
CBZ R1, ending
MOVB ZR, (R0)
TBZ $1, R1, ending
MOVH ZR, -2(R7)
ending:
RET
zero_exact_16:
// n is exactly 16 bytes
STP (ZR, ZR), (R0)
RET
zero_by_16:
STP.P (ZR, ZR), 16(R0)
SUBS $1, R2, R2
BNE zero_by_16
// n greater than 16 bytes, check if the start address is aligned
NEG R0, R4
ANDS $15, R4, R4
// Try zeroing using zva if the start address is aligned with 16
BEQ try_zva
// Non-aligned store
STP (ZR, ZR), (R0)
// Make the destination aligned
SUB R4, R1, R1
ADD R4, R0, R0
B try_zva
tail_maybe_long:
CMP $64, R1
BHS no_zva
tail63:
ANDS $48, R1, R3
BEQ last16
CMPW $32, R3
BEQ last48
BLT last32
STP.P (ZR, ZR), 16(R0)
last48:
STP.P (ZR, ZR), 16(R0)
last32:
STP.P (ZR, ZR), 16(R0)
// The last store length is at most 16, so it is safe to use
// stp to write last 16 bytes
last16:
ANDS $15, R1, R1
BEQ ending
CBZ R1, last_end
ADD R1, R0, R0
STP (ZR, ZR), -16(R0)
last_end:
RET
// Zero buffer with size=R1 < 16
tail_zero:
TBZ $3, R1, tail_zero_4
MOVD.P ZR, 8(R0)
no_zva:
SUB $16, R0, R0
SUB $64, R1, R1
tail_zero_4:
TBZ $2, R1, tail_zero_2
MOVW.P ZR, 4(R0)
loop_64:
STP (ZR, ZR), 16(R0)
STP (ZR, ZR), 32(R0)
STP (ZR, ZR), 48(R0)
STP.W (ZR, ZR), 64(R0)
SUBS $64, R1, R1
BGE loop_64
ANDS $63, R1, ZR
ADD $16, R0, R0
BNE tail63
RET
tail_zero_2:
TBZ $1, R1, tail_zero_1
MOVH.P ZR, 2(R0)
try_zva:
// Try using the ZVA feature to zero entire cache lines
// It is not meaningful to use ZVA if the block size is less than 64,
// so make sure that n is greater than or equal to 64
CMP $63, R1
BLE tail63
tail_zero_1:
TBZ $0, R1, ending
MOVB ZR, (R0)
CMP $128, R1
// Ensure n is at least 128 bytes, so that there is enough to copy after
// alignment.
BLT no_zva
// Check if ZVA is allowed from user code, and if so get the block size
MOVW block_size<>(SB), R5
TBNZ $31, R5, no_zva
CBNZ R5, zero_by_line
// DCZID_EL0 bit assignments
// [63:5] Reserved
// [4] DZP, if bit set DC ZVA instruction is prohibited, else permitted
// [3:0] log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words
MRS DCZID_EL0, R3
TBZ $4, R3, init
// ZVA not available
MOVW $~0, R5
MOVW R5, block_size<>(SB)
B no_zva
ending:
init:
MOVW $4, R9
ANDW $15, R3, R5
LSLW R5, R9, R5
MOVW R5, block_size<>(SB)
ANDS $63, R5, R9
// Block size is less than 64.
BNE no_zva
zero_by_line:
CMP R5, R1
// Not enough memory to reach alignment
BLO no_zva
SUB $1, R5, R6
NEG R0, R4
ANDS R6, R4, R4
// Already aligned
BEQ aligned
// check there is enough to copy after alignment
SUB R4, R1, R3
// Check that the remaining length to ZVA after alignment
// is greater than 64.
CMP $64, R3
CCMP GE, R3, R5, $10 // condition code GE, NZCV=0b1010
BLT no_zva
// We now have at least 64 bytes to zero, update n
MOVD R3, R1
loop_zva_prolog:
STP (ZR, ZR), (R0)
STP (ZR, ZR), 16(R0)
STP (ZR, ZR), 32(R0)
SUBS $64, R4, R4
STP (ZR, ZR), 48(R0)
ADD $64, R0, R0
BGE loop_zva_prolog
ADD R4, R0, R0
aligned:
SUB R5, R1, R1
loop_zva:
WORD $0xd50b7420 // DC ZVA, R0
ADD R5, R0, R0
SUBS R5, R1, R1
BHS loop_zva
ANDS R6, R1, R1
BNE tail_maybe_long
RET
unaligned_to_16:
MOVD R0, R2
head_loop:
MOVBU.P ZR, 1(R0)
ANDS $15, R0, ZR
BNE head_loop
// Adjust length for what remains
SUB R2, R0, R3
SUB R3, R1
// If size is less than 16 bytes, use tail_zero to zero what remains
CMP $16, R1
BLT tail_zero
B aligned_to_16
GLOBL block_size<>(SB), NOPTR, $8
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment