Skip to content

Commit 09f99c0

Browse files
howjmayFiloSottile
andcommitted
crypto/sha3: add SIMD implementation with ARMv8.2 features
ARMv8.2 introduces four SIMD instructions (EOR3, RAX1, XAR, and BCAX) that accelerate SHA-3 operations. This change adds a SIMD implementation of SHA-3 for ARMv8 that uses them. fips140: off goos: darwin goarch: arm64 pkg: crypto/sha3 cpu: Apple M2 │ 9e72f5fe60 │ ab93158ba0-dirty │ │ sec/op │ sec/op vs base │ Sha3_512_MTU-8 6.497µ ± 1% 2.988µ ± 0% -54.01% (p=0.002 n=6) Sha3_384_MTU-8 4.639µ ± 5% 2.142µ ± 1% -53.83% (p=0.002 n=6) Sha3_256_MTU-8 3.631µ ± 1% 1.698µ ± 6% -53.24% (p=0.002 n=6) Sha3_224_MTU-8 3.443µ ± 1% 1.602µ ± 1% -53.47% (p=0.002 n=6) Shake128_MTU-8 2.974µ ± 2% 1.392µ ± 1% -53.19% (p=0.002 n=6) Shake256_MTU-8 3.320µ ± 0% 1.537µ ± 2% -53.70% (p=0.002 n=6) Shake256_16x-8 47.26µ ± 1% 27.39µ ± 6% -42.06% (p=0.002 n=6) Shake256_1MiB-8 2.567m ± 1% 1.306m ± 1% -49.12% (p=0.002 n=6) Sha3_512_1MiB-8 4.785m ± 1% 2.397m ± 8% -49.90% (p=0.002 n=6) geomean 23.47µ 11.38µ -51.52% │ 9e72f5fe60 │ ab93158ba0-dirty │ │ B/s │ B/s vs base │ Sha3_512_MTU-8 198.2Mi ± 1% 430.9Mi ± 0% +117.45% (p=0.002 n=6) Sha3_384_MTU-8 277.5Mi ± 5% 601.1Mi ± 1% +116.58% (p=0.002 n=6) Sha3_256_MTU-8 354.6Mi ± 1% 758.2Mi ± 6% +113.85% (p=0.002 n=6) Sha3_224_MTU-8 373.9Mi ± 1% 803.6Mi ± 1% +114.90% (p=0.002 n=6) Shake128_MTU-8 432.9Mi ± 2% 925.2Mi ± 1% +113.70% (p=0.002 n=6) Shake256_MTU-8 387.8Mi ± 0% 837.6Mi ± 2% +115.98% (p=0.002 n=6) Shake256_16x-8 330.6Mi ± 1% 570.7Mi ± 6% +72.61% (p=0.002 n=6) Shake256_1MiB-8 389.5Mi ± 1% 765.5Mi ± 1% +96.53% (p=0.002 n=6) Sha3_512_1MiB-8 209.0Mi ± 1% 417.2Mi ± 8% +99.61% (p=0.002 n=6) geomean 317.7Mi 655.3Mi +106.29% fips140: off goos: darwin goarch: arm64 pkg: crypto/mlkem cpu: Apple M2 │ 9e72f5fe60 │ 257696ed2d-dirty │ │ sec/op │ sec/op vs base │ KeyGen-8 36.97µ ± 1% 29.82µ ± 3% -19.34% (p=0.002 n=6) Encaps-8 51.54µ ± 5% 44.75µ ± 5% -13.17% (p=0.002 n=6) Decaps-8 47.72µ ± 10% 44.73µ ± 1% -6.27% (p=0.002 n=6) RoundTrip/Alice-8 90.47µ ± 2% 79.74µ ± 1% -11.86% (p=0.002 n=6) RoundTrip/Bob-8 52.15µ ± 1% 44.45µ ± 0% -14.76% (p=0.002 n=6) geomean 53.27µ 46.25µ -13.18% Cq-Include-Trybots: 
luci.golang.try:gotip-darwin-arm64_15 Co-authored-by: Filippo Valsorda <[email protected]> Change-Id: I8c1f476a7d59498bb44d09d7a573beaa07b10f53 Reviewed-on: https://go-review.googlesource.com/c/go/+/667675 Reviewed-by: Roland Shoemaker <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: David Chase <[email protected]> Reviewed-by: Daniel McCarney <[email protected]>
1 parent 430a3dc commit 09f99c0

File tree

4 files changed

+209
-1
lines changed

4 files changed

+209
-1
lines changed
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !purego
6+
7+
package sha3
8+
9+
import (
10+
"crypto/internal/fips140deps/cpu"
11+
"crypto/internal/impl"
12+
"runtime"
13+
)
14+
15+
// useSHA3 selects the assembly implementation in keccakF1600. It is true
// only when the CPU reports the SHA3 feature and GOOS is "darwin".
//
// On non-Apple ARM64, the SHA-3 instructions are apparently slower than the
16+
// pure Go implementation. Checking GOOS is a bit blunt, as it also excludes
17+
// Asahi Linux; we might consider checking the MIDR model in the future.
18+
var useSHA3 = cpu.ARM64HasSHA3 && runtime.GOOS == "darwin"
19+
20+
// init registers this implementation with the impl package under the
// package name "sha3" and the implementation name "Armv8.2", handing it a
// pointer to useSHA3.
// NOTE(review): presumably impl uses the pointer to toggle the
// implementation (e.g. in tests) — confirm against crypto/internal/impl.
func init() {
21+
impl.Register("sha3", "Armv8.2", &useSHA3)
22+
}
23+
24+
// keccakF1600NEON is declared without a Go body; the symbol is provided by
// the TEXT ·keccakF1600NEON routine in the accompanying arm64 assembly,
// which permutes the 200-byte state pointed to by a in place.
//
//go:noescape
25+
func keccakF1600NEON(a *[200]byte)
26+
27+
// keccakF1600 permutes the 200-byte state a in place. It dispatches to the
// assembly routine keccakF1600NEON when useSHA3 is set and to
// keccakF1600Generic otherwise.
func keccakF1600(a *[200]byte) {
28+
if useSHA3 {
29+
keccakF1600NEON(a)
30+
} else {
31+
keccakF1600Generic(a)
32+
}
33+
}
34+
35+
// write forwards p to writeGeneric unchanged and returns its results; this
// build has no architecture-specific write path.
func (d *Digest) write(p []byte) (n int, err error) {
36+
return d.writeGeneric(p)
37+
}
38+
// read forwards out to readGeneric unchanged and returns its results; this
// build has no architecture-specific read path.
func (d *Digest) read(out []byte) (n int, err error) {
39+
return d.readGeneric(out)
40+
}
41+
// sum forwards b to sumGeneric unchanged and returns its result; this build
// has no architecture-specific sum path.
func (d *Digest) sum(b []byte) []byte {
42+
return d.sumGeneric(b)
43+
}
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !purego
6+
7+
#include "textflag.h"
8+
9+
// func keccakF1600NEON(a *[200]byte)
//
// Runs 24 rounds of the Keccak permutation over the 200-byte state at a,
// keeping the 25 64-bit lanes in the low halves of V0-V24 for the whole loop
// and writing them back to a at the end.
//
// Register use:
//   R0      pointer to the state (advanced by the loads, then rewound)
//   R1      pointer to the next round constant in round_consts<>
//   R2      number of rounds still to run
//   V0-V24  state lanes 0..24
//   V25-V31 scratch (column parities, theta terms, displaced lanes)
//
// NOTE(review): the TEXT directive declares a 200-byte frame, but no stack
// slot is referenced anywhere in this routine — confirm whether it is needed.
10+
TEXT ·keccakF1600NEON(SB), $200-8
11+
MOVD a+0(FP), R0
12+
MOVD $round_consts<>(SB), R1
13+
MOVD $24, R2 // counter for loop
14+
15+
// Load lanes 0..23 two at a time; each post-increment load advances R0 by
// 16 bytes, 192 bytes in total. Lane 24 is loaded last without increment.
VLD1.P 16(R0), [V0.D1, V1.D1]
16+
VLD1.P 16(R0), [V2.D1, V3.D1]
17+
VLD1.P 16(R0), [V4.D1, V5.D1]
18+
VLD1.P 16(R0), [V6.D1, V7.D1]
19+
VLD1.P 16(R0), [V8.D1, V9.D1]
20+
VLD1.P 16(R0), [V10.D1, V11.D1]
21+
VLD1.P 16(R0), [V12.D1, V13.D1]
22+
VLD1.P 16(R0), [V14.D1, V15.D1]
23+
VLD1.P 16(R0), [V16.D1, V17.D1]
24+
VLD1.P 16(R0), [V18.D1, V19.D1]
25+
VLD1.P 16(R0), [V20.D1, V21.D1]
26+
VLD1.P 16(R0), [V22.D1, V23.D1]
27+
VLD1 (R0), [V24.D1]
28+
29+
// Rewind R0 to the start of the state so the final stores can reuse it.
SUB $192, R0, R0
30+
31+
loop:
32+
// theta: V25-V29 become the five column parities, i.e. the XOR of lanes
// i, i+5, i+10, i+15 and i+20 for i = 0..4, built from two three-way
// XORs (EOR3) each.
33+
VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
34+
VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
35+
VEOR3 V22.B16, V17.B16, V12.B16, V27.B16
36+
VEOR3 V23.B16, V18.B16, V13.B16, V28.B16
37+
VEOR3 V24.B16, V19.B16, V14.B16, V29.B16
38+
VEOR3 V25.B16, V5.B16, V0.B16, V25.B16
39+
VEOR3 V26.B16, V6.B16, V1.B16, V26.B16
40+
VEOR3 V27.B16, V7.B16, V2.B16, V27.B16
41+
VEOR3 V28.B16, V8.B16, V3.B16, V28.B16
42+
VEOR3 V29.B16, V9.B16, V4.B16, V29.B16
43+
44+
// RAX1 XORs one parity with another parity rotated left by one bit,
// producing the per-column theta terms. The results land in V30, V31,
// V27, V28 and V29; the last three overwrite parities that are no longer
// needed at that point.
VRAX1 V27.D2, V25.D2, V30.D2
45+
VRAX1 V28.D2, V26.D2, V31.D2
46+
VRAX1 V29.D2, V27.D2, V27.D2
47+
VRAX1 V25.D2, V28.D2, V28.D2
48+
VRAX1 V26.D2, V29.D2, V29.D2
49+
50+
// theta (final XOR), rho and pi: each XAR XORs a lane with its theta term
// and rotates the result right by the immediate, writing it straight to
// the register of its destination lane. The instructions are ordered in
// chains so that every source is read before its register is overwritten;
// V25 and V26 temporarily hold the lanes that start a chain.
51+
VXAR $63, V30.D2, V1.D2, V25.D2
52+
53+
VXAR $20, V30.D2, V6.D2, V1.D2
54+
VXAR $44, V28.D2, V9.D2, V6.D2
55+
VXAR $3, V31.D2, V22.D2, V9.D2
56+
VXAR $25, V28.D2, V14.D2, V22.D2
57+
VXAR $46, V29.D2, V20.D2, V14.D2
58+
59+
VXAR $2, V31.D2, V2.D2, V26.D2
60+
61+
VXAR $21, V31.D2, V12.D2, V2.D2
62+
VXAR $39, V27.D2, V13.D2, V12.D2
63+
VXAR $56, V28.D2, V19.D2, V13.D2
64+
VXAR $8, V27.D2, V23.D2, V19.D2
65+
VXAR $23, V29.D2, V15.D2, V23.D2
66+
67+
VXAR $37, V28.D2, V4.D2, V15.D2
68+
69+
// From here on the theta-term registers themselves are reused as
// destinations once their last reader has run (V28 here; V27, V30, V31
// and V29 below).
VXAR $50, V28.D2, V24.D2, V28.D2
70+
VXAR $62, V30.D2, V21.D2, V24.D2
71+
VXAR $9, V27.D2, V8.D2, V8.D2
72+
VXAR $19, V30.D2, V16.D2, V4.D2
73+
VXAR $28, V29.D2, V5.D2, V16.D2
74+
75+
VXAR $36, V27.D2, V3.D2, V5.D2
76+
77+
// Lane 0 is not rotated, so a plain XOR with its theta term is enough.
VEOR V29.B16, V0.B16, V0.B16
78+
79+
VXAR $43, V27.D2, V18.D2, V27.D2
80+
VXAR $49, V31.D2, V17.D2, V3.D2
81+
VXAR $54, V30.D2, V11.D2, V30.D2
82+
VXAR $58, V31.D2, V7.D2, V31.D2
83+
VXAR $61, V29.D2, V10.D2, V29.D2
84+
85+
// chi and iota: BCAX (bit clear and XOR) evaluates chi's x ^ (^y & z) in
// one instruction. Each group of five handles one row and writes the
// final lanes for this round back into V0-V24.
86+
VBCAX V8.B16, V22.B16, V26.B16, V20.B16
87+
VBCAX V22.B16, V23.B16, V8.B16, V21.B16
88+
VBCAX V23.B16, V24.B16, V22.B16, V22.B16
89+
VBCAX V24.B16, V26.B16, V23.B16, V23.B16
90+
VBCAX V26.B16, V8.B16, V24.B16, V24.B16
91+
92+
// V26 is free now: load this round's constant, replicated into both
// 64-bit elements, and advance R1 by 8 bytes to the next constant.
VLD1R.P 8(R1), [V26.D2]
93+
94+
VBCAX V3.B16, V19.B16, V30.B16, V17.B16
95+
VBCAX V19.B16, V15.B16, V3.B16, V18.B16
96+
VBCAX V15.B16, V16.B16, V19.B16, V19.B16
97+
VBCAX V16.B16, V30.B16, V15.B16, V15.B16
98+
VBCAX V30.B16, V3.B16, V16.B16, V16.B16
99+
100+
VBCAX V31.B16, V12.B16, V25.B16, V10.B16
101+
VBCAX V12.B16, V13.B16, V31.B16, V11.B16
102+
VBCAX V13.B16, V14.B16, V12.B16, V12.B16
103+
VBCAX V14.B16, V25.B16, V13.B16, V13.B16
104+
VBCAX V25.B16, V31.B16, V14.B16, V14.B16
105+
106+
VBCAX V4.B16, V9.B16, V29.B16, V7.B16
107+
VBCAX V9.B16, V5.B16, V4.B16, V8.B16
108+
VBCAX V5.B16, V6.B16, V9.B16, V9.B16
109+
VBCAX V6.B16, V29.B16, V5.B16, V5.B16
110+
VBCAX V29.B16, V4.B16, V6.B16, V6.B16
111+
112+
VBCAX V28.B16, V0.B16, V27.B16, V3.B16
113+
VBCAX V0.B16, V1.B16, V28.B16, V4.B16
114+
115+
VBCAX V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part)
116+
VEOR V26.B16, V0.B16, V0.B16 // iota
117+
118+
VBCAX V2.B16, V27.B16, V1.B16, V1.B16
119+
VBCAX V27.B16, V28.B16, V2.B16, V2.B16
120+
121+
// Count the round down and repeat until all 24 have run.
SUB $1, R2, R2
122+
CBNZ R2, loop
123+
124+
// Write the 25 lanes back to the state, mirroring the load sequence.
VST1.P [V0.D1, V1.D1], 16(R0)
125+
VST1.P [V2.D1, V3.D1], 16(R0)
126+
VST1.P [V4.D1, V5.D1], 16(R0)
127+
VST1.P [V6.D1, V7.D1], 16(R0)
128+
VST1.P [V8.D1, V9.D1], 16(R0)
129+
VST1.P [V10.D1, V11.D1], 16(R0)
130+
VST1.P [V12.D1, V13.D1], 16(R0)
131+
VST1.P [V14.D1, V15.D1], 16(R0)
132+
VST1.P [V16.D1, V17.D1], 16(R0)
133+
VST1.P [V18.D1, V19.D1], 16(R0)
134+
VST1.P [V20.D1, V21.D1], 16(R0)
135+
VST1.P [V22.D1, V23.D1], 16(R0)
136+
VST1 [V24.D1], (R0)
137+
138+
RET
139+
140+
// round_consts holds the 24 64-bit Keccak round constants (192 bytes,
// read-only, file-local). keccakF1600NEON walks the table with a
// post-incrementing pointer and reads one entry per round, so the entries
// must stay in round order.
DATA round_consts<>+0x00(SB)/8, $0x0000000000000001
141+
DATA round_consts<>+0x08(SB)/8, $0x0000000000008082
142+
DATA round_consts<>+0x10(SB)/8, $0x800000000000808a
143+
DATA round_consts<>+0x18(SB)/8, $0x8000000080008000
144+
DATA round_consts<>+0x20(SB)/8, $0x000000000000808b
145+
DATA round_consts<>+0x28(SB)/8, $0x0000000080000001
146+
DATA round_consts<>+0x30(SB)/8, $0x8000000080008081
147+
DATA round_consts<>+0x38(SB)/8, $0x8000000000008009
148+
DATA round_consts<>+0x40(SB)/8, $0x000000000000008a
149+
DATA round_consts<>+0x48(SB)/8, $0x0000000000000088
150+
DATA round_consts<>+0x50(SB)/8, $0x0000000080008009
151+
DATA round_consts<>+0x58(SB)/8, $0x000000008000000a
152+
DATA round_consts<>+0x60(SB)/8, $0x000000008000808b
153+
DATA round_consts<>+0x68(SB)/8, $0x800000000000008b
154+
DATA round_consts<>+0x70(SB)/8, $0x8000000000008089
155+
DATA round_consts<>+0x78(SB)/8, $0x8000000000008003
156+
DATA round_consts<>+0x80(SB)/8, $0x8000000000008002
157+
DATA round_consts<>+0x88(SB)/8, $0x8000000000000080
158+
DATA round_consts<>+0x90(SB)/8, $0x000000000000800a
159+
DATA round_consts<>+0x98(SB)/8, $0x800000008000000a
160+
DATA round_consts<>+0xA0(SB)/8, $0x8000000080008081
161+
DATA round_consts<>+0xA8(SB)/8, $0x8000000000008080
162+
DATA round_consts<>+0xB0(SB)/8, $0x0000000080000001
163+
DATA round_consts<>+0xB8(SB)/8, $0x8000000080008008
164+
GLOBL round_consts<>(SB), NOPTR|RODATA, $192

src/crypto/internal/fips140/sha3/sha3_noasm.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (!amd64 && !s390x) || purego
5+
//go:build (!amd64 && !arm64 && !s390x) || purego
66

77
package sha3
88

src/crypto/internal/fips140deps/cpu/cpu.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ var (
2222
ARM64HasPMULL = cpu.ARM64.HasPMULL
2323
ARM64HasSHA2 = cpu.ARM64.HasSHA2
2424
ARM64HasSHA512 = cpu.ARM64.HasSHA512
25+
ARM64HasSHA3 = cpu.ARM64.HasSHA3
2526

2627
LOONG64HasLSX = cpu.Loong64.HasLSX
2728
LOONG64HasLASX = cpu.Loong64.HasLASX

0 commit comments

Comments
 (0)