@@ -5,6 +5,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.InteropServices;
+using static System.IO.Hashing.VectorHelper;
 
 namespace System.IO.Hashing
 {
@@ -17,7 +18,9 @@ public partial class Crc32
         private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
             BitConverter.IsLittleEndian
             && VectorHelper.IsSupported
-            && source.Length >= Vector128<byte>.Count * 4;
+            // Vectorization can process spans as short as a single vector (16 bytes), but if ARM intrinsics are
+            // supported they seem to be more performant for spans shorter than 8 vectors (128 bytes).
+            && source.Length >= Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1);
 
         // Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
         // followed by processing 16 byte chunks, and then processing remaining bytes individually. Requires
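Note: the hunk below rewrites each fold step in terms of a FoldPolynomialPair helper, presumably defined on VectorHelper (the new using static directive lets it and the other helpers be called unqualified); its definition is not part of this diff. Judging from the CarrylessMultiplyLower/Upper and XOR sequences it replaces, a minimal sketch consistent with the old code would be:

    // Hypothetical reconstruction of VectorHelper.FoldPolynomialPair; the actual
    // definition is outside this diff. One fold step: carry-less multiply the low
    // and high 64-bit halves of the accumulator by a pair of k-constants, then
    // XOR both products into the next block of data.
    public static Vector128<ulong> FoldPolynomialPair(Vector128<ulong> left, Vector128<ulong> right, Vector128<ulong> polynomial)
    {
        Vector128<ulong> result = CarrylessMultiplyUpper(right, polynomial);
        result ^= CarrylessMultiplyLower(right, polynomial);
        result ^= left;
        return result;
    }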
@@ -35,102 +38,81 @@ private static uint UpdateVectorized(uint crc, ReadOnlySpan<byte> source)
             ref byte srcRef = ref MemoryMarshal.GetReference(source);
             int length = source.Length;
 
-            Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-            Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-            Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-            Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-            Vector128<ulong> x5;
+            Vector128<ulong> kConstants;
+            Vector128<ulong> x1; // Accumulator for the new CRC
+            Vector128<ulong> x2;
 
-            x1 ^= Vector128.CreateScalar(crc).AsUInt64();
-            Vector128<ulong> x0 = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
-
-            srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
-            length -= Vector128<byte>.Count * 4;
-
-            // Parallel fold blocks of 64, if any.
-            while (length >= Vector128<byte>.Count * 4)
+            if (length >= Vector128<byte>.Count * 8)
             {
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
-                Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
-                Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);
-
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
-                x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
-                x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);
-
-                Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-                Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-                Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-                Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-
-                x1 ^= x5;
-                x2 ^= x6;
-                x3 ^= x7;
-                x4 ^= x8;
-
-                x1 ^= y5;
-                x2 ^= y6;
-                x3 ^= y7;
-                x4 ^= y8;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
                 length -= Vector128<byte>.Count * 4;
-            }
-
-            // Fold into 128-bits.
-            x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x2;
-            x1 ^= x5;
+                // Load and XOR the initial CRC value
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();
+
+                kConstants = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
+
+                // Parallel fold blocks of 64, if any.
+                do
+                {
+                    Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                    Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                    Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                    Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
+
+                    x1 = FoldPolynomialPair(y5, x1, kConstants);
+                    x2 = FoldPolynomialPair(y6, x2, kConstants);
+                    x3 = FoldPolynomialPair(y7, x3, kConstants);
+                    x4 = FoldPolynomialPair(y8, x4, kConstants);
+
+                    srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
+                    length -= Vector128<byte>.Count * 4;
+                } while (length >= Vector128<byte>.Count * 4);
+
+                // Fold into 128-bits.
+                kConstants = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
+                x1 = FoldPolynomialPair(x2, x1, kConstants);
+                x1 = FoldPolynomialPair(x3, x1, kConstants);
+                x1 = FoldPolynomialPair(x4, x1, kConstants);
+            }
+            else
+            {
+                // For shorter sources just load the first vector and XOR with the CRC
+                Debug.Assert(length >= 16);
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x3;
-            x1 ^= x5;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();
 
-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x4;
-            x1 ^= x5;
+                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
+                length -= Vector128<byte>.Count;
+            }
 
             // Single fold blocks of 16, if any.
             while (length >= Vector128<byte>.Count)
             {
-                x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x1 ^= x2;
-                x1 ^= x5;
+                x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1,
+                    Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL));
 
                 srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
                 length -= Vector128<byte>.Count;
             }
 
             // Fold 128 bits to 64 bits.
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
-            x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
-            x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
-            x1 ^= x2;
-
-            x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0
-
-            x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
-            x1 &= x3;
-            x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 ^= x2;
+            Vector128<ulong> bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
+            x1 = ShiftRightBytesInVector(x1, 8) ^
+                 CarrylessMultiplyLower(x1, Vector128.CreateScalar(0x00ccaa009eUL));
+            x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(0x0163cd6124UL)) ^ // k5, k0
+                 ShiftRightBytesInVector(x1, 4);
 
             // Reduce to 32 bits.
-            x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
-
-            x2 = x1 & x3;
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
-            x2 &= x3;
-            x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
+            kConstants = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
+            x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask;
+            x2 = CarrylessMultiplyLower(x2, kConstants);
             x1 ^= x2;
 
             // Process the remaining bytes, if any
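Note: both reduction steps above rest on carry-less multiplication, which the CarrylessMultiply* helpers expose over the PCLMULQDQ (x86) and PMULL (ARM) instructions: ordinary long multiplication with XOR in place of addition, so partial products never carry between bit positions. A scalar sketch of the 64x64 -> 128-bit product computed per lane (illustrative only, not the runtime's implementation):

    // Carry-less (polynomial, GF(2)) multiply of two 64-bit values into a
    // 128-bit (hi, lo) result: shift-and-XOR instead of shift-and-add.
    static (ulong Hi, ulong Lo) Clmul64(ulong a, ulong b)
    {
        ulong hi = 0, lo = 0;
        for (int i = 0; i < 64; i++)
        {
            if (((b >> i) & 1) != 0)
            {
                lo ^= a << i;             // low 64 bits of (a << i)
                if (i != 0)
                {
                    hi ^= a >> (64 - i);  // bits shifted out of the low word
                }
            }
        }
        return (hi, lo);
    }

CarrylessMultiplyLower applies this to the low 64-bit lanes of its two vector arguments and CarrylessMultiplyUpper to the high lanes; the k1..k5 fold constants are precomputed powers of x modulo the CRC-32 polynomial, which is what lets 64-byte blocks be folded in parallel.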