Skip to content

Commit ed33e6c

Browse files
authored
Vectorize shorter buffers for CRC-32 on Intel (#86539)
1 parent 52597f5 commit ed33e6c

File tree

2 files changed

+82
-99
lines changed

2 files changed

+82
-99
lines changed

src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc32.Vectorized.cs

Lines changed: 59 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.Intrinsics;
77
using System.Runtime.InteropServices;
8+
using static System.IO.Hashing.VectorHelper;
89

910
namespace System.IO.Hashing
1011
{
@@ -17,7 +18,9 @@ public partial class Crc32
1718
private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
1819
BitConverter.IsLittleEndian
1920
&& VectorHelper.IsSupported
20-
&& source.Length >= Vector128<byte>.Count * 4;
21+
// Vectorization can process spans as short as a single vector (16 bytes), but when the ARM CRC32 intrinsics are
22+
// supported, those intrinsics seem to be more performant for spans shorter than 8 vectors (128 bytes).
23+
&& source.Length >= Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1);
2124

2225
// Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
2326
// followed by processing 16 byte chunks, and then processing remaining bytes individually. Requires
@@ -35,102 +38,81 @@ private static uint UpdateVectorized(uint crc, ReadOnlySpan<byte> source)
3538
ref byte srcRef = ref MemoryMarshal.GetReference(source);
3639
int length = source.Length;
3740

38-
Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
39-
Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
40-
Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
41-
Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
42-
Vector128<ulong> x5;
41+
Vector128<ulong> kConstants;
42+
Vector128<ulong> x1; // Accumulator for the new CRC
43+
Vector128<ulong> x2;
4344

44-
x1 ^= Vector128.CreateScalar(crc).AsUInt64();
45-
Vector128<ulong> x0 = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
46-
47-
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
48-
length -= Vector128<byte>.Count * 4;
49-
50-
// Parallel fold blocks of 64, if any.
51-
while (length >= Vector128<byte>.Count * 4)
45+
if (length >= Vector128<byte>.Count * 8)
5246
{
53-
x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
54-
Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
55-
Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
56-
Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);
57-
58-
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
59-
x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
60-
x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
61-
x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);
62-
63-
Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
64-
Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
65-
Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
66-
Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
67-
68-
x1 ^= x5;
69-
x2 ^= x6;
70-
x3 ^= x7;
71-
x4 ^= x8;
72-
73-
x1 ^= y5;
74-
x2 ^= y6;
75-
x3 ^= y7;
76-
x4 ^= y8;
47+
x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
48+
x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
49+
Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
50+
Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
7751

7852
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
7953
length -= Vector128<byte>.Count * 4;
80-
}
81-
82-
// Fold into 128-bits.
83-
x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
8454

85-
x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
86-
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
87-
x1 ^= x2;
88-
x1 ^= x5;
55+
// Load and XOR the initial CRC value
56+
x1 ^= Vector128.CreateScalar(crc).AsUInt64();
57+
58+
kConstants = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
59+
60+
// Parallel fold blocks of 64, if any.
61+
do
62+
{
63+
Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
64+
Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
65+
Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
66+
Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
67+
68+
x1 = FoldPolynomialPair(y5, x1, kConstants);
69+
x2 = FoldPolynomialPair(y6, x2, kConstants);
70+
x3 = FoldPolynomialPair(y7, x3, kConstants);
71+
x4 = FoldPolynomialPair(y8, x4, kConstants);
72+
73+
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
74+
length -= Vector128<byte>.Count * 4;
75+
} while (length >= Vector128<byte>.Count * 4);
76+
77+
// Fold into 128-bits.
78+
kConstants = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
79+
x1 = FoldPolynomialPair(x2, x1, kConstants);
80+
x1 = FoldPolynomialPair(x3, x1, kConstants);
81+
x1 = FoldPolynomialPair(x4, x1, kConstants);
82+
}
83+
else
84+
{
85+
// For shorter sources just load the first vector and XOR with the CRC
86+
Debug.Assert(length >= 16);
8987

90-
x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
91-
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
92-
x1 ^= x3;
93-
x1 ^= x5;
88+
x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
89+
x1 ^= Vector128.CreateScalar(crc).AsUInt64();
9490

95-
x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
96-
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
97-
x1 ^= x4;
98-
x1 ^= x5;
91+
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
92+
length -= Vector128<byte>.Count;
93+
}
9994

10095
// Single fold blocks of 16, if any.
10196
while (length >= Vector128<byte>.Count)
10297
{
103-
x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
104-
105-
x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
106-
x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
107-
x1 ^= x2;
108-
x1 ^= x5;
98+
x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1,
99+
Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL));
109100

110101
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
111102
length -= Vector128<byte>.Count;
112103
}
113104

114105
// Fold 128 bits to 64 bits.
115-
x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
116-
x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
117-
x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
118-
x1 ^= x2;
119-
120-
x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0
121-
122-
x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
123-
x1 &= x3;
124-
x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
125-
x1 ^= x2;
106+
Vector128<ulong> bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
107+
x1 = ShiftRightBytesInVector(x1, 8) ^
108+
CarrylessMultiplyLower(x1, Vector128.CreateScalar(0x00ccaa009eUL));
109+
x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(0x0163cd6124UL)) ^ // k5, k0
110+
ShiftRightBytesInVector(x1, 4);
126111

127112
// Reduce to 32 bits.
128-
x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
129-
130-
x2 = x1 & x3;
131-
x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
132-
x2 &= x3;
133-
x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
113+
kConstants = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
114+
x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask;
115+
x2 = CarrylessMultiplyLower(x2, kConstants);
134116
x1 ^= x2;
135117

136118
// Process the remaining bytes, if any

src/libraries/System.IO.Hashing/src/System/IO/Hashing/Crc64.Vectorized.cs

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.Intrinsics;
77
using System.Runtime.InteropServices;
8+
using static System.IO.Hashing.VectorHelper;
89

910
namespace System.IO.Hashing
1011
{
@@ -72,7 +73,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
7273
// Load and XOR the initial CRC value
7374
// The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register
7475
// because the data will be byte-reflected and will then align with the initial CRC at the correct place.
75-
x0 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
76+
x0 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));
7677

7778
kConstants = Vector128.Create(0x5cf79dea9ac37d6UL, 0x001067e571d7d5c2UL); // k3, k4
7879

@@ -81,36 +82,36 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
8182
{
8283
Vector128<ulong> y1 = LoadFromSource(ref srcRef, 0);
8384
Vector128<ulong> y2 = LoadFromSource(ref srcRef, 16);
84-
x0 = VectorHelper.FoldPolynomialPair(y1, x0, kConstants);
85-
x1 = VectorHelper.FoldPolynomialPair(y2, x1, kConstants);
85+
x0 = FoldPolynomialPair(y1, x0, kConstants);
86+
x1 = FoldPolynomialPair(y2, x1, kConstants);
8687

8788
y1 = LoadFromSource(ref srcRef, 32);
8889
y2 = LoadFromSource(ref srcRef, 48);
89-
x2 = VectorHelper.FoldPolynomialPair(y1, x2, kConstants);
90-
x3 = VectorHelper.FoldPolynomialPair(y2, x3, kConstants);
90+
x2 = FoldPolynomialPair(y1, x2, kConstants);
91+
x3 = FoldPolynomialPair(y2, x3, kConstants);
9192

9293
y1 = LoadFromSource(ref srcRef, 64);
9394
y2 = LoadFromSource(ref srcRef, 80);
94-
x4 = VectorHelper.FoldPolynomialPair(y1, x4, kConstants);
95-
x5 = VectorHelper.FoldPolynomialPair(y2, x5, kConstants);
95+
x4 = FoldPolynomialPair(y1, x4, kConstants);
96+
x5 = FoldPolynomialPair(y2, x5, kConstants);
9697

9798
y1 = LoadFromSource(ref srcRef, 96);
9899
y2 = LoadFromSource(ref srcRef, 112);
99-
x6 = VectorHelper.FoldPolynomialPair(y1, x6, kConstants);
100-
x7 = VectorHelper.FoldPolynomialPair(y2, x7, kConstants);
100+
x6 = FoldPolynomialPair(y1, x6, kConstants);
101+
x7 = FoldPolynomialPair(y2, x7, kConstants);
101102

102103
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 8);
103104
length -= Vector128<byte>.Count * 8;
104105
} while (length >= Vector128<byte>.Count * 8);
105106

106107
// Fold into 128-bits in x7
107-
x7 = VectorHelper.FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
108-
x7 = VectorHelper.FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
109-
x7 = VectorHelper.FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
110-
x7 = VectorHelper.FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
111-
x7 = VectorHelper.FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
112-
x7 = VectorHelper.FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
113-
x7 = VectorHelper.FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
108+
x7 = FoldPolynomialPair(x7, x0, Vector128.Create(0xe464f4df5fb60ac1UL, 0xb649c5b35a759cf2UL)); // k9, k10
109+
x7 = FoldPolynomialPair(x7, x1, Vector128.Create(0x9af04e1eff82d0ddUL, 0x6e82e609297f8fe8UL)); // k11, k12
110+
x7 = FoldPolynomialPair(x7, x2, Vector128.Create(0x97c516e98bd2e73UL, 0xb76477b31e22e7bUL)); // k13, k14
111+
x7 = FoldPolynomialPair(x7, x3, Vector128.Create(0x5f6843ca540df020UL, 0xddf4b6981205b83fUL)); // k15, k16
112+
x7 = FoldPolynomialPair(x7, x4, Vector128.Create(0x54819d8713758b2cUL, 0x4a6b90073eb0af5aUL)); // k17, k18
113+
x7 = FoldPolynomialPair(x7, x5, Vector128.Create(0x571bee0a227ef92bUL, 0x44bef2a201b5200cUL)); // k19, k20
114+
x7 = FoldPolynomialPair(x7, x6, Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
114115
}
115116
else
116117
{
@@ -122,7 +123,7 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
122123
// Load and XOR the initial CRC value
123124
// The CRC value does not need to be byte-reflected, but it needs to be moved to the high part of the register
124125
// because the data will be byte-reflected and will then align with the initial CRC at the correct place.
125-
x7 ^= VectorHelper.ShiftLowerToUpper(Vector128.CreateScalar(crc));
126+
x7 ^= ShiftLowerToUpper(Vector128.CreateScalar(crc));
126127

127128
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
128129
length -= Vector128<byte>.Count;
@@ -131,22 +132,22 @@ private static ulong UpdateVectorized(ulong crc, ReadOnlySpan<byte> source)
131132
// Single fold blocks of 16, if any, into x7
132133
while (length >= Vector128<byte>.Count)
133134
{
134-
x7 = VectorHelper.FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
135+
x7 = FoldPolynomialPair(LoadFromSource(ref srcRef, 0), x7,
135136
Vector128.Create(0x5f5c3c7eb52fab6UL, 0x4eb938a7d257740eUL)); // k1, k2
136137

137138
srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
138139
length -= Vector128<byte>.Count;
139140
}
140141

141142
// Compute CRC of a 128-bit value and fold to the upper 64-bits
142-
x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
143-
VectorHelper.ShiftLowerToUpper(x7);
143+
x7 = CarrylessMultiplyLeftUpperRightLower(x7, Vector128.CreateScalar(0x5f5c3c7eb52fab6UL)) ^ // k5
144+
ShiftLowerToUpper(x7);
144145

145146
// Barrett reduction
146147
kConstants = Vector128.Create(0x578d29d06cc4f872UL, 0x42f0e1eba9ea3693UL); // k7, k8
147148
Vector128<ulong> temp = x7;
148-
x7 = VectorHelper.CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
149-
x7 = VectorHelper.CarrylessMultiplyUpper(x7, kConstants);
149+
x7 = CarrylessMultiplyLeftUpperRightLower(x7, kConstants) ^ (x7 & Vector128.Create(0UL, ~0UL));
150+
x7 = CarrylessMultiplyUpper(x7, kConstants);
150151
x7 ^= temp;
151152

152153
// Process the remaining bytes, if any

0 commit comments

Comments
 (0)