Skip to content

Commit 1517ef4

Browse files
j-naylorr-devulapPaul Amonson
authored and
Commitfest Bot
committed
Improve CRC32C performance on recent x86_64 platforms
The previous implementation of CRC32C on x86 relied on the native CRC32 instruction from the SSE 4.2 extension, which operates on up to 8 bytes at a time. We can get a substantial speedup by using carryless multiplication on SIMD registers, processing 64 bytes per loop iteration. Shorter inputs fall back to ordinary CRC instructions. The VPCLMULQDQ instruction on 512-bit registers has been available on Intel hardware since 2019 and AMD since 2022. There is an older variant for 128-bit registers, but at least on Zen 2 it performs worse than normal CRC instructions for short inputs. (Thanks to David Rowley for testing on that platform.) We must now do a runtime check, even for builds that target SSE 4.2. This doesn't matter in practice for WAL (arguably the most critical case), because since commit af0c248 the final computation with the 20-byte WAL header is inlined and unrolled. Compared with two direct function calls, testing showed equal or slightly faster performance in performing an indirect function call on several dozen bytes followed by inlined instructions on constant input of 20 bytes. The MIT-licensed implementation was generated with the "generate" program from https://github.com/corsix/fast-crc32/ Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 Co-authored-by: Raghuveer Devulapalli <[email protected]> Co-authored-by: Paul Amonson <[email protected]> Reviewed-by: Nathan Bossart <[email protected]> Reviewed-by: Andres Freund <[email protected]> (earlier version) Tested-by: Raghuveer Devulapalli <[email protected]> Tested-by: Matthew Sterrett <[email protected]> (earlier version) Discussion: https://postgr.es/m/BL1PR11MB530401FA7E9B1CA432CF9DC3DC192@BL1PR11MB5304.namprd11.prod.outlook.com Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
1 parent f104192 commit 1517ef4

File tree

11 files changed

+408
-60
lines changed

11 files changed

+408
-60
lines changed

config/c-compiler.m4

+37
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,43 @@ fi
581581
undefine([Ac_cachevar])dnl
582582
])# PGAC_SSE42_CRC32_INTRINSICS
583583

584+
# PGAC_AVX512_PCLMUL_INTRINSICS
585+
# ---------------------------
586+
# Check if the compiler supports AVX-512 carryless multiplication
587+
# and AVX-512VL instructions used for computing CRC. AVX-512F is
588+
# assumed to be supported if the above are.
589+
#
590+
# If the intrinsics are supported, sets pgac_avx512_pclmul_intrinsics.
591+
AC_DEFUN([PGAC_AVX512_PCLMUL_INTRINSICS],
592+
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_pclmul_intrinsics])])dnl
593+
AC_CACHE_CHECK([for _mm512_clmulepi64_epi128], [Ac_cachevar],
594+
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
595+
__m512i x;
596+
__m512i y;
597+
598+
#if defined(__has_attribute) && __has_attribute (target)
599+
__attribute__((target("vpclmulqdq,avx512vl")))
600+
#endif
601+
static int avx512_pclmul_test(void)
602+
{
603+
__m128i z;
604+
605+
y = _mm512_clmulepi64_epi128(x, y, 0);
606+
z = _mm_ternarylogic_epi64(
607+
_mm512_castsi512_si128(y),
608+
_mm512_extracti32x4_epi32(y, 1),
609+
_mm512_extracti32x4_epi32(y, 2),
610+
0x96);
611+
return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
612+
}],
613+
[return avx512_pclmul_test();])],
614+
[Ac_cachevar=yes],
615+
[Ac_cachevar=no])])
616+
if test x"$Ac_cachevar" = x"yes"; then
617+
pgac_avx512_pclmul_intrinsics=yes
618+
fi
619+
undefine([Ac_cachevar])dnl
620+
])# PGAC_AVX512_PCLMUL_INTRINSICS
584621

585622
# PGAC_ARMV8_CRC32C_INTRINSICS
586623
# ----------------------------

configure

+80-11
Original file line numberDiff line numberDiff line change
@@ -17864,17 +17864,21 @@ fi
1786417864

1786517865
# Select CRC-32C implementation.
1786617866
#
17867-
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
17868-
# use the special CRC instructions for calculating CRC-32C. If we're not
17869-
# targeting such a processor, but we can nevertheless produce code that uses
17870-
# the SSE intrinsics, compile both implementations and select which one to use
17871-
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
17872-
# running on.
17867+
# There are three methods of calculating CRC, in order of increasing
17868+
# performance:
1787317869
#
17874-
# Similarly, if we are targeting an ARM processor that has the CRC
17875-
# instructions that are part of the ARMv8 CRC Extension, use them. And if
17876-
# we're not targeting such a processor, but can nevertheless produce code that
17877-
# uses the CRC instructions, compile both, and select at runtime.
17870+
# 1. The fallback using a lookup table, called slicing-by-8
17871+
# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
17872+
# 3. Algorithms using carryless multiplication instructions
17873+
# (e.g. Intel PCLMUL and Arm PMULL)
17874+
#
17875+
# If we can produce code (via function attributes or additional compiler
17876+
# flags) that uses #2 (and possibly #3), we compile all implementations
17877+
# and select which one to use at runtime, depending on what is supported
17878+
# by the processor we're running on.
17879+
#
17880+
# If we are targeting a processor that has #2, we can use that without
17881+
# runtime selection.
1787817882
#
1787917883
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
1788017884
# instructions because until clang 16, using the ARM intrinsics still requires
@@ -17925,7 +17929,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
1792517929

1792617930
$as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
1792717931

17928-
PG_CRC32C_OBJS="pg_crc32c_sse42.o"
17932+
PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
1792917933
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
1793017934
$as_echo "SSE 4.2" >&6; }
1793117935
else
@@ -17974,6 +17978,71 @@ $as_echo "slicing-by-8" >&6; }
1797417978
fi
1797517979

1797617980

17981+
# Check for carryless multiplication intrinsics to do vectorized CRC calculations.
17982+
#
17983+
if test x"$host_cpu" = x"x86_64"; then
17984+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128" >&5
17985+
$as_echo_n "checking for _mm512_clmulepi64_epi128... " >&6; }
17986+
if ${pgac_cv_avx512_pclmul_intrinsics+:} false; then :
17987+
$as_echo_n "(cached) " >&6
17988+
else
17989+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17990+
/* end confdefs.h. */
17991+
#include <immintrin.h>
17992+
__m512i x;
17993+
__m512i y;
17994+
17995+
#if defined(__has_attribute) && __has_attribute (target)
17996+
__attribute__((target("vpclmulqdq,avx512vl")))
17997+
#endif
17998+
static int avx512_pclmul_test(void)
17999+
{
18000+
__m128i z;
18001+
18002+
y = _mm512_clmulepi64_epi128(x, y, 0);
18003+
z = _mm_ternarylogic_epi64(
18004+
_mm512_castsi512_si128(y),
18005+
_mm512_extracti32x4_epi32(y, 1),
18006+
_mm512_extracti32x4_epi32(y, 2),
18007+
0x96);
18008+
return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
18009+
}
18010+
int
18011+
main ()
18012+
{
18013+
return avx512_pclmul_test();
18014+
;
18015+
return 0;
18016+
}
18017+
_ACEOF
18018+
if ac_fn_c_try_link "$LINENO"; then :
18019+
pgac_cv_avx512_pclmul_intrinsics=yes
18020+
else
18021+
pgac_cv_avx512_pclmul_intrinsics=no
18022+
fi
18023+
rm -f core conftest.err conftest.$ac_objext \
18024+
conftest$ac_exeext conftest.$ac_ext
18025+
fi
18026+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_pclmul_intrinsics" >&5
18027+
$as_echo "$pgac_cv_avx512_pclmul_intrinsics" >&6; }
18028+
if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
18029+
pgac_avx512_pclmul_intrinsics=yes
18030+
fi
18031+
18032+
fi
18033+
18034+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
18035+
$as_echo_n "checking for vectorized CRC-32C... " >&6; }
18036+
if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
18037+
18038+
$as_echo "#define USE_AVX512_CRC_WITH_RUNTIME_CHECK 1" >>confdefs.h
18039+
18040+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
18041+
$as_echo "AVX-512 with runtime check" >&6; }
18042+
else
18043+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
18044+
$as_echo "none" >&6; }
18045+
fi
1797718046

1797818047
# Select semaphore implementation type.
1797918048
if test "$PORTNAME" != "win32"; then

configure.ac

+29-12
Original file line numberDiff line numberDiff line change
@@ -2116,17 +2116,21 @@ AC_SUBST(CFLAGS_CRC)
21162116

21172117
# Select CRC-32C implementation.
21182118
#
2119-
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
2120-
# use the special CRC instructions for calculating CRC-32C. If we're not
2121-
# targeting such a processor, but we can nevertheless produce code that uses
2122-
# the SSE intrinsics, compile both implementations and select which one to use
2123-
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
2124-
# running on.
2125-
#
2126-
# Similarly, if we are targeting an ARM processor that has the CRC
2127-
# instructions that are part of the ARMv8 CRC Extension, use them. And if
2128-
# we're not targeting such a processor, but can nevertheless produce code that
2129-
# uses the CRC instructions, compile both, and select at runtime.
2119+
# There are three methods of calculating CRC, in order of increasing
2120+
# performance:
2121+
#
2122+
# 1. The fallback using a lookup table, called slicing-by-8
2123+
# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
2124+
# 3. Algorithms using carryless multiplication instructions
2125+
# (e.g. Intel PCLMUL and Arm PMULL)
2126+
#
2127+
# If we can produce code (via function attributes or additional compiler
2128+
# flags) that uses #2 (and possibly #3), we compile all implementations
2129+
# and select which one to use at runtime, depending on what is supported
2130+
# by the processor we're running on.
2131+
#
2132+
# If we are targeting a processor that has #2, we can use that without
2133+
# runtime selection.
21302134
#
21312135
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
21322136
# instructions because until clang 16, using the ARM intrinsics still requires
@@ -2174,7 +2178,7 @@ fi
21742178
AC_MSG_CHECKING([which CRC-32C implementation to use])
21752179
if test x"$USE_SSE42_CRC32C" = x"1"; then
21762180
AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
2177-
PG_CRC32C_OBJS="pg_crc32c_sse42.o"
2181+
PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
21782182
AC_MSG_RESULT(SSE 4.2)
21792183
else
21802184
if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@@ -2207,6 +2211,19 @@ else
22072211
fi
22082212
AC_SUBST(PG_CRC32C_OBJS)
22092213

2214+
# Check for carryless multiplication intrinsics to do vectorized CRC calculations.
2215+
#
2216+
if test x"$host_cpu" = x"x86_64"; then
2217+
PGAC_AVX512_PCLMUL_INTRINSICS()
2218+
fi
2219+
2220+
AC_MSG_CHECKING([for vectorized CRC-32C])
2221+
if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
2222+
AC_DEFINE(USE_AVX512_CRC_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
2223+
AC_MSG_RESULT(AVX-512 with runtime check)
2224+
else
2225+
AC_MSG_RESULT(none)
2226+
fi
22102227

22112228
# Select semaphore implementation type.
22122229
if test "$PORTNAME" != "win32"; then

meson.build

+47-11
Original file line numberDiff line numberDiff line change
@@ -2349,17 +2349,21 @@ endif
23492349
###############################################################
23502350
# Select CRC-32C implementation.
23512351
#
2352-
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
2353-
# use the special CRC instructions for calculating CRC-32C. If we're not
2354-
# targeting such a processor, but we can nevertheless produce code that uses
2355-
# the SSE intrinsics, compile both implementations and select which one to use
2356-
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
2357-
# running on.
2352+
# There are three methods of calculating CRC, in order of increasing
2353+
# performance:
23582354
#
2359-
# Similarly, if we are targeting an ARM processor that has the CRC
2360-
# instructions that are part of the ARMv8 CRC Extension, use them. And if
2361-
# we're not targeting such a processor, but can nevertheless produce code that
2362-
# uses the CRC instructions, compile both, and select at runtime.
2355+
# 1. The fallback using a lookup table, called slicing-by-8
2356+
# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
2357+
# 3. Algorithms using carryless multiplication instructions
2358+
# (e.g. Intel PCLMUL and Arm PMULL)
2359+
#
2360+
# If we can produce code (via function attributes or additional compiler
2361+
# flags) that uses #2 (and possibly #3), we compile all implementations
2362+
# and select which one to use at runtime, depending on what is supported
2363+
# by the processor we're running on.
2364+
#
2365+
# If we are targeting a processor that has #2, we can use that without
2366+
# runtime selection.
23632367
#
23642368
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
23652369
# instructions because until clang 16, using the ARM intrinsics still requires
@@ -2393,7 +2397,7 @@ int main(void)
23932397
}
23942398
'''
23952399

2396-
if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32',
2400+
if not cc.links(prog, name: 'SSE 4.2 CRC32C',
23972401
args: test_c_args)
23982402
# Do not use Intel SSE 4.2
23992403
elif (cc.get_define('__SSE4_2__') != '')
@@ -2408,6 +2412,38 @@ int main(void)
24082412
have_optimized_crc = true
24092413
endif
24102414

2415+
# Check if the compiler supports AVX-512 carryless multiplication
2416+
# and AVX-512VL instructions used for computing CRC. AVX-512F is
2417+
# assumed to be supported if the above are.
2418+
prog = '''
2419+
#include <immintrin.h>
2420+
__m512i x;
2421+
__m512i y;
2422+
2423+
#if defined(__has_attribute) && __has_attribute (target)
2424+
__attribute__((target("vpclmulqdq,avx512vl")))
2425+
#endif
2426+
int main(void)
2427+
{
2428+
__m128i z;
2429+
2430+
y = _mm512_clmulepi64_epi128(x, y, 0);
2431+
z = _mm_ternarylogic_epi64(
2432+
_mm512_castsi512_si128(y),
2433+
_mm512_extracti32x4_epi32(y, 1),
2434+
_mm512_extracti32x4_epi32(y, 2),
2435+
0x96);
2436+
/* return computed value, to prevent the above being optimized away */
2437+
return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
2438+
}
2439+
'''
2440+
2441+
if cc.links(prog,
2442+
name: 'AVX-512 CRC32C',
2443+
args: test_c_args)
2444+
cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
2445+
endif
2446+
24112447
endif
24122448

24132449
elif host_cpu == 'arm' or host_cpu == 'aarch64'

src/include/pg_config.h.in

+3
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,9 @@
665665
/* Define to 1 to build with assertion checks. (--enable-cassert) */
666666
#undef USE_ASSERT_CHECKING
667667

668+
/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
669+
#undef USE_AVX512_CRC_WITH_RUNTIME_CHECK
670+
668671
/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
669672
#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
670673

src/include/port/pg_crc32c.h

+29-10
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,22 @@ typedef uint32 pg_crc32c;
4242
#define EQ_CRC32C(c1, c2) ((c1) == (c2))
4343

4444
#if defined(USE_SSE42_CRC32C)
45-
/* Use Intel SSE4.2 instructions. */
45+
/*
46+
* Use either Intel SSE 4.2 or AVX-512 instructions. We don't need a runtime check
47+
* for SSE 4.2, so we can inline those in some cases.
48+
*/
4649

4750
#include <nmmintrin.h>
4851

4952
#define COMP_CRC32C(crc, data, len) \
5053
((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
5154
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
5255

56+
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
5357
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
58+
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
59+
extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
60+
#endif
5461

5562
/*
5663
* We can only get here if the host compiler targets SSE 4.2, but on some
@@ -82,9 +89,27 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
8289
return crc;
8390
}
8491
else
85-
return pg_comp_crc32c_sse42(crc, data, len);
92+
/* Otherwise, use a runtime check for AVX-512 instructions. */
93+
return pg_comp_crc32c(crc, data, len);
8694
}
8795

96+
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
97+
98+
/*
99+
* Use Intel SSE 4.2 or AVX-512 instructions, but perform a runtime check first
100+
* to check that they are available.
101+
*/
102+
#define COMP_CRC32C(crc, data, len) \
103+
((crc) = pg_comp_crc32c((crc), (data), (len)))
104+
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
105+
106+
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
107+
extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
108+
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
109+
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
110+
extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
111+
#endif
112+
88113
#elif defined(USE_ARMV8_CRC32C)
89114
/* Use ARMv8 CRC Extension instructions. */
90115

@@ -103,10 +128,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
103128

104129
extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
105130

106-
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
131+
#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
107132

108133
/*
109-
* Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
134+
* Use ARMv8 instructions, but perform a runtime check first
110135
* to check that they are available.
111136
*/
112137
#define COMP_CRC32C(crc, data, len) \
@@ -115,13 +140,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
115140

116141
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
117142
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
118-
119-
#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
120-
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
121-
#endif
122-
#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
123143
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
124-
#endif
125144

126145
#else
127146
/*

src/port/meson.build

+1
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ replace_funcs_pos = [
8686
# x86/x64
8787
['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
8888
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
89+
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
8990
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
9091
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
9192

0 commit comments

Comments
 (0)