Skip to content

Commit 722fbd0

Browse files
authored
Implement an SSE2 accelerated version of zend_adler32 (#10507)
When benchmarking the file cache of opcache on index.php from a dummy WordPress install, I noticed that 36.42% of the time was spent in zend_adler32 to verify the checksums of the files. Callgrind reported that 332,731,216 instructions were executed during that run and average time to execute the index file was around 91ms. This patch implements an SSE2 accelerated version of zend_adler32, which reduces the number of instructions executed on that bench to 248,600,983, which is a reduction of ~25%. There is also a decrease in wallclock time measurable: around 10ms. Now only 16.05% of the time is spent computing checksums. The benchmark tests were performed using Callgrind, and time for the wallclock time. These tests were executed multiple times and their results were averaged. The WordPress install only contains two almost-blank posts.
1 parent d3abcae commit 722fbd0

File tree

1 file changed

+66
-14
lines changed

1 file changed

+66
-14
lines changed

ext/opcache/zend_accelerator_util_funcs.c

+66-14
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@
2727
#include "zend_shared_alloc.h"
2828
#include "zend_observer.h"
2929

30+
#ifdef __SSE2__
31+
/* For SSE2 adler32 */
32+
#include <immintrin.h>
33+
#endif
34+
3035
typedef int (*id_function_t)(void *, void *);
3136
typedef void (*unique_copy_ctor_func_t)(void *pElement);
3237

@@ -451,11 +456,62 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
451456
#define ADLER32_NMAX 5552
452457
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
453458

454-
#define ADLER32_DO1(buf) {s1 += *(buf); s2 += s1;}
455-
#define ADLER32_DO2(buf, i) ADLER32_DO1(buf + i); ADLER32_DO1(buf + i + 1);
456-
#define ADLER32_DO4(buf, i) ADLER32_DO2(buf, i); ADLER32_DO2(buf, i + 2);
457-
#define ADLER32_DO8(buf, i) ADLER32_DO4(buf, i); ADLER32_DO4(buf, i + 4);
458-
#define ADLER32_DO16(buf) ADLER32_DO8(buf, 0); ADLER32_DO8(buf, 8);
459+
#define ADLER32_SCALAR_DO1(buf) {s1 += *(buf); s2 += s1;}
460+
#define ADLER32_SCALAR_DO2(buf, i) ADLER32_SCALAR_DO1(buf + i); ADLER32_SCALAR_DO1(buf + i + 1);
461+
#define ADLER32_SCALAR_DO4(buf, i) ADLER32_SCALAR_DO2(buf, i); ADLER32_SCALAR_DO2(buf, i + 2);
462+
#define ADLER32_SCALAR_DO8(buf, i) ADLER32_SCALAR_DO4(buf, i); ADLER32_SCALAR_DO4(buf, i + 4);
463+
#define ADLER32_SCALAR_DO16(buf) ADLER32_SCALAR_DO8(buf, 0); ADLER32_SCALAR_DO8(buf, 8);
464+
465+
static zend_always_inline void adler32_do16_loop(unsigned char *buf, unsigned char *end, unsigned int *s1_out, unsigned int *s2_out)
466+
{
467+
unsigned int s1 = *s1_out;
468+
unsigned int s2 = *s2_out;
469+
470+
#ifdef __SSE2__
471+
const __m128i zero = _mm_setzero_si128();
472+
473+
__m128i accumulate_s2 = zero;
474+
unsigned int accumulate_s1 = 0;
475+
476+
do {
477+
__m128i read = _mm_loadu_si128((__m128i *) buf); /* [A:P] */
478+
479+
/* Split the 8-bit-element vector into two 16-bit-element vectors where each element gets zero-extended from 8-bits to 16-bits */
480+
__m128i lower = _mm_unpacklo_epi8(read, zero); /* [A:H] zero-extended to 16-bits */
481+
__m128i higher = _mm_unpackhi_epi8(read, zero); /* [I:P] zero-extended to 16-bits */
482+
lower = _mm_madd_epi16(lower, _mm_set_epi16(9, 10, 11, 12, 13, 14, 15, 16)); /* [A * 16:H * 9] */
483+
higher = _mm_madd_epi16(higher, _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8)); /* [I * 8:P * 1] */
484+
485+
/* We'll cheat here: it's difficult to add 16-bit elementwise, but we can do 32-bit additions.
486+
* The highest value the sum of two elements of the vectors can take is 0xff * 16 + 0xff * 8 < 0xffff.
487+
* That means there is no carry possible from 16->17 bits so the 32-bit addition is safe. */
488+
__m128i sum = _mm_add_epi32(lower, higher); /* [A * 16 + I * 8:H * 9 + P * 1] */
489+
accumulate_s2 = _mm_add_epi32(accumulate_s2, sum);
490+
accumulate_s1 += s1;
491+
492+
/* Computes 8-bit element-wise abs(buf - zero) and then sums the elements into two 16 bit parts */
493+
sum = _mm_sad_epu8(read, zero);
494+
s1 += _mm_cvtsi128_si32(sum) + _mm_extract_epi16(sum, 4);
495+
496+
buf += 16;
497+
} while (buf != end);
498+
499+
/* For convenience, let's do a rename of variables and let accumulate_s2 = [X, Y, Z, W] */
500+
__m128i shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(1, 0, 0, 2)); /* [Y, X, X, Z] */
501+
accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [X + Y, Y + X, Z + X, W + Z] */
502+
shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(3, 3, 3, 3)); /* [X + Y, X + Y, X + Y, X + Y] */
503+
accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [/, /, /, W + Z + X + Y] */
504+
s2 += accumulate_s1 * 16 + _mm_cvtsi128_si32(accumulate_s2);
505+
#else
506+
do {
507+
ADLER32_SCALAR_DO16(buf);
508+
buf += 16;
509+
} while (buf != end);
510+
#endif
511+
512+
*s1_out = s1;
513+
*s2_out = s2;
514+
}
459515

460516
unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t len)
461517
{
@@ -466,10 +522,8 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
466522
while (len >= ADLER32_NMAX) {
467523
len -= ADLER32_NMAX;
468524
end = buf + ADLER32_NMAX;
469-
do {
470-
ADLER32_DO16(buf);
471-
buf += 16;
472-
} while (buf != end);
525+
adler32_do16_loop(buf, end, &s1, &s2);
526+
buf = end;
473527
s1 %= ADLER32_BASE;
474528
s2 %= ADLER32_BASE;
475529
}
@@ -478,15 +532,13 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
478532
if (len >= 16) {
479533
end = buf + (len & 0xfff0);
480534
len &= 0xf;
481-
do {
482-
ADLER32_DO16(buf);
483-
buf += 16;
484-
} while (buf != end);
535+
adler32_do16_loop(buf, end, &s1, &s2);
536+
buf = end;
485537
}
486538
if (len) {
487539
end = buf + len;
488540
do {
489-
ADLER32_DO1(buf);
541+
ADLER32_SCALAR_DO1(buf);
490542
buf++;
491543
} while (buf != end);
492544
}

0 commit comments

Comments
 (0)