27
27
#include "zend_shared_alloc.h"
28
28
#include "zend_observer.h"
29
29
30
+ #ifdef __SSE2__
31
+ /* For SSE2 adler32 */
32
+ #include <immintrin.h>
33
+ #endif
34
+
30
35
typedef int (* id_function_t )(void * , void * );
31
36
typedef void (* unique_copy_ctor_func_t )(void * pElement );
32
37
@@ -451,11 +456,62 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
451
456
#define ADLER32_NMAX 5552
452
457
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
453
458
454
- #define ADLER32_DO1 (buf ) {s1 += *(buf); s2 += s1;}
455
- #define ADLER32_DO2 (buf , i ) ADLER32_DO1(buf + i); ADLER32_DO1(buf + i + 1);
456
- #define ADLER32_DO4 (buf , i ) ADLER32_DO2(buf, i); ADLER32_DO2(buf, i + 2);
457
- #define ADLER32_DO8 (buf , i ) ADLER32_DO4(buf, i); ADLER32_DO4(buf, i + 4);
458
- #define ADLER32_DO16 (buf ) ADLER32_DO8(buf, 0); ADLER32_DO8(buf, 8);
459
+ #define ADLER32_SCALAR_DO1 (buf ) {s1 += *(buf); s2 += s1;}
460
+ #define ADLER32_SCALAR_DO2 (buf , i ) ADLER32_SCALAR_DO1(buf + i); ADLER32_SCALAR_DO1(buf + i + 1);
461
+ #define ADLER32_SCALAR_DO4 (buf , i ) ADLER32_SCALAR_DO2(buf, i); ADLER32_SCALAR_DO2(buf, i + 2);
462
+ #define ADLER32_SCALAR_DO8 (buf , i ) ADLER32_SCALAR_DO4(buf, i); ADLER32_SCALAR_DO4(buf, i + 4);
463
+ #define ADLER32_SCALAR_DO16 (buf ) ADLER32_SCALAR_DO8(buf, 0); ADLER32_SCALAR_DO8(buf, 8);
464
+
465
+ static zend_always_inline void adler32_do16_loop (unsigned char * buf , unsigned char * end , unsigned int * s1_out , unsigned int * s2_out )
466
+ {
467
+ unsigned int s1 = * s1_out ;
468
+ unsigned int s2 = * s2_out ;
469
+
470
+ #ifdef __SSE2__
471
+ const __m128i zero = _mm_setzero_si128 ();
472
+
473
+ __m128i accumulate_s2 = zero ;
474
+ unsigned int accumulate_s1 = 0 ;
475
+
476
+ do {
477
+ __m128i read = _mm_loadu_si128 ((__m128i * ) buf ); /* [A:P] */
478
+
479
+ /* Split the 8-bit-element vector into two 16-bit-element vectors where each element gets zero-extended from 8-bits to 16-bits */
480
+ __m128i lower = _mm_unpacklo_epi8 (read , zero ); /* [A:H] zero-extended to 16-bits */
481
+ __m128i higher = _mm_unpackhi_epi8 (read , zero ); /* [I:P] zero-extended to 16-bits */
482
+ lower = _mm_madd_epi16 (lower , _mm_set_epi16 (9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 )); /* [A * 16:H * 9] */
483
+ higher = _mm_madd_epi16 (higher , _mm_set_epi16 (1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 )); /* [I * 8:P * 1] */
484
+
485
+ /* We'll cheat here: it's difficult to add 16-bit elementwise, but we can do 32-bit additions.
486
+ * The highest value the sum of two elements of the vectors can take is 0xff * 16 + 0xff * 8 < 0xffff.
487
+ * That means there is no carry possible from 16->17 bits so the 32-bit addition is safe. */
488
+ __m128i sum = _mm_add_epi32 (lower , higher ); /* [A * 16 + I * 8:H * 9 + P * 1] */
489
+ accumulate_s2 = _mm_add_epi32 (accumulate_s2 , sum );
490
+ accumulate_s1 += s1 ;
491
+
492
+ /* Computes 8-bit element-wise abs(buf - zero) and then sums the elements into two 16 bit parts */
493
+ sum = _mm_sad_epu8 (read , zero );
494
+ s1 += _mm_cvtsi128_si32 (sum ) + _mm_extract_epi16 (sum , 4 );
495
+
496
+ buf += 16 ;
497
+ } while (buf != end );
498
+
499
+ /* For convenience, let's do a rename of variables and let accumulate_s2 = [X, Y, Z, W] */
500
+ __m128i shuffled = _mm_shuffle_epi32 (accumulate_s2 , _MM_SHUFFLE (1 , 0 , 0 , 2 )); /* [Y, X, X, Z] */
501
+ accumulate_s2 = _mm_add_epi32 (accumulate_s2 , shuffled ); /* [X + Y, Y + X, Z + X, W + Z] */
502
+ shuffled = _mm_shuffle_epi32 (accumulate_s2 , _MM_SHUFFLE (3 , 3 , 3 , 3 )); /* [X + Y, X + Y, X + Y, X + Y] */
503
+ accumulate_s2 = _mm_add_epi32 (accumulate_s2 , shuffled ); /* [/, /, /, W + Z + X + Y] */
504
+ s2 += accumulate_s1 * 16 + _mm_cvtsi128_si32 (accumulate_s2 );
505
+ #else
506
+ do {
507
+ ADLER32_SCALAR_DO16 (buf );
508
+ buf += 16 ;
509
+ } while (buf != end );
510
+ #endif
511
+
512
+ * s1_out = s1 ;
513
+ * s2_out = s2 ;
514
+ }
459
515
460
516
unsigned int zend_adler32 (unsigned int checksum , unsigned char * buf , uint32_t len )
461
517
{
@@ -466,10 +522,8 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
466
522
while (len >= ADLER32_NMAX ) {
467
523
len -= ADLER32_NMAX ;
468
524
end = buf + ADLER32_NMAX ;
469
- do {
470
- ADLER32_DO16 (buf );
471
- buf += 16 ;
472
- } while (buf != end );
525
+ adler32_do16_loop (buf , end , & s1 , & s2 );
526
+ buf = end ;
473
527
s1 %= ADLER32_BASE ;
474
528
s2 %= ADLER32_BASE ;
475
529
}
@@ -478,15 +532,13 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
478
532
if (len >= 16 ) {
479
533
end = buf + (len & 0xfff0 );
480
534
len &= 0xf ;
481
- do {
482
- ADLER32_DO16 (buf );
483
- buf += 16 ;
484
- } while (buf != end );
535
+ adler32_do16_loop (buf , end , & s1 , & s2 );
536
+ buf = end ;
485
537
}
486
538
if (len ) {
487
539
end = buf + len ;
488
540
do {
489
- ADLER32_DO1 (buf );
541
+ ADLER32_SCALAR_DO1 (buf );
490
542
buf ++ ;
491
543
} while (buf != end );
492
544
}
0 commit comments