@@ -4728,8 +4728,9 @@ finish_up_remaining_bytes: ;
4728
4728
if (p == e ) {
4729
4729
uint8_t remaining_bytes = ZSTR_LEN (str ) & (sizeof (__m128i ) - 1 ); /* Not including terminating null */
4730
4730
4731
- /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4732
- * bytes, but there is no good way to read a variable number of bytes into an XMM register
4731
+ /* Crazy hack here for cases where 9 or more bytes are remaining...
4732
+ * We want to use the above vectorized code to check a block of less than 16 bytes,
4733
+ * but there is no good way to read a variable number of bytes into an XMM register
4733
4734
* However, we know that these bytes are part of a zend_string, and a zend_string has some
4734
4735
* 'header' fields which occupy the memory just before its content
4735
4736
* And, those header fields occupy more than 16 bytes...
@@ -4744,20 +4745,17 @@ finish_up_remaining_bytes: ;
4744
4745
* shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
4745
4746
*/
4746
4747
switch (remaining_bytes ) {
4747
- case 0 :
4748
- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 15 )), 15 );
4749
- goto check_operand ;
4748
+ case 0 : ;
4749
+ __m128i bad_mask = _mm_set_epi8 (-64 , -32 , -16 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
4750
+ __m128i bad = _mm_cmpeq_epi8 (_mm_and_si128 (last_block , bad_mask ), bad_mask );
4751
+ return _mm_movemask_epi8 (bad ) == 0 ;
4750
4752
case 1 :
4751
- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 14 )), 14 );
4752
- goto check_operand ;
4753
4753
case 2 :
4754
- operand = _mm_srli_si128 ( _mm_loadu_si128 (( __m128i * )( p - 13 )), 13 );
4754
+ operand = _mm_set_epi16 ( 0 , 0 , 0 , 0 , 0 , 0 , 0 , * (( uint16_t * ) p ) );
4755
4755
goto check_operand ;
4756
4756
case 3 :
4757
- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 12 )), 12 );
4758
- goto check_operand ;
4759
4757
case 4 :
4760
- operand = _mm_srli_si128 ( _mm_loadu_si128 (( __m128i * )( p - 11 )), 11 );
4758
+ operand = _mm_set_epi32 ( 0 , 0 , 0 , * (( uint32_t * ) p ) );
4761
4759
goto check_operand ;
4762
4760
case 5 :
4763
4761
operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 10 )), 10 );
@@ -4766,10 +4764,8 @@ finish_up_remaining_bytes: ;
4766
4764
operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 9 )), 9 );
4767
4765
goto check_operand ;
4768
4766
case 7 :
4769
- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 8 )), 8 );
4770
- goto check_operand ;
4771
4767
case 8 :
4772
- operand = _mm_srli_si128 ( _mm_loadu_si128 (( __m128i * )( p - 7 )), 7 );
4768
+ operand = _mm_set_epi64x ( 0 , * (( uint64_t * ) p ) );
4773
4769
goto check_operand ;
4774
4770
case 9 :
4775
4771
operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 6 )), 6 );
0 commit comments