Skip to content

Commit d14ed12

Browse files
committed
Adjust code to finish validating remaining 0-8 bytes at end of UTF-8 string
This code is a few percent faster for short UTF-8 strings. For long (~10,000-byte) strings, it is also consistently faster in my local microbenchmarks, but by less than 1%.
1 parent b0ad9df commit d14ed12

File tree

1 file changed

+10
-14
lines changed

1 file changed

+10
-14
lines changed

ext/mbstring/mbstring.c

+10-14
Original file line numberDiff line numberDiff line change
@@ -4728,8 +4728,9 @@ finish_up_remaining_bytes: ;
47284728
if (p == e) {
47294729
uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */
47304730

4731-
/* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4732-
* bytes, but there is no good way to read a variable number of bytes into an XMM register
4731+
/* Crazy hack here for cases where 9 or more bytes are remaining...
4732+
* We want to use the above vectorized code to check a block of less than 16 bytes,
4733+
* but there is no good way to read a variable number of bytes into an XMM register.
47334734
* However, we know that these bytes are part of a zend_string, and a zend_string has some
47344735
* 'header' fields which occupy the memory just before its content
47354736
* And, those header fields occupy more than 16 bytes...
@@ -4744,20 +4745,17 @@ finish_up_remaining_bytes: ;
47444745
* shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
47454746
*/
47464747
switch (remaining_bytes) {
4747-
case 0:
4748-
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 15)), 15);
4749-
goto check_operand;
4748+
case 0: ;
4749+
__m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4750+
__m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4751+
return _mm_movemask_epi8(bad) == 0;
47504752
case 1:
4751-
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 14)), 14);
4752-
goto check_operand;
47534753
case 2:
4754-
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 13)), 13);
4754+
operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
47554755
goto check_operand;
47564756
case 3:
4757-
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 12)), 12);
4758-
goto check_operand;
47594757
case 4:
4760-
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 11)), 11);
4758+
operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
47614759
goto check_operand;
47624760
case 5:
47634761
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
@@ -4766,10 +4764,8 @@ finish_up_remaining_bytes: ;
47664764
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
47674765
goto check_operand;
47684766
case 7:
4769-
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 8)), 8);
4770-
goto check_operand;
47714767
case 8:
4772-
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 7)), 7);
4768+
operand = _mm_set_epi64x(0, *((uint64_t*)p));
47734769
goto check_operand;
47744770
case 9:
47754771
operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);

0 commit comments

Comments
 (0)