Skip to content

Commit 8902e47

Browse files
committed Jan 18, 2023
Simplify checks (in mb_fast_check_utf8) for overlong code units and invalid codepoint values
1 parent d58f704 commit 8902e47

File tree

1 file changed

+4
-6
lines changed

1 file changed

+4
-6
lines changed
 

‎ext/mbstring/mbstring.c

+4 -6
Original file line number | Diff line number | Diff line change
@@ -4659,19 +4659,17 @@ static bool mb_fast_check_utf8(zend_string *str)
46594659
* 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
46604660
* We can check for both problems at once by generating a vector where each byte < 0xA0
46614661
* is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4662-
* Shift the original block right by one byte, and XOR the shifted block with the bitmask
4663-
* Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
4664-
* bad positions, and OR them into `bad` */
4662+
* Shift the original block right by one byte, and compare the shifted block with the bitmask */
46654663
__m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
46664664
__m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
4667-
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask1)));
4665+
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));
46684666

46694667
/* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
46704668
* Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
46714669
* code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4672-
* Build the bitmask, XOR it with the shifted block, check for 0x00 bytes in the result */
4670+
* Build the bitmask and compare it with the shifted block */
46734671
__m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
4674-
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_xor_si128(operand2, mask2)));
4672+
bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));
46754673

46764674
/* Check for overlong 2-byte code units
46774675
* Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit

0 commit comments

Comments
 (0)