Speed boost for mb_stripos (when not using UTF-8)

alexdowad · alexdowad · commit 744ca16e73cf · 2022-12-18T15:31:20.000+02:00
Instead of case-folding a string and then converting it to UTF-8 as a
separate operation, why not convert it to UTF-8 at the same time as
we fold case?

For non-UTF-8 encodings, this typically makes mb_stripos about 2x
faster.
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -2878,7 +2878,7 @@ PHP_FUNCTION(mb_convert_encoding)
 
 static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
 {
-	return php_unicode_convert_case(case_mode, str, str_len, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
+	return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
 }
 
 PHP_FUNCTION(mb_convert_case)
@@ -4858,10 +4858,10 @@ MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string
 {
 	/* We're using simple case-folding here, because we'd have to deal with remapping of
 	 * offsets otherwise. */
-	zend_string *haystack_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc);
-	zend_string *needle_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc);
+	zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
+	zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
 
-	size_t n = mb_find_strpos(haystack_conv, needle_conv, enc, offset, mode);
+	size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
 
 	zend_string_free(haystack_conv);
 	zend_string_free(needle_conv);
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c
@@ -238,7 +238,7 @@ static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)
 	return out;
 }
 
-MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar)
+MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
 {
 	/* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased
 	 * See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */
@@ -363,7 +363,7 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
 		}
 
 		ZEND_ASSERT(p - converted_buf <= 192);
-		src_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
+		dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
 	}
 
 	return mb_convert_buf_result(&buf);
diff --git a/ext/mbstring/php_unicode.h b/ext/mbstring/php_unicode.h
@@ -91,7 +91,7 @@ typedef enum {
 
 MBSTRING_API zend_string *php_unicode_convert_case(
 		php_case_mode case_mode, const char *srcstr, size_t srclen,
-		const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar);
+		const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar);
 
 /* Optimize the common ASCII case for lower/upper */
 
diff --git a/ext/mbstring/tests/mb_stripos.phpt b/ext/mbstring/tests/mb_stripos.phpt
@@ -9,9 +9,8 @@ mbstring
 ini_set('include_path','.');
 include_once('common.inc');
 
-
 // Test string
-$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。';
+$euc_jp = "0123\xA4\xB3\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3\xA4\xCF\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xC7\xA4\xB9\xA1\xA3EUC-JP\xA4\xF2\xBB\xC8\xA4\xC3\xA4\xC6\xA4\xA4\xA4\xDE\xA4\xB9\xA1\xA30123\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xCF\xCC\xCC\xC5\xDD\xBD\xAD\xA4\xA4\xA1\xA3";
 
 $slen = mb_strlen($euc_jp, 'EUC-JP');
 echo "String len: $slen\n";
@@ -21,11 +20,11 @@ mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
 
 echo  "== POSITIVE OFFSET ==\n";
 
-print  mb_stripos($euc_jp, '日本語', 0, 'EUC-JP') . "\n";
+print  mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0, 'EUC-JP') . "\n";
 print  mb_stripos($euc_jp, '0', 0,     'EUC-JP') . "\n";
 print  mb_stripos($euc_jp, 3, 0,       'EUC-JP') . "\n";
 print  mb_stripos($euc_jp, 0, 0,       'EUC-JP') . "\n";
-print  mb_stripos($euc_jp, '日本語', 15, 'EUC-JP') . "\n";
+print  mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 15, 'EUC-JP') . "\n";
 print  mb_stripos($euc_jp, '0', 15,     'EUC-JP') . "\n";
 print  mb_stripos($euc_jp, 3, 15,       'EUC-JP') . "\n";
 print  mb_stripos($euc_jp, 0, 15,       'EUC-JP') . "\n";
@@ -34,7 +33,7 @@ print  mb_stripos($euc_jp, 0, 15,       'EUC-JP') . "\n";
 // Negative offset
 echo "== NEGATIVE OFFSET ==\n";
 
-print mb_stripos($euc_jp, '日本語', -15, 'EUC-JP') . "\n";
+print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -15, 'EUC-JP') . "\n";
 print mb_stripos($euc_jp, '0', -15,     'EUC-JP') . "\n";
 print mb_stripos($euc_jp, 3, -15,       'EUC-JP') . "\n";
 print mb_stripos($euc_jp, 0, -15,       'EUC-JP') . "\n";
@@ -44,15 +43,15 @@ print mb_stripos($euc_jp, 0, -43,       'EUC-JP') . "\n";
 // Out of range - should return false
 print ("== OUT OF RANGE ==\n");
 
-$r =  mb_stripos($euc_jp, '日本語', 40, 'EUC-JP');
+$r =  mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 40, 'EUC-JP');
 ($r === FALSE) ? print "OK_OUT_RANGE\n"     : print "NG_OUT_RANGE\n";
 $r =  mb_stripos($euc_jp, '0', 40,     'EUC-JP');
 ($r === FALSE) ? print "OK_OUT_RANGE\n"     : print "NG_OUT_RANGE\n";
 $r =  mb_stripos($euc_jp, 3, 40,       'EUC-JP');
 ($r === FALSE) ? print "OK_OUT_RANGE\n"     : print "NG_OUT_RANGE\n";
 $r =   mb_stripos($euc_jp, 0, 40,       'EUC-JP');
 ($r === FALSE) ? print "OK_OUT_RANGE\n"     : print "NG_OUT_RANGE\n";
-$r =  mb_stripos($euc_jp, '日本語', -3, 'EUC-JP');
+$r =  mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -3, 'EUC-JP');
 ($r === FALSE) ? print "OK_OUT_RANGE\n"     : print "NG_OUT_RANGE\n";
 $r =  mb_stripos($euc_jp, '0', -3,     'EUC-JP');
 ($r === FALSE) ? print "OK_OUT_RANGE\n"     : print "NG_OUT_RANGE\n";
@@ -65,7 +64,7 @@ $r =   mb_stripos($euc_jp, 0, -3,       'EUC-JP');
 // Non-existent
 echo "== NON-EXISTENT ==\n";
 
-$r = mb_stripos($euc_jp, '韓国語', 0, 'EUC-JP');
+$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0, 'EUC-JP');
 ($r === FALSE) ? print "OK_STR\n"     : print "NG_STR\n";
 $r = mb_stripos($euc_jp, "\n",     0, 'EUC-JP');
 ($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
@@ -76,12 +75,12 @@ echo "== NO ENCODING PARAMETER ==\n";
 
 mb_internal_encoding('EUC-JP')  or print("mb_internal_encoding() failed\n");
 
-print  mb_stripos($euc_jp, '日本語', 0) . "\n";
+print  mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0) . "\n";
 print  mb_stripos($euc_jp, '0', 0) . "\n";
 print  mb_stripos($euc_jp, 3, 0) . "\n";
 print  mb_stripos($euc_jp, 0, 0) . "\n";
 
-$r = mb_stripos($euc_jp, '韓国語', 0);
+$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0);
 ($r === FALSE) ? print "OK_STR\n"     : print "NG_STR\n";
 $r = mb_stripos($euc_jp, "\n", 0);
 ($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
@@ -91,12 +90,12 @@ echo "== NO OFFSET AND ENCODING PARAMETER ==\n";
 
 mb_internal_encoding('EUC-JP')  or print("mb_internal_encoding() failed\n");
 
-print  mb_stripos($euc_jp, '日本語') . "\n";
+print  mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC") . "\n";
 print  mb_stripos($euc_jp, '0') . "\n";
 print  mb_stripos($euc_jp, 3) . "\n";
 print  mb_stripos($euc_jp, 0) . "\n";
 
-$r = mb_stripos($euc_jp, '韓国語');
+$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC");
 ($r === FALSE) ? print "OK_STR\n"     : print "NG_STR\n";
 $r = mb_stripos($euc_jp, "\n");
 ($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";

Original file line number	Diff line number	Diff line change
`@@ -2878,7 +2878,7 @@ PHP_FUNCTION(mb_convert_encoding)`
`2878`	`2878`
`2879`	`2879`	`static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)`
`2880`	`2880`	`{`
`2881`		`- return php_unicode_convert_case(case_mode, str, str_len, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));`
	`2881`	`+ return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));`
`2882`	`2882`	`}`
`2883`	`2883`
`2884`	`2884`	`PHP_FUNCTION(mb_convert_case)`
`@@ -4858,10 +4858,10 @@ MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string`
`4858`	`4858`	`{`
`4859`	`4859`	`/* We're using simple case-folding here, because we'd have to deal with remapping of`
`4860`	`4860`	`* offsets otherwise. */`
`4861`		`- zend_string *haystack_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc);`
`4862`		`- zend_string *needle_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc);`
	`4861`	`+ zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));`
	`4862`	`+ zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));`
`4863`	`4863`
`4864`		`- size_t n = mb_find_strpos(haystack_conv, needle_conv, enc, offset, mode);`
	`4864`	`+ size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);`
`4865`	`4865`
`4866`	`4866`	`zend_string_free(haystack_conv);`
`4867`	`4867`	`zend_string_free(needle_conv);`
Original file line number	Diff line number	Diff line change
`@@ -238,7 +238,7 @@ static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)`
`238`	`238`	`return out;`
`239`	`239`	`}`
`240`	`240`
`241`		`-MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar)`
	`241`	`+MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)`
`242`	`242`	`{`
`243`	`243`	`/* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased`
`244`	`244`	`* See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */`
`@@ -363,7 +363,7 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons`
`363`	`363`	`}`
`364`	`364`
`365`	`365`	`ZEND_ASSERT(p - converted_buf <= 192);`
`366`		`- src_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);`
	`366`	`+ dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);`
`367`	`367`	`}`
`368`	`368`
`369`	`369`	`return mb_convert_buf_result(&buf);`