Skip to content

Commit 744ca16

Browse files
committed
Speed boost for mb_stripos (when not using UTF-8)
Instead of case-folding a string and then converting it to UTF-8 as a separate operation, why not convert it to UTF-8 at the same time as we fold case? For non-UTF-8 encodings, this typically makes mb_stripos about 2x faster.
1 parent e288438 commit 744ca16

File tree

4 files changed

+18
-19
lines changed

4 files changed

+18
-19
lines changed

ext/mbstring/mbstring.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -2878,7 +2878,7 @@ PHP_FUNCTION(mb_convert_encoding)
28782878

28792879
static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
28802880
{
2881-
return php_unicode_convert_case(case_mode, str, str_len, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
2881+
return php_unicode_convert_case(case_mode, str, str_len, enc, enc, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
28822882
}
28832883

28842884
PHP_FUNCTION(mb_convert_case)
@@ -4858,10 +4858,10 @@ MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string
48584858
{
48594859
/* We're using simple case-folding here, because we'd have to deal with remapping of
48604860
* offsets otherwise. */
4861-
zend_string *haystack_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc);
4862-
zend_string *needle_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc);
4861+
zend_string *haystack_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
4862+
zend_string *needle_conv = php_unicode_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, MBSTRG(current_filter_illegal_mode), MBSTRG(current_filter_illegal_substchar));
48634863

4864-
size_t n = mb_find_strpos(haystack_conv, needle_conv, enc, offset, mode);
4864+
size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
48654865

48664866
zend_string_free(haystack_conv);
48674867
zend_string_free(needle_conv);

ext/mbstring/php_unicode.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)
238238
return out;
239239
}
240240

241-
MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar)
241+
MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
242242
{
243243
/* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased
244244
* See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */
@@ -363,7 +363,7 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
363363
}
364364

365365
ZEND_ASSERT(p - converted_buf <= 192);
366-
src_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
366+
dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
367367
}
368368

369369
return mb_convert_buf_result(&buf);

ext/mbstring/php_unicode.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ typedef enum {
9191

9292
MBSTRING_API zend_string *php_unicode_convert_case(
9393
php_case_mode case_mode, const char *srcstr, size_t srclen,
94-
const mbfl_encoding *src_encoding, int illegal_mode, uint32_t illegal_substchar);
94+
const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar);
9595

9696
/* Optimize the common ASCII case for lower/upper */
9797

ext/mbstring/tests/mb_stripos.phpt

+11-12
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@ mbstring
99
ini_set('include_path','.');
1010
include_once('common.inc');
1111

12-
1312
// Test string
14-
$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。';
13+
$euc_jp = "0123\xA4\xB3\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3\xA4\xCF\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xC7\xA4\xB9\xA1\xA3EUC-JP\xA4\xF2\xBB\xC8\xA4\xC3\xA4\xC6\xA4\xA4\xA4\xDE\xA4\xB9\xA1\xA30123\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xCF\xCC\xCC\xC5\xDD\xBD\xAD\xA4\xA4\xA1\xA3";
1514

1615
$slen = mb_strlen($euc_jp, 'EUC-JP');
1716
echo "String len: $slen\n";
@@ -21,11 +20,11 @@ mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
2120

2221
echo "== POSITIVE OFFSET ==\n";
2322

24-
print mb_stripos($euc_jp, '日本語', 0, 'EUC-JP') . "\n";
23+
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0, 'EUC-JP') . "\n";
2524
print mb_stripos($euc_jp, '0', 0, 'EUC-JP') . "\n";
2625
print mb_stripos($euc_jp, 3, 0, 'EUC-JP') . "\n";
2726
print mb_stripos($euc_jp, 0, 0, 'EUC-JP') . "\n";
28-
print mb_stripos($euc_jp, '日本語', 15, 'EUC-JP') . "\n";
27+
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 15, 'EUC-JP') . "\n";
2928
print mb_stripos($euc_jp, '0', 15, 'EUC-JP') . "\n";
3029
print mb_stripos($euc_jp, 3, 15, 'EUC-JP') . "\n";
3130
print mb_stripos($euc_jp, 0, 15, 'EUC-JP') . "\n";
@@ -34,7 +33,7 @@ print mb_stripos($euc_jp, 0, 15, 'EUC-JP') . "\n";
3433
// Negative offset
3534
echo "== NEGATIVE OFFSET ==\n";
3635

37-
print mb_stripos($euc_jp, '日本語', -15, 'EUC-JP') . "\n";
36+
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -15, 'EUC-JP') . "\n";
3837
print mb_stripos($euc_jp, '0', -15, 'EUC-JP') . "\n";
3938
print mb_stripos($euc_jp, 3, -15, 'EUC-JP') . "\n";
4039
print mb_stripos($euc_jp, 0, -15, 'EUC-JP') . "\n";
@@ -44,15 +43,15 @@ print mb_stripos($euc_jp, 0, -43, 'EUC-JP') . "\n";
4443
// Out of range - should return false
4544
print ("== OUT OF RANGE ==\n");
4645

47-
$r = mb_stripos($euc_jp, '日本語', 40, 'EUC-JP');
46+
$r = mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 40, 'EUC-JP');
4847
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
4948
$r = mb_stripos($euc_jp, '0', 40, 'EUC-JP');
5049
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
5150
$r = mb_stripos($euc_jp, 3, 40, 'EUC-JP');
5251
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
5352
$r = mb_stripos($euc_jp, 0, 40, 'EUC-JP');
5453
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
55-
$r = mb_stripos($euc_jp, '日本語', -3, 'EUC-JP');
54+
$r = mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", -3, 'EUC-JP');
5655
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
5756
$r = mb_stripos($euc_jp, '0', -3, 'EUC-JP');
5857
($r === FALSE) ? print "OK_OUT_RANGE\n" : print "NG_OUT_RANGE\n";
@@ -65,7 +64,7 @@ $r = mb_stripos($euc_jp, 0, -3, 'EUC-JP');
6564
// Non-existent
6665
echo "== NON-EXISTENT ==\n";
6766

68-
$r = mb_stripos($euc_jp, '韓国語', 0, 'EUC-JP');
67+
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0, 'EUC-JP');
6968
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
7069
$r = mb_stripos($euc_jp, "\n", 0, 'EUC-JP');
7170
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
@@ -76,12 +75,12 @@ echo "== NO ENCODING PARAMETER ==\n";
7675

7776
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
7877

79-
print mb_stripos($euc_jp, '日本語', 0) . "\n";
78+
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC", 0) . "\n";
8079
print mb_stripos($euc_jp, '0', 0) . "\n";
8180
print mb_stripos($euc_jp, 3, 0) . "\n";
8281
print mb_stripos($euc_jp, 0, 0) . "\n";
8382

84-
$r = mb_stripos($euc_jp, '韓国語', 0);
83+
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC", 0);
8584
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
8685
$r = mb_stripos($euc_jp, "\n", 0);
8786
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";
@@ -91,12 +90,12 @@ echo "== NO OFFSET AND ENCODING PARAMETER ==\n";
9190

9291
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
9392

94-
print mb_stripos($euc_jp, '日本語') . "\n";
93+
print mb_stripos($euc_jp, "\xC6\xFC\xCB\xDC\xB8\xEC") . "\n";
9594
print mb_stripos($euc_jp, '0') . "\n";
9695
print mb_stripos($euc_jp, 3) . "\n";
9796
print mb_stripos($euc_jp, 0) . "\n";
9897

99-
$r = mb_stripos($euc_jp, '韓国語');
98+
$r = mb_stripos($euc_jp, "\xB4\xDA\xB9\xF1\xB8\xEC");
10099
($r === FALSE) ? print "OK_STR\n" : print "NG_STR\n";
101100
$r = mb_stripos($euc_jp, "\n");
102101
($r === FALSE) ? print "OK_NEWLINE\n" : print "NG_NEWLINE\n";

0 commit comments

Comments
 (0)