Skip to content

Commit 950a7db

Browse files
committed
Use fast text conversion filters to implement mb_check_encoding
Benchmarking reveals that the new implementation is about 8% slower for UTF-8 strings that have a bad codepoint at the very beginning of the string. For valid strings, or those where the first bad codepoint occurs much later in the string, it is significantly faster (2-3 times faster in many cases).
1 parent 8533fcc commit 950a7db

File tree

1 file changed

+22
-19
lines changed

1 file changed

+22
-19
lines changed

ext/mbstring/mbstring.c

+22-19
Original file line numberDiff line numberDiff line change
@@ -3860,31 +3860,34 @@ PHP_FUNCTION(mb_get_info)
38603860
}
38613861
/* }}} */
38623862

3863-
static int mbfl_filt_check_errors(int c, void* data)
3864-
{
3865-
if (c == MBFL_BAD_INPUT) {
3866-
(*((mbfl_convert_filter**)data))->num_illegalchar++;
3867-
}
3868-
return 0;
3869-
}
3870-
38713863
MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
38723864
{
3873-
mbfl_convert_filter *filter = mbfl_convert_filter_new(encoding, &mbfl_encoding_wchar, mbfl_filt_check_errors, NULL, &filter);
3874-
3875-
while (length--) {
3876-
unsigned char c = *input++;
3877-
(filter->filter_function)(c, filter);
3878-
if (filter->num_illegalchar) {
3879-
mbfl_convert_filter_delete(filter);
3865+
uint32_t wchar_buf[128];
3866+
unsigned char *in = (unsigned char*)input;
3867+
unsigned int state = 0;
3868+
3869+
/* If the input string is not encoded in the given encoding, there is a significant chance
3870+
* that this will be seen in the first bytes. Therefore, rather than converting an entire
3871+
* buffer of 128 codepoints, convert and check just a few codepoints first */
3872+
size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
3873+
ZEND_ASSERT(out_len <= 8);
3874+
for (int i = 0; i < out_len; i++) {
3875+
if (wchar_buf[i] == MBFL_BAD_INPUT) {
38803876
return 0;
38813877
}
38823878
}
38833879

3884-
(filter->filter_flush)(filter);
3885-
int result = !filter->num_illegalchar;
3886-
mbfl_convert_filter_delete(filter);
3887-
return result;
3880+
while (length) {
3881+
out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
3882+
ZEND_ASSERT(out_len <= 128);
3883+
for (int i = 0; i < out_len; i++) {
3884+
if (wchar_buf[i] == MBFL_BAD_INPUT) {
3885+
return 0;
3886+
}
3887+
}
3888+
}
3889+
3890+
return 1;
38883891
}
38893892

38903893
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)

0 commit comments

Comments
 (0)