Skip to content

Commit 880803a

Browse files
committed
Use fast conversion filters to implement php_mb_ord
Even for single-character strings, this is about 50% faster for ASCII, UTF-8, and UTF-16. For long strings, the performance gain is enormous, since the old code would convert the ENTIRE string just to pick out the first codepoint.
1 parent 9468fa7 commit 880803a

File tree

2 files changed

+13
-21
lines changed

2 files changed

+13
-21
lines changed

ext/mbstring/libmbfl/mbfl/mbfl_encoding.h

+4
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,10 @@ typedef struct {
143143
/* Decoder callback type: converts encoded text into a buffer of wchars
 * (Unicode codepoints).
 *
 *   in, in_len  - passed by address; presumably advanced/decremented as input
 *                 is consumed (TODO confirm against an implementation)
 *   out         - destination buffer for decoded codepoints
 *   out_len     - capacity of `out`, in codepoints
 *   state       - caller-owned decoder state; php_mb_ord passes a pointer to
 *                 an `unsigned int` initialised to 0 for a fresh conversion
 *
 * The return value is treated by php_mb_ord as the number of codepoints
 * written to `out` (it is expected to be <= out_len, and 0 means nothing was
 * produced). */
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
144144
/* Callback type taking `in_len` wchars from `in` and an `mb_convert_buf`
 * output buffer; it returns nothing, so any results must be delivered through
 * `out`.
 * NOTE(review): presumably the encoding counterpart of mb_to_wchar_fn, with
 * `end` marking the final chunk of input - confirm against an implementation. */
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
145145

146+
/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
147+
* the buffer must be at least this size (to work with all supported text encodings) */
148+
#define MBSTRING_MIN_WCHAR_BUFSIZE 5
149+
146150
static inline void mb_convert_buf_init(mb_convert_buf *buf, size_t initsize, uint32_t repl_char, unsigned int err_mode)
147151
{
148152
buf->state = buf->errors = 0;

ext/mbstring/mbstring.c

+9-21
Original file line numberDiff line numberDiff line change
@@ -3993,29 +3993,17 @@ static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string
39933993
return -2;
39943994
}
39953995

3996-
{
3997-
mbfl_wchar_device dev;
3998-
mbfl_convert_filter *filter;
3999-
zend_long cp;
4000-
4001-
mbfl_wchar_device_init(&dev);
4002-
filter = mbfl_convert_filter_new(enc, &mbfl_encoding_wchar, mbfl_wchar_device_output, 0, &dev);
4003-
/* If this assertion fails this means some memory allocation failure which is a bug */
4004-
ZEND_ASSERT(filter != NULL);
4005-
4006-
mbfl_convert_filter_feed_string(filter, (unsigned char*)str, str_len);
4007-
mbfl_convert_filter_flush(filter);
4008-
4009-
if (dev.pos < 1 || filter->num_illegalchar || dev.buffer[0] == MBFL_BAD_INPUT) {
4010-
cp = -1;
4011-
} else {
4012-
cp = dev.buffer[0];
4013-
}
3996+
/* Some legacy text encodings have a minimum required wchar buffer size;
3997+
* the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
3998+
uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
3999+
unsigned int state = 0;
4000+
size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
4001+
ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
40144002

4015-
mbfl_convert_filter_delete(filter);
4016-
mbfl_wchar_device_clear(&dev);
4017-
return cp;
4003+
if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
4004+
return -1;
40184005
}
4006+
return wchar_buf[0];
40194007
}
40204008

40214009

0 commit comments

Comments
 (0)