Skip to content

Commit e53162a

Browse files
committed
Return false on invalid codepoint in mb_chr()
Instead of returning the encoding of the current substitution character. This allows a robust check for the failure case. The substitution character (especially the default of "?") is also a valid output of mb_chr() for a valid input (for "?" that would be 0x3f), so it's a bad choice for an error value.
1 parent 41e9ba6 commit e53162a

File tree

2 files changed

+27
-46
lines changed

2 files changed

+27
-46
lines changed

ext/mbstring/mbstring.c

+25-28
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,6 @@ static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
106106

107107
static inline zend_bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
108108

109-
static inline zend_bool php_mb_is_no_encoding_unicode(enum mbfl_no_encoding no_enc);
110-
111109
static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
112110
/* }}} */
113111

@@ -3172,13 +3170,6 @@ static inline zend_bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding
31723170
}
31733171

31743172

3175-
/* See mbfl_no_encoding definition for list of unicode encodings */
3176-
static inline zend_bool php_mb_is_no_encoding_unicode(enum mbfl_no_encoding no_enc)
3177-
{
3178-
return (no_enc >= mbfl_no_encoding_ucs4 && no_enc <= mbfl_no_encoding_utf8_sb);
3179-
}
3180-
3181-
31823173
/* See mbfl_no_encoding definition for list of UTF-8 encodings */
31833174
static inline zend_bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
31843175
{
@@ -5143,10 +5134,18 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
51435134
}
51445135
}
51455136

5146-
if (php_mb_is_no_encoding_utf8(no_enc)) {
5137+
if (php_mb_is_unsupported_no_encoding(no_enc)) {
5138+
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
5139+
return NULL;
5140+
}
5141+
5142+
if (cp < 0 || cp > 0x10ffff) {
5143+
return NULL;
5144+
}
51475145

5148-
if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) {
5149-
cp = MBSTRG(current_filter_illegal_substchar);
5146+
if (php_mb_is_no_encoding_utf8(no_enc)) {
5147+
if (cp > 0xd7ff && 0xe000 > cp) {
5148+
return NULL;
51505149
}
51515150

51525151
if (cp < 0x80) {
@@ -5182,20 +5181,6 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
51825181
}
51835182

51845183
return ret;
5185-
5186-
} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
5187-
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
5188-
return NULL;
5189-
}
5190-
5191-
if (0 > cp || 0x10ffff < cp) {
5192-
5193-
if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
5194-
cp = MBSTRG(current_filter_illegal_substchar);
5195-
} else {
5196-
cp = 0x3f;
5197-
}
5198-
51995184
}
52005185

52015186
buf_len = 4;
@@ -5206,9 +5191,21 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
52065191
buf[3] = cp & 0xff;
52075192
buf[4] = 0;
52085193

5209-
ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
5210-
efree(buf);
5194+
{
5195+
long orig_illegalchars = MBSTRG(illegalchars);
5196+
MBSTRG(illegalchars) = 0;
5197+
ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
5198+
if (MBSTRG(illegalchars) != 0) {
5199+
efree(buf);
5200+
efree(ret);
5201+
MBSTRG(illegalchars) = orig_illegalchars;
5202+
return NULL;
5203+
}
5204+
5205+
MBSTRG(illegalchars) = orig_illegalchars;
5206+
}
52115207

5208+
efree(buf);
52125209
if (output_len) {
52135210
*output_len = ret_len;
52145211
}

ext/mbstring/tests/mb_chr.phpt

+2-18
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,8 @@ mb_chr()
77
var_dump(
88
"\u{20bb7}" === mb_chr(0x20bb7),
99
"\x8f\xa1\xef" === mb_chr(0x50aa, "EUC-JP-2004"),
10-
"?" === mb_chr(0xd800)
11-
);
12-
13-
mb_internal_encoding("UCS-4BE");
14-
mb_substitute_character(0xfffd);
15-
var_dump(
16-
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
17-
);
18-
var_dump(
19-
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
20-
);
21-
22-
mb_internal_encoding("EUC-JP");
23-
mb_substitute_character(0xa4a2);
24-
var_dump(
25-
"\u{a4a2}" === mb_chr(0xd800, "UTF-8")
10+
false === mb_chr(0xd800),
11+
false === mb_chr(0x1f600, "EUC-JP-2004")
2612
);
2713

2814
// Invalid
@@ -39,8 +25,6 @@ bool(true)
3925
bool(true)
4026
bool(true)
4127
bool(true)
42-
bool(true)
43-
bool(true)
4428

4529
Warning: mb_chr(): Unknown encoding "typo" in %s on line %d
4630

0 commit comments

Comments
 (0)