Skip to content

Commit 41e9ba6

Browse files
committed
Always use Unicode codepoints in mb_ord() and mb_chr()
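Previously mb_chr() had two different encoding-dependent behaviors:

* For "Unicode-encodings" it took a Unicode codepoint and returned its encoded representation.
* Otherwise it returned a big-endian binary encoding of the passed integer.

Now the input is always interpreted as a Unicode codepoint. If a big-endian binary encoding is what you want, you don't need mbstring to implement that.

mb_ord() is changed symmetrically: it now always converts the input to UCS-4BE and returns the Unicode codepoint of the first character, instead of returning the big-endian integer value of the encoded bytes for non-Unicode encodings.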
1 parent fb9bf5b commit 41e9ba6

File tree

3 files changed

+20
-101
lines changed

3 files changed

+20
-101
lines changed

ext/mbstring/mbstring.c

+18-99
Original file line numberDiff line numberDiff line change
@@ -5065,8 +5065,6 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char*
50655065
enum mbfl_no_encoding no_enc;
50665066
char* ret;
50675067
size_t ret_len;
5068-
const mbfl_encoding *encoding;
5069-
unsigned char char_len;
50705068
zend_long cp;
50715069

50725070
if (enc == NULL) {
@@ -5080,52 +5078,20 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char*
50805078
}
50815079
}
50825080

5083-
if (php_mb_is_no_encoding_unicode(no_enc)) {
5084-
5085-
ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len);
5086-
5087-
if (ret == NULL) {
5088-
return -1;
5089-
}
5090-
5091-
cp = (unsigned char) ret[0] << 24 | \
5092-
(unsigned char) ret[1] << 16 | \
5093-
(unsigned char) ret[2] << 8 | \
5094-
(unsigned char) ret[3];
5095-
5096-
efree(ret);
5097-
5098-
return cp;
5099-
5100-
} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
5081+
if (php_mb_is_unsupported_no_encoding(no_enc)) {
51015082
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
51025083
return -1;
51035084
}
51045085

5105-
ret = php_mb_convert_encoding(str, str_len, enc, enc, &ret_len);
5106-
5086+
ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len);
51075087
if (ret == NULL) {
51085088
return -1;
51095089
}
51105090

5111-
encoding = mbfl_no2encoding(no_enc);
5112-
char_len = php_mb_mbchar_bytes_ex(ret, encoding);
5113-
5114-
if (char_len == 1) {
5115-
cp = (unsigned char) ret[0];
5116-
} else if (char_len == 2) {
5117-
cp = ((unsigned char) ret[0] << 8) | \
5118-
(unsigned char) ret[1];
5119-
} else if (char_len == 3) {
5120-
cp = ((unsigned char) ret[0] << 16) | \
5121-
((unsigned char) ret[1] << 8) | \
5122-
(unsigned char) ret[2];
5123-
} else {
5124-
cp = ((unsigned char) ret[0] << 24) | \
5125-
((unsigned char) ret[1] << 16) | \
5126-
((unsigned char) ret[2] << 8) | \
5127-
(unsigned char) ret[3];
5128-
}
5091+
cp = (unsigned char) ret[0] << 24 | \
5092+
(unsigned char) ret[1] << 16 | \
5093+
(unsigned char) ret[2] << 8 | \
5094+
(unsigned char) ret[3];
51295095

51305096
efree(ret);
51315097

@@ -5217,77 +5183,30 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
52175183

52185184
return ret;
52195185

5220-
} else if (php_mb_is_no_encoding_unicode(no_enc)) {
5221-
5222-
if (0 > cp || 0x10ffff < cp) {
5223-
5224-
if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
5225-
cp = MBSTRG(current_filter_illegal_substchar);
5226-
} else {
5227-
cp = 0x3f;
5228-
}
5229-
5230-
}
5231-
5232-
buf_len = 4;
5233-
buf = (char *) safe_emalloc(buf_len, 1, 1);
5234-
buf[0] = (cp >> 24) & 0xff;
5235-
buf[1] = (cp >> 16) & 0xff;
5236-
buf[2] = (cp >> 8) & 0xff;
5237-
buf[3] = cp & 0xff;
5238-
buf[4] = 0;
5239-
5240-
ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
5241-
efree(buf);
5242-
5243-
if (output_len) {
5244-
*output_len = ret_len;
5245-
}
5246-
5247-
return ret;
5248-
52495186
} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
52505187
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
52515188
return NULL;
52525189
}
52535190

5254-
if (0 > cp || cp > 0x100000000) {
5255-
if (no_enc == MBSTRG(current_internal_encoding)->no_encoding) {
5191+
if (0 > cp || 0x10ffff < cp) {
5192+
5193+
if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
52565194
cp = MBSTRG(current_filter_illegal_substchar);
52575195
} else {
52585196
cp = 0x3f;
52595197
}
5260-
}
52615198

5262-
if (cp < 0x100) {
5263-
buf_len = 1;
5264-
buf = (char *) safe_emalloc(buf_len, 1, 1);
5265-
buf[0] = cp;
5266-
buf[1] = 0;
5267-
} else if (cp < 0x10000) {
5268-
buf_len = 2;
5269-
buf = (char *) safe_emalloc(buf_len, 1, 1);
5270-
buf[0] = cp >> 8;
5271-
buf[1] = cp & 0xff;
5272-
buf[2] = 0;
5273-
} else if (cp < 0x1000000) {
5274-
buf_len = 3;
5275-
buf = (char *) safe_emalloc(buf_len, 1, 1);
5276-
buf[0] = cp >> 16;
5277-
buf[1] = (cp >> 8) & 0xff;
5278-
buf[2] = cp & 0xff;
5279-
buf[3] = 0;
5280-
} else {
5281-
buf_len = 4;
5282-
buf = (char *) safe_emalloc(buf_len, 1, 1);
5283-
buf[0] = cp >> 24;
5284-
buf[1] = (cp >> 16) & 0xff;
5285-
buf[2] = (cp >> 8) & 0xff;
5286-
buf[3] = cp & 0xff;
5287-
buf[4] = 0;
52885199
}
52895200

5290-
ret = php_mb_convert_encoding(buf, buf_len, enc, enc, &ret_len);
5201+
buf_len = 4;
5202+
buf = (char *) safe_emalloc(buf_len, 1, 1);
5203+
buf[0] = (cp >> 24) & 0xff;
5204+
buf[1] = (cp >> 16) & 0xff;
5205+
buf[2] = (cp >> 8) & 0xff;
5206+
buf[3] = cp & 0xff;
5207+
buf[4] = 0;
5208+
5209+
ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
52915210
efree(buf);
52925211

52935212
if (output_len) {

ext/mbstring/tests/mb_chr.phpt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mb_chr()
66
<?php
77
var_dump(
88
"\u{20bb7}" === mb_chr(0x20bb7),
9-
"\x8f\xa1\xef" === mb_chr(0x8fa1ef, "EUC-JP-2004"),
9+
"\x8f\xa1\xef" === mb_chr(0x50aa, "EUC-JP-2004"),
1010
"?" === mb_chr(0xd800)
1111
);
1212

ext/mbstring/tests/mb_ord.phpt

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ mb_ord()
77
var_dump(
88
0x20bb7 === mb_ord("\u{20bb7}"),
99
0x3f === mb_ord("\u{d800}"),
10-
0x8fa1ef === mb_ord("\x8f\xa1\xef", "EUC-JP-2004")
10+
0x50aa === mb_ord("\x8f\xa1\xef", "EUC-JP-2004")
1111
);
1212

1313
// Invalid

0 commit comments

Comments
 (0)