Skip to content

Commit d8b5b9f

Browse files
committed
Add unit tests for mb_str_split/mb_substr on MacJapanese encoding
MacJapanese has a somewhat unusual feature that when mapped to Unicode, many characters map to sequences of several codepoints. Add test cases demonstrating how mb_str_split and mb_substr behave in this situation. When adding these tests, I found the behavior of mb_substr was wrong due to an inconsistency between the string "length" as measured by mb_strlen and the number of native MacJapanese characters which mb_substr would count when iterating over the string using the mblen_table. This has been fixed. I believe that mb_strstr will also return wrong results in some cases for MacJapanese. I still need to come up with unit tests which demonstrate the problem and figure out how to fix it.
1 parent cca4ca6 commit d8b5b9f

File tree

3 files changed

+56
-2
lines changed

3 files changed

+56
-2
lines changed

ext/mbstring/mbstring.c

+19-2
Original file line numberDiff line numberDiff line change
@@ -2041,7 +2041,10 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
20412041
len = in_len;
20422042
}
20432043
return zend_string_init_fast((const char*)in, len);
2044-
} else if (enc->mblen_table != NULL) {
2044+
} else if (enc->mblen_table) {
2045+
/* The use of the `mblen_table` means that for encodings like MacJapanese,
2046+
* we treat each character in its native charset as "1 character", even if it
2047+
* maps to a sequence of several codepoints */
20452048
const unsigned char *mbtab = enc->mblen_table;
20462049
unsigned char *limit = in + in_len;
20472050
while (from && in < limit) {
@@ -2254,7 +2257,21 @@ PHP_FUNCTION(mb_substr)
22542257

22552258
size_t mblen = 0;
22562259
if (from < 0 || (!len_is_null && len < 0)) {
2257-
mblen = mb_get_strlen(str, enc);
2260+
if (enc->mblen_table) {
2261+
/* Because we use the `mblen_table` when iterating over the string and
2262+
* extracting the requested part, we also need to use it here for counting
2263+
* the "length" of the string
2264+
* Otherwise, we can get wrong results for text encodings like MacJapanese,
2265+
* where one native 'character' can map to a sequence of several codepoints */
2266+
const unsigned char *mbtab = enc->mblen_table;
2267+
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
2268+
while (p < e) {
2269+
p += mbtab[*p];
2270+
mblen++;
2271+
}
2272+
} else {
2273+
mblen = mb_get_strlen(str, enc);
2274+
}
22582275
}
22592276

22602277
/* if "from" position is negative, count start position from the end

ext/mbstring/tests/mb_str_split_jp.phpt

+20
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,23 @@ foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile
8080
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
8181
}
8282

83+
/*
84+
Some MacJapanese characters map to a sequence of several Unicode codepoints. Examples:
85+
86+
0x85AB 0xF862+0x0058+0x0049+0x0049+0x0049 # roman numeral thirteen
87+
0x85AC 0xF861+0x0058+0x0049+0x0056 # roman numeral fourteen
88+
0x85AD 0xF860+0x0058+0x0056 # roman numeral fifteen
89+
0x85BF 0xF862+0x0078+0x0069+0x0069+0x0069 # small roman numeral thirteen
90+
0x85C0 0xF861+0x0078+0x0069+0x0076 # small roman numeral fourteen
91+
0x85C1 0xF860+0x0078+0x0076 # small roman numeral fifteen
92+
93+
Even though they map to multiple codepoints, mb_str_split treats these as ONE character each
94+
*/
95+
96+
echo "== MacJapanese characters which map to 3-5 codepoints each ==\n";
97+
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xAB\x85\xAC\x85\xAD", 1, 'MacJapanese'))), "]\n";
98+
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xBF\x85\xC0\x85\xC1", 2, 'MacJapanese'))), "]\n";
99+
83100
?>
84101
--EXPECT--
85102
BIG-5: a4e9 a5bb
@@ -104,3 +121,6 @@ SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
104121
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
105122
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
106123
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
124+
== MacJapanese characters which map to 3-5 codepoints each ==
125+
[61, 62, 63, 85ab, 85ac, 85ad]
126+
[6162, 6385bf, 85c085c1]

ext/mbstring/tests/mb_substr.phpt

+17
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,16 @@ echo "SJIS-Mobile#SoftBank:\n";
6464
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
6565
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
6666

67+
echo "-- Testing MacJapanese characters which map to 3-5 codepoints each --\n";
68+
69+
/* There are many characters in MacJapanese which map to sequences of several codepoints */
70+
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 0, 3, 'MacJapanese')) . "\n";
71+
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 3, 2, 'MacJapanese')) . "\n";
72+
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", -2, 1, 'MacJapanese')) . "\n";
73+
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 0, 3, 'MacJapanese')) . "\n";
74+
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 3, 2, 'MacJapanese')) . "\n";
75+
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", -2, 1, 'MacJapanese')) . "\n";
76+
6777
echo "ISO-2022-JP:\n";
6878
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
6979
print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
@@ -145,6 +155,13 @@ SJIS-Mobile#KDDI:
145155
SJIS-Mobile#SoftBank:
146156
6380
147157
806162
158+
-- Testing MacJapanese characters which map to 3-5 codepoints each --
159+
616263
160+
85ab85ac
161+
85ac
162+
616263
163+
85bf85c0
164+
85c0
148165
ISO-2022-JP:
149166
1: 1b2442212121721b284241
150167
2: 43

0 commit comments

Comments
 (0)