Add unit tests for mb_str_split/mb_substr on MacJapanese encoding

alexdowad · alexdowad · commit d8b5b9fa559c · 2023-01-08T17:23:47.000+02:00
MacJapanese has a somewhat unusual feature that when mapped to
Unicode, many characters map to sequences of several codepoints.
Add test cases demonstrating how mb_str_split and mb_substr behave in
this situation.

When adding these tests, I found that the behavior of mb_substr was
wrong when it was given a negative offset or length. In that case, the
string "length" was measured by mb_strlen, which was inconsistent with
the number of native MacJapanese characters which mb_substr counts
when iterating over the string using the mblen_table. This has been
fixed by also using the mblen_table to count the length.

I believe that mb_strstr will also return wrong results in some cases
for MacJapanese. I still need to come up with unit tests which
demonstrate the problem and figure out how to fix it.
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -2041,7 +2041,10 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
 			len = in_len;
 		}
 		return zend_string_init_fast((const char*)in, len);
-	} else if (enc->mblen_table != NULL) {
+	} else if (enc->mblen_table) {
+		/* The use of the `mblen_table` means that for encodings like MacJapanese,
+		 * we treat each character in its native charset as "1 character", even if it
+		 * maps to a sequence of several codepoints */
 		const unsigned char *mbtab = enc->mblen_table;
 		unsigned char *limit = in + in_len;
 		while (from && in < limit) {
@@ -2254,7 +2257,21 @@ PHP_FUNCTION(mb_substr)
 
 	size_t mblen = 0;
 	if (from < 0 || (!len_is_null && len < 0)) {
-		mblen = mb_get_strlen(str, enc);
+		if (enc->mblen_table) {
+			/* Because we use the `mblen_table` when iterating over the string and
+			 * extracting the requested part, we also need to use it here for counting
+			 * the "length" of the string.
+			 * Otherwise, we can get wrong results for text encodings like MacJapanese,
+			 * where one native 'character' can map to a sequence of several codepoints */
+			const unsigned char *mbtab = enc->mblen_table;
+			unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
+			while (p < e) {
+				p += mbtab[*p];
+				mblen++;
+			}
+		} else {
+			mblen = mb_get_strlen(str, enc);
+		}
 	}
 
 	/* if "from" position is negative, count start position from the end
diff --git a/ext/mbstring/tests/mb_str_split_jp.phpt b/ext/mbstring/tests/mb_str_split_jp.phpt
@@ -80,6 +80,23 @@ foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile
     echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
 }
 
+/*
+Some MacJapanese characters map to a sequence of several Unicode codepoints. Examples:
+
+0x85AB  0xF862+0x0058+0x0049+0x0049+0x0049  # roman numeral thirteen
+0x85AC  0xF861+0x0058+0x0049+0x0056 # roman numeral fourteen
+0x85AD  0xF860+0x0058+0x0056    # roman numeral fifteen
+0x85BF  0xF862+0x0078+0x0069+0x0069+0x0069  # small roman numeral thirteen
+0x85C0  0xF861+0x0078+0x0069+0x0076 # small roman numeral fourteen
+0x85C1  0xF860+0x0078+0x0076    # small roman numeral fifteen
+
+Even though they map to multiple codepoints, mb_str_split treats these as ONE character each
+*/
+
+echo "== MacJapanese characters which map to 3-5 codepoints each ==\n";
+echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xAB\x85\xAC\x85\xAD", 1, 'MacJapanese'))), "]\n";
+echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xBF\x85\xC0\x85\xC1", 2, 'MacJapanese'))), "]\n";
+
 ?>
 --EXPECT--
 BIG-5: a4e9 a5bb
@@ -104,3 +121,6 @@ SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
 SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
 SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
 SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
+== MacJapanese characters which map to 3-5 codepoints each ==
+[61, 62, 63, 85ab, 85ac, 85ad]
+[6162, 6385bf, 85c085c1]
diff --git a/ext/mbstring/tests/mb_substr.phpt b/ext/mbstring/tests/mb_substr.phpt
@@ -64,6 +64,16 @@ echo "SJIS-Mobile#SoftBank:\n";
 print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
 print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
 
+echo "-- Testing MacJapanese characters which map to 3-5 codepoints each --\n";
+
+/* There are many characters in MacJapanese which map to sequences of several codepoints */
+print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 0, 3, 'MacJapanese')) . "\n";
+print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 3, 2, 'MacJapanese')) . "\n";
+print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", -2, 1, 'MacJapanese')) . "\n";
+print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 0, 3, 'MacJapanese')) . "\n";
+print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 3, 2, 'MacJapanese')) . "\n";
+print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", -2, 1, 'MacJapanese')) . "\n";
+
 echo "ISO-2022-JP:\n";
 print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
 print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
@@ -145,6 +155,13 @@ SJIS-Mobile#KDDI:
 SJIS-Mobile#SoftBank:
 6380
 806162
+-- Testing MacJapanese characters which map to 3-5 codepoints each --
+616263
+85ab85ac
+85ac
+616263
+85bf85c0
+85c0
 ISO-2022-JP:
 1: 1b2442212121721b284241
 2: 43