Add more tests for mbstring encoding conversion

alexdowad · alexdowad · commit ae05c96b2232 · 2022-05-08T14:59:57.000+02:00
When testing the preceding commits, I used a script to generate a large
number of random strings and try to find strings which would yield
different outputs from the new and old encoding conversion code.
Some were found. In most cases, analysis revealed that the new code
was correct and the old code was not.

In all cases where the new code was incorrect, regression tests were
added. However, there may be some value in adding regression tests
for cases where the old code was incorrect as well. That is done here.

This does not cover every case where the new and old code yielded
different results. Some of them were very obscure, and it is proving
difficult even to reproduce them (since I did not keep a record of
all the input strings which triggered the differing output).
diff --git a/ext/mbstring/tests/cp5022x_encoding.phpt b/ext/mbstring/tests/cp5022x_encoding.phpt
@@ -366,9 +366,14 @@ foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
 
 echo "Invalid escape sequences OK\n";
 
-// Regression test
+// Regression tests
 if (mb_convert_encoding("\x1BC\xF5", 'UTF-16BE', 'CP50221') !== "\x00%\x00C\x00%")
-  die("Bad")
+  die("Bad");
+
+// Previously, the CP50220 implementation would eat trailing null bytes
+$converted = mb_convert_encoding("ab\x00", 'UTF-16BE', 'CP50220');
+if ($converted !== "\x00a\x00b\x00\x00")
+  die("Bad handling of trailing null byte (got " . bin2hex($converted) . ")");
 
 ?>
 --EXPECT--
diff --git a/ext/mbstring/tests/iso2022jp_2004_encoding.phpt b/ext/mbstring/tests/iso2022jp_2004_encoding.phpt
@@ -320,6 +320,21 @@ for ($i = 0; $i < 100; $i++) {
 	testValid($testString, $convertsTo, false);
 }
 
+// Regression test: Test handling of 0x80-0x9F; these have a special meaning in EUC-JP-2004,
+// but not in ISO-2022-JP-2004
+for ($i = 0x80; $i <= 0x9F; $i++)
+	convertInvalidString(chr($i), "%", "ISO-2022-JP-2004", "UTF-8");
+
+// Regression test: Codepoint which has a special representation in EUC-JP-2004
+convertInvalidString("\xFF\x95", "%", "UTF-16BE", "ISO-2022-JP-2004");
+
+// Regression test: Old implementation did not switch properly between JIS X 0213 plane 1
+// and plane 2
+// So try a character which is in plane 1 followed by one in plane 2
+testValidString("\x30\x00\x4E\x02", "\x1B\$(Q\x21\x21\x1B\$(P\x21\x22\x1B(B", "UTF-16BE", "ISO-2022-JP-2004");
+// Try plane 2 followed by plane 1
+testValidString("\x4E\x02\x30\x00", "\x1B\$(P\x21\x22\x1B\$(Q\x21\x21\x1B(B", "UTF-16BE", "ISO-2022-JP-2004");
+
 // Test "long" illegal character markers
 mb_substitute_character("long");
 convertInvalidString("\xE0", "%", "ISO-2022-JP-2004", "UTF-8");
diff --git a/ext/mbstring/tests/iso2022jp_encoding.phpt b/ext/mbstring/tests/iso2022jp_encoding.phpt
@@ -192,6 +192,9 @@ for ($i = 0; $i <= 0xFF; $i++) {
 		identifyInvalidString($escapeSequence, 'ISO-2022-JP');
 	}
 }
+/* Also try a bare ESC */
+identifyInvalidString("\x1B", 'JIS');
+identifyInvalidString("\x1B", 'ISO-2022-JP');
 
 echo "All escape sequences work as expected\n";
 
diff --git a/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt b/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt
@@ -209,6 +209,16 @@ testValidString("\x30\x00\x00A", "\x1B\$B\x21\x21\x1B(BA", "UTF-16BE", "ISO-2022
 // Switch from JISX 0208 Kanji to JISX 0201 Kana
 testValidString("\x30\x00\xFF\x67", "\x1B\$B\x21\x21\x1B(I'\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);
 
+/* Convert Unicode flag emoji to ISO-2022-JP-KDDI proprietary flag emoji
+ * I am not able to confirm that the kuten codes we are using for these proprietary emoji are the right ones
+ * (There doesn't seem to be any publicly available reference, and I don't have a legacy KDDI device)
+ *
+ * However, the conversion does not work in the opposite direction; this is because of the test
+ * `if (s >= (84 * 94) && s < (91 * 94))`, which the kuten code we are using for flag emoji doesn't match
+ * That test is inherited from the old implementation (from libmbfl), and I have no way to confirm that
+ * changing it won't break anything */
+testValidString("\x00\x01\xF1\xF0\x00\x01\xF1\xF7", "\x1B\$B\x70\x55\x1B(B", "UTF-32BE", "ISO-2022-JP-KDDI", false);
+
 echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";
 
 testValidString("\x00\xA5", "\x1B\$B!o\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);
diff --git a/ext/mbstring/tests/iso2022kr_encoding.phpt b/ext/mbstring/tests/iso2022kr_encoding.phpt
@@ -106,6 +106,14 @@ convertValidString("\x76\x20\x00a\x00b", "\x1B$)C\x0E\x74\x30\x0Fab", "UTF-16BE"
 // 0x7E, resulting in a failed assertion
 convertInvalidString("\x0E~/", "%", "ISO-2022-KR", "UTF-8");
 
+// Regression test: The old implementation would wrongly convert some codepoints
+// which are not in KS X 1001 at all to 'random' characters in KS X 1001
+convertInvalidString("\xFF\x86", "\x1B\$)C%", "UTF-16BE", "ISO-2022-KR");
+
+// Regression test: The old implementation would sometimes emit an extra 0x0F ('shift in')
+// character at the end of a string, although the string was already ending in ASCII mode
+convertValidString("\x68\x46\x00a", "\x1B\$)C\x0E\x68\x46\x0Fa", "UTF-16BE", "ISO-2022-KR", false);
+
 // Test "long" illegal character markers
 mb_substitute_character("long");
 convertInvalidString("\x1B", "%", "ISO-2022-KR", "UTF-8");
diff --git a/ext/mbstring/tests/sjis_mobile_encodings.phpt b/ext/mbstring/tests/sjis_mobile_encodings.phpt
@@ -306,6 +306,14 @@ convertValidString("\xA9\xA9\xA9\xA9", "\xF9\xD6\xF9\xD6\xF9\xD6\xF9\xD6", '8bit
 convertValidString("\xA9\xA9\xA9\xA9", "\xF7\x74\xF7\x74\xF7\x74\xF7\x74", '8bit', 'SJIS-Mobile#KDDI');
 convertValidString("\xA9\xA9\xA9\xA9", "\xF7\xEE\xF7\xEE\xF7\xEE\xF7\xEE", '8bit', 'SJIS-Mobile#SOFTBANK');
 
+// Regression test: Old implementation used to drop digits (0-9) and hash (#) if
+// they appeared at end of input string (only the digits are exercised here)
+for ($i = ord('0'); $i <= ord('9'); $i++) {
+  convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#DOCOMO');
+  convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#KDDI');
+  convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#SOFTBANK');
+}
+
 ?>
 --EXPECT--
 SJIS-Mobile#DOCOMO verification and conversion works on all valid characters
diff --git a/ext/mbstring/tests/ucs4_encoding.phpt b/ext/mbstring/tests/ucs4_encoding.phpt
@@ -12,8 +12,14 @@ testValidString("\x00\x00\xFE\xFF\x00\x00\x30\x01", "\x30\x01", "UCS-4", "UTF-16
 testValidString("\x02\x30\x00\x00", "\x30\x02", "UCS-4LE", "UTF-16BE");
 testValidString("\x00\x00\x30\x03", "\x30\x03", "UCS-4BE", "UTF-16BE");
 
+// Truncated input
 convertInvalidString("\x01\x02\x03", "%", "UCS-4", "UTF-8");
 
+
+// Codepoint above U+10FFFF
+convertInvalidString("\x00\x11\x00\x00", "%", "UCS-4BE", "UTF-8");
+convertInvalidString("\x00\x00\x11\x00", "%", "UCS-4LE", "UTF-8");
+
 // Test "long" illegal character markers
 mb_substitute_character("long");
 convertInvalidString("\x6F\x00\x00\x00", "U+6F000000", "UCS-4BE", "UTF-8");

Original file line number	Diff line number	Diff line change
`@@ -192,6 +192,9 @@ for ($i = 0; $i <= 0xFF; $i++) {`
`192`	`192`	`identifyInvalidString($escapeSequence, 'ISO-2022-JP');`
`193`	`193`	`}`
`194`	`194`	`}`
	`195`	`+/* Also try a bare ESC */`
	`196`	`+identifyInvalidString("\x1B", 'JIS');`
	`197`	`+identifyInvalidString("\x1B", 'ISO-2022-JP');`
`195`	`198`
`196`	`199`	`echo "All escape sequences work as expected\n";`
`197`	`200`