Skip to content

Commit ae05c96

Browse files
committed
Add more tests for mbstring encoding conversion
When testing the preceding commits, I used a script to generate a large number of random strings and try to find strings which would yield different outputs from the new and old encoding conversion code. Some were found. In most cases, analysis revealed that the new code was correct and the old code was not. In all cases where the new code was incorrect, regression tests were added. However, there may be some value in adding regression tests for cases where the old code was incorrect as well. That is done here. This does not cover every case where the new and old code yielded different results. Some of them were very obscure, and it is proving difficult even to reproduce them (since I did not keep a record of all the input strings which triggered the differing output).
1 parent 8466e07 commit ae05c96

7 files changed

+57
-2
lines changed

ext/mbstring/tests/cp5022x_encoding.phpt

+7-2
Original file line numberDiff line numberDiff line change
@@ -366,9 +366,14 @@ foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
366366

367367
echo "Invalid escape sequences OK\n";
368368

369-
// Regression test
369+
// Regression tests
370370
if (mb_convert_encoding("\x1BC\xF5", 'UTF-16BE', 'CP50221') !== "\x00%\x00C\x00%")
371-
die("Bad")
371+
die("Bad");
372+
373+
// Previously, the CP50220 implementation would eat trailing null bytes
374+
$converted = mb_convert_encoding("ab\x00", 'UTF-16BE', 'CP50220');
375+
if ($converted !== "\x00a\x00b\x00\x00")
376+
die("Bad handling of trailing null byte (got " . bin2hex($converted) . ")");
372377

373378
?>
374379
--EXPECT--

ext/mbstring/tests/iso2022jp_2004_encoding.phpt

+15
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,21 @@ for ($i = 0; $i < 100; $i++) {
320320
testValid($testString, $convertsTo, false);
321321
}
322322

323+
// Regression test: Test handling of 0x80-0x9F; these have a special meaning in EUC-JP-2004,
324+
// but not in ISO-2022-JP-2004
325+
for ($i = 0x80; $i <= 0x9F; $i++)
326+
convertInvalidString(chr($i), "%", "ISO-2022-JP-2004", "UTF-8");
327+
328+
// Regression test: Codepoint which has a special representation in EUC-JP-2004
329+
convertInvalidString("\xFF\x95", "%", "UTF-16BE", "ISO-2022-JP-2004");
330+
331+
// Regression test: Old implementation did not switch properly between JIS X 0213 plane 1
332+
// and plane 2
333+
// So try a character which is in plane 1 followed by one in plane 2
334+
testValidString("\x30\x00\x4E\x02", "\x1B\$(Q\x21\x21\x1B\$(P\x21\x22\x1B(B", "UTF-16BE", "ISO-2022-JP-2004");
335+
// Try plane 2 followed by plane 1
336+
testValidString("\x4E\x02\x30\x00", "\x1B\$(P\x21\x22\x1B\$(Q\x21\x21\x1B(B", "UTF-16BE", "ISO-2022-JP-2004");
337+
323338
// Test "long" illegal character markers
324339
mb_substitute_character("long");
325340
convertInvalidString("\xE0", "%", "ISO-2022-JP-2004", "UTF-8");

ext/mbstring/tests/iso2022jp_encoding.phpt

+3
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ for ($i = 0; $i <= 0xFF; $i++) {
192192
identifyInvalidString($escapeSequence, 'ISO-2022-JP');
193193
}
194194
}
195+
/* Also try a bare ESC */
196+
identifyInvalidString("\x1B", 'JIS');
197+
identifyInvalidString("\x1B", 'ISO-2022-JP');
195198

196199
echo "All escape sequences work as expected\n";
197200

ext/mbstring/tests/iso2022jp_kddi_encoding.phpt

+10
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,16 @@ testValidString("\x30\x00\x00A", "\x1B\$B\x21\x21\x1B(BA", "UTF-16BE", "ISO-2022
209209
// Switch from JISX 0208 Kanji to JISX 0201 Kana
210210
testValidString("\x30\x00\xFF\x67", "\x1B\$B\x21\x21\x1B(I'\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);
211211

212+
/* Convert Unicode flag emoji to ISO-2022-JP-KDDI proprietary flag emoji
213+
* I am not able to confirm that the kuten codes we are using for these proprietary emoji are the right ones
214+
* (There doesn't seem to be any publicly available reference, and I don't have a legacy KDDI device)
215+
*
216+
* However, the conversion does not work in the opposite direction; this is because of the test
217+
* `if (s >= (84 * 94) && s < (91 * 94))`, which the kuten code which we are using for flag emoji doesn't match
218+
* That test is inherited from the old implementation (from libmbfl), and I have no way to confirm that
219+
* changing it won't break anything */
220+
testValidString("\x00\x01\xF1\xF0\x00\x01\xF1\xF7", "\x1B\$B\x70\x55\x1B(B", "UTF-32BE", "ISO-2022-JP-KDDI", false);
221+
212222
echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";
213223

214224
testValidString("\x00\xA5", "\x1B\$B!o\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);

ext/mbstring/tests/iso2022kr_encoding.phpt

+8
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,14 @@ convertValidString("\x76\x20\x00a\x00b", "\x1B$)C\x0E\x74\x30\x0Fab", "UTF-16BE"
106106
// 0x7E, resulting in a failed assertion
107107
convertInvalidString("\x0E~/", "%", "ISO-2022-KR", "UTF-8");
108108

109+
// Regression test: The old implementation would wrongly convert some codepoints
110+
// which are not in KS X 1001 at all to 'random' characters in KS X 1001
111+
convertInvalidString("\xFF\x86", "\x1B\$)C%", "UTF-16BE", "ISO-2022-KR");
112+
113+
// Regression test: The old implementation would sometimes emit an extra 0x0F ('shift in')
114+
// character at the end of a string, although the string was already ending in ASCII mode
115+
convertValidString("\x68\x46\x00a", "\x1B\$)C\x0E\x68\x46\x0Fa", "UTF-16BE", "ISO-2022-KR", false);
116+
109117
// Test "long" illegal character markers
110118
mb_substitute_character("long");
111119
convertInvalidString("\x1B", "%", "ISO-2022-KR", "UTF-8");

ext/mbstring/tests/sjis_mobile_encodings.phpt

+8
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,14 @@ convertValidString("\xA9\xA9\xA9\xA9", "\xF9\xD6\xF9\xD6\xF9\xD6\xF9\xD6", '8bit
306306
convertValidString("\xA9\xA9\xA9\xA9", "\xF7\x74\xF7\x74\xF7\x74\xF7\x74", '8bit', 'SJIS-Mobile#KDDI');
307307
convertValidString("\xA9\xA9\xA9\xA9", "\xF7\xEE\xF7\xEE\xF7\xEE\xF7\xEE", '8bit', 'SJIS-Mobile#SOFTBANK');
308308

309+
// Regression test: Old implementation used to drop digits (0-9) and hash (#) if
310+
// they appeared at end of input string (only the digits are exercised below)
311+
for ($i = ord('0'); $i <= ord('9'); $i++) {
312+
convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#DOCOMO');
313+
convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#KDDI');
314+
convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#SOFTBANK');
315+
}
316+
309317
?>
310318
--EXPECT--
311319
SJIS-Mobile#DOCOMO verification and conversion works on all valid characters

ext/mbstring/tests/ucs4_encoding.phpt

+6
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,14 @@ testValidString("\x00\x00\xFE\xFF\x00\x00\x30\x01", "\x30\x01", "UCS-4", "UTF-16
1212
testValidString("\x02\x30\x00\x00", "\x30\x02", "UCS-4LE", "UTF-16BE");
1313
testValidString("\x00\x00\x30\x03", "\x30\x03", "UCS-4BE", "UTF-16BE");
1414

15+
// Truncated input
1516
convertInvalidString("\x01\x02\x03", "%", "UCS-4", "UTF-8");
1617

18+
19+
// Codepoint above U+10FFFF
20+
convertInvalidString("\x00\x11\x00\x00", "%", "UCS-4BE", "UTF-8");
21+
convertInvalidString("\x00\x00\x11\x00", "%", "UCS-4LE", "UTF-8");
22+
1723
// Test "long" illegal character markers
1824
mb_substitute_character("long");
1925
convertInvalidString("\x6F\x00\x00\x00", "U+6F000000", "UCS-4BE", "UTF-8");

0 commit comments

Comments
 (0)