Skip to content

Commit 04e59c9

Browse files
committed
Error handling for UTF-8 complies with WHATWG specification
In 7502c86, I adjusted the number of error markers emitted on invalid UTF-8 text to be more consistent with mbstring's behavior on other text encodings (generally, it emits one error marker for one unexpected byte). I didn't expect that anybody would actually care one way or the other, but felt that it was better to be consistent than not. Later, Martin Auswöger kindly pointed out that the WHATWG encoding specification, which governs how various text encodings are handled by web browsers, does actually specify how many error markers should be generated for any given piece of invalid UTF-8 text. Until now, we have never really paid much attention to the WHATWG specification, but we do want to comply with as many relevant specifications as possible. And since PHP is commonly used for web applications, compatibility with the behavior of web browsers is obviously a good thing.
1 parent b0ab5d0 commit 04e59c9

File tree

5 files changed

+81
-29
lines changed

5 files changed

+81
-29
lines changed

NEWS

+4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ PHP NEWS
2222
- Hash:
2323
. Fixed bug #81714 (segfault when serializing finalized HashContext). (cmb)
2424

25+
- MBString:
26+
. Number of error markers emitted for invalid UTF-8 text matches WHATWG specification.
27+
This is a return to the behavior of PHP 8.0 and earlier. (alexdowad)
28+
2529
- MySQLi:
2630
. Fixed bug GH-8267 (MySQLi uses unsupported format specifier on Windows).
2731
(cmb)

ext/mbstring/libmbfl/filters/mbfilter_utf8.c

+4-12
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
122122
CK((*filter->output_function)(s, filter->data));
123123
} else {
124124
CK(mbfl_filt_put_invalid_char(filter));
125-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
126-
goto retry;
127-
}
125+
goto retry;
128126
}
129127
break;
130128
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@@ -139,9 +137,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
139137
filter->status++;
140138
} else {
141139
CK(mbfl_filt_put_invalid_char(filter));
142-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
143-
goto retry;
144-
}
140+
goto retry;
145141
}
146142
break;
147143
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@@ -156,9 +152,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
156152
filter->status++;
157153
} else {
158154
CK(mbfl_filt_put_invalid_char(filter));
159-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
160-
goto retry;
161-
}
155+
goto retry;
162156
}
163157
break;
164158
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@@ -167,9 +161,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
167161
filter->status++;
168162
} else {
169163
CK(mbfl_filt_put_invalid_char(filter));
170-
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
171-
goto retry;
172-
}
164+
goto retry;
173165
}
174166
break;
175167
default:

ext/mbstring/tests/illformed_utf_sequences.phpt

+9-9
Original file line numberDiff line numberDiff line change
@@ -22,28 +22,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
2222
var_dump(chk_enc("\x41\x42\x43", 0));
2323
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
2424
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
25-
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
26-
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
27-
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
28-
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
25+
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
26+
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
27+
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
28+
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
2929
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
3030
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
3131
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
3232
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
3333

3434
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
35-
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
36-
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
35+
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
36+
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
3737
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
3838
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
3939

4040
var_dump(chk_enc("\xc1\xbf", 2));
4141
var_dump(chk_enc("\xc2\x80", 0));
4242
var_dump(chk_enc("\xdf\xbf", 0));
43-
var_dump(chk_enc("\xe0\x9f\xff", 2));
43+
var_dump(chk_enc("\xe0\x9f\xff", 3));
4444
var_dump(chk_enc("\xe0\xa0\x80", 2));
4545
var_dump(chk_enc("\xef\xbf\xbf", 0));
46-
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
46+
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
4747
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
4848
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
4949
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
@@ -58,7 +58,7 @@ echo "UTF-8 and surrogates area\n";
5858
$out = '';
5959
$cnt = 0;
6060
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
61-
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
61+
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
6262
if ($s === false) {
6363
$cnt++;
6464
} else {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
--TEST--
2+
Confirm error handling for UTF-8 complies with WHATWG spec
3+
--EXTENSIONS--
4+
mbstring
5+
--FILE--
6+
<?php
7+
/* The WHATWG specifies not just how web browsers should handle _valid_
8+
* UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
9+
* as how many error markers each invalid byte sequence should decode
10+
* to).
11+
* That specification is followed by the JavaScript Encoding API.
12+
*
13+
* The API documentation for mb_convert_encoding does not specify how
14+
* many error markers we will emit for each possible invalid byte
15+
* sequence, so we might as well comply with the WHATWG specification.
16+
*
17+
* Thanks to Martin Auswöger for pointing this out... and another big
18+
* thanks for providing test cases!
19+
*
20+
* Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
21+
*/
22+
mb_substitute_character(0x25);
23+
24+
$testCases = [
25+
["\x80", "%"],
26+
["\xFF", "%"],
27+
["\xC2\x7F", "%\x7F"],
28+
["\xC2\x80", "\xC2\x80"],
29+
["\xDF\xBF", "\xDF\xBF"],
30+
["\xDF\xC0", "%%"],
31+
["\xE0\xA0\x7F", "%\x7F"],
32+
["\xE0\xA0\x80", "\xE0\xA0\x80"],
33+
["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
34+
["\xEF\xBF\xC0", "%%"],
35+
["\xF0\x90\x80\x7F", "%\x7F"],
36+
["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
37+
["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
38+
["\xF4\x8F\xBF\xC0", "%%"],
39+
["\xFA\x80\x80\x80\x80", "%%%%%"],
40+
["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
41+
["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
42+
["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
43+
];
44+
45+
foreach ($testCases as $testCase) {
46+
$result = mb_convert_encoding($testCase[0], 'UTF-8', 'UTF-8');
47+
if ($result !== $testCase[1]) {
48+
die("Expected UTF-8 string " . bin2hex($testCase[0]) . " to convert to UTF-8 string " . bin2hex($testCase[1]) . "; got " . bin2hex($result));
49+
}
50+
}
51+
52+
echo "All done!\n";
53+
54+
?>
55+
--EXPECT--
56+
All done!

ext/mbstring/tests/utf_encodings.phpt

+8-8
Original file line numberDiff line numberDiff line change
@@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');
761761

762762
$invalid = array(
763763
// Codepoints outside of valid 0-0x10FFFF range for Unicode
764-
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
764+
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
765765
"\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
766766
"\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
767767

768768
// Reserved range for UTF-16 surrogate pairs
769-
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800
770-
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF
771-
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF
769+
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
770+
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
771+
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF
772772

773773
// Truncated characters
774774
"\xDF" => "\x00\x00\x00%", // should have been 2-byte
@@ -788,8 +788,8 @@ $invalid = array(
788788

789789
// Multi-byte characters which end too soon and go to a junk byte
790790
// (Which isn't even valid to start a new character)
791-
"\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
792-
"\xF0\xBF\xFF" => "\x00\x00\x00%",
791+
"\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
792+
"\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
793793

794794
// Continuation bytes which appear outside of a MB char
795795
"\x80" => "\x00\x00\x00%",
@@ -799,8 +799,8 @@ $invalid = array(
799799
// Overlong code units
800800
// (Using more bytes than needed to encode a character)
801801
"\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
802-
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes
803-
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
802+
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
803+
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
804804
);
805805

806806
testInvalidCodepoints($invalid, 'UTF-8');

0 commit comments

Comments
 (0)