Error handling for UTF-8 complies with WHATWG specification

alexdowad · alexdowad · commit 04e59c916f12 · 2022-04-16T15:04:38.000+02:00
In 7502c86, I adjusted the number of error markers emitted on invalid UTF-8 text to be more consistent with mbstring's behavior on other text encodings (generally, it emits one error marker for one unexpected byte). I didn't expect that anybody would actually care one way or the other, but felt that it was better to be consistent than not. Later, Martin Auswöger kindly pointed out that the WHATWG encoding specification, which governs how various text encodings are handled by web browsers, does actually specify how many error markers should be generated for any given piece of invalid UTF-8 text. Until now, we have never really paid much attention to the WHATWG specification, but we do want to comply with as many relevant specifications as possible. And since PHP is commonly used for web applications, compatibility with the behavior of web browsers is obviously a good thing.
diff --git a/NEWS b/NEWS
@@ -22,6 +22,10 @@ PHP                                                                        NEWS
 - Hash:
   . Fixed bug #81714 (segfault when serializing finalized HashContext). (cmb)
 
+- MBString:
+  . Number of error markers emitted for invalid UTF-8 text matches WHATWG specification.
+    This is a return to the behavior of PHP 8.0 and earlier. (alexdowad)
+
 - MySQLi:
   . Fixed bug GH-8267 (MySQLi uses unsupported format specifier on Windows).
     (cmb)
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
@@ -122,9 +122,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			CK((*filter->output_function)(s, filter->data));
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 	case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@@ -139,9 +137,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			filter->status++;
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 	case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@@ -156,9 +152,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			filter->status++;
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 	case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@@ -167,9 +161,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			filter->status++;
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 	default:
diff --git a/ext/mbstring/tests/illformed_utf_sequences.phpt b/ext/mbstring/tests/illformed_utf_sequences.phpt
@@ -22,28 +22,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
 var_dump(chk_enc("\x41\x42\x43", 0));
 var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
 var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
-var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
-var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
-var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
-var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
+var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
+var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
+var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
+var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
 var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
 var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
 var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
 var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
 
 var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
-var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
-var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
+var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
+var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
 var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
 var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
 
 var_dump(chk_enc("\xc1\xbf", 2));
 var_dump(chk_enc("\xc2\x80", 0));
 var_dump(chk_enc("\xdf\xbf", 0));
-var_dump(chk_enc("\xe0\x9f\xff", 2));
+var_dump(chk_enc("\xe0\x9f\xff", 3));
 var_dump(chk_enc("\xe0\xa0\x80", 2));
 var_dump(chk_enc("\xef\xbf\xbf", 0));
-var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
+var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
 var_dump(chk_enc("\xf0\x90\x80\x80", 0));
 var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
 var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
@@ -58,7 +58,7 @@ echo "UTF-8 and surrogates area\n";
 $out = '';
 $cnt = 0;
 for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
-    $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
+    $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
     if ($s === false) {
         $cnt++;
     } else {
diff --git a/ext/mbstring/tests/utf8_error_handling.phpt b/ext/mbstring/tests/utf8_error_handling.phpt
@@ -0,0 +1,56 @@
+--TEST--
+Confirm error handling for UTF-8 complies with WHATWG spec
+--EXTENSIONS--
+mbstring
+--FILE--
+<?php
+/* The WHATWG specifies not just how web browsers should handle _valid_
+ * UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
+ * as how many error markers each invalid byte sequence should decode
+ * to).
+ * That specification is followed by the JavaScript Encoding API.
+ *
+ * The API documentation for mb_convert_encoding does not specify how
+ * many error markers we will emit for each possible invalid byte
+ * sequence, so we might as well comply with the WHATWG specification.
+ *
+ * Thanks to Martin Auswöger for pointing this out... and another big
+ * thanks for providing test cases!
+ *
+ * Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
+ */
+mb_substitute_character(0x25);
+
+$testCases = [
+  ["\x80", "%"],
+  ["\xFF", "%"],
+  ["\xC2\x7F", "%\x7F"],
+  ["\xC2\x80", "\xC2\x80"],
+  ["\xDF\xBF", "\xDF\xBF"],
+  ["\xDF\xC0", "%%"],
+  ["\xE0\xA0\x7F", "%\x7F"],
+  ["\xE0\xA0\x80", "\xE0\xA0\x80"],
+  ["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
+  ["\xEF\xBF\xC0", "%%"],
+  ["\xF0\x90\x80\x7F", "%\x7F"],
+  ["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
+  ["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
+  ["\xF4\x8F\xBF\xC0", "%%"],
+  ["\xFA\x80\x80\x80\x80", "%%%%%"],
+  ["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
+  ["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
+  ["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
+];
+
+foreach ($testCases as $testCase) {
+  $result = mb_convert_encoding($testCase[0], 'UTF-8', 'UTF-8');
+  if ($result !== $testCase[1]) {
+    die("Expected UTF-8 string " . bin2hex($testCase[0]) . " to convert to UTF-8 string " . bin2hex($testCase[1]) . "; got " . bin2hex($result));
+  }
+}
+
+echo "All done!\n";
+
+?>
+--EXPECT--
+All done!
diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt
@@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');
 
 $invalid = array(
   // Codepoints outside of valid 0-0x10FFFF range for Unicode
-  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
+  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
   "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
   "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
 
   // Reserved range for UTF-16 surrogate pairs
-  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2),     // CP 0xD800
-  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2),     // CP 0xDBFF
-  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2),     // CP 0xDFFF
+  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3),     // CP 0xD800
+  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDBFF
+  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDFFF
 
   // Truncated characters
   "\xDF" => "\x00\x00\x00%",         // should have been 2-byte
@@ -788,8 +788,8 @@ $invalid = array(
 
   // Multi-byte characters which end too soon and go to a junk byte
   // (Which isn't even valid to start a new character)
-  "\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
-  "\xF0\xBF\xFF" => "\x00\x00\x00%",
+  "\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
+  "\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
 
   // Continuation bytes which appear outside of a MB char
   "\x80" => "\x00\x00\x00%",
@@ -799,8 +799,8 @@ $invalid = array(
   // Overlong code units
   // (Using more bytes than needed to encode a character)
   "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2),        // didn't need 2 bytes
-  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2),    // didn't need 3 bytes
-  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
+  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3),    // didn't need 3 bytes
+  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
 );
 
 testInvalidCodepoints($invalid, 'UTF-8');