mb_detect_encoding is more accurate on strings with UTF-8/16 BOM

alexdowad · alexdowad · commit cb840799b487 · 2023-01-19T08:40:39.000+02:00
Thanks to the GitHub user 'titanz35' for pointing out that the new
implementation of mb_detect_encoding had poor detection accuracy on
UTF-8 and UTF-16 strings with a byte-order mark.
diff --git a/NEWS b/NEWS
@@ -51,6 +51,8 @@ PHP                                                                        NEWS
     casing rules for the Greek letter sigma. For mb_convert_case, conditional
     casing only applies to MB_CASE_LOWER and MB_CASE_TITLE modes, not to
     MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad)
+  . mb_detect_encoding is better able to identify UTF-8 and UTF-16 strings
+    with a byte-order mark. (Alex Dowad)
 
 - Opcache:
   . Added start, restart and force restart time to opcache's
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -43,6 +43,7 @@
 #include "libmbfl/filters/mbfilter_uuencode.h"
 #include "libmbfl/filters/mbfilter_ucs4.h"
 #include "libmbfl/filters/mbfilter_utf8.h"
+#include "libmbfl/filters/mbfilter_utf16.h"
 #include "libmbfl/filters/mbfilter_singlebyte.h"
 #include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
 
@@ -2994,6 +2995,24 @@ static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len,
 		data[i].in_len = in_len;
 		data[i].state = 0;
 		data[i].demerits = 0;
+
+		/* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
+		if (elist[i] == &mbfl_encoding_utf8) {
+			if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
+				data[i].in_len -= 3;
+				data[i].in += 3;
+			}
+		} else if (elist[i] == &mbfl_encoding_utf16be) {
+			if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
+				data[i].in_len -= 2;
+				data[i].in += 2;
+			}
+		} else if (elist[i] == &mbfl_encoding_utf16le) {
+			if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
+				data[i].in_len -= 2;
+				data[i].in += 2;
+			}
+		}
 	}
 
 	unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt
@@ -100,6 +100,19 @@ try {
     echo $e->getMessage() . \PHP_EOL;
 }
 
+echo "== BOM TEST ==\n";
+
+$str = chr(239).chr(187).chr(191).chr(195).chr(180); // UTF-8 BOM followed by ô
+var_dump(mb_detect_encoding($str, ['UTF-8', 'ISO-8859-1'], true));
+// U+4E4E is the Chinese character 乎. Its two bytes are identical, so normally it would be
+// impossible to distinguish UTF-16LE from UTF-16BE, but a leading BOM tells us which one it is.
+var_dump(mb_detect_encoding("\xFE\xFF\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true));
+var_dump(mb_detect_encoding("\xFF\xFE\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true));
+// However, a BOM should only appear at the beginning of the string
+$detected = mb_detect_encoding("\x4E\x4E\xFE\xFF\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true);
+if ($detected === 'UTF-16BE' || $detected === 'UTF-16LE')
+    die("Don't accept a BOM in the middle of a string");
+
 echo "== TORTURE TEST ==\n";
 
 function test($strings, $encodings) {
@@ -373,5 +386,9 @@ SJIS: SJIS
 INT: EUC-JP
 EUC-JP: EUC-JP
 mb_detect_encoding(): Argument #2 ($encodings) contains invalid encoding "BAD"
+== BOM TEST ==
+string(5) "UTF-8"
+string(8) "UTF-16BE"
+string(8) "UTF-16LE"
 == TORTURE TEST ==
 Done!