Skip to content

Commit cb84079

Browse files
committed
mb_detect_encoding is more accurate on strings with UTF-8/16 BOM
Thanks to the GitHub user 'titanz35' for pointing out that the new implementation of mb_detect_encoding had poor detection accuracy on UTF-8 and UTF-16 strings with a byte-order mark.
1 parent e8f14da commit cb84079

File tree

3 files changed

+38
-0
lines changed

3 files changed

+38
-0
lines changed

NEWS

+2
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ PHP NEWS
5151
casing rules for the Greek letter sigma. For mb_convert_case, conditional
5252
casing only applies to MB_CASE_LOWER and MB_CASE_TITLE modes, not to
5353
MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad)
54+
. mb_detect_encoding is better able to identify UTF-8 and UTF-16 strings
55+
with a byte-order mark. (Alex Dowad)
5456

5557
- Opcache:
5658
. Added start, restart and force restart time to opcache's

ext/mbstring/mbstring.c

+19
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include "libmbfl/filters/mbfilter_uuencode.h"
4444
#include "libmbfl/filters/mbfilter_ucs4.h"
4545
#include "libmbfl/filters/mbfilter_utf8.h"
46+
#include "libmbfl/filters/mbfilter_utf16.h"
4647
#include "libmbfl/filters/mbfilter_singlebyte.h"
4748
#include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
4849

@@ -2994,6 +2995,24 @@ static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len,
29942995
data[i].in_len = in_len;
29952996
data[i].state = 0;
29962997
data[i].demerits = 0;
2998+
2999+
/* If the input begins with the byte order mark of this candidate encoding (UTF-8, UTF-16BE, or UTF-16LE), skip over it */
3000+
if (elist[i] == &mbfl_encoding_utf8) {
3001+
if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3002+
data[i].in_len -= 3;
3003+
data[i].in += 3;
3004+
}
3005+
} else if (elist[i] == &mbfl_encoding_utf16be) {
3006+
if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3007+
data[i].in_len -= 2;
3008+
data[i].in += 2;
3009+
}
3010+
} else if (elist[i] == &mbfl_encoding_utf16le) {
3011+
if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3012+
data[i].in_len -= 2;
3013+
data[i].in += 2;
3014+
}
3015+
}
29973016
}
29983017

29993018
unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */

ext/mbstring/tests/mb_detect_encoding.phpt

+17
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,19 @@ try {
100100
echo $e->getMessage() . \PHP_EOL;
101101
}
102102

103+
echo "== BOM TEST ==\n";
104+
105+
$str = chr(239).chr(187).chr(191).chr(195).chr(180); // UTF-8 BOM followed by ô
106+
var_dump(mb_detect_encoding($str, ['UTF-8', 'ISO-8859-1'], true));
107+
// U+4E4E is the Chinese character 乎; since both of its bytes are 0x4E, it would normally be impossible to distinguish UTF-16LE from UTF-16BE
108+
// But the BOM can tell us which one it is
109+
var_dump(mb_detect_encoding("\xFE\xFF\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true));
110+
var_dump(mb_detect_encoding("\xFF\xFE\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true));
111+
// However, a BOM should only appear at the beginning of the string
112+
$detected = mb_detect_encoding("\x4E\x4E\xFE\xFF\x4E\x4E", ['UTF-8', 'ISO-8859-1', 'UTF-16LE', 'UTF-16BE'], true);
113+
if ($detected === 'UTF-16BE' || $detected === 'UTF-16LE')
114+
die("Don't accept a BOM in the middle of a string");
115+
103116
echo "== TORTURE TEST ==\n";
104117

105118
function test($strings, $encodings) {
@@ -373,5 +386,9 @@ SJIS: SJIS
373386
INT: EUC-JP
374387
EUC-JP: EUC-JP
375388
mb_detect_encoding(): Argument #2 ($encodings) contains invalid encoding "BAD"
389+
== BOM TEST ==
390+
string(5) "UTF-8"
391+
string(8) "UTF-16BE"
392+
string(8) "UTF-16LE"
376393
== TORTURE TEST ==
377394
Done!

0 commit comments

Comments
 (0)