Skip to content

Commit 81faab9

Browse files
committed
Improve mb_detect_encoding accuracy for text containing vowels with macrons
Among other world languages, the Māori language commonly uses vowels with macrons.
1 parent d7eb4cf commit 81faab9

File tree

3 files changed

+26
-1
lines changed

3 files changed

+26
-1
lines changed

ext/mbstring/common_codepoints.txt

+5
Original file line number | Diff line number | Diff line change
@@ -3,18 +3,23 @@
33
0x0020 0x007E # ASCII
44
0x00A1 0x00AC # Pound sign, Yen sign, copyright sign...
55
0x00AE 0x00FF # Accented Latin characters
6+
0x0101 0x0101 # a with macron
67
0x0104 0x0107 # Polish
78
0x010C 0x010F # Czech
9+
0x0113 0x0113 # e with macron
810
0x0118 0x011B # Polish, Czech
911
0x011F 0x011F # Turkish
12+
0x012B 0x012B # i with macron
1013
0x0130 0x0131 # Turkish
1114
0x0141 0x0144 # Polish
1215
0x0147 0x0148 # Czech
16+
0x014D 0x014D # o with macron
1317
0x0150 0x0151 # Hungarian
1418
0x0158 0x015B # Czech, Polish
1519
0x015F 0x015F # Turkish
1620
0x0160 0x0161 # Used in Slavic names
1721
0x0164 0x0165 # Czech
22+
0x016B 0x016B # u with macron
1823
0x016E 0x016F # Czech
1924
0x0170 0x0171 # Hungarian
2025
0x0179 0x017E # Polish, Czech, other Slavic languages

ext/mbstring/rare_cp_bitvec.h

+1 -1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
static const uint32_t rare_codepoint_bitvec[] = {
1313
0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000,
14-
0x70ff0f0f, 0xfffcffff, 0x70fcfe61, 0x81fc3fcc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
14+
0x70f70f0d, 0xfffcf7ff, 0x70fcde61, 0x81fc37cc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
1515
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
1616
0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000,
1717
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

ext/mbstring/tests/mb_detect_encoding.phpt

+20
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,18 @@ $css = 'input[type="radio"]:checked + img {
8585
}';
8686
echo mb_detect_encoding($css, mb_list_encodings(), true), "\n";
8787

88+
// Test cases courtesy of Kirill Roskolii and Chris Burgess
89+
echo "-- Māori text --\n";
90+
91+
echo mb_detect_encoding("Total Māori,31.5,33.3,31.8,33,36.4,33.2,33.2", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
92+
// Names of native birds from Aotearoa:
93+
echo mb_detect_encoding("Kākā", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
94+
echo mb_detect_encoding("Whēkau", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
95+
echo mb_detect_encoding("Tīwaiwaka", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
96+
echo mb_detect_encoding("Kōtuku", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
97+
echo mb_detect_encoding("Kererū", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
98+
echo mb_detect_encoding("Tūī", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
99+
88100
echo "== DETECT ORDER ==\n";
89101

90102
mb_detect_order('auto');
@@ -408,6 +420,14 @@ UTF-8
408420
UTF-8
409421
SJIS
410422
UTF-8
423+
-- Māori text --
424+
UTF-8
425+
UTF-8
426+
UTF-8
427+
UTF-8
428+
UTF-8
429+
UTF-8
430+
UTF-8
411431
== DETECT ORDER ==
412432
JIS: JIS
413433
EUC-JP: EUC-JP

0 commit comments

Comments
 (0)