Skip to content

Commit d104481

Browse files
committed Jan 5, 2023
Correct entry for 0x80,0xFD-FF in SJIS multi-byte character length table
As a performance optimization, mbstring implements some functions using tables which give the (byte) length of a multi-byte character using a lookup based on the value of the first byte. These tables are called `mblen_table`.

For many years, the mblen_table for SJIS has had '2' in position 0x80. That is wrong; it should have been '1'. Reasons:

For SJIS, SJIS-2004, and mobile variants of SJIS, 0x80 has never been treated as the first byte of a 2-byte character. It has always been treated as a single erroneous byte. On the other hand, 0x80 is a valid character in MacJapanese, but it is a 1-byte character, not a 2-byte one.

The same applies to bytes 0xFD-0xFF; these are 1-byte characters in MacJapanese, and in other SJIS variants, they are not valid (as the first byte of a character).

Thanks to the GitHub user 'youkidearitai' for finding this problem.
1 parent 275bf3b commit d104481

File tree

4 files changed

+233
-25
lines changed

4 files changed

+233
-25
lines changed
 

‎ext/mbstring/libmbfl/filters/mbfilter_sjis.c

+3-3
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@
3838

3939
static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter);
4040

41-
const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
41+
const unsigned char mblen_table_sjis[] = { /* 0x81-0x9F,0xE0-0xFC */
4242
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4343
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4444
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -47,14 +47,14 @@ const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
4747
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4848
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4949
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
50+
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5151
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5252
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5353
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5454
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5555
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5656
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
57-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
57+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
5858
};
5959

6060
static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};

‎ext/mbstring/tests/mb_str_split_jp.phpt

+24
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,17 @@ if(end($array) !== $enc){
6262
last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
6363
}
6464

65+
/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */
66+
echo "== Regression test for SJIS byte 0x80 ==\n";
67+
foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SoftBank'] as $encoding) {
68+
$array = mb_str_split("\x80\xA1abc\x80\xA1", 2, $encoding);
69+
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
70+
71+
// Also try bytes 0xFD, 0xFE, and 0xFF
72+
$array = mb_str_split("abc\xFD\xFE\xFFab\xFD\xFE\xFF", 2, $encoding);
73+
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
74+
}
75+
6576
?>
6677
--EXPECT--
6778
BIG-5: a4e9 a5bb
@@ -73,3 +84,16 @@ UTF-16LE: e565 2c67
7384
UTF-32BE: 000065e5 0000672c
7485
UTF-32LE: e5650000 2c670000
7586
UTF-8: e697a5 e69cac
87+
== Regression test for SJIS byte 0x80 ==
88+
SJIS: [80a1, 6162, 6380, a1]
89+
SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
90+
SJIS-2004: [80a1, 6162, 6380, a1]
91+
SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
92+
MacJapanese: [80a1, 6162, 6380, a1]
93+
MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
94+
SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
95+
SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
96+
SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
97+
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
98+
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
99+
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]

‎ext/mbstring/tests/mb_strlen.phpt

+53-19
Original file line number | Diff line number | Diff line change
@@ -13,43 +13,59 @@ include_once('common.inc');
1313
mb_detect_order('auto');
1414

1515
// Test string
16-
$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。';
16+
$euc_jp = mb_convert_encoding("0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。", 'EUC-JP', 'UTF-8');
1717
$ascii = 'abcdefghijklmnopqrstuvwxyz;]=#0123456789';
1818

19-
// ASCII
2019
echo "== ASCII ==\n";
21-
print mb_strlen($ascii,'ASCII') . "\n";
22-
print strlen($ascii) . "\n";
20+
print mb_strlen($ascii,'ASCII') . "\n";
21+
print strlen($ascii) . "\n";
2322

24-
// EUC-JP
2523
echo "== EUC-JP ==\n";
26-
print mb_strlen($euc_jp,'EUC-JP') . "\n";
24+
print mb_strlen($euc_jp,'EUC-JP') . "\n";
2725
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
28-
print strlen($euc_jp) . "\n";
26+
print strlen($euc_jp) . "\n";
2927

30-
// SJIS
3128
echo "== SJIS ==\n";
3229
$sjis = mb_convert_encoding($euc_jp, 'SJIS','EUC-JP');
33-
print mb_strlen($sjis,'SJIS') . "\n";
30+
print mb_strlen($sjis,'SJIS') . "\n";
3431
mb_internal_encoding('SJIS') or print("mb_internal_encoding() failed\n");
35-
print strlen($sjis) . "\n";
32+
print strlen($sjis) . "\n";
33+
print "-- Testing illegal bytes 0x80,0xFD-FF --\n";
34+
// mb_strlen used to wrongly treat 0x80 as the starting byte of a 2-byte SJIS character
35+
print mb_strlen("\x80\xA1", 'SJIS') . "\n";
36+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS') . "\n";
37+
38+
echo "== MacJapanese ==\n";
39+
print mb_strlen("\x80\xA1", 'MacJapanese') . "\n";
40+
print mb_strlen("abc\xFD\xFE\xFF", 'MacJapanese') . "\n";
41+
42+
echo "== SJIS-2004 ==\n";
43+
print mb_strlen("\x80\xA1", 'SJIS-2004') . "\n";
44+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-2004') . "\n";
45+
46+
echo "== SJIS-Mobile#DOCOMO ==\n";
47+
print mb_strlen("\x80\xA1", 'SJIS-Mobile#DOCOMO') . "\n";
48+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#DOCOMO') . "\n";
49+
50+
echo "== SJIS-Mobile#KDDI ==\n";
51+
print mb_strlen("\x80\xA1", 'SJIS-Mobile#KDDI') . "\n";
52+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#KDDI') . "\n";
53+
54+
echo "== SJIS-Mobile#SoftBank ==\n";
55+
print mb_strlen("\x80\xA1", 'SJIS-Mobile#SoftBank') . "\n";
56+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#SoftBank') . "\n";
3657

37-
// JIS
38-
// Note: either convert_encoding or strlen has problem
3958
echo "== JIS ==\n";
4059
$jis = mb_convert_encoding($euc_jp, 'JIS','EUC-JP');
41-
print mb_strlen($jis,'JIS') . "\n";
60+
print mb_strlen($jis,'JIS') . "\n";
4261
mb_internal_encoding('JIS') or print("mb_internal_encoding() failed\n");
43-
print strlen($jis) . "\n";
62+
print strlen($jis) . "\n";
4463

45-
// UTF-8
46-
// Note: either convert_encoding or strlen has problem
4764
echo "== UTF-8 ==\n";
4865
$utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP');
49-
print mb_strlen($utf8,'UTF-8') . "\n";
66+
print mb_strlen($utf8,'UTF-8') . "\n";
5067
mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
51-
print strlen($utf8) . "\n";
52-
68+
print strlen($utf8) . "\n";
5369

5470
// Wrong Parameters
5571
echo "== WRONG PARAMETERS ==\n";
@@ -72,6 +88,24 @@ try {
7288
== SJIS ==
7389
43
7490
72
91+
-- Testing illegal bytes 0x80,0xFD-FF --
92+
2
93+
6
94+
== MacJapanese ==
95+
2
96+
6
97+
== SJIS-2004 ==
98+
2
99+
6
100+
== SJIS-Mobile#DOCOMO ==
101+
2
102+
6
103+
== SJIS-Mobile#KDDI ==
104+
2
105+
6
106+
== SJIS-Mobile#SoftBank ==
107+
2
108+
6
75109
== JIS ==
76110
43
77111
90

‎ext/mbstring/tests/mb_substr.phpt

+153-3
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,21 @@ ini_set('include_path','.');
1111
include_once('common.inc');
1212

1313
// EUC-JP
14-
$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。日本語は面倒臭い。';
14+
$euc_jp = mb_convert_encoding('0123この文字列は日本語です。EUC-JPを使っています。日本語は面倒臭い。', 'EUC-JP', 'UTF-8');
15+
// SJIS
16+
$sjis = mb_convert_encoding('日本語テキストです。0123456789。', 'SJIS', 'UTF-8');
17+
// ISO-2022-JP
18+
$iso2022jp = "\x1B\$B\x21\x21!r\x1B(BABC";
19+
// GB-18030
20+
$gb18030 = mb_convert_encoding('密码用户名密码名称名称', 'GB18030', 'UTF-8');
21+
// HZ
22+
$hz = "The next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.";
23+
// UTF-8
24+
$utf8 = "Greek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь";
25+
// UTF-32
26+
$utf32 = mb_convert_encoding($utf8, 'UTF-32', 'UTF-8');
27+
// UTF-7
28+
$utf7 = mb_convert_encoding($utf8, 'UTF-7', 'UTF-8');
1529

1630
print "1: ". bin2hex(mb_substr($euc_jp, 10, 10,'EUC-JP')) . "\n";
1731
print "2: ". bin2hex(mb_substr($euc_jp, 0, 100,'EUC-JP')) . "\n";
@@ -20,12 +34,148 @@ $str = mb_substr($euc_jp, 100, 10,'EUC-JP');
2034
// Note: returns last character
2135
($str === "") ? print "3 OK\n" : print "NG: ".bin2hex($str)."\n";
2236

23-
$str = mb_substr($euc_jp, -100, 10,'EUC-JP');
24-
($str !== "") ? print "4 OK: ".bin2hex($str)."\n" : print "NG: ".bin2hex($str)."\n";
37+
$str = mb_substr($euc_jp, -100, 10, 'EUC-JP');
38+
print ($str !== "") ? "4 OK: " . bin2hex($str) . "\n" : "BAD: " . bin2hex($str) . "\n";
39+
40+
echo "SJIS:\n";
41+
print "1: " . bin2hex(mb_substr($sjis, 0, 3, 'SJIS')) . "\n";
42+
print "2: " . bin2hex(mb_substr($sjis, -1, null, 'SJIS')) . "\n";
43+
print "3: " . bin2hex(mb_substr($sjis, -5, 3, 'SJIS')) . "\n";
44+
print "4: " . bin2hex(mb_substr($sjis, 1, null, 'SJIS')) . "\n";
45+
print "5:" . bin2hex(mb_substr($sjis, 10, 0, 'SJIS')) . "\n";
46+
echo "-- Testing illegal SJIS byte 0x80 --\n";
47+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS')) . "\n";
48+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS')) . "\n";
49+
50+
echo "SJIS-2004:\n";
51+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-2004')) . "\n";
52+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-2004')) . "\n";
53+
54+
echo "MacJapanese:\n";
55+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'MacJapanese')) . "\n";
56+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'MacJapanese')) . "\n";
57+
58+
echo "SJIS-Mobile#DOCOMO:\n";
59+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#DOCOMO')) . "\n";
60+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#DOCOMO')) . "\n";
61+
62+
echo "SJIS-Mobile#KDDI:\n";
63+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#KDDI')) . "\n";
64+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#KDDI')) . "\n";
65+
66+
echo "SJIS-Mobile#SoftBank:\n";
67+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
68+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
69+
70+
echo "ISO-2022-JP:\n";
71+
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
72+
print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
73+
print "3: " . bin2hex(mb_substr($iso2022jp, -6, 3, 'ISO-2022-JP')) . "\n";
74+
print "4: " . bin2hex(mb_substr($iso2022jp, -3, 2, 'ISO-2022-JP')) . "\n";
75+
print "5: " . bin2hex(mb_substr($iso2022jp, 1, null, 'ISO-2022-JP')) . "\n";
76+
print "6:" . bin2hex(mb_substr($iso2022jp, 10, 0, 'ISO-2022-JP')) . "\n";
77+
print "7:" . bin2hex(mb_substr($iso2022jp, 100, 10, 'ISO-2022-JP')) . "\n";
78+
79+
echo "GB-18030:\n";
80+
print "1: " . bin2hex(mb_substr($gb18030, 0, 3, 'GB-18030')) . "\n";
81+
print "2: " . bin2hex(mb_substr($gb18030, -1, null, 'GB-18030')) . "\n";
82+
print "3: " . bin2hex(mb_substr($gb18030, -5, 3, 'GB-18030')) . "\n";
83+
print "4: " . bin2hex(mb_substr($gb18030, 1, null, 'GB-18030')) . "\n";
84+
print "5:" . bin2hex(mb_substr($gb18030, 10, 0, 'GB-18030')) . "\n";
85+
86+
echo "HZ:\n";
87+
print "1: " . mb_substr($hz, 0, 3, 'HZ') . "\n";
88+
print "2: " . mb_substr($hz, -1, null, 'HZ') . "\n";
89+
print "3: " . mb_substr($hz, -5, 3, 'HZ') . "\n";
90+
print "4: " . mb_substr($hz, 1, null, 'HZ') . "\n";
91+
print "5:" . mb_substr($hz, 10, 0, 'HZ') . "\n";
92+
93+
echo "UTF-8:\n";
94+
print "1: " . mb_substr($utf8, 0, 3, 'UTF-8') . "\n";
95+
print "2: " . mb_substr($utf8, -1, null, 'UTF-8') . "\n";
96+
print "3: " . mb_substr($utf8, -5, 3, 'UTF-8') . "\n";
97+
print "4: " . mb_substr($utf8, 1, null, 'UTF-8') . "\n";
98+
print "5:" . mb_substr($utf8, 10, 0, 'UTF-8') . "\n";
99+
100+
echo "UTF-32:\n";
101+
print "1: " . mb_convert_encoding(mb_substr($utf32, 0, 3, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
102+
print "2: " . mb_convert_encoding(mb_substr($utf32, -1, null, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
103+
print "3: " . mb_convert_encoding(mb_substr($utf32, -5, 3, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
104+
print "4: " . mb_convert_encoding(mb_substr($utf32, 1, null, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
105+
print "5:" . mb_convert_encoding(mb_substr($utf32, 10, 0, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
106+
107+
echo "UTF-7:\n";
108+
print "1: " . mb_convert_encoding(mb_substr($utf7, 0, 3, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
109+
print "2: " . mb_convert_encoding(mb_substr($utf7, -1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
110+
print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
111+
print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
112+
print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
25113

26114
?>
27115
--EXPECT--
28116
1: c6fccbdcb8eca4c7a4b9a1a34555432d
29117
2: 30313233a4b3a4cecab8bbfacef3a4cfc6fccbdcb8eca4c7a4b9a1a34555432d4a50a4f2bbc8a4c3a4c6a4a4a4dea4b9a1a3c6fccbdcb8eca4cfccccc5ddbdada4a4a1a3
30118
3 OK
31119
4 OK: 30313233a4b3a4cecab8bbfacef3a4cf
120+
SJIS:
121+
1: 93fa967b8cea
122+
2: 8142
123+
3: 825582568257
124+
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
125+
5:
126+
-- Testing illegal SJIS byte 0x80 --
127+
6380
128+
806162
129+
SJIS-2004:
130+
6380
131+
806162
132+
MacJapanese:
133+
6380
134+
806162
135+
SJIS-Mobile#DOCOMO:
136+
6380
137+
806162
138+
SJIS-Mobile#KDDI:
139+
6380
140+
806162
141+
SJIS-Mobile#SoftBank:
142+
6380
143+
806162
144+
ISO-2022-JP:
145+
1: 1b2442212121721b284241
146+
2: 43
147+
3: 1b2442212121721b284241
148+
4: 4142
149+
5: 1b244221721b2842414243
150+
6:
151+
7:
152+
GB-18030:
153+
1: c3dcc2ebd3c3
154+
2: b3c6
155+
3: c2ebc3fbb3c6
156+
4: c2ebd3c3bba7c3fbc3dcc2ebc3fbb3c6c3fbb3c6
157+
5:
158+
HZ:
159+
1: The
160+
2: .
161+
3: ~{!#~}By
162+
4: he next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.
163+
5:
164+
UTF-8:
165+
1: Gre
166+
2: ь
167+
3: йте
168+
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
169+
5:
170+
UTF-32:
171+
1: Gre
172+
2: ь
173+
3: йте
174+
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
175+
5:
176+
UTF-7:
177+
1: Gre
178+
2: ь
179+
3: йте
180+
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
181+
5:

0 commit comments

Comments
 (0)
Please sign in to comment.