Correct entry for 0x80,0xFD-FF in SJIS multi-byte character length table

alexdowad · alexdowad · commit d104481af822 · 2023-01-05T14:05:39.000+02:00
As a performance optimization, mbstring implements some functions using
lookup tables which give the length (in bytes) of a multi-byte
character, indexed by the value of its first byte. These tables are
called `mblen_table`.

For many years, the mblen_table for SJIS has had '2' in position 0x80.
That is wrong; it should have been '1'. Reasons:

For SJIS, SJIS-2004, and mobile variants of SJIS, 0x80 has never been
treated as the first byte of a 2-byte character. It has always been
treated as a single erroneous byte. On the other hand, 0x80 is a valid
character in MacJapanese... but a 1-byte character, not a 2-byte one.

The same applies to bytes 0xFD-0xFF, which also had '2' in the table
where '1' is correct: these are 1-byte characters in MacJapanese, and in
the other SJIS variants they are not valid as the first byte of a
character.

Thanks to the GitHub user 'youkidearitai' for finding this problem.
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c
@@ -38,7 +38,7 @@
 
 static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter);
 
-const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
+const unsigned char mblen_table_sjis[] = { /* 0x81-0x9F,0xE0-0xFC */
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -47,14 +47,14 @@ const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
 };
 
 static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};
diff --git a/ext/mbstring/tests/mb_str_split_jp.phpt b/ext/mbstring/tests/mb_str_split_jp.phpt
@@ -62,6 +62,17 @@ if(end($array) !== $enc){
         last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
 }
 
+/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */
+echo "== Regression test for SJIS byte 0x80 ==\n";
+foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SoftBank'] as $encoding) {
+    $array = mb_str_split("\x80\xA1abc\x80\xA1", 2, $encoding);
+    echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
+
+    // Also try bytes 0xFD, 0xFE, and 0xFF
+    $array = mb_str_split("abc\xFD\xFE\xFFab\xFD\xFE\xFF", 2, $encoding);
+    echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
+}
+
 ?>
 --EXPECT--
 BIG-5: a4e9 a5bb
@@ -73,3 +84,16 @@ UTF-16LE: e565 2c67
 UTF-32BE: 000065e5 0000672c
 UTF-32LE: e5650000 2c670000
 UTF-8: e697a5 e69cac
+== Regression test for SJIS byte 0x80 ==
+SJIS: [80a1, 6162, 6380, a1]
+SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
+SJIS-2004: [80a1, 6162, 6380, a1]
+SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
+MacJapanese: [80a1, 6162, 6380, a1]
+MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
+SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
+SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
+SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
+SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
+SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
+SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
diff --git a/ext/mbstring/tests/mb_strlen.phpt b/ext/mbstring/tests/mb_strlen.phpt
@@ -13,43 +13,59 @@ include_once('common.inc');
 mb_detect_order('auto');
 
 // Test string
-$euc_jp = '0123����ʸ��������ܸ�Ǥ���EUC-JP��ȤäƤ��ޤ���0123���ܸ�����ݽ�����';
+$euc_jp = mb_convert_encoding("0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。", 'EUC-JP', 'UTF-8');
 $ascii  = 'abcdefghijklmnopqrstuvwxyz;]=#0123456789';
 
-// ASCII
 echo "== ASCII ==\n";
-print  mb_strlen($ascii,'ASCII') . "\n";
-print  strlen($ascii) . "\n";
+print mb_strlen($ascii,'ASCII') . "\n";
+print strlen($ascii) . "\n";
 
-// EUC-JP
 echo "== EUC-JP ==\n";
-print  mb_strlen($euc_jp,'EUC-JP') . "\n";
+print mb_strlen($euc_jp,'EUC-JP') . "\n";
 mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
-print  strlen($euc_jp) . "\n";
+print strlen($euc_jp) . "\n";
 
-// SJIS
 echo "== SJIS ==\n";
 $sjis = mb_convert_encoding($euc_jp, 'SJIS','EUC-JP');
-print  mb_strlen($sjis,'SJIS') . "\n";
+print mb_strlen($sjis,'SJIS') . "\n";
 mb_internal_encoding('SJIS') or print("mb_internal_encoding() failed\n");
-print  strlen($sjis) . "\n";
+print strlen($sjis) . "\n";
+print "-- Testing illegal bytes 0x80,0xFD-FF --\n";
+// mb_strlen used to wrongly treat 0x80 as the starting byte of a 2-byte SJIS character
+print mb_strlen("\x80\xA1", 'SJIS') . "\n";
+print mb_strlen("abc\xFD\xFE\xFF", 'SJIS') . "\n";
+
+echo "== MacJapanese ==\n";
+print mb_strlen("\x80\xA1", 'MacJapanese') . "\n";
+print mb_strlen("abc\xFD\xFE\xFF", 'MacJapanese') . "\n";
+
+echo "== SJIS-2004 ==\n";
+print mb_strlen("\x80\xA1", 'SJIS-2004') . "\n";
+print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-2004') . "\n";
+
+echo "== SJIS-Mobile#DOCOMO ==\n";
+print mb_strlen("\x80\xA1", 'SJIS-Mobile#DOCOMO') . "\n";
+print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#DOCOMO') . "\n";
+
+echo "== SJIS-Mobile#KDDI ==\n";
+print mb_strlen("\x80\xA1", 'SJIS-Mobile#KDDI') . "\n";
+print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#KDDI') . "\n";
+
+echo "== SJIS-Mobile#SoftBank ==\n";
+print mb_strlen("\x80\xA1", 'SJIS-Mobile#SoftBank') . "\n";
+print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#SoftBank') . "\n";
 
-// JIS
-// Note: either convert_encoding or strlen has problem
 echo "== JIS ==\n";
 $jis = mb_convert_encoding($euc_jp, 'JIS','EUC-JP');
-print  mb_strlen($jis,'JIS') . "\n";
+print mb_strlen($jis,'JIS') . "\n";
 mb_internal_encoding('JIS')  or print("mb_internal_encoding() failed\n");
-print  strlen($jis) . "\n";
+print strlen($jis) . "\n";
 
-// UTF-8
-// Note: either convert_encoding or strlen has problem
 echo "== UTF-8 ==\n";
 $utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP');
-print  mb_strlen($utf8,'UTF-8') . "\n";
+print mb_strlen($utf8,'UTF-8') . "\n";
 mb_internal_encoding('UTF-8')  or print("mb_internal_encoding() failed\n");
-print  strlen($utf8) . "\n";
-
+print strlen($utf8) . "\n";
 
 // Wrong Parameters
 echo "== WRONG PARAMETERS ==\n";
@@ -72,6 +88,24 @@ try {
 == SJIS ==
 43
 72
+-- Testing illegal bytes 0x80,0xFD-FF --
+2
+6
+== MacJapanese ==
+2
+6
+== SJIS-2004 ==
+2
+6
+== SJIS-Mobile#DOCOMO ==
+2
+6
+== SJIS-Mobile#KDDI ==
+2
+6
+== SJIS-Mobile#SoftBank ==
+2
+6
 == JIS ==
 43
 90
diff --git a/ext/mbstring/tests/mb_substr.phpt b/ext/mbstring/tests/mb_substr.phpt
@@ -11,7 +11,21 @@ ini_set('include_path','.');
 include_once('common.inc');
 
 // EUC-JP
-$euc_jp = '0123����ʸ��������ܸ�Ǥ���EUC-JP��ȤäƤ��ޤ������ܸ�����ݽ�����';
+$euc_jp = mb_convert_encoding('0123この文字列は日本語です。EUC-JPを使っています。日本語は面倒臭い。', 'EUC-JP', 'UTF-8');
+// SJIS
+$sjis = mb_convert_encoding('日本語テキストです。01234５６７８９。', 'SJIS', 'UTF-8');
+// ISO-2022-JP
+$iso2022jp = "\x1B\$B\x21\x21!r\x1B(BABC";
+// GB-18030
+$gb18030 = mb_convert_encoding('密码用户名密码名称名称', 'GB18030', 'UTF-8');
+// HZ
+$hz = "The next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.";
+// UTF-8
+$utf8 = "Greek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь";
+// UTF-32
+$utf32 = mb_convert_encoding($utf8, 'UTF-32', 'UTF-8');
+// UTF-7
+$utf7 = mb_convert_encoding($utf8, 'UTF-7', 'UTF-8');
 
 print  "1: ". bin2hex(mb_substr($euc_jp,  10,  10,'EUC-JP')) . "\n";
 print  "2: ". bin2hex(mb_substr($euc_jp,   0, 100,'EUC-JP')) . "\n";
@@ -20,12 +34,148 @@ $str = mb_substr($euc_jp, 100, 10,'EUC-JP');
 // Note: returns last character
 ($str === "") ? print "3 OK\n" : print "NG: ".bin2hex($str)."\n";
 
-$str = mb_substr($euc_jp, -100, 10,'EUC-JP');
-($str !== "") ? print "4 OK: ".bin2hex($str)."\n" : print "NG: ".bin2hex($str)."\n";
+$str = mb_substr($euc_jp, -100, 10, 'EUC-JP');
+print ($str !== "") ? "4 OK: " . bin2hex($str) . "\n" : "BAD: " . bin2hex($str) . "\n";
+
+echo "SJIS:\n";
+print "1: " . bin2hex(mb_substr($sjis, 0, 3, 'SJIS')) . "\n";
+print "2: " . bin2hex(mb_substr($sjis, -1, null, 'SJIS')) . "\n";
+print "3: " . bin2hex(mb_substr($sjis, -5, 3, 'SJIS')) . "\n";
+print "4: " . bin2hex(mb_substr($sjis, 1, null, 'SJIS')) . "\n";
+print "5:" . bin2hex(mb_substr($sjis, 10, 0, 'SJIS')) . "\n";
+echo "-- Testing illegal SJIS byte 0x80 --\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS')) . "\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS')) . "\n";
+
+echo "SJIS-2004:\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-2004')) . "\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-2004')) . "\n";
+
+echo "MacJapanese:\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'MacJapanese')) . "\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'MacJapanese')) . "\n";
+
+echo "SJIS-Mobile#DOCOMO:\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#DOCOMO')) . "\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#DOCOMO')) . "\n";
+
+echo "SJIS-Mobile#KDDI:\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#KDDI')) . "\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#KDDI')) . "\n";
+
+echo "SJIS-Mobile#SoftBank:\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
+print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
+
+echo "ISO-2022-JP:\n";
+print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
+print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
+print "3: " . bin2hex(mb_substr($iso2022jp, -6, 3, 'ISO-2022-JP')) . "\n";
+print "4: " . bin2hex(mb_substr($iso2022jp, -3, 2, 'ISO-2022-JP')) . "\n";
+print "5: " . bin2hex(mb_substr($iso2022jp, 1, null, 'ISO-2022-JP')) . "\n";
+print "6:" . bin2hex(mb_substr($iso2022jp, 10, 0, 'ISO-2022-JP')) . "\n";
+print "7:" . bin2hex(mb_substr($iso2022jp, 100, 10, 'ISO-2022-JP')) . "\n";
+
+echo "GB-18030:\n";
+print "1: " . bin2hex(mb_substr($gb18030, 0, 3, 'GB-18030')) . "\n";
+print "2: " . bin2hex(mb_substr($gb18030, -1, null, 'GB-18030')) . "\n";
+print "3: " . bin2hex(mb_substr($gb18030, -5, 3, 'GB-18030')) . "\n";
+print "4: " . bin2hex(mb_substr($gb18030, 1, null, 'GB-18030')) . "\n";
+print "5:" . bin2hex(mb_substr($gb18030, 10, 0, 'GB-18030')) . "\n";
+
+echo "HZ:\n";
+print "1: " . mb_substr($hz, 0, 3, 'HZ') . "\n";
+print "2: " . mb_substr($hz, -1, null, 'HZ') . "\n";
+print "3: " . mb_substr($hz, -5, 3, 'HZ') . "\n";
+print "4: " . mb_substr($hz, 1, null, 'HZ') . "\n";
+print "5:" . mb_substr($hz, 10, 0, 'HZ') . "\n";
+
+echo "UTF-8:\n";
+print "1: " . mb_substr($utf8, 0, 3, 'UTF-8') . "\n";
+print "2: " . mb_substr($utf8, -1, null, 'UTF-8') . "\n";
+print "3: " . mb_substr($utf8, -5, 3, 'UTF-8') . "\n";
+print "4: " . mb_substr($utf8, 1, null, 'UTF-8') . "\n";
+print "5:" . mb_substr($utf8, 10, 0, 'UTF-8') . "\n";
+
+echo "UTF-32:\n";
+print "1: " . mb_convert_encoding(mb_substr($utf32, 0, 3, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
+print "2: " . mb_convert_encoding(mb_substr($utf32, -1, null, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
+print "3: " . mb_convert_encoding(mb_substr($utf32, -5, 3, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
+print "4: " . mb_convert_encoding(mb_substr($utf32, 1, null, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
+print "5:" . mb_convert_encoding(mb_substr($utf32, 10, 0, 'UTF-32'), 'UTF-8', 'UTF-32') . "\n";
+
+echo "UTF-7:\n";
+print "1: " . mb_convert_encoding(mb_substr($utf7, 0, 3, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
+print "2: " . mb_convert_encoding(mb_substr($utf7, -1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
+print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
+print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
+print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
 
 ?>
 --EXPECT--
 1: c6fccbdcb8eca4c7a4b9a1a34555432d
 2: 30313233a4b3a4cecab8bbfacef3a4cfc6fccbdcb8eca4c7a4b9a1a34555432d4a50a4f2bbc8a4c3a4c6a4a4a4dea4b9a1a3c6fccbdcb8eca4cfccccc5ddbdada4a4a1a3
 3 OK
 4 OK: 30313233a4b3a4cecab8bbfacef3a4cf
+SJIS:
+1: 93fa967b8cea
+2: 8142
+3: 825582568257
+4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
+5:
+-- Testing illegal SJIS byte 0x80 --
+6380
+806162
+SJIS-2004:
+6380
+806162
+MacJapanese:
+6380
+806162
+SJIS-Mobile#DOCOMO:
+6380
+806162
+SJIS-Mobile#KDDI:
+6380
+806162
+SJIS-Mobile#SoftBank:
+6380
+806162
+ISO-2022-JP:
+1: 1b2442212121721b284241
+2: 43
+3: 1b2442212121721b284241
+4: 4142
+5: 1b244221721b2842414243
+6:
+7:
+GB-18030:
+1: c3dcc2ebd3c3
+2: b3c6
+3: c2ebc3fbb3c6
+4: c2ebd3c3bba7c3fbc3dcc2ebc3fbb3c6c3fbb3c6
+5:
+HZ:
+1: The
+2: .
+3: ~{!#~}By
+4: he next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.
+5:
+UTF-8:
+1: Gre
+2: ь
+3: йте
+4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
+5:
+UTF-32:
+1: Gre
+2: ь
+3: йте
+4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
+5:
+UTF-7:
+1: Gre
+2: ь
+3: йте
+4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
+5: