Skip to content

Commit 2dc9026

Browse files
committed
Restore backwards-compatible mappings of 0x5C and 0x7E in SJIS
According to the relevant Japanese Industrial Standards Committee standards, SJIS 0x5C is a Yen sign, and 0x7E is an overline. However, this conflicts with the implementation of SJIS in various legacy software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken as equivalent to the same ASCII bytes. Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes compatibly with Microsoft products. This was changed in PHP 8.1.0, in an attempt to comply with the JISC specifications. However, after discussion with various concerned Japanese developers, it seems that the historical behavior was more useful in the majority of applications which process SJIS-encoded text. Since we are now treating SJIS 0x5C as equivalent to U+005C and 0x7E as equivalent to U+007E, it does not make sense to convert U+203E (OVERLINE) to 0x7E, nor does it make sense to convert U+00A5 (YEN SIGN) to 0x5C. Restore the mappings for those codepoints from before PHP 8.1.0.
1 parent 77ba689 commit 2dc9026

File tree

2 files changed

+33
-21
lines changed

2 files changed

+33
-21
lines changed

ext/mbstring/libmbfl/filters/mbfilter_sjis.c

+5-17
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,7 @@ int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
141141

142142
switch (filter->status) {
143143
case 0:
144-
if (c == 0x5C) {
145-
CK((*filter->output_function)(0xA5, filter->data));
146-
} else if (c == 0x7E) {
147-
CK((*filter->output_function)(0x203E, filter->data));
148-
} else if (c >= 0 && c < 0x80) { /* ASCII */
144+
if (c >= 0 && c < 0x80) { /* ASCII */
149145
CK((*filter->output_function)(c, filter->data));
150146
} else if (c > 0xA0 && c < 0xE0) { /* Kana */
151147
CK((*filter->output_function)(0xFEC0 + c, filter->data));
@@ -197,17 +193,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
197193
int c1, c2, s1, s2;
198194

199195
s1 = 0;
200-
if (c == 0x5C) {
201-
/* Unicode 0x5C is a backslash; but Shift-JIS uses 0x5C for the
202-
* Yen sign. JIS X 0208 kuten 0x2140 is a backslash. */
203-
s1 = 0x2140;
204-
} else if (c == 0x7E) {
205-
/* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or
206-
* macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */
207-
s1 = 0x2141;
208-
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
209-
s1 = 0x7E; /* Halfwidth overline/macron */
210-
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
196+
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
211197
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
212198
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
213199
s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
@@ -218,7 +204,9 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
218204
}
219205
if (s1 <= 0) {
220206
if (c == 0xA5) { /* YEN SIGN */
221-
s1 = 0x5C;
207+
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
208+
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
209+
s1 = 0x2131; /* FULLWIDTH MACRON */
222210
} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
223211
s1 = 0x2140;
224212
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */

ext/mbstring/tests/sjis_encoding.phpt

+28-4
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,37 @@ for ($i = 0; $i < 0x20; $i++) {
2020
$fromUnicode["\x00" . chr($i)] = chr($i);
2121
}
2222

23-
/* U+007E is TILDE; convert to Shift-JIS 0x8160 (WAVE DASH) */
24-
$fromUnicode["\x00\x7E"] = "\x81\x60";
23+
/* According to the relevant Japanese Industrial Standards Committee standards,
24+
* SJIS 0x5C is a Yen sign, and 0x7E is an overline.
25+
*
26+
* However, this conflicts with the implementation of SJIS in various legacy
27+
* software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken
28+
* as equivalent to the same ASCII bytes.
29+
*
30+
* Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes
31+
* compatibly with Microsoft products. This was changed in PHP 8.1.0, in an
32+
* attempt to comply with the JISC specifications. However, after discussion
33+
* with various concerned Japanese developers, it seems that the historical
34+
* behavior was more useful in the majority of applications which process
35+
* SJIS-encoded text. */
36+
$validChars["\x5C"] = "\x00\x5C";
37+
$validChars["\x7E"] = "\x00\x7E";
38+
$fromUnicode["\x00\x5C"] = "\x5C";
39+
$fromUnicode["\x00\x7E"] = "\x7E";
40+
41+
/* That means it does not make sense to convert U+203E (OVERLINE)
42+
* to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */
43+
$fromUnicode["\x20\x3E"] = "\x81\x50";
44+
/* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */
45+
$fromUnicode["\x00\xAF"] = "\x81\x50";
46+
/* Since we are treating 0x5C as equivalent to U+005C, it does not
47+
* make sense to convert U+00A5 (YEN SIGN) to 0x5C.
48+
* Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */
49+
$fromUnicode["\x00\xA5"] = "\x81\x8F";
50+
2551
/* DEL character */
2652
$validChars["\x7F"] = "\x00\x7F";
2753
$fromUnicode["\x00\x7F"] = "\x7F";
28-
/* U+00AF is MACRON; Shift-JIS 0x7E is overline */
29-
$fromUnicode["\x00\xAF"] = "\x7E";
3054
/* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
3155
$validChars["\x81\x5F"] = "\xFF\x3C";
3256
$fromUnicode["\xFF\x3C"] = "\x81\x5F";

0 commit comments

Comments
 (0)