Skip to content

Commit 85a95a2

Browse files
committed
Merge branch 'PHP-8.1'
* PHP-8.1: Restore backwards-compatible mappings of 0x5C and 0x7E in SJIS
2 parents 6b02cab + d62f535 commit 85a95a2

File tree

2 files changed

+33
-21
lines changed

2 files changed

+33
-21
lines changed

ext/mbstring/libmbfl/filters/mbfilter_sjis.c

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -145,11 +145,7 @@ int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
145145

146146
switch (filter->status) {
147147
case 0:
148-
if (c == 0x5C) {
149-
CK((*filter->output_function)(0xA5, filter->data));
150-
} else if (c == 0x7E) {
151-
CK((*filter->output_function)(0x203E, filter->data));
152-
} else if (c >= 0 && c < 0x80) { /* ASCII */
148+
if (c >= 0 && c < 0x80) { /* ASCII */
153149
CK((*filter->output_function)(c, filter->data));
154150
} else if (c > 0xA0 && c < 0xE0) { /* Kana */
155151
CK((*filter->output_function)(0xFEC0 + c, filter->data));
@@ -201,17 +197,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
201197
int c1, c2, s1, s2;
202198

203199
s1 = 0;
204-
if (c == 0x5C) {
205-
/* Unicode 0x5C is a backslash; but Shift-JIS uses 0x5C for the
206-
* Yen sign. JIS X 0208 kuten 0x2140 is a backslash. */
207-
s1 = 0x2140;
208-
} else if (c == 0x7E) {
209-
/* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or
210-
* macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */
211-
s1 = 0x2141;
212-
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
213-
s1 = 0x7E; /* Halfwidth overline/macron */
214-
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
200+
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
215201
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
216202
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
217203
s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
@@ -222,7 +208,9 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
222208
}
223209
if (s1 <= 0) {
224210
if (c == 0xA5) { /* YEN SIGN */
225-
s1 = 0x5C;
211+
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
212+
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
213+
s1 = 0x2131; /* FULLWIDTH MACRON */
226214
} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
227215
s1 = 0x2140;
228216
} else if (c == 0x2225) { /* PARALLEL TO */

ext/mbstring/tests/sjis_encoding.phpt

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,37 @@ for ($i = 0; $i < 0x20; $i++) {
2020
$fromUnicode["\x00" . chr($i)] = chr($i);
2121
}
2222

23-
/* U+007E is TILDE; convert to Shift-JIS 0x8160 (WAVE DASH) */
24-
$fromUnicode["\x00\x7E"] = "\x81\x60";
23+
/* According to the relevant Japanese Industrial Standards Committee standards,
24+
* SJIS 0x5C is a Yen sign, and 0x7E is an overline.
25+
*
26+
* However, this conflicts with the implementation of SJIS in various legacy
27+
* software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken
28+
* as equivalent to the same ASCII bytes.
29+
*
30+
* Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes
31+
* compatibly with Microsoft products. This was changed in PHP 8.1.0, in an
32+
* attempt to comply with the JISC specifications. However, after discussion
33+
* with various concerned Japanese developers, it seems that the historical
34+
* behavior was more useful in the majority of applications which process
35+
* SJIS-encoded text. */
36+
$validChars["\x5C"] = "\x00\x5C";
37+
$validChars["\x7E"] = "\x00\x7E";
38+
$fromUnicode["\x00\x5C"] = "\x5C";
39+
$fromUnicode["\x00\x7E"] = "\x7E";
40+
41+
/* That means it does not make sense to convert U+203E (OVERLINE)
42+
* to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */
43+
$fromUnicode["\x20\x3E"] = "\x81\x50";
44+
/* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */
45+
$fromUnicode["\x00\xAF"] = "\x81\x50";
46+
/* Since we are treating 0x5C as equivalent to U+005C, it does not
47+
* make sense to convert U+00A5 (YEN SIGN) to 0x5C.
48+
* Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */
49+
$fromUnicode["\x00\xA5"] = "\x81\x8F";
50+
2551
/* DEL character */
2652
$validChars["\x7F"] = "\x00\x7F";
2753
$fromUnicode["\x00\x7F"] = "\x7F";
28-
/* U+00AF is MACRON; Shift-JIS 0x7E is overline */
29-
$fromUnicode["\x00\xAF"] = "\x7E";
3054
/* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
3155
$validChars["\x81\x5F"] = "\xFF\x3C";
3256
$fromUnicode["\xFF\x3C"] = "\x81\x5F";

0 commit comments

Comments
 (0)