From 1a3e686ba4fdb53a345cca262ee2cb398af5b285 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 26 Dec 2021 14:35:15 +0200 Subject: [PATCH 01/25] Implement fast text conversion interface for '7bit' --- ext/mbstring/libmbfl/filters/mbfilter_7bit.c | 41 +++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c index 999b96a60334e..3097dabd26e95 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c @@ -31,6 +31,9 @@ #include "mbfilter.h" #include "mbfilter_7bit.h" +static size_t mb_7bit_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_7bit(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + const mbfl_encoding mbfl_encoding_7bit = { mbfl_no_encoding_7bit, "7bit", @@ -40,8 +43,8 @@ const mbfl_encoding mbfl_encoding_7bit = { MBFL_ENCTYPE_SBCS, NULL, NULL, - NULL, - NULL + mb_7bit_to_wchar, + mb_wchar_to_7bit }; const struct mbfl_convert_vtbl vtbl_8bit_7bit = { @@ -82,3 +85,37 @@ int mbfl_filt_conv_any_7bit(int c, mbfl_convert_filter *filter) } return 0; } + +static size_t mb_7bit_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; + + while (p < e && out < limit) { + unsigned char c = *p++; + *out++ = (c < 0x80) ? 
c : MBFL_BAD_INPUT; + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_7bit(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + if (w <= 0x7F) { + out = mb_convert_buf_add(out, w); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_7bit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} From c9fb81e074b64df8b1c02baf0fc453aba2b94d01 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Tue, 28 Dec 2021 10:43:41 +0200 Subject: [PATCH 02/25] Implement fast text conversion interface for ISO-2022-KR When working on this, I read RFC 1557 again and realized that the comment at the top of the file was totally mistaken. Further, the legacy code did not obey the RFC. (It would emit the "ESC $ ) C" sequence anywhere, not just at the beginning of a line as the RFC requires.) The new code obeys the RFC; one quirk is that it always emits the escape sequence at the beginning of each output string, even if the string is completely ASCII (in which case the escape sequence is allowed, but not required). The new code doesn't always generate the same number of error markers for invalid escapes as the old code did. The old code could not emit the special KDDI emoji for national flags. Further, there was a bug in the test which the old code used to determine whether an 0xF byte should be emitted at the end of a string (to switch back to ASCII mode). As a result, it would not always switch back to ASCII mode, meaning that it was not always safe to concatenate the resulting strings. 
--- .../libmbfl/filters/mbfilter_iso2022_kr.c | 167 +++++++++++++++++- ext/mbstring/tests/iso2022kr_encoding.phpt | 12 +- 2 files changed, 174 insertions(+), 5 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c index cbb32bd635e3f..b92b8491cc911 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c @@ -28,8 +28,11 @@ */ /* ISO-2022-KR is defined in RFC 1557 - * The RFC says that _each_ line which uses KS X 1001 characters must start - * with an escape sequence of ESC $ ) C + * + * The RFC says that ESC $ ) C must appear once in an ISO-2022-KR string, + * at the beginning of a line, before any instances of the Shift In or + * Shift Out bytes which are used to switch between ASCII/KSC 5601 modes + * * We don't enforce that for ISO-2022-KR input */ #include "mbfilter.h" @@ -38,6 +41,8 @@ static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter); +static size_t mb_iso2022kr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_iso2022kr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); const mbfl_encoding mbfl_encoding_2022kr = { mbfl_no_encoding_2022kr, @@ -48,8 +53,8 @@ const mbfl_encoding mbfl_encoding_2022kr = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022kr_wchar, &vtbl_wchar_2022kr, - NULL, - NULL + mb_iso2022kr_to_wchar, + mb_wchar_to_iso2022kr }; const struct mbfl_convert_vtbl vtbl_wchar_2022kr = { @@ -263,3 +268,157 @@ static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter) return 0; } + +#define ASCII 0 +#define KSC5601 1 + +static size_t mb_iso2022kr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; + + while 
(p < e && out < limit) { + unsigned char c = *p++; + + if (c == 0x1B) { + if ((e - p) < 3) { + *out++ = MBFL_BAD_INPUT; + if (p < e && *p++ == '$') { + if (p < e) { + p++; + } + } + continue; + } + unsigned char c2 = *p++; + unsigned char c3 = *p++; + unsigned char c4 = *p++; + if (c2 == '$' && c3 == ')' && c4 == 'C') { + *state = ASCII; + } else { + if (c3 != ')') { + p--; + if (c2 != '$') + p--; + } + *out++ = MBFL_BAD_INPUT; + *state = ASCII; + } + } else if (c == 0xF) { + *state = ASCII; + } else if (c == 0xE) { + *state = KSC5601; + } else if (c >= 0x21 && c <= 0x7E && *state == KSC5601) { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + unsigned int w = 0; + + if (c2 < 0x21 || c2 > 0x7E) { + *out++ = MBFL_BAD_INPUT; + continue; + } + + if (c < 0x47) { + if (c != 0x22 || c2 <= 0x65) { + w = (c - 0x21)*190 + (c2 - 0x41) + 0x80; + ZEND_ASSERT(w < uhc2_ucs_table_size); + w = uhc2_ucs_table[w]; + } + } else if (c != 0x49 && c <= 0x7D) { + w = (c - 0x47)*94 + c2 - 0x21; + ZEND_ASSERT(w < uhc3_ucs_table_size); + w = uhc3_ucs_table[w]; + } + + if (!w) + w = MBFL_BAD_INPUT; + *out++ = w; + } else if (c < 0x80 && *state == ASCII) { + *out++ = c; + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +#define EMITTED_ESC_SEQUENCE 0x10 + +static void mb_wchar_to_iso2022kr(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + + /* This escape sequence needs to come *somewhere* at the beginning of a line before + * we can use the Shift In/Shift Out bytes, but it only needs to come once in a string + * Rather than tracking newlines, we can just emit the sequence once at the beginning + * of the output string... 
since that will always be "the beginning of a line" */ + if (len && !(buf->state & EMITTED_ESC_SEQUENCE)) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 4 + len); + out = mb_convert_buf_add4(out, 0x1B, '$', ')', 'C'); + buf->state |= EMITTED_ESC_SEQUENCE; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + + while (len--) { + uint32_t w = *in++; + unsigned int s = 0; + + if (w >= ucs_a1_uhc_table_min && w < ucs_a1_uhc_table_max) { + s = ucs_a1_uhc_table[w - ucs_a1_uhc_table_min]; + } else if (w >= ucs_a2_uhc_table_min && w < ucs_a2_uhc_table_max) { + s = ucs_a2_uhc_table[w - ucs_a2_uhc_table_min]; + } else if (w >= ucs_a3_uhc_table_min && w < ucs_a3_uhc_table_max) { + s = ucs_a3_uhc_table[w - ucs_a3_uhc_table_min]; + } else if (w >= ucs_i_uhc_table_min && w < ucs_i_uhc_table_max) { + s = ucs_i_uhc_table[w - ucs_i_uhc_table_min]; + } else if (w >= ucs_s_uhc_table_min && w < ucs_s_uhc_table_max) { + s = ucs_s_uhc_table[w - ucs_s_uhc_table_min]; + } else if (w >= ucs_r1_uhc_table_min && w < ucs_r1_uhc_table_max) { + s = ucs_r1_uhc_table[w - ucs_r1_uhc_table_min]; + } else if (w >= ucs_r2_uhc_table_min && w < ucs_r2_uhc_table_max) { + s = ucs_r2_uhc_table[w - ucs_r2_uhc_table_min]; + } + + if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) { + s = w; + } else { + s -= 0x8080; + } + + if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022kr); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s < 0x80) { + if ((buf->state & 1) != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add(out, 0xF); + buf->state &= ~KSC5601; + } + out = mb_convert_buf_add(out, s); + } else { + if ((buf->state & 1) != KSC5601) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); + out = mb_convert_buf_add(out, 0xE); + buf->state |= KSC5601; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + } + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); + } + } + + if (end && 
(buf->state & 1) != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 1); + out = mb_convert_buf_add(out, 0xF); + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/iso2022kr_encoding.phpt b/ext/mbstring/tests/iso2022kr_encoding.phpt index 05f00493bcf10..62bcc9de55f56 100644 --- a/ext/mbstring/tests/iso2022kr_encoding.phpt +++ b/ext/mbstring/tests/iso2022kr_encoding.phpt @@ -24,8 +24,10 @@ function testValid($from, $to, $bothWays = true) { $from = substr($from, 1, strlen($from) - 1); /* If the string switches to a different charset, it should switch back to * ASCII at the end */ - if (strpos($from, "\x1B\$C") !== false) + if (strpos($from, "\x0E") !== false && $from[-1] !== "\x0F") $from .= "\x0F"; + if (strpos($from, "\x1B\$)C") === false && $from !== '') + $from = "\x1B\$)C" . $from; convertValidString($to, $from, 'UTF-16BE', 'ISO-2022-KR', false); } @@ -96,6 +98,14 @@ testValid("\x0E\x0E\x0F\x0E\x0Fabc", "\x00a\x00b\x00c", false); echo "Escapes behave as expected\n"; +// Test switching between KS X 1001 and ASCII when converting Unicode -> ISO-2022-KR +convertValidString("\x76\x20\x00a\x00b", "\x1B$)C\x0E\x74\x30\x0Fab", "UTF-16BE", "ISO-2022-KR", false); + +// Regression test: Our conversion table for KS X 1001 only goes up to 0x7D7E, but +// we previously accepted and tried to convert two-byte sequences starting with +// 0x7E, resulting in a failed assertion +convertInvalidString("\x0E~/", "%", "ISO-2022-KR", "UTF-8"); + // Test "long" illegal character markers mb_substitute_character("long"); convertInvalidString("\x1B", "%", "ISO-2022-KR", "UTF-8"); From 85e2be8855412f670cf7f0007e1feffcd9bc0441 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 9 Jan 2022 19:53:24 +0200 Subject: [PATCH 03/25] Implement fast text conversion interface for SJIS-mac --- .../libmbfl/filters/mbfilter_sjis_mac.c | 405 +++++++++++++++++- ext/mbstring/tests/sjismac_encoding.phpt | 3 + 2 files changed, 406 insertions(+), 2 deletions(-) diff --git 
a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c index 97fbb59e50583..5163db67b3c9f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c @@ -39,6 +39,8 @@ extern const unsigned char mblen_table_sjis[]; static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_sjis_mac_wchar_flush(mbfl_convert_filter *filter); +static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL}; @@ -51,8 +53,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_mac_wchar, &vtbl_wchar_sjis_mac, - NULL, - NULL + mb_sjismac_to_wchar, + mb_wchar_to_sjismac }; const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = { @@ -662,3 +664,402 @@ mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter) return 0; } + +static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c < 0x80 && c != 0x5C) { + *out++ = c; + } else if (c >= 0xA1 && c <= 0xDF) { + *out++ = 0xFEC0 + c; + } else if (c > 0x80 && c <= 0xED && c != 0xA0) { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + + if (c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F) { + unsigned int w = 0, s1 = 0, s2 = 0; + SJIS_DECODE(c, c2, s1, s2); + unsigned int s = (s1 - 0x21)*94 + s2 - 0x21; + + if (s <= 0x89) { + if (s == 0x1C) { + w = 0x2014; /* EM DASH */ + } else if (s == 0x1F) { + w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ + } else if (s == 0x20) { + w = 
0x301C; /* WAVE DASH */ + } else if (s == 0x21) { + w = 0x2016; /* DOUBLE VERTICAL LINE */ + } else if (s == 0x3C) { + w = 0x2212; /* MINUS SIGN */ + } else if (s == 0x50) { + w = 0xA2; /* CENT SIGN */ + } else if (s == 0x51) { + w = 0xA3; /* POUND SIGN */ + } else if (s == 0x89) { + w = 0xAC; /* NOT SIGN */ + } + if (w) { + *out++ = w; + continue; + } + } + + for (int i = 0; i < 7; i++) { + if (s >= code_tbl[i][0] && s <= code_tbl[i][1]) { + *out++ = s - code_tbl[i][0] + code_tbl[i][2]; + goto next_iteration; + } + } + + for (int i = 0; i < code_tbl_m_len; i++) { + if (s == code_tbl_m[i][0]) { + int n = 5; + if (code_tbl_m[i][1] == 0xF860) { + n = 3; + } else if (code_tbl_m[i][1] == 0xF861) { + n = 4; + } + if ((limit - out) < n) { + p -= 2; + goto finished; + } + for (int j = 1; j <= n; j++) { + *out++ = code_tbl_m[i][j]; + } + goto next_iteration; + } + } + + for (int i = 0; i < 8; i++) { + if (s >= code_ofst_tbl[i][0] && s <= code_ofst_tbl[i][1]) { + w = code_map[i][s - code_ofst_tbl[i][0]]; + if (!w) { + *out++ = MBFL_BAD_INPUT; + goto next_iteration; + } + if ((limit - out) < 2) { + p -= 2; + goto finished; + } + *out++ = w; + if (s >= 0x43E && s <= 0x441) { + *out++ = 0xF87A; + } else if (s == 0x3B1 || s == 0x3B7) { + *out++ = 0xF87F; + } else if (s == 0x4B8 || s == 0x4B9 || s == 0x4C4) { + *out++ = 0x20DD; + } else if (s == 0x1ED9 || s == 0x1EDA || s == 0x1EE8 || s == 0x1EF3 || (s >= 0x1EF5 && s <= 0x1EFB) || s == 0x1F05 || s == 0x1F06 || s == 0x1F18 || (s >= 0x1FF2 && s <= 0x20A5)) { + *out++ = 0xF87E; + } + goto next_iteration; + } + } + + if (s < jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[s]; + } + + if (!w) + w = MBFL_BAD_INPUT; + *out++ = w; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else if (c == 0x5C) { + *out++ = 0xA5; + } else if (c == 0x80) { + *out++ = 0x5C; + } else if (c == 0xA0) { + *out++ = 0xA0; + } else if (c == 0xFD) { + *out++ = 0xA9; + } else if (c == 0xFE) { + *out++ = 0x2122; + } else 
if (c == 0xFF) { + if ((limit - out) < 2) { + p--; + break; + } + *out++ = 0x2026; + *out++ = 0xF87F; + } else { + *out++ = MBFL_BAD_INPUT; + } +next_iteration: ; + } + +finished: + *in_len = e - p; + *in = p; + return out - buf; +} + +static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s) +{ + if (w2 == 0xF87A) { + for (int i = 0; i < 4; i++) { + if (w == s_form_tbl[i+34+3+3]) { + *s = s_form_sjis_tbl[i+34+3+3]; + return true; + } + } + } else if (w2 == 0x20DD) { + for (int i = 0; i < 3; i++) { + if (w == s_form_tbl[i+34+3]) { + *s = s_form_sjis_tbl[i+34+3]; + return true; + } + } + } else if (w2 == 0xF87F) { + for (int i = 0; i < 3; i++) { + if (w == s_form_tbl[i+34]) { + *s = s_form_sjis_tbl[i+34]; + return true; + } + } + } else if (w2 == 0xF87E) { + for (int i = 0; i < 34; i++) { + if (w == s_form_tbl[i]) { + *s = s_form_sjis_tbl[i]; + return true; + } + } + } + + return false; +} + +/* For codepoints F860-F862, which are treated specially in MacJapanese */ +static const int transcoding_hint_cp_width[3] = { 3, 4, 5 }; + +static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + uint32_t w; + + if (buf->state) { /* a codepoint (and possibly a partly matched transcoding hint) was cached by the previous call */ + w = buf->state & 0xFFFF; + if (buf->state & 0xFFFF0000L) { /* bits 16-23 hold the match position (always >= 3) and bits 24-31 the table index, which may be 0; test both so a hint matched at index 0 is still resumed */ + goto resume_transcoding_hint; + } else { + buf->state = 0; + goto process_codepoint; + } + } + + while (len--) { + w = *in++; +process_codepoint: ; + unsigned int s = 0; + + if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { + if (w == 0x5C) { + s = 0x80; + } else if (w == 0xA9) { + s = 0xFD; + } else { + s = ucs_a1_jis_table[w - ucs_a1_jis_table_min]; + } + } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { + if (w == 0x2122) { + s = 0xFE; + } else if (w == 0x2014) { + s = 0x213D; + } else if (w == 0x2116) { + s = 0x2C1D; + } else { + s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; + } + } else if (w >= 
ucs_i_jis_table_min && w < ucs_i_jis_table_max) { + s = ucs_i_jis_table[w - ucs_i_jis_table_min]; + } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { + s = ucs_r_jis_table[w - ucs_r_jis_table_min]; + } + + if (w >= 0x2000) { + for (int i = 0; i < s_form_tbl_len; i++) { + if (w == s_form_tbl[i]) { + if (!len) { + if (end) { + s = s_form_sjis_fallback_tbl[i]; + if (s) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 2); + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + } + } else { + buf->state = w; + } + MB_CONVERT_BUF_STORE(buf, out, limit); + return; + } + uint32_t w2 = *in++; + len--; + + if (!process_s_form(w, w2, &s)) { + in--; len++; + + for (int i = 0; i < s_form_tbl_len; i++) { + if (w == s_form_tbl[i]) { + s = s_form_sjis_fallback_tbl[i]; + break; + } + } + } + + if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); + } + + goto next_iteration; + } + } + + if (w == 0xF860 || w == 0xF861 || w == 0xF862) { + /* Apple 'transcoding hint' codepoints (from private use area) */ + if (!len) { + if (end) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + } else { + buf->state = w; + } + MB_CONVERT_BUF_STORE(buf, out, limit); + return; + } + + uint32_t w2 = *in++; + len--; + + for (int i = 0; i < code_tbl_m_len; i++) { + if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) { + /* This might be a valid transcoding hint sequence */ + int index = 3; + +resume_transcoding_hint: + if (buf->state) { + i = buf->state >> 24; + index = (buf->state >> 16) & 0xFF; + buf->state = 0; + } + + int expected = transcoding_hint_cp_width[w - 0xF860]; + + while (index <= expected) { + if (!len) { + if (end) { + for (int j = 1; j < index; j++) { + MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac); + } + } else { + buf->state = (i << 24) | 
(index << 16) | (w & 0xFFFF); + } + MB_CONVERT_BUF_STORE(buf, out, limit); + return; + } + + w2 = *in++; + len--; + + if (w2 != code_tbl_m[i][index]) { + /* Didn't match */ + for (int j = 1; j < index; j++) { + MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac); + } + MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + goto next_iteration; + } + + index++; + } + + /* Successful match, emit SJIS-mac bytes */ + s = code_tbl_m[i][0]; + unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); + goto next_iteration; + } + } + + /* No valid transcoding hint sequence found */ + in--; len++; + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + continue; + } + } + + if (!s) { + if (w == 0xA0) { + s = 0xA0; + } else if (w == 0xA5) { /* YEN SIGN */ + /* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign; + * convert codepoint 0xA5 to halfwidth Yen sign */ + s = 0x5C; /* HALFWIDTH YEN SIGN */ + } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ + s = 0x2140; + } else { + for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) { + if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) { + s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + goto found_kuten_code; + } + } + + for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) { + if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) { + s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]]; + if (s) { + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + goto found_kuten_code; + } + } + } + + for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) { + if (w == wchar2sjis_mac_wchar_tbl[i][0]) { + s = wchar2sjis_mac_wchar_tbl[i][1]; + s = (((s / 94) + 
0x21) << 8) | ((s % 94) + 0x21); + goto found_kuten_code; + } + } + } + } + +found_kuten_code: + if ((!s && w) || s >= 0x8080) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); + } + +next_iteration: ; + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/sjismac_encoding.phpt b/ext/mbstring/tests/sjismac_encoding.phpt index 5d1c2de869e32..8013d82e51b74 100644 --- a/ext/mbstring/tests/sjismac_encoding.phpt +++ b/ext/mbstring/tests/sjismac_encoding.phpt @@ -87,6 +87,9 @@ findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-mac', '%'); echo "Unicode -> SJIS-mac conversion works on all invalid characters\n"; +// Regression test +convertValidString("\x20\x26\x6B\xAA", "\x81\x63\x9F\x6F", "UTF-16BE", "SJIS-mac"); + // Test special combining characters for MacJapanese when *not* appearing in // an expected combination convertInvalidString("\x20\x10\xF8\x7A", "\x81\x5D%", "UTF-16BE", "SJIS-mac"); From 41a7be2261bb26843823b774deee32cb23ac8565 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Fri, 21 Jan 2022 22:19:08 +0200 Subject: [PATCH 04/25] Implement fast text conversion interface for UTF7-IMAP The old code would convert a 0x00 byte in the input to 0x00 in the output, but this clearly violates the RFC which defines UTF7-IMAP. 
--- .../libmbfl/filters/mbfilter_utf7imap.c | 285 +++++++++++++++++- ext/mbstring/tests/utf7imap_encoding.phpt | 5 + 2 files changed, 288 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c index 7de68dea3cc22..063232e91f42a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c @@ -80,6 +80,8 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter); +static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL}; @@ -92,8 +94,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = { 0, &vtbl_utf7imap_wchar, &vtbl_wchar_utf7imap, - NULL, - NULL + mb_utf7imap_to_wchar, + mb_wchar_to_utf7imap }; const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = { @@ -435,3 +437,282 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter) return 0; } + +/* Ways which a Base64-encoded section can end: */ +#define DASH 0xFE +#define ILLEGAL 0xFF + +static inline bool is_base64_end(unsigned char c) +{ + return c >= DASH; +} + +static unsigned char decode_base64(unsigned char c) +{ + if (c >= 'A' && c <= 'Z') { + return c - 65; + } else if (c >= 'a' && c <= 'z') { + return c - 71; + } else if (c >= '0' && c <= '9') { + return c + 4; + } else if (c == '+') { + return 62; + } else if (c == ',') { + return 63; + } else if (c == '-') { + return DASH; + } + return ILLEGAL; +} + +static uint32_t* handle_utf16_cp(uint16_t cp, uint32_t *out, uint16_t *surrogate1) +{ +retry: + if (*surrogate1) { + if (cp >= 0xDC00 && cp <= 0xDFFF) { + *out++ = ((*surrogate1 & 0x3FF) << 10) + (cp & 0x3FF) + 0x10000; + *surrogate1 
= 0; + } else { + *out++ = MBFL_BAD_INPUT; + *surrogate1 = 0; + goto retry; + } + } else if (cp >= 0xD800 && cp <= 0xDBFF) { + *surrogate1 = cp; + } else if (cp >= 0xDC00 && cp <= 0xDFFF) { + /* 2nd part of surrogate pair came unexpectedly */ + *out++ = MBFL_BAD_INPUT; + } else if (cp >= 0x20 && cp <= 0x7E && cp != '&') { + *out++ = MBFL_BAD_INPUT; + } else { + *out++ = cp; + } + return out; +} + +static uint32_t* handle_base64_end(unsigned char n, uint32_t *out, bool *base64, bool abrupt, uint16_t *surrogate1) +{ + if (abrupt || n == ILLEGAL || *surrogate1) { + *out++ = MBFL_BAD_INPUT; + *surrogate1 = 0; + } + + *base64 = false; + return out; +} + +static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */ + + unsigned char *p = *in, *e = p + *in_len; + /* Always leave one empty space in output buffer in case the string ends while + * in Base64 mode and we need to emit an error marker */ + uint32_t *out = buf, *limit = buf + bufsize - 1; + + bool base64 = *state & 1; + uint16_t surrogate1 = (*state >> 1); /* First half of a surrogate pair */ + + while (p < e && out < limit) { + if (base64) { + /* Base64 section */ + if ((limit - out) < 4) { + break; + } + + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + out = handle_base64_end(n1, out, &base64, false, &surrogate1); + continue; + } else if (p == e) { + out = handle_base64_end(n1, out, &base64, true, &surrogate1); + continue; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + out = handle_base64_end(n2, out, &base64, true, &surrogate1); + continue; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + out = handle_base64_end(n3, out, &base64, true, &surrogate1); + continue; + } + out = handle_utf16_cp((n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2), out, &surrogate1); + if (p 
== e) { + /* It is an error if trailing padding bits are not zeroes or if we were + * expecting the 2nd part of a surrogate pair when Base64 section ends */ + if ((n3 & 0x3) || surrogate1) + *out++ = MBFL_BAD_INPUT; + break; + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4) || p == e) { + out = handle_base64_end(n4, out, &base64, n3 & 0x3, &surrogate1); + continue; + } + unsigned char n5 = decode_base64(*p++); + if (is_base64_end(n5) || p == e) { + out = handle_base64_end(n5, out, &base64, true, &surrogate1); + continue; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + out = handle_base64_end(n6, out, &base64, true, &surrogate1); + continue; + } + out = handle_utf16_cp((n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4), out, &surrogate1); + if (p == e) { + if ((n6 & 0xF) || surrogate1) + *out++ = MBFL_BAD_INPUT; + break; + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7) || p == e) { + out = handle_base64_end(n7, out, &base64, n6 & 0xF, &surrogate1); + continue; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + out = handle_base64_end(n8, out, &base64, true, &surrogate1); + continue; + } + out = handle_utf16_cp((n6 << 12) | (n7 << 6) | n8, out, &surrogate1); + } else { + unsigned char c = *p++; + + if (c == '&') { + if (p < e && *p == '-') { + *out++ = '&'; + p++; + } else { + base64 = true; + } + } else if (c >= 0x20 && c <= 0x7E) { + *out++ = c; + } else { + *out++ = MBFL_BAD_INPUT; + } + } + } + + if (p == e && base64) { + /* UTF7-IMAP doesn't allow strings to end in Base64 mode + * One space in output buffer was reserved just for this */ + *out++ = MBFL_BAD_INPUT; + } + + *state = (surrogate1 << 1) | base64; + *in_len = e - p; + *in = p; + return out - buf; +} + +#define SAVE_CONVERSION_STATE() buf->state = (cache << 4) | (nbits << 1) | base64 +#define RESTORE_CONVERSION_STATE() base64 = (buf->state & 1); nbits = (buf->state >> 1) & 0x7; cache = (buf->state >> 4) 
+ +static const unsigned char mbfl_base64_table[] = { + /* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */ + 0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d, + /* 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', */ + 0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a, + /* 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', */ + 0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d, + /* 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', */ + 0x6e,0x6f,0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a, + /* '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', ',', '\0' */ + 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x2b,0x2c,0x00 +}; + +static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + bool base64; + unsigned char nbits, cache; /* `nbits` is the number of cached bits; either 0, 2, or 4 */ + RESTORE_CONVERSION_STATE(); + + while (len--) { + uint32_t w = *in++; + if (base64) { + if (w >= 0x20 && w <= 0x7E) { + /* End of Base64 section. 
Drain buffered bits (if any), close Base64 section + * Leave enough space in the output buffer such that even if the remainder of + * the input string is ASCII, we can output the whole thing without having to + * check for output buffer space again */ + base64 = false; + in--; len++; /* Unconsume codepoint; it will be handled by 'ASCII section' code below */ + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + if (nbits) { + out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]); + } + nbits = cache = 0; + out = mb_convert_buf_add(out, '-'); + } else if (w >= MBFL_WCSPLANE_UTF32MAX) { + /* Make recursive call to add an error marker character */ + SAVE_CONVERSION_STATE(); + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7imap); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + RESTORE_CONVERSION_STATE(); + } else { + /* Encode codepoint, preceded by any cached bits, as Base64 + * Make enough space in the output buffer to hold both any bytes that + * we emit right here, plus any finishing byte which might need to + * be emitted if the input string ends abruptly */ + uint64_t bits; + if (w >= MBFL_WCSPLANE_SUPMIN) { + /* Must use surrogate pair */ + MB_CONVERT_BUF_ENSURE(buf, out, limit, 7); + w -= 0x10000; + bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((w & 0xFFC00) << 6) | (w & 0x3FF); + nbits += 32; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 4); + bits = (cache << 16) | w; + nbits += 16; + } + + while (nbits >= 6) { + out = mb_convert_buf_add(out, mbfl_base64_table[(bits >> (nbits - 6)) & 0x3F]); + nbits -= 6; + } + cache = bits; + } + } else { + /* ASCII section */ + if (w == '&') { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, '&', '-'); + } else if (w >= 0x20 && w <= 0x7E) { + out = mb_convert_buf_add(out, w); + } else if (w >= MBFL_WCSPLANE_UTF32MAX) { + buf->state = 0; + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf7imap); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + 
RESTORE_CONVERSION_STATE(); + } else { + out = mb_convert_buf_add(out, '&'); + base64 = true; + in--; len++; /* Unconsume codepoint; it will be handled by Base64 code above */ + } + } + } + + if (end) { + if (nbits) { + out = mb_convert_buf_add(out, mbfl_base64_table[(cache << (6 - nbits)) & 0x3F]); + } + if (base64) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 1); + out = mb_convert_buf_add(out, '-'); + } + } else { + SAVE_CONVERSION_STATE(); + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/utf7imap_encoding.phpt b/ext/mbstring/tests/utf7imap_encoding.phpt index e48ccc167c3bb..06852c9190e70 100644 --- a/ext/mbstring/tests/utf7imap_encoding.phpt +++ b/ext/mbstring/tests/utf7imap_encoding.phpt @@ -26,6 +26,10 @@ function testInvalid($from, $to) { testValid("", ""); echo "Identification passes on empty string... good start!\n"; +/* RFC says that 0x00 should be Base64-encoded */ +testValidString("\x00", "&AAA-", 'UTF-8', 'UTF7-IMAP'); +echo "Null byte converted correctly\n"; + /* Identification and conversion of ASCII characters (minus &) */ for ($i = 0x20; $i <= 0x7E; $i++) { if ($i == 0x26) // '&' @@ -221,6 +225,7 @@ echo "Done!\n"; ?> --EXPECT-- Identification passes on empty string... good start! +Null byte converted correctly Testing all valid single-character ASCII strings... check! Non-ASCII characters convert to illegal char marker... yes! & can be Base64-encoded... yes! 
From 41d87c2f1af0c99135dda68b8d6984a50dd7ae9c Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 23 Jan 2022 21:22:32 +0200 Subject: [PATCH 05/25] Implement fast text conversion interface for mobile SJIS variants --- .../libmbfl/filters/mbfilter_sjis_mobile.c | 826 +++++++++++++++++- ext/mbstring/tests/sjis_mobile_encodings.phpt | 24 + 2 files changed, 836 insertions(+), 14 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c index 38ec5fe11d89f..efb456f0e938c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c @@ -39,6 +39,12 @@ extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n); extern const unsigned char mblen_table_sjis[]; static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter); +static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_sjis_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_sjis_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL}; static const char *mbfl_encoding_sjis_kddi_aliases[] = {"SJIS-KDDI", "shift_jis-kddi", "x-sjis-emoji-kddi", NULL}; @@ -53,8 +59,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_docomo_wchar, &vtbl_wchar_sjis_docomo, - NULL, - NULL + mb_sjis_docomo_to_wchar, + mb_wchar_to_sjis_docomo }; const 
mbfl_encoding mbfl_encoding_sjis_kddi = { @@ -66,8 +72,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_kddi_wchar, &vtbl_wchar_sjis_kddi, - NULL, - NULL + mb_sjis_kddi_to_wchar, + mb_wchar_to_sjis_kddi }; const mbfl_encoding mbfl_encoding_sjis_sb = { @@ -79,8 +85,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_sb_wchar, &vtbl_wchar_sjis_sb, - NULL, - NULL + mb_sjis_sb_to_wchar, + mb_wchar_to_sjis_sb }; const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = { @@ -226,13 +232,6 @@ const unsigned short mbfl_kddi2uni_pua_b[8][3] = { } \ } while (0) -/* (ku*94)+ten value -> Shift-JIS byte sequence */ -#define CODE2JIS(c1,c2,s1,s2) \ - c1 = (s1)/94+0x21; \ - c2 = (s1)-94*((c1)-0x21)+0x21; \ - s1 = ((c1) << 8) | (c2); \ - s2 = 1 - int mbfilter_conv_map_tbl(int c, int *w, const unsigned short map[][3], int n) { for (int i = 0; i < n; i++) { @@ -799,7 +798,7 @@ int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter) if ((filter->to == &mbfl_encoding_sjis_docomo && mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter)) || (filter->to == &mbfl_encoding_sjis_kddi && mbfilter_unicode2sjis_emoji_kddi(c, &s1, filter)) || (filter->to == &mbfl_encoding_sjis_sb && mbfilter_unicode2sjis_emoji_sb(c, &s1, filter))) { - CODE2JIS(c1,c2,s1,s2); + s1 = (((s1 / 94) + 0x21) << 8) | ((s1 % 94) + 0x21); } if (filter->status) { @@ -840,3 +839,802 @@ int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter) return 0; } + +static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + /* Leave one extra space available in output buffer, since some iterations of + * main loop (below) may emit two wchars */ + uint32_t *out = buf, *limit = buf + bufsize - 1; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c <= 0x7F) { + *out++ = c; + } else if (c >= 0xA1 && c <= 0xDF) { 
+ /* Kana */ + *out++ = 0xFEC0 + c; + } else if (c > 0x80 && c < 0xFD && c != 0xA0) { + /* Kanji */ + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + + if (c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F) { + uint32_t w = 0; + unsigned int s1, s2; + SJIS_DECODE(c, c2, s1, s2); + unsigned int s = ((s1 - 0x21) * 94) + s2 - 0x21; + + if (s <= 137) { + if (s == 31) { + w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ + } else if (s == 32) { + w = 0xFF5E; /* FULLWIDTH TILDE */ + } else if (s == 33) { + w = 0x2225; /* PARALLEL TO */ + } else if (s == 60) { + w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */ + } else if (s == 80) { + w = 0xFFE0; /* FULLWIDTH CENT SIGN */ + } else if (s == 81) { + w = 0xFFE1; /* FULLWIDTH POUND SIGN */ + } else if (s == 137) { + w = 0xFFE2; /* FULLWIDTH NOT SIGN */ + } + } + + if (!w) { + if (s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) { + int snd = 0; + w = mbfilter_sjis_emoji_docomo2unicode(s, &snd); + if (snd) { + *out++ = snd; + } + } else if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { + w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; + } else if (s < jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[s]; + } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { + w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min]; + } + + if (!w) { + if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { + w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min]; + } else if (s >= (94*94) && s < (114*94)) { + w = s - (94*94) + 0xE000; + } + } + } + + *out++ = w ? 
w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_sjis_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + uint32_t w; + unsigned int s = 0; + + if (buf->state) { + /* Continue what we were doing on the previous call */ + w = buf->state; + buf->state = 0; + if (len) { + goto process_possible_keypad; + } else { + goto emit_output; + } + } + + while (len--) { + w = *in++; + s = 0; + + if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { + s = ucs_a1_jis_table[w - ucs_a1_jis_table_min]; + } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { + s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; + } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) { + s = ucs_i_jis_table[w - ucs_i_jis_table_min]; + } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { + s = ucs_r_jis_table[w - ucs_r_jis_table_min]; + } else if (w >= 0xE000 && w < (0xE000 + 20*94)) { + /* Private User Area (95ku - 114ku) */ + s = w - 0xE000; + s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21); + goto process_emoji; + } + + if (!s) { + if (w == 0xA5) { /* YEN SIGN */ + s = 0x216F; /* FULLWIDTH YEN SIGN */ + } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ + s = 0x2140; + } else if (w == 0x2225) { /* PARALLEL TO */ + s = 0x2142; + } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */ + s = 0x215D; + } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */ + s = 0x2171; + } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */ + s = 0x2172; + } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */ + s = 0x224C; + } + } + + if (w && (!s || s >= 0x8080)) { + s = 0; + + for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) { + if (w == cp932ext1_ucs_table[i]) { + s = (((i / 94) 
+ 0x2D) << 8) + (i % 94) + 0x21; + goto process_emoji; + } + } + + for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) { + if (w == cp932ext2_ucs_table[i]) { + s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21; + goto process_emoji; + } + } + } + +process_emoji: + /* When converting SJIS-Mobile to Unicode, we convert keypad symbol emoji + * to a sequence of 2 codepoints, one of which is a combining character which + * adds the 'key' image around the other + * + * In the other direction, look for such sequences and convert them to a + * single emoji */ + if (w == '#' || (w >= '0' && w <= '9')) { + if (!len) { + if (end) { + goto emit_output; + } else { + /* If we are at the end of the current buffer of codepoints, but another + * buffer is coming, then remember that we have to reprocess `w` */ + buf->state = w; + break; + } + } +process_possible_keypad: ; + uint32_t w2 = *in++; len--; + if (w2 == 0x20E3) { + if (w == '#') { + s = 0x2964; + } else if (w == '0') { + s = 0x296F; + } else { /* Previous character was '1'-'9' */ + s = 0x2966 + (w - '1'); + } + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } else { + in--; len++; + } + } else if (w == 0xA9) { /* Copyright sign */ + s = (((0x29B5 / 94) + 0x21) << 8) | ((0x29B5 % 94) + 0x21); + } else if (w == 0xAE) { /* Registered sign */ + s = (((0x29BA / 94) + 0x21) << 8) | ((0x29BA % 94) + 0x21); + } else if (w >= mb_tbl_uni_docomo2code2_min && w <= mb_tbl_uni_docomo2code2_max) { + int i = mbfl_bisec_srch2(w, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len); + if (i >= 0) { + s = mb_tbl_uni_docomo2code2_value[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } else if (w >= mb_tbl_uni_docomo2code3_min && w <= mb_tbl_uni_docomo2code3_max) { + int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len); + if (i >= 0) { + s = mb_tbl_uni_docomo2code3_value[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } else if (w >= 
mb_tbl_uni_docomo2code5_min && w <= mb_tbl_uni_docomo2code5_max) { + int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len); + if (i >= 0) { + s = mb_tbl_uni_docomo2code5_val[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } + +emit_output: + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_docomo); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} + +static size_t mb_sjis_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 1; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c <= 0x7F) { + *out++ = c; + } else if (c >= 0xA1 && c <= 0xDF) { + /* Kana */ + *out++ = 0xFEC0 + c; + } else if (c > 0x80 && c < 0xFD && c != 0xA0) { + /* Kanji */ + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + + if (c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F) { + uint32_t w = 0; + unsigned int s1, s2; + SJIS_DECODE(c, c2, s1, s2); + unsigned int s = ((s1 - 0x21) * 94) + s2 - 0x21; + + if (s <= 137) { + if (s == 31) { + w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ + } else if (s == 32) { + w = 0xFF5E; /* FULLWIDTH TILDE */ + } else if (s == 33) { + w = 0x2225; /* PARALLEL TO */ + } else if (s == 60) { + w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */ + } else if (s == 80) { + w = 0xFFE0; /* FULLWIDTH CENT SIGN */ + } else if (s == 81) { + w = 0xFFE1; /* FULLWIDTH POUND SIGN */ + } else if (s == 137) { + w = 0xFFE2; /* FULLWIDTH NOT SIGN */ + } + } + + if (!w) { + if (s >= mb_tbl_code2uni_kddi1_min && s <= 
mb_tbl_code2uni_kddi2_max) { + int snd = 0; + w = mbfilter_sjis_emoji_kddi2unicode(s, &snd); + if (snd) { + *out++ = snd; + } + } else if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { + w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; + } else if (s < jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[s]; + } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { + w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min]; + } + + if (!w) { + if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { + w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min]; + } else if (s >= (94*94) && s < (114*94)) { + w = s - (94*94) + 0xE000; + } + } + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + uint32_t w; + unsigned int s = 0; + + if (buf->state) { + w = buf->state; + buf->state = 0; + if (len) { + if (w >= NFLAGS('A')) { + goto process_possible_flag; + } else { + goto process_possible_keypad; + } + } else { + goto emit_output; + } + } + + while (len--) { + w = *in++; + s = 0; + + if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { + s = ucs_a1_jis_table[w - ucs_a1_jis_table_min]; + } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { + s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; + } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) { + s = ucs_i_jis_table[w - ucs_i_jis_table_min]; + } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { + s = ucs_r_jis_table[w - ucs_r_jis_table_min]; + } else if (w >= 0xE000 && w < (0xE000 + 20*94)) { + /* Private User Area (95ku - 114ku) */ + s = w - 0xE000; + s = (((s / 94) + 0x7F) << 8) | ((s % 94) 
+ 0x21); + goto process_emoji; + } + + if (!s) { + if (w == 0xA5) { /* YEN SIGN */ + s = 0x216F; /* FULLWIDTH YEN SIGN */ + } else if (w == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */ + s = 0x2140; + } else if (w == 0x2225) { /* PARALLEL TO */ + s = 0x2142; + } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */ + s = 0x215D; + } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */ + s = 0x2171; + } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */ + s = 0x2172; + } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */ + s = 0x224C; + } + } + + if (w && (!s || s >= 0x8080)) { + s = 0; + + for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) { + if (w == cp932ext1_ucs_table[i]) { + s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21; + goto process_emoji; + } + } + + for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) { + if (w == cp932ext2_ucs_table[i]) { + s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21; + goto process_emoji; + } + } + } + +process_emoji: + if (w == '#' || (w >= '0' && w <= '9')) { + if (!len) { + if (end) { + goto emit_output; + } else { + /* If we are at the end of the current buffer of codepoints, but another + * buffer is coming, then remember that we have to reprocess `w` */ + buf->state = w; + break; + } + } +process_possible_keypad: ; + uint32_t w2 = *in++; len--; + if (w2 == 0x20E3) { + if (w == '#') { + s = 0x25BC; + } else if (w == '0') { + s = 0x2830; + } else { /* Previous character was '1'-'9' */ + s = 0x27A6 + (w - '1'); + } + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } else { + in--; len++; + } + } else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */ + if (!len) { + if (end) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi); + } else { + /* Reprocess `w` when this function is called again with another buffer + * of wchars */ + buf->state = w; + } + break; + } +process_possible_flag: ; + uint32_t w2 = *in++; len--; + if (w2 >= NFLAGS('B') && w2 <= 
NFLAGS('U')) { /* B for GB, U for RU */ + for (int i = 0; i < 10; i++) { + if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) { + s = nflags_code_kddi[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + goto emit_output; + } + } + } + in--; len++; + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + continue; + } else if (w == 0xA9) { /* Copyright sign */ + s = (((0x27DC / 94) + 0x21) << 8) | ((0x27DC % 94) + 0x21); + } else if (w == 0xAE) { /* Registered sign */ + s = (((0x27DD / 94) + 0x21) << 8) | ((0x27DD % 94) + 0x21); + } else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) { + int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len); + if (i >= 0) { + s = mb_tbl_uni_kddi2code2_value[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) { + int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len); + if (i >= 0) { + s = mb_tbl_uni_kddi2code3_value[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) { + int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len); + if (i >= 0) { + s = mb_tbl_uni_kddi2code5_val[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } + +emit_output: + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_kddi); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} + +static size_t mb_sjis_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t 
*buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 1; + + if (*state) { + goto softbank_emoji_escapes; + } + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c == 0x1B) { + /* Escape sequence */ + if (p == e || *p++ != '$' || p == e) { + *out++ = MBFL_BAD_INPUT; + continue; + } + unsigned char c2 = *p++; + if (((c2 < 'E' || c2 > 'G') && (c2 < 'O' || c2 > 'Q')) || p == e) { + *out++ = MBFL_BAD_INPUT; + continue; + } + /* Escape sequence was valid, next should be a series of specially + * encoded Softbank emoji */ + *state = c2; + +softbank_emoji_escapes: + while (p < e && out < limit) { + c = *p++; + if (c == 0xF) { + *state = 0; + break; + } + unsigned int s = 0; + if (*state == 'G' && c >= 0x21 && c <= 0x7A) { + s = (0x91 - 0x21) * 94; + } else if (*state == 'E' && c >= 0x21 && c <= 0x7A) { + s = (0x8D - 0x21) * 94; + } else if (*state == 'F' && c >= 0x21 && c <= 0x7A) { + s = (0x8E - 0x21) * 94; + } else if (*state == 'O' && c >= 0x21 && c <= 0x6D) { + s = (0x92 - 0x21) * 94; + } else if (*state == 'P' && c >= 0x21 && c <= 0x6C) { + s = (0x95 - 0x21) * 94; + } else if (*state == 'Q' && c >= 0x21 && c <= 0x5E) { + s = (0x96 - 0x21) * 94; + } else { + *out++ = MBFL_BAD_INPUT; + *state = 0; + break; + } + + int snd = 0; + uint32_t w = mbfilter_sjis_emoji_sb2unicode(s + c - 0x21, &snd); + if (w) { + if (snd) { + *out++ = snd; + } + *out++ = w; + } else { + *out++ = MBFL_BAD_INPUT; + *state = 0; + break; + } + } + } else if (c <= 0x7F) { + *out++ = c; + } else if (c >= 0xA1 && c <= 0xDF) { + /* Kana */ + *out++ = 0xFEC0 + c; + } else if (c > 0x80 && c < 0xFD && c != 0xA0) { + /* Kanji */ + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + + if (c2 >= 0x40 && c2 <= 0xFC && c2 != 0x7F) { + uint32_t w = 0; + unsigned int s1, s2; + SJIS_DECODE(c, c2, s1, s2); + unsigned int s = ((s1 - 0x21) * 94) + s2 - 0x21; + + if (s <= 137) { + if (s 
== 31) { + w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ + } else if (s == 32) { + w = 0xFF5E; /* FULLWIDTH TILDE */ + } else if (s == 33) { + w = 0x2225; /* PARALLEL TO */ + } else if (s == 60) { + w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */ + } else if (s == 80) { + w = 0xFFE0; /* FULLWIDTH CENT SIGN */ + } else if (s == 81) { + w = 0xFFE1; /* FULLWIDTH POUND SIGN */ + } else if (s == 137) { + w = 0xFFE2; /* FULLWIDTH NOT SIGN */ + } + } + + if (!w) { + if (s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb3_max) { + int snd = 0; + w = mbfilter_sjis_emoji_sb2unicode(s, &snd); + if (snd) { + *out++ = snd; + } + } else if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { + w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; + } else if (s < jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[s]; + } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { + w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min]; + } + + if (!w) { + if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { + w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min]; + } else if (s >= (94*94) && s < (114*94)) { + w = s - (94*94) + 0xE000; + } + } + } + + *out++ = w ? 
w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + uint32_t w; + unsigned int s = 0; + + if (buf->state) { + w = buf->state; + buf->state = 0; + if (len) { + if (w >= NFLAGS('A')) { + goto process_possible_flag; + } else { + goto process_possible_keypad; + } + } else { + goto emit_output; + } + } + + while (len--) { + w = *in++; + s = 0; + + if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { + s = ucs_a1_jis_table[w - ucs_a1_jis_table_min]; + } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { + s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; + } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) { + s = ucs_i_jis_table[w - ucs_i_jis_table_min]; + } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { + s = ucs_r_jis_table[w - ucs_r_jis_table_min]; + } else if (w >= 0xE000 && w < (0xE000 + 20*94)) { + /* Private User Area (95ku - 114ku) */ + s = w - 0xE000; + s = (((s / 94) + 0x7F) << 8) | ((s % 94) + 0x21); + goto process_emoji; + } + + if (!s) { + if (w == 0xA5) { /* YEN SIGN */ + s = 0x216F; /* FULLWIDTH YEN SIGN */ + } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ + s = 0x2140; + } else if (w == 0x2225) { /* PARALLEL TO */ + s = 0x2142; + } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */ + s = 0x215D; + } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */ + s = 0x2171; + } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */ + s = 0x2172; + } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */ + s = 0x224C; + } + } + + if (w && (!s || s >= 0x8080)) { + s = 0; + + for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) { + if (w == cp932ext1_ucs_table[i]) { + s = 
(((i / 94) + 0x2D) << 8) + (i % 94) + 0x21; + goto process_emoji; + } + } + + for (int i = 0; i < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; i++) { + if (w == cp932ext2_ucs_table[i]) { + s = (((i / 94) + 0x79) << 8) + (i % 94) + 0x21; + goto process_emoji; + } + } + } + +process_emoji: + if (w == '#' || (w >= '0' && w <= '9')) { + if (!len) { + if (end) { + goto emit_output; + } else { + /* If we are at the end of the current buffer of codepoints, but another + * buffer is coming, then remember that we have to reprocess `w` */ + buf->state = w; + break; + } + } +process_possible_keypad: ; + uint32_t w2 = *in++; len--; + if (w2 == 0x20E3) { + if (w == '#') { + s = 0x2817; + } else if (w == '0') { + s = 0x282c; + } else { /* Previous character was '1'-'9' */ + s = 0x2823 + (w - '1'); + } + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } else { + in--; len++; + } + } else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */ + if (!len) { + if (end) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb); + } else { + /* Reprocess `w` when this function is called again with another buffer + * of wchars */ + buf->state = w; + } + break; + } +process_possible_flag: ; + uint32_t w2 = *in++; len--; + if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */ + for (int i = 0; i < 10; i++) { + if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) { + s = nflags_code_sb[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + goto emit_output; + } + } + } + in--; len++; + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + continue; + } else if (w == 0xA9) { /* Copyright sign */ + s = (((0x2855 / 94) + 0x21) << 8) | ((0x2855 % 94) + 0x21); + } else if (w == 0xAE) { /* Registered sign */ + s = (((0x2856 / 94) + 0x21) << 8) | ((0x2856 % 94) + 0x21); + } else if (w >= mb_tbl_uni_sb2code2_min && w <= mb_tbl_uni_sb2code2_max) { + int i = mbfl_bisec_srch2(w, 
mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len); + if (i >= 0) { + s = mb_tbl_uni_sb2code2_value[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } else if (w >= mb_tbl_uni_sb2code3_min && w <= mb_tbl_uni_sb2code3_max) { + int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len); + if (i >= 0) { + s = mb_tbl_uni_sb2code3_value[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } else if (w >= mb_tbl_uni_sb2code5_min && w <= mb_tbl_uni_sb2code5_max) { + int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len); + if (i >= 0) { + s = mb_tbl_uni_sb2code5_val[i]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + } + } + +emit_output: + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis_sb); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/sjis_mobile_encodings.phpt b/ext/mbstring/tests/sjis_mobile_encodings.phpt index e692b9fbfde3b..67738b6b25035 100644 --- a/ext/mbstring/tests/sjis_mobile_encodings.phpt +++ b/ext/mbstring/tests/sjis_mobile_encodings.phpt @@ -300,6 +300,30 @@ testSJISVariant($docomo, $nonInvertibleDocomo, 'SJIS-Mobile#DOCOMO'); testSJISVariant($kddi, $nonInvertible, 'SJIS-Mobile#KDDI'); testSJISVariant($softbank, $nonInvertibleSoftbank, 'SJIS-Mobile#SOFTBANK'); +// Regression test for problem with not allocating enough space in output buffer +// This occurred when the input string was shorter than the output +convertValidString("\xA9\xA9\xA9\xA9", "\xF9\xD6\xF9\xD6\xF9\xD6\xF9\xD6", '8bit', 'SJIS-Mobile#DOCOMO'); +convertValidString("\xA9\xA9\xA9\xA9", "\xF7\x74\xF7\x74\xF7\x74\xF7\x74", '8bit', 
'SJIS-Mobile#KDDI'); +convertValidString("\xA9\xA9\xA9\xA9", "\xF7\xEE\xF7\xEE\xF7\xEE\xF7\xEE", '8bit', 'SJIS-Mobile#SOFTBANK'); + +// Regression test: Old implementation used to drop digits (0-9) and hash (#) if +// they appeared at end of input string +for ($i = ord('0'); $i <= ord('9'); $i++) { + convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#DOCOMO'); + convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#KDDI'); + convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#SOFTBANK'); +} + +// Regression test: Originally, new implementation also did not handle 0-9 and hash +// followed by U+20E3 (keycap modifier) correctly if the 0-9 or hash occurred at +// the very end of one buffer of wchars, and the keycap modifier was at the +// beginning of the following buffer of wchars +for ($i = 0; $i <= 256; $i++) { + convertValidString(str_repeat("\x00a", $i) . "\x00\x30\x20\xE3", str_repeat('a', $i) . "\xF9\x90", 'UTF-16BE', 'SJIS-Mobile#DOCOMO'); + convertValidString(str_repeat("\x00a", $i) . "\x00\x30\x20\xE3", str_repeat('a', $i) . "\xF7\xC9", 'UTF-16BE', 'SJIS-Mobile#KDDI'); + convertValidString(str_repeat("\x00a", $i) . "\x00\x30\x20\xE3", str_repeat('a', $i) . "\xF7\xC5", 'UTF-16BE', 'SJIS-Mobile#SOFTBANK'); +} + ?> --EXPECT-- SJIS-Mobile#DOCOMO verification and conversion works on all valid characters From aff9a0d3b35421e06cee13bdc64c87b91560d9d5 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 24 Jan 2022 20:53:46 +0200 Subject: [PATCH 06/25] Implement fast text conversion interface for EUC-JP-2004 All the legacy implementations of JISX 0213:2004 encodings had a common bug; their 'flush function' did not call the next flush function in the chain of conversion filters. So if any of these encodings were converted to an encoding where the flush function was needed to finish the output string, then the output would be truncated. 
--- ext/mbstring/config.m4 | 1 - ext/mbstring/config.w32 | 2 +- .../libmbfl/filters/mbfilter_euc_jp_2004.c | 69 ----- .../libmbfl/filters/mbfilter_euc_jp_2004.h | 3 - .../libmbfl/filters/mbfilter_sjis_2004.c | 273 +++++++++++++++++- .../libmbfl/filters/unicode_table_jis2004.h | 8 +- 6 files changed, 275 insertions(+), 81 deletions(-) delete mode 100644 ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c diff --git a/ext/mbstring/config.m4 b/ext/mbstring/config.m4 index 30004b1f8d45a..933d2ff21c7d4 100644 --- a/ext/mbstring/config.m4 +++ b/ext/mbstring/config.m4 @@ -103,7 +103,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [ libmbfl/filters/mbfilter_gb18030.c libmbfl/filters/mbfilter_euc_cn.c libmbfl/filters/mbfilter_euc_jp.c - libmbfl/filters/mbfilter_euc_jp_2004.c libmbfl/filters/mbfilter_euc_jp_win.c libmbfl/filters/mbfilter_euc_kr.c libmbfl/filters/mbfilter_euc_tw.c diff --git a/ext/mbstring/config.w32 b/ext/mbstring/config.w32 index 175f59064dce7..d90a0e3d31e84 100644 --- a/ext/mbstring/config.w32 +++ b/ext/mbstring/config.w32 @@ -25,7 +25,7 @@ if (PHP_MBSTRING != "no") { mbfilter_sjis_2004.c mbfilter_qprint.c mbfilter_sjis.c mbfilter_ucs2.c \ mbfilter_ucs4.c mbfilter_uhc.c mbfilter_utf16.c mbfilter_utf32.c \ mbfilter_utf7.c mbfilter_utf7imap.c mbfilter_utf8.c \ - mbfilter_utf8_mobile.c mbfilter_euc_jp_2004.c mbfilter_uuencode.c \ + mbfilter_utf8_mobile.c mbfilter_uuencode.c \ mbfilter_cp5022x.c mbfilter_sjis_mobile.c \ mbfilter_sjis_mac.c mbfilter_iso2022jp_2004.c \ mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c \ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c deleted file mode 100644 index e4f919304e2b8..0000000000000 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. 
- * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The author of this file: - * - */ -/* - * The source code included in this files was separated from mbfilter_ja.c - * by rui hirokawa on 16 aug 2011. - * - */ - -#include "mbfilter.h" -#include "mbfilter_euc_jp_2004.h" -#include "mbfilter_sjis_2004.h" - -extern const unsigned char mblen_table_eucjp[]; - -static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL}; - -const mbfl_encoding mbfl_encoding_eucjp2004 = { - mbfl_no_encoding_eucjp2004, - "EUC-JP-2004", - "EUC-JP", - mbfl_encoding_eucjp2004_aliases, - mblen_table_eucjp, - 0, - &vtbl_eucjp2004_wchar, - &vtbl_wchar_eucjp2004, - NULL, - NULL -}; - -const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = { - mbfl_no_encoding_eucjp2004, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_jis2004_wchar, - mbfl_filt_conv_jis2004_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_eucjp2004, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_jis2004, - mbfl_filt_conv_wchar_jis2004_flush, - NULL, -}; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.h b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.h index 
affdd447f579c..e86fad9564cd2 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.h @@ -36,7 +36,4 @@ extern const mbfl_encoding mbfl_encoding_eucjp2004; extern const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar; extern const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004; -int mbfl_filt_conv_eucjp2004_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_eucjp2004(int c, mbfl_convert_filter *filter); - #endif /* MBFL_MBFILTER_EUC_JP_2004_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 993fc151dec4a..8b39ce7dbbefe 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -29,16 +29,22 @@ #include "mbfilter.h" #include "mbfilter_sjis_2004.h" +#include "mbfilter_euc_jp_2004.h" #include "unicode_table_jis2004.h" #include "unicode_table_jis.h" extern const unsigned char mblen_table_sjis[]; +extern const unsigned char mblen_table_eucjp[]; + +static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); extern int mbfl_bisec_srch(int w, const unsigned short *tbl, int n); extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n); static const char *mbfl_encoding_sjis2004_aliases[] = {"SJIS2004","Shift_JIS-2004", NULL}; +static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL}; const mbfl_encoding mbfl_encoding_sjis2004 = { mbfl_no_encoding_sjis2004, @@ -73,6 +79,39 @@ const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = { NULL, }; +const mbfl_encoding mbfl_encoding_eucjp2004 = { + mbfl_no_encoding_eucjp2004, + "EUC-JP-2004", + "EUC-JP", + mbfl_encoding_eucjp2004_aliases, + mblen_table_eucjp, + 0, + &vtbl_eucjp2004_wchar, + &vtbl_wchar_eucjp2004, + 
mb_eucjp2004_to_wchar, + mb_wchar_to_eucjp2004 +}; + +const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = { + mbfl_no_encoding_eucjp2004, + mbfl_no_encoding_wchar, + mbfl_filt_conv_common_ctor, + NULL, + mbfl_filt_conv_jis2004_wchar, + mbfl_filt_conv_jis2004_wchar_flush, + NULL, +}; + +const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = { + mbfl_no_encoding_wchar, + mbfl_no_encoding_eucjp2004, + mbfl_filt_conv_common_ctor, + NULL, + mbfl_filt_conv_wchar_jis2004, + mbfl_filt_conv_wchar_jis2004_flush, + NULL, +}; + #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) #define SJIS_ENCODE(c1,c2,s1,s2) \ @@ -287,11 +326,11 @@ int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter) (s1 >= 77 && s1 < 94)) && s2 >= 0 && s2 < 94) { /* calc offset from ku */ for (k = 0; k < jisx0213_p2_ofst_len; k++) { - if (s1 == jisx0213_p2_ofst[k]-1) { + if (s1 == jisx0213_p2_ofst[k]) { break; } } - k -= (jisx0213_p2_ofst[k]-1); + k -= jisx0213_p2_ofst[k]; /* check for japanese chars in BMP */ s = (s1 + 94 + k)*94 + s2; @@ -582,7 +621,7 @@ int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter) s2 = s1 & 0xff; k = ((s1 >> 8) & 0xff) - 0x7f; if (k >= 0 && k < jisx0213_p2_ofst_len) { - s1 = jisx0213_p2_ofst[k] - 1 + 0x21; + s1 = jisx0213_p2_ofst[k] + 0x21; } if (filter->to->no_encoding == mbfl_no_encoding_eucjp2004) { s2 |= 0x80; @@ -659,3 +698,231 @@ int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter) return 0; } + +static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 1; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c <= 0x7F) { + *out++ = c; + } else if (c >= 0xA1 && c <= 0xFE) { + /* Kanji */ + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + if (c2 <= 0xA0 || c2 == 0xFF) { + *out++ = MBFL_BAD_INPUT; + 
continue; + } + + unsigned int s1 = c - 0x80, s2 = c2 - 0x80; + unsigned int w1 = (s1 << 8) | s2, w = 0; + + /* Conversion for combining characters */ + if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) { + int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len); + if (k >= 0) { + *out++ = jisx0213_u2_tbl[2*k]; + *out++ = jisx0213_u2_tbl[2*k+1]; + continue; + } + } + + /* Conversion for BMP */ + w1 = (s1 - 0x21)*94 + s2 - 0x21; + if (w1 < jisx0213_ucs_table_size) { + w = jisx0213_ucs_table[w1]; + } + + /* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */ + if (!w) { + w1 = (s1 << 8) | s2; + int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len); + if (k >= 0) { + w = jisx0213_jis_u5_tbl[k] + 0x20000; + } + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else if (c == 0x8E && p < e) { + /* Kana */ + unsigned char c2 = *p++; + if (c2 >= 0xA1 && c2 <= 0xDF) { + *out++ = 0xFEC0 + c2; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else if (c == 0x8F && p < e) { + unsigned char c2 = *p++; + if ((c2 == 0xA1 || (c2 >= 0xA3 && c2 <= 0xA5) || c2 == 0xA8 || (c2 >= 0xAC && c2 <= 0xAF) || (c2 >= 0xEE && c2 <= 0xFE)) && p < e) { + unsigned char c3 = *p++; + + if (c3 < 0xA1 || c3 == 0xFF) { + *out++ = MBFL_BAD_INPUT; + continue; + } + + unsigned int s1 = c2 - 0xA1, s2 = c3 - 0xA1; + + if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) { + int k; + for (k = 0; k < jisx0213_p2_ofst_len; k++) { + if (s1 == jisx0213_p2_ofst[k]) { + break; + } + } + k -= jisx0213_p2_ofst[k]; + + /* Check for Japanese chars in BMP */ + unsigned int s = (s1 + 94 + k)*94 + s2; + ZEND_ASSERT(s < jisx0213_ucs_table_size); + unsigned int w = jisx0213_ucs_table[s]; + + /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */ + if (!w) { + k = mbfl_bisec_srch2(((c2 
- 0x80 + k + 94) << 8) | (c3 - 0x80), jisx0213_jis_u5_key, jisx0213_u5_tbl_len); + if (k >= 0) { + w = jisx0213_jis_u5_tbl[k] + 0x20000; + } + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + uint32_t w; + if (buf->state) { + w = buf->state; + buf->state = 0; + goto process_codepoint; + } + + while (len--) { + w = *in++; +process_codepoint: ; + unsigned int s = 0; + + /* Check for 1st char of combining characters */ + if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) { + for (int k = 0; k < jisx0213_u2_tbl_len; k++) { + if (w == jisx0213_u2_tbl[2*k]) { + if (!len) { + if (!end) { + buf->state = w; + MB_CONVERT_BUF_STORE(buf, out, limit); + return; + } + } else { + uint32_t w2 = *in++; len--; + if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) { + k++; + } + if (w2 == jisx0213_u2_tbl[2*k+1]) { + s = jisx0213_u2_key[k]; + break; + } + in--; len++; + } + + /* Fallback */ + s = jisx0213_u2_fb_tbl[k]; + break; + } + } + } + + if (!s && (w == 0x5C || w == 0x7E)) { + /* EUC-JP-2004 can represent ASCII characters directly, so there is no need + * to use the JIS X 0208 REVERSE SOLIDUS for ASCII backslash, or WAVE DASH for tilde */ + s = w; + } + + /* Check for major Japanese chars: U+4E00-U+9FFF */ + if (!s) { + for (int k = 0; k < uni2jis_tbl_len; k++) { + if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) { + s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]]; + break; + } + } + } + + /* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */ + if (!s && w >= 
ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) { + int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len); + if (k >= 0) { + s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k]; + } + } + + /* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */ + if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) { + int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len); + if (k >= 0) { + s = jisx0213_u5_jis_tbl[k]; + } + } + + if (!s) { + /* CJK Compatibility Forms: U+FE30-U+FE4F */ + if (w == 0xFE45) { + s = 0x233E; + } else if (w == 0xFE46) { + s = 0x233D; + } else if (w >= 0xF91D && w <= 0xF9DC) { + /* CJK Compatibility Ideographs: U+F900-U+F92A */ + int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len); + if (k >= 0) { + s = ucs_r2b_jisx0213_cmap_val[k]; + } + } + } + + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0x7F) { + out = mb_convert_buf_add(out, s); + } else if (s <= 0xFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, 0x8E, s); + } else if (s <= 0x7EFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) + 0x80, (s & 0xFF) + 0x80); + } else { + unsigned int s2 = s & 0xFF; + int k = ((s >> 8) & 0xFF) - 0x7F; + ZEND_ASSERT(k < jisx0213_p2_ofst_len); + s = jisx0213_p2_ofst[k] + 0x21; + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); + out = mb_convert_buf_add3(out, 0x8F, s | 0x80, s2 | 0x80); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h b/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h index 09f7c43726178..4fe7c2f45199a 100644 --- a/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h +++ b/ext/mbstring/libmbfl/filters/unicode_table_jis2004.h @@ -5039,11 +5039,11 @@ static const int 
jisx0213_u2_tbl_len = sizeof(jisx0213_u2_key)/sizeof(unsigned s static const unsigned short jisx0213_p2_ofst[] = { - 1, 8, 3, 4, 5, 12, 13, 14, 15, 78, 79, 80, 81, 82, - 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94}; + 0, 7, 2, 3, 4, 11, 12, 13, 14, 77, 78, 79, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93 +}; -static const int jisx0213_p2_ofst_len = - sizeof(jisx0213_p2_ofst)/sizeof(unsigned short); +static const int jisx0213_p2_ofst_len = sizeof(jisx0213_p2_ofst)/sizeof(unsigned short); static const int uni2jis_tbl_range[][2] = { {0x0000, 0x045f}, From 48657d988cc45b43d74d16c62e838868bff728e7 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Fri, 28 Jan 2022 22:29:39 +0200 Subject: [PATCH 07/25] Implement fast text conversion interface for SJIS-2004 All the legacy implementations of JISX 0213:2004 encodings had a common bug; their 'flush function' did not call the next flush function in the chain of conversion filters. So if any of these encodings were converted to an encoding where the flush function was needed to finish the output string, then the output would be truncated. 
--- .../libmbfl/filters/mbfilter_sjis_2004.c | 179 +++++++++++++++++- 1 file changed, 177 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 8b39ce7dbbefe..911a9fbfe17b8 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -37,6 +37,8 @@ extern const unsigned char mblen_table_sjis[]; extern const unsigned char mblen_table_eucjp[]; +static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); @@ -55,8 +57,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis2004_wchar, &vtbl_wchar_sjis2004, - NULL, - NULL + mb_sjis2004_to_wchar, + mb_wchar_to_sjis2004 }; const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = { @@ -699,6 +701,179 @@ int mbfl_filt_conv_wchar_jis2004_flush(mbfl_convert_filter *filter) return 0; } +static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 1; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c <= 0x7F) { + if (c == 0x5C) { + *out++ = 0xA5; + } else if (c == 0x7E) { + *out++ = 0x203E; + } else { + *out++ = c; + } + } else if (c >= 0xA1 && c <= 0xDF) { + *out++ = 0xFEC0 + c; + } else if (c > 0x80 && c < 0xFD && c != 0xA0) { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + + if (c2 < 0x40 || c2 > 0xFC || c2 == 0x7F) { + *out++ = MBFL_BAD_INPUT; + continue; + } + + 
unsigned int s1, s2; + SJIS_DECODE(c, c2, s1, s2); + unsigned int w1 = (s1 << 8) | s2, w = 0; + + /* Conversion for combining characters */ + if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) { + int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len); + if (k >= 0) { + *out++ = jisx0213_u2_tbl[2*k]; + *out++ = jisx0213_u2_tbl[2*k+1]; + continue; + } + } + + /* Conversion for BMP */ + w1 = (s1 - 0x21)*94 + s2 - 0x21; + if (w1 < jisx0213_ucs_table_size) { + w = jisx0213_ucs_table[w1]; + } + + /* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */ + if (!w) { + w1 = (s1 << 8) | s2; + int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len); + if (k >= 0) { + w = jisx0213_jis_u5_tbl[k] + 0x20000; + } + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + uint32_t w; + if (buf->state) { + w = buf->state; + buf->state = 0; + goto process_codepoint; + } + + while (len--) { + w = *in++; +process_codepoint: ; + unsigned int s = 0; + + if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) { + for (int k = 0; k < jisx0213_u2_tbl_len; k++) { + if (w == jisx0213_u2_tbl[2*k]) { + if (!len) { + if (!end) { + buf->state = w; + MB_CONVERT_BUF_STORE(buf, out, limit); + return; + } + } else { + uint32_t w2 = *in++; len--; + if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) { + k++; + } + if (w2 == jisx0213_u2_tbl[2*k+1]) { + s = jisx0213_u2_key[k]; + break; + } + in--; len++; + } + + /* Fallback */ + s = 
jisx0213_u2_fb_tbl[k]; + break; + } + } + } + + /* Check for major Japanese chars: U+4E00-U+9FFF */ + if (!s) { + for (int k = 0; k < uni2jis_tbl_len; k++) { + if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) { + s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]]; + break; + } + } + } + + /* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */ + if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) { + int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len); + if (k >= 0) { + s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k]; + } + } + + /* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */ + if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) { + int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len); + if (k >= 0) { + s = jisx0213_u5_jis_tbl[k]; + } + } + + if (!s) { + /* CJK Compatibility Forms: U+FE30-U+FE4F */ + if (w == 0xFE45) { + s = 0x233E; + } else if (w == 0xFE46) { + s = 0x233D; + } else if (w >= 0xF91D && w <= 0xF9DC) { + /* CJK Compatibility Ideographs: U+F900-U+F92A */ + int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len); + if (k >= 0) { + s = ucs_r2b_jisx0213_cmap_val[k]; + } + } + } + + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjis2004); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} + static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; From 9a0cc3a7298cbea677fb3296b65a8e87667e4211 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Fri, 28 Jan 2022 22:42:14 
+0200 Subject: [PATCH 08/25] Implement fast text conversion interface for ISO-2022-JP-2004 There were bugs in the legacy implementation. Lots of them. It did not properly track whether it had switched to JISX 0213 plane 1 or plane 2. If it processed a character in plane 1 and then immediately one in plane 2, it failed to emit the escape code to switch to plane 2. Further, when converting codepoints from 0x80-0xFF to ISO-2022-JP-2004, the legacy implementation would totally disregard which mode it was operating in. Such codepoints would pass through directly to the output without any escape sequences being emitted. As if that were not enough, all the legacy implementations of JISX 0213:2004 encodings had another common bug; their 'flush function' did not call the next flush function in the chain of conversion filters. So if any of these encodings were converted to an encoding where the flush function was needed to finish the output string, then the output would be truncated. --- ext/mbstring/config.m4 | 1 - ext/mbstring/config.w32 | 2 +- .../libmbfl/filters/mbfilter_iso2022jp_2004.c | 69 ---- .../libmbfl/filters/mbfilter_iso2022jp_2004.h | 4 - .../libmbfl/filters/mbfilter_sjis_2004.c | 317 ++++++++++++++++++ .../tests/iso2022jp_2004_encoding.phpt | 7 + 6 files changed, 325 insertions(+), 75 deletions(-) delete mode 100644 ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c diff --git a/ext/mbstring/config.m4 b/ext/mbstring/config.m4 index 933d2ff21c7d4..65b8f2c4a60eb 100644 --- a/ext/mbstring/config.m4 +++ b/ext/mbstring/config.m4 @@ -109,7 +109,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [ libmbfl/filters/mbfilter_htmlent.c libmbfl/filters/mbfilter_hz.c libmbfl/filters/mbfilter_iso2022_jp_ms.c - libmbfl/filters/mbfilter_iso2022jp_2004.c libmbfl/filters/mbfilter_iso2022jp_mobile.c libmbfl/filters/mbfilter_iso2022_kr.c libmbfl/filters/mbfilter_jis.c diff --git a/ext/mbstring/config.w32 b/ext/mbstring/config.w32 index d90a0e3d31e84..d415be6cf7472 100644 ---
a/ext/mbstring/config.w32 +++ b/ext/mbstring/config.w32 @@ -27,7 +27,7 @@ if (PHP_MBSTRING != "no") { mbfilter_utf7.c mbfilter_utf7imap.c mbfilter_utf8.c \ mbfilter_utf8_mobile.c mbfilter_uuencode.c \ mbfilter_cp5022x.c mbfilter_sjis_mobile.c \ - mbfilter_sjis_mac.c mbfilter_iso2022jp_2004.c \ + mbfilter_sjis_mac.c \ mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c \ mbfilter_tl_jisx0201_jisx0208.c", "mbstring"); diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c deleted file mode 100644 index 45b71a8a98bde..0000000000000 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The author of this file: - * - */ -/* - * The source code included in this files was separated from mbfilter_jis.c - * by rui hirokawa on 18 aug 2011. 
- * - */ - -#include "mbfilter.h" -#include "mbfilter_iso2022jp_2004.h" -#include "mbfilter_sjis_2004.h" - -#include "unicode_table_jis.h" - -extern int mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter); - -const mbfl_encoding mbfl_encoding_2022jp_2004 = { - mbfl_no_encoding_2022jp_2004, - "ISO-2022-JP-2004", - "ISO-2022-JP-2004", - NULL, - NULL, - MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_2022jp_2004_wchar, - &vtbl_wchar_2022jp_2004, - NULL, - NULL -}; - -const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = { - mbfl_no_encoding_2022jp_2004, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_jis2004_wchar, - mbfl_filt_conv_jis2004_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_2022jp_2004, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_jis2004, - mbfl_filt_conv_wchar_jis2004_flush, - NULL, -}; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.h b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.h index 58b2dd31d8c4a..926dbfe9a864c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.h @@ -36,8 +36,4 @@ extern const mbfl_encoding mbfl_encoding_2022jp_2004; extern const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar; extern const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004; -int mbfl_filt_conv_2022jp_2004_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_2022jp_2004(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_any_2022jp_2004_flush(mbfl_convert_filter *filter); - #endif /* MBFL_MBFILTER_2022JP_2004_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 911a9fbfe17b8..3fd8f0f408930 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -30,6 +30,7 @@ #include "mbfilter.h" 
#include "mbfilter_sjis_2004.h" #include "mbfilter_euc_jp_2004.h" +#include "mbfilter_iso2022jp_2004.h" #include "unicode_table_jis2004.h" #include "unicode_table_jis.h" @@ -41,6 +42,8 @@ static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t static void mb_wchar_to_sjis2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_eucjp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); extern int mbfl_bisec_srch(int w, const unsigned short *tbl, int n); extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n); @@ -114,6 +117,39 @@ const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = { NULL, }; +const mbfl_encoding mbfl_encoding_2022jp_2004 = { + mbfl_no_encoding_2022jp_2004, + "ISO-2022-JP-2004", + "ISO-2022-JP-2004", + NULL, + NULL, + MBFL_ENCTYPE_GL_UNSAFE, + &vtbl_2022jp_2004_wchar, + &vtbl_wchar_2022jp_2004, + mb_iso2022jp2004_to_wchar, + mb_wchar_to_iso2022jp2004 +}; + +const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = { + mbfl_no_encoding_2022jp_2004, + mbfl_no_encoding_wchar, + mbfl_filt_conv_common_ctor, + NULL, + mbfl_filt_conv_jis2004_wchar, + mbfl_filt_conv_jis2004_wchar_flush, + NULL, +}; + +const struct mbfl_convert_vtbl vtbl_wchar_2022jp_2004 = { + mbfl_no_encoding_wchar, + mbfl_no_encoding_2022jp_2004, + mbfl_filt_conv_common_ctor, + NULL, + mbfl_filt_conv_wchar_jis2004, + mbfl_filt_conv_wchar_jis2004_flush, + NULL, +}; + #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) #define SJIS_ENCODE(c1,c2,s1,s2) \ @@ -1101,3 +1137,284 @@ process_codepoint: ; MB_CONVERT_BUF_STORE(buf, out, limit); } + +#define 
ASCII 0 +#define JISX0208 1 +#define JISX0213_PLANE1 2 +#define JISX0213_PLANE2 3 + +static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 1; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c <= 0x7F) { + if (c == 0x1B) { + if ((e - p) < 2) { + *out++ = MBFL_BAD_INPUT; + p = e; + break; + } + unsigned char c2 = *p++; + unsigned char c3 = *p++; + if (c2 == '$') { + if (c3 == 'B') { + *state = JISX0208; + } else if (c3 == '(') { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c4 = *p++; + if (c4 == 'Q') { + *state = JISX0213_PLANE1; + } else if (c4 == 'P') { + *state = JISX0213_PLANE2; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } else if (c2 == '(') { + if (c3 == 'B') { + *state = ASCII; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + p--; + *out++ = MBFL_BAD_INPUT; + } + } else if (*state >= JISX0208 && c > 0x20 && c < 0x7F) { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + if (c2 < 0x21 || c2 > 0x7E) { + *out++ = MBFL_BAD_INPUT; + continue; + } + + if (*state == JISX0213_PLANE1) { + unsigned int w1 = (c << 8) | c2; + + /* Conversion for combining characters */ + if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) { + int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len); + if (k >= 0) { + *out++ = jisx0213_u2_tbl[2*k]; + *out++ = jisx0213_u2_tbl[2*k+1]; + continue; + } + } + + /* Conversion for BMP */ + uint32_t w = 0; + w1 = (c - 0x21)*94 + c2 - 0x21; + if (w1 < jisx0213_ucs_table_size) { + w = jisx0213_ucs_table[w1]; + } + + /* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */ + if (!w) { + int k = 
mbfl_bisec_srch2((c << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len); + if (k >= 0) { + w = jisx0213_jis_u5_tbl[k] + 0x20000; + } + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else if (*state == JISX0213_PLANE2) { + + unsigned int s1 = c - 0x21, s2 = c2 - 0x21; + + if (((s1 <= 4 && s1 != 1) || s1 == 7 || (s1 >= 11 && s1 <= 14) || (s1 >= 77 && s1 < 94)) && s2 < 94) { + int k; + for (k = 0; k < jisx0213_p2_ofst_len; k++) { + if (s1 == jisx0213_p2_ofst[k]) { + break; + } + } + k -= jisx0213_p2_ofst[k]; + + /* Check for Japanese chars in BMP */ + unsigned int s = (s1 + 94 + k)*94 + s2; + ZEND_ASSERT(s < jisx0213_ucs_table_size); + uint32_t w = jisx0213_ucs_table[s]; + + /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */ + if (!w) { + k = mbfl_bisec_srch2(((c + k + 94) << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len); + if (k >= 0) { + w = jisx0213_jis_u5_tbl[k] + 0x20000; + } + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { /* state == JISX0208 */ + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + uint32_t w = 0; + if (s < jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[s]; + } + *out++ = w ? 
w : MBFL_BAD_INPUT; + } + } else { + *out++ = c; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_iso2022jp2004(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + uint32_t w; + if (buf->state & 0xFF00) { + int k = (buf->state >> 8) - 1; + w = jisx0213_u2_tbl[2*k]; + buf->state &= 0xFF; + goto process_codepoint; + } + + while (len--) { + w = *in++; +process_codepoint: ; + unsigned int s = 0; + + if (w == 0xE6 || (w >= 0x254 && w <= 0x2E9) || (w >= 0x304B && w <= 0x3053) || (w >= 0x30AB && w <= 0x30C8) || w == 0x31F7) { + for (int k = 0; k < jisx0213_u2_tbl_len; k++) { + if (w == jisx0213_u2_tbl[2*k]) { + if (!len) { + if (!end) { + buf->state |= (k+1) << 8; + MB_CONVERT_BUF_STORE(buf, out, limit); + return; + } + } else { + uint32_t w2 = *in++; len--; + if ((w == 0x254 || w == 0x28C || w == 0x259 || w == 0x25A) && w2 == 0x301) { + k++; + } + if (w2 == jisx0213_u2_tbl[2*k+1]) { + s = jisx0213_u2_key[k]; + break; + } + in--; len++; + } + + s = jisx0213_u2_fb_tbl[k]; + break; + } + } + } + + if (!s && (w == 0x5C || w == 0x7E)) { + /* ISO-2022-JP-2004 can represent ASCII characters directly, so there is no need + * to use the JIS X 0208 REVERSE SOLIDUS for ASCII backslash, or WAVE DASH for tilde */ + s = w; + } + + /* Check for major Japanese chars: U+4E00-U+9FFF */ + if (!s) { + for (int k = 0; k < uni2jis_tbl_len; k++) { + if (w >= uni2jis_tbl_range[k][0] && w <= uni2jis_tbl_range[k][1]) { + s = uni2jis_tbl[k][w - uni2jis_tbl_range[k][0]]; + break; + } + } + } + + /* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */ + if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) { + int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len); + if (k >= 0) { + s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k]; + } + 
} + + /* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */ + if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) { + int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len); + if (k >= 0) { + s = jisx0213_u5_jis_tbl[k]; + } + } + + if (!s) { + /* CJK Compatibility Forms: U+FE30-U+FE4F */ + if (w == 0xFE45) { + s = 0x233E; + } else if (w == 0xFE46) { + s = 0x233D; + } else if (w >= 0xF91D && w <= 0xF9DC) { + /* CJK Compatibility Ideographs: U+F900-U+F92A */ + int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len); + if (k >= 0) { + s = ucs_r2b_jisx0213_cmap_val[k]; + } + } + } + + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0x7F) { + if (buf->state != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add3(out, 0x1B, '(', 'B'); + buf->state = ASCII; + } + out = mb_convert_buf_add(out, s); + } else if (s <= 0xFF) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp2004); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0x7EFF) { + if (buf->state != JISX0213_PLANE1) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6); + out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'Q'); + buf->state = JISX0213_PLANE1; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + } + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); + } else { + if (buf->state != JISX0213_PLANE2) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6); + out = mb_convert_buf_add4(out, 0x1B, '$', '(', 'P'); + buf->state = JISX0213_PLANE2; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + } + unsigned int s2 = s & 0xFF; + int k = ((s >> 8) & 0xFF) - 0x7F; + ZEND_ASSERT(k < jisx0213_p2_ofst_len); + s = jisx0213_p2_ofst[k] + 0x21; + out = mb_convert_buf_add2(out, s, s2); + } + } + + if (end && buf->state != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, 
out, limit, 3); + out = mb_convert_buf_add3(out, 0x1B, '(', 'B'); + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/iso2022jp_2004_encoding.phpt b/ext/mbstring/tests/iso2022jp_2004_encoding.phpt index f678aeae2c738..68dd817aec3c3 100644 --- a/ext/mbstring/tests/iso2022jp_2004_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_2004_encoding.phpt @@ -124,6 +124,12 @@ for ($i = 0; $i <= 0x7F; $i++) { echo "Encoding verification and conversion works for all ASCII characters\n"; +for ($i = 0x80; $i <= 0x9F; $i++) { + convertInvalidString("\x00" . chr($i), "%", 'UTF-16BE', 'ISO-2022-JP-2004'); +} + +echo "Codepoints from U+0080-009F are rejected\n"; + /* Try a bare ESC */ identifyInvalidString("\x1B", 'ISO-2022-JP-2004'); @@ -330,6 +336,7 @@ echo "All done!\n"; ?> --EXPECT-- Encoding verification and conversion works for all ASCII characters +Codepoints from U+0080-009F are rejected Encoding verification and conversion rejects all invalid single bytes Encoding verification and conversion work on JISX-0208 characters Encoding verification and conversion work on JISX-0213:2004 plane 1 characters From 215a23cb071391b48dff1d862a87bf009d8157bf Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 5 Feb 2022 22:53:02 +0200 Subject: [PATCH 09/25] Implement fast text conversion interface for mobile variants of UTF-8 --- .../libmbfl/filters/mbfilter_utf8_mobile.c | 421 +++++++++++++++++- 1 file changed, 413 insertions(+), 8 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c index 04af4c0939e24..d6c64e1b7780d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c @@ -32,9 +32,21 @@ #include "mbfilter_utf8_mobile.h" #include "mbfilter_sjis_mobile.h" +#include "emoji2uni.h" + +extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n); extern int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter 
*filter); extern int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter); +static size_t mb_utf8_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf8_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_utf8_kddi_a_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf8_kddi_a(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_utf8_kddi_b_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf8_kddi_b(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static size_t mb_utf8_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_utf8_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + extern const unsigned char mblen_table_utf8[]; static const char *mbfl_encoding_utf8_docomo_aliases[] = {"UTF-8-DOCOMO", "UTF8-DOCOMO", NULL}; @@ -50,8 +62,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = { 0, &vtbl_utf8_docomo_wchar, &vtbl_wchar_utf8_docomo, - NULL, - NULL + mb_utf8_docomo_to_wchar, + mb_wchar_to_utf8_docomo }; const mbfl_encoding mbfl_encoding_utf8_kddi_a = { @@ -63,8 +75,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = { 0, &vtbl_utf8_kddi_a_wchar, &vtbl_wchar_utf8_kddi_a, - NULL, - NULL + mb_utf8_kddi_a_to_wchar, + mb_wchar_to_utf8_kddi_a }; const mbfl_encoding mbfl_encoding_utf8_kddi_b = { @@ -76,8 +88,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = { 0, &vtbl_utf8_kddi_b_wchar, &vtbl_wchar_utf8_kddi_b, - NULL, - NULL + mb_utf8_kddi_b_to_wchar, + mb_wchar_to_utf8_kddi_b }; const mbfl_encoding mbfl_encoding_utf8_sb = { @@ -89,8 +101,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = { 0, &vtbl_utf8_sb_wchar, &vtbl_wchar_utf8_sb, - NULL, - NULL + mb_utf8_sb_to_wchar, + 
mb_wchar_to_utf8_sb }; const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = { @@ -313,3 +325,396 @@ int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter) return 0; } + +/* Regional Indicator Unicode codepoints are from 0x1F1E6-0x1F1FF + * These correspond to the letters A-Z + * To display the flag emoji for a country, two unicode codepoints are combined, + * which correspond to the two-letter code for that country + * This macro converts uppercase ASCII values to Regional Indicator codepoints */ +#define NFLAGS(c) (0x1F1A5+(int)(c)) + +static const char nflags_s[10][2] = {"CN","DE","ES","FR","GB","IT","JP","KR","RU","US"}; +static const int nflags_code_kddi[10] = { 0x2549, 0x2546, 0x24C0, 0x2545, 0x2548, 0x2547, 0x2750, 0x254A, 0x24C1, 0x27F7 }; +static const int nflags_code_sb[10] = { 0x2B0A, 0x2B05, 0x2B08, 0x2B04, 0x2B07, 0x2B06, 0x2B02, 0x2B0B, 0x2B09, 0x2B03 }; + +static size_t mb_mobile_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state, const unsigned short emoji_map[][3], int (*convert_emoji)(int s, int *snd), int n) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 1; + + while (p < e && out < limit) { + unsigned char c = *p++; + unsigned int s = 0; + + if (c <= 0x7F) { + *out++ = c; + continue; + } else if (c >= 0xC2 && c <= 0xDF && p < e) { + unsigned char c2 = *p++; + + if ((c2 & 0xC0) == 0x80) { + s = ((c & 0x1F) << 6) | (c2 & 0x3F); + } else { + *out++ = MBFL_BAD_INPUT; + p--; + continue; + } + } else if (c >= 0xE0 && c <= 0xEF) { + if ((e - p) < 2) { + *out++ = MBFL_BAD_INPUT; + while (p < e && (*p & 0xC0) == 0x80) { + p++; + } + continue; + } + unsigned char c2 = *p++; + unsigned char c3 = *p++; + + if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { + *out++ = MBFL_BAD_INPUT; + p -= 2; + continue; + } else if ((c3 & 0xC0) != 0x80) { + *out++ = MBFL_BAD_INPUT; + p--; + continue; + } else { + s = ((c & 
0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + } + } else if (c >= 0xF0 && c <= 0xF4) { + if ((e - p) < 3) { + *out++ = MBFL_BAD_INPUT; + if (p < e) { + unsigned char c2 = *p; + if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90)) { + while (p < e && (*p & 0xC0) == 0x80) { + p++; + } + } + } + continue; + } + unsigned char c2 = *p++; + unsigned char c3 = *p++; + unsigned char c4 = *p++; + + if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { + *out++ = MBFL_BAD_INPUT; + p -= 3; + continue; + } else if ((c3 & 0xC0) != 0x80) { + *out++ = MBFL_BAD_INPUT; + p -= 2; + continue; + } else if ((c4 & 0xC0) != 0x80) { + *out++ = MBFL_BAD_INPUT; + p--; + continue; + } else { + s = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F); + } + } else { + *out++ = MBFL_BAD_INPUT; + continue; + } + + int s1 = 0, snd = 0; + if (mbfilter_conv_r_map_tbl(s, &s1, emoji_map, n) > 0) { + s = convert_emoji(s1, &snd); + if (snd) { + *out++ = snd; + } + } + *out++ = s; + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static size_t mb_utf8_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_docomo2uni_pua, mbfilter_sjis_emoji_docomo2unicode, 4); +} + +static void mb_wchar_to_utf8_docomo(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + unsigned int s = 0; + int c1 = 0; + + if (w < 0x110000) { + if ((w == '#' || (w >= '0' && w <= '9')) && len) { + uint32_t w2 = *in++; len--; + + if (w2 == 0x20E3) { + if (w == '#') { + s = 0x2964; + } else if (w == '0') { + s = 0x296F; + } else { + s = 0x2966 + (w - '1'); + } + } else { + in--; len++; + } + } else if (w == 0xA9) { /* Copyright sign */ + s = 0x29B5; + } else if (w == 0xAE) { /* 
Registered sign */ + s = 0x29BA; + } else if (w >= mb_tbl_uni_docomo2code2_min && w <= mb_tbl_uni_docomo2code2_max) { + int i = mbfl_bisec_srch2(w, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len); + if (i >= 0) { + s = mb_tbl_uni_docomo2code2_value[i]; + } + } else if (w >= mb_tbl_uni_docomo2code3_min && w <= mb_tbl_uni_docomo2code3_max) { + int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len); + if (i >= 0) { + s = mb_tbl_uni_docomo2code3_value[i]; + } + } else if (w >= mb_tbl_uni_docomo2code5_min && w <= mb_tbl_uni_docomo2code5_max) { + int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len); + if (i >= 0) { + s = mb_tbl_uni_docomo2code5_val[i]; + } + } + + if (s && mbfilter_conv_map_tbl(s, &c1, mbfl_docomo2uni_pua, 4) > 0) { + w = c1; + } + + if (w <= 0x7F) { + out = mb_convert_buf_add(out, w); + } else if (w <= 0x7FF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, ((w >> 6) & 0x1F) | 0xC0, (w & 0x3F) | 0x80); + } else if (w <= 0xFFFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); + out = mb_convert_buf_add3(out, ((w >> 12) & 0xF) | 0xE0, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80); + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add4(out, ((w >> 18) & 0x7) | 0xF0, ((w >> 12) & 0x3F) | 0x80, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80); + } + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf8_docomo); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} + +static size_t mb_utf8_kddi_a_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_kddi2uni_pua, mbfilter_sjis_emoji_kddi2unicode, 7); +} + +static void mb_wchar_to_utf8_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end, const unsigned short 
emoji_map[][3], int n, mb_from_wchar_fn error_handler) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + unsigned int s = 0; + int c1 = 0; + + if (w < 0x110000) { + if ((w == '#' || (w >= '0' && w <= '9')) && len) { + uint32_t w2 = *in++; len--; + + if (w2 == 0x20E3) { + if (w == '#') { + s = 0x25BC; + } else if (w == '0') { + s = 0x2830; + } else { + s = 0x27A6 + (w - '1'); + } + } else { + in--; len++; + } + } else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */ + if (len) { + uint32_t w2 = *in++; len--; + + if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */ + for (int i = 0; i < 10; i++) { + if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) { + s = nflags_code_kddi[i]; + goto process_kuten; + } + } + } + + in--; len++; + } + + MB_CONVERT_ERROR(buf, out, limit, w, error_handler); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + continue; + } else if (w == 0xA9) { /* Copyright sign */ + s = 0x27DC; + } else if (w == 0xAE) { /* Registered sign */ + s = 0x27DD; + } else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) { + int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len); + if (i >= 0) { + s = mb_tbl_uni_kddi2code2_value[i]; + } + } else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) { + int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len); + if (i >= 0) { + s = mb_tbl_uni_kddi2code3_value[i]; + } + } else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) { + int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len); + if (i >= 0) { + s = mb_tbl_uni_kddi2code5_val[i]; + } + } + +process_kuten: + if (s && mbfilter_conv_map_tbl(s, &c1, emoji_map, n) > 0) { + w = c1; + } + + if (w <= 0x7F) { + out = mb_convert_buf_add(out, w); + } else 
if (w <= 0x7FF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, ((w >> 6) & 0x1F) | 0xC0, (w & 0x3F) | 0x80); + } else if (w <= 0xFFFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); + out = mb_convert_buf_add3(out, ((w >> 12) & 0xF) | 0xE0, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80); + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add4(out, ((w >> 18) & 0x7) | 0xF0, ((w >> 12) & 0x3F) | 0x80, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80); + } + } else { + MB_CONVERT_ERROR(buf, out, limit, w, error_handler); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} + +static void mb_wchar_to_utf8_kddi_a(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + mb_wchar_to_utf8_kddi(in, len, buf, end, mbfl_kddi2uni_pua, 7, mb_wchar_to_utf8_kddi_a); +} + +static size_t mb_utf8_kddi_b_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_kddi2uni_pua_b, mbfilter_sjis_emoji_kddi2unicode, 8); +} + +static void mb_wchar_to_utf8_kddi_b(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + mb_wchar_to_utf8_kddi(in, len, buf, end, mbfl_kddi2uni_pua_b, 8, mb_wchar_to_utf8_kddi_b); +} + +static size_t mb_utf8_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + return mb_mobile_utf8_to_wchar(in, in_len, buf, bufsize, state, mbfl_sb2uni_pua, mbfilter_sjis_emoji_sb2unicode, 6); +} + +static void mb_wchar_to_utf8_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + unsigned int s = 0; + int c1 = 0; + + if (w < 0x110000) { + if ((w == '#' || (w >= '0' && w <= '9')) && len) { + uint32_t w2 = *in++; len--; + + if 
(w2 == 0x20E3) { + if (w == '#') { + s = 0x2817; + } else if (w == '0') { + s = 0x282C; + } else { + s = 0x2823 + (w - '1'); + } + } else { + in--; len++; + } + } else if (w >= NFLAGS('C') && w <= NFLAGS('U')) { /* C for CN, U for US */ + if (len) { + uint32_t w2 = *in++; len--; + + if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */ + for (int i = 0; i < 10; i++) { + if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) { + s = nflags_code_sb[i]; + goto process_kuten; + } + } + } + + in--; len++; + } + + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf8_sb); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + continue; + } else if (w == 0xA9) { /* Copyright sign */ + s = 0x2855; + } else if (w == 0xAE) { /* Registered sign */ + s = 0x2856; + } else if (w >= mb_tbl_uni_sb2code2_min && w <= mb_tbl_uni_sb2code2_max) { + int i = mbfl_bisec_srch2(w, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len); + if (i >= 0) { + s = mb_tbl_uni_sb2code2_value[i]; + } + } else if (w >= mb_tbl_uni_sb2code3_min && w <= mb_tbl_uni_sb2code3_max) { + int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len); + if (i >= 0) { + s = mb_tbl_uni_sb2code3_value[i]; + } + } else if (w >= mb_tbl_uni_sb2code5_min && w <= mb_tbl_uni_sb2code5_max) { + int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len); + if (i >= 0) { + s = mb_tbl_uni_sb2code5_val[i]; + } + } + +process_kuten: + if (s && mbfilter_conv_map_tbl(s, &c1, mbfl_sb2uni_pua, 6) > 0) { + w = c1; + } + + if (w <= 0x7F) { + out = mb_convert_buf_add(out, w); + } else if (w <= 0x7FF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, ((w >> 6) & 0x1F) | 0xC0, (w & 0x3F) | 0x80); + } else if (w <= 0xFFFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); + out = mb_convert_buf_add3(out, ((w >> 12) & 0xF) | 0xE0, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80); + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len 
+ 4); + out = mb_convert_buf_add4(out, ((w >> 18) & 0x7) | 0xF0, ((w >> 12) & 0x3F) | 0x80, ((w >> 6) & 0x3F) | 0x80, (w & 0x3F) | 0x80); + } + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_utf8_sb); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} From 2c6ad6e8ec393378650ad74520ff2b871845e701 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 13 Feb 2022 23:04:11 +0200 Subject: [PATCH 10/25] Implement fast text conversion interface for ISO-2022-JP-MS --- .../libmbfl/filters/mbfilter_iso2022_jp_ms.c | 226 +++++++++++++++++- 1 file changed, 224 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c index f26cbbeb8aa97..94e2e4bd84844 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c @@ -34,6 +34,9 @@ #include "unicode_table_jis.h" #include "cp932_table.h" +static size_t mb_iso2022jpms_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_iso2022jpms(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter); static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL}; @@ -47,8 +50,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022jpms_wchar, &vtbl_wchar_2022jpms, - NULL, - NULL + mb_iso2022jpms_to_wchar, + mb_wchar_to_iso2022jpms }; const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = { @@ -80,6 +83,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_2022jpms = { #define idxtojis1(c) (((c) / 94) + 0x21) #define idxtojis2(c) (((c) % 94) + 0x21) +#define ASCII 0 #define JISX0201_KANA 0x20 #define JISX0208_KANJI 0x80 #define UDC 0xA0 @@ -357,3 +361,221 @@ int mbfl_filt_conv_any_2022jpms_flush(mbfl_convert_filter *filter) return 0; } + +static 
size_t mb_iso2022jpms_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c == 0x1B) { + if ((e - p) < 2) { + *out++ = MBFL_BAD_INPUT; + p = e; + break; + } + unsigned char c2 = *p++; + unsigned char c3 = *p++; + + if (c2 == '$') { + if (c3 == '@' || c3 == 'B') { + *state = JISX0208_KANJI; + } else if (c3 == '(' && p < e) { + unsigned char c4 = *p++; + + if (c4 == '@' || c4 == 'B') { + *state = JISX0208_KANJI; + } else if (c4 == '?') { + *state = UDC; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } else if (c2 == '(') { + if (c3 == 'B' || c3 == 'J') { + *state = ASCII; + } else if (c3 == 'I') { + *state = JISX0201_KANA; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + p--; + *out++ = MBFL_BAD_INPUT; + } + } else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) { + *out++ = 0xFF40 + c; + } else if ((*state == JISX0208_KANJI || *state == UDC) && c >= 0x21 && c <= 0x7F) { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + unsigned int w = 0; + + if (c2 >= 0x21 && c2 <= 0x7E) { + unsigned int s = ((c - 0x21) * 94) + c2 - 0x21; + if (*state == JISX0208_KANJI) { + if (s <= 137) { + if (s == 31) { + w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ + } else if (s == 32) { + w = 0xFF5E; /* FULLWIDTH TILDE */ + } else if (s == 33) { + w = 0x2225; /* PARALLEL TO */ + } else if (s == 60) { + w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */ + } else if (s == 80) { + w = 0xFFE0; /* FULLWIDTH CENT SIGN */ + } else if (s == 81) { + w = 0xFFE1; /* FULLWIDTH POUND SIGN */ + } else if (s == 137) { + w = 0xFFE2; /* FULLWIDTH NOT SIGN */ + } + } + + if (!w) { + if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { + w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; + } else if (s < 
jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[s]; + } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { + w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min]; + } + } + } else if (c >= 0x21 && c <= 0x34) { + w = 0xE000 + ((c - 0x21) * 94) + c2 - 0x21; + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else if (c <= 0x7F) { + *out++ = c; + } else if (c >= 0xA1 && c <= 0xDF) { + *out++ = 0xFEC0 + c; + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_iso2022jpms(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + unsigned int s = 0; + + if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { + s = ucs_a1_jis_table[w - ucs_a1_jis_table_min]; + } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { + s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; + } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) { + s = ucs_i_jis_table[w - ucs_i_jis_table_min]; + } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { + s = ucs_r_jis_table[w - ucs_r_jis_table_min]; + } else if (w >= 0xE000 && w < (0xE000 + 20*94)) { + /* Private User Area (95ku - 114ku) */ + s = ((((w - 0xE000) / 94) + 0x7F) << 8) | (((w - 0xE000) % 94) + 0x21); + } + + if (!s) { + if (w == 0xA5) { /* YEN SIGN */ + s = 0x216F; /* FULLWIDTH YEN SIGN */ + } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ + s = 0x2140; + } else if (w == 0x2225) { /* PARALLEL TO */ + s = 0x2142; + } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */ + s = 0x215D; + } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */ + s = 0x2171; + } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */ + s = 0x2172; + } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */ + s = 0x224C; + } + } + + if (s 
>= 0xA1A1) /* JISX 0212 */ + s = 0; + + if (!s && w) { + for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) { + if (w == cp932ext1_ucs_table[i]) { + s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21; + break; + } + } + + if (!s) { + for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) { + if (w == cp932ext3_ucs_table[i]) { + s = cp932ext3_cp932ext2_jis(i); + break; + } + } + } + } + + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0x7F) { + if (buf->state != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add3(out, 0x1B, '(', 'B'); + buf->state = ASCII; + } + out = mb_convert_buf_add(out, s); + } else if (s >= 0xA1 && s <= 0xDF) { + if (buf->state != JISX0201_KANA) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add3(out, 0x1B, '(', 'I'); + buf->state = JISX0201_KANA; + } + out = mb_convert_buf_add(out, s & 0x7F); + } else if (s <= 0x7E7E) { + if (buf->state != JISX0208_KANJI) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5); + out = mb_convert_buf_add3(out, 0x1B, '$', 'B'); + buf->state = JISX0208_KANJI; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + } + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0x7F); + } else if (s < 0x927F) { + if (buf->state != UDC) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6); + out = mb_convert_buf_add4(out, 0x1B, '$', '(', '?'); + buf->state = UDC; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + } + out = mb_convert_buf_add2(out, ((s >> 8) - 0x5E) & 0x7F, s & 0x7F); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jpms); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + } + + if (end && buf->state != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 3); + out = mb_convert_buf_add3(out, 0x1B, '(', 'B'); + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} From 
e5b5d48d649b08c7e796d195f3d8ffb0f0b604f8 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 27 Feb 2022 20:02:17 +0200 Subject: [PATCH 11/25] Implement fast text conversion interface for ISO-2022-JP-KDDI One bug in the previous implementation; when it saw a sequence of codepoints which looked like they might need to be emitted as a special KDDI emoji, it would totally forget whether it was in ASCII mode, JISX 0208 mode, or something else. So it could not reliably emit the correct escape sequence to switch to the right mode. Further, if the input ends with a codepoint which looks like it could be part of a special KDDI emoji, then the legacy code did not emit an escape sequence to switch back to ASCII mode at the end of the string. This means that the emitted ISO-2022-JP-KDDI strings could not always be safely concatenated. --- .../filters/mbfilter_iso2022jp_mobile.c | 287 +++++++++++++++++- 1 file changed, 285 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c index 450ac0c932f8c..fa299523187ab 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c @@ -34,10 +34,16 @@ #include "unicode_table_cp932_ext.h" #include "unicode_table_jis.h" #include "cp932_table.h" +#include "emoji2uni.h" + +static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter); +extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n); + static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL}; const mbfl_encoding mbfl_encoding_2022jp_kddi = { @@ 
-49,8 +55,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022jp_kddi_wchar, &vtbl_wchar_2022jp_kddi, - NULL, - NULL + mb_iso2022jp_kddi_to_wchar, + mb_wchar_to_iso2022jp_kddi }; const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = { @@ -115,6 +121,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_2022jp_kddi = { s1 = ((c1) << 8) | (c2); \ s2 = 1 +#define ASCII 0 #define JISX0201_KANA 0x20 #define JISX0208_KANJI 0x80 @@ -363,3 +370,279 @@ static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter) return 0; } + +static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 1; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c == 0x1B) { + if ((e - p) < 2) { + p = e; + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + unsigned char c3 = *p++; + + if (c2 == '$') { + if (c3 == '@' || c3 == 'B') { + *state = JISX0208_KANJI; + } else if (c3 == '(') { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c4 = *p++; + + if (c4 == '@' || c4 == 'B') { + *state = JISX0208_KANJI; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + *out++ = MBFL_BAD_INPUT; + } + } else if (c2 == '(') { + if (c3 == 'B' || c3 == 'J') { + *state = ASCII; + } else if (c3 == 'I') { + *state = JISX0201_KANA; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else { + p--; + *out++ = MBFL_BAD_INPUT; + } + } else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) { + *out++ = 0xFF40 + c; + } else if (*state == JISX0208_KANJI && c >= 0x21 && c <= 0x7F) { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + + if (c2 >= 0x21 && c2 <= 0x7E) { + unsigned int s = ((c - 0x21) * 94) + c2 - 0x21; + uint32_t w = 0; + + if (s <= 137) { + if (s == 31) { + w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ + } else if 
(s == 32) { + w = 0xFF5E; /* FULLWIDTH TILDE */ + } else if (s == 33) { + w = 0x2225; /* PARALLEL TO */ + } else if (s == 60) { + w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */ + } else if (s == 80) { + w = 0xFFE0; /* FULLWIDTH CENT SIGN */ + } else if (s == 81) { + w = 0xFFE1; /* FULLWIDTH POUND SIGN */ + } else if (s == 137) { + w = 0xFFE2; /* FULLWIDTH NOT SIGN */ + } + } + + if (s >= (84 * 94) && s < (91 * 94)) { + int snd = 0; + s += 22 * 94; + w = mbfilter_sjis_emoji_kddi2unicode(s, &snd); + if (w && snd) { + *out++ = snd; + } + } + + if (!w) { + if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { + w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; + } else if (s < jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[s]; + } + } + + *out++ = w ? w : MBFL_BAD_INPUT; + } else { + *out++ = MBFL_BAD_INPUT; + } + } else if (c <= 0x7F) { + *out++ = c; + } else if (c >= 0xA1 && c <= 0xDF) { + *out++ = 0xFEC0 + c; + } else { + *out++ = MBFL_BAD_INPUT; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +/* Regional Indicator Unicode codepoints are from 0x1F1E6-0x1F1FF + * These correspond to the letters A-Z + * To display the flag emoji for a country, two unicode codepoints are combined, + * which correspond to the two-letter code for that country + * This macro converts uppercase ASCII values to Regional Indicator codepoints */ +#define NFLAGS(c) (0x1F1A5+((unsigned int)(c))) + +static const char nflags_s[10][2] = { + "CN","DE","ES","FR","GB","IT","JP","KR","RU","US" +}; +static const int nflags_code_kddi[10] = { + 0x2549, 0x2546, 0x24C0, 0x2545, 0x2548, 0x2547, 0x2750, 0x254A, 0x24C1, 0x27F7 +}; + +static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + unsigned int s = 0; + + if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { + s 
= ucs_a1_jis_table[w - ucs_a1_jis_table_min]; + } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { + s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; + } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) { + s = ucs_i_jis_table[w - ucs_i_jis_table_min]; + } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { + s = ucs_r_jis_table[w - ucs_r_jis_table_min]; + } + + if (!s) { + if (w == 0xA5) { /* YEN SIGN */ + s = 0x216F; /* FULLWIDTH YEN SIGN */ + } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ + s = 0x2140; + } else if (w == 0x2225) { /* PARALLEL TO */ + s = 0x2142; + } else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */ + s = 0x215D; + } else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */ + s = 0x2171; + } else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */ + s = 0x2172; + } else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */ + s = 0x224C; + } + } + + if ((w == '#' || (w >= '0' && w <= '9')) && len) { + uint32_t w2 = *in++; len--; + + if (w2 == 0x20E3) { + unsigned int s1 = 0; + if (w == '#') { + s1 = 0x25BC; + } else if (w == '0') { + s1 = 0x2830; + } else { /* Previous character was '1'-'9' */ + s1 = 0x27A6 + (w - '1'); + } + s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600; + } else { + in--; len++; + } + } else if (w >= NFLAGS('C') && w <= NFLAGS('U') && len) { /* C for CN, U for US */ + uint32_t w2 = *in++; len--; + + if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */ + for (int i = 0; i < 10; i++) { + if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) { + unsigned int s1 = nflags_code_kddi[i]; + s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600; + goto found_flag_emoji; + } + } + } + + in--; len++; +found_flag_emoji: ; + } + + if (w == 0xA9) { /* Copyright sign */ + unsigned int s1 = 0x27DC; + s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600; + } else if (w == 0xAE) { /* Registered sign */ + unsigned int s1 = 0x27DD; + s = (((s1 / 94) + 0x21) << 8) 
+ ((s1 % 94) + 0x21) - 0x1600; + } else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) { + int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len); + if (i >= 0) { + unsigned int s1 = mb_tbl_uni_kddi2code2_value[i]; + s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600; + } + } else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) { + int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len); + if (i >= 0) { + unsigned int s1 = mb_tbl_uni_kddi2code3_value[i]; + s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600; + } + } else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) { + int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len); + if (i >= 0) { + unsigned int s1 = mb_tbl_uni_kddi2code5_val[i]; + s = (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600; + } + } + + if (!s || s >= 0xA1A1) { + s = 0; + for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) { + if (w == cp932ext1_ucs_table[i]) { + s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21; + break; + } + } + if (w == 0) + s = 0; + } + + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0x7F) { + if (buf->state != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add3(out, 0x1B, '(', 'B'); + buf->state = ASCII; + } + out = mb_convert_buf_add(out, s); + } else if (s >= 0xA1 && s <= 0xDF) { + if (buf->state != JISX0201_KANA) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add3(out, 0x1B, '(', 'I'); + buf->state = JISX0201_KANA; + } + out = mb_convert_buf_add(out, s & 0x7F); + } else if (s <= 0x7E7E) { + if (buf->state != JISX0208_KANJI) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5); + out = mb_convert_buf_add3(out, 0x1B, '$', 'B'); + buf->state = 
JISX0208_KANJI; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + } + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + } + + if (end && buf->state != ASCII) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 3); + out = mb_convert_buf_add3(out, 0x1B, '(', 'B'); + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} From 3a97806cfb7857fcac91e321b6d171576541b7e9 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 28 Mar 2022 22:07:04 +0200 Subject: [PATCH 12/25] Implement fast text conversion interface for '8bit' --- ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c | 40 +++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c index 670b23da8cbb5..8fe51c9fd4cbb 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c @@ -36,6 +36,8 @@ const struct mbfl_convert_vtbl vtbl_8bit_wchar; const struct mbfl_convert_vtbl vtbl_wchar_8bit; static int mbfl_filt_conv_8bit_wchar(int c, mbfl_convert_filter *filter); static int mbfl_filt_conv_wchar_8bit(int c, mbfl_convert_filter *filter); +static size_t mb_8bit_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_8bit(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static const char *mbfl_encoding_8bit_aliases[] = {"binary", NULL}; @@ -48,8 +50,8 @@ const mbfl_encoding mbfl_encoding_8bit = { MBFL_ENCTYPE_SBCS, &vtbl_8bit_wchar, &vtbl_wchar_8bit, - NULL, - NULL + mb_8bit_to_wchar, + mb_wchar_to_8bit }; const struct mbfl_convert_vtbl vtbl_8bit_wchar = { @@ -89,3 +91,37 @@ static int mbfl_filt_conv_wchar_8bit(int c, mbfl_convert_filter *filter) return 0; } + +static size_t mb_8bit_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned 
int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; + + while (p < e && out < limit) { + unsigned char c = *p++; + *out++ = c; + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_8bit(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + if (w <= 0xFF) { + out = mb_convert_buf_add(out, w); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_8bit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} From 3ced8516748e62917bd97feaad1cc03b174d8ef4 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 23 Apr 2022 20:56:52 +0200 Subject: [PATCH 13/25] Implement fast text conversion interface for UUENCODE --- .../libmbfl/filters/mbfilter_uuencode.c | 163 +++++++++++++++++- ext/mbstring/tests/uuencode_encoding.phpt | 41 +++++ 2 files changed, 195 insertions(+), 9 deletions(-) create mode 100644 ext/mbstring/tests/uuencode_encoding.phpt diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c index 9571131aad5b9..e2fef079c595e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c @@ -30,6 +30,9 @@ #include "mbfilter.h" #include "mbfilter_uuencode.h" +static size_t mb_uuencode_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_uuencode(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + const mbfl_encoding mbfl_encoding_uuencode = { mbfl_no_encoding_uuencode, "UUENCODE", @@ -39,8 +42,8 @@ const mbfl_encoding mbfl_encoding_uuencode = { MBFL_ENCTYPE_SBCS, NULL, NULL, - NULL, - NULL + mb_uuencode_to_wchar, + mb_wchar_to_uuencode }; const struct mbfl_convert_vtbl 
vtbl_uuencode_8bit = { @@ -55,15 +58,21 @@ const struct mbfl_convert_vtbl vtbl_uuencode_8bit = { #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) -/* uuencode => any */ -#define UUDEC(c) (char)(((c)-' ')&077) -static const char * uuenc_begin_text = "begin "; -enum { uudec_state_ground=0, uudec_state_inbegin, +#define UUDEC(c) (char)(((c)-' ') & 077) +static const char *uuenc_begin_text = "begin "; +enum { + uudec_state_ground=0, + uudec_state_inbegin, uudec_state_until_newline, - uudec_state_size, uudec_state_a, uudec_state_b, uudec_state_c, uudec_state_d, - uudec_state_skip_newline}; + uudec_state_size, + uudec_state_a, + uudec_state_b, + uudec_state_c, + uudec_state_d, + uudec_state_skip_newline +}; -int mbfl_filt_conv_uudec(int c, mbfl_convert_filter * filter) +int mbfl_filt_conv_uudec(int c, mbfl_convert_filter *filter) { int n; @@ -135,6 +144,8 @@ int mbfl_filt_conv_uudec(int c, mbfl_convert_filter * filter) CK((*filter->output_function)( (B << 4) | (C >> 2), filter->data)); if (n-- > 0) CK((*filter->output_function)( (C << 6) | D, filter->data)); + if (n < 0) + n = 0; filter->cache = n << 24; if (n == 0) @@ -149,3 +160,137 @@ int mbfl_filt_conv_uudec(int c, mbfl_convert_filter * filter) } return 0; } + +/* Using mbstring to decode UUEncoded text is already deprecated + * However, to facilitate the move to the new, faster internal conversion interface, + * We will temporarily implement it for UUEncode */ + +static size_t mb_uuencode_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; + + unsigned int _state = *state & 0xFF; + unsigned int size = *state >> 8; + + while (p < e && (limit - out) >= 3) { + unsigned char c = *p++; + + switch (_state) { + case uudec_state_ground: + if (c == 'b') { + if ((e - p) >= 5 && memcmp(p, uuenc_begin_text+1, 5) == 0) { + p += 5; + while (p < e && *p++ != '\n'); /* 
Consume everything up to newline */ + _state = uudec_state_size; + } + /* We didn't find "begin " */ + } + break; + + case uudec_state_size: + size = UUDEC(c); + _state = uudec_state_a; + break; + + case uudec_state_a: + if ((e - p) < 4) { + p = e; + break; + } + + unsigned int a = UUDEC(c); + unsigned int b = UUDEC(*p++); + unsigned int c = UUDEC(*p++); + unsigned int d = UUDEC(*p++); + + if (size > 0) { + *out++ = ((a << 2) | (b >> 4)) & 0xFF; + size--; + } + if (size > 0) { + *out++ = ((b << 4) | (c >> 2)) & 0xFF; + size--; + } + if (size > 0) { + *out++ = ((c << 6) | d) & 0xFF; + size--; + } + + _state = size ? uudec_state_a : uudec_state_skip_newline; + break; + + case uudec_state_skip_newline: + _state = uudec_state_size; + break; + } + } + + *state = (size << 8) | _state; + *in_len = e - p; + *in = p; + return out - buf; +} + +static unsigned char uuencode_six_bits(unsigned int bits) +{ + if (bits == 0) { + return '`'; + } else { + return bits + 32; + } +} + +static void mb_wchar_to_uuencode(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + /* Every 3 bytes of input gets encoded as 4 bytes of output + * Additionally, we have a 'length' byte and a newline for each line of output + * (Maximum 45 input bytes can be encoded on a single output line) + * Make space for two more bytes in case we start close to where a line must end */ + MB_CONVERT_BUF_ENSURE(buf, out, limit, ((len + 2) * 4 / 3) + (((len + 44) / 45) * 2) + (buf->state ? 
0 : sizeof("begin 0644 filename\n")) + 2); + + unsigned int bytes_encoded = buf->state >> 1; + + if (!buf->state) { + for (char *s = "begin 0644 filename\n"; *s; s++) { + out = mb_convert_buf_add(out, *s); + } + out = mb_convert_buf_add(out, MIN(len, 45) + 32); + buf->state |= 1; + } + + while (len--) { + uint32_t w = *in++; + uint32_t w2 = 0, w3 = 0; + + if (len) { + w2 = *in++; + len--; + } + if (len) { + w3 = *in++; + len--; + } + + out = mb_convert_buf_add4(out, uuencode_six_bits((w >> 2) & 0x3F), uuencode_six_bits(((w & 0x3) << 4) + ((w2 >> 4) & 0xF)), uuencode_six_bits(((w2 & 0xF) << 2) + ((w3 >> 6) & 0x3)), uuencode_six_bits(w3 & 0x3F)); + + bytes_encoded += 3; + + if (bytes_encoded >= 45) { + out = mb_convert_buf_add(out, '\n'); + if (len) { + out = mb_convert_buf_add(out, MIN(len, 45) + 32); + } + bytes_encoded = 0; + } + } + + if (bytes_encoded) { + out = mb_convert_buf_add(out, '\n'); + } + + buf->state = (bytes_encoded << 1) | (buf->state & 1); + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/uuencode_encoding.phpt b/ext/mbstring/tests/uuencode_encoding.phpt new file mode 100644 index 0000000000000..70dedc60e9007 --- /dev/null +++ b/ext/mbstring/tests/uuencode_encoding.phpt @@ -0,0 +1,41 @@ +--TEST-- +Temporary test of mbstring's UUEncode 'encoding' +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECTF-- +Deprecated: mb_convert_encoding(): Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead in %s + +Deprecated: mb_convert_encoding(): Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead in %s + +Deprecated: mb_convert_encoding(): Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead in %s + +Deprecated: mb_convert_encoding(): Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead in %s + +Deprecated: mb_convert_encoding(): Handling Uuencode via mbstring is deprecated; use 
convert_uuencode/convert_uudecode instead in %s +Done! From 5bfbdf246c49bf5449800ae1a28e95da30176f0c Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 23 Apr 2022 20:57:04 +0200 Subject: [PATCH 14/25] Implement fast text conversion interface for Base64 --- .../libmbfl/filters/mbfilter_base64.c | 126 +++++++++++++++++- ext/mbstring/tests/base64_encoding.phpt | 51 +++++++ 2 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 ext/mbstring/tests/base64_encoding.phpt diff --git a/ext/mbstring/libmbfl/filters/mbfilter_base64.c b/ext/mbstring/libmbfl/filters/mbfilter_base64.c index 86700c84990da..1f66e259aed80 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_base64.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_base64.c @@ -31,6 +31,9 @@ #include "mbfilter.h" #include "mbfilter_base64.h" +static size_t mb_base64_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_base64(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + const mbfl_encoding mbfl_encoding_base64 = { mbfl_no_encoding_base64, "BASE64", @@ -40,8 +43,8 @@ const mbfl_encoding mbfl_encoding_base64 = { MBFL_ENCTYPE_GL_UNSAFE, NULL, NULL, - NULL, - NULL + mb_base64_to_wchar, + mb_wchar_to_base64 }; const struct mbfl_convert_vtbl vtbl_8bit_b64 = { @@ -212,3 +215,122 @@ int mbfl_filt_conv_base64dec_flush(mbfl_convert_filter *filter) } return 0; } + +static int decode_base64(char c) +{ + if (c >= 'A' && c <= 'Z') { + return c - 'A'; + } else if (c >= 'a' && c <= 'z') { /* a - z */ + return c - 'a' + 26; + } else if (c >= '0' && c <= '9') { /* 0 - 9 */ + return c - '0' + 52; + } else if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } + return -1; +} + +static size_t mb_base64_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + /* Reserve two slots at the end of the output buffer so that we always have + * 
space to emit any trailing bytes when we hit the end of the input string */ + uint32_t *out = buf, *limit = buf + bufsize - 2; + + unsigned int bits = *state & 0xFF, cache = *state >> 8; + + while (p < e && (limit - out) >= 3) { + unsigned char c = *p++; + + if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') { + continue; + } + + int value = decode_base64(c); + + if (value == -1) { + *out++ = MBFL_BAD_INPUT; + } else { + bits += 6; + cache = (cache << 6) | (value & 0x3F); + if (bits == 24) { + *out++ = (cache >> 16) & 0xFF; + *out++ = (cache >> 8) & 0xFF; + *out++ = cache & 0xFF; + bits = cache = 0; + } + } + } + + if (p == e) { + if (bits) { + if (bits == 18) { + *out++ = (cache >> 10) & 0xFF; + *out++ = (cache >> 2) & 0xFF; + } else if (bits == 12) { + *out++ = (cache >> 4) & 0xFF; + } + } + } else { + *state = (cache << 8) | (bits & 0xFF); + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_base64(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned int bits = (buf->state & 0x3) * 8; + unsigned int chars_output = ((buf->state >> 2) & 0x3F) * 4; + unsigned int cache = buf->state >> 8; + + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + /* Every 3 bytes of input converts to 4 bytes of output... 
but if the number of input + * bytes is not a multiple of 3, we still pad the output out to a multiple of 4 + * That's `(len + 2) * 4 / 3`, to calculate the amount of space needed in the output buffer + * + * But also, we add a CR+LF line ending (2 bytes) for every 76 bytes of output + * That means we must multiply the above number by 78/76 + * Use `zend_safe_address_guarded` to check that the multiplication doesn't overflow + * + * And since we may enter this function multiple times when converting a large string, and + * we might already be close to where a CR+LF needs to be emitted, make space for an extra + * CR+LF pair in the output buffer */ + MB_CONVERT_BUF_ENSURE(buf, out, limit, (zend_safe_address_guarded(len + (bits / 8), 26, 52) / 19) + 2); + + while (len--) { + uint32_t w = *in++; + cache = (cache << 8) | (w & 0xFF); + bits += 8; + if (bits == 24) { + if (chars_output > 72) { + out = mb_convert_buf_add2(out, '\r', '\n'); + chars_output = 0; + } + out = mb_convert_buf_add4(out, + mbfl_base64_table[(cache >> 18) & 0x3F], + mbfl_base64_table[(cache >> 12) & 0x3F], + mbfl_base64_table[(cache >> 6) & 0x3F], + mbfl_base64_table[cache & 0x3F]); + chars_output += 4; + bits = cache = 0; + } + } + + if (end && bits) { + if (bits == 8) { + out = mb_convert_buf_add4(out, mbfl_base64_table[(cache >> 2) & 0x3F], mbfl_base64_table[(cache & 0x3) << 4], '=', '='); + } else { + out = mb_convert_buf_add4(out, mbfl_base64_table[(cache >> 10) & 0x3F], mbfl_base64_table[(cache >> 4) & 0x3F], mbfl_base64_table[(cache & 0xF) << 2], '='); + } + } else { + buf->state = (cache << 8) | (((chars_output / 4) & 0x3F) << 2) | ((bits / 8) & 0x3); + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/base64_encoding.phpt b/ext/mbstring/tests/base64_encoding.phpt new file mode 100644 index 0000000000000..01657cb356938 --- /dev/null +++ b/ext/mbstring/tests/base64_encoding.phpt @@ -0,0 +1,51 @@ +--TEST-- +Temporary test of mbstring's Base64 'encoding' 
+--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECTF-- +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead in %s +Done! From 8804db31fc03422c2d7cfc2673030601af70db35 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 23 Apr 2022 22:07:12 +0200 Subject: [PATCH 15/25] Simplify code for converting UTF-8 An overly complex boolean test was used to check if a 3-byte code unit was valid. Convert it to an equivalent test with fewer terms. 
--- ext/mbstring/libmbfl/filters/mbfilter_utf8.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 3705259b452b9..fc18076ba68b3 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -240,7 +240,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf if ((e - p) >= 2) { unsigned char c2 = *p++; unsigned char c3 = *p++; - if ((c2 & 0xC0) != 0x80 || !((c2 >= 0x80 && c2 <= 0xBF) && ((c == 0xE0 && c2 >= 0xA0) || (c == 0xED && c2 < 0xA0) || (c > 0xE0 && c != 0xED)))) { + if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { *out++ = MBFL_BAD_INPUT; p -= 2; } else if ((c3 & 0xC0) != 0x80) { @@ -285,7 +285,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf *out++ = MBFL_BAD_INPUT; if (p < e) { unsigned char c2 = *p; - if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || c == 0xF2 || c == 0xF3) { + if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90)) { while (p < e && (*p & 0xC0) == 0x80) { p++; } From 3bd69a6c31c0db24f626db9a0b3c53d675b125cc Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 24 Apr 2022 16:10:14 +0200 Subject: [PATCH 16/25] Implement fast text conversion interface for HTML-ENTITIES --- .../libmbfl/filters/mbfilter_htmlent.c | 153 +++++++++++++++++- ext/mbstring/tests/htmlent_encoding.phpt | 62 +++++++ 2 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 ext/mbstring/tests/htmlent_encoding.phpt diff --git a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c index bf6a0ec2cc479..9c15703177ac0 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c @@ -32,6 +32,9 @@ #include "mbfilter_htmlent.h" #include "html_entities.h" +static size_t 
mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + static const int htmlentitifieds[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -62,8 +65,8 @@ const mbfl_encoding mbfl_encoding_html_ent = { MBFL_ENCTYPE_GL_UNSAFE, &vtbl_html_wchar, &vtbl_wchar_html, - NULL, - NULL + mb_htmlent_to_wchar, + mb_wchar_to_htmlent }; const struct mbfl_convert_vtbl vtbl_wchar_html = { @@ -311,3 +314,149 @@ void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter dest->opaque = emalloc(html_enc_buffer_size+1); memcpy(dest->opaque, src->opaque, html_enc_buffer_size+1); } + +static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c == '&') { + /* Find terminating ; for HTML entity */ + unsigned char *terminator = p; + while (terminator < e && *terminator != ';') + terminator++; + if (terminator < e) { + if (*p == '#' && (e - p) >= 2) { + /* Numeric entity */ + unsigned int value = 0; + unsigned char *digits = p + 1; + if (*digits == 'x' || *digits == 'X') { + /* Hexadecimal */ + digits++; + if (digits == terminator) { + goto bad_entity; + } + while (digits < terminator) { + unsigned char digit = *digits++; + if (digit >= '0' && digit <= '9') { + value = (value * 16) + (digit - '0'); + } else if (digit >= 'A' && digit <= 'F') { + value = (value * 16) + (digit - 'A' + 10); + } else if (digit >= 'a' && digit <= 'f') { + value = (value * 16) + (digit - 'a' + 10); + } else { + goto bad_entity; + } + } + } else { + /* Decimal */ + if (digits == terminator) { + goto bad_entity; + } + while (digits < terminator) { + unsigned char digit = 
*digits++; + if (digit >= '0' && digit <= '9') { + value = (value * 10) + (digit - '0'); + } else { + goto bad_entity; + } + } + } + if (value > 0x10FFFF) { + goto bad_entity; + } + *out++ = value; + p = terminator + 1; + goto next_iteration; + } else { + /* Named entity */ + mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list; + while (entity->name) { + if (!strncmp((char*)p, entity->name, terminator - p)) { + *out++ = entity->code; + p = terminator + 1; + goto next_iteration; + } + entity++; + } + } + } + /* Either we didn't find ;, or the name of the entity was not recognized */ +bad_entity: + *out++ = '&'; + while (p < terminator && out < limit) { + *out++ = *p++; + } + if (terminator < e && out < limit) { + *out++ = *p++; + } + } else { + *out++ = c; + } + +next_iteration: ; + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + while (len--) { + uint32_t w = *in++; + + if (w < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) && htmlentitifieds[w] != 1) { + /* Fast path for most ASCII characters */ + out = mb_convert_buf_add(out, w); + } else { + out = mb_convert_buf_add(out, '&'); + + /* See if there is a matching named entity */ + mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list; + while (entity->name) { + if (w == entity->code) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 1 + strlen(entity->name)); + for (char *str = entity->name; *str; str++) { + out = mb_convert_buf_add(out, *str); + } + out = mb_convert_buf_add(out, ';'); + goto next_iteration; + } + entity++; + } + + /* There is no matching named entity; emit a numeric entity instead */ + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 12); + out = mb_convert_buf_add(out, '#'); + + if (!w) { + out = mb_convert_buf_add(out, 
'0'); + } else { + unsigned char buf[12]; + unsigned char *converted = buf + sizeof(buf); + while (w) { + *(--converted) = "0123456789"[w % 10]; + w /= 10; + } + while (converted < buf + sizeof(buf)) { + out = mb_convert_buf_add(out, *converted++); + } + } + + out = mb_convert_buf_add(out, ';'); + } + +next_iteration: ; + } + + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/htmlent_encoding.phpt b/ext/mbstring/tests/htmlent_encoding.phpt new file mode 100644 index 0000000000000..7f92396742a56 --- /dev/null +++ b/ext/mbstring/tests/htmlent_encoding.phpt @@ -0,0 +1,62 @@ +--TEST-- +Temporary test of mbstring's HTML-ENTITIES 'encoding' +--EXTENSIONS-- +mbstring +--FILE-- +', '&<>'); + +convertFromEntities('"', '"'); +convertFromEntities("\xC3\xAC", 'ì'); + +convertFromEntities('あ', 'あ'); +testConversion('あ', 'あ'); +testConversion('abcあxyz', 'abcあxyz'); + +convertFromEntities('&#x;', '&#x;'); +convertFromEntities('&#;', '&#;'); +convertFromEntities('&#', '&#'); +convertFromEntities('&', '&'); + +convertFromEntities("\x00", '�'); + +testConversion(str_repeat('あ', 100), str_repeat('あ', 100)); + +echo "Done!\n"; +?> +--EXPECTF-- +Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s + +Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s + +Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s + +Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s + +Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring 
is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s + +Deprecated: mb_convert_encoding(): Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead in %s +Done! From e055b42ff8cd021b3f72ba91159ccfa240907917 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 25 Apr 2022 17:56:06 +0200 Subject: [PATCH 17/25] Implement fast text conversion interface for QPrint --- .../libmbfl/filters/mbfilter_qprint.c | 141 +++++++++++++++--- ext/mbstring/tests/qprint_encoding.phpt | 34 +++++ 2 files changed, 154 insertions(+), 21 deletions(-) create mode 100644 ext/mbstring/tests/qprint_encoding.phpt diff --git a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c index 8f3aa16b9c579..730a7316cc7a5 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c @@ -31,6 +31,9 @@ #include "mbfilter_qprint.h" #include "unicode_prop.h" +static size_t mb_qprint_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); +static void mb_wchar_to_qprint(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); + static const char *mbfl_encoding_qprint_aliases[] = {"qprint", NULL}; const mbfl_encoding mbfl_encoding_qprint = { @@ -42,8 +45,8 @@ const mbfl_encoding mbfl_encoding_qprint = { MBFL_ENCTYPE_GL_UNSAFE, NULL, NULL, - NULL, - NULL + mb_qprint_to_wchar, + mb_wchar_to_qprint }; const struct mbfl_convert_vtbl vtbl_8bit_qprint = { @@ -153,6 +156,25 @@ int mbfl_filt_conv_qprintenc_flush(mbfl_convert_filter *filter) return 0; } +static int hex2code_map[] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + 
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + /* * Quoted-Printable => any */ @@ -160,25 +182,6 @@ int mbfl_filt_conv_qprintdec(int c, mbfl_convert_filter *filter) { int n, m; - static int hex2code_map[] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, - -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1 - }; - switch (filter->status) { case 1: if (hex2code_map[c & 0xff] >= 0) { @@ -242,3 +245,99 @@ int mbfl_filt_conv_qprintdec_flush(mbfl_convert_filter *filter) return 0; } + +static size_t mb_qprint_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) +{ + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize - 2; + + while (p < e && out < limit) { + unsigned char c = *p++; + + if (c == '=' && p < e) { + unsigned char c2 = *p++; + + if (hex2code_map[c2] >= 0 && p < e) { + unsigned char c3 = *p++; + + if (hex2code_map[c3] >= 0) { + *out++ = hex2code_map[c2] << 4 | hex2code_map[c3]; + } else { + *out++ = '='; + *out++ = c2; + *out++ = c3; + } + } else if (c2 == '\r' && p < e) { + unsigned char c3 = *p++; + + if (c3 != '\n') { + *out++ = c3; + } + } else if (c2 != '\n') { + *out++ = '='; + *out++ = c2; + } + } else { + *out++ = c; + } + } + + *in_len = e - p; + *in = p; + return out - buf; +} + +static unsigned char qprint_enc_nibble(unsigned char nibble) +{ + if (nibble < 10) { + return nibble + '0'; + } else { + return nibble - 10 + 'A'; + } +} + +static void mb_wchar_to_qprint(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + + unsigned int chars_output = buf->state; + + while (len--) { + /* We assume that all the input 'codepoints' are not really Unicode codepoints at all, + * but raw bytes from 0x00-0xFF */ + uint32_t w = *in++; + + /* QPrint actually mandates that line length should not be more than 76 characters, + * but mbstring stops slightly short of that */ + if (chars_output >= 72) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add3(out, '=', '\r', '\n'); + chars_output = 0; + } + + if (!w) { + out = mb_convert_buf_add(out, '\0'); + chars_output = 0; + } else if (w == '\n') { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len 
+ 2); + out = mb_convert_buf_add2(out, '\r', '\n'); + chars_output = 0; + } else if (w == '\r') { + /* No output */ + } else if (w >= 0x80 || w == '=') { + /* Not ASCII */ + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); + out = mb_convert_buf_add3(out, '=', qprint_enc_nibble((w >> 4) & 0xF), qprint_enc_nibble(w & 0xF)); + chars_output += 3; + } else { + /* Plain ASCII */ + out = mb_convert_buf_add(out, w); + chars_output++; + } + } + + buf->state = chars_output; + MB_CONVERT_BUF_STORE(buf, out, limit); +} diff --git a/ext/mbstring/tests/qprint_encoding.phpt b/ext/mbstring/tests/qprint_encoding.phpt new file mode 100644 index 0000000000000..62d43cdd743b1 --- /dev/null +++ b/ext/mbstring/tests/qprint_encoding.phpt @@ -0,0 +1,34 @@ +--TEST-- +Temporary test of mbstring's QPrint 'encoding' +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECTF-- +Deprecated: mb_convert_encoding(): Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead in %s + +Deprecated: mb_convert_encoding(): Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead in %s +Done! 
From 7d37ba698de65f52ae2a240c759c87790e94920a Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 2 May 2022 22:34:20 +0200 Subject: [PATCH 18/25] For JIS/ISO-2022-JP, treat a truncated escape sequence as error --- ext/mbstring/libmbfl/filters/mbfilter_jis.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_jis.c b/ext/mbstring/libmbfl/filters/mbfilter_jis.c index 3b7f1f8586fa0..54f5c925ab9d7 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_jis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_jis.c @@ -266,8 +266,9 @@ mbfl_filt_conv_jis_wchar(int c, mbfl_convert_filter *filter) static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter) { - if ((filter->status & 0xF) == 1) { - /* 2-byte (JIS X 0208 or 0212) character was truncated */ + if (filter->status & 0xF) { + /* 2-byte (JIS X 0208 or 0212) character was truncated, + * or else escape sequence was truncated */ CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); } return 0; From 0c339e8f6886c320629453d0004e11ba28925432 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Tue, 26 Apr 2022 22:23:22 +0200 Subject: [PATCH 19/25] Add more tests for mbstring encoding conversion When testing the preceding commits, I used a script to generate a large number of random strings and try to find strings which would yield different outputs from the new and old encoding conversion code. Some were found. In most cases, analysis revealed that the new code was correct and the old code was not. In all cases where the new code was incorrect, regression tests were added. However, there may be some value in adding regression tests for cases where the old code was incorrect as well. That is done here. This does not cover every case where the new and old code yielded different results. Some of them were very obscure, and it is proving difficult even to reproduce them (since I did not keep a record of all the input strings which triggered the differing output). 
--- ext/mbstring/tests/cp5022x_encoding.phpt | 9 +++++++-- ext/mbstring/tests/iso2022jp_2004_encoding.phpt | 15 +++++++++++++++ ext/mbstring/tests/iso2022jp_encoding.phpt | 3 +++ ext/mbstring/tests/iso2022jp_kddi_encoding.phpt | 10 ++++++++++ ext/mbstring/tests/iso2022kr_encoding.phpt | 8 ++++++++ ext/mbstring/tests/ucs4_encoding.phpt | 6 ++++++ 6 files changed, 49 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/tests/cp5022x_encoding.phpt b/ext/mbstring/tests/cp5022x_encoding.phpt index cde33e85cc670..370238047232b 100644 --- a/ext/mbstring/tests/cp5022x_encoding.phpt +++ b/ext/mbstring/tests/cp5022x_encoding.phpt @@ -366,9 +366,14 @@ foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) { echo "Invalid escape sequences OK\n"; -// Regression test +// Regression tests if (mb_convert_encoding("\x1BC\xF5", 'UTF-16BE', 'CP50221') !== "\x00%\x00C\x00%") - die("Bad") + die("Bad"); + +// Previously, the CP50220 implementation would eat trailing null bytes +$converted = mb_convert_encoding("ab\x00", 'UTF-16BE', 'CP50220'); +if ($converted !== "\x00a\x00b\x00\x00") + die("Bad handling of trailing null byte (got " . bin2hex($converted) . 
")"); ?> --EXPECT-- diff --git a/ext/mbstring/tests/iso2022jp_2004_encoding.phpt b/ext/mbstring/tests/iso2022jp_2004_encoding.phpt index 68dd817aec3c3..5d7672dc9f10e 100644 --- a/ext/mbstring/tests/iso2022jp_2004_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_2004_encoding.phpt @@ -320,6 +320,21 @@ for ($i = 0; $i < 100; $i++) { testValid($testString, $convertsTo, false); } +// Regression test: Test handling of 0x80-0x9F; these have a special meaning in EUC-JP-2004, +// but not in ISO-2022-JP-2004 +for ($i = 0x80; $i <= 0x9F; $i++) + convertInvalidString(chr($i), "%", "ISO-2022-JP-2004", "UTF-8"); + +// Regression test: Codepoint which has a special representation in EUC-JP-2004 +convertInvalidString("\xFF\x95", "%", "UTF-16BE", "ISO-2022-JP-2004"); + +// Regression test: Old implementation did not switch properly between JIS X 0213 plane 1 +// and plane 2 +// So try a character which is in plane 1 followed by one in plane 2 +testValidString("\x30\x00\x4E\x02", "\x1B\$(Q\x21\x21\x1B\$(P\x21\x22\x1B(B", "UTF-16BE", "ISO-2022-JP-2004"); +// Try plane 2 followed by plane 1 +testValidString("\x4E\x02\x30\x00", "\x1B\$(P\x21\x22\x1B\$(Q\x21\x21\x1B(B", "UTF-16BE", "ISO-2022-JP-2004"); + // Test "long" illegal character markers mb_substitute_character("long"); convertInvalidString("\xE0", "%", "ISO-2022-JP-2004", "UTF-8"); diff --git a/ext/mbstring/tests/iso2022jp_encoding.phpt b/ext/mbstring/tests/iso2022jp_encoding.phpt index ea515cb436f85..166470a44cf5a 100644 --- a/ext/mbstring/tests/iso2022jp_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_encoding.phpt @@ -192,6 +192,9 @@ for ($i = 0; $i <= 0xFF; $i++) { identifyInvalidString($escapeSequence, 'ISO-2022-JP'); } } +/* Also try a bare ESC */ +identifyInvalidString("\x1B", 'JIS'); +identifyInvalidString("\x1B", 'ISO-2022-JP'); echo "All escape sequences work as expected\n"; diff --git a/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt b/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt index 
87ebdb1cc86d9..0687056982811 100644 --- a/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt @@ -209,6 +209,16 @@ testValidString("\x30\x00\x00A", "\x1B\$B\x21\x21\x1B(BA", "UTF-16BE", "ISO-2022 // Switch from JISX 0208 Kanji to JISX 0201 Kana testValidString("\x30\x00\xFF\x67", "\x1B\$B\x21\x21\x1B(I'\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false); +/* Convert Unicode flag emoji to ISO-2022-JP-KDDI proprietary flag emoji + * I am not able to confirm that the kuten codes we are using for these proprietary emoji are the right ones + * (There doesn't seem to be any publicly available reference, and I don't have a legacy KDDI device) + * + * However, the conversion does not work in the opposite direction; this is because of the test + * `if (s >= (84 * 94) && s < (91 * 94))`, which the kuten code which we are using for flag emoji doesn't match + * That test is inherited from the old implementation (from libmbfl), and I have no way to confirm that + * changing it won't break anything */ +testValidString("\x00\x01\xF1\xF0\x00\x01\xF1\xF7", "\x1B\$B\x70\x55\x1B(B", "UTF-32BE", "ISO-2022-JP-KDDI", false); + echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n"; testValidString("\x00\xA5", "\x1B\$B!o\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false); diff --git a/ext/mbstring/tests/iso2022kr_encoding.phpt b/ext/mbstring/tests/iso2022kr_encoding.phpt index 62bcc9de55f56..67adb103fce73 100644 --- a/ext/mbstring/tests/iso2022kr_encoding.phpt +++ b/ext/mbstring/tests/iso2022kr_encoding.phpt @@ -106,6 +106,14 @@ convertValidString("\x76\x20\x00a\x00b", "\x1B$)C\x0E\x74\x30\x0Fab", "UTF-16BE" // 0x7E, resulting in a failed assertion convertInvalidString("\x0E~/", "%", "ISO-2022-KR", "UTF-8"); +// Regression test: The old implementation would wrongly convert some codepoints +// which are not in KS X 1001 at all to 'random' characters in KS X 1001 +convertInvalidString("\xFF\x86", "\x1B\$)C%", "UTF-16BE",
"ISO-2022-KR"); + +// Regression test: The old implementation would sometimes emit an extra 0x0F ('shift in') +// character at the end of a string, although the string was already ending in ASCII mode +convertValidString("\x68\x46\x00a", "\x1B\$)C\x0E\x68\x46\x0Fa", "UTF-16BE", "ISO-2022-KR", false); + // Test "long" illegal character markers mb_substitute_character("long"); convertInvalidString("\x1B", "%", "ISO-2022-KR", "UTF-8"); diff --git a/ext/mbstring/tests/ucs4_encoding.phpt b/ext/mbstring/tests/ucs4_encoding.phpt index 973e81b99a6b0..637123b258144 100644 --- a/ext/mbstring/tests/ucs4_encoding.phpt +++ b/ext/mbstring/tests/ucs4_encoding.phpt @@ -12,8 +12,14 @@ testValidString("\x00\x00\xFE\xFF\x00\x00\x30\x01", "\x30\x01", "UCS-4", "UTF-16 testValidString("\x02\x30\x00\x00", "\x30\x02", "UCS-4LE", "UTF-16BE"); testValidString("\x00\x00\x30\x03", "\x30\x03", "UCS-4BE", "UTF-16BE"); +// Truncated input convertInvalidString("\x01\x02\x03", "%", "UCS-4", "UTF-8"); + +// Codepoint above U+10FFFF +convertInvalidString("\x00\x11\x00\x00", "%", "UCS-4BE", "UTF-8"); +convertInvalidString("\x00\x00\x11\x00", "%", "UCS-4LE", "UTF-8"); + // Test "long" illegal character markers mb_substitute_character("long"); convertInvalidString("\x6F\x00\x00\x00", "U+6F000000", "UCS-4BE", "UTF-8"); From 38c4193665ce5eeef27acd49057af20b2e3ae8ec Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 8 May 2022 12:50:49 +0200 Subject: [PATCH 20/25] Add assertions to help catch buffer overflows in mbstring text conversion code --- ext/mbstring/libmbfl/mbfl/mbfl_convert.c | 1 + ext/mbstring/libmbfl/mbfl/mbfl_encoding.h | 1 + 2 files changed, 2 insertions(+) diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c index dc76f67964edd..2061f63c0e9f7 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c @@ -360,6 +360,7 @@ zend_string* mb_fast_convert(zend_string *str, const mbfl_encoding *from, const
while (in_len) { size_t out_len = from->to_wchar(&in, &in_len, wchar_buf, 128, &state); + ZEND_ASSERT(out_len <= 128); to->from_wchar(wchar_buf, out_len, &buf, !in_len); } diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index 9f6b1e4bf99f2..ae3b9abbb5d7b 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -154,6 +154,7 @@ static inline void mb_convert_buf_init(mb_convert_buf *buf, size_t initsize, uin } #define MB_CONVERT_BUF_ENSURE(buf, out, limit, needed) \ + ZEND_ASSERT(out <= limit); \ if ((limit - out) < (needed)) { \ size_t oldsize = limit - (unsigned char*)ZSTR_VAL(buf->str); \ size_t newsize = oldsize + MAX(oldsize >> 1, needed); \ From fd35531d09806ee11d71d4ce9b5ec75b775e4aa3 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 8 May 2022 14:59:27 +0200 Subject: [PATCH 21/25] Fix buffer overflow bugs in UTF-7 text conversion After Nikita Popov found a buffer overrun bug in one of my pull requests, I was prompted to add more assertions in a38c7e5703 to help me catch such bugs myself more easily in testing. Wouldn't you just know it... as soon as I added those assertions, the mbstring test suite caught another buffer overrun bug in my UTF-7 conversion code, which I wrote the better part of a year ago. Then, when I started fuzzing the code with libfuzzer, I found and fixed another buffer overflow: If we enter the main loop, which normally outputs 3 decoded Base64 characters, where the first half of a surrogate pair had appeared at the end of the previous run, but the second half does not appear on this run, we need to output one error marker. Then, at the end of the main loop, if the Base64 input ends at an unexpected position AND the last character was not a legal Base64-encoded character, we need to output two error markers for that. The three error markers plus two valid, decoded bytes can push us over the available space in our wchar buffer. 
--- ext/mbstring/libmbfl/filters/mbfilter_utf7.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c index 7f843f88bb603..45129a215aac4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c @@ -478,7 +478,7 @@ static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { - ZEND_ASSERT(bufsize >= 4); /* This function will infinite-loop if called with a tiny output buffer */ + ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */ unsigned char *p = *in, *e = p + *in_len; uint32_t *out = buf, *limit = buf + bufsize; @@ -489,7 +489,7 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf while (p < e && out < limit) { if (base64) { /* Base64 section */ - if ((limit - out) < 4) { + if ((limit - out) < 5) { break; } @@ -631,16 +631,19 @@ static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_ENSURE(buf, out, limit, len); RESTORE_CONVERSION_STATE(); } else { - /* Encode codepoint, preceded by any cached bits, as Base64 */ + /* Encode codepoint, preceded by any cached bits, as Base64 + * Make enough space in the output buffer to hold both any bytes that + * we emit right here, plus any finishing byte which might need to + * be emitted if the input string ends abruptly */ uint64_t bits; if (w >= MBFL_WCSPLANE_SUPMIN) { /* Must use surrogate pair */ - MB_CONVERT_BUF_ENSURE(buf, out, limit, 6); + MB_CONVERT_BUF_ENSURE(buf, out, limit, 7); w -= 0x10000; bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((w & 0xFFC00) << 6) | (w & 0x3FF); nbits += 32; } else { - MB_CONVERT_BUF_ENSURE(buf, out, limit, 3); + MB_CONVERT_BUF_ENSURE(buf, out, limit, 4); bits = 
(cache << 16) | w; nbits += 16; } From 35e47686a99d6177d4d6a6c84638bd1f3f7cdee3 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Fri, 13 May 2022 21:30:51 +0200 Subject: [PATCH 22/25] Use fast text conversion filters to implement php_mb_convert_encoding_ex --- ext/mbstring/libmbfl/mbfl/mbfl_convert.c | 4 +-- ext/mbstring/libmbfl/mbfl/mbfl_convert.h | 2 +- ext/mbstring/mbstring.c | 40 +++++------------------- sapi/fuzzer/fuzzer-mbstring.c | 3 +- 4 files changed, 12 insertions(+), 37 deletions(-) diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c index 2061f63c0e9f7..c6f75893ed738 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c @@ -348,11 +348,9 @@ int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter) return 0; } -zend_string* mb_fast_convert(zend_string *str, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors) +zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors) { uint32_t wchar_buf[128]; - unsigned char *in = (unsigned char*)ZSTR_VAL(str); - size_t in_len = ZSTR_LEN(str); unsigned int state = 0; mb_convert_buf buf; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_convert.h b/ext/mbstring/libmbfl/mbfl/mbfl_convert.h index 12c12589b0a57..e6353b1625b7e 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_convert.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_convert.h @@ -81,7 +81,7 @@ MBFLAPI extern int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter); MBFLAPI extern void mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src); MBFLAPI extern int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p); -MBFLAPI extern zend_string* mb_fast_convert(zend_string *str, const mbfl_encoding *from, const mbfl_encoding *to, 
uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors); +MBFLAPI extern zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors); MBFLAPI extern void mb_illegal_output(uint32_t bad_cp, mb_from_wchar_fn fn, mb_convert_buf* buf); #endif /* MBFL_CONVERT_H */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index f43a0131f74fd..02a6ebd961613 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -2353,39 +2353,15 @@ static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc) MBSTRING_API char *php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len) { - mbfl_string string, result, *ret; - mbfl_buffer_converter *convd; - char *output = NULL; - - if (output_len) { - *output_len = 0; - } + unsigned int num_errors = 0; + zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors); - /* initialize string */ - string.encoding = from_encoding; - string.val = (unsigned char *)input; - string.len = length; - - /* initialize converter */ - convd = mbfl_buffer_converter_new(from_encoding, to_encoding, string.len); - /* If this assertion fails this means some memory allocation failure which is a bug */ - ZEND_ASSERT(convd != NULL); + MBSTRG(illegalchars) += num_errors; + *output_len = ZSTR_LEN(result); - mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - - /* do it */ - mbfl_string_init(&result); - ret = mbfl_buffer_converter_feed_result(convd, &string, &result); - if (ret) { - if (output_len) { - *output_len = ret->len; - } - output = (char 
*)ret->val; - } - - MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd); - mbfl_buffer_converter_delete(convd); + char *output = emalloc(ZSTR_LEN(result) + 1); + memcpy(output, ZSTR_VAL(result), ZSTR_LEN(result) + 1); + efree(result); return output; } /* }}} */ @@ -2573,7 +2549,7 @@ PHP_FUNCTION(mb_convert_encoding) const mbfl_encoding *from_encoding = from_encodings[0]; if (from_encoding->to_wchar && to_encoding->from_wchar) { unsigned int num_errors = 0; - RETVAL_STR(mb_fast_convert(input_str, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors)); + RETVAL_STR(mb_fast_convert((unsigned char*)ZSTR_VAL(input_str), ZSTR_LEN(input_str), from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors)); MBSTRG(illegalchars) += num_errors; goto out; } diff --git a/sapi/fuzzer/fuzzer-mbstring.c b/sapi/fuzzer/fuzzer-mbstring.c index 9294e71dd7a6a..05a0689873cab 100644 --- a/sapi/fuzzer/fuzzer-mbstring.c +++ b/sapi/fuzzer/fuzzer-mbstring.c @@ -51,7 +51,8 @@ int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { return 0; } - char *Result = php_mb_convert_encoding_ex((char *) Data, Size, ToEncoding, FromEncoding, NULL); + size_t output_len; + char *Result = php_mb_convert_encoding_ex((char *) Data, Size, ToEncoding, FromEncoding, &output_len); efree(Result); efree(ToEncodingName); efree(FromEncodingName); From 7d716b8159acc706db44b0decbb800ce01b79b67 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 22 May 2022 18:00:01 +0200 Subject: [PATCH 23/25] Fix buffer overflow bug in HZ text conversion code --- ext/mbstring/libmbfl/filters/mbfilter_hz.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_hz.c b/ext/mbstring/libmbfl/filters/mbfilter_hz.c index e10109bc33d8f..faf4ab9c841a2 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_hz.c +++ 
b/ext/mbstring/libmbfl/filters/mbfilter_hz.c @@ -372,7 +372,7 @@ static void mb_wchar_to_hz(uint32_t *in, size_t len, mb_convert_buf *buf, bool e } else if (s < 0x80) { /* ASCII */ if (buf->state != ASCII) { - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); out = mb_convert_buf_add2(out, '~', '}'); buf->state = ASCII; } @@ -385,11 +385,12 @@ static void mb_wchar_to_hz(uint32_t *in, size_t len, mb_convert_buf *buf, bool e } else { /* GB 2312-80 */ if (buf->state != GB2312) { - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); out = mb_convert_buf_add2(out, '~', '{'); buf->state = GB2312; + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); } - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); out = mb_convert_buf_add2(out, (s >> 8) & 0x7F, s & 0x7F); } } From 0eb000f24da0eec9baf274e615bdef2a9e2f9330 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Mon, 23 May 2022 10:03:36 +0200 Subject: [PATCH 24/25] Fix buffer overflow bugs in CP50222 text conversion code --- ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c index 18686e8bf1ed1..600bcc08ea540 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c @@ -991,7 +991,7 @@ static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, b out = mb_convert_buf_add(out, s - 0x80); } else if (s <= 0x927E) { /* JISX 0208 Kanji */ - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 6); if (buf->state == JISX_0201_KANA) { out = mb_convert_buf_add(out, 0xF); } @@ -1002,7 +1002,7 @@ static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, b out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); } else if (s >= 
0x10000) { /* JISX 0201 Latin; we 'tag' these by adding 0x10000 */ - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5); if (buf->state == JISX_0201_KANA) { out = mb_convert_buf_add(out, 0xF); } From df05cdea6316c12194ef08da2922c9356ed94de4 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 28 May 2022 17:23:54 +0200 Subject: [PATCH 25/25] php_mb_convert_encoding{,_ex} returns zend_string That's what all existing callers want anyways. This avoids 2 unnecessary copies of the converted string. --- ext/mbstring/mbstring.c | 72 +++++++---------------------------- ext/mbstring/mbstring.h | 8 ++-- sapi/fuzzer/fuzzer-mbstring.c | 5 +-- 3 files changed, 19 insertions(+), 66 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 02a6ebd961613..06170e0dd916e 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -2351,30 +2351,18 @@ static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc) return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb); } -MBSTRING_API char *php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len) +MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding) { unsigned int num_errors = 0; zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors); - MBSTRG(illegalchars) += num_errors; - *output_len = ZSTR_LEN(result); - - char *output = emalloc(ZSTR_LEN(result) + 1); - memcpy(output, ZSTR_VAL(result), ZSTR_LEN(result) + 1); - efree(result); - return output; + return result; } -/* }}} */ -/* {{{ MBSTRING_API char *php_mb_convert_encoding() */ -MBSTRING_API char *php_mb_convert_encoding(const char *input, 
size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings, size_t *output_len) +MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings) { const mbfl_encoding *from_encoding; - if (output_len) { - *output_len = 0; - } - /* pre-conversion encoding */ ZEND_ASSERT(num_from_encodings >= 1); if (num_from_encodings == 1) { @@ -2393,9 +2381,8 @@ MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, con } } - return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding, output_len); + return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding); } -/* }}} */ MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings) { @@ -2403,8 +2390,6 @@ MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, cons zend_long idx; zend_string *key; zval *entry, entry_tmp; - size_t ckey_len, cval_len; - char *ckey, *cval; if (!input) { return NULL; @@ -2420,22 +2405,14 @@ MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, cons ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) { /* convert key */ if (key) { - ckey = php_mb_convert_encoding( - ZSTR_VAL(key), ZSTR_LEN(key), - to_encoding, from_encodings, num_from_encodings, &ckey_len); - key = zend_string_init(ckey, ckey_len, 0); - efree(ckey); + key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings); } /* convert value */ ZEND_ASSERT(entry); try_again: switch(Z_TYPE_P(entry)) { case IS_STRING: - cval = php_mb_convert_encoding( - Z_STRVAL_P(entry), Z_STRLEN_P(entry), - to_encoding, from_encodings, num_from_encodings, &cval_len); - ZVAL_STRINGL(&entry_tmp, cval, cval_len); - efree(cval); + 
ZVAL_STR(&entry_tmp, php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings)); break; case IS_NULL: case IS_TRUE: @@ -2545,23 +2522,9 @@ PHP_FUNCTION(mb_convert_encoding) } if (input_str) { - if (num_from_encodings == 1) { - const mbfl_encoding *from_encoding = from_encodings[0]; - if (from_encoding->to_wchar && to_encoding->from_wchar) { - unsigned int num_errors = 0; - RETVAL_STR(mb_fast_convert((unsigned char*)ZSTR_VAL(input_str), ZSTR_LEN(input_str), from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors)); - MBSTRG(illegalchars) += num_errors; - goto out; - } - } - - size_t size; - char *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), - to_encoding, from_encodings, num_from_encodings, &size); + zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings); if (ret != NULL) { - // TODO: avoid reallocation ??? 
- RETVAL_STRINGL(ret, size); /* the string is already strdup()'ed */ - efree(ret); + RETVAL_STR(ret); } else { RETVAL_FALSE; } @@ -2572,7 +2535,6 @@ PHP_FUNCTION(mb_convert_encoding) RETVAL_ARR(tmp); } -out: if (free_from_encodings) { efree(ZEND_VOIDP(from_encodings)); } @@ -4135,20 +4097,16 @@ static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint3 buf[2] = (cp >> 8) & 0xff; buf[3] = cp & 0xff; - size_t ret_len; long orig_illegalchars = MBSTRG(illegalchars); MBSTRG(illegalchars) = 0; - char *ret_str = php_mb_convert_encoding_ex(buf, 4, enc, &mbfl_encoding_ucs4be, &ret_len); + ret = php_mb_convert_encoding_ex(buf, 4, enc, &mbfl_encoding_ucs4be); + if (MBSTRG(illegalchars) != 0) { - efree(ret_str); - MBSTRG(illegalchars) = orig_illegalchars; - return NULL; + zend_string_release(ret); + ret = NULL; } - ret = zend_string_init(ret_str, ret_len, 0); - efree(ret_str); MBSTRG(illegalchars) = orig_illegalchars; - return ret; } @@ -4192,11 +4150,7 @@ PHP_FUNCTION(mb_scrub) RETURN_THROWS(); } - size_t ret_len; - char *ret = php_mb_convert_encoding_ex(str, str_len, enc, enc, &ret_len); - - RETVAL_STRINGL(ret, ret_len); - efree(ret); + RETURN_STR(php_mb_convert_encoding_ex(str, str_len, enc, enc)); } /* }}} */ diff --git a/ext/mbstring/mbstring.h b/ext/mbstring/mbstring.h index 82b28dbef00e4..e78e9087c2517 100644 --- a/ext/mbstring/mbstring.h +++ b/ext/mbstring/mbstring.h @@ -55,12 +55,12 @@ PHP_MINFO_FUNCTION(mbstring); MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc); -MBSTRING_API char *php_mb_convert_encoding_ex( +MBSTRING_API zend_string* php_mb_convert_encoding_ex( const char *input, size_t length, - const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding, size_t *output_len); -MBSTRING_API char * php_mb_convert_encoding( + const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding); +MBSTRING_API zend_string* php_mb_convert_encoding( const char *input, size_t 
length, const mbfl_encoding *to_encoding, - const mbfl_encoding **from_encodings, size_t num_from_encodings, size_t *output_len); + const mbfl_encoding **from_encodings, size_t num_from_encodings); MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc); diff --git a/sapi/fuzzer/fuzzer-mbstring.c b/sapi/fuzzer/fuzzer-mbstring.c index 05a0689873cab..61014b2399361 100644 --- a/sapi/fuzzer/fuzzer-mbstring.c +++ b/sapi/fuzzer/fuzzer-mbstring.c @@ -51,9 +51,8 @@ int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { return 0; } - size_t output_len; - char *Result = php_mb_convert_encoding_ex((char *) Data, Size, ToEncoding, FromEncoding, &output_len); - efree(Result); + zend_string *Result = php_mb_convert_encoding_ex((char *) Data, Size, ToEncoding, FromEncoding); + zend_string_release(Result); efree(ToEncodingName); efree(FromEncodingName);