diff options
-rw-r--r-- | prism/enc/pm_encoding.h | 1 | ||||
-rw-r--r-- | prism/enc/pm_unicode.c | 104 | ||||
-rw-r--r-- | prism/prism.c | 1 | ||||
-rw-r--r-- | test/prism/encoding_test.rb | 7 |
4 files changed, 110 insertions, 3 deletions
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 59d31e26b4..ac594d97ca 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -160,6 +160,7 @@ extern pm_encoding_t pm_encoding_ascii_8bit; extern pm_encoding_t pm_encoding_big5; extern pm_encoding_t pm_encoding_big5_hkscs; extern pm_encoding_t pm_encoding_big5_uao; +extern pm_encoding_t pm_encoding_cesu_8; extern pm_encoding_t pm_encoding_cp51932; extern pm_encoding_t pm_encoding_cp850; extern pm_encoding_t pm_encoding_cp852; diff --git a/prism/enc/pm_unicode.c b/prism/enc/pm_unicode.c index 09aa907a1d..cde596c360 100644 --- a/prism/enc/pm_unicode.c +++ b/prism/enc/pm_unicode.c @@ -2344,6 +2344,100 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { } } +static pm_unicode_codepoint_t +pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { + if (b[0] < 0x80) { + *width = 1; + return (pm_unicode_codepoint_t) b[0]; + } + + if (n > 1 && b[0] >= 0xC2 && b[0] <= 0xDF && b[1] >= 0x80 && b[1] <= 0xBF) { + *width = 2; + + // 110xxxxx 10xxxxxx + return (pm_unicode_codepoint_t) (((b[0] & 0x1F) << 6) | (b[1] & 0x3F)); + } + + if (n > 5 && b[0] == 0xED && b[1] >= 0xA0 && b[1] <= 0xAF && b[2] >= 0x80 && b[2] <= 0xBF && b[3] == 0xED && b[4] >= 0xB0 && b[4] <= 0xBF && b[5] >= 0x80 && b[5] <= 0xBF) { + *width = 6; + + // 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx + return (pm_unicode_codepoint_t) (0x10000 + (((b[1] & 0xF) << 16) | ((b[2] & 0x3F) << 10) | ((b[4] & 0xF) << 6) | (b[5] & 0x3F))); + } + + if (n > 2 && b[0] == 0xED && b[1] >= 0xA0 && b[1] <= 0xBF) { + *width = 3; + + // 11101101 1010xxxx 10xxxxx + return (pm_unicode_codepoint_t) (0x10000 + (((b[0] & 0x03) << 16) | ((b[1] & 0x3F) << 10) | (b[2] & 0x3F))); + } + + if (n > 2 && ((b[0] == 0xE0 && b[1] >= 0xA0) || (b[0] >= 0xE1 && b[0] <= 0xEF && b[1] >= 0x80)) && b[1] <= 0xBF && b[2] >= 0x80 && b[2] <= 0xBF) { + *width = 3; + + // 1110xxxx 10xxxxxx 10xxxxx + return (pm_unicode_codepoint_t) (((b[0] & 0xF) << 12) | ((b[1] & 0x3F) << 6) | (b[2] & 0x3F)); + } + + *width = 0; + return 0; +} + +static size_t +pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) { + size_t width; + pm_cesu_8_codepoint(b, n, &width); + return width; +} + +static size_t +pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_cesu_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0; + } +} + +static size_t +pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_cesu_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? width : 0; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0; + } +} + +bool +pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_cesu_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false; + } +} + #undef UNICODE_ALPHA_CODEPOINTS_LENGTH #undef UNICODE_ALNUM_CODEPOINTS_LENGTH #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH @@ -2397,3 +2491,13 @@ pm_encoding_t pm_encoding_utf8_softbank = { .isupper_char = pm_encoding_utf_8_isupper_char, .multibyte = true }; + +/** CESU-8 */ +pm_encoding_t pm_encoding_cesu_8 = { + .name = "CESU-8", + .char_width = pm_encoding_cesu_8_char_width, + .alnum_char = pm_encoding_cesu_8_alnum_char, + .alpha_char = pm_encoding_cesu_8_alpha_char, + .isupper_char = pm_encoding_cesu_8_isupper_char, + .multibyte = true +}; diff --git a/prism/prism.c b/prism/prism.c index cdada34a73..baab318b9a 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6212,6 +6212,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star ENCODING1("Big5-UAO", pm_encoding_big5_uao); break; case 'C': case 'c': + ENCODING1("CESU-8", pm_encoding_cesu_8); ENCODING1("CP437", pm_encoding_ibm437); ENCODING1("CP720", pm_encoding_ibm720); ENCODING1("CP737", pm_encoding_ibm737); diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index 3135bef824..94ba3a6c2a 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -84,7 +84,7 @@ module Prism Encoding::SJIS_DoCoMo => codepoints_2bytes, Encoding::SJIS_KDDI => codepoints_2bytes, Encoding::SJIS_SoftBank => codepoints_2bytes, - Encoding::Windows_31J => codepoints_2bytes, + Encoding::Windows_31J => codepoints_2bytes } # By default we don't test every codepoint in these encodings because they @@ -123,6 +123,7 @@ module Prism Encoding::UTF8_DoCoMo => codepoints_unicode, Encoding::UTF8_KDDI => codepoints_unicode, Encoding::UTF8_SoftBank => codepoints_unicode, + Encoding::CESU_8 => codepoints_unicode, Encoding::CP51932 => codepoints_eucjp, Encoding::EUC_JP => codepoints_eucjp, Encoding::EUCJP_MS => codepoints_eucjp, @@ -131,7 +132,7 @@ module Prism Encoding::STATELESS_ISO_2022_JP => codepoints_emacs_mule, Encoding::STATELESS_ISO_2022_JP_KDDI => codepoints_emacs_mule, Encoding::GB18030 => codepoints_gb18030, - Encoding::EUC_TW => codepoints_euc_tw, + Encoding::EUC_TW => codepoints_euc_tw ) end @@ -258,7 +259,7 @@ module Prism # themselves as lowercase, their case fold is different. I have reported # this bug upstream. case encoding - when Encoding::UTF_8, Encoding::UTF_8_MAC + when Encoding::UTF_8, Encoding::UTF_8_MAC, Encoding::UTF8_DoCoMo, Encoding::UTF8_KDDI, Encoding::UTF8_SoftBank, Encoding::CESU_8 range = range.to_a - [ 0x01c5, 0x01c8, 0x01cb, 0x01f2, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, |