diff options
-rw-r--r-- | prism/prism.c | 346 | ||||
-rw-r--r-- | prism/util/pm_buffer.c | 16 | ||||
-rw-r--r-- | prism/util/pm_buffer.h | 3 | ||||
-rw-r--r-- | test/prism/unescape_test.rb | 17 |
4 files changed, 364 insertions, 18 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 3b4261e5ca..e4448ef394 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6064,6 +6064,340 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
     }
 }
 
+static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1; // set while reading a \c or \C- escape
+static const uint8_t PM_ESCAPE_FLAG_META = 0x2; // set while reading a \M- escape
+static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4; // passed by lex_question_mark for ?x character literals
+
+// Lookup table for whether or not an ASCII character counts as printable here. Note that \t, \n, \v, \f, and \r are set and backslash (0x5C) is not.
+static const bool ascii_printable_chars[] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, // 0x00-0x0F: only \t \n \v \f \r are set
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10-0x1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20-0x2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30-0x3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40-0x4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 0x50-0x5F: backslash (0x5C) is clear
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60-0x6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 // 0x70-0x7F: DEL (0x7F) is clear
+};
+
+static inline bool
+char_is_ascii_printable(const uint8_t b) {
+    return (b < 0x80) && ascii_printable_chars[b]; // the bounds check keeps non-ASCII bytes out of the 128-entry table
+}
+
+// Return the value that a hexadecimal digit character represents. For example,
+// transform 'a' into 10, 'b' into 11, etc. The input must already be a hex digit.
+static inline uint8_t
+escape_hexadecimal_digit(const uint8_t value) {
+    return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9); // 'a'..'f' and 'A'..'F' have low three bits 1..6
+}
+
+// Scan the given number of hexadecimal digits of a Unicode escape and return
+// the value they encode. This function assumes that the characters have
+// already been validated as hexadecimal digits.
+static inline uint32_t
+escape_unicode(const uint8_t *string, size_t length) {
+    uint32_t value = 0;
+    for (size_t index = 0; index < length; index++) {
+        if (index != 0) value <<= 4;
+        value |= escape_hexadecimal_digit(string[index]);
+    }
+    return value;
+}
+
+// Apply the control flag (keep only the low five bits) and the meta flag (set the high bit) to a single byte.
+static inline uint8_t
+escape_byte(uint8_t value, const uint8_t flags) {
+    if (flags & PM_ESCAPE_FLAG_CONTROL) value &= 0x1f;
+    if (flags & PM_ESCAPE_FLAG_META) value |= 0x80;
+    return value;
+}
+
+// Append the UTF-8 encoding of a codepoint to the buffer. Values above 0x10FFFF report an error over [start, end) and append U+FFFD instead.
+static inline void
+escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *start, const uint8_t *end, uint32_t value) {
+    if (value <= 0x7F) { // 0xxxxxxx
+        pm_buffer_append_u8(buffer, (uint8_t) value);
+    } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
+        pm_buffer_append_u8(buffer, (uint8_t) (0xC0 | (value >> 6)));
+        pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
+    } else if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx
+        pm_buffer_append_u8(buffer, (uint8_t) (0xE0 | (value >> 12)));
+        pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
+        pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
+    } else if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        pm_buffer_append_u8(buffer, (uint8_t) (0xF0 | (value >> 18)));
+        pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 12) & 0x3F)));
+        pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
+        pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
+    } else {
+        pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE);
+        pm_buffer_append_u8(buffer, 0xEF); // EF BF BD is the UTF-8 encoding of U+FFFD
+        pm_buffer_append_u8(buffer, 0xBF);
+        pm_buffer_append_u8(buffer, 0xBD);
+    }
+}
+
+// Read the escape sequence at parser->current.end (just past the backslash), advance past it, and append its value to the buffer.
+static void
+escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
+    switch (peek(parser)) { // the byte at current.end selects the escape; every case consumes it itself
+        case '\\': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\\');
+            return;
+        }
+        case '\'': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\'');
+            return;
+        }
+        case 'a': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\a');
+            return;
+        }
+        case 'b': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\b');
+            return;
+        }
+        case 'e': { // \e is ESC (0x1B), which C has no standard escape for
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\033');
+            return;
+        }
+        case 'f': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\f');
+            return;
+        }
+        case 'n': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\n');
+            return;
+        }
+        case 'r': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\r');
+            return;
+        }
+        case 's': { // \s is a plain space
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, ' ');
+            return;
+        }
+        case 't': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\t');
+            return;
+        }
+        case 'v': {
+            parser->current.end++;
+            pm_buffer_append_u8(buffer, '\v');
+            return;
+        }
+        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // one to three octal digits
+            uint8_t value = *parser->current.end - '0';
+            parser->current.end++;
+
+            if (pm_char_is_octal_digit(peek(parser))) {
+                value = (uint8_t) ((value << 3) | (*parser->current.end - '0'));
+                parser->current.end++;
+
+                if (pm_char_is_octal_digit(peek(parser))) {
+                    value = (uint8_t) ((value << 3) | (*parser->current.end - '0')); // values above 0377 are truncated to one byte
+                    parser->current.end++;
+                }
+            }
+
+            pm_buffer_append_u8(buffer, value);
+            return;
+        }
+        case 'x': { // \xh or \xhh: one or two hexadecimal digits
+            parser->current.end++; // step past the 'x' first, otherwise the peek below would see 'x' itself
+            uint8_t byte = peek(parser);
+
+            if (pm_char_is_hexadecimal_digit(byte)) {
+                uint8_t value = escape_hexadecimal_digit(byte);
+                parser->current.end++;
+
+                byte = peek(parser);
+                if (pm_char_is_hexadecimal_digit(byte)) {
+                    value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(byte));
+                    parser->current.end++;
+                }
+
+                pm_buffer_append_u8(buffer, value);
+            } else {
+                pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
+            }
+
+            return;
+        }
+        case 'u': { // \uXXXX or \u{X... X...}
+            parser->current.end++;
+
+            if (
+                (parser->current.end + 4 <= parser->end) && // <= so that four digits ending exactly at end of input are accepted
+                pm_char_is_hexadecimal_digit(parser->current.end[0]) &&
+                pm_char_is_hexadecimal_digit(parser->current.end[1]) &&
+                pm_char_is_hexadecimal_digit(parser->current.end[2]) &&
+                pm_char_is_hexadecimal_digit(parser->current.end[3])
+            ) {
+                uint32_t value = escape_unicode(parser->current.end, 4);
+                escape_write_unicode(parser, buffer, parser->current.end, parser->current.end + 4, value);
+                parser->current.end += 4;
+            } else if (peek(parser) == '{') {
+                const uint8_t *unicode_codepoints_start = parser->current.end - 2; // points back at the backslash, for error reporting
+                parser->current.end++;
+
+                const uint8_t *extra_codepoints_start = NULL;
+                int codepoints_count = 0;
+
+                parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
+                while ((parser->current.end < parser->end) && (*parser->current.end != '}')) {
+                    const uint8_t *unicode_start = parser->current.end;
+                    size_t hexadecimal_length = pm_strspn_hexadecimal_digit(parser->current.end, parser->end - parser->current.end);
+
+                    if (hexadecimal_length > 6) {
+                        // \u{nnnn} allows only 1-6 hexadecimal digits; report it but keep scanning
+                        pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG);
+                    } else if (hexadecimal_length == 0) {
+                        // there are no hexadecimal digits at this position, so stop without consuming anything more
+                        pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE);
+                        return;
+                    }
+
+                    parser->current.end += hexadecimal_length;
+                    codepoints_count++;
+                    if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count == 2) {
+                        extra_codepoints_start = unicode_start; // remember where the first surplus codepoint starts
+                    }
+
+                    uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
+                    escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value);
+                    parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
+                }
+
+                // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
+                if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) {
+                    pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
+                }
+
+                if (peek(parser) == '}') {
+                    parser->current.end++;
+                } else {
+                    pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                }
+            } else {
+                pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
+            }
+
+            return;
+        }
+        case 'c': { // \cx: control character
+            parser->current.end++;
+            uint8_t peeked = peek(parser);
+
+            switch (peeked) {
+                case '?': // \c? is DEL (0x7F); applying the control mask to it would yield 0x1F
+                    parser->current.end++;
+                    pm_buffer_append_u8(buffer, escape_byte(0x7f, flags));
+                    return;
+                case '\\':
+                    if (flags & PM_ESCAPE_FLAG_CONTROL) {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
+                        return;
+                    }
+                    parser->current.end++;
+                    escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
+                    return;
+                default:
+                    if (!char_is_ascii_printable(peeked)) {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
+                        return;
+                    }
+                    parser->current.end++;
+                    pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    return;
+            }
+        }
+        case 'C': { // \C-x: control character, long form
+            parser->current.end++;
+            if (peek(parser) != '-') {
+                pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
+                return;
+            }
+
+            parser->current.end++;
+            uint8_t peeked = peek(parser);
+
+            switch (peeked) {
+                case '?': // \C-? is DEL (0x7F); applying the control mask to it would yield 0x1F
+                    parser->current.end++;
+                    pm_buffer_append_u8(buffer, escape_byte(0x7f, flags));
+                    return;
+                case '\\':
+                    if (flags & PM_ESCAPE_FLAG_CONTROL) {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
+                        return;
+                    }
+                    parser->current.end++;
+                    escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
+                    return;
+                default:
+                    if (!char_is_ascii_printable(peeked)) {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
+                        return;
+                    }
+                    parser->current.end++;
+                    pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    return;
+            }
+        }
+        case 'M': { // \M-x: meta character, sets the high bit
+            parser->current.end++;
+            if (peek(parser) != '-') {
+                pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
+                return;
+            }
+
+            parser->current.end++;
+            uint8_t peeked = peek(parser);
+
+            switch (peeked) {
+                // Unlike \c? and \C-? (which are DEL), '?' is not special
+                // after \M-: meta only sets the high bit of the character
+                // that follows, so \M-? is 0xBF and is produced by the
+                // default branch below like any other printable character.
+                case '\\':
+                    if (flags & PM_ESCAPE_FLAG_META) {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META_REPEAT);
+                        return;
+                    }
+                    parser->current.end++;
+                    escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_META);
+                    return;
+                default:
+                    if (!char_is_ascii_printable(peeked)) {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
+                        return;
+                    }
+                    parser->current.end++;
+                    pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
+                    return;
+            }
+        }
+        default: { // any other byte stands for itself; nothing is consumed at end of input
+            if (parser->current.end < parser->end) {
+                pm_buffer_append_u8(buffer, *parser->current.end++);
+            }
+            return;
+        }
+    }
+}
+
 // This function is responsible for lexing either a character literal or the ?
 // operator. The supported character literals are described below.
// @@ -6108,11 +6442,15 @@ lex_question_mark(pm_parser_t *parser) { lex_state_set(parser, PM_LEX_STATE_BEG); - if (parser->current.start[1] == '\\') { + if (match(parser, '\\')) { lex_state_set(parser, PM_LEX_STATE_END); - parser->current.end += pm_unescape_calculate_difference(parser, parser->current.start + 1, PM_UNESCAPE_ALL, true); - pm_string_shared_init(&parser->current_string, parser->current.start + 1, parser->current.end); - pm_unescape_manipulate_char_literal(parser, &parser->current_string, PM_UNESCAPE_ALL); + + pm_buffer_t buffer; + pm_buffer_init_capacity(&buffer, 3); + + escape_read(parser, &buffer, PM_ESCAPE_FLAG_SINGLE); + pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length); + return PM_TOKEN_CHARACTER_LITERAL; } else { size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end); diff --git a/prism/util/pm_buffer.c b/prism/util/pm_buffer.c index 0d84375767..55f6b0f7f3 100644 --- a/prism/util/pm_buffer.c +++ b/prism/util/pm_buffer.c @@ -1,24 +1,26 @@ #include "prism/util/pm_buffer.h" -#define PRISM_BUFFER_INITIAL_SIZE 1024 - // Return the size of the pm_buffer_t struct. size_t pm_buffer_sizeof(void) { return sizeof(pm_buffer_t); } -// Initialize a pm_buffer_t with its default values. +// Initialize a pm_buffer_t with the given capacity. bool -pm_buffer_init(pm_buffer_t *buffer) { +pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity) { buffer->length = 0; - buffer->capacity = PRISM_BUFFER_INITIAL_SIZE; + buffer->capacity = capacity; - buffer->value = (char *) malloc(PRISM_BUFFER_INITIAL_SIZE); + buffer->value = (char *) malloc(capacity); return buffer->value != NULL; } -#undef PRISM_BUFFER_INITIAL_SIZE +// Initialize a pm_buffer_t with its default values. +bool +pm_buffer_init(pm_buffer_t *buffer) { + return pm_buffer_init_capacity(buffer, 1024); +} // Return the value of the buffer. 
char * diff --git a/prism/util/pm_buffer.h b/prism/util/pm_buffer.h index 160d60bc58..d881b32441 100644 --- a/prism/util/pm_buffer.h +++ b/prism/util/pm_buffer.h @@ -21,6 +21,9 @@ typedef struct { // Return the size of the pm_buffer_t struct. PRISM_EXPORTED_FUNCTION size_t pm_buffer_sizeof(void); +// Initialize a pm_buffer_t with the given capacity. +bool pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity); + // Initialize a pm_buffer_t with its default values. PRISM_EXPORTED_FUNCTION bool pm_buffer_init(pm_buffer_t *buffer); diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb index 4e631d2238..33032ee550 100644 --- a/test/prism/unescape_test.rb +++ b/test/prism/unescape_test.rb @@ -79,17 +79,20 @@ module Prism octal = [*("0".."7")] octal = octal.product(octal).map(&:join).concat(octal.product(octal).product(octal).map(&:join)) - hex = [*("a".."f"), *("A".."F"), *("0".."9")] - hex = hex.map { |h| "x#{h}" }.concat(hex.product(hex).map { |h| "x#{h.join}" }).concat(["5", "6"].product(hex.sample(4)).product(hex.sample(4)).product(hex.sample(4)).map { |h| "u#{h.join}" }) + hex2 = [*("a".."f"), *("A".."F"), *("0".."9")] + hex2 = hex2.map { |h| "x#{h}" }.concat(hex2.product(hex2).map { |h| "x#{h.join}" }) - hexes = [*("a".."f"), *("A".."F"), *("0".."9")] - hexes = ["5", "6"].product(hexes.sample(2)).product(hexes.sample(2)).product(hexes.sample(2)).map { |h| "u{00#{h.join}}" } + hex4 = [*("a".."f"), *("A".."F"), *("0".."9")] + hex4 = ["5", "6"].product(hex4.sample(4)).product(hex4.sample(4)).product(hex4.sample(4)).map { |h| "u#{h.join}" } + + hex6 = [*("a".."f"), *("A".."F"), *("0".."9")] + hex6 = ["5", "6"].product(hex6.sample(2)).product(hex6.sample(2)).product(hex6.sample(2)).map { |h| "u{00#{h.join}}" } ctrls = (ascii.grep(/[[:print:]]/) - ["\\"]).flat_map { |c| ["C-#{c}", "c#{c}", "M-#{c}", "M-\\C-#{c}", "M-\\c#{c}", "c\\M-#{c}"] } - escapes = [*ascii, *ascii8, *octal, *hex, *hexes, *ctrls] + escapes = [*ascii, *ascii8, *octal, *hex2, 
*hex4, *hex6, *ctrls] contexts = [ - [Context::String.new("?", ""), [*ascii, *hex, *ctrls]], + [Context::String.new("?", ""), [*ascii, *octal]], #, *hex2]], [Context::String.new("'", "'"), escapes], [Context::String.new("\"", "\""), escapes], # [Context::String.new("%q[", "]"), escapes], @@ -133,7 +136,7 @@ module Prism "Expected #{context.name} to unescape #{escape.inspect} to #{expected.inspect}, but got #{actual.inspect}" end - if expected == :error + if expected == :error || actual == :error assert_equal expected, actual, message else assert_equal expected.bytes, actual.bytes, message |