diff options
author | Koichi ITO <[email protected]> | 2024-03-21 01:46:53 +0900 |
---|---|---|
committer | git <[email protected]> | 2024-03-25 12:16:32 +0000 |
commit | 56a2fad2a4578987a371f7a5563812b52ed8e9c6 (patch) | |
tree | 8120119e81dbb98f4f96243185a0fcb364ec50fd | |
parent | 9b921f662285e785ddbd22d0bcd540fa35151b08 (diff) |
[ruby/prism] Fix incorrect parsing when using invalid regexp options
Fixes https://github.com/ruby/prism/pull/2617.
There was an issue with the lexer as follows.
The following are valid regexp options:
```console
$ bundle exec ruby -Ilib -rprism -ve 'p Prism.lex("/x/io").value.map {|token| token[0].type }'
ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22]
[:REGEXP_BEGIN, :STRING_CONTENT, :REGEXP_END, :EOF]
```
The following are invalid regexp options. An unnecessary `IDENTIFIER` token appears:
```console
$ bundle exec ruby -Ilib -rprism -ve 'p Prism.lex("/x/az").value.map {|token| token[0].type }'
ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22]
[:REGEXP_BEGIN, :STRING_CONTENT, :REGEXP_END, :IDENTIFIER, :EOF]
```
In Ruby itself, the letters `A` to `Z` and `a` to `z` following a regexp are treated as regexp options, and unknown ones are reported as invalid, e.g.,
```console
$ ruby -e '/regexp/az'
-e:1: unknown regexp options - az
/regexp/az
-e: compile error (SyntaxError)
```
Thus, it should probably not be construed as an `IDENTIFIER` token.
Therefore, `pm_byte_table` has been adapted to accept those invalid regexp option values.
Whether it is a valid regexp option or not is checked by `pm_regular_expression_flags_create`.
For invalid regexp options, `PM_ERR_REGEXP_UNKNOWN_OPTIONS` is added to diagnostics.
https://github.com/ruby/prism/commit/d2a6096fcf
-rw-r--r-- | lib/prism/translation/parser.rb | 2 | ||||
-rw-r--r-- | prism/config.yml | 1 | ||||
-rw-r--r-- | prism/prism.c | 21 | ||||
-rw-r--r-- | prism/templates/src/diagnostic.c.erb | 1 | ||||
-rw-r--r-- | prism/util/pm_char.c | 8 | ||||
-rw-r--r-- | test/prism/errors_test.rb | 14 | ||||
-rw-r--r-- | test/prism/location_test.rb | 2 |
7 files changed, 39 insertions, 10 deletions
diff --git a/lib/prism/translation/parser.rb b/lib/prism/translation/parser.rb index 8df7164688..0d11b8f566 100644 --- a/lib/prism/translation/parser.rb +++ b/lib/prism/translation/parser.rb @@ -173,6 +173,8 @@ module Prism Diagnostic.new(:error, :duplicate_argument, {}, diagnostic_location, []) when :parameter_numbered_reserved Diagnostic.new(:error, :reserved_for_numparam, { name: location.slice }, diagnostic_location, []) + when :regexp_unknown_options + Diagnostic.new(:error, :regexp_options, { options: location.slice[1..] }, diagnostic_location, []) when :singleton_for_literals Diagnostic.new(:error, :singleton_literal, {}, diagnostic_location, []) when :string_literal_eof diff --git a/prism/config.yml b/prism/config.yml index 269bfa73ec..d9e39460d1 100644 --- a/prism/config.yml +++ b/prism/config.yml @@ -199,6 +199,7 @@ errors: - REGEXP_INVALID_UNICODE_RANGE - REGEXP_NON_ESCAPED_MBC - REGEXP_TERM + - REGEXP_UNKNOWN_OPTIONS - REGEXP_UTF8_CHAR_NON_UTF8_REGEXP - RESCUE_EXPRESSION - RESCUE_MODIFIER_VALUE diff --git a/prism/prism.c b/prism/prism.c index 8a6ca0eccc..b410b0a510 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -1214,10 +1214,12 @@ pm_node_flag_set_repeated_parameter(pm_node_t *node) { * Parse out the options for a regular expression. 
*/ static inline pm_node_flags_t -pm_regular_expression_flags_create(const pm_token_t *closing) { +pm_regular_expression_flags_create(pm_parser_t *parser, const pm_token_t *closing) { pm_node_flags_t flags = 0; if (closing->type == PM_TOKEN_REGEXP_END) { + pm_buffer_t unknown_flags = { 0 }; + for (const uint8_t *flag = closing->start + 1; flag < closing->end; flag++) { switch (*flag) { case 'i': flags |= PM_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE; break; @@ -1230,9 +1232,16 @@ pm_regular_expression_flags_create(const pm_token_t *closing) { case 's': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J); break; case 'u': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_UTF_8); break; - default: assert(false && "unreachable"); + default: pm_buffer_append_byte(&unknown_flags, *flag); } } + + size_t unknown_flags_length = pm_buffer_length(&unknown_flags); + if (unknown_flags_length != 0) { + char *word = unknown_flags_length >= 2 ? 
"options" : "option"; + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_REGEXP_UNKNOWN_OPTIONS, word, unknown_flags_length, pm_buffer_value(&unknown_flags)); + } + pm_buffer_free(&unknown_flags); } return flags; @@ -4297,10 +4306,10 @@ pm_interpolated_regular_expression_node_append(pm_interpolated_regular_expressio } static inline void -pm_interpolated_regular_expression_node_closing_set(pm_interpolated_regular_expression_node_t *node, const pm_token_t *closing) { +pm_interpolated_regular_expression_node_closing_set(pm_parser_t *parser, pm_interpolated_regular_expression_node_t *node, const pm_token_t *closing) { node->closing_loc = PM_LOCATION_TOKEN_VALUE(closing); node->base.location.end = closing->end; - pm_node_flag_set((pm_node_t *)node, pm_regular_expression_flags_create(closing)); + pm_node_flag_set((pm_node_t *)node, pm_regular_expression_flags_create(parser, closing)); } /** @@ -5528,7 +5537,7 @@ pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_ *node = (pm_regular_expression_node_t) { { .type = PM_REGULAR_EXPRESSION_NODE, - .flags = pm_regular_expression_flags_create(closing) | PM_NODE_FLAG_STATIC_LITERAL, + .flags = pm_regular_expression_flags_create(parser, closing) | PM_NODE_FLAG_STATIC_LITERAL, .location = { .start = MIN(opening->start, closing->start), .end = MAX(opening->end, closing->end) @@ -17490,7 +17499,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b expect1(parser, PM_TOKEN_REGEXP_END, PM_ERR_REGEXP_TERM); } - pm_interpolated_regular_expression_node_closing_set(interpolated, &closing); + pm_interpolated_regular_expression_node_closing_set(parser, interpolated, &closing); return (pm_node_t *) interpolated; } case PM_TOKEN_BACKTICK: diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb index 2a3ac19930..818b12d98b 100644 --- a/prism/templates/src/diagnostic.c.erb +++ b/prism/templates/src/diagnostic.c.erb @@ -277,6 +277,7 @@ static 
const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_UNKNOWN_OPTIONS] = { "unknown regexp %s: %.*s", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_TERM] = { "expected a closing delimiter for the regular expression", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_FATAL }, diff --git a/prism/util/pm_char.c b/prism/util/pm_char.c index 13eddbba48..dce19abd1b 100644 --- a/prism/util/pm_char.c +++ b/prism/util/pm_char.c @@ -19,10 +19,10 @@ static const uint8_t pm_byte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5x - 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 4, 4, // 6x - 0, 0, 0, 4, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, // 7x + 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 4x + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 5x + 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 6x + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 7x 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax diff --git a/test/prism/errors_test.rb b/test/prism/errors_test.rb index ccf7485c7b..9221d52ef3 100644 --- a/test/prism/errors_test.rb +++ b/test/prism/errors_test.rb @@ -2067,6 +2067,20 @@ 
module Prism assert_errors expression(source), source, errors, compare_ripper: false end + def test_regular_expression_with_unknown_regexp_options + source = "/foo/AZaz" + errors = [["unknown regexp options: AZaz", 4..9]] + + assert_errors expression(source), source, errors + end + + def test_interpolated_regular_expression_with_unknown_regexp_options + source = "/\#{foo}/AZaz" + errors = [["unknown regexp options: AZaz", 7..12]] + + assert_errors expression(source), source, errors + end + def test_singleton_method_for_literals source = <<~'RUBY' def (1).g; end diff --git a/test/prism/location_test.rb b/test/prism/location_test.rb index c7ce248b56..b7b9a754ca 100644 --- a/test/prism/location_test.rb +++ b/test/prism/location_test.rb @@ -527,6 +527,7 @@ module Prism def test_InterpolatedRegularExpressionNode assert_location(InterpolatedRegularExpressionNode, "/\#{foo}/") + assert_location(InterpolatedRegularExpressionNode, "/\#{foo}/io") end def test_InterpolatedStringNode @@ -730,6 +731,7 @@ module Prism def test_RegularExpressionNode assert_location(RegularExpressionNode, "/foo/") + assert_location(RegularExpressionNode, "/foo/io") end def test_RequiredKeywordParameterNode |