diff options
author | Earlopain <[email protected]> | 2025-01-14 20:20:05 +0100 |
---|---|---|
committer | git <[email protected]> | 2025-01-14 20:33:11 +0000 |
commit | 56242ba495246e95dd5178f2ec101c1005c10afc (patch) | |
tree | f491cf9f40d94af3eef0c27115807e3991db9d4c /lib/prism/translation/parser | |
parent | 51d3d6ac8c2e3b6b6dacd80a9ddf11adc46fde08 (diff) |
Better handle regexp in the parser translator
Turns out, it was already almost correct. If you disregard \c and \M style escapes, only a single character is allowed to be escaped in a regex, so most tests passed already.
There was also a mistake where the wrong value was constructed for the AST; this is now fixed.
One test fails because of this, but I'm fairly sure that is because of a parser bug. For `/\“/`, the backslash is supposed to be removed because the escaped character is multibyte. But to be honest, I don't entirely understand all the rules.
Fixes more than half of the remaining ast differences for rubocop tests
Diffstat (limited to 'lib/prism/translation/parser')
-rw-r--r-- | lib/prism/translation/parser/compiler.rb | 2 | ||||
-rw-r--r-- | lib/prism/translation/parser/lexer.rb | 35 |
2 files changed, 26 insertions, 11 deletions
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb index 26c5c174f4..a7e29d9dfc 100644 --- a/lib/prism/translation/parser/compiler.rb +++ b/lib/prism/translation/parser/compiler.rb @@ -1507,7 +1507,7 @@ module Prism elsif node.content.include?("\n") string_nodes_from_line_continuations(node.unescaped, node.content, node.content_loc.start_offset, node.opening) else - [builder.string_internal(token(node.content_loc))] + [builder.string_internal([node.unescaped, srange(node.content_loc)])] end builder.regexp_compose( diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index a54d355652..1fa2723f03 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -633,18 +633,34 @@ module Prism DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze private_constant :DELIMITER_SYMETRY + + # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14 + REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"] + private_constant :REGEXP_META_CHARACTERS + # Apply Ruby string escaping rules def unescape_string(string, quote) # In single-quoted heredocs, everything is taken literally. return string if quote == "<<'" - # TODO: Implement regexp escaping - return string if quote == "/" || quote.start_with?("%r") - # OPTIMIZATION: Assume that few strings need escaping to speed up the common case. return string unless string.include?("\\") - if interpolation?(quote) + # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc. + delimiter = quote[-1] + + if regexp?(quote) + # Should be escaped handled to single-quoted heredocs. The only character that is + # allowed to be escaped is the delimiter, except when that also has special meaning + # in the regexp. 
Since all the symetry delimiters have special meaning, they don't need + # to be considered separately. + if REGEXP_META_CHARACTERS.include?(delimiter) + string + else + # There can never be an even amount of backslashes. It would be a syntax error. + string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1') + end + elsif interpolation?(quote) # Appending individual escape sequences may force the string out of its intended # encoding. Start out with binary and force it back later. result = "".b @@ -690,12 +706,6 @@ module Prism result else - if quote == "'" - delimiter = "'" - else - delimiter = quote[2] - end - delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}") string.gsub(/\\([\\#{delimiters}])/, '\1') end @@ -706,6 +716,11 @@ module Prism quote != "'" && !quote.start_with?("%q", "%w", "%i") end + # Regexp allow interpolation but are handled differently during unescaping + def regexp?(quote) + quote == "/" || quote.start_with?("%r") + end + # Determine if the string is part of a %-style array. def percent_array?(quote) quote.start_with?("%w", "%W", "%i", "%I") |