test/prism/encoding/regular_expression_encoding_test.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

# frozen_string_literal: true

# Stop loading this file on Ruby implementations that do not define
# RubyVM::InstructionSequence; the next guard relies on that constant.
return unless defined?(RubyVM::InstructionSequence)
# Stop loading as well when the running Ruby reports :prism as the parser it
# used to compile an empty program.
# NOTE(review): presumably because the tests in this file use `eval` as the
# reference that Prism's output is compared against — confirm.
return if RubyVM::InstructionSequence.compile("").to_a[4][:parser] == :prism

require_relative "../test_helper"

module Prism
  # Checks that the encoding Prism derives for a regular expression literal
  # (from the flags on the parsed node) equals the encoding of the Regexp
  # obtained by evaluating the same source with `eval`, for every source
  # encoding yielded by `each_encoding`.
  class RegularExpressionEncodingTest < TestCase
    # Encoding modifiers that may trail a regular expression literal.
    MODIFIERS = ["n", "u", "e", "s"].freeze

    # Individual escape sequences exercised by the escape tests.
    SINGLE_ESCAPES = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"].freeze

    # Regexp literals built from every single escape followed by every ordered
    # pair of escapes. Computed once instead of once per encoding.
    ESCAPE_REGEXPS =
      (SINGLE_ESCAPES + SINGLE_ESCAPES.product(SINGLE_ESCAPES).map(&:join)).map { |escape| "/#{escape}/" }.freeze

    # Regexp bodies that get combined with an encoding modifier.
    MODIFIER_REGEXP_SOURCES = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}"].freeze

    # SyntaxError message fragments whose messages are compared against
    # Prism's error messages (only when an encoding modifier is present).
    ENCODING_ERRORS = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"].freeze

    # SyntaxError message fragments that cause the regexp to be skipped.
    SKIPPED_ERRORS = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"].freeze

    # Prefix of an error message whose remainder is a representation of the
    # regexp and is therefore not compared.
    NON_ESCAPED_NON_ASCII_PREFIX = "/.../n has a non escaped non ASCII character in non ASCII-8BIT script:"

    each_encoding do |encoding, _|
      define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do
        assert_regular_expression_encoding_flags(encoding, ["/a/", "/ą/", "//"])
      end

      define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do
        assert_regular_expression_encoding_flags(encoding, ESCAPE_REGEXPS)
      end

      MODIFIERS.each do |modifier|
        # Each generated test covers exactly the modifier named in its title.
        define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do
          assert_regular_expression_encoding_flags(
            encoding,
            MODIFIER_REGEXP_SOURCES.map { |body| "/#{body}/#{modifier}" }
          )
        end
      end
    end

    private

    # Asserts, for each regexp literal, that Prism and `eval` agree on either
    # the resulting encoding or the reported error messages.
    #
    # @param encoding [Encoding] source encoding written into the magic comment
    # @param regexps [Array<String>] regular expression literals, as source text
    def assert_regular_expression_encoding_flags(encoding, regexps)
      regexps.each do |regexp|
        regexp_modifier_used = regexp.end_with?("/u", "/e", "/s", "/n")
        source = "# encoding: #{encoding.name}\n#{regexp}"

        # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104
        if regexp_modifier_used
          encoding_errors = ENCODING_ERRORS
          skipped_errors = SKIPPED_ERRORS
        else
          encoding_errors = []
          skipped_errors = SKIPPED_ERRORS + ENCODING_ERRORS
        end

        expected =
          begin
            eval(source).encoding
          rescue SyntaxError => error
            if encoding_errors.any? { |fragment| error.message.include?(fragment) }
              # Keep only the text after ": " on each line of the message.
              error.message.split("\n").map { |line| line[/: (.+?)$/, 1] }
            elsif skipped_errors.any? { |fragment| error.message.include?(fragment) }
              next
            else
              raise
            end
          end

        actual = prism_regular_expression_result(source, encoding)

        # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages.
        # This class of error message is tricky. The part not being compared is a representation of the regexp.
        # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented.
        # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses
        # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct.
        if expected.is_a?(Array) && actual.is_a?(Array)
          # Safe navigation: an extracted message line may be nil when it did
          # not match the extraction pattern above.
          if expected.last&.start_with?(NON_ESCAPED_NON_ASCII_PREFIX) &&
              actual.last&.start_with?(NON_ESCAPED_NON_ASCII_PREFIX)
            expected.pop
            actual.pop
          end
        end

        assert_equal expected, actual
      end
    end

    # Parses +source+ with Prism and summarizes the outcome.
    #
    # @param source [String] magic encoding comment followed by a regexp literal
    # @param encoding [Encoding] fallback used when no encoding flag is set on the node
    # @return [Encoding, Array<String>, nil] the encoding derived from the
    #   node's flags on success; on failure the error messages, or nil when
    #   the last message contains "UTF-8 mixed within"
    # @raise [RuntimeError] when the node's flags contradict each other
    def prism_regular_expression_result(source, encoding)
      result = Prism.parse(source)

      unless result.success?
        errors = result.errors.map(&:message)
        return errors.last&.include?("UTF-8 mixed within") ? nil : errors
      end

      node = result.statement

      # Forced-encoding flags take precedence over modifier flags; the source
      # encoding is used when neither kind is set.
      actual_encoding =
        if node.forced_utf8_encoding?
          Encoding::UTF_8
        elsif node.forced_binary_encoding?
          Encoding::ASCII_8BIT
        elsif node.forced_us_ascii_encoding?
          Encoding::US_ASCII
        elsif node.ascii_8bit?
          Encoding::ASCII_8BIT
        elsif node.utf_8?
          Encoding::UTF_8
        elsif node.euc_jp?
          Encoding::EUC_JP
        elsif node.windows_31j?
          Encoding::Windows_31J
        else
          encoding
        end

      # A modifier flag must agree with the encoding resolved above.
      if node.utf_8? && actual_encoding != Encoding::UTF_8
        raise "expected regexp encoding to be UTF-8 due to '/u' modifier, but got #{actual_encoding.name}"
      elsif node.ascii_8bit? && (actual_encoding != Encoding::ASCII_8BIT && actual_encoding != Encoding::US_ASCII)
        raise "expected regexp encoding to be ASCII-8BIT or US-ASCII due to '/n' modifier, but got #{actual_encoding.name}"
      elsif node.euc_jp? && actual_encoding != Encoding::EUC_JP
        raise "expected regexp encoding to be EUC-JP due to '/e' modifier, but got #{actual_encoding.name}"
      elsif node.windows_31j? && actual_encoding != Encoding::Windows_31J
        raise "expected regexp encoding to be Windows-31J due to '/s' modifier, but got #{actual_encoding.name}"
      end

      # A modifier flag and its matching forced-encoding flag must not both be set.
      if node.utf_8? && node.forced_utf8_encoding?
        raise "the forced_utf8 flag should not be set when the UTF-8 modifier (/u) is used"
      elsif node.ascii_8bit? && node.forced_binary_encoding?
        raise "the forced_binary flag should not be set when the ASCII-8BIT modifier (/n) is used"
      end

      actual_encoding
    end
  end
end