[ruby/irb] Rewrite RubyLex to fix some bugs and make it possible to add new features easily (https://github.com/ruby/irb/pull/500)

* Add nesting level parser for multiple use (indent, prompt, termination check)
* Rewrite RubyLex using NestingParser
* Add nesting parser tests, fix some existing tests
* Add description comment, rename method to NestingParser
* Add comments and tweak code to RubyLex
* Update NestingParser test
* Extract list of ltype tokens to constants
author: tomoya ishida <[email protected]> 2023-06-16 00:39:53 +0900
committer: git <[email protected]> 2023-06-15 15:39:58 +0000
commit: 364a6d56d776270da09604816d623047c66c5e32 (patch)
tree: 2e482a3252c5a366e9aab8fe23ae3757759069a3
parent: c1c926219de5489c321d53577ff2eb8c041e166f (diff)
4 files changed, 661 insertions, 508 deletions
diff --git a/lib/irb/nesting_parser.rb b/lib/irb/nesting_parser.rb
new file mode 100644
index 0000000000..3d4db82444
--- /dev/null
+++ b/lib/irb/nesting_parser.rb
@@ -0,0 +1,227 @@
+# frozen_string_literal: true
+module IRB
+  module NestingParser
+    IGNORE_TOKENS = %i[on_sp on_ignored_nl on_comment on_embdoc_beg on_embdoc on_embdoc_end]
+
+    # Scan each token and call the given block with array of token and other information for parsing
+    def self.scan_opens(tokens)
+      opens = []
+      pending_heredocs = []
+      first_token_on_line = true
+      tokens.each do |t|
+        skip = false
+        last_tok, state, args = opens.last
+        case state
+        when :in_unquoted_symbol
+          unless IGNORE_TOKENS.include?(t.event)
+            opens.pop
+            skip = true
+          end
+        when :in_lambda_head
+          opens.pop if t.event == :on_tlambeg || (t.event == :on_kw && t.tok == 'do')
+        when :in_method_head
+          unless IGNORE_TOKENS.include?(t.event)
+            next_args = []
+            body = nil
+            if args.include?(:receiver)
+              case t.event
+              when :on_lparen, :on_ivar, :on_gvar, :on_cvar
+                # def (receiver). | def @ivar. | def $gvar. | def @@cvar.
+                next_args << :dot
+              when :on_kw
+                case t.tok
+                when 'self', 'true', 'false', 'nil'
+                  # def self(arg) | def self.
+                  next_args.push(:arg, :dot)
+                else
+                  # def if(arg)
+                  skip = true
+                  next_args << :arg
+                end
+              when :on_op, :on_backtick
+                # def +(arg)
+                skip = true
+                next_args << :arg
+              when :on_ident, :on_const
+                # def a(arg) | def a.
+                next_args.push(:arg, :dot)
+              end
+            end
+            if args.include?(:dot)
+              # def receiver.name
+              next_args << :name if t.event == :on_period || (t.event == :on_op && t.tok == '::')
+            end
+            if args.include?(:name)
+              if %i[on_ident on_const on_op on_kw on_backtick].include?(t.event)
+                # def name(arg) | def receiver.name(arg)
+                next_args << :arg
+                skip = true
+              end
+            end
+            if args.include?(:arg)
+              case t.event
+              when :on_nl, :on_semicolon
+                # def receiver.f;
+                body = :normal
+              when :on_lparen
+                # def receiver.f()
+                next_args << :eq
+              else
+                if t.event == :on_op && t.tok == '='
+                  # def receiver.f =
+                  body = :oneliner
+                else
+                  # def receiver.f arg
+                  next_args << :arg_without_paren
+                end
+              end
+            end
+            if args.include?(:eq)
+              if t.event == :on_op && t.tok == '='
+                body = :oneliner
+              else
+                body = :normal
+              end
+            end
+            if args.include?(:arg_without_paren)
+              if %i[on_semicolon on_nl].include?(t.event)
+                # def f a;
+                body = :normal
+              else
+                # def f a, b
+                next_args << :arg_without_paren
+              end
+            end
+            if body == :oneliner
+              opens.pop
+            elsif body
+              opens[-1] = [last_tok, nil]
+            else
+              opens[-1] = [last_tok, :in_method_head, next_args]
+            end
+          end
+        when :in_for_while_until_condition
+          if t.event == :on_semicolon || t.event == :on_nl || (t.event == :on_kw && t.tok == 'do')
+            skip = true if t.event == :on_kw && t.tok == 'do'
+            opens[-1] = [last_tok, nil]
+          end
+        end
+
+        unless skip
+          case t.event
+          when :on_kw
+            case t.tok
+            when 'begin', 'class', 'module', 'do', 'case'
+              opens << [t, nil]
+            when 'end'
+              opens.pop
+            when 'def'
+              opens << [t, :in_method_head, [:receiver, :name]]
+            when 'if', 'unless'
+              unless t.state.allbits?(Ripper::EXPR_LABEL)
+                opens << [t, nil]
+              end
+            when 'while', 'until'
+              unless t.state.allbits?(Ripper::EXPR_LABEL)
+                opens << [t, :in_for_while_until_condition]
+              end
+            when 'ensure', 'rescue'
+              unless t.state.allbits?(Ripper::EXPR_LABEL)
+                opens.pop
+                opens << [t, nil]
+              end
+            when 'elsif', 'else', 'when'
+              opens.pop
+              opens << [t, nil]
+            when 'for'
+              opens << [t, :in_for_while_until_condition]
+            when 'in'
+              if last_tok&.event == :on_kw && %w[case in].include?(last_tok.tok) && first_token_on_line
+                opens.pop
+                opens << [t, nil]
+              end
+            end
+          when :on_tlambda
+            opens << [t, :in_lambda_head]
+          when :on_lparen, :on_lbracket, :on_lbrace, :on_tlambeg, :on_embexpr_beg, :on_embdoc_beg
+            opens << [t, nil]
+          when :on_rparen, :on_rbracket, :on_rbrace, :on_embexpr_end, :on_embdoc_end
+            opens.pop
+          when :on_heredoc_beg
+            pending_heredocs << t
+          when :on_heredoc_end
+            opens.pop
+          when :on_backtick
+            opens << [t, nil] if t.state.allbits?(Ripper::EXPR_BEG)
+          when :on_tstring_beg, :on_words_beg, :on_qwords_beg, :on_symbols_beg, :on_qsymbols_beg, :on_regexp_beg
+            opens << [t, nil]
+          when :on_tstring_end, :on_regexp_end, :on_label_end
+            opens.pop
+          when :on_symbeg
+            if t.tok == ':'
+              opens << [t, :in_unquoted_symbol]
+            else
+              opens << [t, nil]
+            end
+          end
+        end
+        if t.event == :on_nl || t.event == :on_semicolon
+          first_token_on_line = true
+        elsif t.event != :on_sp
+          first_token_on_line = false
+        end
+        if pending_heredocs.any? && t.tok.include?("\n")
+          pending_heredocs.reverse_each { |t| opens << [t, nil] }
+          pending_heredocs = []
+        end
+        yield t, opens if block_given?
+      end
+      opens.map(&:first) + pending_heredocs.reverse
+    end
+
+    def self.open_tokens(tokens)
+      # scan_opens without block will return a list of open tokens at last token position
+      scan_opens(tokens)
+    end
+
+    # Calculates token information [line_tokens, prev_opens, next_opens, min_depth] for each line.
+    # Example code
+    #   ["hello
+    #   world"+(
+    # First line
+    #   line_tokens: [[lbracket, '['], [tstring_beg, '"'], [tstring_content("hello\nworld"), "hello\n"]]
+    #   prev_opens:  []
+    #   next_opens:  [lbracket, tstring_beg]
+    #   min_depth:   0 (minimum at beginning of line)
+    # Second line
+    #   line_tokens: [[tstring_content("hello\nworld"), "world"], [tstring_end, '"'], [op, '+'], [lparen, '(']]
+    #   prev_opens:  [lbracket, tstring_beg]
+    #   next_opens:  [lbracket, lparen]
+    #   min_depth:   1 (minimum just after tstring_end)
+    def self.parse_by_line(tokens)
+      line_tokens = []
+      prev_opens = []
+      min_depth = 0
+      output = []
+      last_opens = scan_opens(tokens) do |t, opens|
+        depth = t == opens.last&.first ? opens.size - 1 : opens.size
+        min_depth = depth if depth < min_depth
+        if t.tok.include?("\n")
+          t.tok.each_line do |line|
+            line_tokens << [t, line]
+            next if line[-1] != "\n"
+            next_opens = opens.map(&:first)
+            output << [line_tokens, prev_opens, next_opens, min_depth]
+            prev_opens = next_opens
+            min_depth = prev_opens.size
+            line_tokens = []
+          end
+        else
+          line_tokens << [t, t.tok]
+        end
+      end
+      output << [line_tokens, prev_opens, last_opens, min_depth] if line_tokens.any?
+      output
+    end
+  end
+end
diff --git a/lib/irb/ruby-lex.rb b/lib/irb/ruby-lex.rb
index e29d52e47c..77c5b07ae9 100644
--- a/lib/irb/ruby-lex.rb
+++ b/lib/irb/ruby-lex.rb
@@ -6,6 +6,7 @@
 
 require "ripper"
 require "jruby" if RUBY_ENGINE == "jruby"
+require_relative "nesting_parser"
 
 # :stopdoc:
 class RubyLex
@@ -54,8 +55,7 @@ class RubyLex
     if @io.respond_to?(:check_termination)
       @io.check_termination do |code|
         if Reline::IOGate.in_pasting?
-          lex = RubyLex.new(@context)
-          rest = lex.check_termination_in_prev_line(code)
+          rest = check_termination_in_prev_line(code)
           if rest
             Reline.delete_text
             rest.bytes.reverse_each do |c|
@@ -69,64 +69,39 @@ class RubyLex
           # Accept any single-line input for symbol aliases or commands that transform args
           next true if single_line_command?(code)
 
-          ltype, indent, continue, code_block_open = check_code_state(code)
-          if ltype or indent > 0 or continue or code_block_open
-            false
-          else
-            true
-          end
+          _tokens, _opens, terminated = check_code_state(code)
+          terminated
         end
       end
     end
     if @io.respond_to?(:dynamic_prompt)
       @io.dynamic_prompt do |lines|
         lines << '' if lines.empty?
-        result = []
         tokens = self.class.ripper_lex_without_warning(lines.map{ |l| l + "\n" }.join, context: @context)
-        code = String.new
-        partial_tokens = []
-        unprocessed_tokens = []
-        line_num_offset = 0
-        tokens.each do |t|
-          partial_tokens << t
-          unprocessed_tokens << t
-          if t.tok.include?("\n")
-            t_str = t.tok
-            t_str.each_line("\n") do |s|
-              code << s
-              next unless s.include?("\n")
-              ltype, indent, continue, code_block_open = check_state(code, partial_tokens)
-              result << @prompt.call(ltype, indent, continue || code_block_open, @line_no + line_num_offset)
-              line_num_offset += 1
-            end
-            unprocessed_tokens = []
-          else
-            code << t.tok
+        line_results = IRB::NestingParser.parse_by_line(tokens)
+        tokens_until_line = []
+        line_results.map.with_index do |(line_tokens, _prev_opens, next_opens, _min_depth), line_num_offset|
+          line_tokens.each do |token, _s|
+            # Avoid appending duplicated token. Tokens that include "\n" like multiline tstring_content can exist in multiple lines.
+            tokens_until_line << token if token != tokens_until_line.last
           end
+          continue = process_continue(tokens_until_line)
+          prompt(next_opens, continue, line_num_offset)
         end
-
-        unless unprocessed_tokens.empty?
-          ltype, indent, continue, code_block_open = check_state(code, unprocessed_tokens)
-          result << @prompt.call(ltype, indent, continue || code_block_open, @line_no + line_num_offset)
-        end
-        result
       end
     end
 
     if @io.respond_to?(:auto_indent) and @context.auto_indent_mode
       @io.auto_indent do |lines, line_index, byte_pointer, is_newline|
         if is_newline
-          @tokens = self.class.ripper_lex_without_warning(lines[0..line_index].join("\n"), context: @context)
-          prev_spaces = find_prev_spaces(line_index)
-          depth_difference = check_newline_depth_difference
-          depth_difference = 0 if depth_difference < 0
-          prev_spaces + depth_difference * 2
+          tokens = self.class.ripper_lex_without_warning(lines[0..line_index].join("\n"), context: @context)
+          process_indent_level(tokens, lines)
         else
           code = line_index.zero? ? '' : lines[0..(line_index - 1)].map{ |l| l + "\n" }.join
           last_line = lines[line_index]&.byteslice(0, byte_pointer)
           code += last_line if last_line
-          @tokens = self.class.ripper_lex_without_warning(code, context: @context)
-          check_corresponding_token_depth(lines, line_index)
+          tokens = self.class.ripper_lex_without_warning(code, context: @context)
+          check_corresponding_token_depth(tokens, lines, line_index)
         end
       end
     end
@@ -176,50 +151,30 @@ class RubyLex
     $VERBOSE = verbose
   end
 
-  def find_prev_spaces(line_index)
-    return 0 if @tokens.size == 0
-    md = @tokens[0].tok.match(/(\A +)/)
-    prev_spaces = md.nil? ? 0 : md[1].count(' ')
-    line_count = 0
-    @tokens.each_with_index do |t, i|
-      if t.tok.include?("\n")
-        line_count += t.tok.count("\n")
-        if line_count >= line_index
-          return prev_spaces
-        end
-        next if t.event == :on_tstring_content || t.event == :on_words_sep
-        if (@tokens.size - 1) > i
-          md = @tokens[i + 1].tok.match(/(\A +)/)
-          prev_spaces = md.nil? ? 0 : md[1].count(' ')
-        end
-      end
-    end
-    prev_spaces
-  end
-
-  def check_state(code, tokens)
-    ltype = process_literal_type(tokens)
-    indent = process_nesting_level(tokens)
-    continue = process_continue(tokens)
-    lvars_code = self.class.generate_local_variables_assign_code(@context.local_variables)
-    code = "#{lvars_code}\n#{code}" if lvars_code
-    code_block_open = check_code_block(code, tokens)
-    [ltype, indent, continue, code_block_open]
+  def prompt(opens, continue, line_num_offset)
+    ltype = ltype_from_open_tokens(opens)
+    _indent_level, nesting_level = calc_nesting_depth(opens)
+    @prompt&.call(ltype, nesting_level, opens.any? || continue, @line_no + line_num_offset)
   end
 
   def check_code_state(code)
     check_target_code = code.gsub(/\s*\z/, '').concat("\n")
     tokens = self.class.ripper_lex_without_warning(check_target_code, context: @context)
-    check_state(check_target_code, tokens)
+    opens = IRB::NestingParser.open_tokens(tokens)
+    [tokens, opens, code_terminated?(code, tokens, opens)]
   end
 
-  def save_prompt_to_context_io(ltype, indent, continue, line_num_offset)
+  def code_terminated?(code, tokens, opens)
+    opens.empty? && !process_continue(tokens) && !check_code_block(code, tokens)
+  end
+
+  def save_prompt_to_context_io(opens, continue, line_num_offset)
     # Implicitly saves prompt string to `@context.io.prompt`. This will be used in the next `@input.call`.
-    @prompt.call(ltype, indent, continue, @line_no + line_num_offset)
+    prompt(opens, continue, line_num_offset)
   end
 
   def readmultiline
-    save_prompt_to_context_io(nil, 0, false, 0)
+    save_prompt_to_context_io([], false, 0)
 
     # multiline
     return @input.call if @io.respond_to?(:check_termination)
@@ -237,11 +192,12 @@ class RubyLex
       # Accept any single-line input for symbol aliases or commands that transform args
       return code if single_line_command?(code)
 
-      ltype, indent, continue, code_block_open = check_code_state(code)
-      return code unless ltype or indent > 0 or continue or code_block_open
+      tokens, opens, terminated = check_code_state(code)
+      return code if terminated
 
       line_offset += 1
-      save_prompt_to_context_io(ltype, indent, continue, line_offset)
+      continue = process_continue(tokens)
+      save_prompt_to_context_io(opens, continue, line_offset)
     end
   end
 
@@ -282,9 +238,6 @@ class RubyLex
 
   def check_code_block(code, tokens)
     return true if tokens.empty?
-    if tokens.last.event == :on_heredoc_beg
-      return true
-    end
 
     begin # check if parser error are available
       verbose, $VERBOSE = $VERBOSE, nil
@@ -372,365 +325,82 @@ class RubyLex
     false
   end
 
-  def process_nesting_level(tokens)
-    indent = 0
-    in_oneliner_def = nil
-    tokens.each_with_index { |t, index|
-      # detecting one-liner method definition
-      if in_oneliner_def.nil?
-        if t.state.allbits?(Ripper::EXPR_ENDFN)
-          in_oneliner_def = :ENDFN
-        end
-      else
-        if t.state.allbits?(Ripper::EXPR_ENDFN)
-          # continuing
-        elsif t.state.allbits?(Ripper::EXPR_BEG)
-          if t.tok == '='
-            in_oneliner_def = :BODY
-          end
-        else
-          if in_oneliner_def == :BODY
-            # one-liner method definition
-            indent -= 1
-          end
-          in_oneliner_def = nil
-        end
-      end
-
+  # Calculates [indent_level, nesting_level]. nesting_level is used in prompt string.
+  def calc_nesting_depth(opens)
+    indent_level = 0
+    nesting_level = 0
+    opens.each do |t|
       case t.event
-      when :on_lbracket, :on_lbrace, :on_lparen, :on_tlambeg
-        indent += 1
-      when :on_rbracket, :on_rbrace, :on_rparen
-        indent -= 1
-      when :on_kw
-        next if index > 0 and tokens[index - 1].state.allbits?(Ripper::EXPR_FNAME)
-        case t.tok
-        when 'do'
-          syntax_of_do = take_corresponding_syntax_to_kw_do(tokens, index)
-          indent += 1 if syntax_of_do == :method_calling
-        when 'def', 'case', 'for', 'begin', 'class', 'module'
-          indent += 1
-        when 'if', 'unless', 'while', 'until'
-          # postfix if/unless/while/until must be Ripper::EXPR_LABEL
-          indent += 1 unless t.state.allbits?(Ripper::EXPR_LABEL)
-        when 'end'
-          indent -= 1
-        end
-      end
-      # percent literals are not indented
-    }
-    indent
-  end
-
-  def is_method_calling?(tokens, index)
-    tk = tokens[index]
-    if tk.state.anybits?(Ripper::EXPR_CMDARG) and tk.event == :on_ident
-      # The target method call to pass the block with "do".
-      return true
-    elsif tk.state.anybits?(Ripper::EXPR_ARG) and tk.event == :on_ident
-      non_sp_index = tokens[0..(index - 1)].rindex{ |t| t.event != :on_sp }
-      if non_sp_index
-        prev_tk = tokens[non_sp_index]
-        if prev_tk.state.anybits?(Ripper::EXPR_DOT) and prev_tk.event == :on_period
-          # The target method call with receiver to pass the block with "do".
-          return true
-        end
+      when :on_heredoc_beg
+        # TODO: indent heredoc
+      when :on_tstring_beg, :on_regexp_beg, :on_symbeg
+        # can be indented if t.tok starts with `%`
+      when :on_words_beg, :on_qwords_beg, :on_symbols_beg, :on_qsymbols_beg, :on_embexpr_beg
+        # can be indented but not indented in current implementation
+      when :on_embdoc_beg
+        indent_level = 0
+      else
+        nesting_level += 1
+        indent_level += 1
       end
     end
-    false
+    [indent_level, nesting_level]
   end
 
-  def take_corresponding_syntax_to_kw_do(tokens, index)
-    syntax_of_do = nil
-    # Finding a syntax corresponding to "do".
-    index.downto(0) do |i|
-      tk = tokens[i]
-      # In "continue", the token isn't the corresponding syntax to "do".
-      non_sp_index = tokens[0..(i - 1)].rindex{ |t| t.event != :on_sp }
-      first_in_fomula = false
-      if non_sp_index.nil?
-        first_in_fomula = true
-      elsif [:on_ignored_nl, :on_nl, :on_comment].include?(tokens[non_sp_index].event)
-        first_in_fomula = true
-      end
-      if is_method_calling?(tokens, i)
-        syntax_of_do = :method_calling
-        break if first_in_fomula
-      elsif tk.event == :on_kw && %w{while until for}.include?(tk.tok)
-        # A loop syntax in front of "do" found.
-        #
-        #   while cond do # also "until" or "for"
-        #   end
-        #
-        # This "do" doesn't increment indent because the loop syntax already
-        # incremented.
-        syntax_of_do = :loop_syntax
-        break if first_in_fomula
-      end
+  def free_indent_token(opens, line_index)
+    last_token = opens.last
+    return unless last_token
+    if last_token.event == :on_heredoc_beg && last_token.pos.first < line_index + 1
+      # accept extra indent spaces inside heredoc
+      last_token
     end
-    syntax_of_do
   end
 
-  def is_the_in_correspond_to_a_for(tokens, index)
-    syntax_of_in = nil
-    # Finding a syntax corresponding to "do".
-    index.downto(0) do |i|
-      tk = tokens[i]
-      # In "continue", the token isn't the corresponding syntax to "do".
-      non_sp_index = tokens[0..(i - 1)].rindex{ |t| t.event != :on_sp }
-      first_in_fomula = false
-      if non_sp_index.nil?
-        first_in_fomula = true
-      elsif [:on_ignored_nl, :on_nl, :on_comment].include?(tokens[non_sp_index].event)
-        first_in_fomula = true
-      end
-      if tk.event == :on_kw && tk.tok == 'for'
-        # A loop syntax in front of "do" found.
-        #
-        #   while cond do # also "until" or "for"
-        #   end
-        #
-        # This "do" doesn't increment indent because the loop syntax already
-        # incremented.
-        syntax_of_in = :for
-      end
-      break if first_in_fomula
+  def process_indent_level(tokens, lines)
+    opens = IRB::NestingParser.open_tokens(tokens)
+    indent_level, _nesting_level = calc_nesting_depth(opens)
+    indent = indent_level * 2
+    line_index = lines.size - 2
+    if free_indent_token(opens, line_index)
+      return [indent, lines[line_index][/^ */].length].max
     end
-    syntax_of_in
-  end
-
-  def check_newline_depth_difference
-    depth_difference = 0
-    open_brace_on_line = 0
-    in_oneliner_def = nil
-    @tokens.each_with_index do |t, index|
-      # detecting one-liner method definition
-      if in_oneliner_def.nil?
-        if t.state.allbits?(Ripper::EXPR_ENDFN)
-          in_oneliner_def = :ENDFN
-        end
-      else
-        if t.state.allbits?(Ripper::EXPR_ENDFN)
-          # continuing
-        elsif t.state.allbits?(Ripper::EXPR_BEG)
-          if t.tok == '='
-            in_oneliner_def = :BODY
-          end
-        else
-          if in_oneliner_def == :BODY
-            # one-liner method definition
-            depth_difference -= 1
-          end
-          in_oneliner_def = nil
-        end
-      end
 
-      case t.event
-      when :on_ignored_nl, :on_nl, :on_comment
-        if index != (@tokens.size - 1) and in_oneliner_def != :BODY
-          depth_difference = 0
-          open_brace_on_line = 0
-        end
-        next
-      when :on_sp
-        next
-      end
-
-      case t.event
-      when :on_lbracket, :on_lbrace, :on_lparen, :on_tlambeg
-        depth_difference += 1
-        open_brace_on_line += 1
-      when :on_rbracket, :on_rbrace, :on_rparen
-        depth_difference -= 1 if open_brace_on_line > 0
-      when :on_kw
-        next if index > 0 and @tokens[index - 1].state.allbits?(Ripper::EXPR_FNAME)
-        case t.tok
-        when 'do'
-          syntax_of_do = take_corresponding_syntax_to_kw_do(@tokens, index)
-          depth_difference += 1 if syntax_of_do == :method_calling
-        when 'def', 'case', 'for', 'begin', 'class', 'module'
-          depth_difference += 1
-        when 'if', 'unless', 'while', 'until', 'rescue'
-          # postfix if/unless/while/until/rescue must be Ripper::EXPR_LABEL
-          unless t.state.allbits?(Ripper::EXPR_LABEL)
-            depth_difference += 1
-          end
-        when 'else', 'elsif', 'ensure', 'when'
-          depth_difference += 1
-        when 'in'
-          unless is_the_in_correspond_to_a_for(@tokens, index)
-            depth_difference += 1
-          end
-        when 'end'
-          depth_difference -= 1
-        end
-      end
-    end
-    depth_difference
+    indent
   end
 
-  def check_corresponding_token_depth(lines, line_index)
-    corresponding_token_depth = nil
-    is_first_spaces_of_line = true
-    is_first_printable_of_line = true
-    spaces_of_nest = []
-    spaces_at_line_head = 0
-    open_brace_on_line = 0
-    in_oneliner_def = nil
-
-    if heredoc_scope?
+  def check_corresponding_token_depth(tokens, lines, line_index)
+    line_results = IRB::NestingParser.parse_by_line(tokens)
+    result = line_results[line_index]
+    return unless result
+
+    # To correctly indent line like `end.map do`, we use shortest open tokens on each line for indent calculation.
+    # Shortest open tokens can be calculated by `opens.take(min_depth)`
+    _tokens, prev_opens, opens, min_depth = result
+    indent_level, _nesting_level = calc_nesting_depth(opens.take(min_depth))
+    indent = indent_level * 2
+    free_indent_tok = free_indent_token(opens, line_index)
+    prev_line_free_indent_tok = free_indent_token(prev_opens, line_index - 1)
+    if prev_line_free_indent_tok && prev_line_free_indent_tok != free_indent_tok
+      return indent
+    elsif free_indent_tok
       return lines[line_index][/^ */].length
     end
-
-    @tokens.each_with_index do |t, index|
-      # detecting one-liner method definition
-      if in_oneliner_def.nil?
-        if t.state.allbits?(Ripper::EXPR_ENDFN)
-          in_oneliner_def = :ENDFN
-        end
-      else
-        if t.state.allbits?(Ripper::EXPR_ENDFN)
-          # continuing
-        elsif t.state.allbits?(Ripper::EXPR_BEG)
-          if t.tok == '='
-            in_oneliner_def = :BODY
-          end
-        else
-          if in_oneliner_def == :BODY
-            # one-liner method definition
-            if is_first_printable_of_line
-              corresponding_token_depth = spaces_of_nest.pop
-            else
-              spaces_of_nest.pop
-              corresponding_token_depth = nil
-            end
-          end
-          in_oneliner_def = nil
-        end
-      end
-
-      case t.event
-      when :on_ignored_nl, :on_nl, :on_comment, :on_heredoc_end, :on_embdoc_end
-        if in_oneliner_def != :BODY
-          corresponding_token_depth = nil
-          spaces_at_line_head = 0
-          is_first_spaces_of_line = true
-          is_first_printable_of_line = true
-          open_brace_on_line = 0
-        end
-        next
-      when :on_sp
-        spaces_at_line_head = t.tok.count(' ') if is_first_spaces_of_line
-        is_first_spaces_of_line = false
-        next
-      end
-
-      case t.event
-      when :on_lbracket, :on_lbrace, :on_lparen, :on_tlambeg
-        spaces_of_nest.push(spaces_at_line_head + open_brace_on_line * 2)
-        open_brace_on_line += 1
-      when :on_rbracket, :on_rbrace, :on_rparen
-        if is_first_printable_of_line
-          corresponding_token_depth = spaces_of_nest.pop
-        else
-          spaces_of_nest.pop
-          corresponding_token_depth = nil
-        end
-        open_brace_on_line -= 1
-      when :on_kw
-        next if index > 0 and @tokens[index - 1].state.allbits?(Ripper::EXPR_FNAME)
-        case t.tok
-        when 'do'
-          syntax_of_do = take_corresponding_syntax_to_kw_do(@tokens, index)
-          if syntax_of_do == :method_calling
-            spaces_of_nest.push(spaces_at_line_head)
-          end
-        when 'def', 'case', 'for', 'begin', 'class', 'module'
-          spaces_of_nest.push(spaces_at_line_head)
-        when 'rescue'
-          unless t.state.allbits?(Ripper::EXPR_LABEL)
-            corresponding_token_depth = spaces_of_nest.last
-          end
-        when 'if', 'unless', 'while', 'until'
-          # postfix if/unless/while/until must be Ripper::EXPR_LABEL
-          unless t.state.allbits?(Ripper::EXPR_LABEL)
-            spaces_of_nest.push(spaces_at_line_head)
-          end
-        when 'else', 'elsif', 'ensure', 'when'
-          corresponding_token_depth = spaces_of_nest.last
-        when 'in'
-          if in_keyword_case_scope?
-            corresponding_token_depth = spaces_of_nest.last
-          end
-        when 'end'
-          if is_first_printable_of_line
-            corresponding_token_depth = spaces_of_nest.pop
-          else
-            spaces_of_nest.pop
-            corresponding_token_depth = nil
-          end
-        end
-      end
-      is_first_spaces_of_line = false
-      is_first_printable_of_line = false
-    end
-    corresponding_token_depth
+    prev_indent_level, _prev_nesting_level = calc_nesting_depth(prev_opens)
+    indent if indent_level < prev_indent_level
   end
 
-  def check_string_literal(tokens)
-    i = 0
-    start_token = []
-    end_type = []
-    pending_heredocs = []
-    while i < tokens.size
-      t = tokens[i]
-      case t.event
-      when *end_type.last
-        start_token.pop
-        end_type.pop
-      when :on_tstring_beg
-        start_token << t
-        end_type << [:on_tstring_end, :on_label_end]
-      when :on_regexp_beg
-        start_token << t
-        end_type << :on_regexp_end
-      when :on_symbeg
-        acceptable_single_tokens = %i{on_ident on_const on_op on_cvar on_ivar on_gvar on_kw on_int on_backtick}
-        if (i + 1) < tokens.size
-          if acceptable_single_tokens.all?{ |st| tokens[i + 1].event != st }
-            start_token << t
-            end_type << :on_tstring_end
-          else
-            i += 1
-          end
-        end
-      when :on_backtick
-        if t.state.allbits?(Ripper::EXPR_BEG)
-          start_token << t
-          end_type << :on_tstring_end
-        end
-      when :on_qwords_beg, :on_words_beg, :on_qsymbols_beg, :on_symbols_beg
-        start_token << t
-        end_type << :on_tstring_end
-      when :on_heredoc_beg
-        pending_heredocs << t
-      end
+  LTYPE_TOKENS = %i[
+    on_heredoc_beg on_tstring_beg
+    on_regexp_beg on_symbeg on_backtick
+    on_symbols_beg on_qsymbols_beg
+    on_words_beg on_qwords_beg
+  ]
 
-      if pending_heredocs.any? && t.tok.include?("\n")
-        pending_heredocs.reverse_each do |t|
-          start_token << t
-          end_type << :on_heredoc_end
-        end
-        pending_heredocs = []
-      end
-      i += 1
+  def ltype_from_open_tokens(opens)
+    start_token = opens.reverse_each.find do |tok|
+      LTYPE_TOKENS.include?(tok.event)
     end
-    pending_heredocs.first || start_token.last
-  end
-
-  def process_literal_type(tokens)
-    start_token = check_string_literal(tokens)
-    return nil if start_token == ""
+    return nil unless start_token
 
     case start_token&.event
     when :on_tstring_beg
@@ -783,47 +453,16 @@ class RubyLex
         end
       end
 
-      if first_token.nil?
-        return false
-      elsif first_token && first_token.state == Ripper::EXPR_DOT
-        return false
-      else
+      if first_token && first_token.state != Ripper::EXPR_DOT
         tokens_without_last_line = tokens[0..index]
-        ltype = process_literal_type(tokens_without_last_line)
-        indent = process_nesting_level(tokens_without_last_line)
-        continue = process_continue(tokens_without_last_line)
-        code_block_open = check_code_block(tokens_without_last_line.map(&:tok).join(''), tokens_without_last_line)
-        if ltype or indent > 0 or continue or code_block_open
-          return false
-        else
-          return last_line_tokens.map(&:tok).join('')
+        code_without_last_line = tokens_without_last_line.map(&:tok).join
+        opens_without_last_line = IRB::NestingParser.open_tokens(tokens_without_last_line)
+        if code_terminated?(code_without_last_line, tokens_without_last_line, opens_without_last_line)
+          return last_line_tokens.map(&:tok).join
         end
       end
     end
     false
   end
-
-  private
-
-  def heredoc_scope?
-    heredoc_tokens = @tokens.select { |t| [:on_heredoc_beg, :on_heredoc_end].include?(t.event) }
-    heredoc_tokens[-1]&.event == :on_heredoc_beg
-  end
-
-  def in_keyword_case_scope?
-    kw_tokens = @tokens.select { |t| t.event == :on_kw && ['case', 'for', 'end'].include?(t.tok) }
-    counter = 0
-    kw_tokens.reverse.each do |t|
-      if t.tok == 'case'
-        return true if counter.zero?
-        counter += 1
-      elsif t.tok == 'for'
-        counter += 1
-      elsif t.tok == 'end'
-        counter -= 1
-      end
-    end
-    false
-  end
 end
 # :startdoc:
diff --git a/test/irb/test_nesting_parser.rb b/test/irb/test_nesting_parser.rb
new file mode 100644
index 0000000000..83c7fb08a6
--- /dev/null
+++ b/test/irb/test_nesting_parser.rb
@@ -0,0 +1,303 @@
+# frozen_string_literal: false
+require 'irb'
+
+require_relative "helper"
+
+module TestIRB
+  class NestingParserTest < TestCase
+    def setup # test-framework hook: calls save_encodings (helper defined outside this file; presumably snapshots the encodings)
+      save_encodings
+    end
+
+    def teardown # counterpart of setup: calls restore_encodings (helper defined outside this file)
+      restore_encodings
+    end
+
+    def parse_by_line(code) # helper: lex the code string with RubyLex, then hand the tokens to NestingParser.parse_by_line
+      IRB::NestingParser.parse_by_line(RubyLex.ripper_lex_without_warning(code))
+    end
+
+    def test_open_tokens # open_tokens must report every opener that is never closed in the snippet
+      code = <<~'EOS'
+        class A
+          def f
+            if true
+              tap do
+                {
+                  x: "
+                    #{p(1, 2, 3
+      EOS
+      opens = IRB::NestingParser.open_tokens(RubyLex.ripper_lex_without_warning(code))
+      assert_equal(%w[class def if do { " #{ (], opens.map(&:tok)) # expected in source order, outermost first
+    end
+
+    def test_parse_by_line # checks the per-line tuple returned for the last line of a two-line snippet
+      code = <<~EOS
+        (((((1+2
+        ).to_s())).tap do (((
+      EOS
+      _tokens, prev_opens, next_opens, min_depth = parse_by_line(code).last
+      assert_equal(%w[( ( ( ( (], prev_opens.map(&:tok)) # opens carried in from line 1
+      assert_equal(%w[( ( do ( ( (], next_opens.map(&:tok)) # opens left after line 2
+      assert_equal(2, min_depth) # line 2 closes three parens (5 -> 2) before opening more
+    end
+
+    def test_ruby_syntax # each line inside class A is self-contained, so only the class token may stay open
+      code = <<~'EOS'
+        class A
+          1 if 2
+          1 while 2
+          1 until 2
+          1 unless 2
+          1 rescue 2
+          begin; rescue; ensure; end
+          tap do; rescue; ensure; end
+          class B; end
+          module C; end
+          def f; end
+          def `; end
+          def f() = 1
+          %(); %w[]; %q(); %r{}; %i[]
+          "#{1}"; ''; /#{1}/; `#{1}`
+          :sym; :"sym"; :+; :`; :if
+          [1, 2, 3]
+          { x: 1, y: 2 }
+          (a, (*b, c), d), e = 1, 2, 3
+          ->(a){}; ->(a) do end
+          -> a = -> b = :do do end do end
+          if 1; elsif 2; else; end
+          unless 1; end
+          while 1; end
+          until 1; end
+          for i in j; end
+          case 1; when 2; end
+          puts(1, 2, 3)
+          loop{|i|}
+          loop do |i| end
+        end
+      EOS
+      line_results = parse_by_line(code)
+      assert_equal(code.lines.size, line_results.size) # one result per source line
+      class_open, *inner_line_results, class_close = line_results # first line, body lines, closing end
+      assert_equal(['class'], class_open[2].map(&:tok)) # index 2 holds the opens after the line
+      inner_line_results.each {|result| assert_equal(['class'], result[2].map(&:tok)) }
+      assert_equal([], class_close[2].map(&:tok))
+    end
+
+    def test_multiline_string # content lines of a multi-line string and of a heredoc, checked one line at a time
+      code = <<~EOS
+        "
+        aaa
+        bbb
+        "
+        <<A
+        aaa
+        bbb
+        A
+      EOS
+      line_results = parse_by_line(code)
+      assert_equal(code.lines.size, line_results.size) # one result per source line
+      string_content_line, string_opens = line_results[1] # second line (aaa): its token pairs and the opens before it
+      assert_equal("\naaa\nbbb\n", string_content_line.first.first.tok) # the pair keeps the whole string-content token
+      assert_equal("aaa\n", string_content_line.first.last) # and the slice of it that falls on this line
+      assert_equal(['"'], string_opens.map(&:tok))
+      heredoc_content_line, heredoc_opens = line_results[6] # seventh line: bbb inside the heredoc
+      assert_equal("aaa\nbbb\n", heredoc_content_line.first.first.tok)
+      assert_equal("bbb\n", heredoc_content_line.first.last)
+      assert_equal(['<<A'], heredoc_opens.map(&:tok))
+      _line, _prev_opens, next_opens, _min_depth = line_results.last
+      assert_equal([], next_opens) # nothing is left open after the terminator line
+    end
+
+    def test_backslash_continued_nested_symbol # a symbol split by a backslash-newline, with a heredoc body in between
+      code = <<~'EOS'
+        x = <<A, :\
+          heredoc #{
+            here
+          }
+        A
+        =begin
+        embdoc
+        =end
+        # comment
+
+        if # this is symbol :if
+        while
+      EOS
+      line_results = parse_by_line(code)
+      assert_equal(%w[: <<A #{], line_results[2][2].map(&:tok)) # opens after the third line (here)
+      assert_equal(%w[while], line_results.last[2].map(&:tok)) # the if completes the symbol, so only while stays open
+    end
+
+    def test_oneliner_def # none of the one-liner def forms below may leave a def token open
+      code = <<~EOC
+        if true
+          # normal oneliner def
+          def f = 1
+          def f() = 1
+          def f(*) = 1
+          # keyword, backtick, op
+          def * = 1
+          def ` = 1
+          def if = 1
+          def *() = 1
+          def `() = 1
+          def if() = 1
+          # oneliner def with receiver
+          def a.* = 1
+          def $a.* = 1
+          def @a.` = 1
+          def A.` = 1
+          def ((a;b;c)).*() = 1
+          def ((a;b;c)).if() = 1
+          def ((a;b;c)).end() = 1
+          # multiline oneliner def
+          def f =
+          1
+          def f()
+          =
+          1
+          # oneliner def with comment and embdoc
+          def # comment
+        =begin
+        embdoc
+        =end
+            ((a;b;c))
+            . # comment
+        =begin
+        embdoc
+        =end
+            f (*) # comment
+        =begin
+        embdoc
+        =end
+          =
+          1
+          # nested oneliner def
+          def f(x = def f() = 1) = def f() = 1
+      EOC
+      _tokens, _prev_opens, next_opens, min_depth = parse_by_line(code).last
+      assert_equal(['if'], next_opens.map(&:tok)) # only the enclosing if is still open after the last line
+      assert_equal(1, min_depth) # the last line never drops below the if level
+    end
+
+    def test_heredoc_embexpr # several heredocs started on one line, with more heredocs nested inside an interpolation
+      code = <<~'EOS'
+        <<A+<<B+<<C+(<<D+(<<E)
+          #{
+            <<~F+"#{<<~G}
+            #{
+              here
+            }
+            F
+            G
+            "
+          }
+        A
+        B
+        C
+        D
+        E
+        )
+      EOS
+      line_results = parse_by_line(code)
+      last_opens = line_results.last[-2] # second-to-last slot of the final tuple (the opens after that line in the 4-element tuples destructured below)
+      assert_equal([], last_opens) # everything is closed once the final paren is read
+      _tokens, _prev_opens, next_opens, _min_depth = line_results[4] # fifth line: here
+      assert_equal(%w[( <<E <<D <<C <<B <<A #{ " <<~G <<~F #{], next_opens.map(&:tok))
+    end
+
+    def test_for_in
+      code = <<~EOS
+        for i in j
+          here
+        end
+        for i in j do
+          here
+        end
+        for i in
+          j do
+          here
+        end
+        for
+          # comment
+          i in j do
+          here
+        end
+        for (a;b;c).d in (a;b;c) do
+          here
+        end
+        for i in :in + :do do
+          here
+        end
+        for i in -> do end do
+          here
+        end
+      EOS
+      line_results = parse_by_line(code).select { |tokens,| tokens.map(&:last).include?('here') }
+      assert_equal(7, line_results.size)
+      line_results.each do |_tokens, _prev_opens, next_opens, _min_depth|
+        assert_equal(['for'], next_opens.map(&:tok))
+      end
+    end
+
+    def test_while_until
+      base_code = <<~'EOS'
+        while_or_until true
+          here
+        end
+        while_or_until a < c
+          here
+        end
+        while_or_until true do
+          here
+        end
+        while_or_until
+          # comment
+          (a + b) <
+          # comment
+          c do
+          here
+        end
+        while_or_until :\
+          do do
+          here
+        end
+        while_or_until def do; end == :do do
+          here
+        end
+        while_or_until -> do end do
+          here
+        end
+      EOS
+      %w[while until].each do |keyword|
+        code = base_code.gsub('while_or_until', keyword)
+        line_results = parse_by_line(code).select { |tokens,| tokens.map(&:last).include?('here') }
+        assert_equal(7, line_results.size)
+        line_results.each do |_tokens, _prev_opens, next_opens, _min_depth|
+          assert_equal([keyword], next_opens.map(&:tok) )
+        end
+      end
+    end
+
+    def test_case_in
+      if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7.0')
+        pend 'This test requires ruby version that supports case-in syntax'
+      end
+      code = <<~EOS
+        case 1
+        in 1
+          here
+        in
+          2
+          here
+        end
+      EOS
+      line_results = parse_by_line(code).select { |tokens,| tokens.map(&:last).include?('here') }
+      assert_equal(2, line_results.size)
+      line_results.each do |_tokens, _prev_opens, next_opens, _min_depth|
+        assert_equal(['in'], next_opens.map(&:tok))
+      end
+    end
+  end
+end
diff --git a/test/irb/test_ruby_lex.rb b/test/irb/test_ruby_lex.rb
index aa27204e26..9d7910cca6 100644
--- a/test/irb/test_ruby_lex.rb
+++ b/test/irb/test_ruby_lex.rb
@@ -95,8 +95,11 @@ module TestIRB
 
     def check_state(lines, local_variables: [])
       context = build_context(local_variables)
+      tokens = RubyLex.ripper_lex_without_warning(lines.join("\n"), context: context)
+      opens = IRB::NestingParser.open_tokens(tokens) # openers still unclosed at the end of the joined input
       ruby_lex = RubyLex.new(context)
-      _ltype, indent, _continue, code_block_open = ruby_lex.check_code_state(lines.join("\n"))
+      indent, _nesting_level = ruby_lex.calc_nesting_depth(opens)
+      code_block_open = !opens.empty? || ruby_lex.process_continue(tokens) # open when anything is unclosed or process_continue is truthy
       [indent, code_block_open]
     end
 
@@ -164,9 +167,9 @@ module TestIRB
         Row.new(%q(    ]), 4, 4),
         Row.new(%q(  ]), 2, 2),
         Row.new(%q(]), 0, 0),
-        Row.new(%q([<<FOO]), 0, 0),
+        Row.new(%q([<<FOO]), nil, 0),
         Row.new(%q(hello), 0, 0),
-        Row.new(%q(FOO), nil, 0),
+        Row.new(%q(FOO), 0, 0),
       ]
 
       lines = []
@@ -489,12 +492,12 @@ module TestIRB
       end
     end
 
-    def test_corresponding_syntax_to_keyword_in
+    def test_typing_incomplete_include_interpreted_as_keyword_in
       input_with_correct_indents = [
         Row.new(%q(module E), nil, 2, 1),
         Row.new(%q(end), 0, 0, 0),
         Row.new(%q(class A), nil, 2, 1),
-        Row.new(%q(  in), nil, 4, 1)
+        Row.new(%q(  in), nil, 2, 1) # scenario typing `include E`
       ]
 
       lines = []
@@ -575,11 +578,19 @@ module TestIRB
     end
 
     def test_heredoc_with_indent
+      if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7.0')
+        pend 'This test needs Ripper::Lexer#scan to take broken tokens'
+      end
       input_with_correct_indents = [
-        Row.new(%q(<<~Q), 0, 0, 0),
-        Row.new(%q({), 0, 0, 0),
-        Row.new(%q(  #), 2, 0, 0),
-        Row.new(%q(}), 0, 0, 0)
+        Row.new(%q(<<~Q+<<~R), nil, 0, 0),
+        Row.new(%q(a), 0, 0, 0),
+        Row.new(%q(a), 0, 0, 0),
+        Row.new(%q(  b), 2, 2, 0),
+        Row.new(%q(  b), 2, 2, 0),
+        Row.new(%q(  Q), 0, 2, 0),
+        Row.new(%q(    c), 4, 4, 0),
+        Row.new(%q(    c), 4, 4, 0),
+        Row.new(%q(    R), 0, 0, 0),
       ]
 
       lines = []
@@ -592,8 +603,8 @@ module TestIRB
 
     def test_oneliner_def_in_multiple_lines
       input_with_correct_indents = [
-        Row.new(%q(def a()=[), nil, 4, 2),
-        Row.new(%q(  1,), nil, 4, 1),
+        Row.new(%q(def a()=[), nil, 2, 1),
+        Row.new(%q(  1,), nil, 2, 1),
         Row.new(%q(].), 0, 0, 0),
         Row.new(%q(to_s), nil, 0, 0),
       ]
@@ -609,7 +620,7 @@ module TestIRB
     def test_broken_heredoc
       input_with_correct_indents = [
         Row.new(%q(def foo), nil, 2, 1),
-        Row.new(%q(  <<~Q), 2, 2, 1),
+        Row.new(%q(  <<~Q), nil, 2, 1),
         Row.new(%q(  Qend), 2, 2, 1),
       ]
 
@@ -621,6 +632,15 @@ module TestIRB
       end
     end
 
+    def test_heredoc_keep_indent_spaces
+      (1..4).each do |indent|
+        row = Row.new(' ' * indent, indent, [2, indent].max, 1)
+        lines = ['def foo', '  <<~Q', row.content]
+        assert_row_indenting(lines, row)
+        assert_nesting_level(lines, row.nesting_level)
+      end
+    end
+
     PromptRow = Struct.new(:prompt, :content)
 
     class MockIO_DynamicPrompt
@@ -746,10 +766,9 @@ module TestIRB
     end
 
     def test_unterminated_heredoc_string_literal
-      context = build_context
       ['<<A;<<B', "<<A;<<B\n", "%W[\#{<<A;<<B", "%W[\#{<<A;<<B\n"].each do |code|
         tokens = RubyLex.ripper_lex_without_warning(code)
-        string_literal = RubyLex.new(context).check_string_literal(tokens)
+        string_literal = IRB::NestingParser.open_tokens(tokens).last # last open token; expected to be the first heredoc, <<A
         assert_equal('<<A', string_literal&.tok)
       end
     end
@@ -779,43 +798,8 @@ module TestIRB
       [reference_code, code_with_heredoc, code_with_embdoc].each do |code|
         lex = RubyLex.new(context)
         lines = code.lines
-        lex.instance_variable_set('@tokens', RubyLex.ripper_lex_without_warning(code))
-        assert_equal 2, lex.check_corresponding_token_depth(lines, lines.size)
-      end
-    end
-
-    def test_find_prev_spaces_with_multiline_literal
-      lex = RubyLex.new(build_context)
-      reference_code = <<~EOC.chomp
-        if true
-          1
-          hello
-          1
-          world
-        end
-      EOC
-      code_with_percent_string = <<~EOC.chomp
-        if true
-          %w[
-            hello
-          ]
-          world
-        end
-      EOC
-      code_with_quoted_string = <<~EOC.chomp
-        if true
-          '
-            hello
-          '
-          world
-        end
-      EOC
-      context = build_context
-      [reference_code, code_with_percent_string, code_with_quoted_string].each do |code|
-        lex = RubyLex.new(context)
-        lex.instance_variable_set('@tokens', RubyLex.ripper_lex_without_warning(code))
-        prev_spaces = (1..code.lines.size).map { |index| lex.find_prev_spaces index }
-        assert_equal [0, 2, 2, 2, 2, 0], prev_spaces
+        tokens = RubyLex.ripper_lex_without_warning(code)
+        assert_equal(2, lex.check_corresponding_token_depth(tokens, lines, lines.size - 1))
       end
     end
author	tomoya ishida <[email protected]>	2023-06-16 00:39:53 +0900
committer	git <[email protected]>	2023-06-15 15:39:58 +0000
commit	364a6d56d776270da09604816d623047c66c5e32 (patch)
tree	2e482a3252c5a366e9aab8fe23ae3757759069a3
parent	c1c926219de5489c321d53577ff2eb8c041e166f (diff)