diff options
author | Hiroshi SHIBATA <[email protected]> | 2022-12-09 08:46:14 +0900 |
---|---|---|
committer | Hiroshi SHIBATA <[email protected]> | 2022-12-09 16:36:22 +0900 |
commit | 643918ecfe9c980f251247de6acd3be6280da24c (patch) | |
tree | a5b4011c13ee3af5b110e377a839e79045266dcd /lib/csv/parser.rb | |
parent | 260a00d80e4dcc930b040313a99da29e4b1e6678 (diff) |
Merge csv-3.2.6
Notes
Notes:
Merged: https://github.com/ruby/ruby/pull/6890
Diffstat (limited to 'lib/csv/parser.rb')
-rw-r--r-- | lib/csv/parser.rb | 293 |
1 files changed, 205 insertions, 88 deletions
diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb index 7e943acf21..afb3131cd5 100644 --- a/lib/csv/parser.rb +++ b/lib/csv/parser.rb @@ -2,15 +2,10 @@ require "strscan" -require_relative "delete_suffix" require_relative "input_record_separator" -require_relative "match_p" require_relative "row" require_relative "table" -using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix) -using CSV::MatchP if CSV.const_defined?(:MatchP) - class CSV # Note: Don't use this class directly. This is an internal class. class Parser @@ -27,6 +22,10 @@ class CSV class InvalidEncoding < StandardError end + # Raised when unexpected case is happen. + class UnexpectedError < StandardError + end + # # CSV::Scanner receives a CSV output, scans it and return the content. # It also controls the life cycle of the object with its methods +keep_start+, @@ -78,10 +77,10 @@ class CSV # +keep_end+, +keep_back+, +keep_drop+. # # CSV::InputsScanner.scan() tries to match with pattern at the current position. - # If there's a match, the scanner advances the “scan pointer” and returns the matched string. + # If there's a match, the scanner advances the "scan pointer" and returns the matched string. # Otherwise, the scanner returns nil. # - # CSV::InputsScanner.rest() returns the “rest” of the string (i.e. everything after the scan pointer). + # CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer). # If there is no more data (eos? = true), it returns "". # class InputsScanner @@ -96,11 +95,13 @@ class CSV end def each_line(row_separator) + return enum_for(__method__, row_separator) unless block_given? 
buffer = nil input = @scanner.rest position = @scanner.pos offset = 0 n_row_separator_chars = row_separator.size + # trace(__method__, :start, line, input) while true input.each_line(row_separator) do |line| @scanner.pos += line.bytesize @@ -140,25 +141,28 @@ class CSV end def scan(pattern) + # trace(__method__, pattern, :start) value = @scanner.scan(pattern) + # trace(__method__, pattern, :done, :last, value) if @last_scanner return value if @last_scanner - if value - read_chunk if @scanner.eos? - return value - else - nil - end + read_chunk if value and @scanner.eos? + # trace(__method__, pattern, :done, value) + value end def scan_all(pattern) + # trace(__method__, pattern, :start) value = @scanner.scan(pattern) + # trace(__method__, pattern, :done, :last, value) if @last_scanner return value if @last_scanner return nil if value.nil? while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern)) + # trace(__method__, pattern, :sub, sub_value) value << sub_value end + # trace(__method__, pattern, :done, value) value end @@ -167,68 +171,126 @@ class CSV end def keep_start - @keeps.push([@scanner.pos, nil]) + # trace(__method__, :start) + adjust_last_keep + @keeps.push([@scanner, @scanner.pos, nil]) + # trace(__method__, :done) end def keep_end - start, buffer = @keeps.pop - keep = @scanner.string.byteslice(start, @scanner.pos - start) + # trace(__method__, :start) + scanner, start, buffer = @keeps.pop + if scanner == @scanner + keep = @scanner.string.byteslice(start, @scanner.pos - start) + else + keep = @scanner.string.byteslice(0, @scanner.pos) + end if buffer buffer << keep keep = buffer end + # trace(__method__, :done, keep) keep end def keep_back - start, buffer = @keeps.pop + # trace(__method__, :start) + scanner, start, buffer = @keeps.pop if buffer + # trace(__method__, :rescan, start, buffer) string = @scanner.string - keep = string.byteslice(start, string.bytesize - start) + if scanner == @scanner + keep = string.byteslice(start, 
string.bytesize - start) + else + keep = string + end if keep and not keep.empty? @inputs.unshift(StringIO.new(keep)) @last_scanner = false end @scanner = StringScanner.new(buffer) else + if @scanner != scanner + message = "scanners are different but no buffer: " + message += "#{@scanner.inspect}(#{@scanner.object_id}): " + message += "#{scanner.inspect}(#{scanner.object_id})" + raise UnexpectedError, message + end + # trace(__method__, :repos, start, buffer) @scanner.pos = start end read_chunk if @scanner.eos? end def keep_drop - @keeps.pop + _, _, buffer = @keeps.pop + # trace(__method__, :done, :empty) unless buffer + return unless buffer + + last_keep = @keeps.last + # trace(__method__, :done, :no_last_keep) unless last_keep + return unless last_keep + + if last_keep[2] + last_keep[2] << buffer + else + last_keep[2] = buffer + end + # trace(__method__, :done) end def rest @scanner.rest end + def check(pattern) + @scanner.check(pattern) + end + private - def read_chunk - return false if @last_scanner + def trace(*args) + pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps]) + end - unless @keeps.empty? - keep = @keeps.last - keep_start = keep[0] - string = @scanner.string - keep_data = string.byteslice(keep_start, @scanner.pos - keep_start) - if keep_data - keep_buffer = keep[1] - if keep_buffer - keep_buffer << keep_data - else - keep[1] = keep_data.dup - end + def adjust_last_keep + # trace(__method__, :start) + + keep = @keeps.last + # trace(__method__, :done, :empty) if keep.nil? + return if keep.nil? + + scanner, start, buffer = keep + string = @scanner.string + if @scanner != scanner + start = 0 + end + if start == 0 and @scanner.eos? 
+ keep_data = string + else + keep_data = string.byteslice(start, @scanner.pos - start) + end + if keep_data + if buffer + buffer << keep_data + else + keep[2] = keep_data.dup end - keep[0] = 0 end + # trace(__method__, :done) + end + + def read_chunk + return false if @last_scanner + + adjust_last_keep + input = @inputs.first case input when StringIO string = input.read raise InvalidEncoding unless string.valid_encoding? + # trace(__method__, :stringio, string) @scanner = StringScanner.new(string) @inputs.shift @last_scanner = @inputs.empty? @@ -237,6 +299,7 @@ class CSV chunk = input.gets(@row_separator, @chunk_size) if chunk raise InvalidEncoding unless chunk.valid_encoding? + # trace(__method__, :chunk, chunk) @scanner = StringScanner.new(chunk) if input.respond_to?(:eof?) and input.eof? @inputs.shift @@ -244,6 +307,7 @@ class CSV end true else + # trace(__method__, :no_chunk) @scanner = StringScanner.new("".encode(@encoding)) @inputs.shift @last_scanner = @inputs.empty? @@ -278,7 +342,11 @@ class CSV end def field_size_limit - @field_size_limit + @max_field_size&.succ + end + + def max_field_size + @max_field_size end def skip_lines @@ -346,6 +414,16 @@ class CSV end message = "Invalid byte sequence in #{@encoding}" raise MalformedCSVError.new(message, lineno) + rescue UnexpectedError => error + if @scanner + ignore_broken_line + lineno = @lineno + else + lineno = @lineno + 1 + end + message = "This should not be happen: #{error.message}: " + message += "Please report this to https://github.com/ruby/csv/issues" + raise MalformedCSVError.new(message, lineno) end end @@ -390,7 +468,7 @@ class CSV @backslash_quote = false end @unconverted_fields = @options[:unconverted_fields] - @field_size_limit = @options[:field_size_limit] + @max_field_size = @options[:max_field_size] @skip_blanks = @options[:skip_blanks] @fields_converter = @options[:fields_converter] @header_fields_converter = @options[:header_fields_converter] @@ -680,9 
+758,10 @@ class CSV case headers when Array @raw_headers = headers + quoted_fields = [false] * @raw_headers.size @use_headers = true when String - @raw_headers = parse_headers(headers) + @raw_headers, quoted_fields = parse_headers(headers) @use_headers = true when nil, false @raw_headers = nil @@ -692,21 +771,28 @@ class CSV @use_headers = true end if @raw_headers - @headers = adjust_headers(@raw_headers) + @headers = adjust_headers(@raw_headers, quoted_fields) else @headers = nil end end def parse_headers(row) - CSV.parse_line(row, - col_sep: @column_separator, - row_sep: @row_separator, - quote_char: @quote_character) + quoted_fields = [] + converter = lambda do |field, info| + quoted_fields << info.quoted? + field + end + headers = CSV.parse_line(row, + col_sep: @column_separator, + row_sep: @row_separator, + quote_char: @quote_character, + converters: [converter]) + [headers, quoted_fields] end - def adjust_headers(headers) - adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno) + def adjust_headers(headers, quoted_fields) + adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields) adjusted_headers.each {|h| h.freeze if h.is_a? String} adjusted_headers end @@ -729,28 +815,28 @@ class CSV sample[0, 128].index(@quote_character) end - SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes") - if SCANNER_TEST - class UnoptimizedStringIO - def initialize(string) - @io = StringIO.new(string, "rb:#{string.encoding}") - end + class UnoptimizedStringIO # :nodoc: + def initialize(string) + @io = StringIO.new(string, "rb:#{string.encoding}") + end - def gets(*args) - @io.gets(*args) - end + def gets(*args) + @io.gets(*args) + end - def each_line(*args, &block) - @io.each_line(*args, &block) - end + def each_line(*args, &block) + @io.each_line(*args, &block) + end - def eof? - @io.eof? - end + def eof? + @io.eof? 
end + end - SCANNER_TEST_CHUNK_SIZE = - Integer((ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"), 10) + SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes") + if SCANNER_TEST + SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE" + SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME] def build_scanner inputs = @samples.collect do |sample| UnoptimizedStringIO.new(sample) @@ -760,10 +846,17 @@ class CSV else inputs << @input end + begin + chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME] + rescue # Ractor::IsolationError + # Ractor on Ruby 3.0 can't read ENV value. + chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE + end + chunk_size = Integer((chunk_size_value || "1"), 10) InputsScanner.new(inputs, @encoding, @row_separator, - chunk_size: SCANNER_TEST_CHUNK_SIZE) + chunk_size: chunk_size) end else def build_scanner @@ -826,6 +919,14 @@ class CSV end end + def validate_field_size(field) + return unless @max_field_size + return if field.size <= @max_field_size + ignore_broken_line + message = "Field size exceeded: #{field.size} > #{@max_field_size}" + raise MalformedCSVError.new(message, @lineno) + end + def parse_no_quote(&block) @scanner.each_line(@row_separator) do |line| next if @skip_lines and skip_line?(line) @@ -835,9 +936,16 @@ class CSV if line.empty? 
next if @skip_blanks row = [] + quoted_fields = [] else line = strip_value(line) row = line.split(@split_column_separator, -1) + quoted_fields = [false] * row.size + if @max_field_size + row.each do |column| + validate_field_size(column) + end + end n_columns = row.size i = 0 while i < n_columns @@ -846,7 +954,7 @@ class CSV end end @last_line = original_line - emit_row(row, &block) + emit_row(row, quoted_fields, &block) end end @@ -868,31 +976,37 @@ class CSV next end row = [] + quoted_fields = [] elsif line.include?(@cr) or line.include?(@lf) @scanner.keep_back @need_robust_parsing = true return parse_quotable_robust(&block) else row = line.split(@split_column_separator, -1) + quoted_fields = [] n_columns = row.size i = 0 while i < n_columns column = row[i] if column.empty? + quoted_fields << false row[i] = nil else n_quotes = column.count(@quote_character) if n_quotes.zero? + quoted_fields << false # no quote elsif n_quotes == 2 and column.start_with?(@quote_character) and column.end_with?(@quote_character) + quoted_fields << true row[i] = column[1..-2] else @scanner.keep_back @need_robust_parsing = true return parse_quotable_robust(&block) end + validate_field_size(row[i]) end i += 1 end @@ -900,13 +1014,14 @@ class CSV @scanner.keep_drop @scanner.keep_start @last_line = original_line - emit_row(row, &block) + emit_row(row, quoted_fields, &block) end @scanner.keep_drop end def parse_quotable_robust(&block) row = [] + quoted_fields = [] skip_needless_lines start_row while true @@ -916,32 +1031,39 @@ class CSV value = parse_column_value if value @scanner.scan_all(@strip_value) if @strip_value - if @field_size_limit and value.size >= @field_size_limit - ignore_broken_line - raise MalformedCSVError.new("Field size exceeded", @lineno) - end + validate_field_size(value) end if parse_column_end row << value + quoted_fields << @quoted_column_value elsif parse_row_end if row.empty? and value.nil? 
- emit_row([], &block) unless @skip_blanks + emit_row([], [], &block) unless @skip_blanks else row << value - emit_row(row, &block) + quoted_fields << @quoted_column_value + emit_row(row, quoted_fields, &block) row = [] + quoted_fields = [] end skip_needless_lines start_row elsif @scanner.eos? break if row.empty? and value.nil? row << value - emit_row(row, &block) + quoted_fields << @quoted_column_value + emit_row(row, quoted_fields, &block) break else if @quoted_column_value + if liberal_parsing? and (new_line = @scanner.check(@line_end)) + message = + "Illegal end-of-line sequence outside of a quoted field " + + "<#{new_line.inspect}>" + else + message = "Any value after quoted field isn't allowed" + end ignore_broken_line - message = "Any value after quoted field isn't allowed" raise MalformedCSVError.new(message, @lineno) elsif @unquoted_column_value and (new_line = @scanner.scan(@line_end)) @@ -1034,7 +1156,7 @@ class CSV if (n_quotes % 2).zero? quotes[0, (n_quotes - 2) / 2] else - value = quotes[0, (n_quotes - 1) / 2] + value = quotes[0, n_quotes / 2] while true quoted_value = @scanner.scan_all(@quoted_value) value << quoted_value if quoted_value @@ -1058,11 +1180,9 @@ class CSV n_quotes = quotes.size if n_quotes == 1 break - elsif (n_quotes % 2) == 1 - value << quotes[0, (n_quotes - 1) / 2] - break else value << quotes[0, n_quotes / 2] + break if (n_quotes % 2) == 1 end end value @@ -1098,18 +1218,15 @@ class CSV def strip_value(value) return value unless @strip - return nil if value.nil? + return value if value.nil? case @strip when String - size = value.size - while value.start_with?(@strip) - size -= 1 - value = value[1, size] + while value.delete_prefix!(@strip) + # do nothing end - while value.end_with?(@strip) - size -= 1 - value = value[0, size] + while value.delete_suffix!(@strip) + # do nothing end else value.strip! 
@@ -1132,22 +1249,22 @@ class CSV @scanner.keep_start end - def emit_row(row, &block) + def emit_row(row, quoted_fields, &block) @lineno += 1 raw_row = row if @use_headers if @headers.nil? - @headers = adjust_headers(row) + @headers = adjust_headers(row, quoted_fields) return unless @return_headers row = Row.new(@headers, row, true) else row = Row.new(@headers, - @fields_converter.convert(raw_row, @headers, @lineno)) + @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields)) end else # convert fields, if needed... - row = @fields_converter.convert(raw_row, nil, @lineno) + row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields) end # inject unconverted fields and accessor, if requested... |