diff options
author | kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2018-12-23 07:00:35 +0000 |
---|---|---|
committer | kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2018-12-23 07:00:35 +0000 |
commit | e5d634260e7927db284fd7d2d656899443f5c53e (patch) | |
tree | 31f579715ae8c73ee8094c258b634f1186a0946a /lib/csv/parser.rb | |
parent | c20a1946a6d7b260f1f0f3038b7af081174d6cd9 (diff) |
Import CSV 3.0.2
This includes performance improvements, especially for writing: writing is
about 2 times faster.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@66507 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/csv/parser.rb')
-rw-r--r-- | lib/csv/parser.rb | 713 |
1 files changed, 713 insertions, 0 deletions
# frozen_string_literal: true

# lib/csv/parser.rb -- the parsing engine used by CSV (imported with CSV 3.0.2).

require "strscan"

require_relative "match_p"
require_relative "row"
require_relative "table"

using CSV::MatchP if CSV.const_defined?(:MatchP)

class CSV
  # Parses CSV text read from a StringIO or any IO-like object that responds
  # to +gets+, and yields each row to the block given to #parse.
  class Parser
    # Raised internally when a chunk of input is not valid in its encoding.
    # #parse converts it into a MalformedCSVError.
    class InvalidEncoding < StandardError
    end

    # Scanner used when the whole input is available as a single String.
    #
    # On top of StringScanner it maintains a stack of "keep" positions so the
    # parser can remember where a row started (#keep_start), fetch the text
    # scanned since then (#keep_end), rewind to it (#keep_back) or forget it
    # (#keep_drop).
    class Scanner < StringScanner
      # The whole input is in memory, so a single scan already sees
      # everything that can match.
      alias_method :scan_all, :scan

      def initialize(*args)
        super
        @keeps = []
      end

      # Remembers the current scan position.
      def keep_start
        @keeps.push(pos)
      end

      # Returns the text scanned since the most recent #keep_start and
      # forgets that position.
      def keep_end
        start = @keeps.pop
        # StringScanner#pos is a byte offset, so the slice must be made in
        # bytes; String#[] would count characters and return the wrong text
        # for multibyte data.
        string.byteslice(start, pos - start)
      end

      # Rewinds to the most recent #keep_start position and forgets it.
      def keep_back
        self.pos = @keeps.pop
      end

      # Forgets the most recent #keep_start position without rewinding.
      def keep_drop
        @keeps.pop
      end
    end

    # Scanner that reads its data lazily, chunk by chunk, from a list of
    # inputs (StringIO objects and/or IO-like objects responding to +gets+).
    #
    # It offers the same scan/keep interface as Scanner. Because a kept
    # region can span several chunks, each entry of @keeps is a pair
    # <tt>[start_byte_offset_in_current_chunk, buffer_of_earlier_chunks]</tt>.
    class InputsScanner
      def initialize(inputs, encoding, chunk_size: 8192)
        @inputs = inputs.dup
        @encoding = encoding
        @chunk_size = chunk_size
        @last_scanner = @inputs.empty?
        @keeps = []
        read_chunk
      end

      # Scans +pattern+ in the current chunk only. When the match consumes
      # the chunk, the next chunk is loaded so that #eos? stays meaningful.
      def scan(pattern)
        value = @scanner.scan(pattern)
        return value if @last_scanner

        if value
          read_chunk if @scanner.eos?
          return value
        else
          nil
        end
      end

      # Scans +pattern+ and keeps extending the match over following chunks
      # for as long as each chunk is consumed completely.
      def scan_all(pattern)
        value = @scanner.scan(pattern)
        return value if @last_scanner

        return nil if value.nil?
        while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
          value << sub_value
        end
        value
      end

      def eos?
        @scanner.eos?
      end

      # Remembers the current position. The buffer slot starts as nil and is
      # filled by #read_chunk if the kept region outlives the current chunk.
      def keep_start
        @keeps.push([@scanner.pos, nil])
      end

      # Returns the text scanned since the most recent #keep_start (including
      # any part held in earlier chunks) and forgets that position.
      def keep_end
        start, buffer = @keeps.pop
        # Positions are byte offsets, so slice in bytes.
        keep = @scanner.string.byteslice(start, @scanner.pos - start)
        if buffer
          buffer << keep
          keep = buffer
        end
        keep
      end

      # Rewinds to the most recent #keep_start position and forgets it.
      def keep_back
        start, buffer = @keeps.pop
        if buffer
          # The kept region started in an earlier chunk: scan the buffered
          # text again and push the rest of the current chunk back onto the
          # input queue so it is read after the buffer.
          string = @scanner.string
          keep = string.byteslice(start, string.bytesize - start)
          if keep and not keep.empty?
            @inputs.unshift(StringIO.new(keep))
            @last_scanner = false
          end
          @scanner = StringScanner.new(buffer)
        else
          @scanner.pos = start
        end
      end

      # Forgets the most recent #keep_start position without rewinding.
      def keep_drop
        @keeps.pop
      end

      # Returns the unscanned remainder of the current chunk.
      def rest
        @scanner.rest
      end

      private
      # Loads the next chunk into @scanner.
      #
      # Returns true when a new chunk was loaded and false when no input is
      # left. Raises InvalidEncoding when the chunk is not valid in its
      # encoding.
      def read_chunk
        return false if @last_scanner

        unless @keeps.empty?
          # Save the kept part of the chunk that is about to be replaced so
          # #keep_end and #keep_back can still reach it.
          keep = @keeps.last
          keep_start = keep[0]
          string = @scanner.string
          # Positions are byte offsets, so slice in bytes.
          keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
          if keep_data
            keep_buffer = keep[1]
            if keep_buffer
              keep_buffer << keep_data
            else
              keep[1] = keep_data.dup
            end
          end
          # The kept region continues from the start of the next chunk.
          keep[0] = 0
        end

        input = @inputs.first
        case input
        when StringIO
          # StringIO inputs are consumed in one go.
          string = input.string
          raise InvalidEncoding unless string.valid_encoding?
          @scanner = StringScanner.new(string)
          @inputs.shift
          @last_scanner = @inputs.empty?
          true
        else
          chunk = input.gets(nil, @chunk_size)
          if chunk
            raise InvalidEncoding unless chunk.valid_encoding?
            @scanner = StringScanner.new(chunk)
            if input.respond_to?(:eof?) and input.eof?
              @inputs.shift
              @last_scanner = @inputs.empty?
            end
            true
          else
            # This input is exhausted: continue with the next one, if any.
            @scanner = StringScanner.new("".encode(@encoding))
            @inputs.shift
            @last_scanner = @inputs.empty?
            if @last_scanner
              false
            else
              read_chunk
            end
          end
        end
      end
    end

    # +input+ is a StringIO or an IO-like object responding to +gets+.
    # +options+ is a Hash; the keys read by this class are :encoding,
    # :liberal_parsing, :unconverted_fields, :field_size_limit, :skip_blanks,
    # :fields_converter, :header_fields_converter, :column_separator,
    # :row_separator, :quote_character, :skip_lines, :return_headers and
    # :headers.
    def initialize(input, options)
      @input = input
      @options = options
      @samples = []

      prepare
    end

    # Resolved settings and parse state, exposed read-only.
    attr_reader :column_separator
    attr_reader :row_separator
    attr_reader :quote_character
    attr_reader :field_size_limit
    attr_reader :skip_lines
    attr_reader :headers
    attr_reader :lineno

    def unconverted_fields?
      @unconverted_fields
    end

    # True while headers are in use but have not been read from the data yet,
    # i.e. the next row is the header row.
    def header_row?
      @use_headers and @headers.nil?
    end

    def return_headers?
      @return_headers
    end

    def skip_blanks?
      @skip_blanks
    end

    def liberal_parsing?
      @liberal_parsing
    end

    # Returns the raw text of the most recently parsed row.
    def line
      last_line
    end

    # Parses the input and yields each row to the block. Returns an
    # Enumerator when no block is given.
    #
    # Raises MalformedCSVError on invalid CSV or invalid byte sequences.
    def parse(&block)
      return to_enum(__method__) unless block_given?

      # Headers supplied through options are emitted before any data row.
      if @return_headers and @headers
        headers = Row.new(@headers, @raw_headers, true)
        if @unconverted_fields
          headers = add_unconverted_fields(headers, [])
        end
        yield headers
      end

      row = []
      begin
        @scanner = build_scanner
        skip_needless_lines
        start_row
        while true
          @quoted_column_value = false
          @unquoted_column_value = false
          value = parse_column_value
          if value and @field_size_limit and value.size >= @field_size_limit
            raise MalformedCSVError.new("Field size exceeded", @lineno + 1)
          end
          if parse_column_end
            row << value
          elsif parse_row_end
            if row.empty? and value.nil?
              emit_row([], &block) unless @skip_blanks
            else
              row << value
              emit_row(row, &block)
              row = []
            end
            skip_needless_lines
            start_row
          elsif @scanner.eos?
            # End of input: emit the last row unless it is entirely empty.
            return if row.empty? and value.nil?
            row << value
            emit_row(row, &block)
            return
          else
            # Neither a separator nor the end of input follows the value:
            # the data is malformed. Pick the most specific message.
            if @quoted_column_value
              message = "Do not allow except col_sep_split_separator " +
                "after quoted fields"
              raise MalformedCSVError.new(message, @lineno + 1)
            elsif @unquoted_column_value and @scanner.scan(@cr_or_lf)
              message = "Unquoted fields do not allow \\r or \\n"
              raise MalformedCSVError.new(message, @lineno + 1)
            elsif @scanner.rest.start_with?(@quote_character)
              message = "Illegal quoting"
              raise MalformedCSVError.new(message, @lineno + 1)
            else
              raise MalformedCSVError.new("TODO: Meaningful message",
                                          @lineno + 1)
            end
          end
        end
      rescue InvalidEncoding
        message = "Invalid byte sequence in #{@encoding}"
        raise MalformedCSVError.new(message, @lineno + 1)
      end
    end

    private
    # Resolves all options into instance state. The order matters: the
    # regexps need the encoding, and the header setup needs the resolved
    # separators.
    def prepare
      prepare_variable
      prepare_regexp
      prepare_line
      prepare_header
      prepare_parser
    end

    def prepare_variable
      @encoding = @options[:encoding]
      @liberal_parsing = @options[:liberal_parsing]
      @unconverted_fields = @options[:unconverted_fields]
      @field_size_limit = @options[:field_size_limit]
      @skip_blanks = @options[:skip_blanks]
      @fields_converter = @options[:fields_converter]
      @header_fields_converter = @options[:header_fields_converter]
    end

    # Builds the separators, the quote character and every regexp used by the
    # scanner, all encoded in @encoding.
    def prepare_regexp
      @column_separator = @options[:column_separator].to_s.encode(@encoding)
      @row_separator =
        resolve_row_separator(@options[:row_separator]).encode(@encoding)
      @quote_character = @options[:quote_character].to_s.encode(@encoding)
      if @quote_character.length != 1
        raise ArgumentError, ":quote_char has to be a single character String"
      end

      escaped_column_separator = Regexp.escape(@column_separator)
      escaped_row_separator = Regexp.escape(@row_separator)
      escaped_quote_character = Regexp.escape(@quote_character)

      skip_lines = @options[:skip_lines]
      case skip_lines
      when String
        @skip_lines = skip_lines.encode(@encoding)
      when Regexp, nil
        @skip_lines = skip_lines
      else
        unless skip_lines.respond_to?(:match)
          message =
            ":skip_lines has to respond to \#match: #{skip_lines.inspect}"
          raise ArgumentError, message
        end
        @skip_lines = skip_lines
      end

      @column_end = Regexp.new(escaped_column_separator)
      if @column_separator.size > 1
        # Per-character patterns let a multi-character separator be matched
        # even when it is split across two input chunks.
        @column_ends = @column_separator.each_char.collect do |char|
          Regexp.new(Regexp.escape(char))
        end
      else
        @column_ends = nil
      end
      @row_end = Regexp.new(escaped_row_separator)
      if @row_separator.size > 1
        @row_ends = @row_separator.each_char.collect do |char|
          Regexp.new(Regexp.escape(char))
        end
      else
        @row_ends = nil
      end
      @quotes = Regexp.new(escaped_quote_character +
                           "+".encode(@encoding))
      @quoted_value = Regexp.new("[^".encode(@encoding) +
                                 escaped_quote_character +
                                 "]+".encode(@encoding))
      if @liberal_parsing
        # Liberal parsing accepts quote characters inside unquoted values.
        @unquoted_value = Regexp.new("[^".encode(@encoding) +
                                     escaped_column_separator +
                                     "\r\n]+".encode(@encoding))
      else
        @unquoted_value = Regexp.new("[^".encode(@encoding) +
                                     escaped_quote_character +
                                     escaped_column_separator +
                                     "\r\n]+".encode(@encoding))
      end
      @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
      @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
    end

    # Returns the row separator as a String in @encoding. With +:auto+ the
    # separator is detected from the data; when nothing can be detected
    # <tt>$INPUT_RECORD_SEPARATOR</tt> is used.
    #
    # Data read from a non-StringIO input during detection is stored in
    # @samples so it can be parsed later.
    def resolve_row_separator(separator)
      if separator == :auto
        cr = "\r".encode(@encoding)
        lf = "\n".encode(@encoding)
        if @input.is_a?(StringIO)
          # Only look at what has not been read yet; the read position of
          # the StringIO is left untouched.
          separator = detect_row_separator(unread_string(@input), cr, lf)
        elsif @input.respond_to?(:gets)
          if @input.is_a?(File)
            chunk_size = 32 * 1024
          else
            chunk_size = 1024
          end
          begin
            while separator == :auto
              #
              # if we run out of data, it's probably a single line
              # (the default is applied below)
              #
              break unless sample = @input.gets(nil, chunk_size)

              # extend sample if we're unsure of the line ending
              if sample.end_with?(cr)
                sample << (@input.gets(nil, 1) || "")
              end

              @samples << sample

              separator = detect_row_separator(sample, cr, lf)
            end
          rescue IOError
            # do nothing: the default is applied below
          end
        end
        separator = $INPUT_RECORD_SEPARATOR if separator == :auto
      end
      separator.to_s.encode(@encoding)
    end

    # Returns the first line ending found in +sample+ (+cr+ + +lf+, +cr+ or
    # +lf+), or +:auto+ when +sample+ contains none.
    def detect_row_separator(sample, cr, lf)
      lf_index = sample.index(lf)
      if lf_index
        cr_index = sample[0, lf_index].index(cr)
      else
        cr_index = sample.index(cr)
      end
      if cr_index and lf_index
        if cr_index + 1 == lf_index
          cr + lf
        elsif cr_index < lf_index
          cr
        else
          lf
        end
      elsif cr_index
        cr
      elsif lf_index
        lf
      else
        :auto
      end
    end

    def prepare_line
      @lineno = 0
      @last_line = nil
      @scanner = nil
    end

    # Returns the raw text of the current row. The text is taken from the
    # scanner's keep stack once and cached; #start_row clears the cache.
    def last_line
      if @scanner
        @last_line ||= @scanner.keep_end
      else
        @last_line
      end
    end

    # Interprets the :headers option: an Array is used as is, a String is
    # parsed as one CSV line, nil/false disables headers and any other value
    # means the first parsed row supplies the headers.
    def prepare_header
      @return_headers = @options[:return_headers]

      headers = @options[:headers]
      case headers
      when Array
        @raw_headers = headers
        @use_headers = true
      when String
        @raw_headers = parse_headers(headers)
        @use_headers = true
      when nil, false
        @raw_headers = nil
        @use_headers = false
      else
        @raw_headers = nil
        @use_headers = true
      end
      if @raw_headers
        @headers = adjust_headers(@raw_headers)
      else
        @headers = nil
      end
    end

    def parse_headers(row)
      CSV.parse_line(row,
                     col_sep: @column_separator,
                     row_sep: @row_separator,
                     quote_char: @quote_character)
    end

    # Runs the header converters over +headers+ and freezes the resulting
    # String headers.
    def adjust_headers(headers)
      adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno)
      adjusted_headers.each {|h| h.freeze if h.is_a? String}
      adjusted_headers
    end

    def prepare_parser
      @may_quoted = may_quoted?
    end

    # Heuristic: truthy when a quote character occurs within the first 128
    # characters of the available data. #parse_column_value uses it to decide
    # whether to try quoted or unquoted values first.
    def may_quoted?
      if @input.is_a?(StringIO)
        sample = unread_string(@input)
      else
        return false if @samples.empty?
        sample = @samples.first
      end
      sample[0, 128].index(@quote_character)
    end

    # Returns the part of +string_io+'s underlying String that has not been
    # read yet, without moving the read position. When the position is 0 the
    # underlying String itself is returned, so no copy is made.
    def unread_string(string_io)
      string = string_io.string
      pos = string_io.pos
      return string if pos.zero?

      # StringIO#pos is a byte offset. byteslice returns nil when the
      # position lies beyond the end of the String.
      string.byteslice(pos, string.bytesize - pos) || "".encode(@encoding)
    end

    SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
    if SCANNER_TEST
      # Test mode: always use InputsScanner with one-byte chunks so the
      # chunk boundary handling is exercised.
      class UnoptimizedStringIO
        def initialize(string)
          @io = StringIO.new(string)
        end

        def gets(*args)
          @io.gets(*args)
        end

        def eof?
          @io.eof?
        end
      end

      def build_scanner
        inputs = @samples.collect do |sample|
          UnoptimizedStringIO.new(sample)
        end
        if @input.is_a?(StringIO)
          inputs << UnoptimizedStringIO.new(unread_string(@input))
        else
          inputs << @input
        end
        InputsScanner.new(inputs, @encoding, chunk_size: 1)
      end
    else
      # Builds the scanner for #parse: a plain Scanner when all data is
      # already in memory, otherwise an InputsScanner over the samples read
      # during row separator detection followed by the input itself.
      def build_scanner
        string = nil
        if @samples.empty? and @input.is_a?(StringIO)
          string = unread_string(@input)
        elsif @samples.size == 1 and @input.respond_to?(:eof?) and @input.eof?
          string = @samples[0]
        end
        if string
          unless string.valid_encoding?
            message = "Invalid byte sequence in #{@encoding}"
            raise MalformedCSVError.new(message, @lineno + 1)
          end
          Scanner.new(string)
        else
          inputs = @samples.collect do |sample|
            StringIO.new(sample)
          end
          inputs << @input
          InputsScanner.new(inputs, @encoding)
        end
      end
    end

    # Consumes leading lines that match :skip_lines and stops, with the
    # scanner rewound, at the first line that does not.
    def skip_needless_lines
      return unless @skip_lines

      while true
        @scanner.keep_start
        line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
        line << @row_separator if parse_row_end
        if skip_line?(line)
          @scanner.keep_drop
        else
          @scanner.keep_back
          return
        end
      end
    end

    def skip_line?(line)
      case @skip_lines
      when String
        line.include?(@skip_lines)
      when Regexp
        @skip_lines.match?(line)
      else
        @skip_lines.match(line)
      end
    end

    # Parses one column value and returns it, or nil when there is none at
    # the current position.
    def parse_column_value
      if @liberal_parsing
        quoted_value = parse_quoted_column_value
        if quoted_value
          unquoted_value = parse_unquoted_column_value
          if unquoted_value
            # Text follows the closing quote: keep the quotes literally.
            @quote_character + quoted_value + @quote_character + unquoted_value
          else
            quoted_value
          end
        else
          parse_unquoted_column_value
        end
      elsif @may_quoted
        parse_quoted_column_value ||
          parse_unquoted_column_value
      else
        parse_unquoted_column_value ||
          parse_quoted_column_value
      end
    end

    def parse_unquoted_column_value
      value = @scanner.scan_all(@unquoted_value)
      @unquoted_column_value = true if value
      value
    end

    # Parses a quoted value. Returns the unescaped content, or nil when the
    # scanner is not at a quote character. Raises MalformedCSVError when the
    # closing quote is missing.
    def parse_quoted_column_value
      quotes = @scanner.scan_all(@quotes)
      return nil unless quotes

      @quoted_column_value = true
      n_quotes = quotes.size
      if (n_quotes % 2).zero?
        # An even run is an opening quote, escaped quote pairs and the
        # closing quote: the value consists of the escaped quotes only.
        quotes[0, (n_quotes - 2) / 2]
      else
        # An odd run is an opening quote followed by escaped quote pairs.
        value = quotes[0, (n_quotes - 1) / 2]
        while true
          quoted_value = @scanner.scan_all(@quoted_value)
          value << quoted_value if quoted_value
          quotes = @scanner.scan_all(@quotes)
          unless quotes
            message = "Unclosed quoted field"
            raise MalformedCSVError.new(message, @lineno + 1)
          end
          n_quotes = quotes.size
          if n_quotes == 1
            break
          elsif (n_quotes % 2) == 1
            # Escaped quote pairs followed by the closing quote.
            value << quotes[0, (n_quotes - 1) / 2]
            break
          else
            # Escaped quote pairs only: the value continues.
            value << quotes[0, n_quotes / 2]
          end
        end
        value
      end
    end

    # Consumes a column separator. Returns true when one was consumed.
    def parse_column_end
      return true if @scanner.scan(@column_end)
      return false unless @column_ends

      # The separator may be split across chunks: match it character by
      # character and rewind when it does not match completely.
      @scanner.keep_start
      if @column_ends.all? {|column_end| @scanner.scan(column_end)}
        @scanner.keep_drop
        true
      else
        @scanner.keep_back
        false
      end
    end

    # Consumes a row separator. Returns true when one was consumed.
    def parse_row_end
      return true if @scanner.scan(@row_end)
      return false unless @row_ends
      @scanner.keep_start
      if @row_ends.all? {|row_end| @scanner.scan(row_end)}
        @scanner.keep_drop
        true
      else
        @scanner.keep_back
        false
      end
    end

    # Marks the start of a new row so #last_line can return its raw text.
    def start_row
      if @last_line
        # #last_line already consumed the previous keep.
        @last_line = nil
      else
        @scanner.keep_drop
      end
      @scanner.keep_start
    end

    # Converts +row+ and yields it. When headers are read from the data, the
    # first row becomes the headers and is only yielded if :return_headers
    # is set.
    def emit_row(row, &block)
      @lineno += 1

      raw_row = row
      if @use_headers
        if @headers.nil?
          @headers = adjust_headers(row)
          return unless @return_headers
          row = Row.new(@headers, row, true)
        else
          row = Row.new(@headers,
                        @fields_converter.convert(raw_row, @headers, @lineno))
        end
      else
        # convert fields, if needed...
        row = @fields_converter.convert(raw_row, nil, @lineno)
      end

      # inject unconverted fields and accessor, if requested...
      if @unconverted_fields and not row.respond_to?(:unconverted_fields)
        add_unconverted_fields(row, raw_row)
      end

      yield(row)
    end

    # This method injects an instance variable <tt>unconverted_fields</tt> into
    # +row+ and an accessor method for +row+ called unconverted_fields(). The
    # variable is set to the contents of +fields+.
    def add_unconverted_fields(row, fields)
      class << row
        attr_reader :unconverted_fields
      end
      row.instance_variable_set(:@unconverted_fields, fields)
      row
    end
  end
end