diff options
Diffstat (limited to 'test/csv/test_encodings.rb')
-rw-r--r-- | test/csv/test_encodings.rb | 403 |
1 files changed, 0 insertions, 403 deletions
diff --git a/test/csv/test_encodings.rb b/test/csv/test_encodings.rb deleted file mode 100644 index 55a7a60f2e..0000000000 --- a/test/csv/test_encodings.rb +++ /dev/null @@ -1,403 +0,0 @@ -# -*- coding: utf-8 -*- -# frozen_string_literal: false - -require_relative "helper" - -class TestCSVEncodings < Test::Unit::TestCase - extend DifferentOFS - include CSVHelper - - def setup - super - require 'tempfile' - @temp_csv_file = Tempfile.new(%w"test_csv. .csv") - @temp_csv_path = @temp_csv_file.path - @temp_csv_file.close - end - - def teardown - @temp_csv_file.close! - super - end - - ######################################## - ### Hand Test Some Popular Encodings ### - ######################################## - - def test_parses_utf8_encoding - assert_parses( [ %w[ one two … ], - %w[ 1 … 3 ], - %w[ … 5 6 ] ], "UTF-8" ) - end - - def test_parses_latin1_encoding - assert_parses( [ %w[ one two Résumé ], - %w[ 1 Résumé 3 ], - %w[ Résumé 5 6 ] ], "ISO-8859-1" ) - end - - def test_parses_utf16be_encoding - assert_parses( [ %w[ one two … ], - %w[ 1 … 3 ], - %w[ … 5 6 ] ], "UTF-16BE" ) - end - - def test_parses_shift_jis_encoding - assert_parses( [ %w[ 一 二 三 ], - %w[ 四 五 六 ], - %w[ 七 八 九 ] ], "Shift_JIS" ) - end - - ########################################################### - ### Try Simple Reading for All Non-dummy Ruby Encodings ### - ########################################################### - - def test_reading_with_most_encodings - each_encoding do |encoding| - begin - assert_parses( [ %w[ abc def ], - %w[ ghi jkl ] ], encoding ) - rescue Encoding::ConverterNotFoundError - fail("Failed to support #{encoding.name}.") - end - end - end - - def test_regular_expression_escaping - each_encoding do |encoding| - begin - assert_parses( [ %w[ abc def ], - %w[ ghi jkl ] ], encoding, col_sep: "|" ) - rescue Encoding::ConverterNotFoundError - fail("Failed to properly escape #{encoding.name}.") - end - end - end - - def test_read_with_default_encoding - data = "abc" - default_external = Encoding.default_external - each_encoding do |encoding| - File.open(@temp_csv_path, "wb", encoding: encoding) {|f| f << data} - begin - no_warnings do - Encoding.default_external = encoding - end - result = CSV.read(@temp_csv_path)[0][0] - ensure - no_warnings do - Encoding.default_external = default_external - end - end - assert_equal(encoding, result.encoding) - end - end - - ####################################################################### - ### Stress Test ASCII Compatible and Non-ASCII Compatible Encodings ### - ####################################################################### - - def test_auto_line_ending_detection - # arrange data to place a \r at the end of CSV's read ahead point - encode_for_tests([["a" * 509]], row_sep: "\r\n") do |data| - assert_equal("\r\n".encode(data.encoding), CSV.new(data).row_sep) - end - end - - def test_csv_chars_are_transcoded - encode_for_tests([%w[abc def]]) do |data| - %w[col_sep row_sep quote_char].each do |csv_char| - assert_equal( "|".encode(data.encoding), - CSV.new(data, csv_char.to_sym => "|").send(csv_char) ) - end - end - end - - def test_parser_works_with_encoded_headers - encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| - parsed = CSV.parse(data, headers: true) - assert_all?(parsed.headers, "Wrong data encoding.") {|h| h.encoding == data.encoding} - parsed.each do |row| - assert_all?(row.fields, "Wrong data encoding.") {|f| f.encoding == data.encoding} - end - end - end - - def test_built_in_converters_transcode_to_utf_8_then_convert - encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| - parsed = CSV.parse(data, converters: :integer) - assert_all?(parsed[0], "Wrong data encoding.") {|f| f.encoding == data.encoding} - assert_equal([1, 2, 3], parsed[1]) - end - end - - def test_built_in_header_converters_transcode_to_utf_8_then_convert - encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| - parsed = CSV.parse( data, headers: true, - header_converters: :downcase ) - assert_all?(parsed.headers, "Wrong data encoding.") {|h| h.encoding.name == "UTF-8"} - assert_all?(parsed[0].fields, "Wrong data encoding.") {|f| f.encoding == data.encoding} - end - end - - def test_open_allows_you_to_set_encodings - encode_for_tests([%w[abc def]]) do |data| - # read and write in encoding - File.open(@temp_csv_path, "wb:#{data.encoding.name}") { |f| f << data } - CSV.open(@temp_csv_path, "rb:#{data.encoding.name}") do |csv| - csv.each do |row| - assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding} - end - end - - # read and write with transcoding - File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| - f << data - end - CSV.open(@temp_csv_path, "rb:UTF-32BE:#{data.encoding.name}") do |csv| - csv.each do |row| - assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding} - end - end - end - end - - def test_foreach_allows_you_to_set_encodings - encode_for_tests([%w[abc def]]) do |data| - # read and write in encoding - File.open(@temp_csv_path, "wb", encoding: data.encoding) { |f| f << data } - CSV.foreach(@temp_csv_path, encoding: data.encoding) do |row| - row.each {|f| assert_equal(f.encoding, data.encoding)} - end - - # read and write with transcoding - File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| - f << data - end - CSV.foreach( @temp_csv_path, - encoding: "UTF-32BE:#{data.encoding.name}" ) do |row| - assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding} - end - end - end - - def test_read_allows_you_to_set_encodings - encode_for_tests([%w[abc def]]) do |data| - # read and write in encoding - File.open(@temp_csv_path, "wb:#{data.encoding.name}") { |f| f << data } - rows = CSV.read(@temp_csv_path, encoding: data.encoding.name) - assert_all?(rows.flatten, "Wrong data encoding.") {|f| f.encoding == data.encoding} - - # read and write with transcoding - File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| - f << data - end - rows = CSV.read( @temp_csv_path, - encoding: "UTF-32BE:#{data.encoding.name}" ) - assert_all?(rows.flatten, "Wrong data encoding.") {|f| f.encoding == data.encoding} - end - end - - ################################# - ### Write CSV in any Encoding ### - ################################# - - def test_can_write_csv_in_any_encoding - each_encoding do |encoding| - # test generate_line with encoding hint - begin - csv = %w[abc d|ef].map { |f| f.encode(encoding) }. - to_csv(col_sep: "|", encoding: encoding.name) - rescue Encoding::ConverterNotFoundError - next - end - assert_equal(encoding, csv.encoding) - - # test generate_line with encoding guessing from fields - csv = %w[abc d|ef].map { |f| f.encode(encoding) }.to_csv(col_sep: "|") - assert_equal(encoding, csv.encoding) - - # writing to files - data = encode_ary([%w[abc d,ef], %w[123 456 ]], encoding) - CSV.open(@temp_csv_path, "wb:#{encoding.name}") do |f| - data.each { |row| f << row } - end - assert_equal(data, CSV.read(@temp_csv_path, encoding: encoding.name)) - end - end - - def test_encoding_is_upgraded_during_writing_as_needed - data = ["foo".force_encoding("US-ASCII"), "\u3042"] - assert_equal("US-ASCII", data.first.encoding.name) - assert_equal("UTF-8", data.last.encoding.name) - assert_equal("UTF-8", data.join('').encoding.name) - assert_equal("UTF-8", data.to_csv.encoding.name) - end - - def test_encoding_is_upgraded_for_ascii_content_during_writing_as_needed - data = ["foo".force_encoding("ISO-8859-1"), "\u3042"] - assert_equal("ISO-8859-1", data.first.encoding.name) - assert_equal("UTF-8", data.last.encoding.name) - assert_equal("UTF-8", data.join('').encoding.name) - assert_equal("UTF-8", data.to_csv.encoding.name) - end - - def test_encoding_is_not_upgraded_for_non_ascii_content_during_writing_as_needed - data = ["\u00c0".encode("ISO-8859-1"), "\u3042"] - assert_equal([ - "ISO-8859-1", - "UTF-8", - ], - data.collect {|field| field.encoding.name}) - assert_raise(Encoding::CompatibilityError) do - data.to_csv - end - end - - def test_explicit_encoding - bug9766 = '[ruby-core:62113] [Bug #9766]' - s = CSV.generate(encoding: "Windows-31J") do |csv| - csv << ["foo".force_encoding("ISO-8859-1"), "\u3042"] - end - assert_equal(["foo,\u3042\n".encode(Encoding::Windows_31J), Encoding::Windows_31J], [s, s.encoding], bug9766) - end - - def test_encoding_with_default_internal - with_default_internal(Encoding::UTF_8) do - s = CSV.generate(String.new(encoding: Encoding::Big5), encoding: Encoding::Big5) do |csv| - csv << ["漢字"] - end - assert_equal(["漢字\n".encode(Encoding::Big5), Encoding::Big5], [s, s.encoding]) - end - end - - def test_row_separator_detection_with_invalid_encoding - csv = CSV.new("invalid,\xF8\r\nvalid,x\r\n".force_encoding("UTF-8"), - encoding: "UTF-8") - assert_equal("\r\n", csv.row_sep) - end - - def test_invalid_encoding_row_error - csv = CSV.new("valid,x\rinvalid,\xF8\r".force_encoding("UTF-8"), - encoding: "UTF-8", row_sep: "\r") - error = assert_raise(CSV::InvalidEncodingError) do - csv.shift - csv.shift - end - assert_equal([Encoding::UTF_8, "Invalid byte sequence in UTF-8 in line 2."], - [error.encoding, error.message]) - end - - def test_string_input_transcode - # U+3042 HIRAGANA LETTER A - # U+3044 HIRAGANA LETTER I - # U+3046 HIRAGANA LETTER U - value = "\u3042\u3044\u3046" - csv = CSV.new(value, encoding: "UTF-8:EUC-JP") - assert_equal([[value.encode("EUC-JP")]], - csv.read) - end - - def test_string_input_set_encoding_string - # U+3042 HIRAGANA LETTER A - # U+3044 HIRAGANA LETTER I - # U+3046 HIRAGANA LETTER U - value = "\u3042\u3044\u3046".encode("EUC-JP") - csv = CSV.new(value.dup.force_encoding("UTF-8"), encoding: "EUC-JP") - assert_equal([[value.encode("EUC-JP")]], - csv.read) - end - - def test_string_input_set_encoding_encoding - # U+3042 HIRAGANA LETTER A - # U+3044 HIRAGANA LETTER I - # U+3046 HIRAGANA LETTER U - value = "\u3042\u3044\u3046".encode("EUC-JP") - csv = CSV.new(value.dup.force_encoding("UTF-8"), - encoding: Encoding.find("EUC-JP")) - assert_equal([[value.encode("EUC-JP")]], - csv.read) - end - - private - - def assert_parses(fields, encoding, **options) - encoding = Encoding.find(encoding) unless encoding.is_a? Encoding - orig_fields = fields - fields = encode_ary(fields, encoding) - data = ary_to_data(fields, **options) - parsed = CSV.parse(data, **options) - assert_equal(fields, parsed) - parsed.flatten.each_with_index do |field, i| - assert_equal(encoding, field.encoding, "Field[#{i + 1}] was transcoded.") - end - File.open(@temp_csv_path, "wb") {|f| f.print(data)} - CSV.open(@temp_csv_path, "rb:#{encoding}", **options) do |csv| - csv.each_with_index do |row, i| - assert_equal(fields[i], row) - end - end - begin - CSV.open(@temp_csv_path, - "rb:#{encoding}:#{__ENCODING__}", - **options) do |csv| - csv.each_with_index do |row, i| - assert_equal(orig_fields[i], row) - end - end unless encoding == __ENCODING__ - rescue Encoding::ConverterNotFoundError - end - options[:encoding] = encoding.name - CSV.open(@temp_csv_path, **options) do |csv| - csv.each_with_index do |row, i| - assert_equal(fields[i], row) - end - end - options.delete(:encoding) - options[:external_encoding] = encoding.name - options[:internal_encoding] = __ENCODING__.name - begin - CSV.open(@temp_csv_path, **options) do |csv| - csv.each_with_index do |row, i| - assert_equal(orig_fields[i], row) - end - end unless encoding == __ENCODING__ - rescue Encoding::ConverterNotFoundError - end - end - - def encode_ary(ary, encoding) - ary.map { |row| row.map { |field| field.encode(encoding) } } - end - - def ary_to_data(ary, **options) - encoding = ary.flatten.first.encoding - quote_char = (options[:quote_char] || '"').encode(encoding) - col_sep = (options[:col_sep] || ",").encode(encoding) - row_sep = (options[:row_sep] || "\n").encode(encoding) - ary.map { |row| - row.map { |field| - [quote_char, field.encode(encoding), quote_char].join('') - }.join(col_sep) + row_sep - }.join('').encode(encoding) - end - - def encode_for_tests(data, **options) - yield ary_to_data(encode_ary(data, "UTF-8"), **options) - yield ary_to_data(encode_ary(data, "UTF-16BE"), **options) - end - - def each_encoding - Encoding.list.each do |encoding| - next if encoding.dummy? # skip "dummy" encodings - yield encoding - end - end - - def no_warnings - old_verbose, $VERBOSE = $VERBOSE, nil - yield - ensure - $VERBOSE = old_verbose - end -end |