diff options
author | Jeremy Evans <[email protected]> | 2022-04-11 08:17:19 -0700 |
---|---|---|
committer | git <[email protected]> | 2022-04-12 00:17:34 +0900 |
commit | ebb4378237e572ce2e888136a613c7c051439f95 (patch) | |
tree | 13f22663021500a3aa223fcd175db1a05caf246e | |
parent | 4bd38e8120f2fdfdd47a34211720e048502377f1 (diff) |
[ruby/net-http] Add HTTP#response_body_encoding for setting response body encoding
This adds an opt-in accessor for setting the encoding of response
bodies. When the accessor is set to a String or Encoding instance,
the response body is forced to the specified encoding.
Setting the value to true will try to detect the encoding of the
response body, either using the Content-Type header (assuming it
specifies a charset) or by scanning for a <meta> tag in the document
that specifies the encoding. The default is false, in which case
no encoding is forced (same as before the patch).
Implements [Feature #2567]
Implements [Feature #15517]
https://github.com/ruby/net-http/commit/6233e6b7c1
Co-authored-by: Yui Naruse <[email protected]>
-rw-r--r-- | lib/net/http.rb | 14 | ||||
-rw-r--r-- | lib/net/http/response.rb | 159 | ||||
-rw-r--r-- | test/net/http/test_http.rb | 54 | ||||
-rw-r--r-- | test/net/http/test_httpresponse.rb | 235 |
4 files changed, 462 insertions, 0 deletions
diff --git a/lib/net/http.rb b/lib/net/http.rb index 3fcf23b05c..5e64e38665 100644 --- a/lib/net/http.rb +++ b/lib/net/http.rb @@ -698,6 +698,7 @@ module Net #:nodoc: @continue_timeout = nil @max_retries = 1 @debug_output = nil + @response_body_encoding = false @proxy_from_env = false @proxy_uri = nil @@ -745,6 +746,18 @@ module Net #:nodoc: # The local port used to establish the connection. attr_accessor :local_port + # The encoding to use for the response body. If Encoding, uses the + # specified encoding. If other true value, tries to detect the response + # body encoding. + attr_reader :response_body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. + def response_body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @response_body_encoding = value + end + attr_writer :proxy_from_env attr_writer :proxy_address attr_writer :proxy_port @@ -1592,6 +1605,7 @@ module Net #:nodoc: begin res = HTTPResponse.read_new(@socket) res.decode_content = req.decode_content + res.body_encoding = @response_body_encoding end while res.kind_of?(HTTPInformation) res.uri = req.uri diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb index 08eaeb2cac..ecbfd42d2b 100644 --- a/lib/net/http/response.rb +++ b/lib/net/http/response.rb @@ -84,6 +84,7 @@ class Net::HTTPResponse @read = false @uri = nil @decode_content = false + @body_encoding = false end # The HTTP version supported by the server. @@ -106,6 +107,18 @@ class Net::HTTPResponse # Accept-Encoding header from the user. attr_accessor :decode_content + # The encoding to use for the response body. If Encoding, use that encoding. + # If other true value, attempt to detect the appropriate encoding, and use + # that. + attr_reader :body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. 
+ def body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @body_encoding = value + end + def inspect "#<#{self.class} #{@code} #{@message} readbody=#{@read}>" end @@ -214,6 +227,17 @@ class Net::HTTPResponse end @read = true + case enc = @body_encoding + when Encoding, false, nil + # Encoding: force given encoding + # false/nil: do not force encoding + else + # other value: detect encoding from body + enc = detect_encoding(@body) + end + + @body.force_encoding(enc) if enc + @body end @@ -245,6 +269,141 @@ class Net::HTTPResponse private + # :nodoc: + def detect_encoding(str, encoding=nil) + if encoding + elsif encoding = type_params['charset'] + elsif encoding = check_bom(str) + else + encoding = case content_type&.downcase + when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml} + /\A<xml[ \t\r\n]+ + version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+ + encoding[ \t\r\n]*=[ \t\r\n]* + (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x =~ str + encoding = $1 || $2 || Encoding::UTF_8 + when %r{text/html.*} + sniff_encoding(str) + end + end + return encoding + end + + # :nodoc: + def sniff_encoding(str, encoding=nil) + # the encoding sniffing algorithm + # http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding + if enc = scanning_meta(str) + enc + # 6. last visited page or something + # 7. frequency + elsif str.ascii_only? + Encoding::US_ASCII + elsif str.dup.force_encoding(Encoding::UTF_8).valid_encoding? + Encoding::UTF_8 + end + # 8. 
implementation-defined or user-specified + end + + # :nodoc: + def check_bom(str) + case str.byteslice(0, 2) + when "\xFE\xFF" + return Encoding::UTF_16BE + when "\xFF\xFE" + return Encoding::UTF_16LE + end + if "\xEF\xBB\xBF" == str.byteslice(0, 3) + return Encoding::UTF_8 + end + nil + end + + # :nodoc: + def scanning_meta(str) + require 'strscan' + ss = StringScanner.new(str) + if ss.scan_until(/<meta[\t\n\f\r ]*/) + attrs = {} # attribute_list + got_pragma = false + need_pragma = nil + charset = nil + + # step: Attributes + while attr = get_attribute(ss) + name, value = *attr + next if attrs[name] + attrs[name] = true + case name + when 'http-equiv' + got_pragma = true if value == 'content-type' + when 'content' + encoding = extracting_encodings_from_meta_elements(value) + unless charset + charset = encoding + end + need_pragma = true + when 'charset' + need_pragma = false + charset = value + end + end + + # step: Processing + return if need_pragma.nil? + return if need_pragma && !got_pragma + + charset = Encoding.find(charset) rescue nil + return unless charset + charset = Encoding::UTF_8 if charset == Encoding::UTF_16 + return charset # tentative + end + nil + end + + def get_attribute(ss) + ss.scan(/[\t\n\f\r \/]*/) + if ss.peek(1) == '>' + ss.getch + return nil + end + name = ss.scan(/[^=\t\n\f\r \/>]*/) + name.downcase! + raise if name.empty? + ss.skip(/[\t\n\f\r ]*/) + if ss.getch != '=' + value = '' + return [name, value] + end + ss.skip(/[\t\n\f\r ]*/) + case ss.peek(1) + when '"' + ss.getch + value = ss.scan(/[^"]+/) + value.downcase! + ss.getch + when "'" + ss.getch + value = ss.scan(/[^']+/) + value.downcase! + ss.getch + when '>' + value = '' + else + value = ss.scan(/[^\t\n\f\r >]+/) + value.downcase! 
+ end + [name, value] + end + + def extracting_encodings_from_meta_elements(value) + # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element + if /charset[\t\n\f\r ]*=(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value + return $1 || $2 || $3 + end + return nil + end + ## # Checks for a supported Content-Encoding header and yields an Inflate # wrapper for this response's socket when zlib is present. If the diff --git a/test/net/http/test_http.rb b/test/net/http/test_http.rb index b5156078a4..4725a79147 100644 --- a/test/net/http/test_http.rb +++ b/test/net/http/test_http.rb @@ -1294,3 +1294,57 @@ class TestNetHTTPLocalBind < Test::Unit::TestCase end end +class TestNetHTTPForceEncoding < Test::Unit::TestCase + CONFIG = { + 'host' => 'localhost', + 'proxy_host' => nil, + 'proxy_port' => nil, + } + + include TestNetHTTPUtils + + def fe_request(force_enc, content_type=nil) + @server.mount_proc('/fe') do |req, res| + res['Content-Type'] = content_type if content_type + res.body = "hello\u1234" + end + + http = Net::HTTP.new(config('host'), config('port')) + http.local_host = Addrinfo.tcp(config('host'), config('port')).ip_address + assert_not_nil(http.local_host) + assert_nil(http.local_port) + + http.response_body_encoding = force_enc + http.get('/fe') + end + + def test_response_body_encoding_false + res = fe_request(false) + assert_equal("hello\u1234".b, res.body) + assert_equal(Encoding::ASCII_8BIT, res.body.encoding) + end + + def test_response_body_encoding_true_without_content_type + res = fe_request(true) + assert_equal("hello\u1234".b, res.body) + assert_equal(Encoding::ASCII_8BIT, res.body.encoding) + end + + def test_response_body_encoding_true_with_content_type + res = fe_request(true, 'text/html; charset=utf-8') + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end + + def 
test_response_body_encoding_string_without_content_type + res = fe_request('utf-8') + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end + + def test_response_body_encoding_encoding_without_content_type + res = fe_request(Encoding::UTF_8) + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end +end diff --git a/test/net/http/test_httpresponse.rb b/test/net/http/test_httpresponse.rb index 86a467ac19..eb2551df46 100644 --- a/test/net/http/test_httpresponse.rb +++ b/test/net/http/test_httpresponse.rb @@ -54,6 +54,241 @@ EOS assert_equal 'hello', body end + def test_read_body_body_encoding_false + body = "hello\u1234" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{body.bytesize} + +#{body} +EOS + + res = Net::HTTPResponse.read_new(io) + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal "hello\u1234".b, body + assert_equal Encoding::ASCII_8BIT, body.encoding + end + + def test_read_body_body_encoding_encoding + body = "hello\u1234" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{body.bytesize} + +#{body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = Encoding.find('utf-8') + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal "hello\u1234", body + assert_equal Encoding::UTF_8, body.encoding + end + + def test_read_body_body_encoding_string + body = "hello\u1234" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{body.bytesize} + +#{body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = 'utf-8' + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal "hello\u1234", body + assert_equal Encoding::UTF_8, body.encoding + end + + def test_read_body_body_encoding_true_without_content_type_header + body = "hello\u1234" + io = dummy_io(<<EOS) +HTTP/1.1 
200 OK +Connection: close +Content-Length: #{body.bytesize} + +#{body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = true + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal "hello\u1234".b, body + assert_equal Encoding::ASCII_8BIT, body.encoding + end + + def test_read_body_body_encoding_true_with_utf8_content_type_header + body = "hello\u1234" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{body.bytesize} +Content-Type: text/plain; charset=utf-8 + +#{body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = true + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal "hello\u1234", body + assert_equal Encoding::UTF_8, body.encoding + end + + def test_read_body_body_encoding_true_with_iso_8859_1_content_type_header + body = "hello\u1234" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{body.bytesize} +Content-Type: text/plain; charset=iso-8859-1 + +#{body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = true + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal "hello\u1234".force_encoding("ISO-8859-1"), body + assert_equal Encoding::ISO_8859_1, body.encoding + end + + def test_read_body_body_encoding_true_with_utf8_meta_charset + res_body = "<html><meta charset=\"utf-8\">hello\u1234</html>" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{res_body.bytesize} +Content-Type: text/html + +#{res_body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = true + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal res_body, body + assert_equal Encoding::UTF_8, body.encoding + end + + def test_read_body_body_encoding_true_with_iso8859_1_meta_charset + res_body = "<html><meta charset=\"iso-8859-1\">hello\u1234</html>" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK 
+Connection: close +Content-Length: #{res_body.bytesize} +Content-Type: text/html + +#{res_body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = true + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal res_body.force_encoding("ISO-8859-1"), body + assert_equal Encoding::ISO_8859_1, body.encoding + end + + def test_read_body_body_encoding_true_with_utf8_meta_content_charset + res_body = "<meta http-equiv='content-type' content='text/html; charset=UTF-8'>hello\u1234</html>" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{res_body.bytesize} +Content-Type: text/html + +#{res_body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = true + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal res_body, body + assert_equal Encoding::UTF_8, body.encoding + end + + def test_read_body_body_encoding_true_with_iso8859_1_meta_content_charset + res_body = "<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>hello\u1234</html>" + io = dummy_io(<<EOS) +HTTP/1.1 200 OK +Connection: close +Content-Length: #{res_body.bytesize} +Content-Type: text/html + +#{res_body} +EOS + + res = Net::HTTPResponse.read_new(io) + res.body_encoding = true + + body = nil + + res.reading_body io, true do + body = res.read_body + end + + assert_equal res_body.force_encoding("ISO-8859-1"), body + assert_equal Encoding::ISO_8859_1, body.encoding + end + def test_read_body_block io = dummy_io(<<EOS) HTTP/1.1 200 OK |