diff options
author | Jean Boussier <[email protected]> | 2022-06-09 12:07:58 +0200 |
---|---|---|
committer | git <[email protected]> | 2022-08-16 19:12:03 +0900 |
commit | 3850113e20b8c031529fc79de7202f61604425dd (patch) | |
tree | 9a6cbd36529b625e1514c768e6cc98efc4886f89 | |
parent | 5389c9813b1970a1a5cb2bc8f67b098d38a99d1d (diff) |
[ruby/cgi] Implement `CGI.url_encode` and `CGI.url_decode`
[Feature #18822]
Ruby is somewhat missing an RFC 3986 compliant escape method.
https://github.com/ruby/cgi/commit/c2729c7f33
-rw-r--r-- | ext/cgi/escape/escape.c | 61 | ||||
-rw-r--r-- | lib/cgi/util.rb | 49 | ||||
-rw-r--r-- | test/cgi/test_cgi_util.rb | 49 |
3 files changed, 142 insertions, 17 deletions
diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c index 068647747d..c5b76de596 100644 --- a/ext/cgi/escape/escape.c +++ b/ext/cgi/escape/escape.c @@ -200,7 +200,7 @@ url_unreserved_char(unsigned char c) } static VALUE -optimized_escape(VALUE str) +optimized_escape(VALUE str, int plus_escape) { long i, len, beg = 0; VALUE dest = 0; @@ -220,7 +220,7 @@ optimized_escape(VALUE str) rb_str_cat(dest, cstr + beg, i - beg); beg = i + 1; - if (c == ' ') { + if (plus_escape && c == ' ') { rb_str_cat_cstr(dest, "+"); } else { @@ -242,7 +242,7 @@ optimized_escape(VALUE str) } static VALUE -optimized_unescape(VALUE str, VALUE encoding) +optimized_unescape(VALUE str, VALUE encoding, int unescape_plus) { long i, len, beg = 0; VALUE dest = 0; @@ -265,7 +265,7 @@ optimized_unescape(VALUE str, VALUE encoding) | char_to_number(cstr[i+2])); clen = 2; } - else if (c == '+') { + else if (unescape_plus && c == '+') { buf[0] = ' '; } else { @@ -348,7 +348,7 @@ cgiesc_unescape_html(VALUE self, VALUE str) * call-seq: * CGI.escape(string) -> string * - * Returns URL-escaped string. + * Returns URL-escaped string (+application/x-www-form-urlencoded+). * */ static VALUE @@ -357,7 +357,7 @@ cgiesc_escape(VALUE self, VALUE str) StringValue(str); if (rb_enc_str_asciicompat_p(str)) { - return optimized_escape(str); + return optimized_escape(str, 1); } else { return rb_call_super(1, &str); @@ -376,7 +376,7 @@ accept_charset(int argc, VALUE *argv, VALUE self) * call-seq: * CGI.unescape(string, encoding=@@accept_charset) -> string * - * Returns URL-unescaped string. + * Returns URL-unescaped string (+application/x-www-form-urlencoded+). * */ static VALUE @@ -388,7 +388,50 @@ cgiesc_unescape(int argc, VALUE *argv, VALUE self) if (rb_enc_str_asciicompat_p(str)) { VALUE enc = accept_charset(argc-1, argv+1, self); - return optimized_unescape(str, enc); + return optimized_unescape(str, enc, 1); + } + else { + return rb_call_super(argc, argv); + } +} + +/* + * call-seq: + * CGI.escapeURIComponent(string) -> string + * + * Returns URL-escaped string following RFC 3986. + * + */ +static VALUE +cgiesc_escape_uri_component(VALUE self, VALUE str) +{ + StringValue(str); + + if (rb_enc_str_asciicompat_p(str)) { + return optimized_escape(str, 0); + } + else { + return rb_call_super(1, &str); + } +} + +/* + * call-seq: + * CGI.unescapeURIComponent(string, encoding=@@accept_charset) -> string + * + * Returns URL-unescaped string following RFC 3986. + * + */ +static VALUE +cgiesc_unescape_uri_component(int argc, VALUE *argv, VALUE self) +{ + VALUE str = (rb_check_arity(argc, 1, 2), argv[0]); + + StringValue(str); + + if (rb_enc_str_asciicompat_p(str)) { + VALUE enc = accept_charset(argc-1, argv+1, self); + return optimized_unescape(str, enc, 0); } else { return rb_call_super(argc, argv); @@ -414,6 +457,8 @@ InitVM_escape(void) rb_mUtil = rb_define_module_under(rb_cCGI, "Util"); rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1); rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1); + rb_define_method(rb_mEscape, "escapeURIComponent", cgiesc_escape_uri_component, 1); + rb_define_method(rb_mEscape, "unescapeURIComponent", cgiesc_unescape_uri_component, -1); rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1); rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1); rb_prepend_module(rb_mUtil, rb_mEscape); diff --git a/lib/cgi/util.rb b/lib/cgi/util.rb index 55e61bf984..5a5c77ac97 100644 --- a/lib/cgi/util.rb +++ b/lib/cgi/util.rb @@ -5,24 +5,57 @@ class CGI extend Util end module CGI::Util - @@accept_charset="UTF-8" unless defined?(@@accept_charset) - # URL-encode a string. + @@accept_charset = Encoding::UTF_8 unless defined?(@@accept_charset) + + # URL-encode a string into application/x-www-form-urlencoded. + # Space characters (+" "+) are encoded with plus signs (+"+"+) # url_encoded_string = CGI.escape("'Stop!' said Fred") # # => "%27Stop%21%27+said+Fred" def escape(string) encoding = string.encoding - string.b.gsub(/([^ a-zA-Z0-9_.\-~]+)/) do |m| + buffer = string.b + buffer.gsub!(/([^ a-zA-Z0-9_.\-~]+)/) do |m| '%' + m.unpack('H2' * m.bytesize).join('%').upcase - end.tr(' ', '+').force_encoding(encoding) + end + buffer.tr!(' ', '+') + buffer.force_encoding(encoding) end - # URL-decode a string with encoding(optional). + # URL-decode an application/x-www-form-urlencoded string with encoding(optional). # string = CGI.unescape("%27Stop%21%27+said+Fred") # # => "'Stop!' said Fred" - def unescape(string,encoding=@@accept_charset) - str=string.tr('+', ' ').b.gsub(/((?:%[0-9a-fA-F]{2})+)/) do |m| + def unescape(string, encoding = @@accept_charset) + str = string.tr('+', ' ') + str = str.b + str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m| + [m.delete('%')].pack('H*') + end + str.force_encoding(encoding) + str.valid_encoding? ? str : str.force_encoding(string.encoding) + end + + # URL-encode a string following RFC 3986 + # Space characters (+" "+) are encoded with (+"%20"+) + # url_encoded_string = CGI.escape("'Stop!' said Fred") + # # => "%27Stop%21%27%20said%20Fred" + def escapeURIComponent(string) + encoding = string.encoding + buffer = string.b + buffer.gsub!(/([^a-zA-Z0-9_.\-~]+)/) do |m| + '%' + m.unpack('H2' * m.bytesize).join('%').upcase + end + buffer.force_encoding(encoding) + end + + # URL-decode a string following RFC 3986 with encoding(optional). + # string = CGI.unescape("%27Stop%21%27+said%20Fred") + # # => "'Stop!'+said Fred" + def unescapeURIComponent(string, encoding = @@accept_charset) + str = string.b + str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m| [m.delete('%')].pack('H*') - end.force_encoding(encoding) + end + str.force_encoding(encoding) str.valid_encoding? ? str : str.force_encoding(string.encoding) end diff --git a/test/cgi/test_cgi_util.rb b/test/cgi/test_cgi_util.rb index 5baf87db75..a3be193a13 100644 --- a/test/cgi/test_cgi_util.rb +++ b/test/cgi/test_cgi_util.rb @@ -23,7 +23,6 @@ class CGIUtilTest < Test::Unit::TestCase ENV.update(@environ) end - def test_cgi_escape assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.escape(@str1)) assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.escape(@str1).ascii_only?) if defined?(::Encoding) @@ -70,6 +69,54 @@ class CGIUtilTest < Test::Unit::TestCase end; end + def test_cgi_escapeURIComponent + assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.escapeURIComponent(@str1)) + assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.escapeURIComponent(@str1).ascii_only?) if defined?(::Encoding) + end + + def test_cgi_escapeURIComponent_with_unreserved_characters + assert_equal("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~", + CGI.escapeURIComponent("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"), + "should not encode any unreserved characters, as per RFC3986 Section 2.3") + end + + def test_cgi_escapeURIComponent_with_invalid_byte_sequence + assert_equal('%C0%3C%3C', CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("UTF-8"))) + end + + def test_cgi_escapeURIComponent_preserve_encoding + assert_equal(Encoding::US_ASCII, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("US-ASCII")).encoding) + assert_equal(Encoding::ASCII_8BIT, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("ASCII-8BIT")).encoding) + assert_equal(Encoding::UTF_8, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("UTF-8")).encoding) + end + + def test_cgi_unescapeURIComponent + str = CGI.unescapeURIComponent('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93') + assert_equal(@str1, str) + return unless defined?(::Encoding) + + assert_equal("foo+bar", CGI.unescapeURIComponent("foo+bar")) + + assert_equal(@str1.encoding, str.encoding) + assert_equal("\u{30E1 30E2 30EA 691C 7D22}", CGI.unescapeURIComponent("\u{30E1 30E2 30EA}%E6%A4%9C%E7%B4%A2")) + end + + def test_cgi_unescapeURIComponent_preserve_encoding + assert_equal(Encoding::US_ASCII, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("US-ASCII")).encoding) + assert_equal(Encoding::ASCII_8BIT, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("ASCII-8BIT")).encoding) + assert_equal(Encoding::UTF_8, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("UTF-8")).encoding) + end + + def test_cgi_unescapeURIComponent_accept_charset + return unless defined?(::Encoding) + + assert_raise(TypeError) {CGI.unescapeURIComponent('', nil)} + assert_separately(%w[-rcgi/util], "#{<<-"begin;"}\n#{<<-"end;"}") + begin; + assert_equal("", CGI.unescapeURIComponent('')) + end; + end + def test_cgi_pretty assert_equal("<HTML>\n <BODY>\n </BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>")) assert_equal("<HTML>\n\t<BODY>\n\t</BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>","\t")) |