diff options
author | nagachika <[email protected]> | 2021-07-03 13:49:46 +0900 |
---|---|---|
committer | nagachika <[email protected]> | 2021-07-03 13:49:46 +0900 |
commit | e62cccaeb0986d43480bccbd365cb20056bda4d7 (patch) | |
tree | 7b70b94213b5b5758e9e1292fc3022011d2c914a | |
parent | 2aad080396f5b79a33502f1d812fb237968cb931 (diff) |
merge revision(s) e86c1f6fc53433ef5c82ed2b7a4cc9a12c153e4c,f6539202c52a051a4e6946a318a1d9cd29002990: [Backport #1205]
Work around issue transcoding issue with non-ASCII compatible
encodings and xml escaping
When using a non-ASCII compatible source and destination encoding
and xml escaping (the :xml option to String#encode), the resulting
string was broken, as it used the correct non-ASCII compatible
encoding, but contained data that was ASCII-compatible instead of
compatible with the string's encoding.
Work around this issue by detecting the case where both the
source and destination encoding are non-ASCII compatible, and
transcoding the source string from the non-ASCII compatible
encoding to UTF-8. The xml escaping code will correctly handle
the UTF-8 source string and the return the correctly encoded
and escaped value.
Fixes [Bug #12052]
Co-authored-by: Nobuyoshi Nakada <[email protected]>
---
test/ruby/test_transcode.rb | 19 +++++++++++++++++++
transcode.c | 6 ++++++
2 files changed, 25 insertions(+)
=?UTF-8?q?-=20add=20regression=20tests=20for=20U+6E7F=20(?=
=?UTF-8?q?=E6=B9=BF)=20in=20ISO-2022-JP?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
In ISO-2022-JP, the bytes use to code are the same as those for "<>".
This adds regression tests to make sure that these bytes, when representing
湿, are NOT escaped with encode("ISO-2022-JP, xml: :text) or similar.
These are additional regression tests for #12052.
---
test/ruby/test_transcode.rb | 3 +++
1 file changed, 3 insertions(+)
-rw-r--r-- | test/ruby/test_transcode.rb | 22 | ||||
-rw-r--r-- | transcode.c | 6 | ||||
-rw-r--r-- | version.h | 2 |
3 files changed, 29 insertions, 1 deletions
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 04c8248697..c8b0034e06 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -126,6 +126,28 @@ class TestTranscode < Test::Unit::TestCase assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\xFCrst".encode('iso-8859-2', 'iso-8859-1')) end + def test_encode_xml_multibyte + encodings = %w'UTF-8 UTF-16LE UTF-16BE UTF-32LE UTF-32BE' + encodings.each do |src_enc| + encodings.each do |dst_enc| + escaped = "<>".encode(src_enc).encode(dst_enc, :xml=>:text) + assert_equal("<>", escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :text") + + escaped = '<">'.encode(src_enc).encode(dst_enc, :xml=>:attr) + assert_equal('"<">"', escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :attr") + + escaped = "<>".encode(src_enc).force_encoding("UTF-8").encode(dst_enc, src_enc, :xml=>:text) + assert_equal("<>", escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :text") + + escaped = '<">'.encode(src_enc).force_encoding("UTF-8").encode(dst_enc, src_enc, :xml=>:attr) + assert_equal('"<">"', escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :attr") + end + end + # regression test; U+6E7F (湿) uses the same bytes in ISO-2022-JP as "<>" + assert_equal( "<>\u6E7F", "<>\u6E7F".encode("ISO-2022-JP").encode("ISO-2022-JP", :xml=>:text).encode("UTF-8")) + assert_equal("\"<>\u6E7F\"", "<>\u6E7F".encode("ISO-2022-JP").encode("ISO-2022-JP", :xml=>:attr).encode("UTF-8")) + end + def test_ascii_range encodings = [ 'US-ASCII', 'ASCII-8BIT', diff --git a/transcode.c b/transcode.c index a72afdc44b..d2abd9e0e5 100644 --- a/transcode.c +++ b/transcode.c @@ -2719,6 +2719,12 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) } } else { + if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) { + rb_encoding *utf8 = rb_utf8_encoding(); + str = rb_str_conv_enc(str, senc, utf8); + senc = utf8; + sname = "UTF-8"; + } if (encoding_equal(sname, dname)) { sname = ""; dname = ""; @@ -12,7 +12,7 @@ # define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR #define RUBY_VERSION_TEENY 2 #define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR -#define RUBY_PATCHLEVEL 105 +#define RUBY_PATCHLEVEL 106 #define RUBY_RELEASE_YEAR 2021 #define RUBY_RELEASE_MONTH 7 |