From: "naruse (Yui NARUSE)" Date: 2013-04-09T18:24:45+09:00 Subject: [ruby-dev:47241] [ruby-trunk - Feature #6752] Replacing ill-formed subsequencce Issue #6752 has been updated by naruse (Yui NARUSE). I wrote an updated patch which includes String#scrub and an extension to String#encode. String#scrub accepts the replacement either as an argument or as a block. diff --git a/string.c b/string.c index 8b85739..7131ac5 100644 --- a/string.c +++ b/string.c @@ -7741,6 +7741,272 @@ rb_str_ellipsize(VALUE str, long len) return ret; } +static VALUE +str_compat_and_valid(VALUE str, rb_encoding *enc) +{ + int cr; + str = StringValue(str); + cr = rb_enc_str_coderange(str); + if (cr == ENC_CODERANGE_BROKEN) { + rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str); + } + else if (cr == ENC_CODERANGE_7BIT) { + rb_encoding *e = STR_ENC_GET(str); + if (!rb_enc_asciicompat(enc)) { + rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", + rb_enc_name(enc), rb_enc_name(e)); + } + } + else { /* ENC_CODERANGE_VALID */ + rb_encoding *e = STR_ENC_GET(str); + if (enc != e) { + rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", + rb_enc_name(enc), rb_enc_name(e)); + } + } + return str; +} + +/* + * call-seq: + * str.scrub -> new_str + * str.scrub(repl) -> new_str + * str.scrub{|bytes|} -> new_str + * + * If the string is invalid byte sequence then replace invalid bytes with given replacement + * character, else returns self. 
+ */ +VALUE +rb_str_scrub(int argc, VALUE *argv, VALUE str) +{ + int cr = ENC_CODERANGE(str); + rb_encoding *enc; + VALUE repl; + + if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) + return rb_str_dup(str); + + enc = STR_ENC_GET(str); + rb_scan_args(argc, argv, "01", &repl); + if (argc != 0) { + repl = str_compat_and_valid(repl, enc); + } + + if (rb_enc_dummy_p(enc)) { + return rb_str_dup(str); + } + + if (rb_enc_asciicompat(enc)) { + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + const char *rep; + long replen; + int rep7bit_p; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str)); + if (rb_block_given_p()) { + rep = NULL; + } + else if (!NIL_P(repl)) { + rep = RSTRING_PTR(repl); + replen = RSTRING_LEN(repl); + rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT); + } + else if (enc == rb_utf8_encoding()) { + rep = "\xEF\xBF\xBD"; + replen = strlen(rep); + rep7bit_p = FALSE; + } + else { + rep = "?"; + replen = strlen(rep); + rep7bit_p = TRUE; + } + cr = ENC_CODERANGE_7BIT; + + p = search_nonascii(p, e); + if (!p) { + p = e; + } + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_CHARFOUND_P(ret)) { + cr = ENC_CODERANGE_VALID; + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + /* + * p1~p: valid ascii/multibyte chars + * p ~e: invalid bytes + unknown bytes + */ + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) { + rb_str_buf_cat(buf, p1, p - p1); + } + + if (e - p < clen) clen = e - p; + if (clen <= 2) { + clen = 1; + } + else { + const char *q = p; + clen--; + for (; clen > 1; clen--) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) break; + else if (MBCLEN_INVALID_P(ret)) continue; + else UNREACHABLE; + } + } + if (rep) { + rb_str_buf_cat(buf, rep, replen); + if (!rep7bit_p) cr = ENC_CODERANGE_VALID; + } + else { + repl = rb_yield(rb_enc_str_new(p1, clen, enc)); + repl = 
str_compat_and_valid(repl, enc); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) + cr = ENC_CODERANGE_VALID; + } + p += clen; + p1 = p; + p = search_nonascii(p, e); + if (!p) { + p = e; + break; + } + } + else { + UNREACHABLE; + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + if (rep) { + rb_str_buf_cat(buf, rep, replen); + if (!rep7bit_p) cr = ENC_CODERANGE_VALID; + } + else { + repl = rb_yield(rb_enc_str_new(p, e-p, enc)); + repl = str_compat_and_valid(repl, enc); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) + cr = ENC_CODERANGE_VALID; + } + } + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr); + return buf; + } + else { + /* ASCII incompatible */ + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str)); + const char *rep; + long replen; + long mbminlen = rb_enc_mbminlen(enc); + static rb_encoding *utf16be; + static rb_encoding *utf16le; + static rb_encoding *utf32be; + static rb_encoding *utf32le; + if (!utf16be) { + utf16be = rb_enc_find("UTF-16BE"); + utf16le = rb_enc_find("UTF-16LE"); + utf32be = rb_enc_find("UTF-32BE"); + utf32le = rb_enc_find("UTF-32LE"); + } + if (!NIL_P(repl)) { + rep = RSTRING_PTR(repl); + replen = RSTRING_LEN(repl); + } + else if (enc == utf16be) { + rep = "\xFF\xFD"; + replen = strlen(rep); + } + else if (enc == utf16le) { + rep = "\xFD\xFF"; + replen = strlen(rep); + } + else if (enc == utf32be) { + rep = "\x00\x00\xFF\xFD"; + replen = strlen(rep); + } + else if (enc == utf32le) { + rep = "\xFD\xFF\x00\x00"; + replen = strlen(rep); + } + else { + rep = "?"; + replen = strlen(rep); + } + + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_CHARFOUND_P(ret)) { + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if 
(MBCLEN_INVALID_P(ret)) { + const char *q = p; + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + + if (e - p < clen) clen = e - p; + if (clen <= mbminlen * 2) { + clen = mbminlen; + } + else { + clen -= mbminlen; + for (; clen > mbminlen; clen-=mbminlen) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) break; + else if (MBCLEN_INVALID_P(ret)) continue; + else UNREACHABLE; + } + rb_str_set_len(buf, len); + } + if (rep) { + rb_str_buf_cat(buf, rep, replen); + } + else { + repl = rb_yield(rb_enc_str_new(p, e-p, enc)); + repl = str_compat_and_valid(repl, enc); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + } + p += clen; + p1 = p; + } + else { + UNREACHABLE; + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + if (rep) { + rb_str_buf_cat(buf, rep, replen); + } + else { + repl = rb_yield(rb_enc_str_new(p, e-p, enc)); + repl = str_compat_and_valid(repl, enc); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + } + } + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID); + return buf; + } +} + /********************************************************************** * Document-class: Symbol * @@ -8222,6 +8488,7 @@ Init_String(void) rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1); rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2); rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1); + rb_define_method(rb_cString, "scrub", rb_str_scrub, -1); rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); rb_define_method(rb_cString, "to_f", rb_str_to_f, 0); diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index a8d56a4..60834bb 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -1489,4 +1489,38 @@ class TestM17N < Test::Unit::TestCase s.untrust assert_equal(true, s.b.untrusted?) 
end + + def test_scrub + assert_equal("\uFFFD\uFFFD\uFFFD", u("\x80\x80\x80").scrub) + assert_equal("\uFFFDA", u("\xF4\x80\x80A").scrub) + + # exapmles in Unicode 6.1.0 D93b + assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41", + u("\x41\xC0\xAF\x41\xF4\x80\x80\x41").scrub) + assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41", + u("\x41\xE0\x9F\x80\x41").scrub) + assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + u("\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub) + assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + u("abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub) + + assert_equal("\u3042\u3013", u("\xE3\x81\x82\xE3\x81").scrub("\u3013")) + assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub(e("\xA4\xA2")) } + assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub(1) } + assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub(u("\x81")) } + assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub(e("\xA2\xAE"))) + + assert_equal("\u3042", u("\xE3\x81\x82\xE3\x81").scrub{|x|'<'+x.unpack('H*')[0]+'>'}) + assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub{e("\xA4\xA2")} } + assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub{1} } + assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub{u("\x81")} } + assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub{e("\xA2\xAE")}) + + assert_equal("\uFFFD\u3042".encode("UTF-16BE"), + "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE). + scrub) + assert_equal("\uFFFD\u3042".encode("UTF-16LE"), + "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE). 
+ scrub) + end end diff --git a/transcode.c b/transcode.c index de12c04..9c940ed 100644 --- a/transcode.c +++ b/transcode.c @@ -2652,6 +2652,8 @@ str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, return dencidx; } +VALUE rb_str_scrub(int argc, VALUE *argv, VALUE str); + static int str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) { @@ -2686,6 +2688,17 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) ECONV_XML_ATTR_CONTENT_DECORATOR| ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) { if (senc && senc == denc) { + if (ecflags & ECONV_INVALID_MASK) { + if (!NIL_P(ecopts)) { + VALUE rep = rb_hash_aref(ecopts, sym_replace); + dest = rb_str_scrub(1, &rep, str); + } + else { + dest = rb_str_scrub(0, NULL, str); + } + *self = dest; + return dencidx; + } return NIL_P(arg2) ? -1 : dencidx; } if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) { ---------------------------------------- Feature #6752: Replacing ill-formed subsequencce https://bugs.ruby-lang.org/issues/6752#change-38389 Author: naruse (Yui NARUSE) Status: Assigned Priority: Normal Assignee: matz (Yukihiro Matsumoto) Category: core Target version: next minor =begin == 概要 Stringになんらかの理由で不正なバイト列が含まれている時に、それを置換文字で置き換えたい。 == ユースケース 実際に確認されているユースケースは以下の通りです。 * twitterのtitle * IRCのログ * ニコニコ動画の API * Webクローリング これらの不正なバイト列の生成過程は、おそらく、バイト単位で文字列を切り詰めた時に末尾が切れて、 末尾がおかしい不正な文字列が作られます。(前二者) これをコンテナに入れたり結合することによって、途中にも混ざった文字列が作られます。(後二者) * https://twitter.com/takahashim/status/18974040397 * https://twitter.com/n0kada/status/215674740705210368 * https://twitter.com/n0kada/status/215686490070585346 * https://twitter.com/hajimehoshi/status/215671146769682432 * http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/ * 
http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8 == 必要な引数: 置換文字 省略可能、String。 デフォルトは、Unicode系ならU+FFFD、それ以外では「?」。 デフォルトが空文字でない理由は、削除してしまうことで、従来は存在しなかったトークンを作れてしまい、 上位のレイヤーの脆弱性に繋がるからです。 http://unicode.org/reports/tr36/#UTF-8_Exploit == API --- str.encode(str.encoding, invalid: replace, [replace: "〓"]) * CSI的じゃなくて気持ち悪い * iconv でできるのは glibc iconv か GNU libiconv に //IGNORE つけた時で他はできない * 実装上のメリットは後述の通り、直感に反してあまりない(と思う) == 別メソッド * 新しいメソッドである * fix/repair invalid/illegal bytes/sequence あたりの名前か == 実装 === 鬼車ベース int ret = rb_enc_precise_mbclen(p, e, enc); して、 MBCLEN_INVALID_P(ret) が真な時、何バイト目が不正なのかわからないのが微妙。 ONIGENC_CONSTRUCT_MBCLEN_INVALID() がバイト数を取らないのが原因なので、 鬼車のエンコーディングモジュール全てに影響してしまうため、修正困難。 不正なバイトはほとんど存在しないと仮定して、効率を犠牲にすれば回避は可能。 === transcodeベース UCS正規化なglibc iconv, GNU libiconv, Perl Encodeなどと違って、 CSIなtranscodeでは、自分自身に変換する場合、 エンコーディングごとに「何もしない」変換モジュールを用意しないといけない。 とりあえず鬼車ベースのコンセプト実装とテストを添付しておきます。 diff --git a/string.c b/string.c index d038835..4808f15 100644 --- a/string.c +++ b/string.c @@ -7426,6 +7426,199 @@ rb_str_ellipsize(VALUE str, long len) return ret; } +/* + * call-seq: + * str.fix_invalid -> new_str + * + * If the string is well-formed, it returns self. + * If the string has invalid byte sequence, repair it with given replacement + * character. 
+ */ +VALUE +rb_str_fix_invalid(VALUE str) +{ + int cr = ENC_CODERANGE(str); + rb_encoding *enc; + if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) + return rb_str_dup(str); + + enc = STR_ENC_GET(str); + if (rb_enc_asciicompat(enc)) { + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + /* 10 should be enough for the usual use case, + * fixing a wrongly chopped character at the end of the string + */ + long room = 10; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room); + const char *rep; + if (enc == rb_utf8_encoding()) + rep = "\xEF\xBF\xBD"; + else + rep = "?"; + cr = ENC_CODERANGE_7BIT; + + p = search_nonascii(p, e); + if (!p) { + p = e; + } + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_CHARFOUND_P(ret)) { + if ((unsigned char)*p > 127) cr = ENC_CODERANGE_VALID; + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + const char *q; + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + q = RSTRING_END(buf); + + if (e - p < clen) clen = e - p; + if (clen < 3) { + clen = 1; + } + else { + long len = RSTRING_LEN(buf); + clen--; + rb_str_buf_cat(buf, p, clen); + for (; clen > 1; clen--) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_INVALID_P(ret)) { + continue; + } + else { + rb_bug("shouldn't reach here '%s'", q); + } + } + rb_str_set_len(buf, len); + } + p += clen; + p1 = p; + rb_str_buf_cat2(buf, rep); + p = search_nonascii(p, e); + if (!p) { + p = e; + break; + } + } + else if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else { + rb_bug("shouldn't reach here"); + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + rb_str_buf_cat2(buf, rep); + cr = ENC_CODERANGE_VALID; + } + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr); + return buf; + } + else if (rb_enc_dummy_p(enc)) { + return rb_str_dup(str); + } + else { + /* ASCII 
incompatible */ + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + /* 10 should be enough for the usual use case, + * fixing a wrongly chopped character at the end of the string + */ + long room = 10; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room); + const char *rep; + long mbminlen = rb_enc_mbminlen(enc); + static rb_encoding *utf16be; + static rb_encoding *utf16le; + static rb_encoding *utf32be; + static rb_encoding *utf32le; + if (!utf16be) { + utf16be = rb_enc_find("UTF-16BE"); + utf16le = rb_enc_find("UTF-16LE"); + utf32be = rb_enc_find("UTF-32BE"); + utf32le = rb_enc_find("UTF-32LE"); + } + if (enc == utf16be) { + rep = "\xFF\xFD"; + } + else if (enc == utf16le) { + rep = "\xFD\xFF"; + } + else if (enc == utf32be) { + rep = "\x00\x00\xFF\xFD"; + } + else if (enc == utf32le) { + rep = "\xFD\xFF\x00\x00"; + } + else { + rep = "?"; + } + + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_CHARFOUND_P(ret)) { + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + const char *q; + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + q = RSTRING_END(buf); + + if (e - p < clen) clen = e - p; + if (clen < mbminlen * 3) { + clen = mbminlen; + } + else { + long len = RSTRING_LEN(buf); + clen -= mbminlen; + rb_str_buf_cat(buf, p, clen); + for (; clen > mbminlen; clen-=mbminlen) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_INVALID_P(ret)) { + continue; + } + else { + rb_bug("shouldn't reach here '%s'", q); + } + } + rb_str_set_len(buf, len); + } + p += clen; + p1 = p; + rb_str_buf_cat2(buf, rep); + } + else if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else { + rb_bug("shouldn't reach here"); + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + rb_str_buf_cat2(buf, rep); + } + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID); + 
return buf; + } +} + /********************************************************************** * Document-class: Symbol * @@ -7882,6 +8075,7 @@ Init_String(void) rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1); rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2); rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1); + rb_define_method(rb_cString, "fix_invalid", rb_str_fix_invalid, 0); rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); rb_define_method(rb_cString, "to_f", rb_str_to_f, 0); diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index 47f349c..2b0cfeb 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -2031,6 +2031,29 @@ class TestString < Test::Unit::TestCase assert_equal(u("\x82")+("\u3042"*9), ("\u3042"*10).byteslice(2, 28)) end + + def test_fix_invalid + assert_equal("\uFFFD\uFFFD\uFFFD", "\x80\x80\x80".fix_invalid) + assert_equal("\uFFFDA", "\xF4\x80\x80A".fix_invalid) + + # exapmles in Unicode 6.1.0 D93b + assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41", + "\x41\xC0\xAF\x41\xF4\x80\x80\x41".fix_invalid) + assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41", + "\x41\xE0\x9F\x80\x41".fix_invalid) + assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid) + + assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + "abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid) + + assert_equal("\uFFFD\u3042".encode("UTF-16BE"), + "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE). + fix_invalid) + assert_equal("\uFFFD\u3042".encode("UTF-16LE"), + "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE). + fix_invalid) + end end class TestString2 < TestString =end -- http://bugs.ruby-lang.org/