diff options
author | Shugo Maeda <[email protected]> | 2022-02-19 19:10:00 +0900 |
---|---|---|
committer | GitHub <[email protected]> | 2022-02-19 19:10:00 +0900 |
commit | c8817d6a3ebc9bbc151625bca198b8f327d1d68f (patch) | |
tree | 8e147d1ec055f668f123a87fd979946206fd2ee4 | |
parent | db6b23c76cbc7888cd9a9912790c2068703afdd0 (diff) |
Add String#byteindex, String#byterindex, and MatchData#byteoffset (#5518)
* Add String#byteindex, String#byterindex, and MatchData#byteoffset [Feature #13110]
Co-authored-by: NARUSE, Yui <[email protected]>
Notes
Notes:
Merged-By: shugo <[email protected]>
-rw-r--r-- | NEWS.md | 6 | ||||
-rw-r--r-- | re.c | 33 | ||||
-rw-r--r-- | string.c | 260 | ||||
-rw-r--r-- | test/ruby/test_regexp.rb | 21 | ||||
-rw-r--r-- | test/ruby/test_string.rb | 140 |
5 files changed, 451 insertions, 9 deletions
@@ -59,6 +59,9 @@ Note: We're only listing outstanding class updates. empty, instead of returning the default value or calling the default proc. [[Bug #16908]] +* MatchData + * MatchData#byteoffset has been added. [[Feature #13110]] + * Module * Module.used_refinements has been added. [[Feature #14332]] * Module#refinements has been added. [[Feature #12737]] @@ -74,6 +77,9 @@ Note: We're only listing outstanding class updates. * Set is now available as a builtin class without the need for `require "set"`. [[Feature #16989]] It is currently autoloaded via the `Set` constant or a call to `Enumerable#to_set`. +* String + * String#byteindex and String#byterindex have been added. [[Feature #13110]] + * Struct * A Struct class can also be initialized with keyword arguments without `keyword_init: true` on `Struct.new` [[Feature #16806]] @@ -1234,6 +1234,38 @@ match_offset(VALUE match, VALUE n) LONG2NUM(RMATCH(match)->rmatch->char_offset[i].end)); } +/* + * call-seq: + * mtch.byteoffset(n) -> array + * + * Returns a two-element array containing the beginning and ending byte-based offsets of + * the <em>n</em>th match. + * <em>n</em> can be a string or symbol to reference a named capture. 
+ * + * m = /(.)(.)(\d+)(\d)/.match("THX1138.") + * m.byteoffset(0) #=> [1, 7] + * m.byteoffset(4) #=> [6, 7] + * + * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge") + * p m.byteoffset(:foo) #=> [0, 1] + * p m.byteoffset(:bar) #=> [2, 3] + * + */ + +static VALUE +match_byteoffset(VALUE match, VALUE n) +{ + int i = match_backref_number(match, n); + struct re_registers *regs = RMATCH_REGS(match); + + match_check(match); + backref_number_check(regs, i); + + if (BEG(i) < 0) + return rb_assoc_new(Qnil, Qnil); + return rb_assoc_new(LONG2NUM(BEG(i)), LONG2NUM(END(i))); +} + /* * call-seq: @@ -4162,6 +4194,7 @@ Init_Regexp(void) rb_define_method(rb_cMatch, "size", match_size, 0); rb_define_method(rb_cMatch, "length", match_size, 0); rb_define_method(rb_cMatch, "offset", match_offset, 1); + rb_define_method(rb_cMatch, "byteoffset", match_byteoffset, 1); rb_define_method(rb_cMatch, "begin", match_begin, 1); rb_define_method(rb_cMatch, "end", match_end, 1); rb_define_method(rb_cMatch, "match", match_nth, 1); @@ -3979,18 +3979,123 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) return LONG2NUM(pos); } +/* whether given pos is valid character boundary or not + * Note that in this function, "character" means a code point + * (Unicode scalar value), not a grapheme cluster. 
+ */ +static bool +str_check_byte_pos(VALUE str, long pos) +{ + const char *s = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p = s + pos; + const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str)); + return p == pp; +} + +/* + * call-seq: + * byteindex(substring, offset = 0) -> integer or nil + * byteindex(regexp, offset = 0) -> integer or nil + * + * Returns the \Integer byte-based index of the first occurrence of the given +substring+, + * or +nil+ if none found: + * + * 'foo'.byteindex('f') # => 0 + * 'foo'.byteindex('o') # => 1 + * 'foo'.byteindex('oo') # => 1 + * 'foo'.byteindex('ooo') # => nil + * + * Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+, + * or +nil+ if none found: + * + * 'foo'.byteindex(/f/) # => 0 + * 'foo'.byteindex(/o/) # => 1 + * 'foo'.byteindex(/oo/) # => 1 + * 'foo'.byteindex(/ooo/) # => nil + * + * \Integer argument +offset+, if given, specifies the byte-based position in the + * string to begin the search: + * + * 'foo'.byteindex('o', 1) # => 1 + * 'foo'.byteindex('o', 2) # => 2 + * 'foo'.byteindex('o', 3) # => nil + * + * If +offset+ is negative, counts backward from the end of +self+: + * + * 'foo'.byteindex('o', -1) # => 2 + * 'foo'.byteindex('o', -2) # => 1 + * 'foo'.byteindex('o', -3) # => 1 + * 'foo'.byteindex('o', -4) # => nil + * + * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is + * raised. + * + * Related: String#index, String#byterindex. 
+ */ + +static VALUE +rb_str_byteindex_m(int argc, VALUE *argv, VALUE str) +{ + VALUE sub; + VALUE initpos; + long pos; + + if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { + pos = NUM2LONG(initpos); + } + else { + pos = 0; + } + if (pos < 0) { + pos += RSTRING_LEN(str); + if (pos < 0) { + if (RB_TYPE_P(sub, T_REGEXP)) { + rb_backref_set(Qnil); + } + return Qnil; + } + } + + if (!str_check_byte_pos(str, pos)) { + rb_raise(rb_eIndexError, + "offset %ld does not land on character boundary", pos); + } + + if (RB_TYPE_P(sub, T_REGEXP)) { + if (pos > RSTRING_LEN(str)) + return Qnil; + if (rb_reg_search(sub, str, pos, 0) < 0) { + return Qnil; + } + else { + VALUE match = rb_backref_get(); + struct re_registers *regs = RMATCH_REGS(match); + pos = BEG(0); + return LONG2NUM(pos); + } + } + else { + StringValue(sub); + pos = rb_strseq_index(str, sub, pos, 1); + } + + if (pos == -1) return Qnil; + return LONG2NUM(pos); +} + #ifdef HAVE_MEMRCHR static long -str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) +str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc) { char *hit, *adjusted; int c; long slen, searchlen; char *sbeg, *e, *t; - slen = RSTRING_LEN(sub); - if (slen == 0) return pos; sbeg = RSTRING_PTR(str); + slen = RSTRING_LEN(sub); + if (slen == 0) return s - sbeg; e = RSTRING_END(str); t = RSTRING_PTR(sub); c = *t & 0xff; @@ -4005,7 +4110,7 @@ str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) continue; } if (memcmp(hit, t, slen) == 0) - return rb_str_sublen(str, hit - sbeg); + return hit - sbeg; searchlen = adjusted - sbeg; } while (searchlen > 0); @@ -4013,7 +4118,7 @@ str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) } #else static long -str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) +str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc) { long slen; char *sbeg, *e, *t; @@ -4025,10 +4130,9 @@ str_rindex(VALUE str, VALUE sub, 
const char *s, long pos, rb_encoding *enc) while (s) { if (memcmp(s, t, slen) == 0) { - return pos; + return s - sbeg; } - if (pos == 0) break; - pos--; + if (s <= sbeg) break; s = rb_enc_prev_char(sbeg, s, e, enc); } @@ -4065,7 +4169,7 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) } s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte); - return str_rindex(str, sub, s, pos, enc); + return rb_str_sublen(str, str_rindex(str, sub, s, enc)); } /* @@ -4170,6 +4274,142 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) return Qnil; } +static long +rb_str_byterindex(VALUE str, VALUE sub, long pos) +{ + long len, slen; + char *sbeg, *s; + rb_encoding *enc; + + enc = rb_enc_check(str, sub); + if (is_broken_string(sub)) return -1; + len = RSTRING_LEN(str); + slen = RSTRING_LEN(sub); + + /* substring longer than string */ + if (len < slen) return -1; + if (len - pos < slen) pos = len - slen; + if (len == 0) return pos; + + sbeg = RSTRING_PTR(str); + + if (pos == 0) { + if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0) + return 0; + else + return -1; + } + + s = sbeg + pos; + return str_rindex(str, sub, s, enc); +} + + +/* + * call-seq: + * byterindex(substring, offset = self.length) -> integer or nil + * byterindex(regexp, offset = self.length) -> integer or nil + * + * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+, + * or +nil+ if none found: + * + * 'foo'.byterindex('f') # => 0 + * 'foo'.byterindex('o') # => 2 + * 'foo'.byterindex('oo') # => 1 + * 'foo'.byterindex('ooo') # => nil + * + * Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+, + * or +nil+ if none found: + * + * 'foo'.byterindex(/f/) # => 0 + * 'foo'.byterindex(/o/) # => 2 + * 'foo'.byterindex(/oo/) # => 1 + * 'foo'.byterindex(/ooo/) # => nil + * + * The _last_ match means starting at the possible last position, not + * the last of longest matches. 
+ * + * 'foo'.byterindex(/o+/) # => 2 + * $~ #=> #<MatchData "o"> + * + * To get the last longest match, combine it with a negative + * lookbehind. + * + * 'foo'.byterindex(/(?<!o)o+/) # => 1 + * $~ #=> #<MatchData "oo"> + * + * Or use String#byteindex with a negative lookahead. + * + * 'foo'.byteindex(/o+(?!.*o)/) # => 1 + * $~ #=> #<MatchData "oo"> + * + * \Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the + * string to _end_ the search: + * + * 'foo'.byterindex('o', 0) # => nil + * 'foo'.byterindex('o', 1) # => 1 + * 'foo'.byterindex('o', 2) # => 2 + * 'foo'.byterindex('o', 3) # => 2 + * + * If +offset+ is a negative \Integer, the maximum starting position in the + * string to _end_ the search is the sum of the string's byte length and +offset+: + * + * 'foo'.byterindex('o', -1) # => 2 + * 'foo'.byterindex('o', -2) # => 1 + * 'foo'.byterindex('o', -3) # => nil + * 'foo'.byterindex('o', -4) # => nil + * + * If +offset+ does not land on a character (codepoint) boundary, +IndexError+ is + * raised. + * + * Related: String#byteindex. 
+ */ + +static VALUE +rb_str_byterindex_m(int argc, VALUE *argv, VALUE str) +{ + VALUE sub; + VALUE vpos; + long pos, len = RSTRING_LEN(str); + + if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { + pos = NUM2LONG(vpos); + if (pos < 0) { + pos += len; + if (pos < 0) { + if (RB_TYPE_P(sub, T_REGEXP)) { + rb_backref_set(Qnil); + } + return Qnil; + } + } + if (pos > len) pos = len; + } + else { + pos = len; + } + + if (!str_check_byte_pos(str, pos)) { + rb_raise(rb_eIndexError, + "offset %ld does not land on character boundary", pos); + } + + if (RB_TYPE_P(sub, T_REGEXP)) { + if (rb_reg_search(sub, str, pos, 1) >= 0) { + VALUE match = rb_backref_get(); + struct re_registers *regs = RMATCH_REGS(match); + pos = BEG(0); + return LONG2NUM(pos); + } + } + else { + StringValue(sub); + pos = rb_str_byterindex(str, sub, pos); + if (pos >= 0) return LONG2NUM(pos); + } + return Qnil; +} + /* * call-seq: * string =~ regexp -> integer or nil @@ -12382,7 +12622,9 @@ Init_String(void) rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0); rb_define_method(rb_cString, "upto", rb_str_upto, -1); rb_define_method(rb_cString, "index", rb_str_index_m, -1); + rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1); rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1); + rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1); rb_define_method(rb_cString, "replace", rb_str_replace, 1); rb_define_method(rb_cString, "clear", rb_str_clear, 0); rb_define_method(rb_cString, "chr", rb_str_chr, 0); diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index 2bf4649f14..80caa879e4 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -424,6 +424,27 @@ class TestRegexp < Test::Unit::TestCase assert_equal([2, 3], m.offset(3)) end + def test_match_byteoffset_begin_end + m = /(?<x>b..)/.match("foobarbaz") + assert_equal([3, 6], m.byteoffset("x")) + assert_equal(3, m.begin("x")) + assert_equal(6, m.end("x")) + assert_raise(IndexError) 
{ m.byteoffset("y") } + assert_raise(IndexError) { m.byteoffset(2) } + assert_raise(IndexError) { m.begin(2) } + assert_raise(IndexError) { m.end(2) } + + m = /(?<x>q..)?/.match("foobarbaz") + assert_equal([nil, nil], m.byteoffset("x")) + assert_equal(nil, m.begin("x")) + assert_equal(nil, m.end("x")) + + m = /\A\u3042(.)(.)?(.)\z/.match("\u3042\u3043\u3044") + assert_equal([3, 6], m.byteoffset(1)) + assert_equal([nil, nil], m.byteoffset(2)) + assert_equal([6, 9], m.byteoffset(3)) + end + def test_match_to_s m = /(?<x>b..)/.match("foobarbaz") assert_equal("bar", m.to_s) diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index 95fbf63702..0e2a484478 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -1340,6 +1340,15 @@ CODE assert_nil($~) assert_equal(2, S("abcdbce").index(/b\Kc/)) + + assert_equal(0, S("こんにちは").index(?こ)) + assert_equal(1, S("こんにちは").index(S("んにち"))) + assert_equal(2, S("こんにちは").index(/にち./)) + + assert_equal(0, S("にんにちは").index(?に, 0)) + assert_equal(2, S("にんにちは").index(?に, 1)) + assert_equal(2, S("にんにちは").index(?に, 2)) + assert_nil(S("にんにちは").index(?に, 3)) end def test_insert @@ -1502,6 +1511,11 @@ CODE assert_nil(S("hello").rindex(S("z"))) assert_nil(S("hello").rindex(/z./)) + assert_equal(5, S("hello").rindex(S(""))) + assert_equal(5, S("hello").rindex(S(""), 5)) + assert_equal(4, S("hello").rindex(S(""), 4)) + assert_equal(0, S("hello").rindex(S(""), 0)) + o = Object.new def o.to_str; "bar"; end assert_equal(6, S("foobarbarbaz").rindex(o)) @@ -1514,6 +1528,24 @@ CODE assert_equal([3, 3], $~.offset(0)) assert_equal(5, S("abcdbce").rindex(/b\Kc/)) + + assert_equal(2, S("こんにちは").rindex(?に)) + assert_equal(6, S("にちは、こんにちは").rindex(S("にちは"))) + assert_equal(6, S("にちは、こんにちは").rindex(/にち./)) + + assert_equal(6, S("にちは、こんにちは").rindex(S("にちは"), 7)) + assert_equal(6, S("にちは、こんにちは").rindex(S("にちは"), -2)) + assert_equal(6, S("にちは、こんにちは").rindex(S("にちは"), 6)) + assert_equal(6, S("にちは、こんにちは").rindex(S("にちは"), -3)) + 
assert_equal(0, S("にちは、こんにちは").rindex(S("にちは"), 5)) + assert_equal(0, S("にちは、こんにちは").rindex(S("にちは"), -4)) + assert_equal(0, S("にちは、こんにちは").rindex(S("にちは"), 1)) + assert_equal(0, S("にちは、こんにちは").rindex(S("にちは"), 0)) + + assert_equal(0, S("こんにちは").rindex(S("こんにちは"))) + assert_nil(S("こんにち").rindex(S("こんにちは"))) + assert_nil(S("こ").rindex(S("こんにちは"))) + assert_nil(S("").rindex(S("こんにちは"))) end def test_rjust @@ -3254,6 +3286,114 @@ CODE assert_not_predicate(data, :valid_encoding?) assert_predicate(data[100..-1], :valid_encoding?) end + + def test_byteindex + assert_equal(0, S("hello").byteindex(?h)) + assert_equal(1, S("hello").byteindex(S("ell"))) + assert_equal(2, S("hello").byteindex(/ll./)) + + assert_equal(3, S("hello").byteindex(?l, 3)) + assert_equal(3, S("hello").byteindex(S("l"), 3)) + assert_equal(3, S("hello").byteindex(/l./, 3)) + + assert_nil(S("hello").byteindex(?z, 3)) + assert_nil(S("hello").byteindex(S("z"), 3)) + assert_nil(S("hello").byteindex(/z./, 3)) + + assert_nil(S("hello").byteindex(?z)) + assert_nil(S("hello").byteindex(S("z"))) + assert_nil(S("hello").byteindex(/z./)) + + assert_equal(0, S("").byteindex(S(""))) + assert_equal(0, S("").byteindex(//)) + assert_nil(S("").byteindex(S("hello"))) + assert_nil(S("").byteindex(/hello/)) + assert_equal(0, S("hello").byteindex(S(""))) + assert_equal(0, S("hello").byteindex(//)) + + s = S("long") * 1000 << "x" + assert_nil(s.byteindex(S("y"))) + assert_equal(4 * 1000, s.byteindex(S("x"))) + s << "yx" + assert_equal(4 * 1000, s.byteindex(S("x"))) + assert_equal(4 * 1000, s.byteindex(S("xyx"))) + + o = Object.new + def o.to_str; "bar"; end + assert_equal(3, S("foobarbarbaz").byteindex(o)) + assert_raise(TypeError) { S("foo").byteindex(Object.new) } + + assert_nil(S("foo").byteindex(//, -100)) + assert_nil($~) + + assert_equal(2, S("abcdbce").byteindex(/b\Kc/)) + + assert_equal(0, S("こんにちは").byteindex(?こ)) + assert_equal(3, S("こんにちは").byteindex(S("んにち"))) + assert_equal(6, S("こんにちは").byteindex(/にち./)) + + 
assert_equal(0, S("にんにちは").byteindex(?に, 0)) + assert_raise(IndexError) { S("にんにちは").byteindex(?に, 1) } + assert_raise(IndexError) { S("にんにちは").byteindex(?に, 5) } + assert_equal(6, S("にんにちは").byteindex(?に, 6)) + assert_equal(6, S("にんにちは").byteindex(S("に"), 6)) + assert_equal(6, S("にんにちは").byteindex(/に./, 6)) + assert_raise(IndexError) { S("にんにちは").byteindex(?に, 7) } + end + + def test_byterindex + assert_equal(3, S("hello").byterindex(?l)) + assert_equal(6, S("ell, hello").byterindex(S("ell"))) + assert_equal(7, S("ell, hello").byterindex(/ll./)) + + assert_equal(3, S("hello,lo").byterindex(?l, 3)) + assert_equal(3, S("hello,lo").byterindex(S("l"), 3)) + assert_equal(3, S("hello,lo").byterindex(/l./, 3)) + + assert_nil(S("hello").byterindex(?z, 3)) + assert_nil(S("hello").byterindex(S("z"), 3)) + assert_nil(S("hello").byterindex(/z./, 3)) + + assert_nil(S("hello").byterindex(?z)) + assert_nil(S("hello").byterindex(S("z"))) + assert_nil(S("hello").byterindex(/z./)) + + assert_equal(5, S("hello").byterindex(S(""))) + assert_equal(5, S("hello").byterindex(S(""), 5)) + assert_equal(4, S("hello").byterindex(S(""), 4)) + assert_equal(0, S("hello").byterindex(S(""), 0)) + + o = Object.new + def o.to_str; "bar"; end + assert_equal(6, S("foobarbarbaz").byterindex(o)) + assert_raise(TypeError) { S("foo").byterindex(Object.new) } + + assert_nil(S("foo").byterindex(//, -100)) + assert_nil($~) + + assert_equal(3, S("foo").byterindex(//)) + assert_equal([3, 3], $~.offset(0)) + + assert_equal(5, S("abcdbce").byterindex(/b\Kc/)) + + assert_equal(6, S("こんにちは").byterindex(?に)) + assert_equal(18, S("にちは、こんにちは").byterindex(S("にちは"))) + assert_equal(18, S("にちは、こんにちは").byterindex(/にち./)) + + assert_raise(IndexError) { S("にちは、こんにちは").byterindex(S("にちは"), 19) } + assert_raise(IndexError) { S("にちは、こんにちは").byterindex(S("にちは"), -2) } + assert_equal(18, S("にちは、こんにちは").byterindex(S("にちは"), 18)) + assert_equal(18, S("にちは、こんにちは").byterindex(S("にちは"), -3)) + assert_raise(IndexError) { 
S("にちは、こんにちは").byterindex(S("にちは"), 17) } + assert_raise(IndexError) { S("にちは、こんにちは").byterindex(S("にちは"), -4) } + assert_raise(IndexError) { S("にちは、こんにちは").byterindex(S("にちは"), 1) } + assert_equal(0, S("にちは、こんにちは").byterindex(S("にちは"), 0)) + + assert_equal(0, S("こんにちは").byterindex(S("こんにちは"))) + assert_nil(S("こんにち").byterindex(S("こんにちは"))) + assert_nil(S("こ").byterindex(S("こんにちは"))) + assert_nil(S("").byterindex(S("こんにちは"))) + end end class TestString2 < TestString |