From: Nobuyoshi Nakada Date: 2009-01-19T05:28:18+09:00 Subject: [ruby-dev:37784] Re: String#hash なかだです。 At Sat, 17 Jan 2009 16:33:08 +0900, Yukihiro Matsumoto wrote in [ruby-dev:37779]: > |例えば, Complexだと > | > | @real.hash ^ @imag.hash > | > |となっています. 実際には, これはあまり良くなくて, @real==@image のとき > |つねに同じ値になってしまいます. そこで, 別のseedを与え > | > | @real.hash(seed1) ^ @image.hash(seed2) > | > |の様にすると, 上記の問題はなくなります. これは, Array等のhash関数でも > |同様です. また, Fixnum#hashとObject#hashも別のseedを与えるのもよさそう > |な気がします. > > 毎回変わる理由は[ruby-dev:37778]の通りなんですが、各オブジェ > クトのhashの計算方式を更新した方が良いという提案はそれとは独 > 立に有効だと思います。どんな計算式が良いのか議論していただけ > ませんか? ComplexやRationalはrb_memhash()が使えると思いますが、可変長であ るArrayやObject、Structなどでは多少工夫が必要でしょう。 しかし、MurmurHashって途中でたまたま0になるとそれ以降のデータは 無関係になるような気がするんですが、いいんでしょうか。 Index: array.c =================================================================== --- array.c (revision 21650) +++ array.c (working copy) @@ -2722,10 +2722,10 @@ recursive_hash(VALUE ary, VALUE dummy, i return LONG2FIX(0); } - h = RARRAY_LEN(ary); + h = rb_hash_start(RARRAY_LEN(ary)); for (i=0; ireal), f_hash(dat->imag)); + h[0] = rb_hash(rb_obj_class(self)); + n = rb_hash(dat->real); + h[1] = NUM2LONG(n); + n = rb_hash(dat->imag); + h[2] = NUM2LONG(n); + v = rb_memhash(h, sizeof(h)); + return LONG2FIX(v); } @@ -1383,5 +1389,4 @@ Init_Complex(void) id_expt = rb_intern("**"); id_floor = rb_intern("floor"); - id_hash = rb_intern("hash"); id_idiv = rb_intern("div"); id_inspect = rb_intern("inspect"); Index: hash.c =================================================================== --- hash.c (revision 21650) +++ hash.c (working copy) @@ -70,5 +70,5 @@ rb_any_hash(VALUE a) case T_FIXNUM: case T_SYMBOL: - hnum = (int)a; + hnum = rb_hash_end(rb_hash_start((unsigned int)a)); break; @@ -1511,7 +1511,10 @@ static int hash_i(VALUE key, VALUE val, int *hval) { + VALUE n; if (key == Qundef) return ST_CONTINUE; - *hval ^= rb_hash(key); - *hval ^= rb_hash(val); + n = rb_hash(key); + *hval = rb_hash_uint(*hval, NUM2LONG(n)); + n = rb_hash(val); + *hval = rb_hash_uint(*hval, NUM2LONG(n)); return ST_CONTINUE; } @@ -1527,6 +1530,7 @@ recursive_hash(VALUE hash, VALUE dummy, if (!RHASH(hash)->ntbl) return LONG2FIX(0); - hval = RHASH(hash)->ntbl->num_entries; + hval = rb_hash_start(RHASH(hash)->ntbl->num_entries); rb_hash_foreach(hash, hash_i, (st_data_t)&hval); + hval = rb_hash_end(hval); return INT2FIX(hval); } Index: gc.c =================================================================== --- gc.c (revision 21650) +++ gc.c (working copy) @@ -2904,5 +2904,4 @@ Init_GC(void) OBJ_FREEZE(nomem_error); - rb_define_method(rb_mKernel, "hash", rb_obj_id, 0); rb_define_method(rb_mKernel, "__id__", rb_obj_id, 0); rb_define_method(rb_mKernel, "object_id", rb_obj_id, 0); Index: object.c =================================================================== --- object.c (revision 21650) +++ object.c (working copy) @@ -96,4 +96,12 @@ rb_obj_equal(VALUE obj1, VALUE obj2) } +VALUE +rb_obj_hash(VALUE obj) +{ + VALUE oid = rb_obj_id(obj); + unsigned h = rb_hash_end(rb_hash_start(NUM2LONG(oid))); + return LONG2NUM(h); +} + /* * call-seq: @@ -2517,4 +2525,5 @@ Init_Object(void) rb_define_method(rb_mKernel, "!~", rb_obj_not_match, 1); rb_define_method(rb_mKernel, "eql?", rb_obj_equal, 1); + rb_define_method(rb_mKernel, "hash", rb_obj_hash, 0); rb_define_method(rb_mKernel, "class", rb_obj_class, 0); Index: range.c =================================================================== --- range.c (revision 21650) +++ range.c (working copy) @@ -214,12 +214,14 @@ static VALUE range_hash(VALUE range) { - long hash = EXCL(range); + unsigned hash = EXCL(range); VALUE v; + hash = rb_hash_start(hash); v = rb_hash(RANGE_BEG(range)); - hash ^= v << 1; + hash = rb_hash_uint(hash, NUM2LONG(v)); v = rb_hash(RANGE_END(range)); - hash ^= v << 9; - hash ^= EXCL(range) << 24; + hash = rb_hash_uint(hash, NUM2LONG(v)); + hash = rb_hash_uint(hash, EXCL(range) << 24); + hash = rb_hash_end(hash); return LONG2FIX(hash); Index: rational.c =================================================================== --- rational.c (revision 21650) +++ rational.c (working copy) @@ -28,5 +28,5 @@ VALUE rb_cRational; static ID id_abs, id_cmp, id_convert, id_equal_p, id_expt, id_floor, - id_hash, id_idiv, id_inspect, id_integer_p, id_negate, id_to_f, + id_idiv, id_inspect, id_integer_p, id_negate, id_to_f, id_to_i, id_to_s, id_truncate; @@ -136,9 +136,6 @@ f_sub(VALUE x, VALUE y) } -binop(xor, '^') - fun1(abs) fun1(floor) -fun1(hash) fun1(inspect) fun1(integer_p) @@ -1162,6 +1159,15 @@ static VALUE nurat_hash(VALUE self) { + long v, h[3]; + VALUE n; + get_dat1(self); - return f_xor(f_hash(dat->num), f_hash(dat->den)); + h[0] = rb_hash(rb_obj_class(self)); + n = rb_hash(dat->num); + h[1] = NUM2LONG(n); + n = rb_hash(dat->den); + h[2] = NUM2LONG(n); + v = rb_memhash(h, sizeof(h)); + return LONG2FIX(v); } @@ -1555,5 +1561,4 @@ Init_Rational(void) id_expt = rb_intern("**"); id_floor = rb_intern("floor"); - id_hash = rb_intern("hash"); id_idiv = rb_intern("div"); id_inspect = rb_intern("inspect"); Index: string.c =================================================================== --- string.c (revision 21650) +++ string.c (working copy) @@ -1883,10 +1883,17 @@ rb_str_concat(VALUE str1, VALUE str2) /* MurmurHash described in http://murmurhash.googlepages.com/ */ -static unsigned int -hash(const unsigned char * data, int len, unsigned int h) +static inline uint32_t +murmur(unsigned int h, const int r) { const unsigned int m = 0x7fd652ad; - const int r = 16; + h *= m; + return h ^ (h >> r); +} + +#define murmur16(h) murmur(h, 16) +static unsigned int +hash(const unsigned char * data, int len, unsigned int h) +{ h += 0xdeadbeef; @@ -1929,7 +1936,5 @@ hash(const unsigned char * data, int len t = (t >> sr) | (d << sl); #endif - h += t; - h *= m; - h ^= h >> r; + h = murmur16(h + t); t = d; @@ -1954,6 +1959,5 @@ hash(const unsigned char * data, int len h += (t >> sr) | (d << sl); #endif - h *= m; - h ^= h >> r; + h = murmur16(h); } @@ -1965,8 +1969,5 @@ hash(const unsigned char * data, int len { do { - h += *(uint32_t *)data; - h *= m; - h ^= h >> r; - + h = murmur16(h + *(uint32_t *)data); data += 4; len -= 4; @@ -1991,18 +1992,57 @@ hash(const unsigned char * data, int len h += data[0]; #endif - h *= m; - h ^= h >> r; + h = murmur16(h); } - h *= m; - h ^= h >> 10; - h *= m; - h ^= h >> 17; + return rb_hash_end(h); +} + +unsigned int +rb_hash_uint32(unsigned int h, unsigned int i) +{ + return murmur(h + i, 16); +} + +unsigned int +rb_hash_uint(unsigned int h, unsigned int i) +{ + unsigned int v = 0; + h += i; +#ifdef WORDS_BIGENDIAN +#if SIZEOF_INT*CHAR_BIT > 12*8 + v = murmur16(v + (h >> 12*8)); +#endif +#if SIZEOF_INT*CHAR_BIT > 8*8 + v = murmur16(v + (h >> 8*8)); +#endif +#if SIZEOF_INT*CHAR_BIT > 4*8 + v = murmur16(v + (h >> 4*8)); +#endif +#endif + v = murmur16(v + h); +#ifndef WORDS_BIGENDIAN +#if SIZEOF_INT*CHAR_BIT > 4*8 + v = murmur16(v + (h >> 4*8)); +#endif +#if SIZEOF_INT*CHAR_BIT > 8*8 + v = murmur16(v + (h >> 8*8)); +#endif +#if SIZEOF_INT*CHAR_BIT > 12*8 + v = murmur16(v + (h >> 12*8)); +#endif +#endif + return v; +} +unsigned int +rb_hash_end(unsigned int h) +{ + h = murmur(h, 10); + h = murmur(h, 17); return h; } -int -rb_memhash(const void *ptr, long len) +unsigned int +rb_hash_start(unsigned int h) { static int hashseed_init = 0; @@ -2013,6 +2053,11 @@ rb_memhash(const void *ptr, long len) hashseed_init = 1; } + return hashseed + h; +} - return hash(ptr, len, hashseed); +int +rb_memhash(const void *ptr, long len) +{ + return hash(ptr, len, rb_hash_start(0)); } Index: struct.c =================================================================== --- struct.c (revision 21650) +++ struct.c (working copy) @@ -805,14 +805,15 @@ static VALUE rb_struct_hash(VALUE s) { - long i, h; + long i; + unsigned h; VALUE n; - h = rb_hash(rb_obj_class(s)); + h = rb_hash_start(rb_hash(rb_obj_class(s))); for (i = 0; i < RSTRUCT_LEN(s); i++) { - h = (h << 1) | (h<0 ? 1 : 0); n = rb_hash(RSTRUCT_PTR(s)[i]); - h ^= NUM2LONG(n); + h = rb_hash_uint(h, NUM2LONG(n)); } - return LONG2FIX(h); + h = rb_hash_end(h); + return INT2FIX(h); } Index: time.c =================================================================== --- time.c (revision 21650) +++ time.c (working copy) @@ -1184,5 +1184,5 @@ time_hash(VALUE time) GetTimeval(time, tobj); - hash = tobj->ts.tv_sec ^ tobj->ts.tv_nsec; + hash = rb_hash_end(rb_hash_uint(rb_hash_start(tobj->ts.tv_sec), tobj->ts.tv_nsec)); return LONG2FIX(hash); } Index: include/ruby/intern.h =================================================================== --- include/ruby/intern.h (revision 21650) +++ include/ruby/intern.h (working copy) @@ -607,4 +607,8 @@ VALUE rb_str_append(VALUE, VALUE); VALUE rb_str_concat(VALUE, VALUE); int rb_memhash(const void *ptr, long len); +unsigned int rb_hash_start(unsigned int); +unsigned int rb_hash_uint32(unsigned int, unsigned int); +unsigned int rb_hash_uint(unsigned int, unsigned int); +unsigned int rb_hash_end(unsigned int); int rb_str_hash(VALUE); int rb_str_hash_cmp(VALUE,VALUE); -- --- 僕の前にBugはない。 --- 僕の後ろにBugはできる。 中田 伸悦