diff options
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | string.c | 84 |
2 files changed, 73 insertions, 16 deletions
@@ -1,3 +1,8 @@
+Sat Jan 19 22:41:39 2008 Tanaka Akira <[email protected]>
+
+        * string.c (coderange_scan): don't call mbclen functions for ASCII
+          characters with ASCII compatible encoding.
+
 Sat Jan 19 21:00:34 2008 Tanaka Akira <[email protected]>
 
         * lib/rdoc/template.rb (RDoc): defined to avoid uninitialized constant
@@ -115,40 +115,92 @@ single_byte_optimizable(VALUE str)
 
 VALUE rb_fs;
 
+static inline const char *
+search_nonascii(const char *p, const char *e)
+{
+#if ULONG_MAX == 18446744073709551615UL
+# define NONASCII_MASK 0x8080808080808080UL
+#elif ULONG_MAX == 4294967295UL
+# define NONASCII_MASK 0x80808080UL
+#endif
+#ifdef NONASCII_MASK
+    if (sizeof(long) * 2 < e - p) {
+        const unsigned long *s, *t;
+        const VALUE lowbits = sizeof(unsigned long) - 1;
+        s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
+        t = (const unsigned long*)(~lowbits & (VALUE)e);
+        while (p < (const char *)s) {
+            if (!ISASCII(*p))
+                return p;
+            p++;
+        }
+        while (s < t) {
+            if (*s & NONASCII_MASK) {
+                t = s;
+                break;
+            }
+            s++;
+        }
+        p = (const char *)t;
+    }
+#endif
+    while (p < e) {
+        if (!ISASCII(*p))
+            return p;
+        p++;
+    }
+    return NULL;
+}
+
 static int
 coderange_scan(const char *p, long len, rb_encoding *enc)
 {
     const char *e = p + len;
-    int cr;
 
     if (rb_enc_to_index(enc) == 0) {
         /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
+        p = search_nonascii(p, e);
+        return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
+    }
+
+    if (rb_enc_asciicompat(enc)) {
+        p = search_nonascii(p, e);
+        if (!p) {
+            return ENC_CODERANGE_7BIT;
+        }
         while (p < e) {
-            if (!ISASCII((unsigned char)*p)) {
-                return ENC_CODERANGE_VALID;
+            int ret = rb_enc_precise_mbclen(p, e, enc);
+            int len = MBCLEN_CHARFOUND(ret);
+            if (!len) {
+                return ENC_CODERANGE_BROKEN;
             }
-            p++;
+            p += len;
+            if (p < e) {
+                p = search_nonascii(p, e);
+                if (!p) {
+                    return ENC_CODERANGE_VALID;
+                }
+            }
+        }
+        if (e < p) {
+            return ENC_CODERANGE_BROKEN;
         }
-        return ENC_CODERANGE_7BIT;
+        return ENC_CODERANGE_VALID;
     }
 
-    cr = rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
     while (p < e) {
         int ret = rb_enc_precise_mbclen(p, e, enc);
         int len = MBCLEN_CHARFOUND(ret);
 
-        if (len) {
-            if (len != 1 || !ISASCII((unsigned char)*p)) {
-                cr = ENC_CODERANGE_VALID;
-            }
-            p += len;
-        }
-        else {
-            cr = ENC_CODERANGE_BROKEN;
-            break;
+        if (!len) {
+            return ENC_CODERANGE_BROKEN;
         }
+        p += len;
     }
-    return cr;
+    if (e < p) {
+        return ENC_CODERANGE_BROKEN;
+    }
+    return ENC_CODERANGE_VALID;
 }
 
 int