diff options
author | matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2006-02-04 12:31:19 +0000 |
---|---|---|
committer | matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2006-02-04 12:31:19 +0000 |
commit | 086e235f035d44c3554cab364cd70644e537a919 (patch) | |
tree | 377844e954f01e7747b22c3acaabb23c23034baa | |
parent | f0432871fd3f9489c87199700efa32a7597f135c (diff) |
* oniguruma.h: merge Oniguruma 4.0.0 [ruby-dev:28290]
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@9885 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | euc_jp.c | 20 | ||||
-rw-r--r-- | oniguruma.h | 100 | ||||
-rw-r--r-- | regcomp.c | 155 | ||||
-rw-r--r-- | regenc.c | 70 | ||||
-rw-r--r-- | regenc.h | 26 | ||||
-rw-r--r-- | regerror.c | 50 | ||||
-rw-r--r-- | regexec.c | 115 | ||||
-rw-r--r-- | regint.h | 19 | ||||
-rw-r--r-- | regparse.c | 161 | ||||
-rw-r--r-- | regparse.h | 3 | ||||
-rw-r--r-- | sjis.c | 19 | ||||
-rw-r--r-- | utf8.c | 90 |
13 files changed, 469 insertions, 365 deletions
@@ -11,6 +11,10 @@ Sat Feb 4 15:52:56 2006 Hirokazu Yamamoto <[email protected]> I think the function name of rb_int2big is quite misleading. This should be "rb_long2big". +Sat Feb 4 15:02:05 2006 Yukihiro Matsumoto <[email protected]> + + * oniguruma.h: merge Oniguruma 4.0.0 [ruby-dev:28290] + Fri Feb 3 19:25:53 2006 Hirokazu Yamamoto <[email protected]> * ruby.h: fixed prototype. @@ -86,7 +90,7 @@ Tue Jan 31 08:07:02 2006 Yukihiro Matsumoto <[email protected]> * numeric.c (int_dotimes): ditto. * enum.c (enum_first): new method Enumerable#first to take first n - element from an enumerable. + elements from an enumerable. * enum.c (enum_group_by): new method Enumerable#group_by that groups enumerable values according to their block values. @@ -31,7 +31,7 @@ #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) -static int EncLen_EUCJP[] = { +static const int EncLen_EUCJP[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -158,20 +158,16 @@ eucjp_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) static int eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (eucjp_code_to_mbclen(code) > 1 ? 
TRUE : FALSE); - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; + } } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } static UChar* diff --git a/oniguruma.h b/oniguruma.h index 95dfbebc5f..0fd7e12a44 100644 --- a/oniguruma.h +++ b/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -34,8 +34,8 @@ extern "C" { #endif #define ONIGURUMA -#define ONIGURUMA_VERSION_MAJOR 3 -#define ONIGURUMA_VERSION_MINOR 7 +#define ONIGURUMA_VERSION_MAJOR 4 +#define ONIGURUMA_VERSION_MINOR 0 #define ONIGURUMA_VERSION_TEENY 0 #ifdef __cplusplus @@ -79,7 +79,11 @@ extern "C" { /* PART: character encoding */ -typedef unsigned char UChar; +#ifndef ONIG_ESCAPE_UCHAR_COLLISION +#define UChar OnigUChar +#endif + +typedef unsigned char OnigUChar; typedef unsigned long OnigCodePoint; typedef unsigned int OnigDistance; @@ -149,24 +153,24 @@ typedef m17n_encoding* OnigEncoding; #else typedef struct { - int (*mbc_enc_len)(const UChar* p); + int (*mbc_enc_len)(const OnigUChar* p); const char* name; int max_enc_len; int min_enc_len; OnigAmbigType support_ambig_flag; OnigMetaCharTableType meta_char_table; - int (*is_mbc_newline)(const UChar* p, const UChar* end); - OnigCodePoint (*mbc_to_code)(const UChar* p, const UChar* end); + int (*is_mbc_newline)(const OnigUChar* p, const OnigUChar* end); + OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end); int (*code_to_mbclen)(OnigCodePoint code); - int (*code_to_mbc)(OnigCodePoint code, UChar *buf); - int (*mbc_to_normalize)(OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* to); - int 
(*is_mbc_ambiguous)(OnigAmbigType flag, const UChar** pp, const UChar* end); - int (*get_all_pair_ambig_codes)(OnigAmbigType flag, OnigPairAmbigCodes** acs); - int (*get_all_comp_ambig_codes)(OnigAmbigType flag, OnigCompAmbigCodes** acs); + int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf); + int (*mbc_to_normalize)(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to); + int (*is_mbc_ambiguous)(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end); + int (*get_all_pair_ambig_codes)(OnigAmbigType flag, const OnigPairAmbigCodes** acs); + int (*get_all_comp_ambig_codes)(OnigAmbigType flag, const OnigCompAmbigCodes** acs); int (*is_code_ctype)(OnigCodePoint code, unsigned int ctype); - int (*get_ctype_code_range)(int ctype, OnigCodePoint* sb_range[], OnigCodePoint* mb_range[]); - UChar* (*left_adjust_char_head)(const UChar* start, const UChar* p); - int (*is_allowed_reverse_match)(const UChar* p, const UChar* end); + int (*get_ctype_code_range)(int ctype, const OnigCodePoint* sb_range[], const OnigCodePoint* mb_range[]); + OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p); + int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end); } OnigEncodingType; typedef OnigEncodingType* OnigEncoding; @@ -200,6 +204,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingSJIS; ONIG_EXTERN OnigEncodingType OnigEncodingKOI8; ONIG_EXTERN OnigEncodingType OnigEncodingKOI8_R; ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; +ONIG_EXTERN OnigEncodingType OnigEncodingGB18030; #define ONIG_ENCODING_ASCII (&OnigEncodingASCII) #define ONIG_ENCODING_ISO_8859_1 (&OnigEncodingISO_8859_1) @@ -230,6 +235,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #define ONIG_ENCODING_KOI8 (&OnigEncodingKOI8) #define ONIG_ENCODING_KOI8_R (&OnigEncodingKOI8_R) #define ONIG_ENCODING_BIG5 (&OnigEncodingBIG5) +#define ONIG_ENCODING_GB18030 (&OnigEncodingGB18030) #endif /* else RUBY && M17N */ @@ -333,22 +339,22 @@ ONIG_EXTERN 
OnigEncodingType OnigEncodingBIG5; ONIG_EXTERN int onigenc_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, int ctype)); ONIG_EXTERN -int onigenc_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); +int onigenc_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, OnigUChar *buf)); ONIG_EXTERN -int onigenc_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* buf)); +int onigenc_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* buf)); ONIG_EXTERN -int onigenc_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, const UChar** pp, const UChar* end)); +int onigenc_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end)); ONIG_EXTERN -int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, const UChar* s, const UChar* end)); +int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, const OnigUChar* s, const OnigUChar* end)); #else /* ONIG_RUBY_M17N */ #define ONIGENC_NAME(enc) ((enc)->name) #define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \ - (enc)->mbc_to_normalize(flag,(const UChar** )pp,end,buf) + (enc)->mbc_to_normalize(flag,(const OnigUChar** )pp,end,buf) #define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \ - (enc)->is_mbc_ambiguous(flag,(const UChar** )pp,end) + (enc)->is_mbc_ambiguous(flag,(const OnigUChar** )pp,end) #define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ((enc)->support_ambig_flag) #define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ (enc)->is_allowed_reverse_match(s,end) @@ -405,7 +411,7 @@ int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, const UChar* s, const (enc)->get_ctype_code_range(ctype,sbr,mbr) ONIG_EXTERN -UChar* onigenc_step_back P_((OnigEncoding enc, const UChar* start, const UChar* s, int n)); +OnigUChar* onigenc_step_back P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, int n)); #endif /* is not ONIG_RUBY_M17N */ @@ -418,21 
+424,21 @@ int onigenc_set_default_encoding P_((OnigEncoding enc)); ONIG_EXTERN OnigEncoding onigenc_get_default_encoding P_(()); ONIG_EXTERN -void onigenc_set_default_caseconv_table P_((const UChar* table)); +void onigenc_set_default_caseconv_table P_((const OnigUChar* table)); ONIG_EXTERN -UChar* onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, const UChar* start, const UChar* s, const UChar** prev)); +OnigUChar* onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar** prev)); ONIG_EXTERN -UChar* onigenc_get_prev_char_head P_((OnigEncoding enc, const UChar* start, const UChar* s)); +OnigUChar* onigenc_get_prev_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s)); ONIG_EXTERN -UChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, const UChar* start, const UChar* s)); +OnigUChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s)); ONIG_EXTERN -UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, const UChar* start, const UChar* s)); +OnigUChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s)); ONIG_EXTERN -int onigenc_strlen P_((OnigEncoding enc, const UChar* p, const UChar* end)); +int onigenc_strlen P_((OnigEncoding enc, const OnigUChar* p, const OnigUChar* end)); ONIG_EXTERN -int onigenc_strlen_null P_((OnigEncoding enc, const UChar* p)); +int onigenc_strlen_null P_((OnigEncoding enc, const OnigUChar* p)); ONIG_EXTERN -int onigenc_str_bytelen_null P_((OnigEncoding enc, const UChar* p)); +int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p)); @@ -465,6 +471,7 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_NOTBOL (ONIG_OPTION_CAPTURE_GROUP << 1) #define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1) #define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1) +#define ONIG_OPTION_MAXBIT 
ONIG_OPTION_POSIX_REGION /* limit */ #define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) #define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) @@ -478,6 +485,7 @@ typedef struct { OnigOptionType options; /* default option */ } OnigSyntaxType; +ONIG_EXTERN OnigSyntaxType OnigSyntaxASIS; ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixBasic; ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixExtended; ONIG_EXTERN OnigSyntaxType OnigSyntaxEmacs; @@ -485,9 +493,11 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxGrep; ONIG_EXTERN OnigSyntaxType OnigSyntaxGnuRegex; ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG; ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; /* predefined syntaxes (see regsyntax.c) */ +#define ONIG_SYNTAX_ASIS (&OnigSyntaxASIS) #define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) #define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) #define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) @@ -495,6 +505,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; #define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex) #define ONIG_SYNTAX_JAVA (&OnigSyntaxJava) #define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) +#define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG) #define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby) /* default syntax */ @@ -554,6 +565,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1<<17) /* \p{^..}, \P{^..} */ #define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1<<18) /* \p{IsXDigit} */ #define ONIG_SYN_OP2_ESC_H_XDIGIT (1<<19) /* \h, \H */ +#define ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (1<<20) /* \ */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1<<31) /* not implemented */ @@ -695,8 +707,8 @@ struct re_registers { typedef struct re_registers OnigRegion; typedef struct { - UChar* par; - UChar* par_end; + OnigUChar* par; + OnigUChar* par_end; } OnigErrorInfo; typedef struct { @@ -704,8 +716,8 @@ typedef struct { int upper; } 
OnigRepeatRange; -typedef void (*OnigWarnFunc) P_((const char* s, ...)); -extern void onig_null_warn P_((const char* s, ...)); +typedef void (*OnigWarnFunc) P_((const char* s)); +extern void onig_null_warn P_((const char* s)); #define ONIG_NULL_WARN onig_null_warn #define ONIG_CHAR_TABLE_SIZE 256 @@ -776,25 +788,25 @@ typedef struct { ONIG_EXTERN int onig_init P_((void)); ONIG_EXTERN -int onig_error_code_to_str PV_((UChar* s, int err_code, ...)); +int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...)); ONIG_EXTERN void onig_set_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN void onig_set_verb_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN -int onig_new P_((regex_t**, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +int onig_new P_((regex_t**, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN -int onig_new_deluxe P_((regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +int onig_new_deluxe P_((regex_t** reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); ONIG_EXTERN void onig_free P_((regex_t*)); ONIG_EXTERN -int onig_recompile P_((regex_t*, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +int onig_recompile P_((regex_t*, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN -int onig_recompile_deluxe P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +int onig_recompile_deluxe P_((regex_t* reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); ONIG_EXTERN -int 
onig_search P_((regex_t*, const UChar* str, const UChar* end, const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option)); +int onig_search P_((regex_t*, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN -int onig_match P_((regex_t*, const UChar* str, const UChar* end, const UChar* at, OnigRegion* region, OnigOptionType option)); +int onig_match P_((regex_t*, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN OnigRegion* onig_region_new P_((void)); ONIG_EXTERN @@ -810,11 +822,11 @@ int onig_region_resize P_((OnigRegion* region, int n)); ONIG_EXTERN int onig_region_set P_((OnigRegion* region, int at, int beg, int end)); ONIG_EXTERN -int onig_name_to_group_numbers P_((regex_t* reg, const UChar* name, const UChar* name_end, int** nums)); +int onig_name_to_group_numbers P_((regex_t* reg, const OnigUChar* name, const OnigUChar* name_end, int** nums)); ONIG_EXTERN -int onig_name_to_backref_number P_((regex_t* reg, const UChar* name, const UChar* name_end, OnigRegion *region)); +int onig_name_to_backref_number P_((regex_t* reg, const OnigUChar* name, const OnigUChar* name_end, OnigRegion *region)); ONIG_EXTERN -int onig_foreach_name P_((regex_t* reg, int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)); +int onig_foreach_name P_((regex_t* reg, int (*func)(const OnigUChar*, const OnigUChar*,int,int*,regex_t*,void*), void* arg)); ONIG_EXTERN int onig_number_of_names P_((regex_t* reg)); ONIG_EXTERN @@ -34,7 +34,7 @@ OnigAmbigType OnigDefaultAmbigFlag = ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE); extern OnigAmbigType -onig_get_default_ambig_flag(void) +onig_get_default_ambig_flag() { return OnigDefaultAmbigFlag; } @@ -2120,29 +2120,6 @@ get_char_length_tree(Node* node, regex_t* reg, int* len) return get_char_length_tree1(node, reg, len, 0); } -extern int 
-onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) -{ - int found; - - if (ONIGENC_MBC_MINLEN(enc) > 1 || (code >= SINGLE_BYTE_SIZE)) { - if (IS_NULL(cc->mbuf)) { - found = 0; - } - else { - found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); - } - } - else { - found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1); - } - - if (IS_CCLASS_NOT(cc)) - return !found; - else - return found; -} - /* x is not included y ==> 1 : 0 */ static int is_not_included(Node* x, Node* y, regex_t* reg) @@ -2516,6 +2493,9 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) case N_QUALIFIER: r = subexp_inf_recursive_check(NQUALIFIER(node).target, env, head); + if (r == RECURSION_EXIST) { + if (NQUALIFIER(node).lower == 0) r = 0; + } break; case N_ANCHOR: @@ -2943,15 +2923,55 @@ next_setup(Node* node, Node* next_node, regex_t* reg) return 0; } + +static int +divide_ambig_string_node_sub(regex_t* reg, int prev_ambig, + UChar* prev_start, UChar* prev, + UChar* end, Node*** tailp, Node** root) +{ + UChar *tmp, *wp; + Node* snode; + + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < prev) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, prev); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (*tailp == (Node** )0) { + *root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*root, ONIGERR_MEMORY); + *tailp = &(NCONS(*root).right); + } + else { + **tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(**tailp, ONIGERR_MEMORY); + *tailp = &(NCONS(**tailp).right); + } + + return 0; +} + static int divide_ambig_string_node(Node* node, regex_t* reg) { StrNode* sn = &NSTRING(node); int ambig, prev_ambig; UChar *prev, *p, *end, *prev_start, 
*start, *tmp, *wp; - Node *snode; Node *root = NULL_NODE; Node **tailp = (Node** )0; + int r; start = prev_start = p = sn->s; end = sn->end; @@ -2964,33 +2984,9 @@ divide_ambig_string_node(Node* node, regex_t* reg) if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end))) { - if (prev_ambig != 0) { - tmp = prev_start; - wp = prev_start; - while (tmp < prev) { - wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, - &tmp, end, wp); - } - snode = onig_node_new_str(prev_start, wp); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - NSTRING_SET_AMBIG(snode); - if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); - } - else { - snode = onig_node_new_str(prev_start, prev); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - } - - if (tailp == (Node** )0) { - root = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); - tailp = &(NCONS(root).right); - } - else { - *tailp = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); - tailp = &(NCONS(*tailp).right); - } + r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, prev, + end, &tailp, &root); + if (r != 0) return r; prev_ambig = ambig; prev_start = prev; @@ -3011,33 +3007,9 @@ divide_ambig_string_node(Node* node, regex_t* reg) } } else { - if (prev_ambig != 0) { - tmp = prev_start; - wp = prev_start; - while (tmp < end) { - wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, - &tmp, end, wp); - } - snode = onig_node_new_str(prev_start, wp); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - NSTRING_SET_AMBIG(snode); - if (wp != end) NSTRING_SET_AMBIG_REDUCE(snode); - } - else { - snode = onig_node_new_str(prev_start, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - } - - if (tailp == (Node** )0) { - root = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); - tailp = &(NCONS(node).right); - } - else { - *tailp = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(*tailp, 
ONIGERR_MEMORY); - tailp = &(NCONS(*tailp).right); - } + r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, end, + end, &tailp, &root); + if (r != 0) return r; swap_node(node, root); onig_node_str_clear(root); /* should be after swap! */ @@ -3383,7 +3355,7 @@ typedef struct { static int map_position_value(OnigEncoding enc, int i) { - static short int ByteValTable[] = { + static const short int ByteValTable[] = { 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, @@ -3408,7 +3380,7 @@ static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ - static short int dist_vals[] = { + static const short int dist_vals[] = { 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, @@ -3711,7 +3683,7 @@ select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt) static void clear_opt_map_info(OptMapInfo* map) { - static OptMapInfo clean_info = { + static const OptMapInfo clean_info = { {0, 0}, {0, 0}, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -3758,8 +3730,8 @@ add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, int i, j, n, len; UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN]; OnigCodePoint code, ccode; - OnigCompAmbigCodes* ccs; - OnigPairAmbigCodes* pccs; + const OnigCompAmbigCodes* ccs; + const OnigPairAmbigCodes* pccs; OnigAmbigType amb; add_char_opt_map_info(map, p[0], enc); @@ -4316,10 +4288,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); reg->exact_end = reg->exact + e->len; - if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) - allow_reverse = 1; - else - allow_reverse = + allow_reverse = ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { @@ -4514,8 +4483,8 @@ print_anchor(FILE* f, int anchor) static void 
print_optimize_info(FILE* f, regex_t* reg) { - static char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", - "EXACT_IC", "MAP" }; + static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", + "EXACT_IC", "MAP" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); @@ -4959,7 +4928,7 @@ onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, } extern int -onig_init(void) +onig_init() { if (onig_inited != 0) return 0; @@ -4981,9 +4950,9 @@ onig_init(void) extern int -onig_end(void) +onig_end() { - extern int onig_free_shared_cclass_table(void); + extern int onig_free_shared_cclass_table(); THREAD_ATOMIC_START; @@ -32,13 +32,13 @@ OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT; extern int -onigenc_init(void) +onigenc_init() { return 0; } extern OnigEncoding -onigenc_get_default_encoding(void) +onigenc_get_default_encoding() { return OnigEncDefaultCharEncoding; } @@ -175,7 +175,7 @@ onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) #define USE_APPLICATION_TO_LOWER_CASE_TABLE -unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { +const unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x228c, 0x2289, 0x2288, 0x2288, 0x2288, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -251,7 +251,7 @@ static const UChar BuiltInAsciiToLowerCaseTable[] = { #endif /* not USE_APPLICATION_TO_LOWER_CASE_TABLE */ #ifdef USE_UPPER_CASE_TABLE -UChar OnigEncAsciiToUpperCaseTable[256] = { +const UChar OnigEncAsciiToUpperCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -287,7 +287,7 @@ UChar OnigEncAsciiToUpperCaseTable[256] = { }; #endif -unsigned short OnigEncAsciiCtypeTable[256] 
= { +const unsigned short OnigEncAsciiCtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -323,7 +323,7 @@ unsigned short OnigEncAsciiCtypeTable[256] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; -UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { +const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -359,7 +359,7 @@ UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { }; #ifdef USE_UPPER_CASE_TABLE -UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { +const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -417,7 +417,7 @@ onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UC return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); } -OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { +const OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { { 0x41, 0x61 }, { 0x42, 0x62 }, { 0x43, 0x63 }, @@ -475,7 +475,7 @@ OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { extern int onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { *ccs = OnigAsciiPairAmbigCodes; @@ -488,16 +488,16 @@ onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag, extern int onigenc_nothing_get_all_comp_ambig_codes(OnigAmbigType flag, - OnigCompAmbigCodes** ccs) + const OnigCompAmbigCodes** ccs) { return 0; } extern int onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag, - 
OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xc0, 0xe0 }, { 0xc1, 0xe1 }, { 0xc2, 0xe2 }, @@ -577,9 +577,9 @@ onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag, extern int onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag, - OnigCompAmbigCodes** ccs) + const OnigCompAmbigCodes** ccs) { - static OnigCompAmbigCodes folds[] = { + static const OnigCompAmbigCodes folds[] = { { 2, 0xdf, {{ 2, { 0x53, 0x53 } }, { 2, { 0x73, 0x73} } } } }; @@ -593,7 +593,7 @@ onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag, extern int onigenc_not_support_get_ctype_code_range(int ctype, - OnigCodePoint* sbr[], OnigCodePoint* mbr[]) + const OnigCodePoint* sbr[], const OnigCodePoint* mbr[]) { return ONIG_NO_SUPPORT_CONFIG; } @@ -830,10 +830,10 @@ onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) if ((code & 0xff000000) != 0) { *p++ = (UChar )((code >> 24) & 0xff); } - if ((code & 0xff0000) != 0) { + if ((code & 0xff0000) != 0 || p != buf) { *p++ = (UChar )((code >> 16) & 0xff); } - if ((code & 0xff00) != 0) { + if ((code & 0xff00) != 0 || p != buf) { *p++ = (UChar )((code >> 8) & 0xff); } *p++ = (UChar )(code & 0xff); @@ -849,40 +849,32 @@ extern int onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? 
TRUE : FALSE); - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; + } } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } extern int onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE); - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; + } } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } extern int @@ -4,7 +4,7 @@ regenc.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -65,15 +65,17 @@ #else /* ONIG_RUBY_M17N */ #define USE_UNICODE_FULL_RANGE_CTYPE +/* following must not use with USE_CRNL_AS_LINE_TERMINATOR */ +/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTF#18 */ #define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII /* for encoding system implementation (internal) */ -ONIG_EXTERN int onigenc_ascii_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); -ONIG_EXTERN int onigenc_nothing_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); -ONIG_EXTERN int onigenc_iso_8859_1_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); -ONIG_EXTERN int onigenc_ess_tsett_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); -ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((int ctype, OnigCodePoint* sbr[], OnigCodePoint* mbr[])); +ONIG_EXTERN int onigenc_ascii_get_all_pair_ambig_codes P_((OnigAmbigType flag, const OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_nothing_get_all_comp_ambig_codes P_((OnigAmbigType flag, const OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_iso_8859_1_get_all_pair_ambig_codes P_((OnigAmbigType flag, const OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_ess_tsett_get_all_comp_ambig_codes P_((OnigAmbigType flag, const OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((int ctype, const OnigCodePoint* sbr[], const OnigCodePoint* mbr[])); ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end)); /* methods for single byte encoding */ @@ -105,7 +107,7 @@ ONIG_EXTERN int onigenc_get_all_fold_match_code_ss_0xdf P_((OnigCodePoint** code /* in enc/unicode.c */ ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype)); -ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, OnigCodePoint* sbr[], 
OnigCodePoint* mbr[])); +ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, const OnigCodePoint* sbr[], const OnigCodePoint* mbr[])); #define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \ @@ -115,10 +117,10 @@ ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, OnigCodePoin #define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \ ((OnigEnc_Unicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) -ONIG_EXTERN UChar OnigEncISO_8859_1_ToLowerCaseTable[]; -ONIG_EXTERN UChar OnigEncISO_8859_1_ToUpperCaseTable[]; -ONIG_EXTERN unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[]; -ONIG_EXTERN OnigPairAmbigCodes OnigAsciiPairAmbigCodes[]; +ONIG_EXTERN const UChar OnigEncISO_8859_1_ToLowerCaseTable[]; +ONIG_EXTERN const UChar OnigEncISO_8859_1_ToUpperCaseTable[]; +ONIG_EXTERN const unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[]; +ONIG_EXTERN const OnigPairAmbigCodes OnigAsciiPairAmbigCodes[]; #endif /* is not ONIG_RUBY_M17N */ @@ -133,7 +135,7 @@ extern int onig_is_in_code_range P_((const UChar* p, OnigCodePoint code)); ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; ONIG_EXTERN const UChar* OnigEncAsciiToLowerCaseTable; ONIG_EXTERN const UChar OnigEncAsciiToUpperCaseTable[]; -ONIG_EXTERN unsigned short OnigEncAsciiCtypeTable[]; +ONIG_EXTERN const unsigned short OnigEncAsciiCtypeTable[]; #define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c] #define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c] diff --git a/regerror.c b/regerror.c index 348b7b30ed..043f52d43b 100644 --- a/regerror.c +++ b/regerror.c @@ -30,14 +30,20 @@ #include "regint.h" #include <stdio.h> /* for vsnprintf() */ +#ifdef HAVE_STDARG_PROTOTYPES #include <stdarg.h> +#define va_init_list(a,b) va_start(a,b) +#else +#include <varargs.h> +#define va_init_list(a,b) va_start(a) +#endif -extern char* +extern UChar* onig_error_code_to_format(int code) { char *p; - if (code >= 0) return (char* )0; + if (code >= 0) return (UChar* )0; switch 
(code) { case ONIG_MISMATCH: @@ -171,7 +177,7 @@ onig_error_code_to_format(int code) p = "undefined error code"; break; } - return p; + return (UChar* )p; } @@ -179,14 +185,21 @@ onig_error_code_to_format(int code) #define MAX_ERROR_PAR_LEN 30 extern int +#ifdef HAVE_STDARG_PROTOTYPES onig_error_code_to_str(UChar* s, int code, ...) +#else +onig_error_code_to_str(s, code, va_alist) + UChar* s; + int code; + va_dcl +#endif { UChar *p, *q; OnigErrorInfo* einfo; int len; va_list vargs; - va_start(vargs, code); + va_init_list(vargs, code); switch (code) { case ONIGERR_UNDEFINED_NAME_REFERENCE: @@ -242,26 +255,37 @@ onig_error_code_to_str(UChar* s, int code, ...) void -onig_snprintf_with_pattern(char buf[], int bufsize, OnigEncoding enc, - char* pat, char* pat_end, char *fmt, ...) +#ifdef HAVE_STDARG_PROTOTYPES +onig_snprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, + UChar* pat, UChar* pat_end, const UChar *fmt, ...) +#else +onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) + UChar buf[]; + int bufsize; + OnigEncoding enc; + UChar* pat; + UChar* pat_end; + const UChar *fmt; + va_dcl +#endif { int n, need, len; UChar *p, *s, *bp; - char bs[6]; + UChar bs[6]; va_list args; - va_start(args, fmt); - n = vsnprintf(buf, bufsize, fmt, args); + va_init_list(args, (char* )fmt); + n = vsnprintf((char* )buf, bufsize, (char* )fmt, args); va_end(args); need = (pat_end - pat) * 4 + 4; if (n + need < bufsize) { - strcat(buf, ": /"); + strcat((char* )buf, ": /"); s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf); p = pat; - while (p < (UChar* )pat_end) { + while (p < pat_end) { if (*p == MC_ESC(enc)) { *s++ = *p++; len = enc_len(enc, p); @@ -280,7 +304,7 @@ onig_snprintf_with_pattern(char buf[], int bufsize, OnigEncoding enc, int blen; while (len-- > 0) { - sprintf(bs, "\\%03o", *p++ & 0377); + sprintf((char* )bs, "\\%03o", *p++ & 0377); blen = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); bp = bs; while (blen-- > 0) *s++ = 
*bp++; @@ -289,7 +313,7 @@ onig_snprintf_with_pattern(char buf[], int bufsize, OnigEncoding enc, } else if (!ONIGENC_IS_CODE_PRINT(enc, *p) && !ONIGENC_IS_CODE_SPACE(enc, *p)) { - sprintf(bs, "\\%03o", *p++ & 0377); + sprintf((char* )bs, "\\%03o", *p++ & 0377); len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); bp = bs; while (len-- > 0) *s++ = *bp++; @@ -29,6 +29,12 @@ #include "regint.h" +#ifdef USE_CRNL_AS_LINE_TERMINATOR +#define ONIGENC_IS_MBC_CRNL(enc,p,end) \ + (ONIGENC_MBC_TO_CODE(enc,p,end) == 13 && \ + ONIGENC_IS_MBC_NEWLINE(enc,(p+enc_len(enc,p)),end)) +#endif + #ifdef USE_CAPTURE_HISTORY static void history_tree_free(OnigCaptureTreeNode* node); @@ -227,7 +233,7 @@ onig_region_init(OnigRegion* region) } extern OnigRegion* -onig_region_new(void) +onig_region_new() { OnigRegion* r; @@ -1165,27 +1171,43 @@ onig_is_in_code_range(const UChar* p, OnigCodePoint code) } static int -code_is_in_cclass_node(void* node, OnigCodePoint code, int enclen) +is_code_in_cc(int enclen, OnigCodePoint code, CClassNode* cc) { - unsigned int in_cc; - CClassNode* cc = (CClassNode* )node; + int found; - if (enclen == 1 && code < SINGLE_BYTE_SIZE) { - in_cc = BITSET_AT(cc->bs, code); + if (enclen > 1 || (code >= SINGLE_BYTE_SIZE)) { + if (IS_NULL(cc->mbuf)) { + found = 0; + } + else { + found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); + } } else { - UChar* p = ((BBuf* )(cc->mbuf))->p; - in_cc = onig_is_in_code_range(p, code); + found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1); } - if (IS_CCLASS_NOT(cc)) { - return (in_cc ? 0 : 1); + if (IS_CCLASS_NOT(cc)) + return !found; + else + return found; +} + +extern int +onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) +{ + int len; + + if (ONIGENC_MBC_MINLEN(enc) > 1) { + len = 2; } else { - return (in_cc ? 
1 : 0); + len = ONIGENC_CODE_TO_MBCLEN(enc, code); } + return is_code_in_cc(len, code, cc); } + /* matching region of POSIX API */ typedef int regoff_t; @@ -1739,8 +1761,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, mb_len = enc_len(encode, s); ss = s; s += mb_len; + DATA_ENSURE(0); code = ONIGENC_MBC_TO_CODE(encode, ss, s); - if (code_is_in_cclass_node(node, code, mb_len) == 0) goto fail; + if (is_code_in_cc(mb_len, code, node) == 0) goto fail; } STAT_OP_OUT; break; @@ -1946,6 +1969,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, STAT_OP_OUT; continue; } +#ifdef USE_CRNL_AS_LINE_TERMINATOR + else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { + STAT_OP_OUT; + continue; + } +#endif goto fail; break; @@ -1966,6 +1995,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, STAT_OP_OUT; continue; } +#ifdef USE_CRNL_AS_LINE_TERMINATOR + else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { + UChar* ss = s + enc_len(encode, s); + if (ON_STR_END(ss + enc_len(encode, ss))) { + STAT_OP_OUT; + continue; + } + } +#endif goto fail; break; @@ -3029,7 +3067,11 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } - else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) + else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) +#ifdef USE_CRNL_AS_LINE_TERMINATOR + && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) +#endif + ) goto retry_gate; break; } @@ -3149,7 +3191,11 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, goto retry; } } - else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) { + else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) +#ifdef USE_CRNL_AS_LINE_TERMINATOR + && ! 
ONIGENC_IS_MBC_CRNL(reg->enc, p, end) +#endif + ) { p = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(p)) goto fail; goto retry; @@ -3310,7 +3356,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, } } else if (str == end) { /* empty string */ - static const UChar* address_for_empty_string = ""; + static const UChar* address_for_empty_string = (UChar* )""; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "onig_search: empty string.\n"); @@ -3354,8 +3400,11 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, if (sch_range > end) sch_range = (UChar* )end; } } - if (reg->dmax != ONIG_INFINITE_DISTANCE && - (end - start) >= reg->threshold_len) { + + if ((end - start) < reg->threshold_len) + goto mismatch; + + if (reg->dmax != ONIG_INFINITE_DISTANCE) { do { if (! forward_search_range(reg, str, end, s, sch_range, &low, &high, &low_prev)) goto mismatch; @@ -3368,22 +3417,26 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, prev = s; s += enc_len(reg->enc, s); } - if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { - if (IS_NOT_NULL(prev)) { - while (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end) && - s < range) { - prev = s; - s += enc_len(reg->enc, s); - } - } - } } while (s < range); goto mismatch; } else { /* check only. */ - if ((end - start) < reg->threshold_len || - ! forward_search_range(reg, str, end, s, sch_range, + if (! forward_search_range(reg, str, end, s, sch_range, &low, &high, (UChar** )NULL)) goto mismatch; + + if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { + do { + MATCH_AND_RETURN_CHECK; + prev = s; + s += enc_len(reg->enc, s); + + while (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end) && s < range) { + prev = s; + s += enc_len(reg->enc, s); + } + } while (s < range); + goto mismatch; + } } } @@ -3391,7 +3444,11 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, MATCH_AND_RETURN_CHECK; prev = s; s += enc_len(reg->enc, s); - } while (s <= range); /* exec s == range, because empty match with /$/. 
*/ + } while (s < range); + + if (s == range) { /* because empty match with /$/. */ + MATCH_AND_RETURN_CHECK; + } } else { /* backward search */ if (reg->optimize != ONIG_OPTIMIZE_NONE) { @@ -62,6 +62,11 @@ #define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR +/* treat \r\n as line terminator. + !!! NO SUPPORT !!! + use this configuration on your own responsibility */ +/* #define USE_CRNL_AS_LINE_TERMINATOR */ + /* internal config */ #define USE_RECYCLE_NODE #define USE_OP_PUSH_OR_JUMP_EXACT @@ -105,8 +110,8 @@ }\ } while (0) -#define DEFAULT_WARN_FUNCTION rb_warn -#define DEFAULT_VERB_WARN_FUNCTION rb_warning +#define DEFAULT_WARN_FUNCTION onig_rb_warn +#define DEFAULT_VERB_WARN_FUNCTION onig_rb_warning #endif /* else NOT_RUBY */ @@ -721,6 +726,11 @@ typedef void* PointerType; #define MC_ONE_OR_MORE_TIME(enc) (enc)->meta_char_table.one_or_more_time #define MC_ANYCHAR_ANYTIME(enc) (enc)->meta_char_table.anychar_anytime +#define IS_MC_ESC_CODE(code, enc, syn) \ + ((code) == MC_ESC(enc) && \ + !IS_SYNTAX_OP2((syn), ONIG_SYN_OP2_INEFFECTIVE_ESCAPE)) + + #define SYN_POSIX_COMMON_OP \ ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ ONIG_SYN_OP_DECIMAL_BACKREF | \ @@ -781,13 +791,14 @@ extern void onig_print_statistics P_((FILE* f)); #endif #endif -extern char* onig_error_code_to_format P_((int code)); -extern void onig_snprintf_with_pattern PV_((char buf[], int bufsize, OnigEncoding enc, char* pat, char* pat_end, char *fmt, ...)); +extern UChar* onig_error_code_to_format P_((int code)); +extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...)); extern int onig_bbuf_init P_((BBuf* buf, int size)); extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, OnigEncoding enc, OnigSyntaxType* syntax)); 
extern int onig_compile P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo)); extern void onig_chain_reduce P_((regex_t* reg)); extern void onig_chain_link_add P_((regex_t* to, regex_t* add)); extern void onig_transfer P_((regex_t* to, regex_t* from)); +extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); #endif /* REGINT_H */ diff --git a/regparse.c b/regparse.c index 837ea35b30..76bce7885b 100644 --- a/regparse.c +++ b/regparse.c @@ -58,7 +58,21 @@ OnigSyntaxType OnigSyntaxRuby = { OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; -extern void onig_null_warn(const char* s, ...) { } +extern void onig_null_warn(const char* s) { } + +#ifdef RUBY_PLATFORM +extern void +onig_rb_warn(const char* s) +{ + rb_warn(s); +} + +extern void +onig_rb_warning(const char* s) +{ + rb_warning(s); +} +#endif #ifdef DEFAULT_WARN_FUNCTION static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; @@ -1050,12 +1064,12 @@ onig_node_free(Node* node) #ifdef USE_RECYCLE_NODE extern int -onig_free_node_list(void) +onig_free_node_list() { FreeNode* n; THREAD_ATOMIC_START; - while (FreeNodeList) { + while (IS_NOT_NULL(FreeNodeList)) { n = FreeNodeList; FreeNodeList = FreeNodeList->next; xfree(n); @@ -1066,18 +1080,19 @@ onig_free_node_list(void) #endif static Node* -node_new(void) +node_new() { Node* node; #ifdef USE_RECYCLE_NODE + THREAD_ATOMIC_START; if (IS_NOT_NULL(FreeNodeList)) { - THREAD_ATOMIC_START; node = (Node* )FreeNodeList; FreeNodeList = FreeNodeList->next; THREAD_ATOMIC_END; return node; } + THREAD_ATOMIC_END; #endif node = (Node* )xmalloc(sizeof(Node)); @@ -1094,7 +1109,7 @@ initialize_cclass(CClassNode* cc) } static Node* -node_new_cclass(void) +node_new_cclass() { Node* node = node_new(); CHECK_NULL_RETURN(node); @@ -1106,7 +1121,7 @@ node_new_cclass(void) static Node* node_new_cclass_by_codepoint_range(int not, - OnigCodePoint sbr[], OnigCodePoint mbr[]) + const OnigCodePoint sbr[], const 
OnigCodePoint mbr[]) { CClassNode* cc; int n, i, j; @@ -1163,7 +1178,7 @@ node_new_ctype(int type) } static Node* -node_new_anychar(void) +node_new_anychar() { Node* node = node_new(); CHECK_NULL_RETURN(node); @@ -1434,7 +1449,7 @@ node_new_str_raw(UChar* s, UChar* end) } static Node* -node_new_empty(void) +node_new_empty() { return node_new_str(NULL, NULL); } @@ -2358,15 +2373,17 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) control: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; PFETCH(c); - if (c == MC_ESC(enc)) { - v = fetch_escaped_value(&p, end, env); - if (v < 0) return v; - c = (OnigCodePoint )(v & 0x9f); - } - else if (c == '?') + if (c == '?') { c = 0177; - else + } + else { + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; + } c &= 0x9f; + } break; } /* fall through */ @@ -2512,11 +2529,11 @@ CC_ESC_WARN(ScanEnv* env, UChar *c) if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { - char buf[WARN_BUFSIZE]; + UChar buf[WARN_BUFSIZE]; onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, - "character class has '%s' without escape", c); - (*onig_warn)(buf); + (UChar* )"character class has '%s' without escape", c); + (*onig_warn)((char* )buf); } } @@ -2526,11 +2543,11 @@ CCEND_ESC_WARN(ScanEnv* env, UChar* c) if (onig_warn == onig_null_warn) return ; if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { - char buf[WARN_BUFSIZE]; + UChar buf[WARN_BUFSIZE]; onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc, (env)->pattern, (env)->pattern_end, - "regular expression has '%s' without escape", c); - (*onig_warn)(buf); + (UChar* )"regular expression has '%s' without escape", c); + (*onig_warn)((char* )buf); } } @@ -2794,7 +2811,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CC_CC_OPEN; } else { - 
CC_ESC_WARN(env, "["); + CC_ESC_WARN(env, (UChar* )"["); } } } @@ -2833,7 +2850,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->backp = p; PFETCH(c); - if (c == MC_ESC(enc)) { + if (IS_MC_ESC_CODE(c, enc, syn)) { if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; tok->backp = p; @@ -3365,7 +3382,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case ']': if (*src > env->pattern) /* /].../ is allowed. */ - CCEND_ESC_WARN(env, "]"); + CCEND_ESC_WARN(env, (UChar* )"]"); break; case '#': @@ -3400,7 +3417,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) static int add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, - OnigCodePoint sbr[], OnigCodePoint mbr[]) + const OnigCodePoint sbr[], const OnigCodePoint mbr[]) { int i, r; OnigCodePoint j; @@ -3464,7 +3481,7 @@ static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { int c, r; - OnigCodePoint *sbr, *mbr; + const OnigCodePoint *sbr, *mbr; OnigEncoding enc = env->enc; r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr); @@ -3602,19 +3619,19 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) #define POSIX_BRACKET_NAME_MAX_LEN 6 static PosixBracketEntryType PBS[] = { - { "alnum", ONIGENC_CTYPE_ALNUM, 5 }, - { "alpha", ONIGENC_CTYPE_ALPHA, 5 }, - { "blank", ONIGENC_CTYPE_BLANK, 5 }, - { "cntrl", ONIGENC_CTYPE_CNTRL, 5 }, - { "digit", ONIGENC_CTYPE_DIGIT, 5 }, - { "graph", ONIGENC_CTYPE_GRAPH, 5 }, - { "lower", ONIGENC_CTYPE_LOWER, 5 }, - { "print", ONIGENC_CTYPE_PRINT, 5 }, - { "punct", ONIGENC_CTYPE_PUNCT, 5 }, - { "space", ONIGENC_CTYPE_SPACE, 5 }, - { "upper", ONIGENC_CTYPE_UPPER, 5 }, - { "xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, - { "ascii", ONIGENC_CTYPE_ASCII, 5 }, /* I don't know origin. Perl? 
*/ + { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 }, + { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 }, + { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 }, + { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 }, + { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 }, + { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 }, + { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 }, + { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 }, + { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 }, + { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 }, + { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 }, + { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, + { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 }, { (UChar* )NULL, -1, 0 } }; @@ -3638,7 +3655,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { p = (UChar* )onigenc_step(enc, p, end, pb->len); - if (onigenc_with_ascii_strncmp(enc, p, end, ":]", 2) != 0) + if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, env); @@ -3673,19 +3690,19 @@ static int property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc) { static PosixBracketEntryType PBS[] = { - { "Alnum", ONIGENC_CTYPE_ALNUM, 5 }, - { "Alpha", ONIGENC_CTYPE_ALPHA, 5 }, - { "Blank", ONIGENC_CTYPE_BLANK, 5 }, - { "Cntrl", ONIGENC_CTYPE_CNTRL, 5 }, - { "Digit", ONIGENC_CTYPE_DIGIT, 5 }, - { "Graph", ONIGENC_CTYPE_GRAPH, 5 }, - { "Lower", ONIGENC_CTYPE_LOWER, 5 }, - { "Print", ONIGENC_CTYPE_PRINT, 5 }, - { "Punct", ONIGENC_CTYPE_PUNCT, 5 }, - { "Space", ONIGENC_CTYPE_SPACE, 5 }, - { "Upper", ONIGENC_CTYPE_UPPER, 5 }, - { "XDigit", ONIGENC_CTYPE_XDIGIT, 6 }, - { "ASCII", ONIGENC_CTYPE_ASCII, 5 }, + { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 }, + { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 }, + { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 }, + { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 }, + { (UChar* 
)"Digit", ONIGENC_CTYPE_DIGIT, 5 }, + { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 }, + { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 }, + { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 }, + { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 }, + { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 }, + { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 }, + { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 }, + { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 }, { (UChar* )NULL, -1, 0 } }; @@ -3935,7 +3952,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, *src, env->pattern_end, 1, env->enc)) return ONIGERR_EMPTY_CHAR_CLASS; - CC_ESC_WARN(env, "]"); + CC_ESC_WARN(env, (UChar* )"]"); r = tok->type = TK_CHAR; /* allow []...] */ } @@ -4038,7 +4055,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, r = parse_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ - CC_ESC_WARN(env, "["); + CC_ESC_WARN(env, (UChar* )"["); p = tok->backp; v = (OnigCodePoint )tok->u.c; in_israw = 0; @@ -4084,7 +4101,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, goto val_entry; } else if (r == TK_CC_AND) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; } state = CCS_RANGE; @@ -4099,12 +4116,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, fetched = 1; /* [--x] or [a&&-x] is warned. 
*/ if (r == TK_CC_RANGE || and_start != 0) - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto val_entry; } else if (state == CCS_RANGE) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto sb_char; /* [!--x] is allowed */ } else { /* CCS_COMPLETE */ @@ -4113,12 +4130,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, fetched = 1; if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ else if (r == TK_CC_AND) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */ } r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; @@ -4495,7 +4512,7 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) if (qn->by_number == 0 && qnt->by_number == 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { int nestq_num, targetq_num; - char buf[WARN_BUFSIZE]; + UChar buf[WARN_BUFSIZE]; nestq_num = popular_qualifier_num(qn); targetq_num = popular_qualifier_num(qnt); @@ -4507,9 +4524,9 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) case RQ_DEL: if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, - env->pattern, env->pattern_end, - "redundant nested repeat operator"); - (*onig_verb_warn)(buf); + env->pattern, env->pattern_end, + (UChar* )"redundant nested repeat operator"); + (*onig_verb_warn)((char* )buf); } goto warn_exit; break; @@ -4518,10 +4535,10 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, - "nested repeat operator %s and %s was replaced with '%s'", + (UChar* )"nested repeat operator %s and %s was replaced with '%s'", PopularQStr[targetq_num], PopularQStr[nestq_num], 
ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); - (*onig_verb_warn)(buf); + (*onig_verb_warn)((char* )buf); } goto warn_exit; break; @@ -4553,8 +4570,8 @@ make_compound_alt_node_from_cc(OnigAmbigType ambig_flag, OnigEncoding enc, int r, i, j, k, clen, len, ncode, n; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; Node **ptail, *snode = NULL_NODE; - OnigCompAmbigCodes* ccs; - OnigCompAmbigCodeItem* ci; + const OnigCompAmbigCodes* ccs; + const OnigCompAmbigCodeItem* ci; OnigAmbigType amb; n = 0; @@ -4662,7 +4679,7 @@ i_free_shared_class(type_cclass_key* key, Node* node, void* arg) } extern int -onig_free_shared_cclass_table(void) +onig_free_shared_cclass_table() { if (IS_NOT_NULL(OnigTypeCClassTable)) { onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0); @@ -4819,7 +4836,7 @@ parse_exp(Node** np, OnigToken* tok, int term, int ctype, not; #ifdef USE_SHARED_CCLASS_TABLE - OnigCodePoint *sbr, *mbr; + const OnigCodePoint *sbr, *mbr; ctype = parse_ctype_to_enc_ctype(tok->u.subtype, &not); r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr); @@ -4901,7 +4918,7 @@ parse_exp(Node** np, OnigToken* tok, int term, if (IS_IGNORECASE(env->option)) { int i, n, in_cc; - OnigPairAmbigCodes* ccs; + const OnigPairAmbigCodes* ccs; BitSetRef bs = cc->bs; OnigAmbigType amb; diff --git a/regparse.h b/regparse.h index f68d07a67f..bdf6d92219 100644 --- a/regparse.h +++ b/regparse.h @@ -290,7 +290,6 @@ typedef struct { extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); #endif -extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); extern int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); @@ -303,7 +302,7 @@ extern Node* onig_node_new_anchor P_((int type)); extern Node* onig_node_new_str P_((const UChar* s, const 
UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); extern void onig_node_str_clear P_((Node* node)); -extern int onig_free_node_list(void); +extern int onig_free_node_list(); extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_make_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); @@ -29,7 +29,7 @@ #include "regenc.h" -static int EncLen_SJIS[] = { +static const int EncLen_SJIS[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -167,21 +167,16 @@ sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) static int sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE); } - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } static UChar* @@ -2,7 +2,7 @@ utf8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -40,7 +40,7 @@ #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) -static int EncLen_UTF8[] = { +static const int EncLen_UTF8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -65,6 +65,29 @@ utf8_mbc_enc_len(const UChar* p) return EncLen_UTF8[*p]; } +static int +utf8_is_mbc_newline(const UChar* p, const UChar* end) +{ + if (p < end) { + if (*p == 0x0a) return 1; + +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if (*p == 0x0d) return 1; + if (p + 1 < end) { + if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ + return 1; + if (p + 2 < end) { + if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) + && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ + return 1; + } + } +#endif + } + + return 0; +} + static OnigCodePoint utf8_mbc_to_code(const UChar* p, const UChar* end) { @@ -307,16 +330,16 @@ utf8_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) } -static OnigCodePoint EmptyRange[] = { 0 }; +static const OnigCodePoint EmptyRange[] = { 0 }; -static OnigCodePoint SBAlnum[] = { +static const OnigCodePoint SBAlnum[] = { 3, 0x0030, 0x0039, 0x0041, 0x005a, 0x0061, 0x007a }; -static OnigCodePoint MBAlnum[] = { +static const OnigCodePoint MBAlnum[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 411, #else @@ -738,13 +761,13 @@ static OnigCodePoint MBAlnum[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBAlnum */ -static OnigCodePoint SBAlpha[] = { +static const OnigCodePoint SBAlpha[] = { 2, 0x0041, 0x005a, 0x0061, 0x007a }; -static OnigCodePoint MBAlpha[] = { +static const OnigCodePoint MBAlpha[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 394, #else @@ -1149,13 +1172,13 @@ static OnigCodePoint MBAlpha[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBAlpha */ -static OnigCodePoint SBBlank[] = { +static const OnigCodePoint SBBlank[] = { 2, 0x0009, 0x0009, 0x0020, 0x0020 }; 
-static OnigCodePoint MBBlank[] = { +static const OnigCodePoint MBBlank[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 7, #else @@ -1173,13 +1196,13 @@ static OnigCodePoint MBBlank[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBBlank */ -static OnigCodePoint SBCntrl[] = { +static const OnigCodePoint SBCntrl[] = { 2, 0x0000, 0x001f, 0x007f, 0x007f }; -static OnigCodePoint MBCntrl[] = { +static const OnigCodePoint MBCntrl[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 18, #else @@ -1208,12 +1231,12 @@ static OnigCodePoint MBCntrl[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBCntrl */ -static OnigCodePoint SBDigit[] = { +static const OnigCodePoint SBDigit[] = { 1, 0x0030, 0x0039 }; -static OnigCodePoint MBDigit[] = { +static const OnigCodePoint MBDigit[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 22, #else @@ -1245,12 +1268,12 @@ static OnigCodePoint MBDigit[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBDigit */ -static OnigCodePoint SBGraph[] = { +static const OnigCodePoint SBGraph[] = { 1, 0x0021, 0x007e }; -static OnigCodePoint MBGraph[] = { +static const OnigCodePoint MBGraph[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 404, #else @@ -1665,12 +1688,12 @@ static OnigCodePoint MBGraph[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBGraph */ -static OnigCodePoint SBLower[] = { +static const OnigCodePoint SBLower[] = { 1, 0x0061, 0x007a }; -static OnigCodePoint MBLower[] = { +static const OnigCodePoint MBLower[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 423, #else @@ -2104,13 +2127,13 @@ static OnigCodePoint MBLower[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBLower */ -static OnigCodePoint SBPrint[] = { +static const OnigCodePoint SBPrint[] = { 2, 0x0009, 0x000d, 0x0020, 0x007e }; -static OnigCodePoint MBPrint[] = { +static const OnigCodePoint MBPrint[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 403, #else @@ -2524,7 +2547,7 @@ static OnigCodePoint MBPrint[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; 
/* end of MBPrint */ -static OnigCodePoint SBPunct[] = { +static const OnigCodePoint SBPunct[] = { 9, 0x0021, 0x0023, 0x0025, 0x002a, @@ -2537,7 +2560,7 @@ static OnigCodePoint SBPunct[] = { 0x007d, 0x007d }; /* end of SBPunct */ -static OnigCodePoint MBPunct[] = { +static const OnigCodePoint MBPunct[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 77, #else @@ -2625,13 +2648,13 @@ static OnigCodePoint MBPunct[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBPunct */ -static OnigCodePoint SBSpace[] = { +static const OnigCodePoint SBSpace[] = { 2, 0x0009, 0x000d, 0x0020, 0x0020 }; -static OnigCodePoint MBSpace[] = { +static const OnigCodePoint MBSpace[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 9, #else @@ -2651,12 +2674,12 @@ static OnigCodePoint MBSpace[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBSpace */ -static OnigCodePoint SBUpper[] = { +static const OnigCodePoint SBUpper[] = { 1, 0x0041, 0x005a }; -static OnigCodePoint MBUpper[] = { +static const OnigCodePoint MBUpper[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 420, #else @@ -3087,19 +3110,19 @@ static OnigCodePoint MBUpper[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBUpper */ -static OnigCodePoint SBXDigit[] = { +static const OnigCodePoint SBXDigit[] = { 3, 0x0030, 0x0039, 0x0041, 0x0046, 0x0061, 0x0066 }; -static OnigCodePoint SBASCII[] = { +static const OnigCodePoint SBASCII[] = { 1, 0x0000, 0x007f }; -static OnigCodePoint SBWord[] = { +static const OnigCodePoint SBWord[] = { 4, 0x0030, 0x0039, 0x0041, 0x005a, @@ -3107,7 +3130,7 @@ static OnigCodePoint SBWord[] = { 0x0061, 0x007a }; -static OnigCodePoint MBWord[] = { +static const OnigCodePoint MBWord[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 432, #else @@ -3554,7 +3577,7 @@ static OnigCodePoint MBWord[] = { static int utf8_get_ctype_code_range(int ctype, - OnigCodePoint* sbr[], OnigCodePoint* mbr[]) + const OnigCodePoint* sbr[], const OnigCodePoint* mbr[]) { #define CR_SET(sbl,mbl) do { \ *sbr = sbl; \ @@ -3622,7 
+3645,7 @@ static int utf8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { #ifdef USE_UNICODE_FULL_RANGE_CTYPE - OnigCodePoint *range; + const OnigCodePoint *range; #endif if (code < 256) { @@ -3674,6 +3697,9 @@ utf8_is_code_ctype(OnigCodePoint code, unsigned int ctype) case ONIGENC_CTYPE_ALNUM: range = MBAlnum; break; + case ONIGENC_CTYPE_NEWLINE: + return FALSE; + break; default: return ONIGENCERR_TYPE_BUG; @@ -3723,7 +3749,7 @@ OnigEncodingType OnigEncodingUTF8 = { , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - onigenc_is_mbc_newline_0x0a, + utf8_is_mbc_newline, utf8_mbc_to_code, utf8_code_to_mbclen, utf8_code_to_mbc, |