diff options
-rw-r--r-- | ChangeLog | 10 | ||||
-rw-r--r-- | oniguruma.h | 296 | ||||
-rw-r--r-- | parse.y | 46 | ||||
-rw-r--r-- | re.c | 31 | ||||
-rw-r--r-- | regcomp.c | 816 | ||||
-rw-r--r-- | regexec.c | 813 | ||||
-rw-r--r-- | regint.h | 116 | ||||
-rw-r--r-- | regparse.c | 942 | ||||
-rw-r--r-- | regparse.h | 100 |
9 files changed, 1884 insertions, 1286 deletions
@@ -1,3 +1,13 @@ +Thu Nov 4 23:41:55 2004 Kazuo Saito <[email protected]> + + * ascii.c, euc_jp.c, oniggnu.h, oniguruma.h, regcomp.c, + regenc.c, regenc.h, regerror.c, regexec.c, reggnu.c, + regint.h, regparse.c, regparse.h, sjis.c, utf8.c: + imported Oni Guruma 3.4.0. + + * parse.y, re.c: Now mbclen() takes unsigned char as + its argument. + Thu Nov 4 21:25:38 2004 Yukihiro Matsumoto <[email protected]> * string.c (str_gsub): string modify check no longer based on diff --git a/oniguruma.h b/oniguruma.h index 3fd9f4c395..c10f3b4d18 100644 --- a/oniguruma.h +++ b/oniguruma.h @@ -1,17 +1,38 @@ +#ifndef ONIGURUMA_H +#define ONIGURUMA_H /********************************************************************** - oniguruma.h - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako ([email protected]) - **********************************************************************/ -#ifndef ONIGURUMA_H -#define ONIGURUMA_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #define ONIGURUMA -#define ONIGURUMA_VERSION_MAJOR 2 -#define ONIGURUMA_VERSION_MINOR 2 -#define ONIGURUMA_VERSION_TEENY 8 +#define ONIGURUMA_VERSION_MAJOR 3 +#define ONIGURUMA_VERSION_MINOR 4 +#define ONIGURUMA_VERSION_TEENY 0 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -56,12 +77,56 @@ typedef struct { OnigCodePoint to; } OnigCodePointRange; -#define ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE 16 + +/* ambiguous match flag */ +#define ONIGENC_AMBIGUOUS_MATCH_NONE 0 +#define ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE (1<<0) +#define ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE (1<<1) +/* #define ONIGENC_AMBIGUOUS_MATCH_ACCENT (1<<2) */ +/* #define ONIGENC_AMBIGUOUS_MATCH_HIRAGANA_KATAKANA (1<<3) */ +/* #define ONIGENC_AMBIGUOUS_MATCH_KATAKANA_WIDTH (1<<4) */ + +#define ONIGENC_AMBIGUOUS_MATCH_LIMIT (1<<1) +#define ONIGENC_AMBIGUOUS_MATCH_COMPOUND (1<<30) + +#define ONIGENC_AMBIGUOUS_MATCH_FULL \ + ( ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_COMPOUND ) +#define ONIGENC_AMBIGUOUS_MATCH_DEFAULT \ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | \ + ONIGENC_AMBIGUOUS_MATCH_COMPOUND ) + +typedef unsigned int OnigAmbigType; + +#define ONIGENC_MAX_COMP_AMBIG_CODE_LEN 3 +#define ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM 4 + +typedef struct { + int len; + OnigCodePoint code[ONIGENC_MAX_COMP_AMBIG_CODE_LEN]; +} OnigCompAmbigCodeItem; + typedef struct { - int target_num; - int target_byte_len[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; - UChar* target_str[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; -} OnigEncFoldMatchInfo; + int n; + OnigCodePoint code; + OnigCompAmbigCodeItem items[ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM]; +} OnigCompAmbigCodes; + +typedef struct { + OnigCodePoint from; + OnigCodePoint to; +} OnigPairAmbigCodes; + +typedef struct { + OnigCodePoint esc; + OnigCodePoint anychar; + OnigCodePoint anytime; + OnigCodePoint zero_or_one_time; + OnigCodePoint one_or_more_time; + OnigCodePoint anychar_anytime; +} OnigMetaCharTableType; #if defined(RUBY_PLATFORM) && defined(M17N_H) @@ -72,23 +137,24 @@ typedef m17n_encoding* OnigEncoding; #else typedef struct { - const char len_table[256]; - const char* name; - int max_enc_len; - int is_fold_match; - int ctype_support_level; /* sb-only/full */ - int is_continuous_sb_mb; /* code point is continuous from sb to mb */ + int (*mbc_enc_len)(UChar* p); + const char* name; + int max_enc_len; + int min_enc_len; + OnigAmbigType support_ambig_flag; + OnigMetaCharTableType meta_char_table; + int (*is_mbc_newline)(UChar* p, UChar* end); OnigCodePoint (*mbc_to_code)(UChar* p, UChar* end); int (*code_to_mbclen)(OnigCodePoint code); int (*code_to_mbc)(OnigCodePoint code, UChar *buf); - int (*mbc_to_lower)(UChar* p, UChar* lower); - int (*mbc_is_case_ambig)(UChar* p); - int (*code_is_ctype)(OnigCodePoint code, unsigned int ctype); + int (*mbc_to_normalize)(OnigAmbigType flag, UChar** pp, UChar* end, UChar* to); + int (*is_mbc_ambiguous)(OnigAmbigType flag, UChar** pp, UChar* end); + int (*get_all_pair_ambig_codes)(OnigAmbigType flag, OnigPairAmbigCodes** acs); + int (*get_all_comp_ambig_codes)(OnigAmbigType flag, OnigCompAmbigCodes** acs); + int (*is_code_ctype)(OnigCodePoint code, unsigned int ctype); int (*get_ctype_code_range)(int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]); - UChar* (*left_adjust_char_head)(UChar* start, UChar* s); - int (*is_allowed_reverse_match)(UChar* p, UChar* e); - int (*get_all_fold_match_code)(OnigCodePoint** codes); - int (*get_fold_match_info)(UChar* p, UChar* end, OnigEncFoldMatchInfo** info); + UChar* (*left_adjust_char_head)(UChar* start, UChar* p); + int (*is_allowed_reverse_match)(UChar* p, UChar* end); } OnigEncodingType; typedef OnigEncodingType* OnigEncoding; @@ -110,6 +176,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_14; ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_15; ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_16; ONIG_EXTERN OnigEncodingType OnigEncodingUTF8; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_BE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_LE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_BE; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_LE; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_JP; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_TW; ONIG_EXTERN OnigEncodingType OnigEncodingEUC_KR; @@ -136,6 +206,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #define ONIG_ENCODING_ISO_8859_15 (&OnigEncodingISO_8859_15) #define ONIG_ENCODING_ISO_8859_16 (&OnigEncodingISO_8859_16) #define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8) +#define ONIG_ENCODING_UTF16_BE (&OnigEncodingUTF16_BE) +#define ONIG_ENCODING_UTF16_LE (&OnigEncodingUTF16_LE) +#define ONIG_ENCODING_UTF32_BE (&OnigEncodingUTF32_BE) +#define ONIG_ENCODING_UTF32_LE (&OnigEncodingUTF32_LE) #define ONIG_ENCODING_EUC_JP (&OnigEncodingEUC_JP) #define ONIG_ENCODING_EUC_TW (&OnigEncodingEUC_TW) #define ONIG_ENCODING_EUC_KR (&OnigEncodingEUC_KR) @@ -151,35 +225,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; /* work size */ -#define ONIGENC_CODE_TO_MBC_MAXLEN 7 -#define ONIGENC_MBC_TO_LOWER_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN +#define ONIGENC_CODE_TO_MBC_MAXLEN 7 +#define ONIGENC_MBC_NORMALIZE_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN /* character types */ -#define ONIGENC_CTYPE_ALPHA (1<< 0) -#define ONIGENC_CTYPE_BLANK (1<< 1) -#define ONIGENC_CTYPE_CNTRL (1<< 2) -#define ONIGENC_CTYPE_DIGIT (1<< 3) -#define ONIGENC_CTYPE_GRAPH (1<< 4) -#define ONIGENC_CTYPE_LOWER (1<< 5) -#define ONIGENC_CTYPE_PRINT (1<< 6) -#define ONIGENC_CTYPE_PUNCT (1<< 7) -#define ONIGENC_CTYPE_SPACE (1<< 8) -#define ONIGENC_CTYPE_UPPER (1<< 9) -#define ONIGENC_CTYPE_XDIGIT (1<<10) -#define ONIGENC_CTYPE_WORD (1<<11) -#define ONIGENC_CTYPE_ASCII (1<<12) +#define ONIGENC_CTYPE_NEWLINE (1<< 0) +#define ONIGENC_CTYPE_ALPHA (1<< 1) +#define ONIGENC_CTYPE_BLANK (1<< 2) +#define ONIGENC_CTYPE_CNTRL (1<< 3) +#define ONIGENC_CTYPE_DIGIT (1<< 4) +#define ONIGENC_CTYPE_GRAPH (1<< 5) +#define ONIGENC_CTYPE_LOWER (1<< 6) +#define ONIGENC_CTYPE_PRINT (1<< 7) +#define ONIGENC_CTYPE_PUNCT (1<< 8) +#define ONIGENC_CTYPE_SPACE (1<< 9) +#define ONIGENC_CTYPE_UPPER (1<<10) +#define ONIGENC_CTYPE_XDIGIT (1<<11) +#define ONIGENC_CTYPE_WORD (1<<12) +#define ONIGENC_CTYPE_ASCII (1<<13) #define ONIGENC_CTYPE_ALNUM (ONIGENC_CTYPE_ALPHA | ONIGENC_CTYPE_DIGIT) -/* ctype support level */ -#define ONIGENC_CTYPE_SUPPORT_LEVEL_SB 0 -#define ONIGENC_CTYPE_SUPPORT_LEVEL_FULL 1 - -#define enc_len(enc,byte) ONIGENC_MBC_LEN_BY_HEAD(enc,byte) +#define enc_len(enc,p) ONIGENC_MBC_ENC_LEN(enc,p) #define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF) #define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1) -#define ONIGENC_IS_MBC_HEAD(enc,byte) (ONIGENC_MBC_LEN_BY_HEAD(enc,byte) != 1) +#define ONIGENC_IS_MBC_HEAD(enc,p) (ONIGENC_MBC_ENC_LEN(enc,p) != 1) #define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128) #define ONIGENC_IS_CODE_ASCII(code) ((code) < 128) #define ONIGENC_IS_CODE_SB_WORD(enc,code) \ @@ -192,31 +263,33 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #include <ctype.h> /* for isblank(), isgraph() */ -#define ONIGENC_MBC_TO_LOWER(enc,p,buf) onigenc_mbc_to_lower(enc,p,buf) -#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) onigenc_mbc_is_case_ambig(enc,p) +#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \ + onigenc_mbc_to_normalize(enc,flag,pp,end,buf) +#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \ + onigenc_is_mbc_ambiguous(enc,flag,pp,end) -#define ONIGENC_IS_FOLD_MATCH(enc) FALSE -#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) FALSE -#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ONIGENC_CTYPE_SUPPORT_LEVEL_SB +#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE #define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ onigenc_is_allowed_reverse_match(enc, s, end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ onigenc_get_left_adjust_char_head(enc, start, s) -#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) 0 -#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) ONIG_NO_SUPPORT_CONFIG +#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, ambig_flag, acs) 0 +#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, ambig_flag, acs) 0 #define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,nsb,nmb,sbr,mbr) \ ONIG_NO_SUPPORT_CONFIG -#define ONIGENC_MBC_LEN_BY_HEAD(enc,b) m17n_mbclen(enc,(int )b) +#define ONIGENC_MBC_ENC_LEN(enc,p) m17n_mbclen(enc,(int )(*p)) #define ONIGENC_MBC_MAXLEN(enc) m17n_mbmaxlen(enc) #define ONIGENC_MBC_MAXLEN_DIST(enc) \ (ONIGENC_MBC_MAXLEN(enc) > 0 ? ONIGENC_MBC_MAXLEN(enc) \ : ONIG_INFINITE_DISTANCE) +#define ONIGENC_MBC_MINLEN(enc) 1 #define ONIGENC_MBC_TO_CODE(enc,p,e) m17n_codepoint((enc),(p),(e)) #define ONIGENC_CODE_TO_MBCLEN(enc,code) m17n_codelen((enc),(code)) #define ONIGENC_CODE_TO_MBC(enc,code,buf) onigenc_code_to_mbc(enc, code, buf) -#if 0 -#define ONIGENC_STEP_BACK(enc,start,s,n) /* !! not supported !! */ +#if 0 /* !! not supported !! */ +#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) +#define ONIGENC_STEP_BACK(enc,start,s,n) #endif #define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) \ @@ -251,9 +324,9 @@ int onigenc_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, int ctype)); ONIG_EXTERN int onigenc_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); ONIG_EXTERN -int onigenc_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* buf)); +int onigenc_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, UChar** pp, UChar* end, UChar* buf)); ONIG_EXTERN -int onigenc_mbc_is_case_ambig P_((OnigEncoding enc, UChar* p)); +int onigenc_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, UChar** pp, UChar* end)); ONIG_EXTERN int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end)); @@ -261,32 +334,35 @@ int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end) #define ONIGENC_NAME(enc) ((enc)->name) -#define ONIGENC_MBC_TO_LOWER(enc,p,buf) (enc)->mbc_to_lower(p,buf) -#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) (enc)->mbc_is_case_ambig(p) - -#define ONIGENC_IS_FOLD_MATCH(enc) ((enc)->is_fold_match) -#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) ((enc)->is_continuous_sb_mb) -#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ((enc)->ctype_support_level) +#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \ + (enc)->mbc_to_normalize(flag,pp,end,buf) +#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \ + (enc)->is_mbc_ambiguous(flag,pp,end) +#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ((enc)->support_ambig_flag) #define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ (enc)->is_allowed_reverse_match(s,end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ (enc)->left_adjust_char_head(start, s) -#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) \ - (enc)->get_all_fold_match_code(codes) -#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) \ - (enc)->get_fold_match_info(p,end,info) +#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc,ambig_flag,acs) \ + (enc)->get_all_pair_ambig_codes(ambig_flag,acs) +#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc,ambig_flag,acs) \ + (enc)->get_all_comp_ambig_codes(ambig_flag,acs) #define ONIGENC_STEP_BACK(enc,start,s,n) \ onigenc_step_back((enc),(start),(s),(n)) -#define ONIGENC_MBC_LEN_BY_HEAD(enc,byte) ((enc)->len_table[(int )(byte)]) +#define ONIGENC_MBC_ENC_LEN(enc,p) (enc)->mbc_enc_len(p) #define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) #define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) -#define ONIGENC_MBC_TO_CODE(enc,p,e) (enc)->mbc_to_code((p),(e)) +#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) +#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end)) +#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end)) #define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code) #define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf) -#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->code_is_ctype(code,ctype) +#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->is_code_ctype(code,ctype) +#define ONIGENC_IS_CODE_NEWLINE(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_NEWLINE) #define ONIGENC_IS_CODE_GRAPH(enc,code) \ ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH) #define ONIGENC_IS_CODE_PRINT(enc,code) \ @@ -340,6 +416,12 @@ ONIG_EXTERN UChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); ONIG_EXTERN UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); +ONIG_EXTERN +int onigenc_strlen P_((OnigEncoding enc, UChar* p, UChar* end)); +ONIG_EXTERN +int onigenc_strlen_null P_((OnigEncoding enc, UChar* p)); +ONIG_EXTERN +int onigenc_str_bytelen_null P_((OnigEncoding enc, UChar* p)); @@ -353,13 +435,6 @@ UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UC /* constants */ #define ONIG_MAX_ERROR_MESSAGE_LEN 90 -#if defined(RUBY_PLATFORM) && !defined(ONIG_RUBY_M17N) -ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; -#undef ismbchar -#define ismbchar(c) (mbclen((c)) != 1) -#define mbclen(c) (OnigEncDefaultCharEncoding->len_table[(unsigned char )(c)]) -#endif - typedef unsigned int OnigOptionType; #define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE @@ -467,6 +542,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1<<16) /* \p{...}, \P{...} */ #define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1<<17) /* \p{^..}, \P{^..} */ #define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1<<18) /* \p{IsXDigit} */ +#define ONIG_SYN_OP2_ESC_H_XDIGIT (1<<19) /* \h, \H */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1<<31) /* not implemented */ @@ -479,6 +555,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */ #define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1<<7) /* see doc/RE */ #define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1<<8) /* (?<x>)(?<x>) */ +#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1<<9) /* a{n}?=(?:a{n})? */ /* syntax (behavior) in char class [...] */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<20) /* [^...] */ @@ -565,6 +642,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223 #define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 #define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 +#define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402 /* errors related to thread */ #define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 @@ -575,6 +653,15 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \ ((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i]) +typedef struct OnigCaptureTreeNodeStruct { + int group; /* group number */ + int beg; + int end; + int allocated; + int num_childs; + struct OnigCaptureTreeNodeStruct** childs; +} OnigCaptureTreeNode; + /* match result region type */ struct re_registers { int allocated; @@ -582,9 +669,16 @@ struct re_registers { int* beg; int* end; /* extended */ - struct re_registers** list; /* capture history. list[1]-list[31] */ + OnigCaptureTreeNode* history_root; /* capture history tree root */ }; +/* capture tree traverse */ +#define ONIG_TRAVERSE_CALLBACK_AT_FIRST 1 +#define ONIG_TRAVERSE_CALLBACK_AT_LAST 2 +#define ONIG_TRAVERSE_CALLBACK_AT_BOTH \ + ( ONIG_TRAVERSE_CALLBACK_AT_FIRST | ONIG_TRAVERSE_CALLBACK_AT_LAST ) + + #define ONIG_REGION_NOTPOS -1 typedef struct re_registers OnigRegion; @@ -635,6 +729,7 @@ typedef struct re_pattern_buffer { OnigEncoding enc; OnigOptionType options; OnigSyntaxType* syntax; + OnigAmbigType ambig_flag; void* name_table; /* optimization info (string search, char-map and anchors) */ @@ -646,7 +741,7 @@ typedef struct re_pattern_buffer { int sub_anchor; /* start-anchor for exact or map */ unsigned char *exact; unsigned char *exact_end; - unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ + unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ int *int_map; /* BM skip for exact_len > 255 */ int *int_map_backward; /* BM skip for backward search */ OnigDistance dmin; /* min-distance of exact or map */ @@ -657,6 +752,15 @@ typedef struct re_pattern_buffer { } regex_t; +typedef struct { + int num_of_elements; + OnigEncoding pattern_enc; + OnigEncoding target_enc; + OnigSyntaxType* syntax; + OnigOptionType option; + OnigAmbigType ambig_flag; +} OnigCompileInfo; + /* Oniguruma Native API */ ONIG_EXTERN int onig_init P_((void)); @@ -669,10 +773,14 @@ void onig_set_verb_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN int onig_new P_((regex_t**, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN +int onig_new_deluxe P_((regex_t** reg, UChar* pattern, UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +ONIG_EXTERN void onig_free P_((regex_t*)); ONIG_EXTERN int onig_recompile P_((regex_t*, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN +int onig_recompile_deluxe P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +ONIG_EXTERN int onig_search P_((regex_t*, UChar* str, UChar* end, UChar* start, UChar* range, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN int onig_match P_((regex_t*, UChar* str, UChar* end, UChar* at, OnigRegion* region, OnigOptionType option)); @@ -696,16 +804,34 @@ int onig_foreach_name P_((regex_t* reg, int (*func)(UChar*,UChar*,int,int*,regex ONIG_EXTERN int onig_number_of_names P_((regex_t* reg)); ONIG_EXTERN +int onig_number_of_captures P_((regex_t* reg)); +ONIG_EXTERN +int onig_number_of_capture_histories P_((regex_t* reg)); +ONIG_EXTERN +OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region)); +ONIG_EXTERN +int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,int,int,int,int,void*), void* arg)); +ONIG_EXTERN OnigEncoding onig_get_encoding P_((regex_t* reg)); ONIG_EXTERN OnigOptionType onig_get_options P_((regex_t* reg)); ONIG_EXTERN +OnigAmbigType onig_get_ambig_flag P_((regex_t* reg)); +ONIG_EXTERN OnigSyntaxType* onig_get_syntax P_((regex_t* reg)); ONIG_EXTERN int onig_set_default_syntax P_((OnigSyntaxType* syntax)); ONIG_EXTERN void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from)); ONIG_EXTERN +unsigned int onig_get_syntax_op P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +unsigned int onig_get_syntax_op2 P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +unsigned int onig_get_syntax_behavior P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +OnigOptionType onig_get_syntax_options P_((OnigSyntaxType* syntax)); +ONIG_EXTERN void onig_set_syntax_op P_((OnigSyntaxType* syntax, unsigned int op)); ONIG_EXTERN void onig_set_syntax_op2 P_((OnigSyntaxType* syntax, unsigned int op2)); @@ -714,7 +840,9 @@ void onig_set_syntax_behavior P_((OnigSyntaxType* syntax, unsigned int behavior) ONIG_EXTERN void onig_set_syntax_options P_((OnigSyntaxType* syntax, OnigOptionType options)); ONIG_EXTERN -int onig_set_meta_char P_((unsigned int what, OnigCodePoint code)); +int onig_set_meta_char P_((OnigEncoding enc, unsigned int what, OnigCodePoint code)); +ONIG_EXTERN +void onig_copy_encoding P_((OnigEncoding to, OnigEncoding from)); ONIG_EXTERN unsigned int onig_get_match_stack_limit_size P_((void)); ONIG_EXTERN @@ -723,5 +851,7 @@ ONIG_EXTERN int onig_end P_((void)); ONIG_EXTERN const char* onig_version P_((void)); +ONIG_EXTERN +const char* onig_copyright P_((void)); #endif /* ONIGURUMA_H */ @@ -4853,8 +4853,10 @@ parser_tokadd_string(parser, func, term, paren, nest) long *nest; { int c; + unsigned char uc; while ((c = nextc()) != -1) { + uc = (unsigned char)c; if (paren && c == paren) { ++*nest; } @@ -4905,8 +4907,8 @@ parser_tokadd_string(parser, func, term, paren, nest) } } } - else if (ismbchar(c)) { - int i, len = mbclen(c)-1; + else if (ismbchar(uc)) { + int i, len = mbclen(uc)-1; for (i = 0; i < len; i++) { tokadd(c); @@ -5002,6 +5004,7 @@ parser_heredoc_identifier(parser) struct parser_params *parser; { int c = nextc(), term, func = 0, len; + unsigned int uc; if (c == '-') { c = nextc(); @@ -5019,7 +5022,8 @@ parser_heredoc_identifier(parser) tokadd(func); term = c; while ((c = nextc()) != -1 && c != term) { - len = mbclen(c); + uc = (unsigned int)c; + len = mbclen(uc); do {tokadd(c);} while (--len > 0 && (c = nextc()) != -1); } if (c == -1) { @@ -5029,7 +5033,8 @@ parser_heredoc_identifier(parser) break; default: - if (!is_identchar(c)) { + uc = (unsigned int)c; + if (!is_identchar(uc)) { pushback(c); if (func & STR_FUNC_INDENT) { pushback('-'); @@ -5040,9 +5045,11 @@ parser_heredoc_identifier(parser) term = '"'; tokadd(func |= str_dquote); do { - len = mbclen(c); + uc = (unsigned int)c; + len = mbclen(uc); do {tokadd(c);} while (--len > 0 && (c = nextc()) != -1); - } while ((c = nextc()) != -1 && is_identchar(c)); + } while ((c = nextc()) != -1 && + (uc = (unsigned char)c, is_identchar(uc))); pushback(c); break; } @@ -5233,6 +5240,7 @@ parser_yylex(parser) register int c; int space_seen = 0; int cmd_state; + unsigned char uc; #ifdef RIPPER int fallthru = Qfalse; #endif @@ -5519,6 +5527,7 @@ parser_yylex(parser) rb_compile_error(PARSER_ARG "incomplete character syntax"); return 0; } + uc = (unsigned char)c; if (ISSPACE(c)){ if (!IS_ARG()){ int c2 = 0; @@ -5551,7 +5560,7 @@ parser_yylex(parser) lex_state = EXPR_TERNARY; return '?'; } - else if (ismbchar(c)) { + else if (ismbchar(uc)) { rb_warnI("multibyte character literal not supported yet; use ?\\%.3o", c); goto ternary; } @@ -6098,7 +6107,8 @@ parser_yylex(parser) } else { term = nextc(); - if (ISALNUM(term) || ismbchar(term)) { + uc = (unsigned char)c; + if (ISALNUM(term) || ismbchar(uc)) { yyerror("unknown type of %string"); return 0; } @@ -6177,7 +6187,8 @@ parser_yylex(parser) switch (c) { case '_': /* $_: last read line string */ c = nextc(); - if (is_identchar(c)) { + uc = (unsigned char)c; + if (is_identchar(uc)) { tokadd('$'); tokadd('_'); break; @@ -6243,7 +6254,8 @@ parser_yylex(parser) return tNTH_REF; default: - if (!is_identchar(c)) { + uc = (unsigned char)c; + if (!is_identchar(uc)) { pushback(c); return '$'; } @@ -6268,7 +6280,8 @@ parser_yylex(parser) rb_compile_error(PARSER_ARG "`@@%c' is not allowed as a class variable name", c); } } - if (!is_identchar(c)) { + uc = (unsigned char)c; + if (!is_identchar(uc)) { pushback(c); return '@'; } @@ -6290,7 +6303,8 @@ parser_yylex(parser) break; default: - if (!is_identchar(c)) { + uc = (unsigned char)c; + if (!is_identchar(uc)) { rb_compile_error(PARSER_ARG "Invalid char `\\%03o' in expression", c); goto retry; } @@ -6299,10 +6313,11 @@ parser_yylex(parser) break; } + uc = (unsigned char)c; do { tokadd(c); - if (ismbchar(c)) { - int i, len = mbclen(c)-1; + if (ismbchar(uc)) { + int i, len = mbclen(uc)-1; for (i = 0; i < len; i++) { c = nextc(); @@ -6310,7 +6325,8 @@ parser_yylex(parser) } } c = nextc(); - } while (is_identchar(c)); + uc = (unsigned char)c; + } while (is_identchar(uc)); if ((c == '!' || c == '?') && is_identchar(tok()[0]) && !peek('=')) { tokadd(c); } @@ -248,11 +248,12 @@ rb_reg_mbclen2(c, re) VALUE re; { int len; + unsigned char uc = (unsigned char)c; if (!FL_TEST(re, KCODE_FIXED)) - return mbclen(c); + return mbclen(uc); kcode_set_option(re); - len = mbclen(c); + len = mbclen(uc); kcode_reset_option(); return len; } @@ -1775,8 +1776,8 @@ rb_reg_quote(str) send = s + RSTRING(str)->len; for (; s < send; s++) { c = *s; - if (ismbchar(c)) { - int n = mbclen(c); + if (ismbchar(*s)) { + int n = mbclen(*s); while (n-- && s < send) s++; @@ -1804,8 +1805,8 @@ rb_reg_quote(str) for (; s < send; s++) { c = *s; - if (ismbchar(c)) { - int n = mbclen(c); + if (ismbchar(*s)) { + int n = mbclen(*s); while (n-- && s < send) *t++ = *s++; @@ -2044,21 +2045,23 @@ rb_reg_regsub(str, src, regs) struct re_registers *regs; { VALUE val = 0; - char *p, *s, *e, c; + char *p, *s, *e; + unsigned char uc; int no; + p = s = RSTRING(str)->ptr; e = s + RSTRING(str)->len; while (s < e) { char *ss = s; - c = *s++; - if (ismbchar(c)) { - s += mbclen(c) - 1; + uc = (unsigned char)*s++; + if (ismbchar(uc)) { + s += mbclen(uc) - 1; continue; } - if (c != '\\' || s == e) continue; + if (uc != '\\' || s == e) continue; if (!val) { val = rb_str_buf_new(ss-p); @@ -2068,12 +2071,12 @@ rb_reg_regsub(str, src, regs) rb_str_buf_cat(val, p, ss-p); } - c = *s++; + uc = (unsigned char)*s++; p = s; - switch (c) { + switch (uc) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - no = c - '0'; + no = uc - '0'; break; case '&': no = 0; @@ -1,16 +1,42 @@ /********************************************************************** - regcomp.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako ([email protected]) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regparse.h" #ifndef PLATFORM_UNALIGNED_WORD_ACCESS static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; #endif +/* + Caution: node should not be a string node. + (s and end member address break) +*/ static void swap_node(Node* a, Node* b) { @@ -120,33 +146,6 @@ unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) #endif /* USE_SUBEXP_CALL */ -#if 0 -static int -bitset_mbmaxlen(BitSetRef bs, int negative, OnigEncoding enc) -{ - int i; - int len, maxlen = 0; - - if (negative) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - return maxlen; -} -#endif - static int add_opcode(regex_t* reg, int opcode) { @@ -293,15 +292,15 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) { int op; - switch (mb_len) { - case 1: - if (ignore_case) { - switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; - } + if (ignore_case) { + switch (str_len) { + case 1: op = OP_EXACT1_IC; break; + default: op = OP_EXACTN_IC; break; } - else { + } + else { + switch (mb_len) { + case 1: switch (str_len) { case 1: op = OP_EXACT1; break; case 2: op = OP_EXACT2; break; @@ -310,25 +309,25 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) case 5: op = OP_EXACT5; break; default: op = OP_EXACTN; break; } - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_EXACTMB3N; + break; - default: - op = OP_EXACTMBN; - break; + default: + op = OP_EXACTMBN; + break; + } } return op; } @@ -373,7 +372,7 @@ compile_call(CallNode* node, regex_t* reg) r = add_opcode(reg, OP_CALL); if (r) return r; r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), - node->target); + node->target); if (r) return r; r = add_abs_addr(reg, 0 /*dummy addr.*/); return r; @@ -394,15 +393,14 @@ compile_tree_n_times(Node* node, int n, regex_t* reg) static int add_compile_string_length(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) + regex_t* reg, int ignore_case) { int len; int op = select_str_opcode(mb_len, str_len, ignore_case); len = SIZE_OPCODE; - if (op == OP_EXACTMBN) - len += SIZE_LENGTH; + if (op == OP_EXACTMBN) len += SIZE_LENGTH; if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; @@ -412,7 +410,7 @@ add_compile_string_length(UChar* s, int mb_len, int str_len, static int add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) + regex_t* reg, int ignore_case) { int op = select_str_opcode(mb_len, str_len, ignore_case); add_opcode(reg, op); @@ -420,8 +418,12 @@ add_compile_string(UChar* s, int mb_len, int str_len, if (op == OP_EXACTMBN) add_length(reg, mb_len); - if (IS_NEED_STR_LEN_OP_EXACT(op)) - add_length(reg, str_len); + if (IS_NEED_STR_LEN_OP_EXACT(op)) { + if (op == OP_EXACTN_IC) + add_length(reg, mb_len * str_len); + else + add_length(reg, str_len); + } add_bytes(reg, s, mb_len * str_len); return 0; @@ -429,49 +431,37 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int -compile_length_string_node(StrNode* sn, regex_t* reg) +compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig, ic; + int rlen, r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; rlen = 0; for (; p < sn->end; ) { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && ambig == 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); } else { r = add_compile_string_length(prev, prev_len, slen, reg, ambig); rlen += r; - - if (ic != 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - prev = p; slen = 1; prev_len = len; } - p += len; } r = add_compile_string_length(prev, prev_len, slen, reg, ambig); @@ -489,49 +479,33 @@ compile_length_string_raw_node(StrNode* sn, regex_t* reg) } static int -compile_string_node(StrNode* sn, regex_t* reg) +compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig, ic; + int r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; - UChar *p, *prev; + UChar *p, *prev, *end; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + end = sn->end; + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) - ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; - for (; p < sn->end; ) { - len = enc_len(enc, *p); + for (; p < end; ) { + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && len == 1) { - if (ambig == 0) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } } else { r = add_compile_string(prev, prev_len, slen, reg, ambig); if (r) return r; - if (ic != 0 && len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; prev = p; slen = 1; @@ -584,8 +558,7 @@ compile_length_cclass_node(CClassNode* cc, regex_t* reg) len = SIZE_OPCODE + SIZE_BITSET; } else { - if (bitset_is_empty(cc->bs)) { - /* SIZE_BITSET is included in mbuf->used. */ + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { len = SIZE_OPCODE; } else { @@ -613,7 +586,7 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) r = add_bitset(reg, cc->bs); } else { - if (bitset_is_empty(cc->bs)) { + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { if (cc->not) add_opcode(reg, OP_CCLASS_MB_NOT); else add_opcode(reg, OP_CCLASS_MB); @@ -649,7 +622,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; p = (OnigRepeatRange* )xrealloc(reg->repeat_range, - sizeof(OnigRepeatRange) * n); + sizeof(OnigRepeatRange) * n); CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -665,7 +638,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) static int compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, - regex_t* reg) + regex_t* reg) { int r; int num_repeat = reg->num_repeat; @@ -685,15 +658,16 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, if (r) return r; if ( - #ifdef USE_SUBEXP_CALL +#ifdef USE_SUBEXP_CALL reg->num_call > 0 || - #endif +#endif IS_QUALIFIER_IN_REPEAT(qn)) { r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); } else { r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); } + if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ return r; @@ -715,9 +689,9 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) if (NTYPE(qn->target) == N_ANYCHAR) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; } } @@ -750,7 +724,8 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) len = SIZE_OP_JUMP + tlen; } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { len = tlen * qn->lower; len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); } @@ -874,7 +849,8 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) r = compile_tree(qn->target, reg); } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { int n = qn->upper - qn->lower; r = compile_tree_n_times(qn->target, qn->lower, reg); @@ -934,18 +910,16 @@ compile_option_node(EffectNode* node, regex_t* reg) if (r) return r; r = add_opcode(reg, OP_FAIL); if (r) return r; + } - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + + if (IS_DYNAMIC_OPTION(prev ^ node->option)) { if (r) return r; r = add_opcode_option(reg, OP_SET_OPTION, prev); } - else { - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; - } return r; } @@ -992,7 +966,7 @@ compile_length_effect_node(EffectNode* node, regex_t* reg) break; case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; @@ -1082,7 +1056,7 @@ compile_effect_node(EffectNode* node, regex_t* reg) break; case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; @@ -1267,7 +1241,7 @@ compile_length_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_length_string_raw_node(&(NSTRING(node)), reg); else - r = compile_length_string_node(&(NSTRING(node)), reg); + r = compile_length_string_node(node, reg); break; case N_CCLASS: @@ -1365,7 +1339,7 @@ compile_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_string_raw_node(&(NSTRING(node)), reg); else - r = compile_string_node(&(NSTRING(node)), reg); + r = compile_string_node(node, reg); break; case N_CCLASS: @@ -1421,8 +1395,14 @@ compile_tree(Node* node, regex_t* reg) } else { int* p; - add_opcode(reg, (IS_IGNORECASE(reg->options) ? - OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI)); + + if (IS_IGNORECASE(reg->options)) { + add_opcode(reg, OP_BACKREF_MULTI_IC); + } + else { + add_opcode(reg, OP_BACKREF_MULTI); + } + if (r) return r; add_length(reg, br->back_num); if (r) return r; @@ -2053,7 +2033,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) StrNode* sn = &(NSTRING(node)); UChar *s = sn->s; while (s < sn->end) { - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); (*len)++; } } @@ -2144,7 +2124,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) { int found; - if (code >= SINGLE_BYTE_SIZE) { + if (ONIGENC_MBC_MINLEN(enc) > 1 || (code >= SINGLE_BYTE_SIZE)) { if (IS_NULL(cc->mbuf)) { found = 0; } @@ -2309,7 +2289,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) CClassNode* cc = &(NCCLASS(y)); code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, - xs->s + enc_len(reg->enc, c)); + xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); } break; @@ -2320,18 +2300,9 @@ is_not_included(Node* x, Node* y, regex_t* reg) StrNode* ys = &(NSTRING(y)); len = NSTRING_LEN(x); if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); - if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) { - UChar plow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar qlow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int plen, qlen; - for (p = ys->s, q = xs->s; q < xs->end; ) { - plen = ONIGENC_MBC_TO_LOWER(reg->enc, p, plow); - qlen = ONIGENC_MBC_TO_LOWER(reg->enc, q, qlow); - if (plen != qlen || onig_strncmp(plow, qlow, plen) != 0) - return 1; - p += enc_len(reg->enc, *p); - q += enc_len(reg->enc, *q); - } + if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { + /* tiny version */ + return 0; } else { for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { @@ -2388,8 +2359,12 @@ get_head_value_node(Node* node, int exact, regex_t* reg) if (exact != 0 && !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { - if (! ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, sn->s)) +#if 0 + UChar* tmp = sn->s; + if (! ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, + &tmp, sn->end)) n = node; +#endif } else { n = node; @@ -2946,7 +2921,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK); CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); - SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT); swap_node(node, en); NEFFECT(node).target = en; } @@ -2965,9 +2940,114 @@ next_setup(Node* node, Node* next_node, regex_t* reg) return 0; } -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) +static int +divide_ambig_string_node(Node* node, regex_t* reg) +{ + StrNode* sn = &NSTRING(node); + int ambig, prev_ambig; + UChar *prev, *p, *end, *prev_start, *start, *tmp, *wp; + Node *snode; + Node *root = NULL_NODE; + Node **tailp = (Node** )0; + + start = prev_start = p = sn->s; + end = sn->end; + if (p >= end) return 0; + + prev_ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end); + + while (p < end) { + prev = p; + if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, + reg->ambig_flag, &p, end))) { + + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < prev) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, prev); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(root).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = &(NCONS(*tailp).right); + } + + prev_ambig = ambig; + prev_start = prev; + } + } + + if (prev_start == start) { + if (prev_ambig != 0) { + NSTRING_SET_AMBIG(node); + tmp = start; + wp = start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + if (wp != sn->end) NSTRING_SET_AMBIG_REDUCE(node); + sn->end = wp; + } + } + else { + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != end) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, end); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(node).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = &(NCONS(*tailp).right); + } + + swap_node(node, root); + onig_node_str_clear(root); /* should be after swap! */ + onig_node_free(root); /* free original string node */ + } + + return 0; +} + +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) @@ -3005,33 +3085,11 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case N_CCLASS: - if (IS_IGNORECASE(reg->options)) { - int i; - UChar c, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - BitSetRef bs = NCCLASS(node).bs; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - c = (UChar )i; - ONIGENC_MBC_TO_LOWER(reg->enc, &c, lowbuf); - if (*lowbuf != c) { - if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, *lowbuf); - if (BITSET_AT(bs, *lowbuf)) BITSET_SET_BIT(bs, c); - } - } - } break; case N_STRING: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - StrNode* sn = &NSTRING(node); - UChar* p = sn->s; - - while (p < sn->end) { - if (ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p)) { - NSTRING_SET_CASE_AMBIG(node); - break; - } - p += enc_len(reg->enc, *p); - } + r = divide_ambig_string_node(node, reg); } break; @@ -3067,9 +3125,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) Node* target = qn->target; if ((state & IN_REPEAT) != 0) { - qn->state |= NST_IN_REPEAT; + qn->state |= NST_IN_REPEAT; } - + if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { r = get_min_match_length(target, &d, env); if (r) break; @@ -3096,8 +3154,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) } } + state |= IN_REPEAT; if (qn->lower != qn->upper) - state |= IN_REPEAT; + state |= IN_VAR_REPEAT; r = setup_tree(target, reg, state, env); if (r) break; @@ -3154,11 +3213,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case EFFECT_MEMORY: - if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) { + if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) { BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */ } - /* fall */ + r = setup_tree(en->target, reg, state, env); + break; + case EFFECT_STOP_BACKTRACK: { Node* target = en->target; @@ -3169,7 +3230,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) tqn->greedy != 0) { /* (?>a*), a*+ etc... */ int qtype = NTYPE(tqn->target); if (IS_NODE_TYPE_SIMPLE(qtype)) - SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT); } } } @@ -3241,26 +3302,17 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) /* set skip map for Boyer-Moor search */ static int -set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, +set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, UChar skip[], int** int_skip) { int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - skip[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - skip[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + skip[s[i]] = len - 1 - i; } else { if (IS_NULL(*int_skip)) { @@ -3269,16 +3321,8 @@ set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, } for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*int_skip)[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - (*int_skip)[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + (*int_skip)[s[i]] = len - 1 - i; } return 0; } @@ -3291,11 +3335,12 @@ typedef struct { } MinMaxLen; typedef struct { - MinMaxLen mmd; - BitStatusType backrefed_status; - OnigEncoding enc; - OnigOptionType options; - ScanEnv* scan_env; + MinMaxLen mmd; + BitStatusType backrefed_status; + OnigEncoding enc; + OnigOptionType options; + OnigAmbigType ambig_flag; + ScanEnv* scan_env; } OptEnv; typedef struct { @@ -3332,31 +3377,31 @@ typedef struct { OptMapInfo map; /* boundary */ } NodeOptInfo; +static short int ByteValTable[] = { + 14, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1 +}; static int map_position_value(int i) { - static int vals[] = { - 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, - }; - - if (i < sizeof(vals)/sizeof(vals[0])) return vals[i]; - - return 7; /* Take it easy. */ + if (i < sizeof(ByteValTable)/sizeof(ByteValTable[0])) + return (int )ByteValTable[i]; + else + return 4; /* Take it easy. */ } static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ - static int dist_vals[] = { + static short int dist_vals[] = { 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, @@ -3376,7 +3421,7 @@ distance_value(MinMaxLen* mm) d = mm->max - mm->min; if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) /* return dist_vals[d] * 16 / (mm->min + 12); */ - return dist_vals[d]; + return (int )dist_vals[d]; else return 1; } @@ -3432,12 +3477,14 @@ add_mml(MinMaxLen* to, MinMaxLen* from) to->max = distance_add(to->max, from->max); } +#if 0 static void add_len_mml(MinMaxLen* to, OnigDistance len) { to->min = distance_add(to->min, len); to->max = distance_add(to->max, len); } +#endif static void alt_merge_mml(MinMaxLen* to, MinMaxLen* from) @@ -3584,7 +3631,7 @@ concat_opt_exact_info_str(OptExactInfo* to, to->s[i++] = *p++; } else { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (i + len > OPT_EXACT_MAXLEN) break; for (j = 0; j < len; j++) to->s[i++] = *p++; @@ -3611,7 +3658,7 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) for (i = 0; i < to->len && i < add->len; ) { if (to->s[i] != add->s[i]) break; - len = enc_len(env->enc, to->s[i]); + len = enc_len(env->enc, to->s + i); for (j = 1; j < len; j++) { if (to->s[i+j] != add->s[i+j]) break; @@ -3633,12 +3680,24 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) static void select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt) { - int vlen1, vlen2; + int v1, v2; + + v1 = now->len; + v2 = alt->len; - vlen1 = now->len * (now->ignore_case ? 1 : 2); - vlen2 = alt->len * (alt->ignore_case ? 1 : 2); + if (v1 <= 2 && v2 <= 2) { + /* ByteValTable[x] is big value --> low price */ + v2 = map_position_value(now->s[0]); + v1 = map_position_value(alt->s[0]); - if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0) + if (now->len > 1) v1 += 5; + if (alt->len > 1) v2 += 5; + } + + if (now->ignore_case == 0) v1 *= 2; + if (alt->ignore_case == 0) v2 *= 2; + + if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) copy_opt_exact_info(now, alt); } @@ -3661,7 +3720,7 @@ copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) } static void -add_char_opt_map_info(OptMapInfo* map, int c) +add_char_opt_map_info(OptMapInfo* map, UChar c) { if (map->map[c] == 0) { map->map[c] = 1; @@ -3669,26 +3728,48 @@ add_char_opt_map_info(OptMapInfo* map, int c) } } -static void -add_char_amb_opt_map_info(OptMapInfo* map, int c, OnigEncoding enc) +static int +add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, + OnigEncoding enc, OnigAmbigType ambig_flag) { - UChar x, low[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int i, j, n, len; + UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN]; + OnigCodePoint code, ccode; + OnigCompAmbigCodes* ccs; + OnigPairAmbigCodes* pccs; + OnigAmbigType amb; - add_char_opt_map_info(map, c); + add_char_opt_map_info(map, p[0]); + code = ONIGENC_MBC_TO_CODE(enc, p, end); - x = (UChar )c; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if (*low != x) { - add_char_opt_map_info(map, (int )(*low)); - } - else { - int i; - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { - x = (UChar )i; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if ((int )(*low) == c) add_char_opt_map_info(map, i); + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, amb, &pccs); + for (i = 0; i < n; i++) { + if (pccs[i].from == code) { + len = ONIGENC_CODE_TO_MBC(enc, pccs[i].to, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0]); + } + } + + if ((ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + n = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < n; i++) { + if (ccs[i].code == code) { + for (j = 0; j < ccs[i].n; j++) { + ccode = ccs[i].items[j].code[0]; + len = ONIGENC_CODE_TO_MBC(enc, ccode, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0]); + } + break; + } + } } } + return 0; } static void @@ -3894,143 +3975,110 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case N_STRING: { - UChar *p; - int len, plen; StrNode* sn = &(NSTRING(node)); int slen = sn->end - sn->s; int is_raw = NSTRING_IS_RAW(node); - if ((! IS_IGNORECASE(env->options)) || is_raw) { + if (! NSTRING_IS_AMBIG(node)) { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, NSTRING_IS_RAW(node), env->enc); if (slen > 0) { add_char_opt_map_info(&opt->map, *(sn->s)); } + set_mml(&opt->len, slen, slen); } else { - for (p = sn->s; p < sn->end; ) { - len = enc_len(env->enc, *p); - if (len == 1 && ONIGENC_IS_MBC_CASE_AMBIG(env->enc, p)) { - break; - } - p += len; - } + int n, max; - plen = p - sn->s; - if (plen > slen / 5) { - concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc); - concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc); - opt->exm.ignore_case = 1; - if (opt->exm.len == sn->end - p) - opt->exm.reach_end = 1; - - copy_mml(&(opt->exm.mmd), &(opt->exb.mmd)); - add_len_mml(&(opt->exm.mmd), plen); - } - else { - concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, - is_raw, env->enc); - opt->exb.ignore_case = 1; - } + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 1; if (slen > 0) { - if (p == sn->s) - add_char_amb_opt_map_info(&opt->map, *(sn->s), env->enc); - else - add_char_opt_map_info(&opt->map, *(sn->s)); + r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end, + env->enc, env->ambig_flag); + if (r != 0) break; } + + if (NSTRING_IS_AMBIG_REDUCE(node)) { + n = onigenc_strlen(env->enc, sn->s, sn->end); + max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n; + } + else { + max = slen; + } + set_mml(&opt->len, slen, max); } if (opt->exb.len == slen) opt->exb.reach_end = 1; - - set_mml(&opt->len, slen, slen); } break; case N_CCLASS: { - int i, z, len, found, mb_found; + int i, z; CClassNode* cc = &(NCCLASS(node)); /* no need to check ignore case. (setted in setup_tree()) */ - found = mb_found = 0; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = BITSET_AT(cc->bs, i); - if ((z && !cc->not) || (!z && cc->not)) { - found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - if (! ONIGENC_IS_SINGLEBYTE(env->enc)) { - if (! IS_NULL(cc->mbuf) || - (cc->not != 0 && found != 0)) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = ONIGENC_IS_MBC_HEAD(env->enc, i); - if (z) { - mb_found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - } - } + if (IS_NOT_NULL(cc->mbuf) || cc->not != 0) { + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); - if (mb_found) { - len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + set_mml(&opt->len, min, max); } - else if (found) { - len = 1; - set_mml(&opt->len, 1, len); + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !cc->not) || (!z && cc->not)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + set_mml(&opt->len, 1, 1); } } break; case N_CTYPE: { - int c; - int len, min, max; + int i, min, max; - min = ONIGENC_MBC_MAXLEN_DIST(env->enc); - max = 0; + max = ONIGENC_MBC_MAXLEN_DIST(env->enc); -#define IS_WORD_HEAD_BYTE(enc,b) \ - (ONIGENC_IS_MBC_ASCII(&b) ? ONIGENC_IS_CODE_WORD(enc,((OnigCodePoint )b)) \ - : ONIGENC_IS_MBC_HEAD(enc,b)) + if (max == 1) { + min = 1; - switch (NCTYPE(node).type) { - case CTYPE_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } - } - break; + switch (NCTYPE(node).type) { + case CTYPE_NOT_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + break; - case CTYPE_NOT_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } + case CTYPE_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + break; } - break; } - + else { + min = ONIGENC_MBC_MINLEN(env->enc); + } set_mml(&opt->len, min, max); } break; case N_ANYCHAR: { - OnigDistance len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + set_mml(&opt->len, min, max); } break; @@ -4231,36 +4279,20 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) if (e->len == 0) return 0; - reg->exact = onig_strdup(e->s, e->s + e->len); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - - reg->exact_end = reg->exact + e->len; - if (e->ignore_case) { - UChar buf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int len, low_len, i, j, alloc_size; - - alloc_size = e->len; - i = j = 0; - while (i < e->len) { - low_len = ONIGENC_MBC_TO_LOWER(reg->enc, &(e->s[i]), buf); - len = enc_len(reg->enc, e->s[i]); - if (low_len > alloc_size - i) { - reg->exact = xrealloc(reg->exact, alloc_size * 2); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - alloc_size *= 2; - } - - xmemcpy(&(reg->exact[j]), buf, low_len); - i += len; - j += low_len; - } - reg->exact_end = reg->exact + j; + reg->exact = (UChar* )xmalloc(e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + xmemcpy(reg->exact, e->s, e->len); + reg->exact_end = reg->exact + e->len; reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } else { int allow_reverse; + reg->exact = onig_strdup(e->s, e->s + e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + reg->exact_end = reg->exact + e->len; + if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) allow_reverse = 1; else @@ -4268,7 +4300,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0, + r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, reg->map, &(reg->int_map)); if (r) return r; @@ -4328,6 +4360,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.enc = reg->enc; env.options = reg->options; + env.ambig_flag = reg->ambig_flag; env.scan_env = scan_env; clear_mml(&env.mmd); @@ -4482,17 +4515,26 @@ print_optimize_info(FILE* f, regex_t* reg) fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); } else if (reg->optimize & ONIG_OPTIMIZE_MAP) { - int i, n = 0; + int c, i, n = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) if (reg->map[i]) n++; fprintf(f, "map: n=%d\n", n); if (n > 0) { + c = 0; fputc('[', f); - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) - if (reg->map[i] && enc_len(reg->enc, i) == 1 && - ONIGENC_IS_CODE_PRINT(reg->enc, i)) - fputc(i, f); + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { + if (reg->map[i] != 0) { + if (c > 0) fputs(", ", f); + c++; + if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && + ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) + fputc(i, f); + else + fprintf(f, "%d", i); + } + } fprintf(f, "]\n"); } } @@ -4531,7 +4573,7 @@ onig_free(regex_t* reg) xfree(from);\ } while (0) -static void +extern void onig_transfer(regex_t* to, regex_t* from) { THREAD_ATOMIC_START; @@ -4545,7 +4587,7 @@ onig_transfer(regex_t* to, regex_t* from) }\ } while (0) -static void +extern void onig_chain_link_add(regex_t* to, regex_t* add) { THREAD_ATOMIC_START; @@ -4598,7 +4640,8 @@ onig_clone(regex_t** to, regex_t* from) from->state++; /* increment as search counter */ } - r = onig_alloc_init(®, ONIG_OPTION_NONE, from->enc, ONIG_SYNTAX_DEFAULT); + r = onig_alloc_init(®, ONIG_OPTION_NONE, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + from->enc, ONIG_SYNTAX_DEFAULT); if (r != 0) { from->state--; return r; @@ -4829,8 +4872,8 @@ onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, static int onig_inited = 0; extern int -onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, - OnigSyntaxType* syntax) +onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, + OnigEncoding enc, OnigSyntaxType* syntax) { if (! onig_inited) onig_init(); @@ -4863,6 +4906,9 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, (*reg)->used = 0; (*reg)->name_table = (void* )NULL; + (*reg)->ambig_flag = ambig_flag; + (*reg)->ambig_flag &= ONIGENC_SUPPORT_AMBIG_FLAG(enc); + return 0; } @@ -4875,7 +4921,8 @@ onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; - r = onig_alloc_init(reg, option, enc, syntax); + r = onig_alloc_init(reg, option, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + enc, syntax); if (r) return r; r = onig_compile(*reg, pattern, pattern_end, einfo); @@ -4971,7 +5018,7 @@ OnigOpInfoType OnigOpInfo[] = { { OP_BACKREF2, "backref2", ARG_NON }, { OP_BACKREF3, "backref3", ARG_NON }, { OP_BACKREFN, "backrefn", ARG_MEMNUM }, - { OP_BACKREFN_IC, "backrefn-ic", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, { OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL }, { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, @@ -4992,6 +5039,8 @@ OnigOpInfoType OnigOpInfo[] = { { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, + { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, @@ -5058,7 +5107,8 @@ p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) } extern void -onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) +onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, + OnigEncoding enc) { int i, n, arg_type; RelAddrType addr; @@ -5150,7 +5200,9 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) break; case OP_EXACT1_IC: - p_string(f, 1, bp++); + len = enc_len(enc, bp); + p_string(f, len, bp); + bp += len; break; case OP_EXACTN_IC: GET_LENGTH_INC(len, bp); @@ -5196,8 +5248,14 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) fprintf(f, ":%d:%d:%d", n, (int )code, len); break; - case OP_BACKREF_MULTI: + case OP_BACKREFN_IC: + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + fprintf(f, ":%d", mem); + break; + case OP_BACKREF_MULTI_IC: + case OP_BACKREF_MULTI: fputs(" ", f); GET_LENGTH_INC(len, bp); for (i = 0; i < len; i++) { @@ -5265,7 +5323,7 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) else fputs(" ", f); } - onig_print_compiled_byte_code(f, bp, &bp); + onig_print_compiled_byte_code(f, bp, &bp, reg->enc); } fprintf(f, "\n"); @@ -5325,12 +5383,6 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "%0x", bbuf->p[i]); } } -#if 0 - fprintf(f, "\n"); - Indent(f, indent); - for (i = 0; i < SINGLE_BYTE_SIZE; i++) - fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f); -#endif break; case N_CTYPE: @@ -1,53 +1,152 @@ /********************************************************************** - regexec.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako ([email protected]) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" +#ifdef USE_CAPTURE_HISTORY +static void history_tree_free(OnigCaptureTreeNode* node); + static void -region_list_clear(OnigRegion** list) +history_tree_clear(OnigCaptureTreeNode* node) { int i; - if (IS_NOT_NULL(list)) { - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (IS_NOT_NULL(list[i])) { - xfree(list[i]); - list[i] = (OnigRegion* )0; + if (IS_NOT_NULL(node)) { + for (i = 0; i < node->num_childs; i++) { + if (IS_NOT_NULL(node->childs[i])) { + history_tree_free(node->childs[i]); } } + for (i = 0; i < node->allocated; i++) { + node->childs[i] = (OnigCaptureTreeNode* )0; + } + node->num_childs = 0; + node->beg = ONIG_REGION_NOTPOS; + node->end = ONIG_REGION_NOTPOS; + node->group = -1; } } static void -region_list_free(OnigRegion* r) +history_tree_free(OnigCaptureTreeNode* node) { - if (IS_NOT_NULL(r->list)) { - region_list_clear(r->list); - xfree(r->list); - r->list = (OnigRegion** )0; + history_tree_clear(node); + xfree(node); +} + +static void +history_root_free(OnigRegion* r) +{ + if (IS_NOT_NULL(r->history_root)) { + history_tree_free(r->history_root); + r->history_root = (OnigCaptureTreeNode* )0; } } -static OnigRegion** -region_list_new() +static OnigCaptureTreeNode* +history_node_new() { - int i; - OnigRegion** list; + OnigCaptureTreeNode* node; + + node = (OnigCaptureTreeNode* )xmalloc(sizeof(OnigCaptureTreeNode)); + CHECK_NULL_RETURN(node); + node->childs = (OnigCaptureTreeNode** )0; + node->allocated = 0; + node->num_childs = 0; + node->group = -1; + node->beg = ONIG_REGION_NOTPOS; + node->end = ONIG_REGION_NOTPOS; + + return node; +} - list = (OnigRegion** )xmalloc(sizeof(OnigRegion*) - * (ONIG_MAX_CAPTURE_HISTORY_GROUP + 1)); - CHECK_NULL_RETURN(list); - for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - list[i] = (OnigRegion* )0; +static int +history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child) +{ +#define HISTORY_TREE_INIT_ALLOC_SIZE 8 + + if (parent->num_childs >= parent->allocated) { + int n, i; + + if (IS_NULL(parent->childs)) { + n = HISTORY_TREE_INIT_ALLOC_SIZE; + parent->childs = + (OnigCaptureTreeNode** )xmalloc(sizeof(OnigCaptureTreeNode*) * n); + } + else { + n = parent->allocated * 2; + parent->childs = + (OnigCaptureTreeNode** )xrealloc(parent->childs, + sizeof(OnigCaptureTreeNode*) * n); + } + CHECK_NULL_RETURN_VAL(parent->childs, ONIGERR_MEMORY); + for (i = parent->allocated; i < n; i++) { + parent->childs[i] = (OnigCaptureTreeNode* )0; + } + parent->allocated = n; + } + + parent->childs[parent->num_childs] = child; + parent->num_childs++; + return 0; +} + +static OnigCaptureTreeNode* +history_tree_clone(OnigCaptureTreeNode* node) +{ + int i; + OnigCaptureTreeNode *clone, *child; + + clone = history_node_new(); + CHECK_NULL_RETURN(clone); + + clone->beg = node->beg; + clone->end = node->end; + for (i = 0; i < node->num_childs; i++) { + child = history_tree_clone(node->childs[i]); + if (IS_NULL(child)) { + history_tree_free(clone); + return (OnigCaptureTreeNode* )0; + } + history_tree_add_child(clone, child); } - return list; + return clone; } +extern OnigCaptureTreeNode* +onig_get_capture_tree(OnigRegion* region) +{ + return region->history_root; +} +#endif /* USE_CAPTURE_HISTORY */ + extern void onig_region_clear(OnigRegion* region) { @@ -56,7 +155,9 @@ onig_region_clear(OnigRegion* region) for (i = 0; i < region->num_regs; i++) { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } - region_list_clear(region->list); +#ifdef USE_CAPTURE_HISTORY + history_root_free(region); +#endif } extern int @@ -92,88 +193,20 @@ onig_region_resize(OnigRegion* region, int n) region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } - if (IS_NOT_NULL(region->list)) - region_list_clear(region->list); - - return 0; -} - -static int -region_ensure_size(OnigRegion* region, int n) -{ - int i, new_size; - - if (region->allocated >= n) - return 0; - - new_size = region->allocated; - if (new_size == 0) - new_size = ONIG_NREGION; - while (new_size < n) - new_size *= 2; - - if (region->allocated == 0) { - region->beg = (int* )xmalloc(new_size * sizeof(int)); - region->end = (int* )xmalloc(new_size * sizeof(int)); - if (region->beg == 0 || region->end == 0) - return ONIGERR_MEMORY; - - region->allocated = new_size; - } - else if (region->allocated < new_size) { - region->beg = (int* )xrealloc(region->beg, new_size * sizeof(int)); - region->end = (int* )xrealloc(region->end, new_size * sizeof(int)); - if (region->beg == 0 || region->end == 0) - return ONIGERR_MEMORY; - - region->allocated = new_size; - } - - for (i = region->num_regs; i < n; i++) { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } - return 0; -} - -static int -region_list_add_entry(OnigRegion* region, int group, int start, int end) -{ - int r, pos; - OnigRegion** list; - - if (group > ONIG_MAX_CAPTURE_HISTORY_GROUP) - return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; - - if (IS_NULL(region->list)) { - region->list = region_list_new(); - CHECK_NULL_RETURN_VAL(region->list, ONIGERR_MEMORY); - } - - list = region->list; - if (IS_NULL(list[group])) { - list[group] = onig_region_new(); - CHECK_NULL_RETURN_VAL(list[group], ONIGERR_MEMORY); - } - - r = region_ensure_size(list[group], list[group]->num_regs + 1); - if (r != 0) return r; - - pos = list[group]->num_regs; - list[group]->beg[pos] = start; - list[group]->end[pos] = end; - list[group]->num_regs++; - +#ifdef USE_CAPTURE_HISTORY + history_root_free(region); +#endif return 0; } static void onig_region_init(OnigRegion* region) { - region->num_regs = 0; - region->allocated = 0; - region->beg = (int* )0; - region->end = (int* )0; - region->list = (OnigRegion** )0; + region->num_regs = 0; + region->allocated = 0; + region->beg = (int* )0; + region->end = (int* )0; + region->history_root = (OnigCaptureTreeNode* )0; } extern OnigRegion* @@ -195,7 +228,9 @@ onig_region_free(OnigRegion* r, int free_self) if (r->end) xfree(r->end); r->allocated = 0; } - region_list_free(r); +#ifdef USE_CAPTURE_HISTORY + history_root_free(r); +#endif if (free_self) xfree(r); } } @@ -227,28 +262,13 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) } to->num_regs = from->num_regs; - if (IS_NOT_NULL(from->list)) { - if (IS_NULL(to->list)) { - to->list = region_list_new(); - } +#ifdef USE_CAPTURE_HISTORY + history_root_free(to); - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (IS_NOT_NULL(from->list[i])) { - if (IS_NULL(to->list[i])) - to->list[i] = onig_region_new(); - - onig_region_copy(to->list[i], from->list[i]); - } - else { - if (IS_NOT_NULL(to->list[i])) { - xfree(to->list[i]); - to->list[i] = (OnigRegion* )0; - } - } - } + if (IS_NOT_NULL(from->history_root)) { + to->history_root = history_tree_clone(from->history_root); } - else - region_list_free(to); +#endif } @@ -851,24 +871,25 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, }\ } while(0) -#define STRING_CMP_IC(s1,ps2,len) do {\ - if (string_cmp_ic(encode, s1, ps2, len) == 0) \ +#define STRING_CMP_IC(ambig_flag,s1,ps2,len) do {\ + if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \ goto fail; \ } while(0) -static int string_cmp_ic(OnigEncoding enc, +static int string_cmp_ic(OnigEncoding enc, int ambig_flag, UChar* s1, UChar** ps2, int mblen) { - UChar buf1[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar buf2[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar *p1, *p2, *end, *s2; + UChar buf1[ONIGENC_MBC_NORMALIZE_MAXLEN]; + UChar buf2[ONIGENC_MBC_NORMALIZE_MAXLEN]; + UChar *p1, *p2, *end, *s2, *end2; int len1, len2; - s2 = *ps2; - end = s1 + mblen; + s2 = *ps2; + end = s1 + mblen; + end2 = s2 + mblen; while (s1 < end) { - len1 = ONIGENC_MBC_TO_LOWER(enc, s1, buf1); - len2 = ONIGENC_MBC_TO_LOWER(enc, s2, buf2); + len1 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s1, end, buf1); + len2 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s2, end2, buf2); if (len1 != len2) return 0; p1 = buf1; p2 = buf2; @@ -877,9 +898,6 @@ static int string_cmp_ic(OnigEncoding enc, p1++; p2++; } - - s1 += enc_len(enc, *s1); - s2 += enc_len(enc, *s2); } *ps2 = s2; @@ -895,8 +913,8 @@ static int string_cmp_ic(OnigEncoding enc, }\ } while(0) -#define STRING_CMP_VALUE_IC(s1,ps2,len,is_fail) do {\ - if (string_cmp_ic(encode, s1, ps2, len) == 0) \ +#define STRING_CMP_VALUE_IC(ambig_flag,s1,ps2,len,is_fail) do {\ + if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \ is_fail = 1; \ else \ is_fail = 0; \ @@ -911,6 +929,110 @@ static int string_cmp_ic(OnigEncoding enc, #define DATA_ENSURE_CHECK(n) (s + (n) <= end) +#ifdef USE_CAPTURE_HISTORY +static int +make_capture_history_tree(OnigCaptureTreeNode* node, StackType** kp, + StackType* stk_top, UChar* str, regex_t* reg) +{ + int n, r; + OnigCaptureTreeNode* child; + StackType* k = *kp; + + while (k < stk_top) { + if (k->type == STK_MEM_START) { + n = k->u.mem.num; + if (n <= ONIG_MAX_CAPTURE_HISTORY_GROUP && + BIT_STATUS_AT(reg->capture_history, n) != 0) { + child = history_node_new(); + CHECK_NULL_RETURN_VAL(child, ONIGERR_MEMORY); + child->group = n; + child->beg = (int )(k->u.mem.pstr - str); + r = history_tree_add_child(node, child); + if (r != 0) return r; + *kp = (k + 1); + r = make_capture_history_tree(child, kp, stk_top, str, reg); + if (r != 0) return r; + + k = *kp; + child->end = (int )(k->u.mem.pstr - str); + } + } + else if (k->type == STK_MEM_END) { + if (k->u.mem.num == node->group) { + node->end = (int )(k->u.mem.pstr - str); + *kp = k; + return 0; + } + } + k++; + } + + return 1; /* 1: root node ending. */ +} +#endif + +#ifdef RUBY_PLATFORM + +typedef struct { + int state; + regex_t* reg; + MatchArg* msa; + StackType* stk_base; +} TrapEnsureArg; + +static VALUE +trap_ensure(VALUE arg) +{ + TrapEnsureArg* ta = (TrapEnsureArg* )arg; + + if (ta->state == 0) { /* trap_exec() is not normal return */ + ta->reg->state--; + if (! IS_NULL(ta->msa->stack_p) && ta->stk_base != ta->msa->stack_p) + xfree(ta->stk_base); + + MATCH_ARG_FREE(*(ta->msa)); + } + + return Qnil; +} + +static VALUE +trap_exec(VALUE arg) +{ + TrapEnsureArg* ta; + + rb_trap_exec(); + + ta = (TrapEnsureArg* )arg; + ta->state = 1; /* normal return */ + return Qnil; +} + +extern void +onig_exec_trap(regex_t* reg, MatchArg* msa, StackType* stk_base) +{ + VALUE arg; + TrapEnsureArg ta; + + ta.state = 0; + ta.reg = reg; + ta.msa = msa; + ta.stk_base = stk_base; + arg = (VALUE )(&ta); + rb_ensure(trap_exec, arg, trap_ensure, arg); +} + +#define CHECK_INTERRUPT_IN_MATCH_AT do {\ + if (rb_trap_pending) {\ + if (! rb_prohibit_interrupt) {\ + onig_exec_trap(reg, msa, stk_base);\ + }\ + }\ +} while (0) +#else +#define CHECK_INTERRUPT_IN_MATCH_AT +#endif /* RUBY_PLATFORM */ + #ifdef ONIG_DEBUG_STATISTICS #define USE_TIMEOFDAY @@ -955,6 +1077,7 @@ static int MaxStackDepth = 0; } while (0) #ifdef RUBY_PLATFORM + /* * :nodoc: */ @@ -1047,7 +1170,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, RelAddrType addr; OnigOptionType option = reg->options; OnigEncoding encode = reg->enc; - int ignore_case; + OnigAmbigType ambig_flag = reg->ambig_flag; UChar *s, *q, *sbegin; UChar *p = reg->p; char *alloca_base; @@ -1059,7 +1182,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, n = reg->num_repeat + reg->num_mem * 2; STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); - ignore_case = IS_IGNORECASE(option); pop_level = reg->stack_pop_level; num_mem = reg->num_mem; repeat_stk = (StackIndex* )alloca_base; @@ -1092,7 +1214,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, fprintf(stderr, "%4d> \"", (int )(s - str)); bp = buf; for (i = 0, q = s; i < 7 && q < end; i++) { - len = enc_len(encode, *q); + len = enc_len(encode, q); while (len-- > 0) *bp++ = *q++; } if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } @@ -1100,7 +1222,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, *bp = 0; fputs(buf, stderr); for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr); - onig_print_compiled_byte_code(stderr, p, NULL); + onig_print_compiled_byte_code(stderr, p, NULL, encode); fprintf(stderr, "\n"); } #endif @@ -1155,27 +1277,33 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } } +#ifdef USE_CAPTURE_HISTORY if (reg->capture_history != 0) { - UChar *pstart, *pend; - for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { - if (BIT_STATUS_AT(reg->capture_history, i) != 0) { - stkp = stk_base; - do { - STACK_GET_MEM_RANGE(stkp, i, pstart, pend); - if (stkp < stk) { - int r; - r = region_list_add_entry(region, i, - pstart - str, pend - str); - if (r) { - STACK_SAVE; - return r; - } - } - stkp++; - } while (stkp < stk); - } - } - } /* list of captures */ + int r; + OnigCaptureTreeNode* node; + + if (IS_NULL(region->history_root)) { + region->history_root = node = history_node_new(); + CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY); + } + else { + node = region->history_root; + history_tree_clear(node); + } + + node->group = 0; + node->beg = sstart - str; + node->end = s - str; + + stkp = stk_base; + r = make_capture_history_tree(region->history_root, &stkp, + stk, str, reg); + if (r < 0) { + best_len = r; /* error code */ + goto finish; + } + } +#endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_REGION_OPTION } /* else IS_POSIX_REGION() */ #endif @@ -1212,12 +1340,12 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_EXACT1_IC: STAT_OP_IN(OP_EXACT1_IC); { int len; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *q, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; - len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); - DATA_ENSURE(len); + DATA_ENSURE(1); + len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf); + DATA_ENSURE(0); q = lowbuf; - s += enc_len(encode, *s); while (len-- > 0) { if (*p != *q) goto fail; p++; q++; @@ -1296,16 +1424,16 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_EXACTN_IC: STAT_OP_IN(OP_EXACTN_IC); { int len; - UChar *q, *endp, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *q, *endp, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; GET_LENGTH_INC(tlen, p); endp = p + tlen; while (p < endp) { - len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); - DATA_ENSURE(len); sprev = s; - s += enc_len(encode, *s); + DATA_ENSURE(1); + len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf); + DATA_ENSURE(0); q = lowbuf; while (len-- > 0) { if (*p != *q) goto fail; @@ -1409,20 +1537,22 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(1); if (BITSET_AT(((BitSetRef )p), *s) == 0) goto fail; p += SIZE_BITSET; - s += enc_len(encode, *s); /* OP_CCLASS can match mb-code. \D, \S */ + s += enc_len(encode, s); /* OP_CCLASS can match mb-code. \D, \S */ STAT_OP_OUT; break; case OP_CCLASS_MB: STAT_OP_IN(OP_CCLASS_MB); - if (! ONIGENC_IS_MBC_HEAD(encode, *s)) goto fail; + if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; cclass_mb: GET_LENGTH_INC(tlen, p); { OnigCodePoint code; UChar *ss; - int mb_len = enc_len(encode, *s); + int mb_len; + DATA_ENSURE(1); + mb_len = enc_len(encode, s); DATA_ENSURE(mb_len); ss = s; s += mb_len; @@ -1442,7 +1572,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_CCLASS_MIX: STAT_OP_IN(OP_CCLASS_MIX); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + if (ONIGENC_IS_MBC_HEAD(encode, s)) { p += SIZE_BITSET; goto cclass_mb; } @@ -1462,13 +1592,13 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(1); if (BITSET_AT(((BitSetRef )p), *s) != 0) goto fail; p += SIZE_BITSET; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; case OP_CCLASS_MB_NOT: STAT_OP_IN(OP_CCLASS_MB_NOT); - if (! ONIGENC_IS_MBC_HEAD(encode, *s)) { - DATA_ENSURE(1); + DATA_ENSURE(1); + if (! ONIGENC_IS_MBC_HEAD(encode, s)) { s++; GET_LENGTH_INC(tlen, p); p += tlen; @@ -1480,7 +1610,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, { OnigCodePoint code; UChar *ss; - int mb_len = enc_len(encode, *s); + int mb_len = enc_len(encode, s); if (s + mb_len > end) { DATA_ENSURE(1); @@ -1509,7 +1639,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_CCLASS_MIX_NOT: STAT_OP_IN(OP_CCLASS_MIX_NOT); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + if (ONIGENC_IS_MBC_HEAD(encode, s)) { p += SIZE_BITSET; goto cclass_mb_not; } @@ -1526,21 +1656,17 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, break; case OP_ANYCHAR: STAT_OP_IN(OP_ANYCHAR); - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - s += n; - } - else { - DATA_ENSURE(1); - if (ONIG_IS_NEWLINE(*s)) goto fail; - s++; - } + DATA_ENSURE(1); + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + s += n; STAT_OP_OUT; break; case OP_ANYCHAR_ML: STAT_OP_IN(OP_ANYCHAR_ML); - n = enc_len(encode, *s); + DATA_ENSURE(1); + n = enc_len(encode, s); DATA_ENSURE(n); s += n; STAT_OP_OUT; @@ -1549,17 +1675,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_ANYCHAR_STAR: STAT_OP_IN(OP_ANYCHAR_STAR); while (s < end) { STACK_PUSH_ALT(p, s, sprev); - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - if (ONIG_IS_NEWLINE(*s)) goto fail; - sprev = s; - s++; - } + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; } STAT_OP_OUT; break; @@ -1567,7 +1687,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_ANYCHAR_ML_STAR: STAT_OP_IN(OP_ANYCHAR_ML_STAR); while (s < end) { STACK_PUSH_ALT(p, s, sprev); - n = enc_len(encode, *s); + n = enc_len(encode, s); if (n > 1) { DATA_ENSURE(n); sprev = s; @@ -1586,17 +1706,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (*p == *s) { STACK_PUSH_ALT(p + 1, s, sprev); } - n = enc_len(encode, *s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - if (ONIG_IS_NEWLINE(*s)) goto fail; - sprev = s; - s++; - } + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; } p++; STAT_OP_OUT; @@ -1607,7 +1721,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (*p == *s) { STACK_PUSH_ALT(p + 1, s, sprev); } - n = enc_len(encode, *s); + n = enc_len(encode, s); if (n >1) { DATA_ENSURE(n); sprev = s; @@ -1627,7 +1741,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (! ONIGENC_IS_MBC_WORD(encode, s, end)) goto fail; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; @@ -1636,7 +1750,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, if (ONIGENC_IS_MBC_WORD(encode, s, end)) goto fail; - s += enc_len(encode, *s); + s += enc_len(encode, s); STAT_OP_OUT; break; @@ -1719,7 +1833,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STAT_OP_OUT; continue; } - else if (ONIG_IS_NEWLINE(*sprev) && !ON_STR_END(s)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) { STAT_OP_OUT; continue; } @@ -1729,7 +1843,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_END_LINE: STAT_OP_IN(OP_END_LINE); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (IS_NOTEOL(msa->options)) goto fail; STAT_OP_OUT; @@ -1738,7 +1852,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } #endif } - else if (ONIG_IS_NEWLINE(*s)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) { STAT_OP_OUT; continue; } @@ -1748,7 +1862,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_SEMI_END_BUF: STAT_OP_IN(OP_SEMI_END_BUF); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (IS_NOTEOL(msa->options)) goto fail; /* Is it needed? */ STAT_OP_OUT; @@ -1757,7 +1871,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } #endif } - if (ONIG_IS_NEWLINE(*s) && ON_STR_END(s+1)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && + ON_STR_END(s + enc_len(encode, s))) { STAT_OP_OUT; continue; } @@ -1866,7 +1981,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(n); sprev = s; STRING_CMP(pstart, s, n); - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; STAT_OP_OUT; @@ -1897,8 +2012,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP_IC(pstart, &s, n); - while (sprev + (len = enc_len(encode, *sprev)) < s) + STRING_CMP_IC(ambig_flag, pstart, &s, n); + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; STAT_OP_OUT; @@ -1933,7 +2048,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STRING_CMP_VALUE(pstart, swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -1969,10 +2084,10 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(pstart, &swork, n, is_fail); + STRING_CMP_VALUE_IC(ambig_flag, pstart, &swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enc_len(encode, *sprev)) < s) + while (sprev + (len = enc_len(encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -1986,7 +2101,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_SET_OPTION_PUSH: STAT_OP_IN(OP_SET_OPTION_PUSH); GET_OPTION_INC(option, p); - ignore_case = IS_IGNORECASE(option); STACK_PUSH_ALT(p, s, sprev); p += SIZE_OP_SET_OPTION + SIZE_OP_FAIL; STAT_OP_OUT; @@ -1995,7 +2109,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, case OP_SET_OPTION: STAT_OP_IN(OP_SET_OPTION); GET_OPTION_INC(option, p); - ignore_case = IS_IGNORECASE(option); STAT_OP_OUT; continue; break; @@ -2027,6 +2140,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, break; case OP_REPEAT_INC: case OP_REPEAT_INC_NG: + case OP_REPEAT_INC_SG: + case OP_REPEAT_INC_NG_SG: p += SIZE_MEMNUM; break; default: @@ -2093,6 +2208,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, GET_RELADDR_INC(addr, p); p += addr; STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; @@ -2182,13 +2298,14 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, } else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { STACK_PUSH_ALT(p, s, sprev); - p = stkp->u.repeat.pcode; + p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */ } else { p = stkp->u.repeat.pcode; } STACK_PUSH_REPEAT_INC(si); STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; @@ -2206,11 +2323,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, repeat_inc_ng: stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper - || reg->repeat_range[mem].upper < 0 /* IS_REPEAT_INFINITE(upper) */) { + if (stkp->u.repeat.count < reg->repeat_range[mem].upper || + IS_REPEAT_INFINITE(reg->repeat_range[mem].upper)) { if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { UChar* pcode = stkp->u.repeat.pcode; - + STACK_PUSH_REPEAT_INC(si); STACK_PUSH_ALT(pcode, s, sprev); } @@ -2223,6 +2340,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, STACK_PUSH_REPEAT_INC(si); } STAT_OP_OUT; + CHECK_INTERRUPT_IN_MATCH_AT; continue; break; @@ -2233,6 +2351,13 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, goto repeat_inc_ng; break; + case OP_REPEAT_INC_NG_SG: STAT_OP_IN(OP_REPEAT_INC_NG_SG); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + goto repeat_inc_ng; + break; + case OP_PUSH_POS: STAT_OP_IN(OP_PUSH_POS); STACK_PUSH_POS(s, sprev); STAT_OP_OUT; @@ -2390,73 +2515,39 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, if (t == target_end) return s; } - s += enc_len(enc, *s); + s += enc_len(enc, s); } return (UChar* )NULL; } -#if 0 -static int -str_trans_match_after_head_byte(OnigEncoding enc, - int len, UChar* t, UChar* tend, UChar* p) -{ - while (--len > 0) { - if (*t != *p) break; - t++; p++; - } - - if (len == 0) { - int lowlen; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - - while (t < tend) { - len = enc_len(enc, *p); - lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); - q = lowbuf; - while (lowlen > 0) { - if (*t++ != *q++) break; - lowlen--; - } - if (lowlen > 0) break; - p += len; - } - if (t == tend) - return 1; - } - - return 0; -} -#endif - static int -str_lower_case_match(OnigEncoding enc, UChar* t, UChar* tend, UChar* p) +str_lower_case_match(OnigEncoding enc, int ambig_flag, + UChar* t, UChar* tend, UChar* p, UChar* end) { - int len, lowlen; - UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int lowlen; + UChar *q, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; while (t < tend) { - len = enc_len(enc, *p); - lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); + lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &p, end, lowbuf); q = lowbuf; while (lowlen > 0) { if (*t++ != *q++) return 0; lowlen--; } - p += len; } return 1; } static UChar* -slow_search_ic(OnigEncoding enc, +slow_search_ic(OnigEncoding enc, int ambig_flag, UChar* target, UChar* target_end, UChar* text, UChar* text_end, UChar* text_range) { - int len, lowlen; - UChar *t, *p, *s, *end; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int lowlen; + UChar *t, *p, *s, *end, *z; + UChar lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; end = text_end - (target_end - target) + 1; if (end > text_range) @@ -2465,22 +2556,21 @@ slow_search_ic(OnigEncoding enc, s = text; while (s < end) { - len = enc_len(enc, *s); - lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); + z = s; + lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s, text_end, lowbuf); if (*target == *lowbuf) { p = lowbuf + 1; t = target + 1; while (--lowlen > 0) { if (*p != *t) break; - p++; *t++; + p++; t++; } if (lowlen == 0) { - if (str_lower_case_match(enc, t, target_end, s + len)) - return s; + if (str_lower_case_match(enc, ambig_flag, + t, target_end, s, text_end)) + return z; } } - - s += len; } return (UChar* )NULL; @@ -2517,14 +2607,14 @@ slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, } static UChar* -slow_search_backward_ic(OnigEncoding enc, +slow_search_backward_ic(OnigEncoding enc, int ambig_flag, UChar* target,UChar* target_end, UChar* text, UChar* adjust_text, UChar* text_end, UChar* text_start) { int len, lowlen; - UChar *t, *p, *s; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *t, *p, *s, *z; + UChar lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN]; s = text_end - (target_end - target); if (s > text_start) @@ -2533,22 +2623,24 @@ slow_search_backward_ic(OnigEncoding enc, s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); while (s >= text) { - len = enc_len(enc, *s); - lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); + len = enc_len(enc, s); + z = s; + lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s, text_end, lowbuf); if (*target == *lowbuf) { p = lowbuf + 1; t = target + 1; while (--lowlen > 0) { if (*p != *t) break; - p++; *t++; + p++; t++; } if (lowlen == 0) { - if (str_lower_case_match(enc, t, target_end, s + len)) - return s; + if (str_lower_case_match(enc, ambig_flag, + t, target_end, s, text_end)) + return z; } } - s = onigenc_get_prev_char_head(enc, adjust_text, s); + s = onigenc_get_prev_char_head(enc, adjust_text, z); } return (UChar* )NULL; @@ -2562,6 +2654,11 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, UChar *tail; int skip; +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_notrev: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); +#endif + end = text_range + (target_end - target) - 1; if (end > text_end) end = text_end; @@ -2569,7 +2666,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, tail = target_end - 1; s = text; while ((s - text) < target_end - target) { - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } s--; /* set to text check tail position. */ @@ -2587,7 +2684,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, if (p >= text_end) return (UChar* )NULL; t = p; do { - p += enc_len(reg->enc, *p); + p += enc_len(reg->enc, p); } while ((p - t) < skip && p < text_end); s += (p - t); @@ -2607,7 +2704,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, if (p >= text_end) return (UChar* )NULL; t = p; do { - p += enc_len(reg->enc, *p); + p += enc_len(reg->enc, p); } while ((p - t) < skip && p < text_end); s += (p - t); @@ -2655,11 +2752,10 @@ bm_search(regex_t* reg, UChar* target, UChar* target_end, } static int -set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, - int ignore_case, int** skip) +set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, int** skip) + { int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; if (IS_NULL(*skip)) { *skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); @@ -2670,16 +2766,9 @@ set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*skip)[i] = len; - if (ignore_case) { - for (i = len - 1; i > 0; i--) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*skip)[*lowbuf] = i; - } - } - else { - for (i = len - 1; i > 0; i--) - (*skip)[s[i]] = i; - } + for (i = len - 1; i > 0; i--) + (*skip)[s[i]] = i; + return 0; } @@ -2719,7 +2808,7 @@ map_search(OnigEncoding enc, UChar map[], UChar* text, UChar* text_range) while (s < text_range) { if (map[*s]) return s; - s += enc_len(enc, *s); + s += enc_len(enc, s); } return (UChar* )NULL; } @@ -2746,6 +2835,23 @@ onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region, UChar *prev; MatchArg msa; + if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { + reg->state++; /* increment as search counter */ + if (IS_NOT_NULL(reg->chain)) { + onig_chain_reduce(reg); + reg->state++; + } + } + else { + int n = 0; + while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { + if (++n > THREAD_PASS_LIMIT_COUNT) + return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; + THREAD_PASS; + } + reg->state++; /* increment as search counter */ + } + MATCH_ARG_INIT(msa, option, region, at); if (region @@ -2762,7 +2868,9 @@ onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region, prev = onigenc_get_prev_char_head(reg->enc, str, at); r = match_at(reg, str, end, at, prev, &msa); } + MATCH_ARG_FREE(msa); + reg->state--; /* decrement as search counter */ return r; } @@ -2784,7 +2892,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, } else { UChar *q = p + reg->dmin; - while (p < q) p += enc_len(reg->enc, *p); + while (p < q) p += enc_len(reg->enc, p); } } @@ -2794,7 +2902,8 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_IC: - p = slow_search_ic(reg->enc, reg->exact, reg->exact_end, p, end, range); + p = slow_search_ic(reg->enc, reg->ambig_flag, + reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM: @@ -2814,7 +2923,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (p - reg->dmin < s) { retry_gate: pprev = p; - p += enc_len(reg->enc, *p); + p += enc_len(reg->enc, p); goto retry; } @@ -2826,7 +2935,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); - if (!ONIG_IS_NEWLINE(*prev)) + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } break; @@ -2835,10 +2944,10 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (ON_STR_END(p)) { prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); - if (prev && ONIG_IS_NEWLINE(*prev)) + if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } - else if (!ONIG_IS_NEWLINE(*p)) + else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) goto retry_gate; break; } @@ -2886,7 +2995,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, } static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc, - int ignore_case, int** skip)); + int** skip)); #define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 @@ -2909,8 +3018,9 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, break; case ONIG_OPTIMIZE_EXACT_IC: - p = slow_search_backward_ic(reg->enc, reg->exact, - reg->exact_end, range, adjrange, end, p); + p = slow_search_backward_ic(reg->enc, reg->ambig_flag, + reg->exact, reg->exact_end, + range, adjrange, end, p); break; case ONIG_OPTIMIZE_EXACT_BM: @@ -2919,7 +3029,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD) goto exact_method; - r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, 0, + r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, &(reg->int_map_backward)); if (r) return r; } @@ -2940,7 +3050,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, case ANCHOR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); - if (!ONIG_IS_NEWLINE(*prev)) { + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { p = prev; goto retry; } @@ -2951,12 +3061,12 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, if (ON_STR_END(p)) { prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(prev)) goto fail; - if (ONIG_IS_NEWLINE(*prev)) { + if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { p = prev; goto retry; } } - else if (!ONIG_IS_NEWLINE(*p)) { + else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) { p = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(p)) goto fail; goto retry; @@ -3096,8 +3206,10 @@ onig_search(regex_t* reg, UChar* str, UChar* end, } } else if (reg->anchor & ANCHOR_SEMI_END_BUF) { - if (ONIG_IS_NEWLINE(end[-1])) { - semi_end = end - 1; + UChar* pre_end = ONIGENC_STEP_BACK(reg->enc, start, end, 1); + + if (ONIGENC_IS_MBC_NEWLINE(reg->enc, pre_end, end)) { + semi_end = pre_end; if (semi_end > str && start <= semi_end) { goto end_buf; } @@ -3167,13 +3279,14 @@ onig_search(regex_t* reg, UChar* str, UChar* end, while (s <= high) { MATCH_AND_RETURN_CHECK; prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { if (IS_NOT_NULL(prev)) { - while (!ONIG_IS_NEWLINE(*prev) && s < range) { + while (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end) && + s < range) { prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } } } @@ -3190,14 +3303,18 @@ onig_search(regex_t* reg, UChar* str, UChar* end, do { MATCH_AND_RETURN_CHECK; prev = s; - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); } while (s <= range); /* exec s == range, because empty match with /$/. */ } else { /* backward search */ if (reg->optimize != ONIG_OPTIMIZE_NONE) { UChar *low, *high, *adjrange, *sch_start; - adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + if (range < end) + adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + else + adjrange = end; + if (reg->dmax != ONIG_INFINITE_DISTANCE && (end - range) >= reg->threshold_len) { do { @@ -3296,8 +3413,44 @@ onig_get_options(regex_t* reg) return reg->options; } +extern OnigAmbigType +onig_get_ambig_flag(regex_t* reg) +{ + return reg->ambig_flag; +} + extern OnigSyntaxType* onig_get_syntax(regex_t* reg) { return reg->syntax; } + +extern int +onig_number_of_captures(regex_t* reg) +{ + return reg->num_mem; +} + +extern int +onig_number_of_capture_histories(regex_t* reg) +{ +#ifdef USE_CAPTURE_HISTORY + int i, n; + + n = 0; + for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + if (BIT_STATUS_AT(reg->capture_history, i) != 0) + n++; + } + return n; +#else + return 0; +#endif +} + +extern void +onig_copy_encoding(OnigEncoding to, OnigEncoding from) +{ + *to = *from; +} + @@ -1,12 +1,33 @@ +#ifndef REGINT_H +#define REGINT_H /********************************************************************** - regint.h - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako ([email protected]) - **********************************************************************/ -#ifndef REGINT_H -#define REGINT_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ /* for debug */ /* #define ONIG_DEBUG_PARSE_TREE */ @@ -19,7 +40,8 @@ /* #define ONIG_DEBUG_STATISTICS */ #if defined(ONIG_DEBUG_PARSE_TREE) || defined(ONIG_DEBUG_MATCH) || \ - defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_STATISTICS) + defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ + defined(ONIG_DEBUG_STATISTICS) #ifndef ONIG_DEBUG #define ONIG_DEBUG #endif @@ -36,7 +58,6 @@ /* spec. config */ #define USE_NAMED_GROUP #define USE_SUBEXP_CALL -#define USE_FOLD_MATCH /* ess-tsett etc... */ #define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR @@ -51,12 +72,14 @@ /* interface to external system */ #ifdef NOT_RUBY /* gived from Makefile */ #include "config.h" +#define USE_CAPTURE_HISTORY #define USE_VARIABLE_META_CHARS #define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ #define USE_POSIX_REGION_OPTION /* needed for POSIX API support */ #define THREAD_ATOMIC_START /* depend on thread system */ #define THREAD_ATOMIC_END /* depend on thread system */ #define THREAD_PASS /* depend on thread system */ +#define CHECK_INTERRUPT /* depend on application */ #define xmalloc malloc #define xrealloc realloc #define xfree free @@ -67,6 +90,14 @@ #define THREAD_ATOMIC_START DEFER_INTS #define THREAD_ATOMIC_END ENABLE_INTS #define THREAD_PASS rb_thread_schedule() +#define CHECK_INTERRUPT do {\ + if (rb_trap_pending) {\ + if (! rb_prohibit_interrupt) {\ + rb_trap_exec();\ + }\ + }\ +} while (0) + #define DEFAULT_WARN_FUNCTION rb_warn #define DEFAULT_VERB_WARN_FUNCTION rb_warning @@ -108,7 +139,9 @@ #endif #include <ctype.h> +#ifndef __BORLANDC__ #include <sys/types.h> +#endif #ifdef ONIG_DEBUG # include <stdio.h> @@ -291,6 +324,8 @@ typedef unsigned int BitStatusType; /* ignore-case and multibyte status are included in compiled code. */ #define IS_DYNAMIC_OPTION(option) 0 +#define REPEAT_INFINITE -1 +#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) /* bitset */ #define BITS_PER_BYTE 8 @@ -530,11 +565,11 @@ enum OpCode { #define ARG_MEMNUM 4 #define ARG_OPTION 5 -typedef short int RelAddrType; -typedef short int AbsAddrType; -typedef short int LengthType; -typedef short int MemNumType; -typedef int RepeatNumType; +typedef int RelAddrType; +typedef int AbsAddrType; +typedef int LengthType; +typedef int RepeatNumType; +typedef short int MemNumType; #define SIZE_OPCODE 1 #define SIZE_RELADDR sizeof(RelAddrType) @@ -575,6 +610,7 @@ typedef int RepeatNumType; option = *((OnigOptionType* )(p));\ (p) += SIZE_OPTION;\ } while(0) + #else #define GET_RELADDR_INC(addr,p) GET_SHORT_INC(addr,p) @@ -637,23 +673,37 @@ typedef int RepeatNumType; #define SIZE_OP_RETURN SIZE_OPCODE -typedef struct { - UChar esc; - UChar anychar; - UChar anytime; - UChar zero_or_one_time; - UChar one_or_more_time; - UChar anychar_anytime; -} OnigMetaCharTableType; - -extern OnigMetaCharTableType OnigMetaCharTable; - -#define MC_ESC OnigMetaCharTable.esc -#define MC_ANYCHAR OnigMetaCharTable.anychar -#define MC_ANYTIME OnigMetaCharTable.anytime -#define MC_ZERO_OR_ONE_TIME OnigMetaCharTable.zero_or_one_time -#define MC_ONE_OR_MORE_TIME OnigMetaCharTable.one_or_more_time -#define MC_ANYCHAR_ANYTIME OnigMetaCharTable.anychar_anytime +#define MC_ESC(enc) (enc)->meta_char_table.esc +#define MC_ANYCHAR(enc) (enc)->meta_char_table.anychar +#define MC_ANYTIME(enc) (enc)->meta_char_table.anytime +#define MC_ZERO_OR_ONE_TIME(enc) (enc)->meta_char_table.zero_or_one_time +#define MC_ONE_OR_MORE_TIME(enc) (enc)->meta_char_table.one_or_more_time +#define MC_ANYCHAR_ANYTIME(enc) (enc)->meta_char_table.anychar_anytime + +#define SYN_POSIX_COMMON_OP \ + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ + ONIG_SYN_OP_DECIMAL_BACKREF | \ + ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_ASTERISK_ZERO_INF | \ + ONIG_SYN_OP_LINE_ANCHOR | \ + ONIG_SYN_OP_ESC_CONTROL_CHARS ) + +#define SYN_GNU_REGEX_OP \ + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | \ + ONIG_SYN_OP_POSIX_BRACKET | ONIG_SYN_OP_DECIMAL_BACKREF | \ + ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_LPAREN_SUBEXP | \ + ONIG_SYN_OP_VBAR_ALT | \ + ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | \ + ONIG_SYN_OP_QMARK_ZERO_ONE | \ + ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR | ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR | \ + ONIG_SYN_OP_ESC_W_WORD | \ + ONIG_SYN_OP_ESC_B_WORD_BOUND | ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | \ + ONIG_SYN_OP_ESC_S_WHITE_SPACE | ONIG_SYN_OP_ESC_D_DIGIT | \ + ONIG_SYN_OP_LINE_ANCHOR ) + +#define SYN_GNU_REGEX_BV \ + ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | \ + ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | ONIG_SYN_ALLOW_INVALID_INTERVAL | \ + ONIG_SYN_BACKSLASH_ESCAPE_IN_CC | ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) #define SYN_POSIX_COMMON_OP \ ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ @@ -691,7 +741,7 @@ typedef struct { extern OnigOpInfoType OnigOpInfo[]; -extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp)); +extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp, OnigEncoding enc)); #ifdef ONIG_DEBUG_STATISTICS extern void onig_statistics_init P_((void)); @@ -703,9 +753,11 @@ extern char* onig_error_code_to_format P_((int code)); extern void onig_snprintf_with_pattern PV_((char buf[], int bufsize, OnigEncoding enc, char* pat, char* pat_end, char *fmt, ...)); extern UChar* onig_strdup P_((UChar* s, UChar* end)); extern int onig_bbuf_init P_((BBuf* buf, int size)); -extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax)); +extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, OnigEncoding enc, OnigSyntaxType* syntax)); extern int onig_compile P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigErrorInfo* einfo)); extern void onig_chain_reduce P_((regex_t* reg)); +extern void onig_chain_link_add P_((regex_t* to, regex_t* add)); +extern void onig_transfer P_((regex_t* to, regex_t* from)); extern int onig_is_in_code_range P_((UChar* p, OnigCodePoint code)); #endif /* REGINT_H */ diff --git a/regparse.c b/regparse.c index 67bcbec5eb..b75c6951d0 100644 --- a/regparse.c +++ b/regparse.c @@ -1,10 +1,32 @@ /********************************************************************** - regparse.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako ([email protected]) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regparse.h" #define WARN_BUFSIZE 256 @@ -21,12 +43,14 @@ OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | - ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB ) + ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | + ONIG_SYN_OP2_ESC_H_XDIGIT ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | + ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -34,15 +58,6 @@ OnigSyntaxType OnigSyntaxRuby = { OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; -OnigMetaCharTableType OnigMetaCharTable = { - (OnigCodePoint )'\\' /* esc */ - , (OnigCodePoint )0 /* anychar '.' */ - , (OnigCodePoint )0 /* anytime '*' */ - , (OnigCodePoint )0 /* zero or one time '?' */ - , (OnigCodePoint )0 /* one or more time '+' */ - , (OnigCodePoint )0 /* anychar anytime */ -}; - extern void onig_null_warn(char* s) { } #ifdef DEFAULT_WARN_FUNCTION @@ -93,12 +108,15 @@ bbuf_clone(BBuf** rto, BBuf* from) #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) -#define SET_ALL_MULTI_BYTE_RANGE(pbuf) \ - add_code_range_to_buf(pbuf, (OnigCodePoint )0x80, ~((OnigCodePoint )0)) +#define MBCODE_START_POS(enc) \ + (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80) -#define ADD_ALL_MULTI_BYTE_RANGE(code, mbuf) do {\ - if (! ONIGENC_IS_SINGLEBYTE(code)) {\ - r = SET_ALL_MULTI_BYTE_RANGE(&(mbuf));\ +#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ + add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) + +#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ + if (! ONIGENC_IS_SINGLEBYTE(enc)) {\ + r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ if (r) return r;\ }\ } while (0) @@ -217,14 +235,23 @@ onig_strdup(UChar* s, UChar* end) } /* scan pattern methods */ -#define PEND_VALUE -1 - -#define PFETCH(c) do { (c) = *p++; } while (0) -#define PUNFETCH p-- -#define PINC p++ -#define PPEEK (p < end ? *p : PEND_VALUE) -#define PEND (p < end ? 0 : 1) +#define PEND_VALUE 0 + +#define PFETCH_READY UChar* pfetch_prev +#define PEND (p < end ? 0 : 1) +#define PUNFETCH p = pfetch_prev +#define PINC do { \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PFETCH(c) do { \ + c = ONIGENC_MBC_TO_CODE(enc, p, end); \ + pfetch_prev = p; \ + p += ONIGENC_MBC_ENC_LEN(enc, p); \ +} while (0) +#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) +#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) static UChar* k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, @@ -388,12 +415,15 @@ typedef struct { regex_t* reg; void* arg; int ret; + OnigEncoding enc; } INamesArg; static int i_names(UChar* key, NameEntry* e, INamesArg* arg) { - int r = (*(arg->func))(e->name, e->name + strlen(e->name), e->back_num, + int r = (*(arg->func))(e->name, + e->name + onigenc_str_bytelen_null(arg->enc, e->name), + e->back_num, (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), arg->reg, arg->arg); if (r != 0) { @@ -416,6 +446,7 @@ onig_foreach_name(regex_t* reg, narg.func = func; narg.reg = reg; narg.arg = arg; + narg.enc = reg->enc; /* should be pattern encoding. */ st_foreach(t, i_names, (HashDataType )&narg); } return narg.ret; @@ -973,6 +1004,12 @@ node_new_list(Node* left, Node* right) return node; } +extern Node* +onig_node_new_list(Node* left, Node* right) +{ + return node_new_list(left, right); +} + static Node* node_new_alt(Node* left, Node* right) { @@ -1172,6 +1209,20 @@ onig_node_conv_to_str_node(Node* node, int flag) NSTRING(node).end = NSTRING(node).buf; } +extern void +onig_node_str_clear(Node* node) +{ + if (NSTRING(node).capa != 0 && + IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { + xfree(NSTRING(node).s); + } + + NSTRING(node).capa = 0; + NSTRING(node).flag = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; +} + static Node* node_new_str(UChar* s, UChar* end) { @@ -1190,6 +1241,12 @@ node_new_str(UChar* s, UChar* end) return node; } +extern Node* +onig_node_new_str(UChar* s, UChar* end) +{ + return node_new_str(s, end); +} + static Node* node_new_str_raw(UChar* s, UChar* end) { @@ -1205,15 +1262,6 @@ node_new_empty() } static Node* -node_new_str_char(UChar c) -{ - UChar p[1]; - - p[0] = c; - return node_new_str(p, p + 1); -} - -static Node* node_new_str_raw_char(UChar c) { UChar p[1]; @@ -1244,7 +1292,7 @@ static int str_node_can_be_split(StrNode* sn, OnigEncoding enc) { if (sn->end > sn->s) { - return ((enc_len(enc, *(sn->s)) < sn->end - sn->s) ? 1 : 0); + return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0); } return 0; } @@ -1253,8 +1301,9 @@ extern int onig_scan_unsigned_number(UChar** src, UChar* end, OnigEncoding enc) { unsigned int num, val; - int c; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND) { @@ -1279,9 +1328,10 @@ static int scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1306,9 +1356,10 @@ static int scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, OnigEncoding enc) { - int c; + OnigCodePoint c; unsigned int num, val; UChar* p = *src; + PFETCH_READY; num = 0; while (!PEND && maxlen-- != 0) { @@ -1444,15 +1495,15 @@ add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) } static int -not_code_range_buf(BBuf* bbuf, BBuf** pbuf) +not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf) { int r, i, n; - OnigCodePoint pre, from, to, *data; + OnigCodePoint pre, from, *data, to = 0; *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf)) { set_all: - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } data = (OnigCodePoint* )(bbuf->p); @@ -1461,7 +1512,7 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) if (n <= 0) goto set_all; r = 0; - pre = 0x80; + pre = MBCODE_START_POS(enc); for (i = 0; i < n; i++) { from = data[i*2]; to = data[i*2+1]; @@ -1486,7 +1537,8 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf) } while (0) static int -or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) +or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, + BBuf* bbuf2, int not2, BBuf** pbuf) { int r; OnigCodePoint i, n1, *data1; @@ -1495,7 +1547,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) *pbuf = (BBuf* )NULL; if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { if (not1 != 0 || not2 != 0) - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); return 0; } @@ -1505,14 +1557,14 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) if (IS_NULL(bbuf1)) { if (not1 != 0) { - return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); } else { if (not2 == 0) { return bbuf_clone(pbuf, bbuf2); } else { - return not_code_range_buf(bbuf2, pbuf); + return not_code_range_buf(enc, bbuf2, pbuf); } } } @@ -1528,7 +1580,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) r = bbuf_clone(pbuf, bbuf2); } else if (not1 == 0) { /* 1 OR (not 2) */ - r = not_code_range_buf(bbuf2, pbuf); + r = not_code_range_buf(enc, bbuf2, pbuf); } if (r != 0) return r; @@ -1639,6 +1691,29 @@ and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) } static int +clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) +{ + BBuf *tbuf; + int r; + + if (cc->not != 0) { + bitset_invert(cc->bs); + + if (! ONIGENC_IS_SINGLEBYTE(enc)) { + r = not_code_range_buf(enc, cc->mbuf, &tbuf); + if (r != 0) return r; + + bbuf_free(cc->mbuf); + cc->mbuf = tbuf; + } + + cc->not = 0; + } + + return 0; +} + +static int and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) { int r, not1, not2; @@ -1672,13 +1747,13 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) if (! ONIGENC_IS_SINGLEBYTE(enc)) { if (not1 != 0 && not2 != 0) { - r = or_code_range_buf(buf1, 0, buf2, 0, &pbuf); + r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf); } else { r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -1733,10 +1808,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf); } else { - r = or_code_range_buf(buf1, not1, buf2, not2, &pbuf); + r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf); if (r == 0 && not1 != 0) { BBuf *tbuf; - r = not_code_range_buf(pbuf, &tbuf); + r = not_code_range_buf(enc, pbuf, &tbuf); if (r != 0) { bbuf_free(pbuf); return r; @@ -1855,7 +1930,6 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ }; - extern void onig_reduce_nested_qualifier(Node* pnode, Node* cnode) { @@ -1908,8 +1982,9 @@ onig_reduce_nested_qualifier(Node* pnode, Node* cnode) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_BYTE = 1, - TK_RAW_BYTE = 2, + TK_RAW_BYTE = 1, + TK_CHAR, + TK_STRING, TK_CODE_POINT, TK_ANYCHAR, TK_CHAR_TYPE, @@ -1939,6 +2014,7 @@ typedef struct { int base; /* is number: 8, 16 (used in [....]) */ UChar* backp; union { + UChar* s; int c; OnigCodePoint code; int anchor; @@ -1970,8 +2046,11 @@ static int fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; - int c; + int r = 0; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); @@ -2025,12 +2104,13 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) PUNFETCH; up = low; /* {n} : exact n times */ + r = 2; /* fixed */ } if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC) goto invalid; + if (c != MC_ESC(enc)) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -2043,7 +2123,7 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) tok->u.repeat.lower = low; tok->u.repeat.upper = up; *src = p; - return 0; + return r; /* 0: normal {n,m}, 2: fixed {n} */ invalid: if (syn_allow) @@ -2056,8 +2136,11 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) static int fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) { - int c; + int v; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar* p = *src; + PFETCH_READY; if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; @@ -2070,9 +2153,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (c != '-') return ONIGERR_META_CODE_SYNTAX; if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } c = ((c & 0xff) | 0x80); } @@ -2095,9 +2179,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) control: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; PFETCH(c); - if (c == MC_ESC) { - c = fetch_escaped_value(&p, end, env); - if (c < 0) return c; + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; } else if (c == '?') c = 0177; @@ -2129,11 +2214,13 @@ static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { - int r, len, is_num; - int c = 0; - OnigCodePoint code, first_code; + int r, is_num; + OnigCodePoint c = 0; + OnigCodePoint first_code; + OnigEncoding enc = env->enc; UChar *name_end; UChar *p = *src; + PFETCH_READY; name_end = end; r = 0; @@ -2144,23 +2231,20 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) else { first_code = ONIGENC_MBC_TO_CODE(env->enc, p, end); PFETCH(c); + first_code = c; if (c == '>') return ONIGERR_EMPTY_GROUP_NAME; - if (ONIGENC_IS_CODE_DIGIT(env->enc, first_code)) { + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { if (ref == 1) is_num = 1; else { r = ONIGERR_INVALID_GROUP_NAME; } } - else if (! ONIGENC_IS_CODE_WORD(env->enc, first_code)) { + else if (!ONIGENC_IS_CODE_WORD(enc, c)) { r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } - - len = enc_len(env->enc, c); - while (!PEND && len-- > 1) - PFETCH(c); } while (!PEND) { @@ -2169,35 +2253,28 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) PFETCH(c); if (c == '>' || c == ')') break; - len = enc_len(env->enc, c); if (is_num == 1) { - if (len == 1) { - if (! ONIGENC_IS_CODE_DIGIT(env->enc, code)) { - if (!ONIGENC_IS_CODE_WORD(env->enc, code)) - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - else - r = ONIGERR_INVALID_GROUP_NAME; - } - } - else { - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) { + if (!ONIGENC_IS_CODE_WORD(enc, c)) + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + else + r = ONIGERR_INVALID_GROUP_NAME; } } else { - if (! ONIGENC_IS_CODE_WORD(env->enc, code)) { + if (!ONIGENC_IS_CODE_WORD(enc, c)) { r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; } } - - while (!PEND && len-- > 1) - PFETCH(c); } + if (c != '>') { r = ONIGERR_INVALID_GROUP_NAME; name_end = end; } else { - if (ONIGENC_IS_CODE_UPPER(env->enc, first_code)) + if (ONIGENC_IS_CODE_ASCII(first_code) && + ONIGENC_IS_CODE_UPPER(enc, first_code)) r = ONIGERR_INVALID_GROUP_NAME; } @@ -2216,22 +2293,21 @@ static int fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) { int r, len; - int c = 0; - OnigCodePoint code; + OnigCodePoint c = 0; UChar *name_end; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; r = 0; while (!PEND) { name_end = p; - code = ONIGENC_MBC_TO_CODE(env->enc, p, end); - len = enc_len(env->enc, c); - PFETCH(c); - if (len > 1) + if (enc_len(enc, p) > 1) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + PFETCH(c); if (c == '>' || c == ')') break; - if (! ONIGENC_IS_CODE_DIGIT(env->enc, code)) + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; p += (len - 1); @@ -2294,12 +2370,12 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) { if (IS_NOT_NULL(next)) @@ -2325,24 +2401,24 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, while (p < to) { if (in_esc) { in_esc = 0; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); - q = p + enc_len(enc, *p); + q = p + enc_len(enc, p); if (x == s[0]) { for (i = 1; i < n && q < to; i++) { x = ONIGENC_MBC_TO_CODE(enc, q, to); if (x != s[i]) break; - q += enc_len(enc, *q); + q += enc_len(enc, q); } if (i >= n) return 1; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { x = ONIGENC_MBC_TO_CODE(enc, p, to); if (x == bad) return 0; - else if (x == MC_ESC) in_esc = 1; + else if (x == MC_ESC(enc)) in_esc = 1; p = q; } } @@ -2353,10 +2429,13 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int c, num; + int num; + OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; + OnigEncoding enc = env->enc; UChar* prev; UChar* p = *src; + PFETCH_READY; if (PEND) { tok->type = TK_EOT; @@ -2364,7 +2443,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } PFETCH(c); - tok->type = TK_BYTE; + tok->type = TK_CHAR; tok->base = 0; tok->u.c = c; if (c == ']') { @@ -2373,7 +2452,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) else if (c == '-') { tok->type = TK_CC_RANGE; } - else if (c == MC_ESC) { + else if (c == MC_ESC(enc)) { if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) goto end; @@ -2407,17 +2486,27 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_TYPE; tok->u.subtype = CTYPE_NOT_WHITE_SPACE; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; case 'p': case 'P': - if (PPEEK == '{' && + c2 = PPEEK; + if (c2 == '{' && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { - int c2; PFETCH(c2); if (c2 == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); @@ -2432,14 +2521,17 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + c2 = PPEEK; + if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) { PINC; tok->type = TK_CODE_POINT; tok->base = 16; @@ -2451,7 +2543,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2467,12 +2559,12 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; tok->base = 16; tok->u.c = num; } @@ -2483,7 +2575,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, env->enc); + num = scan_unsigned_octal_number(&p, end, 3, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2500,18 +2592,18 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return num; if (tok->u.c != num) { tok->u.c = num; - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; } break; } } else if (c == '[') { - if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && PPEEK == ':') { + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; tok->backp = p; /* point at '[' is readed */ PINC; - if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', - env->enc)) { + if (str_exist_check_with_esc(send, 2, p, end, + (OnigCodePoint )']', enc)) { tok->type = TK_POSIX_BRACKET_OPEN; } else { @@ -2531,7 +2623,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } else if (c == '&') { if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && - !PEND && PPEEK == '&') { + !PEND && (PPEEK_IS('&'))) { PINC; tok->type = TK_CC_AND; } @@ -2545,10 +2637,13 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, c, num; + int r, num; + OnigCodePoint c; + OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; UChar* prev; UChar* p = *src; + PFETCH_READY; start: if (PEND) { @@ -2556,13 +2651,17 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) return tok->type; } - tok->type = TK_BYTE; - tok->base = 0; + tok->type = TK_STRING; + tok->base = 0; + tok->backp = p; + PFETCH(c); - if (c == MC_ESC) { + if (c == MC_ESC(enc)) { if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + tok->backp = p; PFETCH(c); + tok->u.c = c; tok->escaped = 1; switch (c) { @@ -2588,37 +2687,42 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.repeat.lower = 0; tok->u.repeat.upper = 1; greedy_check: - if (!PEND && PPEEK == '?' && + if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { PFETCH(c); tok->u.repeat.greedy = 0; tok->u.repeat.possessive = 0; } - else if (!PEND && PPEEK == '+' && - ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && - tok->type != TK_INTERVAL) || - (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && - tok->type == TK_INTERVAL))) { - PFETCH(c); - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 1; - } else { - tok->u.repeat.greedy = 1; - tok->u.repeat.possessive = 0; + possessive_check: + if (!PEND && PPEEK_IS('+') && + ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && + tok->type != TK_INTERVAL) || + (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && + tok->type == TK_INTERVAL))) { + PFETCH(c); + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 1; + } + else { + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 0; + } } break; case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -2698,6 +2802,18 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.subtype = CTYPE_NOT_DIGIT; break; + case 'h': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_XDIGIT; + break; + + case 'H': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_XDIGIT; + break; + case 'A': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; begin_buf: @@ -2738,14 +2854,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (PEND) break; prev = p; - if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + if (!PEND) { + if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } - if (p > prev + 1 && !PEND && PPEEK == '}') { + if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; tok->u.code = (OnigCodePoint )num; @@ -2756,7 +2874,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2772,12 +2890,12 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; tok->base = 16; tok->u.c = num; } @@ -2787,9 +2905,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, env->enc); - if (num < 0) return ONIGERR_TOO_BIG_NUMBER; - if (num > ONIG_MAX_BACKREF_NUM) return ONIGERR_TOO_BIG_BACKREF_NUMBER; + num = onig_scan_unsigned_number(&p, end, enc); + if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + goto skip_backref; + } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ @@ -2804,7 +2923,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.by_name = 0; break; } - else if (c == '8' || c == '9') { + + skip_backref: + if (c == '8' || c == '9') { /* normal char */ p = prev; PINC; break; @@ -2815,7 +2936,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), env->enc); + num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -2901,16 +3022,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': - if (PPEEK == '{' && + if (PPEEK_IS('{') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { - int c2; - PFETCH(c2); - if (c2 == '^') { + PFETCH(c); + if (c == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); } else @@ -2925,9 +3045,12 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return num; /* set_raw: */ if (tok->u.c != num) { - tok->type = TK_RAW_BYTE; + tok->type = TK_CODE_POINT; tok->u.c = num; } + else { /* string */ + p = tok->backp + enc_len(enc, tok->backp); + } break; } } @@ -2938,15 +3061,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_VARIABLE_META_CHARS if ((c != ONIG_INEFFECTIVE_META_CHAR) && IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { - if (c == MC_ANYCHAR) + if (c == MC_ANYCHAR(enc)) goto any_char; - else if (c == MC_ANYTIME) + else if (c == MC_ANYTIME(enc)) goto anytime; - else if (c == MC_ZERO_OR_ONE_TIME) + else if (c == MC_ZERO_OR_ONE_TIME(enc)) goto zero_or_one_time; - else if (c == MC_ONE_OR_MORE_TIME) + else if (c == MC_ONE_OR_MORE_TIME(enc)) goto one_or_more_time; - else if (c == MC_ANYCHAR_ANYTIME) { + else if (c == MC_ANYCHAR_ANYTIME(enc)) { tok->type = TK_ANYCHAR_ANYTIME; goto out; } @@ -2989,14 +3112,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - tok->backp = p; r = fetch_range_qualifier(&p, end, tok, env); if (r < 0) return r; /* error */ - if (r > 0) { - /* normal char */ - } - else + if (r == 0) goto greedy_check; + else if (r == 2) { /* {n} */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; + goto greedy_check; + } + /* r == 1 : normal char */ break; case '|': @@ -3005,15 +3130,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case '(': - if (PPEEK == '?' && + if (PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; - if (PPEEK == '#') { + if (PPEEK_IS('#')) { PFETCH(c); while (1) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); - if (c == MC_ESC) { + if (c == MC_ESC(enc)) { if (!PEND) PFETCH(c); } else { @@ -3062,7 +3187,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_EXTEND(env->option)) { while (!PEND) { PFETCH(c); - if (ONIG_IS_NEWLINE(c)) + if (ONIGENC_IS_CODE_NEWLINE(enc, c)) break; } goto start; @@ -3076,6 +3201,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; default: + /* string */ break; } } @@ -3086,22 +3212,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } static int -add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, - OnigEncoding enc) +add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, + int nsb, int nmb, + OnigCodePointRange *sbr, OnigCodePointRange *mbr) { - int i, r, nsb, nmb; - OnigCodePointRange *sbr, *mbr; + int i, r; OnigCodePoint j; - r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr); - if (r != 0) return r; - if (not == 0) { for (i = 0; i < nsb; i++) { for (j = sbr[i].from; j <= sbr[i].to; j++) { - BITSET_SET_BIT(cc->bs, j); + BITSET_SET_BIT(cc->bs, j); } } + for (i = 0; i < nmb; i++) { r = add_code_range_to_buf(&(cc->mbuf), mbr[i].from, mbr[i].to); if (r != 0) return r; @@ -3109,19 +3233,23 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, } else { OnigCodePoint prev = 0; - for (i = 0; i < nsb; i++) { - for (j = prev; j < sbr[i].from; j++) { - BITSET_SET_BIT(cc->bs, j); + + if (ONIGENC_MBC_MINLEN(enc) == 1) { + for (i = 0; i < nsb; i++) { + for (j = prev; j < sbr[i].from; j++) { + BITSET_SET_BIT(cc->bs, j); + } + prev = sbr[i].to + 1; } - prev = sbr[i].to + 1; - } - if (prev < 0x7f) { - for (j = prev; j < 0x7f; j++) { - BITSET_SET_BIT(cc->bs, j); + if (prev < 0x7f) { + for (j = prev; j < 0x7f; j++) { + BITSET_SET_BIT(cc->bs, j); + } } + + prev = 0x80; } - prev = 0x80; for (i = 0; i < nmb; i++) { if (prev < mbr[i].from) { r = add_code_range_to_buf(&(cc->mbuf), prev, mbr[i].from - 1); @@ -3135,17 +3263,23 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, } } - return r; + return 0; } static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { int c, r; + int nsb, nmb; + OnigCodePointRange *sbr, *mbr; OnigEncoding enc = env->enc; - if (ONIGENC_CTYPE_SUPPORT_LEVEL(enc) != ONIGENC_CTYPE_SUPPORT_LEVEL_SB) { - r = add_ctype_to_cc_by_list(cc, ctype, not, env->enc); + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr); + if (r == 0) { + return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, + nsb, nmb, sbr, mbr); + } + else if (r != ONIG_NO_SUPPORT_CONFIG) { return r; } @@ -3203,7 +3337,8 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } else { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! ONIGENC_IS_CODE_SB_WORD(enc, c) && ! ONIGENC_IS_MBC_HEAD(enc, c)) + if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) && + ! ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); } } @@ -3247,6 +3382,14 @@ parse_ctype_to_enc_ctype(int pctype, int* not) ctype = ONIGENC_CTYPE_DIGIT; *not = 1; break; + case CTYPE_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 0; + break; + case CTYPE_NOT_XDIGIT: + ctype = ONIGENC_CTYPE_XDIGIT; + *not = 1; + break; default: return ONIGERR_PARSER_BUG; break; @@ -3284,23 +3427,26 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) }; PosixBracketEntryType *pb; - int not, i, c, r; + int not, i, r; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar *p = *src; + PFETCH_READY; - if (PPEEK == '^') { + if (PPEEK_IS('^')) { PINC; not = 1; } else not = 0; - if (end - p < POSIX_BRACKET_NAME_MAX_LEN + 1) + if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2) goto not_posix_bracket; for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (onig_strncmp(p, pb->name, pb->len) == 0) { - p += pb->len; - if (end - p < 2 || *p != ':' || *(p+1) != ']') + if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { + p = onigenc_step(enc, p, end, pb->len); + if (onigenc_with_ascii_strncmp(enc, p, end, ":]", 2) != 0) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, env); @@ -3319,9 +3465,9 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) PINC; if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; } - if (c == ':' && !PEND) { + if (c == ':' && ! PEND) { PINC; - if (!PEND) { + if (! PEND) { PFETCH(c); if (c == ']') return ONIGERR_INVALID_POSIX_BRACKET_TYPE; @@ -3332,7 +3478,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) } static int -property_name_to_ctype(UChar* p, UChar* end) +property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc) { static PosixBracketEntryType PBS[] = { { "Alnum", ONIGENC_CTYPE_ALNUM, 5 }, @@ -3354,9 +3500,10 @@ property_name_to_ctype(UChar* p, UChar* end) PosixBracketEntryType *pb; int len; - len = end - p; + len = onigenc_strlen(enc, p, end); for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { - if (len == pb->len && onig_strncmp(p, pb->name, pb->len) == 0) + if (len == pb->len && + onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) return pb->ctype; } @@ -3367,8 +3514,10 @@ static int fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int ctype; + OnigCodePoint c; + OnigEncoding enc = env->enc; UChar *prev, *start, *p = *src; - int c; + PFETCH_READY; /* 'IsXXXX' => 'XXXX' */ if (!PEND && @@ -3392,7 +3541,7 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) prev = p; PFETCH(c); if (c == '}') { - ctype = property_name_to_ctype(start, prev); + ctype = property_name_to_ctype(start, prev, enc); if (ctype < 0) break; *src = p; @@ -3499,12 +3648,26 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } } else { +#if 0 if (intype == CCV_CODE_POINT && *type == CCV_SB && ONIGENC_IS_CONTINUOUS_SB_MB(env->enc)) { bitset_set_range(cc->bs, (int )*vs, 0x7f); r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )0x80, v); if (r < 0) return r; } +#else + if (intype == CCV_CODE_POINT && *type == CCV_SB) { + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff)); + r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); + if (r < 0) return r; + } +#endif else return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; } @@ -3528,22 +3691,24 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, } static int -char_exist_check(UChar c, UChar* from, UChar* to, int ignore_escaped, +code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, OnigEncoding enc) { int in_esc; + OnigCodePoint code; UChar* p = from; + PFETCH_READY; in_esc = 0; - while (p < to) { + while (! PEND) { if (ignore_escaped && in_esc) { in_esc = 0; } else { - if (*p == c) return 1; - if (*p == MC_ESC) in_esc = 1; + PFETCH(code); + if (code == c) return 1; + if (code == MC_ESC(enc)) in_esc = 1; } - p += enc_len(enc, *p); } return 0; } @@ -3566,7 +3731,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, prev_cc = (CClassNode* )NULL; *np = NULL_NODE; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_BYTE && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -3576,11 +3741,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (r < 0) return r; if (r == TK_CC_CLOSE) { - if (! char_exist_check(']', *src, env->pattern_end, 1, env->enc)) + if (! code_exist_check((OnigCodePoint )']', + *src, env->pattern_end, 1, env->enc)) return ONIGERR_EMPTY_CHAR_CLASS; CC_ESC_WARN(env, "]"); - r = tok->type = TK_BYTE; /* allow []...] */ + r = tok->type = TK_CHAR; /* allow []...] */ } *np = node = node_new_cclass(); @@ -3593,58 +3759,69 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { - case TK_BYTE: - len = enc_len(env->enc, tok->u.c); + case TK_CHAR: + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); if (len > 1) { - PUNFETCH; - v = ONIGENC_MBC_TO_CODE(env->enc, p, end); - p += len; in_type = CCV_CODE_POINT; } else { sb_char: - v = (OnigCodePoint )tok->u.c; in_type = CCV_SB; } + v = (OnigCodePoint )tok->u.c; in_israw = 0; goto val_entry2; break; case TK_RAW_BYTE: - len = enc_len(env->enc, tok->u.c); - if (len > 1 && tok->base != 0) { /* tok->base != 0 : octal or hexadec. */ + /* tok->base != 0 : octal or hexadec. */ + if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - UChar* bufp = buf; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; + UChar* psave = p; int i, base = tok->base; - if (len > ONIGENC_CODE_TO_MBC_MAXLEN) { - bufp = (UChar* )xmalloc(len); - if (IS_NULL(bufp)) { - r = ONIGERR_MEMORY; - goto err; + buf[0] = tok->u.c; + for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + if (r != TK_RAW_BYTE || tok->base != base) { + fetched = 1; + break; } - bufe = bufp + len; + buf[i] = tok->u.c; } - bufp[0] = tok->u.c; - for (i = 1; i < len; i++) { - r = fetch_token_in_cc(tok, &p, end, env); - if (r < 0) goto raw_byte_err; - if (r != TK_RAW_BYTE || tok->base != base) break; - bufp[i] = tok->u.c; + + if (i < ONIGENC_MBC_MINLEN(env->enc)) { + r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; + goto err; } + + len = enc_len(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - raw_byte_err: - if (bufp != buf) xfree(bufp); goto err; } - v = ONIGENC_MBC_TO_CODE(env->enc, bufp, bufe); - if (bufp != buf) xfree(bufp); - in_type = CCV_CODE_POINT; + else if (i > len) { /* fetch back */ + p = psave; + for (i = 1; i < len; i++) { + r = fetch_token_in_cc(tok, &p, end, env); + } + fetched = 0; + } + + if (i == 1) { + v = (OnigCodePoint )buf[0]; + goto raw_single; + } + else { + v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CCV_CODE_POINT; + } } else { v = (OnigCodePoint )tok->u.c; + raw_single: in_type = CCV_SB; } in_israw = 1; @@ -3838,8 +4015,17 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); if (is_empty != 0) BITSET_IS_EMPTY(cc->bs, is_empty); - if (is_empty == 0) - BITSET_SET_BIT(cc->bs, ONIG_NEWLINE); + + if (is_empty == 0) { +#define NEWLINE_CODE 0x0a + + if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { + if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) + BITSET_SET_BIT(cc->bs, NEWLINE_CODE); + else + add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); + } + } } *src = p; return 0; @@ -3858,17 +4044,20 @@ static int parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ScanEnv* env) { + int r, num; + int list_capture; Node *target; OnigOptionType option; - int r, c, num; - int list_capture; + OnigEncoding enc = env->enc; + OnigCodePoint c; UChar* p = *src; + PFETCH_READY; *np = NULL; if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; option = env->option; - if (PPEEK == '?' && + if (PPEEK_IS('?') && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -4016,7 +4205,7 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, else if (c == ':') { OnigOptionType prev = env->option; - env->option = option; + env->option = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; r = parse_subexp(&target, tok, term, &p, end, env); @@ -4072,7 +4261,6 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, return 0; } - static char* PopularQStr[] = { "?", "*", "+", "??", "*?", "+?" }; @@ -4137,7 +4325,7 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, - "nested repeat operator '%s and %s' was replaced with '%s'", + "nested repeat operator %s and %s was replaced with '%s'", PopularQStr[targetq_num], PopularQStr[nestq_num], ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); (*onig_verb_warn)(buf); @@ -4165,74 +4353,59 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) return 0; } -#ifdef USE_FOLD_MATCH static int -make_alt_node_from_fold_info(OnigEncFoldMatchInfo* info, Node** node) +make_compound_alt_node_from_cc(OnigAmbigType ambig_flag, OnigEncoding enc, + CClassNode* cc, Node** root) { - int i; - UChar *s, *end; - Node *root, **ptail, *snode; - - ptail = &root; - for (i = 0; i < info->target_num; i++) { - s = info->target_str[i]; - end = s + info->target_byte_len[i]; - /* ex. - U+00DF match "ss" and "SS, but not match "Ss". - So, string nodes must be raw. - */ - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - } - *ptail = NULL_NODE; - *node = root; - return 0; -} - -static int -make_fold_alt_node_from_cc(OnigEncoding enc, CClassNode* cc, Node** root) -{ - int i, j, flen, len, ncode, n; - UChar *s, *end, buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - OnigCodePoint* codes; - Node **ptail, *snode; - OnigEncFoldMatchInfo* info; + int r, i, j, k, clen, len, ncode, n; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + Node **ptail, *snode = NULL_NODE; + OnigCompAmbigCodes* ccs; + OnigCompAmbigCodeItem* ci; + OnigAmbigType amb; + n = 0; *root = NULL_NODE; ptail = root; - ncode = ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc, &codes); - n = 0; - for (i = 0; i < ncode; i++) { - if (onig_is_code_in_cc(enc, codes[i], cc)) { - len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); - flen = ONIGENC_GET_FOLD_MATCH_INFO(enc, buf, buf + len, &info); - if (flen > 0) { /* fold */ - for (j = 0; j < info->target_num; j++) { - s = info->target_str[j]; - end = s + info->target_byte_len[j]; - if (onig_strncmp(s, buf, enc_len(enc, *s)) == 0) - continue; /* ignore single char. */ - - snode = node_new_str_raw(s, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - - *ptail = node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - ptail = &(NCONS(*ptail).right); - n++; - } + + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; + + ncode = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < ncode; i++) { + if (onig_is_code_in_cc(enc, ccs[i].code, cc)) { + for (j = 0; j < ccs[i].n; j++) { + ci = &(ccs[i].items[j]); + if (ci->len > 1) { /* compound only */ + if (cc->not) clear_not_flag_cclass(cc, enc); + + clen = ci->len; + for (k = 0; k < clen; k++) { + len = ONIGENC_CODE_TO_MBC(enc, ci->code[k], buf); + + if (k == 0) { + snode = node_new_str_raw(buf, buf + len); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + else { + r = onig_node_str_cat(snode, buf, buf + len); + if (r < 0) return r; + } + } + + *ptail = node_new_alt(snode, NULL_NODE); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + ptail = &(NCONS(*ptail).right); + n++; + } + } } } } return n; } -#endif static int parse_exp(Node** np, OnigToken* tok, int term, @@ -4281,76 +4454,22 @@ parse_exp(Node** np, OnigToken* tok, int term, else goto tk_byte; break; - case TK_BYTE: + case TK_STRING: tk_byte: { - *np = node_new_str_char((UChar )tok->u.c); + *np = node_new_str(tok->backp, *src); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); while (1) { - len = enc_len(env->enc, tok->u.c); - if (len > 1) { - r = onig_node_str_cat(*np, *src, *src + len - 1); - if (r < 0) return r; - *src += (len - 1); - } - r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_BYTE) break; + if (r != TK_STRING) break; - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = onig_node_str_cat(*np, tok->backp, *src); if (r < 0) return r; } - fold_entry: -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int flen, ret; - Node *root, **ptail, *work, *snode, *anode; - UChar *p, *pprev; - OnigEncFoldMatchInfo* fold_info; - StrNode* sn = &(NSTRING(*np)); - - ptail = &root; - pprev = sn->s; - for (p = sn->s; p < sn->end; ) { - flen = ONIGENC_GET_FOLD_MATCH_INFO(env->enc, p, sn->end, &fold_info); - if (flen > 0) { /* fold */ - ret = make_alt_node_from_fold_info(fold_info, &anode); - if (ret != 0) return ret; - work = node_new_list(anode, NULL); - CHECK_NULL_RETURN_VAL(work, ONIGERR_MEMORY); - - if (pprev < p) { - snode = node_new_str(pprev, p); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, work); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - else { - *ptail = work; - } - ptail = &(NCONS(work).right); - p += flen; - pprev = p; - } - else - p += enc_len(env->enc, *p); - } - *ptail = NULL_NODE; - if (IS_NOT_NULL(root)) { - if (pprev < sn->end) { - snode = node_new_str(pprev, sn->end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - *ptail = node_new_list(snode, NULL_NODE); - CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); - } - onig_node_free(*np); - *np = root; - } - } -#endif + string_end: targetp = np; goto repeat; } @@ -4359,22 +4478,19 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_RAW_BYTE: tk_raw_byte: { - int expect_len; - *np = node_new_str_raw_char((UChar )tok->u.c); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); - expect_len = enc_len(env->enc, tok->u.c); len = 1; while (1) { r = fetch_token(tok, src, end, env); if (r < 0) return r; if (r != TK_RAW_BYTE) { #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - if (len >= expect_len) { + if (len >= enc_len(env->enc, NSTRING(*np).s)) { NSTRING_CLEAR_RAW(*np); } #endif - goto fold_entry; + goto string_end; } r = node_str_cat_char(*np, (UChar )tok->u.c); @@ -4403,7 +4519,7 @@ parse_exp(Node** np, OnigToken* tok, int term, OnigCodePoint end_op[2]; UChar *qstart, *qend, *nextp; - end_op[0] = (OnigCodePoint )MC_ESC; + end_op[0] = (OnigCodePoint )MC_ESC(env->enc); end_op[1] = (OnigCodePoint )'E'; qstart = *src; qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); @@ -4429,6 +4545,8 @@ parse_exp(Node** np, OnigToken* tok, int term, case CTYPE_NOT_WHITE_SPACE: case CTYPE_DIGIT: case CTYPE_NOT_DIGIT: + case CTYPE_XDIGIT: + case CTYPE_NOT_XDIGIT: { CClassNode* cc; int ctype, not; @@ -4456,27 +4574,65 @@ parse_exp(Node** np, OnigToken* tok, int term, break; case TK_CC_OPEN: - r = parse_char_class(np, tok, src, end, env); - if (r != 0) return r; + { + CClassNode* cc; -#ifdef USE_FOLD_MATCH - if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { - int res; - Node *alt_root, *work; - CClassNode* cc = &(NCCLASS(*np)); - - res = make_fold_alt_node_from_cc(env->enc, cc, &alt_root); - if (res < 0) return res; - if (res > 0) { - work = node_new_alt(*np, alt_root); - if (IS_NULL(work)) { - onig_node_free(alt_root); - return ONIGERR_MEMORY; - } - *np = work; + r = parse_char_class(np, tok, src, end, env); + if (r != 0) return r; + + cc = &(NCCLASS(*np)); + + if (IS_IGNORECASE(env->option)) { + int i, n, in_cc; + OnigPairAmbigCodes* ccs; + BitSetRef bs = cc->bs; + OnigAmbigType amb; + + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & env->ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs); + for (i = 0; i < n; i++) { + in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc); + + if ((in_cc != 0 && cc->not == 0) || (in_cc == 0 && cc->not != 0)) { + if (ONIGENC_MBC_MINLEN(env->enc) > 1 || + ccs[i].from >= SINGLE_BYTE_SIZE) { + /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */ + add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to); + } + else { + if (BITSET_AT(bs, ccs[i].from)) { + /* /(?i:[^A-C])/.match("a") ==> fail. */ + BITSET_SET_BIT(bs, ccs[i].to); + } + if (BITSET_AT(bs, ccs[i].to)) { + BITSET_SET_BIT(bs, ccs[i].from); + } + } + } + } + } + } + + if (IS_IGNORECASE(env->option) && + (env->ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + int res; + Node *alt_root, *work; + + res = make_compound_alt_node_from_cc(env->ambig_flag, env->enc, + cc, &alt_root); + if (res < 0) return res; + if (res > 0) { + work = node_new_alt(*np, alt_root); + if (IS_NULL(work)) { + onig_node_free(alt_root); + return ONIGERR_MEMORY; + } + *np = work; + } } } -#endif break; case TK_ANYCHAR: @@ -4522,7 +4678,6 @@ parse_exp(Node** np, OnigToken* tok, int term, *np = node_new_empty(); } else { - *src = tok->backp; goto tk_byte; } break; @@ -4685,6 +4840,7 @@ onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, scan_env_clear(env); env->option = reg->options; + env->ambig_flag = reg->ambig_flag; env->enc = reg->enc; env->syntax = reg->syntax; env->pattern = pattern; diff --git a/regparse.h b/regparse.h index a4acd92208..5982ec8081 100644 --- a/regparse.h +++ b/regparse.h @@ -1,12 +1,33 @@ +#ifndef REGPARSE_H +#define REGPARSE_H /********************************************************************** - regparse.h - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako ([email protected]) - **********************************************************************/ -#ifndef REGPARSE_H -#define REGPARSE_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #include "regint.h" @@ -43,7 +64,8 @@ #define CTYPE_NOT_WHITE_SPACE (1<<3) #define CTYPE_DIGIT (1<<4) #define CTYPE_NOT_DIGIT (1<<5) - +#define CTYPE_XDIGIT (1<<6) +#define CTYPE_NOT_XDIGIT (1<<7) #define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL) #define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF) @@ -52,23 +74,23 @@ #define EFFECT_OPTION (1<<1) #define EFFECT_STOP_BACKTRACK (1<<2) -#define REPEAT_INFINITE -1 -#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) - #define NODE_STR_MARGIN 16 #define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 7 #define NSTR_RAW (1<<0) /* by backslashed number */ -#define NSTR_CASE_AMBIG (1<<1) - -#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) -#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW -#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW -#define NSTRING_SET_CASE_AMBIG(node) (node)->u.str.flag |= NSTR_CASE_AMBIG -#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) -#define NSTRING_IS_CASE_AMBIG(node) \ - (((node)->u.str.flag & NSTR_CASE_AMBIG) != 0) +#define NSTR_AMBIG (1<<1) +#define NSTR_AMBIG_REDUCE (1<<2) + +#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) +#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW +#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW +#define NSTRING_SET_AMBIG(node) (node)->u.str.flag |= NSTR_AMBIG +#define NSTRING_SET_AMBIG_REDUCE(node) (node)->u.str.flag |= NSTR_AMBIG_REDUCE +#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) +#define NSTRING_IS_AMBIG(node) (((node)->u.str.flag & NSTR_AMBIG) != 0) +#define NSTRING_IS_AMBIG_REDUCE(node) \ + (((node)->u.str.flag & NSTR_AMBIG_REDUCE) != 0) #define BACKREFS_P(br) \ (IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static); @@ -109,20 +131,19 @@ typedef struct { } QualifierNode; /* status bits */ -#define NST_MIN_FIXED (1<<0) -#define NST_MAX_FIXED (1<<1) -#define NST_CLEN_FIXED (1<<2) -#define NST_MARK1 (1<<3) -#define NST_MARK2 (1<<4) -#define NST_MEM_BACKREFED (1<<5) -#define NST_SIMPLE_REPEAT (1<<6) /* for stop backtrack optimization */ - -#define NST_RECURSION (1<<7) -#define NST_CALLED (1<<8) -#define NST_ADDR_FIXED (1<<9) -#define NST_NAMED_GROUP (1<<10) -#define NST_NAME_REF (1<<11) -#define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in match stack. */ +#define NST_MIN_FIXED (1<<0) +#define NST_MAX_FIXED (1<<1) +#define NST_CLEN_FIXED (1<<2) +#define NST_MARK1 (1<<3) +#define NST_MARK2 (1<<4) +#define NST_MEM_BACKREFED (1<<5) +#define NST_STOP_BT_SIMPLE_REPEAT (1<<6) +#define NST_RECURSION (1<<7) +#define NST_CALLED (1<<8) +#define NST_ADDR_FIXED (1<<9) +#define NST_NAMED_GROUP (1<<10) +#define NST_NAME_REF (1<<11) +#define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in stack. */ #define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f) #define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f) @@ -135,7 +156,8 @@ typedef struct { #define IS_EFFECT_MIN_FIXED(en) (((en)->state & NST_MIN_FIXED) != 0) #define IS_EFFECT_MAX_FIXED(en) (((en)->state & NST_MAX_FIXED) != 0) #define IS_EFFECT_CLEN_FIXED(en) (((en)->state & NST_CLEN_FIXED) != 0) -#define IS_EFFECT_SIMPLE_REPEAT(en) (((en)->state & NST_SIMPLE_REPEAT) != 0) +#define IS_EFFECT_STOP_BT_SIMPLE_REPEAT(en) \ + (((en)->state & NST_STOP_BT_SIMPLE_REPEAT) != 0) #define IS_EFFECT_NAMED_GROUP(en) (((en)->state & NST_NAMED_GROUP) != 0) #define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION @@ -227,9 +249,10 @@ typedef struct _Node { (senv)->mem_nodes_dynamic : (senv)->mem_nodes_static) typedef struct { - OnigOptionType option; - OnigEncoding enc; - OnigSyntaxType* syntax; + OnigOptionType option; + OnigAmbigType ambig_flag; + OnigEncoding enc; + OnigSyntaxType* syntax; BitStatusType capture_history; BitStatusType bt_mem_start; BitStatusType bt_mem_end; @@ -267,6 +290,9 @@ extern int onig_node_str_cat P_((Node* node, UChar* s, UChar* end)); extern void onig_node_free P_((Node* node)); extern Node* onig_node_new_effect P_((int type)); extern Node* onig_node_new_anchor P_((int type)); +extern Node* onig_node_new_str P_((UChar* s, UChar* end)); +extern Node* onig_node_new_list P_((Node* left, Node* right)); +extern void onig_node_str_clear P_((Node* node)); extern int onig_free_node_list(); extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_make_tree P_((Node** root, UChar* pattern, UChar* end, regex_t* reg, ScanEnv* env)); |