diff options
-rw-r--r-- | NEWS | 4 | ||||
-rw-r--r-- | include/ruby/onigmo.h | 11 | ||||
-rw-r--r-- | regcomp.c | 99 | ||||
-rw-r--r-- | regenc.c | 4 | ||||
-rw-r--r-- | regexec.c | 157 | ||||
-rw-r--r-- | regint.h | 15 | ||||
-rw-r--r-- | regparse.c | 26 | ||||
-rw-r--r-- | regparse.h | 1 |
8 files changed, 227 insertions, 90 deletions
@@ -20,6 +20,10 @@ with all sufficient information, see the ChangeLog file or Redmine === Core classes updates (outstanding ones only) +* Regexp + * Update Onigmo 6.1.1. + * Support absent operator https://github.com/k-takata/Onigmo/issues/82 + === Stdlib updates (outstanding ones only) === Compatibility issues (excluding feature bug fixes) diff --git a/include/ruby/onigmo.h b/include/ruby/onigmo.h index 228aa77ea5..868372494b 100644 --- a/include/ruby/onigmo.h +++ b/include/ruby/onigmo.h @@ -5,7 +5,7 @@ **********************************************************************/ /*- * Copyright (c) 2002-2009 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> - * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp> + * Copyright (c) 2011-2017 K.Takata <kentkt AT csc DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,8 +38,8 @@ extern "C" { #endif #define ONIGMO_VERSION_MAJOR 6 -#define ONIGMO_VERSION_MINOR 0 -#define ONIGMO_VERSION_TEENY 0 +#define ONIGMO_VERSION_MINOR 1 +#define ONIGMO_VERSION_TEENY 1 #ifndef ONIG_EXTERN # ifdef RUBY_EXTERN @@ -580,7 +580,8 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET (1U<<28) /* (?|...) */ /* NOTIMPL */ #define ONIG_SYN_OP2_QMARK_LPAREN_CONDITION (1U<<29) /* (?(cond)yes...|no...) */ #define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP (1U<<30) /* (?P<name>...), (?P=name), (?P>name) -- Python/PCRE */ -#define ONIG_SYN_OP2_OPTION_JAVA (1U<<31) /* (?idmsux), (?-idmsux) */ /* NOTIMPL */ +#define ONIG_SYN_OP2_QMARK_TILDE_ABSENT (1U<<31) /* (?~...) 
*/ +/* #define ONIG_SYN_OP2_OPTION_JAVA (1U<<xx) */ /* (?idmsux), (?-idmsux) */ /* NOTIMPL */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */ @@ -824,7 +825,7 @@ int onig_new(OnigRegex*, const OnigUChar* pattern, const OnigUChar* pattern_end, ONIG_EXTERN int onig_reg_init(OnigRegex reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, const OnigSyntaxType* syntax); ONIG_EXTERN -int onig_new_without_alloc(OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo); +int onig_new_without_alloc(OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, OnigErrorInfo* einfo); ONIG_EXTERN int onig_new_deluxe(OnigRegex* reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo); ONIG_EXTERN @@ -1286,6 +1286,10 @@ compile_length_enclose_node(EncloseNode* node, regex_t* reg) } break; + case ENCLOSE_ABSENT: + len = SIZE_OP_PUSH_ABSENT_POS + SIZE_OP_ABSENT + tlen + SIZE_OP_ABSENT_END; + break; + default: return ONIGERR_TYPE_BUG; break; @@ -1430,6 +1434,19 @@ compile_enclose_node(EncloseNode* node, regex_t* reg) } break; + case ENCLOSE_ABSENT: + len = compile_length_tree(node->target, reg); + if (len < 0) return len; + + r = add_opcode(reg, OP_PUSH_ABSENT_POS); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_ABSENT, len + SIZE_OP_ABSENT_END); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_ABSENT_END); + break; + default: return ONIGERR_TYPE_BUG; break; @@ -1484,9 +1501,6 @@ compile_anchor_node(AnchorNode* node, regex_t* reg) case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break; case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break; - /* used for implicit anchor optimization: /.*a/ ==> 
/(?:^|\G).*a/ */ - case ANCHOR_ANYCHAR_STAR: r = add_opcode(reg, OP_BEGIN_POS_OR_LINE); break; - case ANCHOR_WORD_BOUND: if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND); else r = add_opcode(reg, OP_WORD_BOUND); @@ -2112,6 +2126,7 @@ quantifiers_memory_node_info(Node* node) case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: + case ENCLOSE_ABSENT: r = quantifiers_memory_node_info(en->target); break; default: @@ -2251,6 +2266,9 @@ get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) case ENCLOSE_CONDITION: r = get_min_match_length(en->target, min, env); break; + + case ENCLOSE_ABSENT: + break; } } break; @@ -2374,6 +2392,9 @@ get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env) case ENCLOSE_CONDITION: r = get_max_match_length(en->target, max, env); break; + + case ENCLOSE_ABSENT: + break; } } break; @@ -2497,6 +2518,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) case ENCLOSE_CONDITION: r = get_char_length_tree1(en->target, reg, len, level); break; + case ENCLOSE_ABSENT: default: break; } @@ -2790,6 +2812,9 @@ get_head_value_node(Node* node, int exact, regex_t* reg) case ENCLOSE_CONDITION: n = get_head_value_node(en->target, exact, reg); break; + + case ENCLOSE_ABSENT: + break; } } break; @@ -3295,7 +3320,7 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) } static int -next_setup(Node* node, Node* next_node, int in_root, regex_t* reg) +next_setup(Node* node, Node* next_node, regex_t* reg) { int type; @@ -3329,32 +3354,10 @@ next_setup(Node* node, Node* next_node, int in_root, regex_t* reg) } } } - -#ifndef ONIG_DONT_OPTIMIZE - if (NTYPE(node) == NT_QTFR && /* the type may be changed by above block */ - in_root && /* qn->lower == 0 && */ - NTYPE(qn->target) == NT_CANY && - ! 
IS_MULTILINE(reg->options)) { - /* implicit anchor: /.*a/ ==> /(?:^|\G).*a/ */ - Node *np; - np = onig_node_new_list(NULL_NODE, NULL_NODE); - CHECK_NULL_RETURN_MEMERR(np); - swap_node(node, np); - NCDR(node) = onig_node_new_list(np, NULL_NODE); - if (IS_NULL(NCDR(node))) { - onig_node_free(np); - return ONIGERR_MEMORY; - } - np = onig_node_new_anchor(ANCHOR_ANYCHAR_STAR); /* (?:^|\G) */ - CHECK_NULL_RETURN_MEMERR(np); - NCAR(node) = np; - } -#endif } } else if (type == NT_ENCLOSE) { EncloseNode* en = NENCLOSE(node); - in_root = 0; if (en->type == ENCLOSE_MEMORY) { node = en->target; goto retry; @@ -3852,9 +3855,8 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env) #define IN_NOT (1<<1) #define IN_REPEAT (1<<2) #define IN_VAR_REPEAT (1<<3) -#define IN_ROOT (1<<4) -#define IN_CALL (1<<5) -#define IN_RECCALL (1<<6) +#define IN_CALL (1<<4) +#define IN_RECCALL (1<<5) /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) @@ -3869,25 +3871,19 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { int type; int r = 0; - int in_root = state & IN_ROOT; - state &= ~IN_ROOT; restart: type = NTYPE(node); switch (type) { case NT_LIST: { Node* prev = NULL_NODE; - int prev_in_root = 0; - state |= in_root; do { r = setup_tree(NCAR(node), reg, state, env); if (IS_NOT_NULL(prev) && r == 0) { - r = next_setup(prev, NCAR(node), prev_in_root, reg); + r = next_setup(prev, NCAR(node), reg); } prev = NCAR(node); - prev_in_root = state & IN_ROOT; - state &= ~IN_ROOT; } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); } break; @@ -4051,7 +4047,6 @@ restart: case ENCLOSE_OPTION: { OnigOptionType options = reg->options; - state |= in_root; reg->options = NENCLOSE(node)->option; r = setup_tree(NENCLOSE(node)->target, reg, state, env); reg->options = options; @@ -4101,6 +4096,10 @@ restart: return ONIGERR_INVALID_BACKREF; r = setup_tree(NENCLOSE(node)->target, reg, state, env); break; + + case ENCLOSE_ABSENT: + r = 
setup_tree(NENCLOSE(node)->target, reg, state, env); + break; } } break; @@ -4195,6 +4194,8 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) @@ -4229,6 +4230,8 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) @@ -4273,6 +4276,8 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) @@ -4307,6 +4312,8 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) @@ -5274,6 +5281,10 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case ENCLOSE_CONDITION: r = optimize_node_left(en->target, opt, env); break; + + case ENCLOSE_ABSENT: + set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); + break; } } break; @@ -5782,7 +5793,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->num_call = 0; #endif - r = setup_tree(root, reg, IN_ROOT, &scan_env); + r = setup_tree(root, reg, 0, &scan_env); if (r != 0) goto err_unset; #ifdef ONIG_DEBUG_PARSE_TREE @@ -5944,7 +5955,7 @@ onig_reg_init(regex_t* reg, OnigOptionType option, extern int onig_new_without_alloc(regex_t* reg, const UChar* pattern, 
const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, - OnigSyntaxType* syntax, OnigErrorInfo* einfo) + const OnigSyntaxType* syntax, OnigErrorInfo* einfo) { int r; @@ -6173,7 +6184,6 @@ OnigOpInfoType OnigOpInfo[] = { { OP_END_LINE, "end-line", ARG_NON }, { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, { OP_BEGIN_POSITION, "begin-position", ARG_NON }, - { OP_BEGIN_POS_OR_LINE, "begin-pos-or-line", ARG_NON }, { OP_BACKREF1, "backref1", ARG_NON }, { OP_BACKREF2, "backref2", ARG_NON }, { OP_BACKREFN, "backrefn", ARG_MEMNUM }, @@ -6215,6 +6225,9 @@ OnigOpInfoType OnigOpInfo[] = { { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL }, { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, + { OP_PUSH_ABSENT_POS, "push-absent-pos", ARG_NON }, + { OP_ABSENT, "absent", ARG_RELADDR }, + { OP_ABSENT_END, "absent-end", ARG_NON }, { OP_CALL, "call", ARG_ABSADDR }, { OP_RETURN, "return", ARG_NON }, { OP_CONDITION, "condition", ARG_SPECIAL }, @@ -6509,7 +6522,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, default: fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n", - *--bp); + bp[-1]); } } fputs("]", f); @@ -6629,7 +6642,6 @@ print_indent_tree(FILE* f, Node* node, int indent) case ANCHOR_END_LINE: fputs("end line", f); break; case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break; case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break; - case ANCHOR_ANYCHAR_STAR: fputs("begin position/line", f); break; case ANCHOR_WORD_BOUND: fputs("word bound", f); break; case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; @@ -6694,6 +6706,9 @@ print_indent_tree(FILE* f, Node* node, int indent) case ENCLOSE_CONDITION: fprintf(f, "condition:%d", NENCLOSE(node)->regnum); break; + case ENCLOSE_ABSENT: + fprintf(f, "absent"); + break; default: break; @@ -54,11 +54,11 @@ onigenc_set_default_encoding(OnigEncoding enc) extern int 
onigenc_mbclen_approximate(const OnigUChar* p,const OnigUChar* e, OnigEncoding enc) { - int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e); + int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, e); if (ONIGENC_MBCLEN_CHARFOUND_P(ret)) return ONIGENC_MBCLEN_CHARFOUND_LEN(ret); else if (ONIGENC_MBCLEN_NEEDMORE_P(ret)) - return (int)(e-p)+ONIGENC_MBCLEN_NEEDMORE_LEN(ret); + return (int )(e - p) + ONIGENC_MBCLEN_NEEDMORE_LEN(ret); return 1; } @@ -403,6 +403,8 @@ onig_region_copy(OnigRegion* to, const OnigRegion* from) #define STK_CALL_FRAME 0x0800 #define STK_RETURN 0x0900 #define STK_VOID 0x0a00 /* for fill a blank */ +#define STK_ABSENT_POS 0x0b00 /* for absent */ +#define STK_ABSENT 0x0c00 /* absent inner loop marker */ /* stack type check mask */ #define STK_MASK_POP_USED 0x00ff @@ -673,7 +675,8 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, #define STACK_PUSH_ALT(pat,s,sprev,keep) STACK_PUSH(STK_ALT,pat,s,sprev,keep) #define STACK_PUSH_POS(s,sprev,keep) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev,keep) #define STACK_PUSH_POS_NOT(pat,s,sprev,keep) STACK_PUSH(STK_POS_NOT,pat,s,sprev,keep) -#define STACK_PUSH_STOP_BT STACK_PUSH_TYPE(STK_STOP_BT) +#define STACK_PUSH_ABSENT STACK_PUSH_TYPE(STK_ABSENT) +#define STACK_PUSH_STOP_BT STACK_PUSH_TYPE(STK_STOP_BT) #define STACK_PUSH_LOOK_BEHIND_NOT(pat,s,sprev,keep) \ STACK_PUSH(STK_LOOK_BEHIND_NOT,pat,s,sprev,keep) @@ -785,6 +788,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, STACK_INC;\ } while(0) +#define STACK_PUSH_ABSENT_POS(start, end) do {\ + STACK_ENSURE(1);\ + stk->type = STK_ABSENT_POS;\ + stk->u.absent_pos.abs_pstr = (start);\ + stk->u.absent_pos.end_pstr = (end);\ + STACK_INC;\ +} while(0) + #ifdef ONIG_DEBUG # define STACK_BASE_CHECK(p, at) \ @@ -885,6 +896,33 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, }\ } while(0) +#define STACK_POP_TIL_ABSENT do {\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk, "STACK_POP_TIL_ABSENT"); \ + if 
(stk->type == STK_ABSENT) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + else if (stk->type == STK_REPEAT_INC) {\ + STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ + }\ + else if (stk->type == STK_MEM_END) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ + }\ +} while(0) + +#define STACK_POP_ABSENT_POS(start, end) do {\ + stk--;\ + STACK_BASE_CHECK(stk, "STACK_POP_ABSENT_POS"); \ + (start) = stk->u.absent_pos.abs_pstr;\ + (end) = stk->u.absent_pos.end_pstr;\ +} while(0) + #define STACK_POS_END(k) do {\ k = stk;\ while (1) {\ @@ -1136,10 +1174,12 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, # define DATA_ENSURE_CHECK1 (s < right_range) # define DATA_ENSURE_CHECK(n) (s + (n) <= right_range) # define DATA_ENSURE(n) if (s + (n) > right_range) goto fail +# define ABSENT_END_POS right_range #else # define DATA_ENSURE_CHECK1 (s < end) # define DATA_ENSURE_CHECK(n) (s + (n) <= end) # define DATA_ENSURE(n) if (s + (n) > end) goto fail +# define ABSENT_END_POS end #endif /* USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE */ @@ -1372,6 +1412,8 @@ stack_type_str(int stack_type) case STK_CALL_FRAME: return "Call "; case STK_RETURN: return "Ret "; case STK_VOID: return "Void "; + case STK_ABSENT_POS: return "AbsPos"; + case STK_ABSENT: return "Absent"; default: return " "; } } @@ -1484,7 +1526,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_END_LINE, &&L_OP_SEMI_END_BUF, &&L_OP_BEGIN_POSITION, - &&L_OP_BEGIN_POS_OR_LINE, /* used for implicit anchor optimization */ &&L_OP_BACKREF1, &&L_OP_BACKREF2, @@ -1552,6 +1593,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ &&L_OP_PUSH_LOOK_BEHIND_NOT, /* (?<!...) start */ &&L_OP_FAIL_LOOK_BEHIND_NOT, /* (?<!...) 
end */ + &&L_OP_PUSH_ABSENT_POS, /* (?~...) start */ + &&L_OP_ABSENT, /* (?~...) start of inner loop */ + &&L_OP_ABSENT_END, /* (?~...) end */ # ifdef USE_SUBEXP_CALL &&L_OP_CALL, /* \g<name> */ @@ -1636,8 +1680,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "match_at: str: %"PRIdPTR" (%p), end: %"PRIdPTR" (%p), start: %"PRIdPTR" (%p), sprev: %"PRIdPTR" (%p)\n", - (intptr_t )str, str, (intptr_t )end, end, (intptr_t )sstart, sstart, (intptr_t )sprev, sprev); + fprintf(stderr, "match_at: str: %"PRIuPTR" (%p), end: %"PRIuPTR" (%p), start: %"PRIuPTR" (%p), sprev: %"PRIuPTR" (%p)\n", + (uintptr_t )str, str, (uintptr_t )end, end, (uintptr_t )sstart, sstart, (uintptr_t )sprev, sprev); fprintf(stderr, "size: %d, start offset: %d\n", (int )(end - str), (int )(sstart - str)); fprintf(stderr, "\n ofs> str stk:type addr:opcode\n"); @@ -2378,7 +2422,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, JUMP; CASE(OP_BEGIN_LINE) MOP_IN(OP_BEGIN_LINE); - op_begin_line: if (ON_STR_BEGIN(s)) { if (IS_NOTBOL(msa->options)) goto fail; MOP_OUT; @@ -2454,13 +2497,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, MOP_OUT; JUMP; - CASE(OP_BEGIN_POS_OR_LINE) MOP_IN(OP_BEGIN_POS_OR_LINE); - if (s != msa->gpos) - goto op_begin_line; - - MOP_OUT; - JUMP; - CASE(OP_MEMORY_START_PUSH) MOP_IN(OP_MEMORY_START_PUSH); GET_MEMNUM_INC(mem, p); STACK_PUSH_MEM_START(mem, s); @@ -2721,8 +2757,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_NULL_CHECK(isnull, mem, s); if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%"PRIdPTR" (%p)\n", - (int )mem, (intptr_t )s, s); + fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%"PRIuPTR" (%p)\n", + (int )mem, (uintptr_t )s, s); #endif null_check_found: /* empty loop founded, skip next instruction */ @@ -2755,8 +2791,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_NULL_CHECK_MEMST(isnull, mem, s, 
reg); if (isnull) { # ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIdPTR" (%p)\n", - (int )mem, (intptr_t )s, s); + fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIuPTR" (%p)\n", + (int )mem, (uintptr_t )s, s); # endif if (isnull == -1) goto fail; goto null_check_found; @@ -2780,8 +2816,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, # endif if (isnull) { # ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%"PRIdPTR" (%p)\n", - (int )mem, (intptr_t )s, s); + fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%"PRIuPTR" (%p)\n", + (int )mem, (uintptr_t )s, s); # endif if (isnull == -1) goto fail; goto null_check_found; @@ -3033,6 +3069,63 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, goto fail; NEXT; + CASE(OP_PUSH_ABSENT_POS) MOP_IN(OP_PUSH_ABSENT_POS); + /* Save the absent-start-pos and the original end-pos. */ + STACK_PUSH_ABSENT_POS(s, ABSENT_END_POS); + MOP_OUT; + JUMP; + + CASE(OP_ABSENT) MOP_IN(OP_ABSENT); + { + const UChar* aend = ABSENT_END_POS; + UChar* absent; + UChar* selfp = p - 1; + + STACK_POP_ABSENT_POS(absent, ABSENT_END_POS); /* Restore end-pos. */ + GET_RELADDR_INC(addr, p); +#ifdef ONIG_DEBUG_MATCH + fprintf(stderr, "ABSENT: s:%p, end:%p, absent:%p, aend:%p\n", s, end, absent, aend); +#endif + if ((absent > aend) && (s > absent)) { + /* An empty match occurred in (?~...) at the start point. + * Never match. */ + STACK_POP; + goto fail; + } + else if ((s >= aend) && (s > absent)) { + if (s > aend) { + /* Only one (or less) character matched in the last iteration. + * This is not a possible point. */ + goto fail; + } + /* All possible points were found. Try matching after (?~...). */ + DATA_ENSURE(0); + p += addr; + } + else { + STACK_PUSH_ALT(p + addr, s, sprev, pkeep); /* Push possible point. */ + n = enclen(encode, s, end); + STACK_PUSH_ABSENT_POS(absent, ABSENT_END_POS); /* Save the original pos. 
*/ + STACK_PUSH_ALT(selfp, s + n, s, pkeep); /* Next iteration. */ + STACK_PUSH_ABSENT; + ABSENT_END_POS = aend; + } + } + MOP_OUT; + JUMP; + + CASE(OP_ABSENT_END) MOP_IN(OP_ABSENT_END); + /* The pattern inside (?~...) was matched. + * Set the end-pos temporary and go to next iteration. */ + if (sprev < ABSENT_END_POS) + ABSENT_END_POS = sprev; +#ifdef ONIG_DEBUG_MATCH + fprintf(stderr, "ABSENT_END: end:%p\n", ABSENT_END_POS); +#endif + STACK_POP_TIL_ABSENT; + goto fail; + NEXT; + #ifdef USE_SUBEXP_CALL CASE(OP_CALL) MOP_IN(OP_CALL); GET_ABSADDR_INC(addr, p); @@ -3270,7 +3363,7 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, # ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_notrev: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", - text, text, text_end, text_end, text_range, text_range); + (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); # endif tail = target_end - 1; @@ -3326,8 +3419,8 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, const UChar *tail; # ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search: text: %"PRIuPTR", text_end: %"PRIuPTR", text_range: %"PRIuPTR"\n", - text, text_end, text_range); + fprintf(stderr, "bm_search: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", + (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); # endif end = text_range + (target_end - target) - 1; @@ -3482,8 +3575,8 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, OnigEncoding enc = reg->enc; # ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev: text: %"PRIdPTR" (%p), text_end: %"PRIdPTR" (%p), text_range: %"PRIdPTR" (%p)\n", - (intptr_t )text, text, (intptr_t )text_end, text_end, (intptr_t )text_range, text_range); + fprintf(stderr, "bm_search_notrev: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" 
(%p)\n", + (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); # endif tail = target_end - 1; @@ -3542,8 +3635,8 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, ptrdiff_t tlen1; # ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search: text: %"PRIuPTR", text_end: %"PRIuPTR", text_range: %"PRIuPTR"\n", - text, text_end, text_range); + fprintf(stderr, "bm_search: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", + (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); # endif tail = target_end - 1; @@ -3595,8 +3688,8 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, int case_fold_flag = reg->case_fold_flag; # ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev_ic: text: %"PRIdPTR" (%p), text_end: %"PRIdPTR" (%p), text_range: %"PRIdPTR" (%p)\n", - (intptr_t )text, text, (intptr_t )text_end, text_end, (intptr_t )text_range, text_range); + fprintf(stderr, "bm_search_notrev_ic: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", + (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); # endif tail = target_end - 1; @@ -3653,8 +3746,8 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, int case_fold_flag = reg->case_fold_flag; # ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_ic: text: %"PRIdPTR" (%p), text_end: %"PRIdPTR" (%p), text_range: %"PRIdPTR" (%p)\n", - (intptr_t )text, text, (intptr_t )text_end, text_end, (intptr_t )text_range, text_range); + fprintf(stderr, "bm_search_ic: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", + (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); # endif tail = target_end - 1; @@ -3814,7 +3907,7 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, #ifdef 
ONIG_DEBUG_SEARCH fprintf(stderr, "forward_search_range: str: %"PRIuPTR" (%p), end: %"PRIuPTR" (%p), s: %"PRIuPTR" (%p), range: %"PRIuPTR" (%p)\n", - (intptr_t )str, str, (intptr_t )end, end, (intptr_t )s, s, (intptr_t )range, range); + (uintptr_t )str, str, (uintptr_t )end, end, (uintptr_t )s, s, (uintptr_t )range, range); #endif p = s; @@ -4068,7 +4161,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "onig_search (entry point): str: %"PRIuPTR" (%p), end: %"PRIuPTR", start: %"PRIuPTR", range: %"PRIuPTR"\n", - (intptr_t )str, str, end - str, start - str, range - str); + (uintptr_t )str, str, end - str, start - str, range - str); #endif if (region) { @@ -4302,8 +4395,6 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { do { - if ((reg->anchor & ANCHOR_BEGIN_POSITION) == 0) - msa.gpos = s; /* move \G position */ MATCH_AND_RETURN_CHECK(orig_range); prev = s; s += enclen(reg->enc, s, end); @@ -202,7 +202,9 @@ #define xmemcpy memcpy #define xmemmove memmove -#if defined(RUBY_MSVCRT_VERSION) && RUBY_MSVCRT_VERSION >= 90 && !defined(__GNUC__) +#if ((defined(RUBY_MSVCRT_VERSION) && RUBY_MSVCRT_VERSION >= 90) \ + || (!defined(RUBY_MSVCRT_VERSION) && defined(_WIN32))) \ + && !defined(__GNUC__) # define xalloca _alloca # define xvsnprintf(buf,size,fmt,args) _vsnprintf_s(buf,size,_TRUNCATE,fmt,args) # define xsnprintf sprintf_s @@ -598,7 +600,6 @@ enum OpCode { OP_END_LINE, OP_SEMI_END_BUF, OP_BEGIN_POSITION, - OP_BEGIN_POS_OR_LINE, /* used for implicit anchor optimization */ OP_BACKREF1, OP_BACKREF2, @@ -643,6 +644,9 @@ enum OpCode { OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ OP_PUSH_LOOK_BEHIND_NOT, /* (?<!...) start */ OP_FAIL_LOOK_BEHIND_NOT, /* (?<!...) end */ + OP_PUSH_ABSENT_POS, /* (?~...) start */ + OP_ABSENT, /* (?~...) start of inner loop */ + OP_ABSENT_END, /* (?~...) 
end */ OP_CALL, /* \g<name> */ OP_RETURN, @@ -730,6 +734,9 @@ typedef void* PointerType; #define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) #define SIZE_OP_RETURN SIZE_OPCODE #define SIZE_OP_CONDITION (SIZE_OPCODE + SIZE_MEMNUM + SIZE_RELADDR) +#define SIZE_OP_PUSH_ABSENT_POS SIZE_OPCODE +#define SIZE_OP_ABSENT (SIZE_OPCODE + SIZE_RELADDR) +#define SIZE_OP_ABSENT_END SIZE_OPCODE #ifdef USE_COMBINATION_EXPLOSION_CHECK # define SIZE_OP_STATE_CHECK (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) @@ -841,6 +848,10 @@ typedef struct _OnigStackType { UChar *pstr; /* string position */ } call_frame; #endif + struct { + UChar *abs_pstr; /* absent start position */ + const UChar *end_pstr; /* end position */ + } absent_pos; } u; } OnigStackType; diff --git a/regparse.c b/regparse.c index 204aa46ce9..a2d2fcf6a7 100644 --- a/regparse.c +++ b/regparse.c @@ -58,7 +58,8 @@ const OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER | ONIG_SYN_OP2_QMARK_LPAREN_CONDITION | ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK | - ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP ) + ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | + ONIG_SYN_OP2_QMARK_TILDE_ABSENT ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | @@ -1024,14 +1025,15 @@ scan_env_add_mem_entry(ScanEnv* env) if (IS_NULL(env->mem_nodes_dynamic)) { alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; p = (Node** )xmalloc(sizeof(Node*) * alloc); + CHECK_NULL_RETURN_MEMERR(p); xmemcpy(p, env->mem_nodes_static, sizeof(Node*) * SCANENV_MEMNODES_SIZE); } else { alloc = env->mem_alloc * 2; p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc); + CHECK_NULL_RETURN_MEMERR(p); } - CHECK_NULL_RETURN_MEMERR(p); for (i = env->num_mem + 1; i < alloc; i++) p[i] = NULL_NODE; @@ -3176,7 +3178,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) PUNFETCH; num = fetch_escaped_value(&p, end, env, &c2); if (num < 0) return num; - if ((OnigCodePoint)tok->u.c != c2) { + if 
((OnigCodePoint )tok->u.c != c2) { tok->u.code = (OnigCodePoint )c2; tok->type = TK_CODE_POINT; } @@ -3780,7 +3782,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) num = fetch_escaped_value(&p, end, env, &c2); if (num < 0) return num; /* set_raw: */ - if ((OnigCodePoint)tok->u.c != c2) { + if ((OnigCodePoint )tok->u.c != c2) { tok->type = TK_CODE_POINT; tok->u.code = (OnigCodePoint )c2; } @@ -4989,6 +4991,14 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, case '>': /* (?>...) stop backtrack */ *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); break; + case '~': /* (?~...) absent operator */ + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT)) { + *np = node_new_enclose(ENCLOSE_ABSENT); + } + else { + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + break; #ifdef USE_NAMED_GROUP case '\'': @@ -5030,7 +5040,9 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, named_group1: list_capture = 0; +# ifdef USE_CAPTURE_HISTORY named_group2: +# endif name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); if (r < 0) return r; @@ -5060,9 +5072,10 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, #endif break; +#ifdef USE_CAPTURE_HISTORY case '@': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { -#ifdef USE_NAMED_GROUP +# ifdef USE_NAMED_GROUP if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { PFETCH(c); @@ -5072,7 +5085,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, } PUNFETCH; } -#endif +# endif *np = node_new_enclose_memory(env->option, 0); CHECK_NULL_RETURN_MEMERR(*np); num = scan_env_add_mem_entry(env); @@ -5087,6 +5100,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, return ONIGERR_UNDEFINED_GROUP_OPTION; } break; +#endif /* USE_CAPTURE_HISTORY */ case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */ if 
(!PEND && diff --git a/regparse.h b/regparse.h index 111a840b84..888ebf4ce6 100644 --- a/regparse.h +++ b/regparse.h @@ -95,6 +95,7 @@ RUBY_SYMBOL_EXPORT_BEGIN #define ENCLOSE_OPTION (1<<1) #define ENCLOSE_STOP_BACKTRACK (1<<2) #define ENCLOSE_CONDITION (1<<3) +#define ENCLOSE_ABSENT (1<<4) #define NODE_STR_MARGIN 16 #define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ |