From d2a5354255bab4d74c710ee8ff21f43998f33095 Mon Sep 17 00:00:00 2001 From: naruse Date: Mon, 15 Sep 2014 16:18:41 +0000 Subject: * reg*.c: Merge Onigmo 5.15.0 38a870960aa7370051a3544 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@47598 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regcomp.c | 127 +++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 71 insertions(+), 56 deletions(-) (limited to 'regcomp.c') diff --git a/regcomp.c b/regcomp.c index 676bee26cc..c1698ea1dc 100644 --- a/regcomp.c +++ b/regcomp.c @@ -330,9 +330,10 @@ static int compile_tree(Node* node, regex_t* reg); (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) static int -select_str_opcode(int mb_len, OnigDistance str_len, int ignore_case) +select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case) { int op; + OnigDistance str_len = (byte_len + mb_len - 1) / mb_len; if (ignore_case) { switch (str_len) { @@ -434,11 +435,11 @@ compile_tree_n_times(Node* node, int n, regex_t* reg) } static int -add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance str_len, +add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len, regex_t* reg ARG_UNUSED, int ignore_case) { int len; - int op = select_str_opcode(mb_len, str_len, ignore_case); + int op = select_str_opcode(mb_len, byte_len, ignore_case); len = SIZE_OPCODE; @@ -446,15 +447,15 @@ add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance str_len, if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; - len += mb_len * (int )str_len; + len += (int )byte_len; return len; } static int -add_compile_string(UChar* s, int mb_len, OnigDistance str_len, +add_compile_string(UChar* s, int mb_len, OnigDistance byte_len, regex_t* reg, int ignore_case) { - int op = select_str_opcode(mb_len, str_len, ignore_case); + int op = select_str_opcode(mb_len, byte_len, ignore_case); add_opcode(reg, op); if (op == OP_EXACTMBN) @@ -462,12 +463,12 @@ add_compile_string(UChar* s, int mb_len, OnigDistance str_len, if (IS_NEED_STR_LEN_OP_EXACT(op)) { if (op == OP_EXACTN_IC) - add_length(reg, mb_len * str_len); + add_length(reg, byte_len); else - add_length(reg, str_len); + add_length(reg, byte_len / mb_len); } - add_bytes(reg, s, mb_len * str_len); + add_bytes(reg, s, byte_len); return 0; } @@ -475,7 +476,7 @@ add_compile_string(UChar* s, int mb_len, OnigDistance str_len, static int compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig; + int rlen, r, len, prev_len, blen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev; StrNode* sn; @@ -489,24 +490,24 @@ compile_length_string_node(Node* node, regex_t* reg) p = prev = sn->s; prev_len = enclen(enc, p, sn->end); p += prev_len; - slen = 1; + blen = prev_len; rlen = 0; for (; p < sn->end; ) { len = enclen(enc, p, sn->end); - if (len == prev_len) { - slen++; + if (len == prev_len || ambig) { + blen += len; } else { - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, blen, reg, ambig); rlen += r; prev = p; - slen = 1; + blen = len; prev_len = len; } p += len; } - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, blen, reg, ambig); rlen += r; return rlen; } @@ -523,7 +524,7 @@ compile_length_string_raw_node(StrNode* sn, regex_t* reg) static int compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig; + int r, len, prev_len, blen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev, *end; StrNode* sn; @@ -538,25 +539,25 @@ compile_string_node(Node* node, regex_t* reg) p = prev = sn->s; prev_len = enclen(enc, p, end); p += prev_len; - slen = 1; + blen = prev_len; for (; p < end; ) { len = enclen(enc, p, end); - if (len == prev_len) { - slen++; + if (len == prev_len || ambig) { + blen += len; } else { - r = add_compile_string(prev, prev_len, slen, reg, ambig); + r = add_compile_string(prev, prev_len, blen, reg, ambig); if (r) return r; prev = p; - slen = 1; + blen = len; prev_len = len; } p += len; } - return add_compile_string(prev, prev_len, slen, reg, ambig); + return add_compile_string(prev, prev_len, blen, reg, ambig); } static int @@ -2591,6 +2592,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) return 0; } else { + if (IS_NOT_NULL(xc->mbuf)) return 0; for (i = 0; i < SINGLE_BYTE_SIZE; i++) { int is_word; if (NCTYPE(y)->ascii_range) @@ -3311,7 +3313,7 @@ next_setup(Node* node, Node* next_node, int in_root, regex_t* reg) qn->next_head_exact = n; } #endif - /* automatic possessivation a*b ==> (?>a*)b */ + /* automatic possessification a*b ==> (?>a*)b */ if (qn->lower <= 1) { int ttype = NTYPE(qn->target); if (IS_NODE_TYPE_SIMPLE(ttype)) { @@ -3432,27 +3434,40 @@ expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, return 0; } +static int +is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[], + int slen) +{ + int i; + + for (i = 0; i < item_num; i++) { + if (items[i].byte_len != slen) { + return 1; + } + if (items[i].code_len != 1) { + return 1; + } + } + return 0; +} + static int expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p, int slen, UChar *end, regex_t* reg, Node **rnode) { - int r, i, j, len, varlen, varclen; + int r, i, j, len, varlen; Node *anode, *var_anode, *snode, *xnode, *an; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; *rnode = var_anode = NULL_NODE; varlen = 0; - varclen = 0; for (i = 0; i < item_num; i++) { if (items[i].byte_len != slen) { varlen = 1; break; } - if (items[i].code_len != 1) { - varclen |= 1; - } } if (varlen != 0) { @@ -3537,8 +3552,6 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], } } - if (varclen && !varlen) - return 2; return varlen; mem_err2: @@ -3582,7 +3595,8 @@ expand_case_fold_string(Node* node, regex_t* reg) len = enclen(reg->enc, p, end); - if (n == 0) { + varlen = is_case_fold_variable_len(n, items, len); + if (n == 0 || varlen == 0) { if (IS_NULL(snode)) { if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { top_root = root = onig_node_list_add(NULL_NODE, prev_node); @@ -3607,11 +3621,14 @@ expand_case_fold_string(Node* node, regex_t* reg) } else { alt_num *= (n + 1); - if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) { - varlen = 1; /* Assume that expanded strings are variable length. */ - break; - } + if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; + if (IS_NOT_NULL(snode)) { + r = update_string_node_case_fold(reg, snode); + if (r == 0) { + NSTRING_SET_AMBIG(snode); + } + } if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { top_root = root = onig_node_list_add(NULL_NODE, prev_node); if (IS_NULL(root)) { @@ -3622,7 +3639,6 @@ expand_case_fold_string(Node* node, regex_t* reg) r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); if (r < 0) goto mem_err; - if (r > 0) varlen = 1; if (r == 1) { if (IS_NULL(root)) { top_root = prev_node; @@ -3636,7 +3652,7 @@ expand_case_fold_string(Node* node, regex_t* reg) root = NCAR(prev_node); } - else { /* r == 0 || r == 2 */ + else { /* r == 0 */ if (IS_NOT_NULL(root)) { if (IS_NULL(onig_node_list_add(root, prev_node))) { onig_node_free(prev_node); @@ -3650,6 +3666,12 @@ expand_case_fold_string(Node* node, regex_t* reg) p += len; } + if (IS_NOT_NULL(snode)) { + r = update_string_node_case_fold(reg, snode); + if (r == 0) { + NSTRING_SET_AMBIG(snode); + } + } if (p < end) { Node *srem; @@ -3679,20 +3701,9 @@ expand_case_fold_string(Node* node, regex_t* reg) /* ending */ top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node); - if (!varlen) { - /* When all expanded strings are same length, case-insensitive - BM search will be used. */ - r = update_string_node_case_fold(reg, node); - if (r == 0) { - NSTRING_SET_AMBIG(node); - } - } - else { - swap_node(node, top_root); - r = 0; - } + swap_node(node, top_root); onig_node_free(top_root); - return r; + return 0; mem_err: r = ONIGERR_MEMORY; @@ -4367,7 +4378,7 @@ map_position_value(OnigEncoding enc, int i) 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1 }; - if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) { + if (i < numberof(ByteValTable)) { if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1) return 20; else @@ -4399,7 +4410,7 @@ distance_value(MinMaxLen* mm) if (mm->max == ONIG_INFINITE_DISTANCE) return 0; d = mm->max - mm->min; - if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) + if (d < numberof(dist_vals)) /* return dist_vals[d] * 16 / (mm->min + 12); */ return (int )dist_vals[d]; else @@ -4507,6 +4518,9 @@ concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right, if (right_len == 0) { to->right_anchor |= left->right_anchor; } + else { + to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT); + } } static int @@ -5080,7 +5094,8 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case ANCHOR_END_BUF: case ANCHOR_SEMI_END_BUF: case ANCHOR_END_LINE: - case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */ + case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */ + case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */ add_opt_anc_info(&opt->anc, NANCHOR(node)->type); break; @@ -5103,7 +5118,6 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) } break; - case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND_NOT: break; } @@ -5369,7 +5383,8 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML | ANCHOR_LOOK_BEHIND); - reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF); + reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF | + ANCHOR_PREC_READ_NOT); if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) { reg->anchor_dmin = opt.len.min; -- cgit v1.2.3