diff options
author | matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 1998-06-01 04:23:43 +0000 |
---|---|---|
committer | matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 1998-06-01 04:23:43 +0000 |
commit | e6ab550ac5504c4e5f3b2946749f8381513448ff (patch) | |
tree | 454d2b1e5f17c97f13445701b5aa5664bebeb652 | |
parent | cfe64537f8c34e003b49a5c265600787555ae467 (diff) |
regex.c
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/v1_1r@231 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | ext/extmk.rb.in | 2 | ||||
-rw-r--r-- | re.c | 12 | ||||
-rw-r--r-- | regex.c | 152 |
4 files changed, 86 insertions, 85 deletions
@@ -1,3 +1,8 @@ +Sat May 30 07:10:02 1998 Yukihiro Matsumoto <[email protected]> + + * re.c (reg_prepare_re): no more needless regular expression + recompile on casefold conditions. + Thu May 28 18:02:55 1998 Yukihiro Matsumoto <[email protected]> * object.c (nil_plus): no more `+' method for nil. diff --git a/ext/extmk.rb.in b/ext/extmk.rb.in index a2ff7c63ab..847e34f87e 100644 --- a/ext/extmk.rb.in +++ b/ext/extmk.rb.in @@ -472,7 +472,7 @@ if $cache_mod end exit if $install or $clean -$extinit += "" +$extinit = "" unless $extinit if $extlist.size > 0 for s,t in $extlist f = format("%s/%s.o", s, t) @@ -89,6 +89,7 @@ str_cicmp(str1, str2) } #define REG_IGNORECASE FL_USER0 +#define REG_CASESTATE FL_USER1 #define KCODE_NONE 0 #define KCODE_EUC FL_USER2 @@ -376,9 +377,10 @@ reg_prepare_re(reg) if (FL_TEST(reg, REG_IGNORECASE)) { casefold = TRUE; } - if ((casefold && !(RREGEXP(reg)->ptr->options & RE_OPTION_IGNORECASE)) - || (!casefold && (RREGEXP(reg)->ptr->options & RE_OPTION_IGNORECASE))) { + else if ((casefold && !FL_TEST(reg, REG_CASESTATE)) || + (!casefold && FL_TEST(reg, REG_CASESTATE))) { RREGEXP(reg)->ptr->fastmap_accurate = 0; + RBASIC(reg)->flags ^= REG_CASESTATE; need_recompile = 1; } @@ -387,7 +389,7 @@ reg_prepare_re(reg) } else if ((RBASIC(reg)->flags & KCODE_MASK) != reg_kcode) { need_recompile = 1; - RBASIC(reg)->flags = RBASIC(reg)->flags & ~KCODE_MASK; + RBASIC(reg)->flags &= ~KCODE_MASK; RBASIC(reg)->flags |= reg_kcode; } @@ -647,6 +649,10 @@ reg_new_1(klass, s, len, options) } kcode_set_option(re); + if (RTEST(ignorecase)) { + options |= RE_OPTION_IGNORECASE; + FL_SET(re, REG_CASESTATE); + } re->ptr = make_regexp(s, len, options & 0x3); re->str = ALLOC_N(char, len+1); memcpy(re->str, s, len); @@ -294,7 +294,6 @@ enum regexpcode and store it in a memory register. Followed by one byte containing the register number. Register numbers must be in the range 0 through RE_NREGS. */ - start_paren, /* Just a mark for starting(?:). */ casefold_on, /* Turn on casefold flag. */ casefold_off, /* Turn off casefold flag. */ start_nowidth, /* Save string point to the stack. */ @@ -661,10 +660,6 @@ print_partial_compiled_pattern(start, end) printf ("/stop_memory/%d", mcnt); break; - case start_paren: - printf ("/start_paren"); - break; - case casefold_on: printf ("/casefold_on"); break; @@ -882,7 +877,6 @@ calculate_must_string(start, end) case casefold_off: return 0; /* should not check must_string */ - case start_paren: case start_nowidth: case stop_nowidth: case pop_and_fail: @@ -1010,6 +1004,10 @@ re_compile_pattern(pattern, size, bufp) char *begalt = b; + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. */ + char *beg_interval; + /* In processing an interval, at least this many matches must be made. */ int lower_bound; @@ -1530,8 +1528,6 @@ re_compile_pattern(pattern, size, bufp) break; case ':': - if (b > bufp->buffer && b[-1] != start_paren) - BUFPUSH(start_paren); pending_exact = 0; default: break; @@ -1616,46 +1612,35 @@ re_compile_pattern(pattern, size, bufp) case '{': /* If there is no previous pattern, this isn't an interval. */ - if (!laststart) + if (!laststart || p == pend) { goto normal_backsl; } - /* It also isn't an interval if not preceded by an re - matching a single character or subexpression, or if - the current type of intervals can't handle back - references and the previous thing is a back reference. */ - - if (! (*laststart == anychar - || *laststart == charset - || *laststart == charset_not - || *laststart == wordchar - || *laststart == notwordchar - || *laststart == start_memory - || *laststart == start_paren - || (*laststart == exactn - && (laststart[1] == 1 - || (laststart[1] == 2 && ismbchar(laststart[2])))) - || *laststart == duplicate)) - { - /* Posix extended syntax is handled in previous - statement; this is for Posix basic syntax. */ - goto normal_backsl; - } + + beg_interval = p - 1; + lower_bound = -1; /* So can see if are set. */ upper_bound = -1; GET_UNSIGNED_NUMBER(lower_bound); if (c == ',') { GET_UNSIGNED_NUMBER(upper_bound); - if (upper_bound < 0) - upper_bound = RE_DUP_MAX; + if (upper_bound < 0) upper_bound = RE_DUP_MAX; } - if (upper_bound < 0) + else + /* Interval such as `{1}' => match exactly once. */ upper_bound = lower_bound; - if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX - || lower_bound > upper_bound - || (p != pend && *p == '{')) { - goto invalid_pattern; - } + + if (lower_bound < 0 || c != '}') + goto unfetch_interval; + + if (lower_bound > RE_DUP_MAX || upper_bound > RE_DUP_MAX) + FREE_AND_RETURN(stackb, "too big quantifier in {,}"); + if (lower_bound > upper_bound) + FREE_AND_RETURN(stackb, "can't do {n,m} with n > m"); + + beg_interval = 0; + pending_exact = 0; + greedy = 1; if (p != pend) { PATFETCH(c); @@ -1663,17 +1648,6 @@ re_compile_pattern(pattern, size, bufp) else PATUNFETCH; } - /* If upper_bound is zero, don't want to succeed at all; - jump from laststart to b + 3, which will be the end of - the buffer after this jump is inserted. */ - - if (upper_bound == 0) { - GET_BUFFER_SPACE(3); - insert_jump(jump, laststart, b + 3, b); - b += 3; - break; - } - if (lower_bound == 0) { zero_times_ok = 1; if (upper_bound == RE_DUP_MAX) { @@ -1685,28 +1659,49 @@ re_compile_pattern(pattern, size, bufp) goto repeat; } } - if (lower_bound == 1 && upper_bound == RE_DUP_MAX) { - many_times_ok = 1; - zero_times_ok = 0; - goto repeat; + if (lower_bound == 1) { + if (upper_bound == 1) { + /* No need to repeat */ + break; + } + if (upper_bound == RE_DUP_MAX) { + many_times_ok = 1; + zero_times_ok = 0; + goto repeat; + } } - /* Star, etc. applied to an empty pattern is equivalent - to an empty pattern. */ - if (!laststart) + /* If upper_bound is zero, don't want to succeed at all; + jump from laststart to b + 3, which will be the end of + the buffer after this jump is inserted. */ + + if (upper_bound == 0) { + GET_BUFFER_SPACE(3); + insert_jump(jump, laststart, b + 3, b); + b += 3; break; + } + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at <jump count> <upper bound> + set_number_at <succeed_n count> <lower bound> + succeed_n <after jump addr> <succed_n count> + <body of loop> + jump_n <succeed_n addr> <jump count> + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ { /* If the upper bound is > 1, we need to insert more at the end of the loop. */ - unsigned slots_needed = upper_bound == 1 ? 5 : 10; + unsigned nbytes = upper_bound == 1 ? 10 : 20; - GET_BUFFER_SPACE(5); + GET_BUFFER_SPACE(nbytes); /* Initialize lower bound of the `succeed_n', even though it will be set during matching by its attendant `set_number_at' (inserted next), because `re_compile_fastmap' needs to know. Jump to the `jump_n' we might insert below. */ - insert_jump_n(succeed_n, laststart, b + slots_needed, + insert_jump_n(succeed_n, laststart, b + (nbytes/2), b, lower_bound); b += 5; /* Just increment for the succeed_n here. */ @@ -1714,7 +1709,6 @@ re_compile_pattern(pattern, size, bufp) before the `succeed_n'. The `5' is the last two bytes of this `set_number_at', plus 3 bytes of the following `succeed_n'. */ - GET_BUFFER_SPACE(5); insert_op_2(set_number_at, laststart, b, 5, lower_bound); b += 5; @@ -1727,7 +1721,8 @@ re_compile_pattern(pattern, size, bufp) we'll have matched the interval once, so jump back only `upper_bound - 1' times. */ GET_BUFFER_SPACE(5); - store_jump_n(b, greedy?jump_n:finalize_push_n, laststart + 5, upper_bound - 1); + store_jump_n(b, greedy?jump_n:finalize_push_n, laststart + 5, + upper_bound - 1); b += 5; /* The location we want to set is the second @@ -1744,24 +1739,22 @@ re_compile_pattern(pattern, size, bufp) We insert this at the beginning of the loop so that if we fail during matching, we'll reinitialize the bounds. */ - GET_BUFFER_SPACE(5); - insert_op_2(set_number_at, laststart, b, b - laststart, upper_bound - 1); + insert_op_2(set_number_at, laststart, b, b - laststart, + upper_bound - 1); b += 5; - - GET_BUFFER_SPACE(5); - BUFPUSH(set_number_at); - STORE_NUMBER_AND_INCR(b, laststart - b + 11); - STORE_NUMBER_AND_INCR(b, lower_bound); - - GET_BUFFER_SPACE(5); - BUFPUSH(set_number_at); - STORE_NUMBER_AND_INCR(b, -10); - STORE_NUMBER_AND_INCR(b, upper_bound - 1); } - pending_exact = 0; } break; + unfetch_interval: + /* If an invalid interval, match the characters as literals. */ + p = beg_interval; + beg_interval = 0; + + /* normal_char and normal_backslash need `c'. */ + PATFETCH (c); + goto normal_char; + case '\\': if (p == pend) goto invalid_pattern; /* Do not translate the character after the \, so that we can @@ -2246,7 +2239,6 @@ re_compile_fastmap(bufp) case wordbeg: case wordend: case pop_and_fail: - case start_paren: continue; case casefold_on: @@ -2283,6 +2275,7 @@ re_compile_fastmap(bufp) if ((enum regexpcode) *p != on_failure_jump && (enum regexpcode) *p != try_next + && (enum regexpcode) *p != succeed_n && (enum regexpcode) *p != finalize_push && (enum regexpcode) *p != finalize_push_n) continue; @@ -2718,11 +2711,11 @@ typedef union { \ unsigned char **stackx; \ unsigned int len = stacke - stackb; \ - if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \ + /* if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \ { \ FREE_VARIABLES(); \ FREE_AND_RETURN(stackb,(-2)); \ - } \ + }*/ \ \ /* Roughly double the size of the stack. */ \ EXPAND_FAIL_STACK(stackx, stackb, len); \ @@ -3387,7 +3380,7 @@ re_match(bufp, string_arg, size, pos, regs) EXTRACT_NUMBER_AND_INCR(mcnt, p); PUSH_FAILURE_POINT(p + mcnt, d); stackp[-1] = (unsigned char*)1; - p += 7; /* skip n and set_number_at after destination */ + p += 2; /* skip n */ } /* If don't have to push any more, skip over the rest of command. */ else @@ -3399,9 +3392,6 @@ re_match(bufp, string_arg, size, pos, regs) case unused: continue; - case start_paren: - continue; - case casefold_on: options |= RE_OPTION_IGNORECASE; continue; |