1 #ifdef PERL_EXT_RE_BUILD
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_STUDY_C
11 #ifdef PERL_IN_XSUB_RE
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
21 #define INIT_AND_WITHP \
23 Newx(and_withp, 1, regnode_ssc); \
28 S_unwind_scan_frames(pTHX_ const void *p)
30 PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
31 scan_frame *f= (scan_frame *)p;
33 scan_frame *n= f->next_frame;
39 /* Follow the next-chain of the current node and optimize away
40 all the NOTHINGs from it.
43 S_rck_elide_nothing(pTHX_ regnode *node)
45 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
47 if (OP(node) != CURLYX) {
48 const int max = (REGNODE_OFF_BY_ARG(OP(node))
50 /* I32 may be smaller than U16 on CRAYs! */
51 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
52 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG1u(node) : NEXT_OFF(node));
56 /* Skip NOTHING and LONGJMP. */
60 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
61 || ((OP(n) == LONGJMP) && (noff = ARG1u(n)))
67 if (REGNODE_OFF_BY_ARG(OP(node)))
77 * As best we can, determine the characters that can match the start of
78 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
79 * can be false positive matches
81 * Returns the invlist as a new SV*; it is the caller's responsibility to
82 * call SvREFCNT_dec() when done with it.
85 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
87 const U8 * s = (U8*)STRING(node);
88 SSize_t bytelen = STR_LEN(node);
90 /* Start out big enough for 2 separate code points */
91 SV* invlist = _new_invlist(4);
93 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
98 /* We punt and assume can match anything if the node begins
99 * with a multi-character fold. Things are complicated. For
100 * example, /ffi/i could match any of:
101 * "\N{LATIN SMALL LIGATURE FFI}"
102 * "\N{LATIN SMALL LIGATURE FF}I"
103 * "F\N{LATIN SMALL LIGATURE FI}"
104 * plus several other things; and making sure we have all the
105 * possibilities is hard. */
106 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
107 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
110 /* Any Latin1 range character can potentially match any
111 * other depending on the locale, and in Turkic locales, 'I' and
112 * 'i' can match U+130 and U+131 */
113 if (OP(node) == EXACTFL) {
114 _invlist_union(invlist, PL_Latin1, &invlist);
115 if (isALPHA_FOLD_EQ(uc, 'I')) {
116 invlist = add_cp_to_invlist(invlist,
117 LATIN_SMALL_LETTER_DOTLESS_I);
118 invlist = add_cp_to_invlist(invlist,
119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
123 /* But otherwise, it matches at least itself. We can
124 * quickly tell if it has a distinct fold, and if so,
125 * it matches that as well */
126 invlist = add_cp_to_invlist(invlist, uc);
127 if (IS_IN_SOME_FOLD_L1(uc))
128 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
131 /* Some characters match above-Latin1 ones under /i. This
132 * is true of EXACTFL ones when the locale is UTF-8 */
133 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
134 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
137 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
141 else { /* Pattern is UTF-8 */
142 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
143 const U8* e = s + bytelen;
146 fc = uc = utf8_to_uv_or_die(s, s + bytelen, NULL);
148 /* The only code points that aren't folded in a UTF EXACTFish
149 * node are the problematic ones in EXACTFL nodes */
150 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
151 /* We need to check for the possibility that this EXACTFL
152 * node begins with a multi-char fold. Therefore we fold
153 * the first few characters of it so that we can make that
159 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
161 *(d++) = (U8) toFOLD(*s);
162 if (fc < 0) { /* Save the first fold */
169 UV fold = toFOLD_utf8_safe(s, e, d, &len);
170 if (fc < 0) { /* Save the first fold */
178 /* And set up so the code below that looks in this folded
179 * buffer instead of the node's string */
184 /* When we reach here 's' points to the fold of the first
185 * character(s) of the node; and 'e' points to far enough along
186 * the folded string to be just past any possible multi-char
189 * Like the non-UTF case above, we punt if the node begins with a
192 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
193 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
195 else { /* Single char fold */
198 const U32 * remaining_folds;
201 /* It matches itself */
202 invlist = add_cp_to_invlist(invlist, fc);
204 /* ... plus all the things that fold to it, which are found in
205 * PL_utf8_foldclosures */
206 folds_count = _inverse_folds(fc, &first_fold,
208 for (k = 0; k < folds_count; k++) {
209 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
211 /* /aa doesn't allow folds between ASCII and non- */
212 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
213 && isASCII(c) != isASCII(fc))
218 invlist = add_cp_to_invlist(invlist, c);
221 if (OP(node) == EXACTFL) {
223 /* If either [iI] are present in an EXACTFL node the above code
224 * should have added its normal case pair, but under a Turkish
225 * locale they could match instead the case pairs from it. Add
226 * those as potential matches as well */
227 if (isALPHA_FOLD_EQ(fc, 'I')) {
228 invlist = add_cp_to_invlist(invlist,
229 LATIN_SMALL_LETTER_DOTLESS_I);
230 invlist = add_cp_to_invlist(invlist,
231 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
233 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
234 invlist = add_cp_to_invlist(invlist, 'I');
236 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
237 invlist = add_cp_to_invlist(invlist, 'i');
247 /* Mark that we cannot extend a found fixed substring at this point.
248 Update the longest found anchored substring or the longest found
249 floating substrings if needed. */
252 Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
253 SSize_t *minlenp, int is_inf)
255 const STRLEN l = CHR_SVLEN(data->last_found);
256 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
257 const STRLEN old_l = CHR_SVLEN(longest_sv);
258 DECLARE_AND_GET_RE_DEBUG_FLAGS;
260 PERL_ARGS_ASSERT_SCAN_COMMIT;
262 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
263 const U8 i = data->cur_is_floating;
264 SvSetMagicSV(longest_sv, data->last_found);
265 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
268 data->substrs[0].max_offset = data->substrs[0].min_offset;
270 data->substrs[1].max_offset =
274 ? data->last_start_max
275 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
277 : data->pos_min + data->pos_delta));
280 data->substrs[i].flags &= ~SF_BEFORE_EOL;
281 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
282 data->substrs[i].minlenp = minlenp;
283 data->substrs[i].lookbehind = 0;
286 SvCUR_set(data->last_found, 0);
288 SV * const sv = data->last_found;
289 if (SvUTF8(sv) && SvMAGICAL(sv)) {
290 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
296 data->flags &= ~SF_BEFORE_EOL;
297 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
300 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
301 * list that describes which code points it matches */
304 S_ssc_anything(pTHX_ regnode_ssc *ssc)
306 /* Set the SSC 'ssc' to match an empty string or any code point */
308 PERL_ARGS_ASSERT_SSC_ANYTHING;
310 assert(is_ANYOF_SYNTHETIC(ssc));
312 /* mortalize so won't leak */
313 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
314 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
318 S_ssc_is_anything(const regnode_ssc *ssc)
320 /* Returns true if the SSC 'ssc' can match the empty string and any code
321 * point; false otherwise. Thus, this is used to see if using 'ssc' buys
322 * us anything: if the function returns true, 'ssc' hasn't been restricted
323 * in any way, so there's no point in using it */
325 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
328 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
330 assert(is_ANYOF_SYNTHETIC(ssc));
332 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
336 /* See if the list consists solely of the range 0 - Infinity */
337 invlist_iterinit(ssc->invlist);
338 ret = invlist_iternext(ssc->invlist, &start, &end)
342 invlist_iterfinish(ssc->invlist);
348 /* If e.g., both \w and \W are set, matches everything */
349 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
351 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
352 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
362 Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
364 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
365 * string, any code point, or any posix class under locale */
367 PERL_ARGS_ASSERT_SSC_INIT;
369 Zero(ssc, 1, regnode_ssc);
370 set_ANYOF_SYNTHETIC(ssc);
371 ARG1u_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
374 /* If any portion of the regex is to operate under locale rules that aren't
375 * fully known at compile time, initialization includes it. The reason
376 * this isn't done for all regexes is that the optimizer was written under
377 * the assumption that locale was all-or-nothing. Given the complexity and
378 * lack of documentation in the optimizer, and that there are inadequate
379 * test cases for locale, many parts of it may not work properly, it is
380 * safest to avoid locale unless necessary. */
381 if (RExC_contains_locale) {
382 ANYOF_POSIXL_SETALL(ssc);
385 ANYOF_POSIXL_ZERO(ssc);
390 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
391 const regnode_ssc *ssc)
393 /* Returns true if the SSC 'ssc' is in its initial state with regard only
394 * to the list of code points matched, and locale posix classes; hence does
395 * not check its flags) */
397 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
400 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
402 assert(is_ANYOF_SYNTHETIC(ssc));
404 invlist_iterinit(ssc->invlist);
405 ret = invlist_iternext(ssc->invlist, &start, &end)
409 invlist_iterfinish(ssc->invlist);
415 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
424 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
425 const regnode_charclass* const node)
427 /* Returns a mortal inversion list defining which code points are matched
428 * by 'node', which is of ANYOF-ish type . Handles complementing the
429 * result if appropriate. If some code points aren't knowable at this
430 * time, the returned list must, and will, contain every code point that is
434 SV* only_utf8_locale_invlist = NULL;
435 bool new_node_has_latin1 = false;
436 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
440 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
442 /* Look at the data structure created by S_set_ANYOF_arg() */
443 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
444 invlist = sv_2mortal(_new_invlist(1));
445 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
447 else if (ANYOF_HAS_AUX(node)) {
448 const U32 n = ARG1u(node);
449 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
450 AV * const av = AV_FROM_REF(rv);
451 SV **const ary = AvARRAY(av);
453 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
455 /* Here there are things that won't be known until runtime -- we
456 * have to assume it could be anything */
457 invlist = sv_2mortal(_new_invlist(1));
458 return _add_range_to_invlist(invlist, 0, UV_MAX);
460 else if (ary[INVLIST_INDEX]) {
462 /* Use the node's inversion list */
463 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
466 /* Get the code points valid only under UTF-8 locales */
467 if ( (flags & ANYOFL_FOLD)
468 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
470 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
475 invlist = sv_2mortal(_new_invlist(0));
478 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
479 * code points, and an inversion list for the others, but if there are code
480 * points that should match only conditionally on the target string being
481 * UTF-8, those are placed in the inversion list, and not the bitmap.
482 * Since there are circumstances under which they could match, they are
483 * included in the SSC. But if the ANYOF node is to be inverted, we have
484 * to exclude them here, so that when we invert below, the end result
485 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
486 * have to do this here before we add the unconditionally matched code
488 if (flags & ANYOF_INVERT) {
489 _invlist_intersection_complement_2nd(invlist,
494 /* Add in the points from the bit map */
495 if (REGNODE_TYPE(OP(node)) == ANYOF){
496 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
497 if (ANYOF_BITMAP_TEST(node, i)) {
498 unsigned int start = i++;
500 for (; i < NUM_ANYOF_CODE_POINTS
501 && ANYOF_BITMAP_TEST(node, i); ++i)
505 invlist = _add_range_to_invlist(invlist, start, i-1);
506 new_node_has_latin1 = true;
511 /* If this can match all upper Latin1 code points, have to add them
512 * as well. But don't add them if inverting, as when that gets done below,
513 * it would exclude all these characters, including the ones it shouldn't
514 * that were added just above */
515 if ( ! (flags & ANYOF_INVERT)
516 && OP(node) == ANYOFD
517 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
519 _invlist_union(invlist, PL_UpperLatin1, &invlist);
522 /* Similarly for these */
523 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
524 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
527 if (flags & ANYOF_INVERT) {
528 _invlist_invert(invlist);
530 else if (flags & ANYOFL_FOLD) {
531 if (new_node_has_latin1) {
533 /* These folds are potential in Turkic locales */
534 if (_invlist_contains_cp(invlist, 'i')) {
535 invlist = add_cp_to_invlist(invlist,
536 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
538 if (_invlist_contains_cp(invlist, 'I')) {
539 invlist = add_cp_to_invlist(invlist,
540 LATIN_SMALL_LETTER_DOTLESS_I);
543 /* Under /li, any 0-255 could fold to any other 0-255, depending on
544 * the locale. We can skip this if there are no 0-255 at all. */
545 _invlist_union(invlist, PL_Latin1, &invlist);
548 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
549 invlist = add_cp_to_invlist(invlist, 'I');
551 if (_invlist_contains_cp(invlist,
552 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
554 invlist = add_cp_to_invlist(invlist, 'i');
559 /* Similarly add the UTF-8 locale possible matches. These have to be
560 * deferred until after the non-UTF-8 locale ones are taken care of just
561 * above, or it leads to wrong results under ANYOF_INVERT */
562 if (only_utf8_locale_invlist) {
563 _invlist_union_maybe_complement_2nd(invlist,
564 only_utf8_locale_invlist,
565 flags & ANYOF_INVERT,
572 /* 'AND' a given class with another one. Can create false positives. 'ssc'
573 * should not be inverted. */
576 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
577 const regnode_charclass *and_with)
579 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
580 * another SSC or a regular ANYOF class. Can create false positives. */
583 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
584 ? ANYOF_FLAGS(and_with)
588 PERL_ARGS_ASSERT_SSC_AND;
590 assert(is_ANYOF_SYNTHETIC(ssc));
592 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
593 * the code point inversion list and just the relevant flags */
594 if (is_ANYOF_SYNTHETIC(and_with)) {
595 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
596 anded_flags = and_with_flags;
598 /* XXX This is a kludge around what appears to be deficiencies in the
599 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
600 * there are paths through the optimizer where it doesn't get weeded
601 * out when it should. And if we don't make some extra provision for
602 * it like the code just below, it doesn't get added when it should.
603 * This solution is to add it only when AND'ing, which is here, and
604 * only when what is being AND'ed is the pristine, original node
605 * matching anything. Thus it is like adding it to ssc_anything() but
606 * only when the result is to be AND'ed. Probably the same solution
607 * could be adopted for the same problem we have with /l matching,
608 * which is solved differently in S_ssc_init(), and that would lead to
609 * fewer false positives than that solution has. But if this solution
610 * creates bugs, the consequences are only that a warning isn't raised
611 * that should be; while the consequences for having /l bugs is
612 * incorrect matches */
613 if (ssc_is_anything((regnode_ssc *)and_with)) {
614 anded_flags |= ANYOF_WARN_SUPER__shared;
618 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
619 if (OP(and_with) == ANYOFD) {
620 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
623 anded_flags = and_with_flags
624 & ( ANYOF_COMMON_FLAGS
625 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
626 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
627 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
628 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
633 ANYOF_FLAGS(ssc) &= anded_flags;
635 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
636 * C2 is the list of code points in 'and-with'; P2, its posix classes.
637 * 'and_with' may be inverted. When not inverted, we have the situation of
639 * (C1 | P1) & (C2 | P2)
640 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
641 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
642 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
643 * <= ((C1 & C2) | P1 | P2)
644 * Alternatively, the last few steps could be:
645 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
646 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
647 * <= (C1 | C2 | (P1 & P2))
648 * We favor the second approach if either P1 or P2 is non-empty. This is
649 * because these components are a barrier to doing optimizations, as what
650 * they match cannot be known until the moment of matching as they are
651 * dependent on the current locale, 'AND"ing them likely will reduce or
653 * But we can do better if we know that C1,P1 are in their initial state (a
654 * frequent occurrence), each matching everything:
655 * (<everything>) & (C2 | P2) = C2 | P2
656 * Similarly, if C2,P2 are in their initial state (again a frequent
657 * occurrence), the result is a no-op
658 * (C1 | P1) & (<everything>) = C1 | P1
661 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
662 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
663 * <= (C1 & ~C2) | (P1 & ~P2)
666 if ((and_with_flags & ANYOF_INVERT)
667 && ! is_ANYOF_SYNTHETIC(and_with))
671 ssc_intersection(ssc,
673 false /* Has already been inverted */
676 /* If either P1 or P2 is empty, the intersection will be also; can skip
678 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
679 ANYOF_POSIXL_ZERO(ssc);
681 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
683 /* Note that the Posix class component P from 'and_with' actually
685 * P = Pa | Pb | ... | Pn
686 * where each component is one posix class, such as in [\w\s].
688 * ~P = ~(Pa | Pb | ... | Pn)
689 * = ~Pa & ~Pb & ... & ~Pn
690 * <= ~Pa | ~Pb | ... | ~Pn
691 * The last is something we can easily calculate, but unfortunately
692 * is likely to have many false positives. We could do better
693 * in some (but certainly not all) instances if two classes in
694 * P have known relationships. For example
695 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
697 * :lower: & :print: = :lower:
698 * And similarly for classes that must be disjoint. For example,
699 * since \s and \w can have no elements in common based on rules in
700 * the POSIX standard,
702 * Unfortunately, some vendor locales do not meet the Posix
703 * standard, in particular almost everything by Microsoft.
704 * The loop below just changes e.g., \w into \W and vice versa */
706 regnode_charclass_posixl temp;
707 int add = 1; /* To calculate the index of the complement */
709 Zero(&temp, 1, regnode_charclass_posixl);
710 ANYOF_POSIXL_ZERO(&temp);
711 for (i = 0; i < ANYOF_MAX; i++) {
713 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
714 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
716 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
717 ANYOF_POSIXL_SET(&temp, i + add);
719 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
721 ANYOF_POSIXL_AND(&temp, ssc);
723 } /* else ssc already has no posixes */
724 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
725 in its initial state */
726 else if (! is_ANYOF_SYNTHETIC(and_with)
727 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
729 /* But if 'ssc' is in its initial state, the result is just 'and_with';
730 * copy it over 'ssc' */
731 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
732 if (is_ANYOF_SYNTHETIC(and_with)) {
733 StructCopy(and_with, ssc, regnode_ssc);
736 ssc->invlist = anded_cp_list;
737 ANYOF_POSIXL_ZERO(ssc);
738 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
739 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
743 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
744 || (and_with_flags & ANYOF_MATCHES_POSIXL))
746 /* One or the other of P1, P2 is non-empty. */
747 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
748 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
750 ssc_union(ssc, anded_cp_list, false);
752 else { /* P1 = P2 = empty */
753 ssc_intersection(ssc, anded_cp_list, false);
759 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
760 const regnode_charclass *or_with)
762 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
763 * another SSC or a regular ANYOF class. Can create false positives if
764 * 'or_with' is to be inverted. */
768 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
769 ? ANYOF_FLAGS(or_with)
772 PERL_ARGS_ASSERT_SSC_OR;
774 assert(is_ANYOF_SYNTHETIC(ssc));
776 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
777 * the code point inversion list and just the relevant flags */
778 if (is_ANYOF_SYNTHETIC(or_with)) {
779 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
780 ored_flags = or_with_flags;
783 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
784 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
785 if (OP(or_with) != ANYOFD) {
787 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
788 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
789 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
790 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
795 ANYOF_FLAGS(ssc) |= ored_flags;
797 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
798 * C2 is the list of code points in 'or-with'; P2, its posix classes.
799 * 'or_with' may be inverted. When not inverted, we have the simple
800 * situation of computing:
801 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
802 * If P1|P2 yields a situation with both a class and its complement are
803 * set, like having both \w and \W, this matches all code points, and we
804 * can delete these from the P component of the ssc going forward. XXX We
805 * might be able to delete all the P components, but I (khw) am not certain
806 * about this, and it is better to be safe.
809 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
812 * (which results in actually simpler code than the non-inverted case)
815 if ((or_with_flags & ANYOF_INVERT)
816 && ! is_ANYOF_SYNTHETIC(or_with))
818 /* We ignore P2, leaving P1 going forward */
819 } /* else Not inverted */
820 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
821 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
822 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
824 for (i = 0; i < ANYOF_MAX; i += 2) {
825 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
827 ssc_match_all_cp(ssc);
828 ANYOF_POSIXL_CLEAR(ssc, i);
829 ANYOF_POSIXL_CLEAR(ssc, i+1);
837 false /* Already has been inverted */
842 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
844 PERL_ARGS_ASSERT_SSC_UNION;
846 assert(is_ANYOF_SYNTHETIC(ssc));
848 _invlist_union_maybe_complement_2nd(ssc->invlist,
855 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
857 const bool invert2nd)
859 PERL_ARGS_ASSERT_SSC_INTERSECTION;
861 assert(is_ANYOF_SYNTHETIC(ssc));
863 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
870 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
872 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
874 assert(is_ANYOF_SYNTHETIC(ssc));
876 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
880 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
882 /* AND just the single code point 'cp' into the SSC 'ssc' */
884 SV* cp_list = _new_invlist(2);
886 PERL_ARGS_ASSERT_SSC_CP_AND;
888 assert(is_ANYOF_SYNTHETIC(ssc));
890 cp_list = add_cp_to_invlist(cp_list, cp);
891 ssc_intersection(ssc, cp_list,
892 false /* Not inverted */
894 SvREFCNT_dec_NN(cp_list);
898 S_ssc_clear_locale(regnode_ssc *ssc)
900 /* Set the SSC 'ssc' to not match any locale things */
901 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
903 assert(is_ANYOF_SYNTHETIC(ssc));
905 ANYOF_POSIXL_ZERO(ssc);
906 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
911 /* The below joins as many adjacent EXACTish nodes as possible into a single
912 * one. The regop may be changed if the node(s) contain certain sequences that
913 * require special handling. The joining is only done if:
914 * 1) there is room in the current conglomerated node to entirely contain the
916 * 2) they are compatible node types
918 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
919 * these get optimized out
921 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
922 * as possible, even if that means splitting an existing node so that its first
923 * part is moved to the preceding node. This would maximise the efficiency of
924 * memEQ during matching.
926 * If a node is to match under /i (folded), the number of characters it matches
927 * can be different than its character length if it contains a multi-character
928 * fold. *min_subtract is set to the total delta number of characters of the
931 * And *unfolded_multi_char is set to indicate whether or not the node contains
932 * an unfolded multi-char fold. This happens when it won't be known until
933 * runtime whether the fold is valid or not; namely
934 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
935 * target string being matched against turns out to be UTF-8 is that fold
937 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
939 * (Multi-char folds whose components are all above the Latin1 range are not
940 * run-time locale dependent, and have already been folded by the time this
941 * function is called.)
943 * This is as good a place as any to discuss the design of handling these
944 * multi-character fold sequences. It's been wrong in Perl for a very long
945 * time. There are three code points in Unicode whose multi-character folds
946 * were long ago discovered to mess things up. The previous designs for
947 * dealing with these involved assigning a special node for them. This
948 * approach doesn't always work, as evidenced by this example:
949 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
950 * Both sides fold to "sss", but if the pattern is parsed to create a node that
951 * would match just the \xDF, it won't be able to handle the case where a
952 * successful match would have to cross the node's boundary. The new approach
953 * that hopefully generally solves the problem generates an EXACTFUP node
954 * that is "sss" in this case.
956 * It turns out that there are problems with all multi-character folds, and not
957 * just these three. Now the code is general, for all such cases. The
959 * 1) This routine examines each EXACTFish node that could contain multi-
960 * character folded sequences. Since a single character can fold into
961 * such a sequence, the minimum match length for this node is less than
962 * the number of characters in the node. This routine returns in
963 * *min_subtract how many characters to subtract from the actual
964 * length of the string to get a real minimum match length; it is 0 if
965 * there are no multi-char foldeds. This delta is used by the caller to
966 * adjust the min length of the match, and the delta between min and max,
967 * so that the optimizer doesn't reject these possibilities based on size
970 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
971 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
972 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
973 * EXACTFU nodes. The node type of such nodes is then changed to
974 * EXACTFUP, indicating it is problematic, and needs careful handling.
975 * (The procedures in step 1) above are sufficient to handle this case in
976 * UTF-8 encoded nodes.) The reason this is problematic is that this is
977 * the only case where there is a possible fold length change in non-UTF-8
978 * patterns. By reserving a special node type for problematic cases, the
979 * far more common regular EXACTFU nodes can be processed faster.
980 * regexec.c takes advantage of this.
982 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
983 * problematic cases. These all only occur when the pattern is not
984 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
985 * length change, it handles the situation where the string cannot be
986 * entirely folded. The strings in an EXACTFish node are folded as much
987 * as possible during compilation in regcomp.c. This saves effort in
988 * regex matching. By using an EXACTFUP node when it is not possible to
989 * fully fold at compile time, regexec.c can know that everything in an
990 * EXACTFU node is folded, so folding can be skipped at runtime. The only
991 * case where folding in EXACTFU nodes can't be done at compile time is
992 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
993 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
994 * handle two very different cases. Alternatively, there could have been
995 * a node type where there are length changes, one for unfolded, and one
996 * for both. If yet another special case needed to be created, the number
997 * of required node types would have to go to 7. khw figures that even
998 * though there are plenty of node types to spare, that the maintenance
999 * cost wasn't worth the small speedup of doing it that way, especially
1000 * since he thinks the MICRO SIGN is rarely encountered in practice.
1002 * There are other cases where folding isn't done at compile time, but
1003 * none of them are under /u, and hence not for EXACTFU nodes. The folds
1004 * in EXACTFL nodes aren't known until runtime, and vary as the locale
1005 * changes. Some folds in EXACTF depend on if the runtime target string
1006 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
1007 * when no fold in it depends on the UTF-8ness of the target string.)
1009 * 3) A problem remains for unfolded multi-char folds. (These occur when the
1010 * validity of the fold won't be known until runtime, and so must remain
1011 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
1012 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
1013 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
1014 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
1015 * The reason this is a problem is that the optimizer part of regexec.c
1016 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
1017 * that a character in the pattern corresponds to at most a single
1018 * character in the target string. (And I do mean character, and not byte
1019 * here, unlike other parts of the documentation that have never been
1020 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
1021 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
1022 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
1023 * EXACTFL nodes, violate the assumption, and they are the only instances
1024 * where it is violated. I'm reluctant to try to change the assumption,
1025 * as the code involved is impenetrable to me (khw), so instead the code
1026 * here punts. This routine examines EXACTFL nodes, and (when the pattern
1027 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
1028 * boolean indicating whether or not the node contains such a fold. When
1029 * it is true, the caller sets a flag that later causes the optimizer in
1030 * this file to not set values for the floating and fixed string lengths,
1031 * and thus avoids the optimizer code in regexec.c that makes the invalid
1032 * assumption. Thus, there is no optimization based on string lengths for
1033 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
1034 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
1035 * assumption is wrong only in these cases is that all other non-UTF-8
1036 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
1037 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
1038 * EXACTF nodes because we don't know at compile time if it actually
1039 * matches 'ss' or not. For EXACTF nodes it will match iff the target
1040 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
1041 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
1042 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
1043 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
1044 * string would require the pattern to be forced into UTF-8, the overhead
1045 * of which we want to avoid. Similarly the unfolded multi-char folds in
1046 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
1049 * Similarly, the code that generates tries doesn't currently handle
1050 * not-already-folded multi-char folds, and it looks like a pain to change
1051 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
1052 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
1053 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
1054 * using /iaa matching will be doing so almost entirely with ASCII
1055 * strings, so this should rarely be encountered in practice */
1058 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
1059 UV *min_subtract, bool *unfolded_multi_char,
1060 U32 flags, regnode *val, U32 depth)
1062 /* Merge several consecutive EXACTish nodes into one. */
1064 regnode *n = regnext(scan);
1066 regnode *next = REGNODE_AFTER_varies(scan);
1070 regnode *stop = scan;
1071 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1073 PERL_UNUSED_ARG(depth);
1076 PERL_ARGS_ASSERT_JOIN_EXACT;
1077 #ifndef EXPERIMENTAL_INPLACESCAN
1078 PERL_UNUSED_ARG(flags);
1079 PERL_UNUSED_ARG(val);
1081 DEBUG_PEEP("join", scan, depth, 0);
1083 assert(REGNODE_TYPE(OP(scan)) == EXACT);
1085 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
1086 * EXACT ones that are mergeable to the current one. */
1088 && ( REGNODE_TYPE(OP(n)) == NOTHING
1089 || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
1091 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
1094 if (OP(n) == TAIL || n > next)
1096 if (REGNODE_TYPE(OP(n)) == NOTHING) {
1097 DEBUG_PEEP("skip:", n, depth, 0);
1098 NEXT_OFF(scan) += NEXT_OFF(n);
1099 next = n + NODE_STEP_REGNODE;
1106 else if (stringok) {
1107 const unsigned int oldl = STR_LEN(scan);
1108 regnode * const nnext = regnext(n);
1110 /* XXX I (khw) kind of doubt that this works on platforms (should
1111 * Perl ever run on one) where U8_MAX is above 255 because of lots
1112 * of other assumptions */
1113 /* Don't join if the sum can't fit into a single node */
1114 if (oldl + STR_LEN(n) > U8_MAX)
1117 /* Joining something that requires UTF-8 with something that
1118 * doesn't, means the result requires UTF-8. */
1119 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
1120 OP(scan) = EXACT_REQ8;
1122 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
1123 ; /* join is compatible, no need to change OP */
1125 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
1126 OP(scan) = EXACTFU_REQ8;
1128 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
1129 ; /* join is compatible, no need to change OP */
1131 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
1132 ; /* join is compatible, no need to change OP */
1134 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
1136 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
1137 * which can join with EXACTFU ones. We check for this case
1138 * here. These need to be resolved to either EXACTFU or
1139 * EXACTF at joining time. They have nothing in them that
1140 * would forbid them from being the more desirable EXACTFU
1141 * nodes except that they begin and/or end with a single [Ss].
1142 * The reason this is problematic is because they could be
1143 * joined in this loop with an adjacent node that ends and/or
1144 * begins with [Ss] which would then form the sequence 'ss',
1145 * which matches differently under /di than /ui, in which case
1146 * EXACTFU can't be used. If the 'ss' sequence doesn't get
1147 * formed, the nodes get absorbed into any adjacent EXACTFU
1148 * node. And if the only adjacent node is EXACTF, they get
1149 * absorbed into that, under the theory that a longer node is
1150 * better than two shorter ones, even if one is EXACTFU. Note
1151 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
1152 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
1154 if (STRING(n)[STR_LEN(n)-1] == 's') {
1156 /* Here the joined node would end with 's'. If the node
1157 * following the combination is an EXACTF one, it's better to
1158 * join this trailing edge 's' node with that one, leaving the
1159 * current one in 'scan' be the more desirable EXACTFU */
1160 if (OP(nnext) == EXACTF) {
1164 OP(scan) = EXACTFU_S_EDGE;
1166 } /* Otherwise, the beginning 's' of the 2nd node just
1167 becomes an interior 's' in 'scan' */
1169 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
1170 ; /* join is compatible, no need to change OP */
1172 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
1174 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
1175 * nodes. But the latter nodes can be also joined with EXACTFU
1176 * ones, and that is a better outcome, so if the node following
1177 * 'n' is EXACTFU, quit now so that those two can be joined
1179 if (OP(nnext) == EXACTFU) {
1183 /* The join is compatible, and the combined node will be
1184 * EXACTF. (These don't care if they begin or end with 's' */
1186 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
1187 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
1188 && STRING(n)[0] == 's')
1190 /* When combined, we have the sequence 'ss', which means we
1191 * have to remain /di */
1195 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
1196 if (STRING(n)[0] == 's') {
1197 ; /* Here the join is compatible and the combined node
1198 starts with 's', no need to change OP */
1200 else { /* Now the trailing 's' is in the interior */
1204 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
1206 /* The join is compatible, and the combined node will be
1207 * EXACTF. (These don't care if they begin or end with 's' */
1210 else if (OP(scan) != OP(n)) {
1212 /* The only other compatible joinings are the same node type */
1216 DEBUG_PEEP("merg", n, depth, 0);
1221 next = REGNODE_AFTER_varies(n);
1222 NEXT_OFF(scan) += NEXT_OFF(n);
1223 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
1224 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
1225 /* Now we can overwrite *n : */
1226 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1234 #ifdef EXPERIMENTAL_INPLACESCAN
1235 if (flags && !NEXT_OFF(n)) {
1236 DEBUG_PEEP("atch", val, depth, 0);
1237 if (REGNODE_OFF_BY_ARG(OP(n))) {
1238 ARG1u_SET(n, val - n);
1241 NEXT_OFF(n) = val - n;
1248 /* This temporary node can now be turned into EXACTFU, and must, as
1249 * regexec.c doesn't handle it */
1250 if (OP(scan) == EXACTFU_S_EDGE) {
1255 *unfolded_multi_char = false;
1257 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
1258 * can now analyze for sequences of problematic code points. (Prior to
1259 * this final joining, sequences could have been split over boundaries, and
1260 * hence missed). The sequences only happen in folding, hence for any
1261 * non-EXACT EXACTish node */
1262 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
1263 U8* s0 = (U8*) STRING(scan);
1265 U8* s_end = s0 + STR_LEN(scan);
1267 int total_count_delta = 0; /* Total delta number of characters that
1268 multi-char folds expand to */
1270 /* One pass is made over the node's string looking for all the
1271 * possibilities. To avoid some tests in the loop, there are two main
1272 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
1277 if (OP(scan) == EXACTFL) {
1280 /* An EXACTFL node would already have been changed to another
1281 * node type unless there is at least one character in it that
1282 * is problematic; likely a character whose fold definition
1283 * won't be known until runtime, and so has yet to be folded.
1284 * For all but the UTF-8 locale, folds are 1-1 in length, but
1285 * to handle the UTF-8 case, we need to create a temporary
1286 * folded copy using UTF-8 locale rules in order to analyze it.
1287 * This is because our macros that look to see if a sequence is
1288 * a multi-char fold assume everything is folded (otherwise the
1289 * tests in those macros would be too complicated and slow).
1290 * Note that here, the non-problematic folds will have already
1291 * been done, so we can just copy such characters. We actually
1292 * don't completely fold the EXACTFL string. We skip the
1293 * unfolded multi-char folds, as that would just create work
1294 * below to figure out the size they already are */
1296 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
1299 STRLEN s_len = UTF8SKIP(s);
1300 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
1301 Copy(s, d, s_len, U8);
1304 else if (is_FOLDS_TO_MULTI_utf8(s)) {
1305 *unfolded_multi_char = true;
1306 Copy(s, d, s_len, U8);
1309 else if (isASCII(*s)) {
1310 *(d++) = toFOLD(*s);
1314 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
1320 /* Point the remainder of the routine to look at our temporary
1324 } /* End of creating folded copy of EXACTFL string */
1326 /* Examine the string for a multi-character fold sequence. UTF-8
1327 * patterns have all characters pre-folded by the time this code is
1329 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
1330 length sequence we are looking for is 2 */
1332 int count = 0; /* How many characters in a multi-char fold */
1333 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
1334 if (! len) { /* Not a multi-char fold: get next char */
1339 { /* Here is a generic multi-char fold. */
1340 U8* multi_end = s + len;
1342 /* Count how many characters are in it. In the case of
1343 * /aa, no folds which contain ASCII code points are
1344 * allowed, so check for those, and skip if found. */
1345 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
1346 count = utf8_length(s, multi_end);
1350 while (s < multi_end) {
1353 goto next_iteration;
1363 /* The delta is how long the sequence is minus 1 (1 is how long
1364 * the character that folds to the sequence is) */
1365 total_count_delta += count - 1;
1369 /* We created a temporary folded copy of the string in EXACTFL
1370 * nodes. Therefore we need to be sure it doesn't go below zero,
1371 * as the real string could be shorter */
1372 if (OP(scan) == EXACTFL) {
1373 int total_chars = utf8_length((U8*) STRING(scan),
1374 (U8*) STRING(scan) + STR_LEN(scan));
1375 if (total_count_delta > total_chars) {
1376 total_count_delta = total_chars;
1380 *min_subtract += total_count_delta;
1383 else if (OP(scan) == EXACTFAA) {
1385 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
1386 * fold to the ASCII range (and there are no existing ones in the
1387 * upper latin1 range). But, as outlined in the comments preceding
1388 * this function, we need to flag any occurrences of the sharp s.
1389 * This character forbids trie formation (because of added
1391 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
1392 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
1393 || UNICODE_DOT_DOT_VERSION > 0)
1395 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
1396 OP(scan) = EXACTFAA_NO_TRIE;
1397 *unfolded_multi_char = true;
1403 else if (OP(scan) != EXACTFAA_NO_TRIE) {
1405 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
1406 * folds that are all Latin1. As explained in the comments
1407 * preceding this function, we look also for the sharp s in EXACTF
1408 * and EXACTFL nodes; it can be in the final position. Otherwise
1409 * we can stop looking 1 byte earlier because have to find at least
1410 * two characters for a multi-fold */
1411 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
1416 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
1417 if (! len) { /* Not a multi-char fold. */
1418 if (*s == LATIN_SMALL_LETTER_SHARP_S
1419 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
1421 *unfolded_multi_char = true;
1428 && isALPHA_FOLD_EQ(*s, 's')
1429 && isALPHA_FOLD_EQ(*(s+1), 's'))
1432 /* EXACTF nodes need to know that the minimum length
1433 * changed so that a sharp s in the string can match this
1434 * ss in the pattern, but they remain EXACTF nodes, as they
1435 * won't match this unless the target string is in UTF-8,
1436 * which we don't know until runtime. EXACTFL nodes can't
1437 * transform into EXACTFU nodes */
1438 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
1439 OP(scan) = EXACTFUP;
1443 *min_subtract += len - 1;
1451 /* Allow dumping but overwriting the collection of skipped
1452 * ops and/or strings with fake optimized ops */
1453 n = REGNODE_AFTER_varies(scan);
1461 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
1465 /* REx optimizer. Converts nodes into quicker variants "in place".
1466 Finds fixed substrings. */
1469 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1470 to the position after last scanned or to NULL. */
1472 /* the return from this sub is the minimum length that could possibly match */
1474 Perl_study_chunk(pTHX_
1475 RExC_state_t *pRExC_state,
1476 regnode **scanp, /* Start here (read-write). */
1477 SSize_t *minlenp, /* used for the minlen of substrings? */
1478 SSize_t *deltap, /* Write maxlen-minlen here. */
1479 regnode *last, /* Stop before this one. */
1480 scan_data_t *data, /* string data about the pattern */
1481 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
1482 U32 recursed_depth, /* how deep have we recursed via GOSUB */
1483 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
1484 U32 flags, /* flags controlling this call, see SCF_ flags */
1485 U32 depth, /* how deep have we recursed period */
1486 bool was_mutate_ok /* true if in-place optimizations are allowed.
1487 false only if the caller (recursively) was
1488 prohibited from modifying the regops, because
1489 a higher caller is holding a ptr to them. */
1492 /* vars about the regnodes we are working with */
1493 regnode *scan = *scanp; /* the current opcode we are inspecting */
1494 regnode *next = NULL; /* the next opcode beyond scan, tmp var */
1495 regnode *first_non_open = scan; /* FIXME: should this init to NULL?
1496 the first non open regop, if the init
1497 val IS an OPEN then we will skip past
1498 it just after the var decls section */
1499 I32 code = 0; /* temp var used to hold the optype of a regop */
1501 /* vars about the min and max length of the pattern */
1502 SSize_t min = 0; /* min length of this part of the pattern */
1503 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
1504 this is adjusted down if we find
1506 SSize_t delta = 0; /* difference between min and max length
1507 (not accounting for stopmin) */
1509 /* vars about capture buffers in the pattern */
1510 I32 pars = 0; /* count of OPEN opcodes */
1511 I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
1513 /* vars about whether this pattern contains something that can match
1514 * infinitely long strings, eg, X* or X+ */
1515 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1516 int is_inf_internal = 0; /* The studied chunk is infinite */
1518 /* scan_data_t (struct) is used to hold information about the substrings
1519 * and start class we have extracted from the string */
1520 scan_data_t data_fake; /* temp var used for recursing in some cases */
1522 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
1523 trie optimizations */
1525 scan_frame *frame = NULL; /* used as part of fake recursion */
1527 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1529 PERL_ARGS_ASSERT_STUDY_CHUNK;
1530 RExC_study_started = 1;
1532 Zero(&data_fake, 1, scan_data_t);
1535 while (first_non_open && OP(first_non_open) == OPEN)
1536 first_non_open = regnext(first_non_open);
1541 RExC_study_chunk_recursed_count++;
1543 DEBUG_OPTIMISE_MORE_r(
1545 Perl_re_indentf( aTHX_ "study_chunk stopparen = %ld recursed_count = %lu "
1546 "depth = %lu recursed_depth = %lu scan = %p last = %p",
1547 depth, (long)stopparen,
1548 (unsigned long)RExC_study_chunk_recursed_count,
1549 (unsigned long)depth, (unsigned long)recursed_depth,
1552 if (recursed_depth) {
1555 for ( j = 0 ; j < recursed_depth ; j++ ) {
1556 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
1557 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
1558 Perl_re_printf( aTHX_ " %d",(int)i);
1562 if ( j + 1 < recursed_depth ) {
1563 Perl_re_printf( aTHX_ ",");
1567 Perl_re_printf( aTHX_ "\n");
1570 while ( scan && OP(scan) != END && scan < last ){
1571 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
1572 node length to get a real minimum (because
1573 the folded version may be shorter) */
1574 bool unfolded_multi_char = false;
1575 /* avoid mutating ops if we are anywhere within the recursed or
1576 * enframed handling for a GOSUB: the outermost level will handle it.
1578 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
1579 /* Peephole optimizer: */
1580 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
1581 DEBUG_PEEP("Peep", scan, depth, flags);
1584 /* The reason we do this here is that we need to deal with things like
1585 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
1586 * parsing code, as each (?:..) is handled by a different invocation of
1589 if (REGNODE_TYPE(OP(scan)) == EXACT
1590 && OP(scan) != LEXACT
1591 && OP(scan) != LEXACT_REQ8
1594 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
1595 0, NULL, depth + 1);
1598 /* Follow the next-chain of the current node and optimize
1599 away all the NOTHINGs from it.
1601 rck_elide_nothing(scan);
1603 /* The principal pseudo-switch. Cannot be a switch, since we look into
1604 * several different things. */
1605 if ( OP(scan) == DEFINEP ) {
1607 SSize_t deltanext = 0;
1608 SSize_t fake_last_close = 0;
1609 regnode *fake_last_close_op = NULL;
1610 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
1612 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1613 scan = regnext(scan);
1614 assert( OP(scan) == IFTHEN );
1615 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
1617 data_fake.last_closep= &fake_last_close;
1618 data_fake.last_close_opp= &fake_last_close_op;
1620 next = regnext(scan);
1621 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
1622 DEBUG_PEEP("scan", scan, depth, flags);
1623 DEBUG_PEEP("next", next, depth, flags);
1625 /* we suppose the run is continuous, last = next...
1626 * NOTE we dont use the return here! */
1627 /* DEFINEP study_chunk() recursion */
1628 (void)study_chunk(pRExC_state, &scan, &minlen,
1629 &deltanext, next, &data_fake, stopparen,
1630 recursed_depth, NULL, f, depth+1, mutate_ok);
1635 OP(scan) == BRANCH ||
1636 OP(scan) == BRANCHJ ||
1639 next = regnext(scan);
1642 /* The op(next)==code check below is to see if we
1643 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
1644 * IFTHEN is special as it might not appear in pairs.
1645 * Not sure whether BRANCH-BRANCHJ is possible, regardless
1646 * we dont handle it cleanly. */
1647 if (OP(next) == code || code == IFTHEN) {
1648 /* NOTE - There is similar code to this block below for
1649 * handling TRIE nodes on a re-study. If you change stuff here
1650 * check there too. */
1651 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
1653 regnode * const startbranch = scan;
1655 if (flags & SCF_DO_SUBSTR) {
1656 /* Cannot merge strings after this. */
1657 scan_commit(pRExC_state, data, minlenp, is_inf);
1660 if (flags & SCF_DO_STCLASS)
1661 ssc_init_zero(pRExC_state, &accum);
1663 while (OP(scan) == code) {
1664 SSize_t deltanext, minnext, fake_last_close = 0;
1665 regnode *fake_last_close_op = NULL;
1666 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
1667 regnode_ssc this_class;
1669 DEBUG_PEEP("Branch", scan, depth, flags);
1672 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1674 data_fake.whilem_c = data->whilem_c;
1675 data_fake.last_closep = data->last_closep;
1676 data_fake.last_close_opp = data->last_close_opp;
1679 data_fake.last_closep = &fake_last_close;
1680 data_fake.last_close_opp = &fake_last_close_op;
1683 data_fake.pos_delta = delta;
1684 next = regnext(scan);
1686 scan = REGNODE_AFTER_opcode(scan, code);
1688 if (flags & SCF_DO_STCLASS) {
1689 ssc_init(pRExC_state, &this_class);
1690 data_fake.start_class = &this_class;
1691 f |= SCF_DO_STCLASS_AND;
1693 if (flags & SCF_WHILEM_VISITED_POS)
1694 f |= SCF_WHILEM_VISITED_POS;
1696 /* we suppose the run is continuous, last = next...*/
1697 /* recurse study_chunk() for each BRANCH in an alternation */
1698 minnext = study_chunk(pRExC_state, &scan, minlenp,
1699 &deltanext, next, &data_fake, stopparen,
1700 recursed_depth, NULL, f, depth+1,
1705 if (deltanext == OPTIMIZE_INFTY) {
1706 is_inf = is_inf_internal = 1;
1707 max1 = OPTIMIZE_INFTY;
1708 } else if (max1 < minnext + deltanext)
1709 max1 = minnext + deltanext;
1711 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1713 if (data_fake.flags & SCF_SEEN_ACCEPT) {
1714 if ( stopmin > minnext)
1715 stopmin = min + min1;
1716 flags &= ~SCF_DO_SUBSTR;
1718 data->flags |= SCF_SEEN_ACCEPT;
1721 if (data_fake.flags & SF_HAS_EVAL)
1722 data->flags |= SF_HAS_EVAL;
1723 data->whilem_c = data_fake.whilem_c;
1725 if (flags & SCF_DO_STCLASS)
1726 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
1727 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
1729 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1731 if (flags & SCF_DO_SUBSTR) {
1732 data->pos_min += min1;
1733 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
1734 data->pos_delta = OPTIMIZE_INFTY;
1736 data->pos_delta += max1 - min1;
1737 if (max1 != min1 || is_inf)
1738 data->cur_is_floating = 1;
1741 if (delta == OPTIMIZE_INFTY
1742 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
1743 delta = OPTIMIZE_INFTY;
1745 delta += max1 - min1;
1746 if (flags & SCF_DO_STCLASS_OR) {
1747 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
1749 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
1750 flags &= ~SCF_DO_STCLASS;
1753 else if (flags & SCF_DO_STCLASS_AND) {
1755 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
1756 flags &= ~SCF_DO_STCLASS;
1759 /* Switch to OR mode: cache the old value of
1760 * data->start_class */
1762 StructCopy(data->start_class, and_withp, regnode_ssc);
1763 flags &= ~SCF_DO_STCLASS_AND;
1764 StructCopy(&accum, data->start_class, regnode_ssc);
1765 flags |= SCF_DO_STCLASS_OR;
1768 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
1770 if (PERL_ENABLE_TRIE_OPTIMISATION
1771 && OP(startbranch) == BRANCH
1776 Assuming this was/is a branch we are dealing with: 'scan'
1777 now points at the item that follows the branch sequence,
1778 whatever it is. We now start at the beginning of the
1779 sequence and look for subsequences of
1785 which would be constructed from a pattern like
1788 If we can find such a subsequence we need to turn the first
1789 element into a trie and then add the subsequent branch exact
1790 strings to the trie.
1794 1. patterns where the whole set of branches can be
1797 2. patterns where only a subset can be converted.
1799 In case 1 we can replace the whole set with a single regop
1800 for the trie. In case 2 we need to keep the start and end
1803 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1804 becomes BRANCH TRIE; BRANCH X;
1806 There is an additional case, that being where there is a
1807 common prefix, which gets split out into an EXACT like node
1808 preceding the TRIE node.
1810 If X(1..n)==tail then we can do a simple trie, if not we make
1811 a "jump" trie, such that when we match the appropriate word
1812 we "jump" to the appropriate tail node. Essentially we turn
1813 a nested if into a case structure of sorts.
1818 if (!re_trie_maxbuff) {
1819 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1820 if (!SvIOK(re_trie_maxbuff))
1821 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1823 if ( SvIV(re_trie_maxbuff)>=0 ) {
1825 regnode *first = (regnode *)NULL;
1826 regnode *prev = (regnode *)NULL;
1827 regnode *tail = scan;
1831 /* var tail is used because there may be a TAIL
1832 regop in the way. Ie, the exacts will point to the
1833 thing following the TAIL, but the last branch will
1834 point at the TAIL. So we advance tail. If we
1835 have nested (?:) we may have to move through several
1839 while ( OP( tail ) == TAIL ) {
1840 /* this is the TAIL generated by (?:) */
1841 tail = regnext( tail );
1845 DEBUG_TRIE_COMPILE_r({
1846 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
1847 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
1849 "Looking for TRIE'able sequences. Tail node is ",
1850 (UV) REGNODE_OFFSET(tail),
1851 SvPV_nolen_const( RExC_mysv )
1857 Step through the branches
1858 cur represents each branch,
1859 noper is the first thing to be matched as part
1861 noper_next is the regnext() of that node.
1863 We normally handle a case like this
1864 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
1865 support building with NOJUMPTRIE, which restricts
1866 the trie logic to structures like /FOO|BAR/.
1868 If noper is a trieable nodetype then the branch is
1869 a possible optimization target. If we are building
1870 under NOJUMPTRIE then we require that noper_next is
1871 the same as scan (our current position in the regex
1874 Once we have two or more consecutive such branches
1875 we can create a trie of the EXACT's contents and
1876 stitch it in place into the program.
1878 If the sequence represents all of the branches in
1879 the alternation we replace the entire thing with a
1882 Otherwise when it is a subsequence we need to
1883 stitch it in place and replace only the relevant
1884 branches. This means the first branch has to remain
1885 as it is used by the alternation logic, and its
1886 next pointer, and needs to be repointed at the item
1887 on the branch chain following the last branch we
1888 have optimized away.
1890 This could be either a BRANCH, in which case the
1891 subsequence is internal, or it could be the item
1892 following the branch sequence in which case the
1893 subsequence is at the end (which does not
1894 necessarily mean the first node is the start of the
1897 TRIE_TYPE(X) is a define which maps the optype to a
1901 ----------------+-----------
1906 EXACTFU_REQ8 | EXACTFU
1910 EXACTFLU8 | EXACTFLU8
1914 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
1916 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
1918 : ( EXACTFU == (X) \
1919 || EXACTFU_REQ8 == (X) \
1920 || EXACTFUP == (X) ) \
1922 : ( EXACTFAA == (X) ) \
1924 : ( EXACTL == (X) ) \
1926 : ( EXACTFLU8 == (X) ) \
1930 /* dont use tail as the end marker for this traverse */
1931 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1932 regnode * const noper = REGNODE_AFTER( cur );
1933 U8 noper_type = OP( noper );
1934 U8 noper_trietype = TRIE_TYPE( noper_type );
1935 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
1936 regnode * const noper_next = regnext( noper );
1937 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1938 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
1941 DEBUG_TRIE_COMPILE_r({
1942 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
1943 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
1945 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
1947 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
1948 Perl_re_printf( aTHX_ " -> %d:%s",
1949 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
1952 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
1953 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
1954 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
1956 Perl_re_printf( aTHX_ "(First == %d,Last == %d,Cur == %d,tt == %s,ntt == %s,nntt == %s)\n",
1957 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
1958 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
1962 /* Is noper a trieable nodetype that can be merged
1963 * with the current trie (if there is one)? */
1967 ( noper_trietype == NOTHING )
1968 || ( trietype == NOTHING )
1969 || ( trietype == noper_trietype )
1972 && noper_next >= tail
1976 /* Handle mergable triable node Either we are
1977 * the first node in a new trieable sequence,
1978 * in which case we do some bookkeeping,
1979 * otherwise we update the end pointer. */
1982 if ( noper_trietype == NOTHING ) {
1983 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
1984 regnode * const noper_next = regnext( noper );
1985 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1986 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
1989 if ( noper_next_trietype ) {
1990 trietype = noper_next_trietype;
1991 } else if (noper_next_type) {
1992 /* a NOTHING regop is 1 regop wide.
1993 * We need at least two for a trie
1994 * so we can't merge this in */
1998 trietype = noper_trietype;
2001 if ( trietype == NOTHING )
2002 trietype = noper_trietype;
2007 } /* end handle mergable triable node */
2009 /* handle unmergable node -
2010 * noper may either be a triable node which can
2011 * not be tried together with the current trie,
2012 * or a non triable node */
2014 /* If last is set and trietype is not
2015 * NOTHING then we have found at least two
2016 * triable branch sequences in a row of a
2017 * similar trietype so we can turn them
2018 * into a trie. If/when we allow NOTHING to
2019 * start a trie sequence this condition
2020 * will be required, and it isn't expensive
2021 * so we leave it in for now. */
2022 if ( trietype && trietype != NOTHING )
2023 make_trie( pRExC_state,
2024 startbranch, first, cur, tail,
2025 count, trietype, depth+1 );
2026 prev = NULL; /* note: we clear/update
2027 first, trietype etc below,
2028 so we dont do it here */
2032 && noper_next >= tail
2035 /* noper is triable, so we can start a new
2039 trietype = noper_trietype;
2041 /* if we already saw a first but the
2042 * current node is not triable then we have
2043 * to reset the first information. */
2048 } /* end handle unmergable node */
2049 } /* loop over branches */
2050 DEBUG_TRIE_COMPILE_r({
2051 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2052 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
2053 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2054 Perl_re_printf( aTHX_ "(First == %d, Last == %d, Cur == %d, tt == %s)\n",
2055 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2056 REGNODE_NAME(trietype)
2060 if ( prev && trietype ) {
2061 if ( trietype != NOTHING ) {
2062 /* the last branch of the sequence was part of
2063 * a trie, so we have to construct it here
2064 * outside of the loop */
2065 made = make_trie( pRExC_state, startbranch,
2066 first, scan, tail, count,
2067 trietype, depth+1 );
2068 #ifdef TRIE_STUDY_OPT
2069 if ( ((made == MADE_EXACT_TRIE &&
2070 startbranch == first)
2071 || ( first_non_open == first )) &&
2073 flags |= SCF_TRIE_RESTUDY;
2074 if ( startbranch == first
2077 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
2082 /* at this point we know whatever we have is a
2083 * NOTHING sequence/branch AND if 'startbranch'
2084 * is 'first' then we can turn the whole thing
2087 if ( startbranch == first ) {
2089 /* the entire thing is a NOTHING sequence,
2090 * something like this: (?:|) So we can
2091 * turn it into a plain NOTHING op. */
2092 DEBUG_TRIE_COMPILE_r({
2093 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2094 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
2096 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2099 OP(startbranch)= NOTHING;
2100 NEXT_OFF(startbranch)= tail - startbranch;
2101 for ( opt = startbranch + 1; opt < tail ; opt++ )
2105 } /* end if ( prev) */
2106 } /* TRIE_MAXBUF is non zero */
2108 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
2111 scan = REGNODE_AFTER_opcode(scan,code);
2113 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
2115 regnode *start = NULL;
2116 regnode *end = NULL;
2117 U32 my_recursed_depth = recursed_depth;
2119 if (OP(scan) != SUSPEND) { /* GOSUB */
2120 /* Do setup, note this code has side effects beyond
2121 * the rest of this block. Specifically setting
2122 * RExC_recurse[] must happen at least once during
2124 paren = ARG1u(scan);
2125 RExC_recurse[ARG2i(scan)] = scan;
2126 start = REGNODE_p(RExC_open_parens[paren]);
2127 end = REGNODE_p(RExC_close_parens[paren]);
2129 /* NOTE we MUST always execute the above code, even
2130 * if we do nothing with a GOSUB */
2132 ( flags & SCF_IN_DEFINE )
2135 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
2137 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
2140 /* no need to do anything here if we are in a define. */
2141 /* or we are after some kind of infinite construct
2142 * so we can skip recursing into this item.
2143 * Since it is infinite we will not change the maxlen
2144 * or delta, and if we miss something that might raise
2145 * the minlen it will merely pessimise a little.
2147 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
2148 * might result in a minlen of 1 and not of 4,
2149 * but this doesn't make us mismatch, just try a bit
2150 * harder than we should.
2152 * However we must assume this GOSUB is infinite, to
2153 * avoid wrongly applying other optimizations in the
2154 * enclosing scope - see GH 18096, for example.
2156 is_inf = is_inf_internal = 1;
2157 scan = regnext(scan);
2163 || !PAREN_TEST(recursed_depth - 1, paren)
2165 /* it is quite possible that there are more efficient ways
2166 * to do this. We maintain a bitmap per level of recursion
2167 * of which patterns we have entered so we can detect if a
2168 * pattern creates a possible infinite loop. When we
2169 * recurse down a level we copy the previous levels bitmap
2170 * down. When we are at recursion level 0 we zero the top
2171 * level bitmap. It would be nice to implement a different
2172 * more efficient way of doing this. In particular the top
2173 * level bitmap may be unnecessary.
2175 if (!recursed_depth) {
2176 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
2178 Copy(PAREN_OFFSET(recursed_depth - 1),
2179 PAREN_OFFSET(recursed_depth),
2180 RExC_study_chunk_recursed_bytes, U8);
2182 /* we havent recursed into this paren yet, so recurse into it */
2183 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
2184 PAREN_SET(recursed_depth, paren);
2185 my_recursed_depth = recursed_depth + 1;
2187 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
2188 /* some form of infinite recursion, assume infinite length
2190 if (flags & SCF_DO_SUBSTR) {
2191 scan_commit(pRExC_state, data, minlenp, is_inf);
2192 data->cur_is_floating = 1;
2194 is_inf = is_inf_internal = 1;
2195 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2196 ssc_anything(data->start_class);
2197 flags &= ~SCF_DO_STCLASS;
2199 start = NULL; /* reset start so we dont recurse later on. */
2204 end = regnext(scan);
2207 scan_frame *newframe;
2209 if (!RExC_frame_last) {
2210 Newxz(newframe, 1, scan_frame);
2211 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
2212 RExC_frame_head = newframe;
2214 } else if (!RExC_frame_last->next_frame) {
2215 Newxz(newframe, 1, scan_frame);
2216 RExC_frame_last->next_frame= newframe;
2217 newframe->prev_frame= RExC_frame_last;
2220 newframe = RExC_frame_last->next_frame;
2222 RExC_frame_last = newframe;
2224 newframe->next_regnode = regnext(scan);
2225 newframe->last_regnode = last;
2226 newframe->stopparen = stopparen;
2227 newframe->prev_recursed_depth = recursed_depth;
2228 newframe->this_prev_frame= frame;
2229 newframe->in_gosub = (
2230 (frame && frame->in_gosub) || OP(scan) == GOSUB
2233 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
2234 DEBUG_PEEP("fnew", scan, depth, flags);
2241 recursed_depth = my_recursed_depth;
2246 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
2247 SSize_t bytelen = STR_LEN(scan), charlen;
2251 const U8 * const s = (U8*)STRING(scan);
2252 uc = utf8_to_uv_or_die(s, s + bytelen, NULL);
2253 charlen = utf8_length(s, s + bytelen);
2255 uc = *((U8*)STRING(scan));
2259 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2260 /* The code below prefers earlier match for fixed
2261 offset, later match for variable offset. */
2262 if (data->last_end == -1) { /* Update the start info. */
2263 data->last_start_min = data->pos_min;
2264 data->last_start_max =
2265 is_inf ? OPTIMIZE_INFTY
2266 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
2267 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
2269 sv_catpvn(data->last_found, STRING(scan), bytelen);
2271 SvUTF8_on(data->last_found);
2273 SV * const sv = data->last_found;
2274 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2275 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2276 if (mg && mg->mg_len >= 0)
2277 mg->mg_len += charlen;
2279 data->last_end = data->pos_min + charlen;
2280 data->pos_min += charlen; /* As in the first entry. */
2281 data->flags &= ~SF_BEFORE_EOL;
2284 /* ANDing the code point leaves at most it, and not in locale, and
2285 * can't match null string */
2286 if (flags & SCF_DO_STCLASS_AND) {
2287 ssc_cp_and(data->start_class, uc);
2288 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2289 ssc_clear_locale(data->start_class);
2291 else if (flags & SCF_DO_STCLASS_OR) {
2292 ssc_add_cp(data->start_class, uc);
2293 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2295 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2296 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2298 flags &= ~SCF_DO_STCLASS;
2299 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
2301 else if (REGNODE_TYPE(OP(scan)) == EXACT) {
2302 /* But OP != EXACT!, so is EXACTFish */
2303 SSize_t bytelen = STR_LEN(scan), charlen;
2304 const U8 * s = (U8*)STRING(scan);
2306 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
2307 * with the mask set to the complement of the bit that differs
2308 * between upper and lower case, and the lowest code point of the
2309 * pair (which the '&' forces) */
2312 && ( OP(scan) == EXACTFAA
2313 || ( OP(scan) == EXACTFU
2314 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
2317 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
2320 ARG1u_SET(scan, *s & mask);
2322 /* We're not EXACTFish any more, so restudy.
2323 * Search for "restudy" in this file to find
2324 * a comment with details. */
2328 /* Search for fixed substrings supports EXACT only. */
2329 if (flags & SCF_DO_SUBSTR) {
2331 scan_commit(pRExC_state, data, minlenp, is_inf);
2333 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
2334 if (unfolded_multi_char) {
2335 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
2337 min += charlen - min_subtract;
2339 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2340 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2342 delta += min_subtract;
2344 delta = OPTIMIZE_INFTY;
2346 if (flags & SCF_DO_SUBSTR) {
2347 data->pos_min += charlen - min_subtract;
2348 if (data->pos_min < 0) {
2351 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2352 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2354 data->pos_delta += min_subtract;
2356 data->pos_delta = OPTIMIZE_INFTY;
2359 data->cur_is_floating = 1; /* float */
2363 if (flags & SCF_DO_STCLASS) {
2364 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
2366 assert(EXACTF_invlist);
2367 if (flags & SCF_DO_STCLASS_AND) {
2368 if (OP(scan) != EXACTFL)
2369 ssc_clear_locale(data->start_class);
2370 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2371 ANYOF_POSIXL_ZERO(data->start_class);
2372 ssc_intersection(data->start_class, EXACTF_invlist, false);
2374 else { /* SCF_DO_STCLASS_OR */
2375 ssc_union(data->start_class, EXACTF_invlist, false);
2376 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2378 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2379 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2381 flags &= ~SCF_DO_STCLASS;
2382 SvREFCNT_dec(EXACTF_invlist);
2384 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
2386 else if (REGNODE_VARIES(OP(scan))) {
2387 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
2390 regnode * const oscan = scan;
2391 regnode_ssc this_class;
2392 regnode_ssc *oclass = NULL;
2393 I32 next_is_eval = 0;
2395 switch (REGNODE_TYPE(OP(scan))) {
2396 case WHILEM: /* End of (?:...)* . */
2397 scan = REGNODE_AFTER(scan);
2400 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2401 next = REGNODE_AFTER(scan);
2402 if ( ( REGNODE_TYPE(OP(next)) == EXACT
2403 && ! isEXACTFish(OP(next)))
2404 || (flags & SCF_DO_STCLASS))
2407 maxcount = REG_INFTY;
2408 next = regnext(scan);
2409 scan = REGNODE_AFTER(scan);
2413 if (flags & SCF_DO_SUBSTR)
2415 /* This will bypass the formal 'min += minnext * mincount'
2416 * calculation in the do_curly path, so assumes min width
2417 * of the PLUS payload is exactly one. */
2421 next = REGNODE_AFTER(scan);
2423 /* This temporary node can now be turned into EXACTFU, and
2424 * must, as regexec.c doesn't handle it */
2425 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
2429 if ( STR_LEN(next) == 1
2430 && isALPHA_A(* STRING(next))
2431 && ( OP(next) == EXACTFAA
2432 || ( OP(next) == EXACTFU
2433 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
2436 /* These differ in just one bit */
2437 U8 mask = ~ ('A' ^ 'a');
2439 assert(isALPHA_A(* STRING(next)));
2441 /* Then replace it by an ANYOFM node, with
2442 * the mask set to the complement of the
2443 * bit that differs between upper and lower
2444 * case, and the lowest code point of the
2445 * pair (which the '&' forces) */
2447 ARG1u_SET(next, *STRING(next) & mask);
2451 if (flags & SCF_DO_STCLASS) {
2453 maxcount = REG_INFTY;
2454 next = regnext(scan);
2455 scan = REGNODE_AFTER(scan);
2458 if (flags & SCF_DO_SUBSTR) {
2459 scan_commit(pRExC_state, data, minlenp, is_inf);
2460 /* Cannot extend fixed substrings */
2461 data->cur_is_floating = 1; /* float */
2463 is_inf = is_inf_internal = 1;
2464 scan = regnext(scan);
2465 goto optimize_curly_tail;
2467 if (stopparen > 0 && (OP(scan) == CURLYN || OP(scan) == CURLYM)
2468 && (FLAGS(scan) == stopparen))
2473 mincount = ARG1i(scan);
2474 maxcount = ARG2i(scan);
2476 next = regnext(scan);
2477 if (OP(scan) == CURLYX) {
2478 I32 lp = (data ? *(data->last_closep) : 0);
2479 FLAGS(scan) = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2481 scan = REGNODE_AFTER(scan);
2482 next_is_eval = (OP(scan) == EVAL);
2484 if (flags & SCF_DO_SUBSTR) {
2486 scan_commit(pRExC_state, data, minlenp, is_inf);
2487 /* Cannot extend fixed substrings */
2488 pos_before = data->pos_min;
2492 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2494 data->flags |= SF_IS_INF;
2496 if (flags & SCF_DO_STCLASS) {
2497 ssc_init(pRExC_state, &this_class);
2498 oclass = data->start_class;
2499 data->start_class = &this_class;
2500 f |= SCF_DO_STCLASS_AND;
2501 f &= ~SCF_DO_STCLASS_OR;
2503 /* Exclude from super-linear cache processing any {n,m}
2504 regops for which the combination of input pos and regex
2505 pos is not enough information to determine if a match
2508 For example, in the regex /foo(bar\s*){4,8}baz/ with the
2509 regex pos at the \s*, the prospects for a match depend not
2510 only on the input position but also on how many (bar\s*)
2511 repeats into the {4,8} we are. */
2512 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
2513 f &= ~SCF_WHILEM_VISITED_POS;
2515 /* This will finish on WHILEM, setting scan, or on NULL: */
2516 /* recurse study_chunk() on loop bodies */
2517 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2518 last, data, stopparen, recursed_depth, NULL,
2520 ? (f & ~SCF_DO_SUBSTR)
2522 , depth+1, mutate_ok);
2524 if (data && data->flags & SCF_SEEN_ACCEPT) {
2529 if (flags & SCF_DO_STCLASS)
2530 data->start_class = oclass;
2531 if (mincount == 0 || minnext == 0) {
2532 if (flags & SCF_DO_STCLASS_OR) {
2533 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2535 else if (flags & SCF_DO_STCLASS_AND) {
2536 /* Switch to OR mode: cache the old value of
2537 * data->start_class */
2539 StructCopy(data->start_class, and_withp, regnode_ssc);
2540 flags &= ~SCF_DO_STCLASS_AND;
2541 StructCopy(&this_class, data->start_class, regnode_ssc);
2542 flags |= SCF_DO_STCLASS_OR;
2543 ANYOF_FLAGS(data->start_class)
2544 |= SSC_MATCHES_EMPTY_STRING;
2546 } else { /* Non-zero len */
2547 if (flags & SCF_DO_STCLASS_OR) {
2548 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2549 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2551 else if (flags & SCF_DO_STCLASS_AND)
2552 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2553 flags &= ~SCF_DO_STCLASS;
2555 if (!scan) /* It was not CURLYX, but CURLY. */
2557 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
2558 /* ? quantifier ok, except for (?{ ... }) */
2559 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2560 && (minnext == 0) && (deltanext == 0)
2561 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2562 && maxcount <= REG_INFTY/3) /* Complement check for big
2565 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
2566 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
2567 "Quantifier unexpected on zero-length expression "
2568 "in regex m/%" UTF8f "/",
2569 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
2573 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
2574 || min >= SSize_t_MAX - minnext * mincount )
2576 FAIL("Regexp out of space");
2579 min += minnext * mincount;
2580 is_inf_internal |= deltanext == OPTIMIZE_INFTY
2581 || (maxcount == REG_INFTY && minnext + deltanext > 0);
2582 is_inf |= is_inf_internal;
2584 delta = OPTIMIZE_INFTY;
2586 delta += (minnext + deltanext) * maxcount
2587 - minnext * mincount;
2590 if (data && data->flags & SCF_SEEN_ACCEPT) {
2591 if (flags & SCF_DO_SUBSTR) {
2592 scan_commit(pRExC_state, data, minlenp, is_inf);
2593 flags &= ~SCF_DO_SUBSTR;
2597 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
2599 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2600 /* Try powerful optimization CURLYX => CURLYN. */
2601 if ( RE_OPTIMIZE_CURLYX_TO_CURLYN
2602 && OP(oscan) == CURLYX
2604 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2605 non optimistic eval is seen
2607 && ( data->flags & SF_IN_PAR ) /* has parens */
2612 DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2613 /* Try to optimize to CURLYN. */
2614 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
2615 regnode * const nxt1 = nxt;
2621 if (!REGNODE_SIMPLE(OP(nxt))
2622 && !(REGNODE_TYPE(OP(nxt)) == EXACT
2623 && STR_LEN(nxt) == 1))
2629 if (OP(nxt) != CLOSE)
2631 if (RExC_open_parens) {
2634 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2637 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
2639 /* Now we know that nxt2 is the only contents: */
2640 FLAGS(oscan) = (U8)PARNO(nxt);
2642 OP(nxt1) = NOTHING; /* was OPEN. */
2645 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2646 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
2647 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
2648 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2649 OP(nxt + 1) = OPTIMIZED; /* was count. */
2650 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
2655 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2657 /* Try optimization CURLYX => CURLYM. */
2658 if ( RE_OPTIMIZE_CURLYX_TO_CURLYM
2659 && OP(oscan) == CURLYX
2661 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2662 non optimistic eval is seen
2664 && !(data->flags & SF_HAS_PAR) /* no parens! */
2665 && !deltanext /* atom is fixed width */
2666 && minnext != 0 /* CURLYM can't handle zero width */
2667 /* Nor characters whose fold at run-time may be
2668 * multi-character */
2669 && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN)
2672 DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2673 /* XXXX How to optimize if data == 0? */
2674 /* Optimize to a simpler form. */
2675 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
2679 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2680 && (OP(nxt2) != WHILEM))
2682 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2683 /* Need to optimize away parenths. */
2684 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
2685 /* Set the parenth number. */
2686 /* note that we have changed the type of oscan to CURLYM here */
2687 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
2689 FLAGS(oscan) = (U8)PARNO(nxt);
2690 if (RExC_open_parens) {
2692 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2695 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
2698 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2699 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2702 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2703 OP(nxt + 1) = OPTIMIZED; /* was count. */
2704 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
2705 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
2708 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2709 regnode *nnxt = regnext(nxt1);
2711 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
2712 ARG1u_SET(nxt1, nxt2 - nxt1);
2713 else if (nxt2 - nxt1 < U16_MAX)
2714 NEXT_OFF(nxt1) = nxt2 - nxt1;
2716 OP(nxt) = NOTHING; /* Cannot beautify */
2721 /* Optimize again: */
2722 /* recurse study_chunk() on optimised CURLYX => CURLYM */
2723 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2724 NULL, stopparen, recursed_depth, NULL, 0,
2725 depth+1, mutate_ok);
2730 else if ((OP(oscan) == CURLYX)
2731 && (flags & SCF_WHILEM_VISITED_POS)
2732 /* See the comment on a similar expression above.
2733 However, this time it's not a subexpression
2734 we care about, but the expression itself. */
2735 && (maxcount == REG_INFTY)
2737 /* This stays as CURLYX, we can put the count/of pair. */
2738 /* Find WHILEM (as in regexec.c) */
2739 regnode *nxt = oscan + NEXT_OFF(oscan);
2741 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
2743 nxt = REGNODE_BEFORE(nxt);
2744 if (FLAGS(nxt) & 0xf) {
2745 /* we've already set whilem count on this node */
2746 } else if (++data->whilem_c < 16) {
2747 assert(data->whilem_c <= RExC_whilem_seen);
2748 FLAGS(nxt) = (U8)(data->whilem_c
2749 | (RExC_whilem_seen << 4)); /* On WHILEM */
2752 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2754 if (flags & SCF_DO_SUBSTR) {
2755 SV *last_str = NULL;
2756 STRLEN last_chrs = 0;
2757 int counted = mincount != 0;
2759 if (data->last_end > 0 && mincount != 0) { /* Ends with a
2761 SSize_t b = pos_before >= data->last_start_min
2762 ? pos_before : data->last_start_min;
2764 const char * const s = SvPV_const(data->last_found, l);
2765 SSize_t old = b - data->last_start_min;
2769 old = utf8_hop_forward((U8*)s, old,
2770 (U8 *) SvEND(data->last_found))
2773 /* Get the added string: */
2774 last_str = newSVpvn_utf8(s + old, l, UTF);
2775 last_chrs = UTF ? utf8_length((U8*)(s + old),
2776 (U8*)(s + old + l)) : l;
2777 if (deltanext == 0 && pos_before == b) {
2778 /* What was added is a constant string */
2781 SvGROW(last_str, (mincount * l) + 1);
2782 repeatcpy(SvPVX(last_str) + l,
2783 SvPVX_const(last_str), l,
2785 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2786 /* Add additional parts. */
2787 SvCUR_set(data->last_found,
2788 SvCUR(data->last_found) - l);
2789 sv_catsv(data->last_found, last_str);
2791 SV * sv = data->last_found;
2793 SvUTF8(sv) && SvMAGICAL(sv) ?
2794 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2795 if (mg && mg->mg_len >= 0)
2796 mg->mg_len += last_chrs * (mincount-1);
2798 last_chrs *= mincount;
2799 data->last_end += l * (mincount - 1);
2802 /* start offset must point into the last copy */
2803 data->last_start_min += minnext * (mincount - 1);
2804 data->last_start_max =
2807 : data->last_start_max +
2808 (maxcount - 1) * (minnext + data->pos_delta);
2811 /* It is counted once already... */
2812 data->pos_min += minnext * (mincount - counted);
2814 Perl_re_printf( aTHX_ "counted = %" UVuf " deltanext = %" UVuf
2815 " OPTIMIZE_INFTY = %" UVuf " minnext = %" UVuf
2816 " maxcount = %" UVuf " mincount = %" UVuf
2817 " data->pos_delta = %" UVuf "\n",
2818 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
2819 (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
2820 if (deltanext != OPTIMIZE_INFTY)
2821 Perl_re_printf( aTHX_ "LHS = %" UVuf " RHS = %" UVuf "\n",
2822 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
2823 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
2825 if (deltanext == OPTIMIZE_INFTY
2826 || data->pos_delta == OPTIMIZE_INFTY
2827 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
2828 data->pos_delta = OPTIMIZE_INFTY;
2830 data->pos_delta += - counted * deltanext +
2831 (minnext + deltanext) * maxcount - minnext * mincount;
2832 if (mincount != maxcount) {
2833 /* Cannot extend fixed substrings found inside
2835 scan_commit(pRExC_state, data, minlenp, is_inf);
2836 if (mincount && last_str) {
2837 SV * const sv = data->last_found;
2838 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2839 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2843 sv_setsv(sv, last_str);
2844 data->last_end = data->pos_min;
2845 data->last_start_min = data->pos_min - last_chrs;
2846 data->last_start_max = is_inf
2848 : data->pos_min + data->pos_delta - last_chrs;
2850 data->cur_is_floating = 1; /* float */
2852 SvREFCNT_dec(last_str);
2854 if (data && (fl & SF_HAS_EVAL))
2855 data->flags |= SF_HAS_EVAL;
2856 optimize_curly_tail:
2857 rck_elide_nothing(oscan);
2861 croak("panic: unexpected varying REx opcode %d",
2865 if (flags & SCF_DO_SUBSTR) {
2866 /* Cannot expect anything... */
2867 scan_commit(pRExC_state, data, minlenp, is_inf);
2868 data->cur_is_floating = 1; /* float */
2870 is_inf = is_inf_internal = 1;
2871 if (flags & SCF_DO_STCLASS_OR) {
2872 if (OP(scan) == CLUMP) {
2873 /* Actually is any start char, but very few code points
2874 * aren't start characters */
2875 ssc_match_all_cp(data->start_class);
2878 ssc_anything(data->start_class);
2881 flags &= ~SCF_DO_STCLASS;
2885 else if (OP(scan) == LNBREAK) {
2886 if (flags & SCF_DO_STCLASS) {
2887 if (flags & SCF_DO_STCLASS_AND) {
2888 ssc_intersection(data->start_class,
2889 PL_XPosix_ptrs[CC_VERTSPACE_], false);
2890 ssc_clear_locale(data->start_class);
2891 ANYOF_FLAGS(data->start_class)
2892 &= ~SSC_MATCHES_EMPTY_STRING;
2894 else if (flags & SCF_DO_STCLASS_OR) {
2895 ssc_union(data->start_class,
2896 PL_XPosix_ptrs[CC_VERTSPACE_],
2898 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2900 /* See commit msg for
2901 * 749e076fceedeb708a624933726e7989f2302f6a */
2902 ANYOF_FLAGS(data->start_class)
2903 &= ~SSC_MATCHES_EMPTY_STRING;
2905 flags &= ~SCF_DO_STCLASS;
2908 if (delta != OPTIMIZE_INFTY)
2909 delta++; /* Because of the 2 char string cr-lf */
2910 if (flags & SCF_DO_SUBSTR) {
2911 /* Cannot expect anything... */
2912 scan_commit(pRExC_state, data, minlenp, is_inf);
2914 if (data->pos_delta != OPTIMIZE_INFTY) {
2915 data->pos_delta += 1;
2917 data->cur_is_floating = 1; /* float */
2920 else if (REGNODE_SIMPLE(OP(scan))) {
2922 if (flags & SCF_DO_SUBSTR) {
2923 scan_commit(pRExC_state, data, minlenp, is_inf);
2927 if (flags & SCF_DO_STCLASS) {
2929 SV* my_invlist = NULL;
2932 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2933 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2935 /* Some of the logic below assumes that switching
2936 locale on will only add false positives. */
2941 croak("panic: unexpected simple REx opcode %d",
2945 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2946 ssc_match_all_cp(data->start_class);
2951 SV* REG_ANY_invlist = _new_invlist(2);
2952 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
2954 if (flags & SCF_DO_STCLASS_OR) {
2955 ssc_union(data->start_class,
2957 true /* true => invert, hence all but \n
2961 else if (flags & SCF_DO_STCLASS_AND) {
2962 ssc_intersection(data->start_class,
2964 true /* true => invert */
2966 ssc_clear_locale(data->start_class);
2968 SvREFCNT_dec_NN(REG_ANY_invlist);
2980 if (flags & SCF_DO_STCLASS_AND)
2981 ssc_and(pRExC_state, data->start_class,
2982 (regnode_charclass *) scan);
2984 ssc_or(pRExC_state, data->start_class,
2985 (regnode_charclass *) scan);
2990 SV* cp_list = get_ANYOFHbbm_contents(scan);
2992 if (flags & SCF_DO_STCLASS_OR) {
2993 ssc_union(data->start_class, cp_list, invert);
2995 else if (flags & SCF_DO_STCLASS_AND) {
2996 ssc_intersection(data->start_class, cp_list, invert);
2999 SvREFCNT_dec_NN(cp_list);
3003 case NANYOFM: /* NANYOFM already contains the inversion of the
3004 input ANYOF data, so, unlike things like
3005 NPOSIXA, don't change 'invert' to true */
3009 SV* cp_list = get_ANYOFM_contents(scan);
3011 if (flags & SCF_DO_STCLASS_OR) {
3012 ssc_union(data->start_class, cp_list, invert);
3014 else if (flags & SCF_DO_STCLASS_AND) {
3015 ssc_intersection(data->start_class, cp_list, invert);
3018 SvREFCNT_dec_NN(cp_list);
3027 cp_list = _add_range_to_invlist(cp_list,
3029 ANYOFRbase(scan) + ANYOFRdelta(scan));
3031 if (flags & SCF_DO_STCLASS_OR) {
3032 ssc_union(data->start_class, cp_list, invert);
3034 else if (flags & SCF_DO_STCLASS_AND) {
3035 ssc_intersection(data->start_class, cp_list, invert);
3038 SvREFCNT_dec_NN(cp_list);
3047 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
3048 if (flags & SCF_DO_STCLASS_AND) {
3049 bool was_there = cBOOL(
3050 ANYOF_POSIXL_TEST(data->start_class,
3052 ANYOF_POSIXL_ZERO(data->start_class);
3053 if (was_there) { /* Do an AND */
3054 ANYOF_POSIXL_SET(data->start_class, namedclass);
3056 /* No individual code points can now match */
3057 data->start_class->invlist
3058 = sv_2mortal(_new_invlist(0));
3061 int complement = namedclass + ((invert) ? -1 : 1);
3063 assert(flags & SCF_DO_STCLASS_OR);
3065 /* If the complement of this class was already there,
3066 * the result is that they match all code points,
3067 * (\d + \D == everything). Remove the classes from
3068 * future consideration. Locale is not relevant in
3070 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
3071 ssc_match_all_cp(data->start_class);
3072 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
3073 ANYOF_POSIXL_CLEAR(data->start_class, complement);
3075 else { /* The usual case; just add this class to the
3077 ANYOF_POSIXL_SET(data->start_class, namedclass);
3082 case NPOSIXA: /* For these, we always know the exact set of
3087 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
3088 goto join_posix_and_ascii;
3096 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
3098 /* NPOSIXD matches all upper Latin1 code points unless the
3099 * target string being matched is UTF-8, which is
3100 * unknowable until match time. Since we are going to
3101 * invert, we want to get rid of all of them so that the
3102 * inversion will match all */
3103 if (OP(scan) == NPOSIXD) {
3104 _invlist_subtract(my_invlist, PL_UpperLatin1,
3108 join_posix_and_ascii:
3110 if (flags & SCF_DO_STCLASS_AND) {
3111 ssc_intersection(data->start_class, my_invlist, invert);
3112 ssc_clear_locale(data->start_class);
3115 assert(flags & SCF_DO_STCLASS_OR);
3116 ssc_union(data->start_class, my_invlist, invert);
3118 SvREFCNT_dec(my_invlist);
3120 if (flags & SCF_DO_STCLASS_OR)
3121 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3122 flags &= ~SCF_DO_STCLASS;
3125 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
3126 data->flags |= (OP(scan) == MEOL
3129 scan_commit(pRExC_state, data, minlenp, is_inf);
3132 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
3133 /* Lookbehind, or need to calculate parens/evals/stclass: */
3134 && (FLAGS(scan) || data || (flags & SCF_DO_STCLASS))
3135 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
3137 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3138 || OP(scan) == UNLESSM )
3140 /* Negative Lookahead/lookbehind
3141 In this case we can't do fixed string optimisation.
3144 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
3145 SSize_t deltanext, minnext;
3146 SSize_t fake_last_close = 0;
3147 regnode *fake_last_close_op = NULL;
3148 regnode *cur_last_close_op;
3151 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3153 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3155 data_fake.whilem_c = data->whilem_c;
3156 data_fake.last_closep = data->last_closep;
3157 data_fake.last_close_opp = data->last_close_opp;
3160 data_fake.last_closep = &fake_last_close;
3161 data_fake.last_close_opp = &fake_last_close_op;
3164 /* remember the last_close_op we saw so we can see if
3165 * we are dealing with variable length lookbehind that
3166 * contains capturing buffers, which are considered
3168 cur_last_close_op = *(data_fake.last_close_opp);
3170 data_fake.pos_delta = delta;
3171 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3172 && OP(scan) == IFMATCH ) { /* Lookahead */
3173 ssc_init(pRExC_state, &intrnl);
3174 data_fake.start_class = &intrnl;
3175 f |= SCF_DO_STCLASS_AND;
3177 if (flags & SCF_WHILEM_VISITED_POS)
3178 f |= SCF_WHILEM_VISITED_POS;
3179 next = regnext(scan);
3180 nscan = REGNODE_AFTER(scan);
3182 /* recurse study_chunk() for lookahead body */
3183 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3184 last, &data_fake, stopparen,
3185 recursed_depth, NULL, f, depth+1,
3190 || deltanext > (I32) U8_MAX
3191 || minnext > (I32)U8_MAX
3192 || minnext + deltanext > (I32)U8_MAX)
3194 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3198 /* The 'next_off' field has been repurposed to count the
3199 * additional starting positions to try beyond the initial
3200 * one. (This leaves it at 0 for non-variable length
3201 * matches to avoid breakage for those not using this
3204 NEXT_OFF(scan) = deltanext;
3206 /* See a CLOSE op inside this lookbehind? */
3207 cur_last_close_op != *(data_fake.last_close_opp)
3208 /* and not doing restudy. see: restudied */
3209 && !(flags & SCF_TRIE_DOING_RESTUDY)
3211 /* this is positive variable length lookbehind with
3212 * capture buffers inside of it */
3213 ckWARNexperimental_with_arg(RExC_parse,
3214 WARN_EXPERIMENTAL__VLB,
3215 "Variable length %s lookbehind with capturing is experimental",
3216 is_positive ? "positive" : "negative");
3219 FLAGS(scan) = (U8)minnext + deltanext;
3222 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3224 if (data_fake.flags & SF_HAS_EVAL)
3225 data->flags |= SF_HAS_EVAL;
3226 data->whilem_c = data_fake.whilem_c;
3228 if (f & SCF_DO_STCLASS_AND) {
3229 if (flags & SCF_DO_STCLASS_OR) {
3230 /* OR before, AND after: ideally we would recurse with
3231 * data_fake to get the AND applied by study of the
3232 * remainder of the pattern, and then derecurse;
3233 * *** HACK *** for now just treat as "no information".
3234 * See [perl #56690].
3236 ssc_init(pRExC_state, data->start_class);
3238 /* AND before and after: combine and continue. These
3239 * assertions are zero-length, so can match an EMPTY
3241 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3242 ANYOF_FLAGS(data->start_class)
3243 |= SSC_MATCHES_EMPTY_STRING;
3246 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
3248 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3250 /* Positive Lookahead/lookbehind
3251 In this case we can do fixed string optimisation,
3252 but we must be careful about it. Note in the case of
3253 lookbehind the positions will be offset by the minimum
3254 length of the pattern, something we won't know about
3255 until after the recurse.
3257 SSize_t deltanext, fake_last_close = 0;
3258 regnode *last_close_op = NULL;
3261 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3262 /* We use SAVEFREEPV so that when the full compile
3263 is finished perl will clean up the allocated
3264 minlens when it's all done. This way we don't
3265 have to worry about freeing them when we know
3266 they wont be used, which would be a pain.
3269 Newx( minnextp, 1, SSize_t );
3270 SAVEFREEPV(minnextp);
3273 StructCopy(data, &data_fake, scan_data_t);
3274 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3277 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
3278 data_fake.last_found = newSVsv(data->last_found);
3282 data_fake.last_closep = &fake_last_close;
3283 data_fake.last_close_opp = &fake_last_close_opp;
3285 data_fake.flags = 0;
3286 data_fake.substrs[0].flags = 0;
3287 data_fake.substrs[1].flags = 0;
3288 data_fake.pos_delta = delta;
3290 data_fake.flags |= SF_IS_INF;
3291 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3292 && OP(scan) == IFMATCH ) { /* Lookahead */
3293 ssc_init(pRExC_state, &intrnl);
3294 data_fake.start_class = &intrnl;
3295 f |= SCF_DO_STCLASS_AND;
3297 if (flags & SCF_WHILEM_VISITED_POS)
3298 f |= SCF_WHILEM_VISITED_POS;
3299 next = regnext(scan);
3300 nscan = REGNODE_AFTER(scan);
3302 /* positive lookahead study_chunk() recursion */
3303 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
3304 &deltanext, last, &data_fake,
3305 stopparen, recursed_depth, NULL,
3306 f, depth+1, mutate_ok);
3308 assert(0); /* This code has never been tested since this
3309 is normally not compiled */
3311 || deltanext > (I32) U8_MAX
3312 || *minnextp > (I32)U8_MAX
3313 || *minnextp + deltanext > (I32)U8_MAX)
3315 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3320 NEXT_OFF(scan) = deltanext;
3322 FLAGS(scan) = (U8)*minnextp + deltanext;
3327 if (f & SCF_DO_STCLASS_AND) {
3328 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3329 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
3332 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3334 if (data_fake.flags & SF_HAS_EVAL)
3335 data->flags |= SF_HAS_EVAL;
3336 data->whilem_c = data_fake.whilem_c;
3337 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3339 if (RExC_rx->minlen < *minnextp)
3340 RExC_rx->minlen = *minnextp;
3341 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
3342 SvREFCNT_dec_NN(data_fake.last_found);
3344 for (i = 0; i < 2; i++) {
3345 if (data_fake.substrs[i].minlenp != minlenp) {
3346 data->substrs[i].min_offset =
3347 data_fake.substrs[i].min_offset;
3348 data->substrs[i].max_offset =
3349 data_fake.substrs[i].max_offset;
3350 data->substrs[i].minlenp =
3351 data_fake.substrs[i].minlenp;
3352 data->substrs[i].lookbehind += FLAGS(scan);
3360 else if (OP(scan) == OPEN) {
3361 if (stopparen != (I32)PARNO(scan))
3364 else if (OP(scan) == CLOSE) {
3365 if (stopparen == (I32)PARNO(scan)) {
3368 if ((I32)PARNO(scan) == is_par) {
3369 next = regnext(scan);
3371 if ( next && (OP(next) != WHILEM) && next < last)
3372 is_par = 0; /* Disable optimization */
3375 *(data->last_closep) = PARNO(scan);
3376 *(data->last_close_opp) = scan;
3379 else if (OP(scan) == EVAL) {
3380 if (data && !(FLAGS(scan) & EVAL_OPTIMISTIC_FLAG) )
3381 data->flags |= SF_HAS_EVAL;
3383 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
3384 if (flags & SCF_DO_SUBSTR) {
3385 scan_commit(pRExC_state, data, minlenp, is_inf);
3386 flags &= ~SCF_DO_SUBSTR;
3388 if (OP(scan)==ACCEPT) {
3389 /* m{(*ACCEPT)x} does not have to start with 'x' */
3390 flags &= ~SCF_DO_STCLASS;
3392 data->flags |= SCF_SEEN_ACCEPT;
3397 else if (OP(scan) == COMMIT) {
3398 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
3399 * must not end up with "abcxyz" as a fixed substring else we'll
3400 * skip straight to attempting to match at offset 4.
3402 if (flags & SCF_DO_SUBSTR) {
3403 scan_commit(pRExC_state, data, minlenp, is_inf);
3404 flags &= ~SCF_DO_SUBSTR;
3407 else if (OP(scan) == LOGICAL && FLAGS(scan) == 2) /* Embedded follows */
3409 if (flags & SCF_DO_SUBSTR) {
3410 scan_commit(pRExC_state, data, minlenp, is_inf);
3411 data->cur_is_floating = 1; /* float */
3413 is_inf = is_inf_internal = 1;
3414 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3415 ssc_anything(data->start_class);
3416 flags &= ~SCF_DO_STCLASS;
3418 else if (OP(scan) == GPOS) {
3419 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
3420 !(delta || is_inf || (data && data->pos_delta)))
3422 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
3423 RExC_rx->intflags |= PREGf_ANCH_GPOS;
3424 if (RExC_rx->gofs < (STRLEN)min)
3425 RExC_rx->gofs = min;
3427 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
3431 #ifdef TRIE_STUDY_OPT
3432 #ifdef FULL_TRIE_STUDY
3433 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3434 /* NOTE - There is similar code to this block above for handling
3435 BRANCH nodes on the initial study. If you change stuff here
3437 regnode *trie_node= scan;
3438 regnode *tail= regnext(scan);
3439 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3440 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
3443 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
3444 /* Cannot merge strings after this. */
3445 scan_commit(pRExC_state, data, minlenp, is_inf);
3447 if (flags & SCF_DO_STCLASS)
3448 ssc_init_zero(pRExC_state, &accum);
3451 min1 = trie->minlen;
3452 max1 = trie->maxlen;
3454 const regnode *nextbranch= NULL;
3457 for ( word = 1 ; word <= trie->wordcount ; word++)
3459 SSize_t deltanext = 0, minnext = 0;
3460 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3461 SSize_t fake_last_close = 0;
3462 regnode *fake_last_close_op = NULL;
3463 regnode_ssc this_class;
3465 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3467 data_fake.whilem_c = data->whilem_c;
3468 data_fake.last_closep = data->last_closep;
3469 data_fake.last_close_opp = data->last_close_opp;
3472 data_fake.last_closep = &fake_last_close;
3473 data_fake.last_close_opp = &fake_last_close_op;
3475 data_fake.pos_delta = delta;
3476 if (flags & SCF_DO_STCLASS) {
3477 ssc_init(pRExC_state, &this_class);
3478 data_fake.start_class = &this_class;
3479 f |= SCF_DO_STCLASS_AND;
3481 if (flags & SCF_WHILEM_VISITED_POS)
3482 f |= SCF_WHILEM_VISITED_POS;
3484 if (trie->jump[word]) {
3486 nextbranch = trie_node + trie->jump[0];
3487 scan = trie_node + trie->jump[word];
3488 /* We go from the jump point to the branch that follows
3489 it. Note this means we need the vestigal unused
3490 branches even though they arent otherwise used. */
3491 /* optimise study_chunk() for TRIE */
3492 minnext = study_chunk(pRExC_state, &scan, minlenp,
3493 &deltanext, (regnode *)nextbranch, &data_fake,
3494 stopparen, recursed_depth, NULL, f, depth+1,
3497 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
3498 nextbranch = regnext((regnode*)nextbranch);
3500 if (min1 > (SSize_t)(minnext + trie->minlen))
3501 min1 = minnext + trie->minlen;
3502 if (deltanext == OPTIMIZE_INFTY) {
3503 is_inf = is_inf_internal = 1;
3504 max1 = OPTIMIZE_INFTY;
3505 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
3506 max1 = minnext + deltanext + trie->maxlen;
3508 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3510 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3511 if ( stopmin > min + min1)
3512 stopmin = min + min1;
3513 flags &= ~SCF_DO_SUBSTR;
3515 data->flags |= SCF_SEEN_ACCEPT;
3518 if (data_fake.flags & SF_HAS_EVAL)
3519 data->flags |= SF_HAS_EVAL;
3520 data->whilem_c = data_fake.whilem_c;
3522 if (flags & SCF_DO_STCLASS)
3523 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
3525 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
3527 if (flags & SCF_DO_SUBSTR) {
3528 data->pos_min += min1;
3529 data->pos_delta += max1 - min1;
3530 if (max1 != min1 || is_inf)
3531 data->cur_is_floating = 1; /* float */
3534 if (delta != OPTIMIZE_INFTY) {
3535 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
3536 delta += max1 - min1;
3538 delta = OPTIMIZE_INFTY;
3540 if (flags & SCF_DO_STCLASS_OR) {
3541 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3543 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3544 flags &= ~SCF_DO_STCLASS;
3547 else if (flags & SCF_DO_STCLASS_AND) {
3549 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3550 flags &= ~SCF_DO_STCLASS;
3553 /* Switch to OR mode: cache the old value of
3554 * data->start_class */
3556 StructCopy(data->start_class, and_withp, regnode_ssc);
3557 flags &= ~SCF_DO_STCLASS_AND;
3558 StructCopy(&accum, data->start_class, regnode_ssc);
3559 flags |= SCF_DO_STCLASS_OR;
3563 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
3567 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3568 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3571 min += trie->minlen;
3572 delta += (trie->maxlen - trie->minlen);
3573 flags &= ~SCF_DO_STCLASS; /* xxx */
3574 if (flags & SCF_DO_SUBSTR) {
3575 /* Cannot expect anything... */
3576 scan_commit(pRExC_state, data, minlenp, is_inf);
3577 data->pos_min += trie->minlen;
3578 data->pos_delta += (trie->maxlen - trie->minlen);
3579 if (trie->maxlen != trie->minlen)
3580 data->cur_is_floating = 1; /* float */
3582 if (trie->jump) /* no more substrings -- for now /grr*/
3583 flags &= ~SCF_DO_SUBSTR;
3586 #endif /* old or new */
3587 #endif /* TRIE_STUDY_OPT */
3589 else if (OP(scan) == REGEX_SET) {
3590 croak("panic: %s regnode should be resolved"
3591 " before optimization", REGNODE_NAME(REGEX_SET));
3594 /* Else: zero-length, ignore. */
3595 scan = regnext(scan);
3600 /* we need to unwind recursion. */
3603 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
3604 DEBUG_PEEP("fend", scan, depth, flags);
3606 /* restore previous context */
3607 last = frame->last_regnode;
3608 scan = frame->next_regnode;
3609 stopparen = frame->stopparen;
3610 recursed_depth = frame->prev_recursed_depth;
3612 RExC_frame_last = frame->prev_frame;
3613 frame = frame->this_prev_frame;
3614 goto fake_study_recurse;
3618 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
3620 /* is this pattern infinite? Eg, consider /(a|b+)/ */
3621 if (is_inf_internal)
3622 delta = OPTIMIZE_INFTY;
3624 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
3625 if (min > stopmin) {
3627 At this point 'min' represents the minimum length string we can
3628 match while *ignoring* the implication of ACCEPT, and 'delta'
3629 represents the difference between the minimum length and maximum
3630 length, and if the pattern matches an infinitely long string
3631 (consider the + and * quantifiers) then we use the special delta
3632 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
3633 minimum length that can be matched *and* accepted.
3635 A pattern is accepted when matching was successful *and*
3636 complete, and thus there is no further matching needing to be
3637 done, no backtracking to occur, etc. Prior to the introduction
3638 of ACCEPT the only opcode that signaled acceptance was the END
3639 opcode, which is always the very last opcode in a regex program.
3640 ACCEPT is thus conceptually an early successful return out of
3641 the matching process. stopmin starts out as OPTIMIZE_INFTY to
3642 represent "the entire pattern", and is ratched down to the
3643 "current min" if necessary when an ACCEPT opcode is encountered.
3645 Thus stopmin might be smaller than min if we saw an (*ACCEPT),
3646 and we now need to account for it in both min and delta.
3647 Consider that in a pattern /AB/ normally the min length it can
3648 match can be computed as min(A)+min(B). But (*ACCEPT) means
3649 that it might be something else, not even neccesarily min(A) at
3652 A = /(foo(*ACCEPT)|x+)/
3654 AB = /(foo(*ACCEPT)|x+)whop/
3656 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
3657 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
3658 "whop", and the delta of 0 as the pattern is of fixed length, the
3659 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
3660 When handling AB we expect to see a min of 5 for "xwhop", and a
3661 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
3662 for "foo". This should result in a final min of 3 for "foo", and
3663 a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
3665 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
3666 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
3667 stop min would be 4 for "dude". This should result in a final
3668 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
3670 When min is smaller than stopmin then we can ignore it. In the
3671 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
3672 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
3673 the ACCEPT doesn't reduce the minimum length of the string that
3674 might be matched, nor affect the maximum length.
3676 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
3677 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
3678 "foo". We currently turn this into a min of 3 for "foo" and a
3679 delta of 3 for "foobar" even though technically "foobar" isn't
3680 possible. ACCEPT affects some aspects of the optimizer, like
3681 length computations and mandatory substring optimizations, but
3682 there are other optimzations this routine perfoms that are not
3683 affected and this compromise simplifies implementation.
3685 It might be helpful to consider that this C function is called
3686 recursively on the pattern in a bottom up fashion, and that the
3687 min returned by a nested call may be marked as coming from an
3688 ACCEPT, causing its callers to treat the returned min as a
3689 stopmin as the recursion unwinds. Thus a single ACCEPT can affect
3690 multiple calls into this function in different ways.
3693 if (OPTIMIZE_INFTY - delta >= min - stopmin)
3694 delta += min - stopmin;
3696 delta = OPTIMIZE_INFTY;
3703 if (flags & SCF_DO_SUBSTR && is_inf)
3704 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
3705 if (is_par > (I32)U8_MAX)
3707 if (is_par && pars == 1 && data) {
3708 data->flags |= SF_IN_PAR;
3709 data->flags &= ~SF_HAS_PAR;
3711 else if (pars && data) {
3712 data->flags |= SF_HAS_PAR;
3713 data->flags &= ~SF_IN_PAR;
3715 if (flags & SCF_DO_STCLASS_OR)
3716 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3717 if (flags & SCF_TRIE_RESTUDY)
3718 data->flags |= SCF_TRIE_RESTUDY;
3721 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
3722 if (min > OPTIMIZE_INFTY - delta)
3723 RExC_maxlen = OPTIMIZE_INFTY;
3724 else if (RExC_maxlen < min + delta)
3725 RExC_maxlen = min + delta;
3727 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);