regcomp_study.c

   1 #ifdef PERL_EXT_RE_BUILD
   2 #include "re_top.h"
   3 #endif
   4
   5 #include "EXTERN.h"
   6 #define PERL_IN_REGEX_ENGINE
   7 #define PERL_IN_REGCOMP_ANY
   8 #define PERL_IN_REGCOMP_STUDY_C
   9 #include "perl.h"
  10
  11 #ifdef PERL_IN_XSUB_RE
  12 #  include "re_comp.h"
  13 #else
  14 #  include "regcomp.h"
  15 #endif
  16
  17 #include "invlist_inline.h"
  18 #include "unicode_constants.h"
  19 #include "regcomp_internal.h"
  20
  21 #define INIT_AND_WITHP \
  22     assert(!and_withp); \
  23     Newx(and_withp, 1, regnode_ssc); \
  24     SAVEFREEPV(and_withp)
  25
  26
  27 STATIC void
  28 S_unwind_scan_frames(pTHX_ const void *p)
  29 {
  30     PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
  31     scan_frame *f= (scan_frame *)p;
  32     do {
  33         scan_frame *n= f->next_frame;
  34         Safefree(f);
  35         f = n;
  36     } while (f);
  37 }
  38
  39 /* Follow the next-chain of the current node and optimize away
  40    all the NOTHINGs from it.
  41  */
  42 STATIC void
  43 S_rck_elide_nothing(pTHX_ regnode *node)
  44 {
  45     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
  46
  47     if (OP(node) != CURLYX) {
  48         const int max = (REGNODE_OFF_BY_ARG(OP(node))
  49                         ? I32_MAX
  50                           /* I32 may be smaller than U16 on CRAYs! */
  51                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
  52         int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG1u(node) : NEXT_OFF(node));
  53         int noff;
  54         regnode *n = node;
  55
  56         /* Skip NOTHING and LONGJMP. */
  57         while (
  58             (n = regnext(n))
  59             && (
  60                 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
  61                 || ((OP(n) == LONGJMP) && (noff = ARG1u(n)))
  62             )
  63             && off + noff < max
  64         ) {
  65             off += noff;
  66         }
  67         if (REGNODE_OFF_BY_ARG(OP(node)))
  68             ARG1u(node) = off;
  69         else
  70             NEXT_OFF(node) = off;
  71     }
  72     return;
  73 }
  74
  75
  76 /*
  77  * As best we can, determine the characters that can match the start of
  78  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
  79  * can be false positive matches
  80  *
  81  * Returns the invlist as a new SV*; it is the caller's responsibility to
  82  * call SvREFCNT_dec() when done with it.
  83  */
  84 STATIC SV*
  85 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
  86 {
  87     const U8 * s = (U8*)STRING(node);
  88     SSize_t bytelen = STR_LEN(node);
  89     UV uc;
  90     /* Start out big enough for 2 separate code points */
  91     SV* invlist = _new_invlist(4);
  92
  93     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
  94
  95     if (! UTF) {
  96         uc = *s;
  97
  98         /* We punt and assume can match anything if the node begins
  99          * with a multi-character fold.  Things are complicated.  For
 100          * example, /ffi/i could match any of:
 101          *  "\N{LATIN SMALL LIGATURE FFI}"
 102          *  "\N{LATIN SMALL LIGATURE FF}I"
 103          *  "F\N{LATIN SMALL LIGATURE FI}"
 104          *  plus several other things; and making sure we have all the
 105          *  possibilities is hard. */
 106         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
 107             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
 108         }
 109         else {
 110             /* Any Latin1 range character can potentially match any
 111              * other depending on the locale, and in Turkic locales, 'I' and
 112              * 'i' can match U+130 and U+131 */
 113             if (OP(node) == EXACTFL) {
 114                 _invlist_union(invlist, PL_Latin1, &invlist);
 115                 if (isALPHA_FOLD_EQ(uc, 'I')) {
 116                     invlist = add_cp_to_invlist(invlist,
 117                                                 LATIN_SMALL_LETTER_DOTLESS_I);
 118                     invlist = add_cp_to_invlist(invlist,
 119                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
 120                 }
 121             }
 122             else {
 123                 /* But otherwise, it matches at least itself.  We can
 124                  * quickly tell if it has a distinct fold, and if so,
 125                  * it matches that as well */
 126                 invlist = add_cp_to_invlist(invlist, uc);
 127                 if (IS_IN_SOME_FOLD_L1(uc))
 128                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
 129             }
 130
 131             /* Some characters match above-Latin1 ones under /i.  This
 132              * is true of EXACTFL ones when the locale is UTF-8 */
 133             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
 134                 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
 135                                                          EXACTFAA_NO_TRIE)))
 136             {
 137                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
 138             }
 139         }
 140     }
 141     else {  /* Pattern is UTF-8 */
 142         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
 143         const U8* e = s + bytelen;
 144         IV fc;
 145
 146         fc = uc = utf8_to_uv_or_die(s, s + bytelen, NULL);
 147
 148         /* The only code points that aren't folded in a UTF EXACTFish
 149          * node are the problematic ones in EXACTFL nodes */
 150         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
 151             /* We need to check for the possibility that this EXACTFL
 152              * node begins with a multi-char fold.  Therefore we fold
 153              * the first few characters of it so that we can make that
 154              * check */
 155             U8 *d = folded;
 156             int i;
 157
 158             fc = -1;
 159             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
 160                 if (isASCII(*s)) {
 161                     *(d++) = (U8) toFOLD(*s);
 162                     if (fc < 0) {       /* Save the first fold */
 163                         fc = *(d-1);
 164                     }
 165                     s++;
 166                 }
 167                 else {
 168                     STRLEN len;
 169                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
 170                     if (fc < 0) {       /* Save the first fold */
 171                         fc = fold;
 172                     }
 173                     d += len;
 174                     s += UTF8SKIP(s);
 175                 }
 176             }
 177
 178             /* And set up so the code below that looks in this folded
 179              * buffer instead of the node's string */
 180             e = d;
 181             s = folded;
 182         }
 183
 184         /* When we reach here 's' points to the fold of the first
 185          * character(s) of the node; and 'e' points to far enough along
 186          * the folded string to be just past any possible multi-char
 187          * fold.
 188          *
 189          * Like the non-UTF case above, we punt if the node begins with a
 190          * multi-char fold  */
 191
 192         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
 193             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
 194         }
 195         else {  /* Single char fold */
 196             unsigned int k;
 197             U32 first_fold;
 198             const U32 * remaining_folds;
 199             Size_t folds_count;
 200
 201             /* It matches itself */
 202             invlist = add_cp_to_invlist(invlist, fc);
 203
 204             /* ... plus all the things that fold to it, which are found in
 205              * PL_utf8_foldclosures */
 206             folds_count = _inverse_folds(fc, &first_fold,
 207                                                 &remaining_folds);
 208             for (k = 0; k < folds_count; k++) {
 209                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
 210
 211                 /* /aa doesn't allow folds between ASCII and non- */
 212                 if (   inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
 213                     && isASCII(c) != isASCII(fc))
 214                 {
 215                     continue;
 216                 }
 217
 218                 invlist = add_cp_to_invlist(invlist, c);
 219             }
 220
 221             if (OP(node) == EXACTFL) {
 222
 223                 /* If either [iI] are present in an EXACTFL node the above code
 224                  * should have added its normal case pair, but under a Turkish
 225                  * locale they could match instead the case pairs from it.  Add
 226                  * those as potential matches as well */
 227                 if (isALPHA_FOLD_EQ(fc, 'I')) {
 228                     invlist = add_cp_to_invlist(invlist,
 229                                                 LATIN_SMALL_LETTER_DOTLESS_I);
 230                     invlist = add_cp_to_invlist(invlist,
 231                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
 232                 }
 233                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
 234                     invlist = add_cp_to_invlist(invlist, 'I');
 235                 }
 236                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
 237                     invlist = add_cp_to_invlist(invlist, 'i');
 238                 }
 239             }
 240         }
 241     }
 242
 243     return invlist;
 244 }
 245
 246
 247 /* Mark that we cannot extend a found fixed substring at this point.
 248    Update the longest found anchored substring or the longest found
 249    floating substrings if needed. */
 250
 251 void
 252 Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 253                     SSize_t *minlenp, int is_inf)
 254 {
 255     const STRLEN l = CHR_SVLEN(data->last_found);
 256     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
 257     const STRLEN old_l = CHR_SVLEN(longest_sv);
 258     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 259
 260     PERL_ARGS_ASSERT_SCAN_COMMIT;
 261
 262     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 263         const U8 i = data->cur_is_floating;
 264         SvSetMagicSV(longest_sv, data->last_found);
 265         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
 266
 267         if (!i) /* fixed */
 268             data->substrs[0].max_offset = data->substrs[0].min_offset;
 269         else { /* float */
 270             data->substrs[1].max_offset =
 271                       (is_inf)
 272                        ? OPTIMIZE_INFTY
 273                        : (l
 274                           ? data->last_start_max
 275                           : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
 276                                          ? OPTIMIZE_INFTY
 277                                          : data->pos_min + data->pos_delta));
 278         }
 279
 280         data->substrs[i].flags &= ~SF_BEFORE_EOL;
 281         data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
 282         data->substrs[i].minlenp = minlenp;
 283         data->substrs[i].lookbehind = 0;
 284     }
 285
 286     SvCUR_set(data->last_found, 0);
 287     {
 288         SV * const sv = data->last_found;
 289         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 290             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 291             if (mg)
 292                 mg->mg_len = 0;
 293         }
 294     }
 295     data->last_end = -1;
 296     data->flags &= ~SF_BEFORE_EOL;
 297     DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
 298 }
 299
 300 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 301  * list that describes which code points it matches */
 302
 303 STATIC void
 304 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 305 {
 306     /* Set the SSC 'ssc' to match an empty string or any code point */
 307
 308     PERL_ARGS_ASSERT_SSC_ANYTHING;
 309
 310     assert(is_ANYOF_SYNTHETIC(ssc));
 311
 312     /* mortalize so won't leak */
 313     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
 314     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 315 }
 316
 317 STATIC int
 318 S_ssc_is_anything(const regnode_ssc *ssc)
 319 {
 320     /* Returns true if the SSC 'ssc' can match the empty string and any code
 321      * point; false otherwise.  Thus, this is used to see if using 'ssc' buys
 322      * us anything: if the function returns true, 'ssc' hasn't been restricted
 323      * in any way, so there's no point in using it */
 324
 325     UV start = 0, end = 0;  /* Initialize due to messages from dumb compiler */
 326     bool ret;
 327
 328     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 329
 330     assert(is_ANYOF_SYNTHETIC(ssc));
 331
 332     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
 333         return false;
 334     }
 335
 336     /* See if the list consists solely of the range 0 - Infinity */
 337     invlist_iterinit(ssc->invlist);
 338     ret = invlist_iternext(ssc->invlist, &start, &end)
 339           && start == 0
 340           && end == UV_MAX;
 341
 342     invlist_iterfinish(ssc->invlist);
 343
 344     if (ret) {
 345         return true;
 346     }
 347
 348     /* If e.g., both \w and \W are set, matches everything */
 349     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 350         int i;
 351         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 352             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 353                 return true;
 354             }
 355         }
 356     }
 357
 358     return false;
 359 }
 360
 361 void
 362 Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 363 {
 364     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 365      * string, any code point, or any posix class under locale */
 366
 367     PERL_ARGS_ASSERT_SSC_INIT;
 368
 369     Zero(ssc, 1, regnode_ssc);
 370     set_ANYOF_SYNTHETIC(ssc);
 371     ARG1u_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
 372     ssc_anything(ssc);
 373
 374     /* If any portion of the regex is to operate under locale rules that aren't
 375      * fully known at compile time, initialization includes it.  The reason
 376      * this isn't done for all regexes is that the optimizer was written under
 377      * the assumption that locale was all-or-nothing.  Given the complexity and
 378      * lack of documentation in the optimizer, and that there are inadequate
 379      * test cases for locale, many parts of it may not work properly, it is
 380      * safest to avoid locale unless necessary. */
 381     if (RExC_contains_locale) {
 382         ANYOF_POSIXL_SETALL(ssc);
 383     }
 384     else {
 385         ANYOF_POSIXL_ZERO(ssc);
 386     }
 387 }
 388
 389 STATIC int
 390 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 391                         const regnode_ssc *ssc)
 392 {
 393     /* Returns true if the SSC 'ssc' is in its initial state with regard only
 394      * to the list of code points matched, and locale posix classes; hence does
 395      * not check its flags) */
 396
 397     UV start = 0, end = 0;  /* Initialize due to messages from dumb compiler */
 398     bool ret;
 399
 400     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 401
 402     assert(is_ANYOF_SYNTHETIC(ssc));
 403
 404     invlist_iterinit(ssc->invlist);
 405     ret = invlist_iternext(ssc->invlist, &start, &end)
 406           && start == 0
 407           && end == UV_MAX;
 408
 409     invlist_iterfinish(ssc->invlist);
 410
 411     if (! ret) {
 412         return false;
 413     }
 414
 415     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 416         return false;
 417     }
 418
 419     return true;
 420 }
 421
 422
 423 STATIC SV*
 424 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 425                                const regnode_charclass* const node)
 426 {
 427     /* Returns a mortal inversion list defining which code points are matched
 428      * by 'node', which is of ANYOF-ish type .  Handles complementing the
 429      * result if appropriate.  If some code points aren't knowable at this
 430      * time, the returned list must, and will, contain every code point that is
 431      * a possibility. */
 432
 433     SV* invlist = NULL;
 434     SV* only_utf8_locale_invlist = NULL;
 435     bool new_node_has_latin1 = false;
 436     const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
 437                       ? ANYOF_FLAGS(node)
 438                       : 0;
 439
 440     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
 441
 442     /* Look at the data structure created by S_set_ANYOF_arg() */
 443     if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
 444         invlist = sv_2mortal(_new_invlist(1));
 445         invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
 446     }
 447     else if (ANYOF_HAS_AUX(node)) {
 448         const U32 n = ARG1u(node);
 449         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
 450         AV * const av = AV_FROM_REF(rv);
 451         SV **const ary = AvARRAY(av);
 452
 453         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
 454
 455             /* Here there are things that won't be known until runtime -- we
 456              * have to assume it could be anything */
 457             invlist = sv_2mortal(_new_invlist(1));
 458             return _add_range_to_invlist(invlist, 0, UV_MAX);
 459         }
 460         else if (ary[INVLIST_INDEX]) {
 461
 462             /* Use the node's inversion list */
 463             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
 464         }
 465
 466         /* Get the code points valid only under UTF-8 locales */
 467         if (   (flags & ANYOFL_FOLD)
 468             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
 469         {
 470             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
 471         }
 472     }
 473
 474     if (! invlist) {
 475         invlist = sv_2mortal(_new_invlist(0));
 476     }
 477
 478     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
 479      * code points, and an inversion list for the others, but if there are code
 480      * points that should match only conditionally on the target string being
 481      * UTF-8, those are placed in the inversion list, and not the bitmap.
 482      * Since there are circumstances under which they could match, they are
 483      * included in the SSC.  But if the ANYOF node is to be inverted, we have
 484      * to exclude them here, so that when we invert below, the end result
 485      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
 486      * have to do this here before we add the unconditionally matched code
 487      * points */
 488     if (flags & ANYOF_INVERT) {
 489         _invlist_intersection_complement_2nd(invlist,
 490                                              PL_UpperLatin1,
 491                                              &invlist);
 492     }
 493
 494     /* Add in the points from the bit map */
 495     if (REGNODE_TYPE(OP(node)) == ANYOF){
 496         for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
 497             if (ANYOF_BITMAP_TEST(node, i)) {
 498                 unsigned int start = i++;
 499
 500                 for (;    i < NUM_ANYOF_CODE_POINTS
 501                        && ANYOF_BITMAP_TEST(node, i); ++i)
 502                 {
 503                     /* empty */
 504                 }
 505                 invlist = _add_range_to_invlist(invlist, start, i-1);
 506                 new_node_has_latin1 = true;
 507             }
 508         }
 509     }
 510
 511     /* If this can match all upper Latin1 code points, have to add them
 512      * as well.  But don't add them if inverting, as when that gets done below,
 513      * it would exclude all these characters, including the ones it shouldn't
 514      * that were added just above */
 515     if ( ! (flags & ANYOF_INVERT)
 516         &&  OP(node) == ANYOFD
 517         && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
 518     {
 519         _invlist_union(invlist, PL_UpperLatin1, &invlist);
 520     }
 521
 522     /* Similarly for these */
 523     if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
 524         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
 525     }
 526
 527     if (flags & ANYOF_INVERT) {
 528         _invlist_invert(invlist);
 529     }
 530     else if (flags & ANYOFL_FOLD) {
 531         if (new_node_has_latin1) {
 532
 533             /* These folds are potential in Turkic locales */
 534             if (_invlist_contains_cp(invlist, 'i')) {
 535                 invlist = add_cp_to_invlist(invlist,
 536                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
 537             }
 538             if (_invlist_contains_cp(invlist, 'I')) {
 539                 invlist = add_cp_to_invlist(invlist,
 540                                                 LATIN_SMALL_LETTER_DOTLESS_I);
 541             }
 542
 543             /* Under /li, any 0-255 could fold to any other 0-255, depending on
 544              * the locale.  We can skip this if there are no 0-255 at all. */
 545             _invlist_union(invlist, PL_Latin1, &invlist);
 546         }
 547         else {
 548             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
 549                 invlist = add_cp_to_invlist(invlist, 'I');
 550             }
 551             if (_invlist_contains_cp(invlist,
 552                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
 553             {
 554                 invlist = add_cp_to_invlist(invlist, 'i');
 555             }
 556         }
 557     }
 558
 559     /* Similarly add the UTF-8 locale possible matches.  These have to be
 560      * deferred until after the non-UTF-8 locale ones are taken care of just
 561      * above, or it leads to wrong results under ANYOF_INVERT */
 562     if (only_utf8_locale_invlist) {
 563         _invlist_union_maybe_complement_2nd(invlist,
 564                                             only_utf8_locale_invlist,
 565                                             flags & ANYOF_INVERT,
 566                                             &invlist);
 567     }
 568
 569     return invlist;
 570 }
 571
 572 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
 573  * should not be inverted. */
 574
 575 STATIC void
 576 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 577                 const regnode_charclass *and_with)
 578 {
 579     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
 580      * another SSC or a regular ANYOF class.  Can create false positives. */
 581
 582     SV* anded_cp_list;
 583     U8  and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
 584                           ? ANYOF_FLAGS(and_with)
 585                           : 0;
 586     U8  anded_flags;
 587
 588     PERL_ARGS_ASSERT_SSC_AND;
 589
 590     assert(is_ANYOF_SYNTHETIC(ssc));
 591
 592     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
 593      * the code point inversion list and just the relevant flags */
 594     if (is_ANYOF_SYNTHETIC(and_with)) {
 595         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
 596         anded_flags = and_with_flags;
 597
 598         /* XXX This is a kludge around what appears to be deficiencies in the
 599          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
 600          * there are paths through the optimizer where it doesn't get weeded
 601          * out when it should.  And if we don't make some extra provision for
 602          * it like the code just below, it doesn't get added when it should.
 603          * This solution is to add it only when AND'ing, which is here, and
 604          * only when what is being AND'ed is the pristine, original node
 605          * matching anything.  Thus it is like adding it to ssc_anything() but
 606          * only when the result is to be AND'ed.  Probably the same solution
 607          * could be adopted for the same problem we have with /l matching,
 608          * which is solved differently in S_ssc_init(), and that would lead to
 609          * fewer false positives than that solution has.  But if this solution
 610          * creates bugs, the consequences are only that a warning isn't raised
 611          * that should be; while the consequences for having /l bugs is
 612          * incorrect matches */
 613         if (ssc_is_anything((regnode_ssc *)and_with)) {
 614             anded_flags |= ANYOF_WARN_SUPER__shared;
 615         }
 616     }
 617     else {
 618         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
 619         if (OP(and_with) == ANYOFD) {
 620             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
 621         }
 622         else {
 623             anded_flags = and_with_flags
 624                             & ( ANYOF_COMMON_FLAGS
 625                                |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
 626                                |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
 627             if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
 628                 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
 629             }
 630         }
 631     }
 632
 633     ANYOF_FLAGS(ssc) &= anded_flags;
 634
 635     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
 636      * C2 is the list of code points in 'and-with'; P2, its posix classes.
 637      * 'and_with' may be inverted.  When not inverted, we have the situation of
 638      * computing:
 639      *  (C1 | P1) & (C2 | P2)
 640      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
 641      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
 642      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
 643      *                    <=  ((C1 & C2) | P1 | P2)
 644      * Alternatively, the last few steps could be:
 645      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
 646      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
 647      *                    <=  (C1 | C2 | (P1 & P2))
 648      * We favor the second approach if either P1 or P2 is non-empty.  This is
 649      * because these components are a barrier to doing optimizations, as what
 650      * they match cannot be known until the moment of matching as they are
 651      * dependent on the current locale, 'AND"ing them likely will reduce or
 652      * eliminate them.
 653      * But we can do better if we know that C1,P1 are in their initial state (a
 654      * frequent occurrence), each matching everything:
 655      *  (<everything>) & (C2 | P2) =  C2 | P2
 656      * Similarly, if C2,P2 are in their initial state (again a frequent
 657      * occurrence), the result is a no-op
 658      *  (C1 | P1) & (<everything>) =  C1 | P1
 659      *
 660      * Inverted, we have
 661      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
 662      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
 663      *                         <=  (C1 & ~C2) | (P1 & ~P2)
 664      * */
 665
 666     if ((and_with_flags & ANYOF_INVERT)
 667         && ! is_ANYOF_SYNTHETIC(and_with))
 668     {
 669         unsigned int i;
 670
 671         ssc_intersection(ssc,
 672                          anded_cp_list,
 673                          false /* Has already been inverted */
 674                          );
 675
 676         /* If either P1 or P2 is empty, the intersection will be also; can skip
 677          * the loop */
 678         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
 679             ANYOF_POSIXL_ZERO(ssc);
 680         }
 681         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 682
 683             /* Note that the Posix class component P from 'and_with' actually
 684              * looks like:
 685              *      P = Pa | Pb | ... | Pn
 686              * where each component is one posix class, such as in [\w\s].
 687              * Thus
 688              *      ~P = ~(Pa | Pb | ... | Pn)
 689              *         = ~Pa & ~Pb & ... & ~Pn
 690              *        <= ~Pa | ~Pb | ... | ~Pn
 691              * The last is something we can easily calculate, but unfortunately
 692              * is likely to have many false positives.  We could do better
 693              * in some (but certainly not all) instances if two classes in
 694              * P have known relationships.  For example
 695              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
 696              * So
 697              *      :lower: & :print: = :lower:
 698              * And similarly for classes that must be disjoint.  For example,
 699              * since \s and \w can have no elements in common based on rules in
 700              * the POSIX standard,
 701              *      \w & ^\S = nothing
 702              * Unfortunately, some vendor locales do not meet the Posix
 703              * standard, in particular almost everything by Microsoft.
 704              * The loop below just changes e.g., \w into \W and vice versa */
 705
 706             regnode_charclass_posixl temp;
 707             int add = 1;    /* To calculate the index of the complement */
 708
 709             Zero(&temp, 1, regnode_charclass_posixl);
 710             ANYOF_POSIXL_ZERO(&temp);
 711             for (i = 0; i < ANYOF_MAX; i++) {
 712                 assert(i % 2 != 0
 713                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
 714                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
 715
 716                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
 717                     ANYOF_POSIXL_SET(&temp, i + add);
 718                 }
 719                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
 720             }
 721             ANYOF_POSIXL_AND(&temp, ssc);
 722
 723         } /* else ssc already has no posixes */
 724     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
 725          in its initial state */
 726     else if (! is_ANYOF_SYNTHETIC(and_with)
 727              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
 728     {
 729         /* But if 'ssc' is in its initial state, the result is just 'and_with';
 730          * copy it over 'ssc' */
 731         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
 732             if (is_ANYOF_SYNTHETIC(and_with)) {
 733                 StructCopy(and_with, ssc, regnode_ssc);
 734             }
 735             else {
 736                 ssc->invlist = anded_cp_list;
 737                 ANYOF_POSIXL_ZERO(ssc);
 738                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
 739                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
 740                 }
 741             }
 742         }
 743         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
 744                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
 745         {
 746             /* One or the other of P1, P2 is non-empty. */
 747             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
 748                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
 749             }
 750             ssc_union(ssc, anded_cp_list, false);
 751         }
 752         else { /* P1 = P2 = empty */
 753             ssc_intersection(ssc, anded_cp_list, false);
 754         }
 755     }
 756 }
 757
 758 STATIC void
 759 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 760                const regnode_charclass *or_with)
 761 {
 762     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
 763      * another SSC or a regular ANYOF class.  Can create false positives if
 764      * 'or_with' is to be inverted. */
 765
 766     SV* ored_cp_list;
 767     U8 ored_flags;
 768     U8  or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
 769                          ? ANYOF_FLAGS(or_with)
 770                          : 0;
 771
 772     PERL_ARGS_ASSERT_SSC_OR;
 773
 774     assert(is_ANYOF_SYNTHETIC(ssc));
 775
 776     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
 777      * the code point inversion list and just the relevant flags */
 778     if (is_ANYOF_SYNTHETIC(or_with)) {
 779         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
 780         ored_flags = or_with_flags;
 781     }
 782     else {
 783         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
 784         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
 785         if (OP(or_with) != ANYOFD) {
 786             ored_flags |=
 787                 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
 788                                  |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
 789             if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
 790                 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
 791             }
 792         }
 793     }
 794
 795     ANYOF_FLAGS(ssc) |= ored_flags;
 796
 797     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
 798      * C2 is the list of code points in 'or-with'; P2, its posix classes.
 799      * 'or_with' may be inverted.  When not inverted, we have the simple
 800      * situation of computing:
 801      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
 802      * If P1|P2 yields a situation with both a class and its complement are
 803      * set, like having both \w and \W, this matches all code points, and we
 804      * can delete these from the P component of the ssc going forward.  XXX We
 805      * might be able to delete all the P components, but I (khw) am not certain
 806      * about this, and it is better to be safe.
 807      *
 808      * Inverted, we have
 809      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
 810      *                         <=  (C1 | P1) | ~C2
 811      *                         <=  (C1 | ~C2) | P1
 812      * (which results in actually simpler code than the non-inverted case)
 813      * */
 814
 815     if ((or_with_flags & ANYOF_INVERT)
 816         && ! is_ANYOF_SYNTHETIC(or_with))
 817     {
 818         /* We ignore P2, leaving P1 going forward */
 819     }   /* else  Not inverted */
 820     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
 821         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
 822         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 823             unsigned int i;
 824             for (i = 0; i < ANYOF_MAX; i += 2) {
 825                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
 826                 {
 827                     ssc_match_all_cp(ssc);
 828                     ANYOF_POSIXL_CLEAR(ssc, i);
 829                     ANYOF_POSIXL_CLEAR(ssc, i+1);
 830                 }
 831             }
 832         }
 833     }
 834
 835     ssc_union(ssc,
 836               ored_cp_list,
 837               false /* Already has been inverted */
 838               );
 839 }
 840
 841 STATIC void
 842 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
 843 {
 844     PERL_ARGS_ASSERT_SSC_UNION;
 845
 846     assert(is_ANYOF_SYNTHETIC(ssc));
 847
 848     _invlist_union_maybe_complement_2nd(ssc->invlist,
 849                                         invlist,
 850                                         invert2nd,
 851                                         &ssc->invlist);
 852 }
 853
 854 STATIC void
 855 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
 856                          SV* const invlist,
 857                          const bool invert2nd)
 858 {
 859     PERL_ARGS_ASSERT_SSC_INTERSECTION;
 860
 861     assert(is_ANYOF_SYNTHETIC(ssc));
 862
 863     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
 864                                                invlist,
 865                                                invert2nd,
 866                                                &ssc->invlist);
 867 }
 868
 869 STATIC void
 870 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
 871 {
 872     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
 873
 874     assert(is_ANYOF_SYNTHETIC(ssc));
 875
 876     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
 877 }
 878
 879 STATIC void
 880 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
 881 {
 882     /* AND just the single code point 'cp' into the SSC 'ssc' */
 883
 884     SV* cp_list = _new_invlist(2);
 885
 886     PERL_ARGS_ASSERT_SSC_CP_AND;
 887
 888     assert(is_ANYOF_SYNTHETIC(ssc));
 889
 890     cp_list = add_cp_to_invlist(cp_list, cp);
 891     ssc_intersection(ssc, cp_list,
 892                      false /* Not inverted */
 893                      );
 894     SvREFCNT_dec_NN(cp_list);
 895 }
 896
 897 STATIC void
 898 S_ssc_clear_locale(regnode_ssc *ssc)
 899 {
 900     /* Set the SSC 'ssc' to not match any locale things */
 901     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
 902
 903     assert(is_ANYOF_SYNTHETIC(ssc));
 904
 905     ANYOF_POSIXL_ZERO(ssc);
 906     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
 907 }
 908
 909
 910
 911 /* The below joins as many adjacent EXACTish nodes as possible into a single
 912  * one.  The regop may be changed if the node(s) contain certain sequences that
 913  * require special handling.  The joining is only done if:
 914  * 1) there is room in the current conglomerated node to entirely contain the
 915  *    next one.
 916  * 2) they are compatible node types
 917  *
 918  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
 919  * these get optimized out
 920  *
 921  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
 922  * as possible, even if that means splitting an existing node so that its first
 923  * part is moved to the preceding node.  This would maximise the efficiency of
 924  * memEQ during matching.
 925  *
 926  * If a node is to match under /i (folded), the number of characters it matches
 927  * can be different than its character length if it contains a multi-character
 928  * fold.  *min_subtract is set to the total delta number of characters of the
 929  * input nodes.
 930  *
 931  * And *unfolded_multi_char is set to indicate whether or not the node contains
 932  * an unfolded multi-char fold.  This happens when it won't be known until
 933  * runtime whether the fold is valid or not; namely
 934  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
 935  *      target string being matched against turns out to be UTF-8 is that fold
 936  *      valid; or
 937  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
 938  *      runtime.
 939  * (Multi-char folds whose components are all above the Latin1 range are not
 940  * run-time locale dependent, and have already been folded by the time this
 941  * function is called.)
 942  *
 943  * This is as good a place as any to discuss the design of handling these
 944  * multi-character fold sequences.  It's been wrong in Perl for a very long
 945  * time.  There are three code points in Unicode whose multi-character folds
 946  * were long ago discovered to mess things up.  The previous designs for
 947  * dealing with these involved assigning a special node for them.  This
 948  * approach doesn't always work, as evidenced by this example:
 949  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
 950  * Both sides fold to "sss", but if the pattern is parsed to create a node that
 951  * would match just the \xDF, it won't be able to handle the case where a
 952  * successful match would have to cross the node's boundary.  The new approach
 953  * that hopefully generally solves the problem generates an EXACTFUP node
 954  * that is "sss" in this case.
 955  *
 956  * It turns out that there are problems with all multi-character folds, and not
 957  * just these three.  Now the code is general, for all such cases.  The
 958  * approach taken is:
 959  * 1)   This routine examines each EXACTFish node that could contain multi-
 960  *      character folded sequences.  Since a single character can fold into
 961  *      such a sequence, the minimum match length for this node is less than
 962  *      the number of characters in the node.  This routine returns in
 963  *      *min_subtract how many characters to subtract from the actual
 964  *      length of the string to get a real minimum match length; it is 0 if
 965  *      there are no multi-char foldeds.  This delta is used by the caller to
 966  *      adjust the min length of the match, and the delta between min and max,
 967  *      so that the optimizer doesn't reject these possibilities based on size
 968  *      constraints.
 969  *
 970  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
 971  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
 972  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
 973  *      EXACTFU nodes.  The node type of such nodes is then changed to
 974  *      EXACTFUP, indicating it is problematic, and needs careful handling.
 975  *      (The procedures in step 1) above are sufficient to handle this case in
 976  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
 977  *      the only case where there is a possible fold length change in non-UTF-8
 978  *      patterns.  By reserving a special node type for problematic cases, the
 979  *      far more common regular EXACTFU nodes can be processed faster.
 980  *      regexec.c takes advantage of this.
 981  *
 982  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
 983  *      problematic cases.   These all only occur when the pattern is not
 984  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
 985  *      length change, it handles the situation where the string cannot be
 986  *      entirely folded.  The strings in an EXACTFish node are folded as much
 987  *      as possible during compilation in regcomp.c.  This saves effort in
 988  *      regex matching.  By using an EXACTFUP node when it is not possible to
 989  *      fully fold at compile time, regexec.c can know that everything in an
 990  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
 991  *      case where folding in EXACTFU nodes can't be done at compile time is
 992  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
 993  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
 994  *      handle two very different cases.  Alternatively, there could have been
 995  *      a node type where there are length changes, one for unfolded, and one
 996  *      for both.  If yet another special case needed to be created, the number
 997  *      of required node types would have to go to 7.  khw figures that even
 998  *      though there are plenty of node types to spare, that the maintenance
 999  *      cost wasn't worth the small speedup of doing it that way, especially
1000  *      since he thinks the MICRO SIGN is rarely encountered in practice.
1001  *
1002  *      There are other cases where folding isn't done at compile time, but
1003  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
1004  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
1005  *      changes.  Some folds in EXACTF depend on if the runtime target string
1006  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
1007  *      when no fold in it depends on the UTF-8ness of the target string.)
1008  *
1009  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
1010  *      validity of the fold won't be known until runtime, and so must remain
1011  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
1012  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
1013  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
1014  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
1015  *      The reason this is a problem is that the optimizer part of regexec.c
1016  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
1017  *      that a character in the pattern corresponds to at most a single
1018  *      character in the target string.  (And I do mean character, and not byte
1019  *      here, unlike other parts of the documentation that have never been
1020  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
1021  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
1022  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
1023  *      EXACTFL nodes, violate the assumption, and they are the only instances
1024  *      where it is violated.  I'm reluctant to try to change the assumption,
1025  *      as the code involved is impenetrable to me (khw), so instead the code
1026  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
1027  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
1028  *      boolean indicating whether or not the node contains such a fold.  When
1029  *      it is true, the caller sets a flag that later causes the optimizer in
1030  *      this file to not set values for the floating and fixed string lengths,
1031  *      and thus avoids the optimizer code in regexec.c that makes the invalid
1032  *      assumption.  Thus, there is no optimization based on string lengths for
1033  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
1034  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
1035  *      assumption is wrong only in these cases is that all other non-UTF-8
1036  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
1037  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
1038  *      EXACTF nodes because we don't know at compile time if it actually
1039  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
1040  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
1041  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
1042  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
1043  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
1044  *      string would require the pattern to be forced into UTF-8, the overhead
1045  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
1046  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
1047  *      locale.)
1048  *
1049  *      Similarly, the code that generates tries doesn't currently handle
1050  *      not-already-folded multi-char folds, and it looks like a pain to change
1051  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
1052  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
1053  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
1054  *      using /iaa matching will be doing so almost entirely with ASCII
1055  *      strings, so this should rarely be encountered in practice */
1056
1057 U32
1058 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
1059                    UV *min_subtract, bool *unfolded_multi_char,
1060                    U32 flags, regnode *val, U32 depth)
1061 {
1062     /* Merge several consecutive EXACTish nodes into one. */
1063
1064     regnode *n = regnext(scan);
1065     U32 stringok = 1;
1066     regnode *next = REGNODE_AFTER_varies(scan);
1067     U32 stopnow = 0;
1068 #ifdef DEBUGGING
1069     U32 merged = 0;
1070     regnode *stop = scan;
1071     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1072 #else
1073     PERL_UNUSED_ARG(depth);
1074 #endif
1075
1076     PERL_ARGS_ASSERT_JOIN_EXACT;
1077 #ifndef EXPERIMENTAL_INPLACESCAN
1078     PERL_UNUSED_ARG(flags);
1079     PERL_UNUSED_ARG(val);
1080 #endif
1081     DEBUG_PEEP("join", scan, depth, 0);
1082
1083     assert(REGNODE_TYPE(OP(scan)) == EXACT);
1084
1085     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
1086      * EXACT ones that are mergeable to the current one. */
1087     while (    n
1088            && (    REGNODE_TYPE(OP(n)) == NOTHING
1089                || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
1090            && NEXT_OFF(n)
1091            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
1092     {
1093
1094         if (OP(n) == TAIL || n > next)
1095             stringok = 0;
1096         if (REGNODE_TYPE(OP(n)) == NOTHING) {
1097             DEBUG_PEEP("skip:", n, depth, 0);
1098             NEXT_OFF(scan) += NEXT_OFF(n);
1099             next = n + NODE_STEP_REGNODE;
1100 #ifdef DEBUGGING
1101             if (stringok)
1102                 stop = n;
1103 #endif
1104             n = regnext(n);
1105         }
1106         else if (stringok) {
1107             const unsigned int oldl = STR_LEN(scan);
1108             regnode * const nnext = regnext(n);
1109
1110             /* XXX I (khw) kind of doubt that this works on platforms (should
1111              * Perl ever run on one) where U8_MAX is above 255 because of lots
1112              * of other assumptions */
1113             /* Don't join if the sum can't fit into a single node */
1114             if (oldl + STR_LEN(n) > U8_MAX)
1115                 break;
1116
1117             /* Joining something that requires UTF-8 with something that
1118              * doesn't, means the result requires UTF-8. */
1119             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
1120                 OP(scan) = EXACT_REQ8;
1121             }
1122             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
1123                 ;   /* join is compatible, no need to change OP */
1124             }
1125             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
1126                 OP(scan) = EXACTFU_REQ8;
1127             }
1128             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
1129                 ;   /* join is compatible, no need to change OP */
1130             }
1131             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
1132                 ;   /* join is compatible, no need to change OP */
1133             }
1134             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
1135
1136                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
1137                   * which can join with EXACTFU ones.  We check for this case
1138                   * here.  These need to be resolved to either EXACTFU or
1139                   * EXACTF at joining time.  They have nothing in them that
1140                   * would forbid them from being the more desirable EXACTFU
1141                   * nodes except that they begin and/or end with a single [Ss].
1142                   * The reason this is problematic is because they could be
1143                   * joined in this loop with an adjacent node that ends and/or
1144                   * begins with [Ss] which would then form the sequence 'ss',
1145                   * which matches differently under /di than /ui, in which case
1146                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
1147                   * formed, the nodes get absorbed into any adjacent EXACTFU
1148                   * node.  And if the only adjacent node is EXACTF, they get
1149                   * absorbed into that, under the theory that a longer node is
1150                   * better than two shorter ones, even if one is EXACTFU.  Note
1151                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
1152                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
1153
1154                 if (STRING(n)[STR_LEN(n)-1] == 's') {
1155
1156                     /* Here the joined node would end with 's'.  If the node
1157                      * following the combination is an EXACTF one, it's better to
1158                      * join this trailing edge 's' node with that one, leaving the
1159                      * current one in 'scan' be the more desirable EXACTFU */
1160                     if (OP(nnext) == EXACTF) {
1161                         break;
1162                     }
1163
1164                     OP(scan) = EXACTFU_S_EDGE;
1165
1166                 }   /* Otherwise, the beginning 's' of the 2nd node just
1167                        becomes an interior 's' in 'scan' */
1168             }
1169             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
1170                 ;   /* join is compatible, no need to change OP */
1171             }
1172             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
1173
1174                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
1175                  * nodes.  But the latter nodes can be also joined with EXACTFU
1176                  * ones, and that is a better outcome, so if the node following
1177                  * 'n' is EXACTFU, quit now so that those two can be joined
1178                  * later */
1179                 if (OP(nnext) == EXACTFU) {
1180                     break;
1181                 }
1182
1183                 /* The join is compatible, and the combined node will be
1184                  * EXACTF.  (These don't care if they begin or end with 's' */
1185             }
1186             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
1187                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
1188                     && STRING(n)[0] == 's')
1189                 {
1190                     /* When combined, we have the sequence 'ss', which means we
1191                      * have to remain /di */
1192                     OP(scan) = EXACTF;
1193                 }
1194             }
1195             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
1196                 if (STRING(n)[0] == 's') {
1197                     ;   /* Here the join is compatible and the combined node
1198                            starts with 's', no need to change OP */
1199                 }
1200                 else {  /* Now the trailing 's' is in the interior */
1201                     OP(scan) = EXACTFU;
1202                 }
1203             }
1204             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
1205
1206                 /* The join is compatible, and the combined node will be
1207                  * EXACTF.  (These don't care if they begin or end with 's' */
1208                 OP(scan) = EXACTF;
1209             }
1210             else if (OP(scan) != OP(n)) {
1211
1212                 /* The only other compatible joinings are the same node type */
1213                 break;
1214             }
1215
1216             DEBUG_PEEP("merg", n, depth, 0);
1217 #ifdef DEBUGGING
1218             merged++;
1219 #endif
1220
1221             next = REGNODE_AFTER_varies(n);
1222             NEXT_OFF(scan) += NEXT_OFF(n);
1223             assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
1224             setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
1225             /* Now we can overwrite *n : */
1226             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1227 #ifdef DEBUGGING
1228             stop = next - 1;
1229 #endif
1230             n = nnext;
1231             if (stopnow) break;
1232         }
1233
1234 #ifdef EXPERIMENTAL_INPLACESCAN
1235         if (flags && !NEXT_OFF(n)) {
1236             DEBUG_PEEP("atch", val, depth, 0);
1237             if (REGNODE_OFF_BY_ARG(OP(n))) {
1238                 ARG1u_SET(n, val - n);
1239             }
1240             else {
1241                 NEXT_OFF(n) = val - n;
1242             }
1243             stopnow = 1;
1244         }
1245 #endif
1246     }
1247
1248     /* This temporary node can now be turned into EXACTFU, and must, as
1249      * regexec.c doesn't handle it */
1250     if (OP(scan) == EXACTFU_S_EDGE) {
1251         OP(scan) = EXACTFU;
1252     }
1253
1254     *min_subtract = 0;
1255     *unfolded_multi_char = false;
1256
1257     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
1258      * can now analyze for sequences of problematic code points.  (Prior to
1259      * this final joining, sequences could have been split over boundaries, and
1260      * hence missed).  The sequences only happen in folding, hence for any
1261      * non-EXACT EXACTish node */
1262     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
1263         U8* s0 = (U8*) STRING(scan);
1264         U8* s = s0;
1265         U8* s_end = s0 + STR_LEN(scan);
1266
1267         int total_count_delta = 0;  /* Total delta number of characters that
1268                                        multi-char folds expand to */
1269
1270         /* One pass is made over the node's string looking for all the
1271          * possibilities.  To avoid some tests in the loop, there are two main
1272          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
1273          * non-UTF-8 */
1274         if (UTF) {
1275             U8* folded = NULL;
1276
1277             if (OP(scan) == EXACTFL) {
1278                 U8 *d;
1279
1280                 /* An EXACTFL node would already have been changed to another
1281                  * node type unless there is at least one character in it that
1282                  * is problematic; likely a character whose fold definition
1283                  * won't be known until runtime, and so has yet to be folded.
1284                  * For all but the UTF-8 locale, folds are 1-1 in length, but
1285                  * to handle the UTF-8 case, we need to create a temporary
1286                  * folded copy using UTF-8 locale rules in order to analyze it.
1287                  * This is because our macros that look to see if a sequence is
1288                  * a multi-char fold assume everything is folded (otherwise the
1289                  * tests in those macros would be too complicated and slow).
1290                  * Note that here, the non-problematic folds will have already
1291                  * been done, so we can just copy such characters.  We actually
1292                  * don't completely fold the EXACTFL string.  We skip the
1293                  * unfolded multi-char folds, as that would just create work
1294                  * below to figure out the size they already are */
1295
1296                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
1297                 d = folded;
1298                 while (s < s_end) {
1299                     STRLEN s_len = UTF8SKIP(s);
1300                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
1301                         Copy(s, d, s_len, U8);
1302                         d += s_len;
1303                     }
1304                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
1305                         *unfolded_multi_char = true;
1306                         Copy(s, d, s_len, U8);
1307                         d += s_len;
1308                     }
1309                     else if (isASCII(*s)) {
1310                         *(d++) = toFOLD(*s);
1311                     }
1312                     else {
1313                         STRLEN len;
1314                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
1315                         d += len;
1316                     }
1317                     s += s_len;
1318                 }
1319
1320                 /* Point the remainder of the routine to look at our temporary
1321                  * folded copy */
1322                 s = folded;
1323                 s_end = d;
1324             } /* End of creating folded copy of EXACTFL string */
1325
1326             /* Examine the string for a multi-character fold sequence.  UTF-8
1327              * patterns have all characters pre-folded by the time this code is
1328              * executed */
1329             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
1330                                      length sequence we are looking for is 2 */
1331             {
1332                 int count = 0;  /* How many characters in a multi-char fold */
1333                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
1334                 if (! len) {    /* Not a multi-char fold: get next char */
1335                     s += UTF8SKIP(s);
1336                     continue;
1337                 }
1338
1339                 { /* Here is a generic multi-char fold. */
1340                     U8* multi_end  = s + len;
1341
1342                     /* Count how many characters are in it.  In the case of
1343                      * /aa, no folds which contain ASCII code points are
1344                      * allowed, so check for those, and skip if found. */
1345                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
1346                         count = utf8_length(s, multi_end);
1347                         s = multi_end;
1348                     }
1349                     else {
1350                         while (s < multi_end) {
1351                             if (isASCII(*s)) {
1352                                 s++;
1353                                 goto next_iteration;
1354                             }
1355                             else {
1356                                 s += UTF8SKIP(s);
1357                             }
1358                             count++;
1359                         }
1360                     }
1361                 }
1362
1363                 /* The delta is how long the sequence is minus 1 (1 is how long
1364                  * the character that folds to the sequence is) */
1365                 total_count_delta += count - 1;
1366               next_iteration: ;
1367             }
1368
1369             /* We created a temporary folded copy of the string in EXACTFL
1370              * nodes.  Therefore we need to be sure it doesn't go below zero,
1371              * as the real string could be shorter */
1372             if (OP(scan) == EXACTFL) {
1373                 int total_chars = utf8_length((U8*) STRING(scan),
1374                                            (U8*) STRING(scan) + STR_LEN(scan));
1375                 if (total_count_delta > total_chars) {
1376                     total_count_delta = total_chars;
1377                 }
1378             }
1379
1380             *min_subtract += total_count_delta;
1381             Safefree(folded);
1382         }
1383         else if (OP(scan) == EXACTFAA) {
1384
1385             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
1386              * fold to the ASCII range (and there are no existing ones in the
1387              * upper latin1 range).  But, as outlined in the comments preceding
1388              * this function, we need to flag any occurrences of the sharp s.
1389              * This character forbids trie formation (because of added
1390              * complexity) */
1391 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
1392    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
1393                                       || UNICODE_DOT_DOT_VERSION > 0)
1394             while (s < s_end) {
1395                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
1396                     OP(scan) = EXACTFAA_NO_TRIE;
1397                     *unfolded_multi_char = true;
1398                     break;
1399                 }
1400                 s++;
1401             }
1402         }
1403         else if (OP(scan) != EXACTFAA_NO_TRIE) {
1404
1405             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
1406              * folds that are all Latin1.  As explained in the comments
1407              * preceding this function, we look also for the sharp s in EXACTF
1408              * and EXACTFL nodes; it can be in the final position.  Otherwise
1409              * we can stop looking 1 byte earlier because have to find at least
1410              * two characters for a multi-fold */
1411             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
1412                               ? s_end
1413                               : s_end -1;
1414
1415             while (s < upper) {
1416                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
1417                 if (! len) {    /* Not a multi-char fold. */
1418                     if (*s == LATIN_SMALL_LETTER_SHARP_S
1419                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
1420                     {
1421                         *unfolded_multi_char = true;
1422                     }
1423                     s++;
1424                     continue;
1425                 }
1426
1427                 if (len == 2
1428                     && isALPHA_FOLD_EQ(*s, 's')
1429                     && isALPHA_FOLD_EQ(*(s+1), 's'))
1430                 {
1431
1432                     /* EXACTF nodes need to know that the minimum length
1433                      * changed so that a sharp s in the string can match this
1434                      * ss in the pattern, but they remain EXACTF nodes, as they
1435                      * won't match this unless the target string is in UTF-8,
1436                      * which we don't know until runtime.  EXACTFL nodes can't
1437                      * transform into EXACTFU nodes */
1438                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
1439                         OP(scan) = EXACTFUP;
1440                     }
1441                 }
1442
1443                 *min_subtract += len - 1;
1444                 s += len;
1445             }
1446 #endif
1447         }
1448     }
1449
1450 #ifdef DEBUGGING
1451     /* Allow dumping but overwriting the collection of skipped
1452      * ops and/or strings with fake optimized ops */
1453     n = REGNODE_AFTER_varies(scan);
1454     while (n <= stop) {
1455         OP(n) = OPTIMIZED;
1456         FLAGS(n) = 0;
1457         NEXT_OFF(n) = 0;
1458         n++;
1459     }
1460 #endif
1461     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
1462     return stopnow;
1463 }
1464
1465 /* REx optimizer.  Converts nodes into quicker variants "in place".
1466    Finds fixed substrings.  */
1467
1468
1469 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1470    to the position after last scanned or to NULL. */
1471
1472 /* the return from this sub is the minimum length that could possibly match */
1473 SSize_t
1474 Perl_study_chunk(pTHX_
1475     RExC_state_t *pRExC_state,
1476     regnode **scanp,        /* Start here (read-write). */
1477     SSize_t *minlenp,       /* used for the minlen of substrings? */
1478     SSize_t *deltap,        /* Write maxlen-minlen here. */
1479     regnode *last,          /* Stop before this one. */
1480     scan_data_t *data,      /* string data about the pattern */
1481     I32 stopparen,          /* treat CLOSE-N as END, see GOSUB */
1482     U32 recursed_depth,     /* how deep have we recursed via GOSUB */
1483     regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
1484     U32 flags,              /* flags controlling this call, see SCF_ flags */
1485     U32 depth,              /* how deep have we recursed period */
1486     bool was_mutate_ok      /* true if in-place optimizations are allowed.
1487                                false only if the caller (recursively) was
1488                                prohibited from modifying the regops, because
1489                                a higher caller is holding a ptr to them. */
1490 )
1491 {
1492     /* vars about the regnodes we are working with */
1493     regnode *scan = *scanp; /* the current opcode we are inspecting */
1494     regnode *next = NULL;   /* the next opcode beyond scan, tmp var */
1495     regnode *first_non_open = scan; /* FIXME: should this init to NULL?
1496                                        the first non open regop, if the init
1497                                        val IS an OPEN then we will skip past
1498                                        it just after the var decls section */
1499     I32 code = 0;           /* temp var used to hold the optype of a regop */
1500
1501     /* vars about the min and max length of the pattern */
1502     SSize_t min = 0;    /* min length of this part of the pattern */
1503     SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
1504                                          this is adjusted down if we find
1505                                          an ACCEPT */
1506     SSize_t delta = 0;  /* difference between min and max length
1507                            (not accounting for stopmin) */
1508
1509     /* vars about capture buffers in the pattern */
1510     I32 pars = 0;       /* count of OPEN opcodes */
1511     I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
1512
1513     /* vars about whether this pattern contains something that can match
1514      * infinitely long strings, eg, X* or X+ */
1515     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1516     int is_inf_internal = 0;            /* The studied chunk is infinite */
1517
1518     /* scan_data_t (struct) is used to hold information about the substrings
1519      * and start class we have extracted from the string */
1520     scan_data_t data_fake; /* temp var used for recursing in some cases */
1521
1522     SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
1523                                    trie optimizations */
1524
1525     scan_frame *frame = NULL;  /* used as part of fake recursion */
1526
1527     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1528
1529     PERL_ARGS_ASSERT_STUDY_CHUNK;
1530     RExC_study_started = 1;
1531
1532     Zero(&data_fake, 1, scan_data_t);
1533
1534     if ( depth == 0 ) {
1535         while (first_non_open && OP(first_non_open) == OPEN)
1536             first_non_open = regnext(first_non_open);
1537     }
1538
1539   fake_study_recurse:
1540     DEBUG_r(
1541         RExC_study_chunk_recursed_count++;
1542     );
1543     DEBUG_OPTIMISE_MORE_r(
1544     {
1545         Perl_re_indentf( aTHX_  "study_chunk stopparen = %ld recursed_count = %lu "
1546                                 "depth = %lu recursed_depth = %lu scan = %p last = %p",
1547             depth, (long)stopparen,
1548             (unsigned long)RExC_study_chunk_recursed_count,
1549             (unsigned long)depth, (unsigned long)recursed_depth,
1550             scan,
1551             last);
1552         if (recursed_depth) {
1553             U32 i;
1554             U32 j;
1555             for ( j = 0 ; j < recursed_depth ; j++ ) {
1556                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
1557                     if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
1558                         Perl_re_printf( aTHX_ " %d",(int)i);
1559                         break;
1560                     }
1561                 }
1562                 if ( j + 1 < recursed_depth ) {
1563                     Perl_re_printf( aTHX_  ",");
1564                 }
1565             }
1566         }
1567         Perl_re_printf( aTHX_ "\n");
1568     }
1569     );
1570     while ( scan && OP(scan) != END && scan < last ){
1571         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
1572                                    node length to get a real minimum (because
1573                                    the folded version may be shorter) */
1574         bool unfolded_multi_char = false;
1575         /* avoid mutating ops if we are anywhere within the recursed or
1576          * enframed handling for a GOSUB: the outermost level will handle it.
1577          */
1578         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
1579         /* Peephole optimizer: */
1580         DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
1581         DEBUG_PEEP("Peep", scan, depth, flags);
1582
1583
1584         /* The reason we do this here is that we need to deal with things like
1585          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
1586          * parsing code, as each (?:..) is handled by a different invocation of
1587          * reg() -- Yves
1588          */
1589         if (REGNODE_TYPE(OP(scan)) == EXACT
1590             && OP(scan) != LEXACT
1591             && OP(scan) != LEXACT_REQ8
1592             && mutate_ok
1593         ) {
1594             join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
1595                     0, NULL, depth + 1);
1596         }
1597
1598         /* Follow the next-chain of the current node and optimize
1599            away all the NOTHINGs from it.
1600          */
1601         rck_elide_nothing(scan);
1602
1603         /* The principal pseudo-switch.  Cannot be a switch, since we look into
1604          * several different things.  */
1605         if ( OP(scan) == DEFINEP ) {
1606             SSize_t minlen = 0;
1607             SSize_t deltanext = 0;
1608             SSize_t fake_last_close = 0;
1609             regnode *fake_last_close_op = NULL;
1610             U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
1611
1612             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1613             scan = regnext(scan);
1614             assert( OP(scan) == IFTHEN );
1615             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
1616
1617             data_fake.last_closep= &fake_last_close;
1618             data_fake.last_close_opp= &fake_last_close_op;
1619             minlen = *minlenp;
1620             next = regnext(scan);
1621             scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
1622             DEBUG_PEEP("scan", scan, depth, flags);
1623             DEBUG_PEEP("next", next, depth, flags);
1624
1625             /* we suppose the run is continuous, last = next...
1626              * NOTE we dont use the return here! */
1627             /* DEFINEP study_chunk() recursion */
1628             (void)study_chunk(pRExC_state, &scan, &minlen,
1629                               &deltanext, next, &data_fake, stopparen,
1630                               recursed_depth, NULL, f, depth+1, mutate_ok);
1631
1632             scan = next;
1633         } else
1634         if (
1635             OP(scan) == BRANCH  ||
1636             OP(scan) == BRANCHJ ||
1637             OP(scan) == IFTHEN
1638         ) {
1639             next = regnext(scan);
1640             code = OP(scan);
1641
1642             /* The op(next)==code check below is to see if we
1643              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
1644              * IFTHEN is special as it might not appear in pairs.
1645              * Not sure whether BRANCH-BRANCHJ is possible, regardless
1646              * we dont handle it cleanly. */
1647             if (OP(next) == code || code == IFTHEN) {
1648                 /* NOTE - There is similar code to this block below for
1649                  * handling TRIE nodes on a re-study.  If you change stuff here
1650                  * check there too. */
1651                 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
1652                 regnode_ssc accum;
1653                 regnode * const startbranch = scan;
1654
1655                 if (flags & SCF_DO_SUBSTR) {
1656                     /* Cannot merge strings after this. */
1657                     scan_commit(pRExC_state, data, minlenp, is_inf);
1658                 }
1659
1660                 if (flags & SCF_DO_STCLASS)
1661                     ssc_init_zero(pRExC_state, &accum);
1662
1663                 while (OP(scan) == code) {
1664                     SSize_t deltanext, minnext, fake_last_close = 0;
1665                     regnode *fake_last_close_op = NULL;
1666                     U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
1667                     regnode_ssc this_class;
1668
1669                     DEBUG_PEEP("Branch", scan, depth, flags);
1670
1671                     num++;
1672                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1673                     if (data) {
1674                         data_fake.whilem_c = data->whilem_c;
1675                         data_fake.last_closep = data->last_closep;
1676                         data_fake.last_close_opp = data->last_close_opp;
1677                     }
1678                     else {
1679                         data_fake.last_closep = &fake_last_close;
1680                         data_fake.last_close_opp = &fake_last_close_op;
1681                     }
1682
1683                     data_fake.pos_delta = delta;
1684                     next = regnext(scan);
1685
1686                     scan = REGNODE_AFTER_opcode(scan, code);
1687
1688                     if (flags & SCF_DO_STCLASS) {
1689                         ssc_init(pRExC_state, &this_class);
1690                         data_fake.start_class = &this_class;
1691                         f |= SCF_DO_STCLASS_AND;
1692                     }
1693                     if (flags & SCF_WHILEM_VISITED_POS)
1694                         f |= SCF_WHILEM_VISITED_POS;
1695
1696                     /* we suppose the run is continuous, last = next...*/
1697                     /* recurse study_chunk() for each BRANCH in an alternation */
1698                     minnext = study_chunk(pRExC_state, &scan, minlenp,
1699                                       &deltanext, next, &data_fake, stopparen,
1700                                       recursed_depth, NULL, f, depth+1,
1701                                       mutate_ok);
1702
1703                     if (min1 > minnext)
1704                         min1 = minnext;
1705                     if (deltanext == OPTIMIZE_INFTY) {
1706                         is_inf = is_inf_internal = 1;
1707                         max1 = OPTIMIZE_INFTY;
1708                     } else if (max1 < minnext + deltanext)
1709                         max1 = minnext + deltanext;
1710                     scan = next;
1711                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1712                         pars++;
1713                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
1714                         if ( stopmin > minnext)
1715                             stopmin = min + min1;
1716                         flags &= ~SCF_DO_SUBSTR;
1717                         if (data)
1718                             data->flags |= SCF_SEEN_ACCEPT;
1719                     }
1720                     if (data) {
1721                         if (data_fake.flags & SF_HAS_EVAL)
1722                             data->flags |= SF_HAS_EVAL;
1723                         data->whilem_c = data_fake.whilem_c;
1724                     }
1725                     if (flags & SCF_DO_STCLASS)
1726                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
1727                     DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
1728                 }
1729                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1730                     min1 = 0;
1731                 if (flags & SCF_DO_SUBSTR) {
1732                     data->pos_min += min1;
1733                     if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
1734                         data->pos_delta = OPTIMIZE_INFTY;
1735                     else
1736                         data->pos_delta += max1 - min1;
1737                     if (max1 != min1 || is_inf)
1738                         data->cur_is_floating = 1;
1739                 }
1740                 min += min1;
1741                 if (delta == OPTIMIZE_INFTY
1742                  || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
1743                     delta = OPTIMIZE_INFTY;
1744                 else
1745                     delta += max1 - min1;
1746                 if (flags & SCF_DO_STCLASS_OR) {
1747                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
1748                     if (min1) {
1749                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
1750                         flags &= ~SCF_DO_STCLASS;
1751                     }
1752                 }
1753                 else if (flags & SCF_DO_STCLASS_AND) {
1754                     if (min1) {
1755                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
1756                         flags &= ~SCF_DO_STCLASS;
1757                     }
1758                     else {
1759                         /* Switch to OR mode: cache the old value of
1760                          * data->start_class */
1761                         INIT_AND_WITHP;
1762                         StructCopy(data->start_class, and_withp, regnode_ssc);
1763                         flags &= ~SCF_DO_STCLASS_AND;
1764                         StructCopy(&accum, data->start_class, regnode_ssc);
1765                         flags |= SCF_DO_STCLASS_OR;
1766                     }
1767                 }
1768                 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
1769
1770                 if (PERL_ENABLE_TRIE_OPTIMISATION
1771                     && OP(startbranch) == BRANCH
1772                     && mutate_ok
1773                 ) {
1774                 /* demq.
1775
1776                    Assuming this was/is a branch we are dealing with: 'scan'
1777                    now points at the item that follows the branch sequence,
1778                    whatever it is. We now start at the beginning of the
1779                    sequence and look for subsequences of
1780
1781                    BRANCH->EXACT => x1
1782                    BRANCH->EXACT => x2
1783                    tail
1784
1785                    which would be constructed from a pattern like
1786                    /A|LIST|OF|WORDS/
1787
1788                    If we can find such a subsequence we need to turn the first
1789                    element into a trie and then add the subsequent branch exact
1790                    strings to the trie.
1791
1792                    We have two cases
1793
1794                      1. patterns where the whole set of branches can be
1795                         converted.
1796
1797                      2. patterns where only a subset can be converted.
1798
1799                    In case 1 we can replace the whole set with a single regop
1800                    for the trie. In case 2 we need to keep the start and end
1801                    branches so
1802
1803                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1804                      becomes BRANCH TRIE; BRANCH X;
1805
1806                   There is an additional case, that being where there is a
1807                   common prefix, which gets split out into an EXACT like node
1808                   preceding the TRIE node.
1809
1810                   If X(1..n)==tail then we can do a simple trie, if not we make
1811                   a "jump" trie, such that when we match the appropriate word
1812                   we "jump" to the appropriate tail node. Essentially we turn
1813                   a nested if into a case structure of sorts.
1814
1815                 */
1816
1817                     int made = 0;
1818                     if (!re_trie_maxbuff) {
1819                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1820                         if (!SvIOK(re_trie_maxbuff))
1821                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1822                     }
1823                     if ( SvIV(re_trie_maxbuff)>=0  ) {
1824                         regnode *cur;
1825                         regnode *first = (regnode *)NULL;
1826                         regnode *prev = (regnode *)NULL;
1827                         regnode *tail = scan;
1828                         U8 trietype = 0;
1829                         U32 count = 0;
1830
1831                         /* var tail is used because there may be a TAIL
1832                            regop in the way. Ie, the exacts will point to the
1833                            thing following the TAIL, but the last branch will
1834                            point at the TAIL. So we advance tail. If we
1835                            have nested (?:) we may have to move through several
1836                            tails.
1837                          */
1838
1839                         while ( OP( tail ) == TAIL ) {
1840                             /* this is the TAIL generated by (?:) */
1841                             tail = regnext( tail );
1842                         }
1843
1844
1845                         DEBUG_TRIE_COMPILE_r({
1846                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
1847                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
1848                               depth+1,
1849                               "Looking for TRIE'able sequences. Tail node is ",
1850                               (UV) REGNODE_OFFSET(tail),
1851                               SvPV_nolen_const( RExC_mysv )
1852                             );
1853                         });
1854
1855                         /*
1856
1857                             Step through the branches
1858                                 cur represents each branch,
1859                                 noper is the first thing to be matched as part
1860                                       of that branch
1861                                 noper_next is the regnext() of that node.
1862
1863                             We normally handle a case like this
1864                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
1865                             support building with NOJUMPTRIE, which restricts
1866                             the trie logic to structures like /FOO|BAR/.
1867
1868                             If noper is a trieable nodetype then the branch is
1869                             a possible optimization target. If we are building
1870                             under NOJUMPTRIE then we require that noper_next is
1871                             the same as scan (our current position in the regex
1872                             program).
1873
1874                             Once we have two or more consecutive such branches
1875                             we can create a trie of the EXACT's contents and
1876                             stitch it in place into the program.
1877
1878                             If the sequence represents all of the branches in
1879                             the alternation we replace the entire thing with a
1880                             single TRIE node.
1881
1882                             Otherwise when it is a subsequence we need to
1883                             stitch it in place and replace only the relevant
1884                             branches. This means the first branch has to remain
1885                             as it is used by the alternation logic, and its
1886                             next pointer, and needs to be repointed at the item
1887                             on the branch chain following the last branch we
1888                             have optimized away.
1889
1890                             This could be either a BRANCH, in which case the
1891                             subsequence is internal, or it could be the item
1892                             following the branch sequence in which case the
1893                             subsequence is at the end (which does not
1894                             necessarily mean the first node is the start of the
1895                             alternation).
1896
1897                             TRIE_TYPE(X) is a define which maps the optype to a
1898                             trietype.
1899
1900                                 optype          |  trietype
1901                                 ----------------+-----------
1902                                 NOTHING         | NOTHING
1903                                 EXACT           | EXACT
1904                                 EXACT_REQ8      | EXACT
1905                                 EXACTFU         | EXACTFU
1906                                 EXACTFU_REQ8    | EXACTFU
1907                                 EXACTFUP        | EXACTFU
1908                                 EXACTFAA        | EXACTFAA
1909                                 EXACTL          | EXACTL
1910                                 EXACTFLU8       | EXACTFLU8
1911
1912
1913                         */
1914 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
1915                        ? NOTHING                                            \
1916                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
1917                          ? EXACT                                            \
1918                          : (     EXACTFU == (X)                             \
1919                               || EXACTFU_REQ8 == (X)                       \
1920                               || EXACTFUP == (X) )                          \
1921                            ? EXACTFU                                        \
1922                            : ( EXACTFAA == (X) )                            \
1923                              ? EXACTFAA                                     \
1924                              : ( EXACTL == (X) )                            \
1925                                ? EXACTL                                     \
1926                                : ( EXACTFLU8 == (X) )                       \
1927                                  ? EXACTFLU8                                \
1928                                  : 0 )
1929
1930                         /* dont use tail as the end marker for this traverse */
1931                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1932                             regnode * const noper = REGNODE_AFTER( cur );
1933                             U8 noper_type = OP( noper );
1934                             U8 noper_trietype = TRIE_TYPE( noper_type );
1935 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
1936                             regnode * const noper_next = regnext( noper );
1937                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1938                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
1939 #endif
1940
1941                             DEBUG_TRIE_COMPILE_r({
1942                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
1943                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
1944                                    depth+1,
1945                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
1946
1947                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
1948                                 Perl_re_printf( aTHX_  " -> %d:%s",
1949                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
1950
1951                                 if ( noper_next ) {
1952                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
1953                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
1954                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
1955                                 }
1956                                 Perl_re_printf( aTHX_  "(First == %d,Last == %d,Cur == %d,tt == %s,ntt == %s,nntt == %s)\n",
1957                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
1958                                    REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
1959                                 );
1960                             });
1961
1962                             /* Is noper a trieable nodetype that can be merged
1963                              * with the current trie (if there is one)? */
1964                             if ( noper_trietype
1965                                   &&
1966                                   (
1967                                         ( noper_trietype == NOTHING )
1968                                         || ( trietype == NOTHING )
1969                                         || ( trietype == noper_trietype )
1970                                   )
1971 #ifdef NOJUMPTRIE
1972                                   && noper_next >= tail
1973 #endif
1974                                   && count < U16_MAX)
1975                             {
1976                                 /* Handle mergable triable node Either we are
1977                                  * the first node in a new trieable sequence,
1978                                  * in which case we do some bookkeeping,
1979                                  * otherwise we update the end pointer. */
1980                                 if ( !first ) {
1981                                     first = cur;
1982                                     if ( noper_trietype == NOTHING ) {
1983 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
1984                                         regnode * const noper_next = regnext( noper );
1985                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
1986                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
1987 #endif
1988
1989                                         if ( noper_next_trietype ) {
1990                                             trietype = noper_next_trietype;
1991                                         } else if (noper_next_type)  {
1992                                             /* a NOTHING regop is 1 regop wide.
1993                                              * We need at least two for a trie
1994                                              * so we can't merge this in */
1995                                             first = NULL;
1996                                         }
1997                                     } else {
1998                                         trietype = noper_trietype;
1999                                     }
2000                                 } else {
2001                                     if ( trietype == NOTHING )
2002                                         trietype = noper_trietype;
2003                                     prev = cur;
2004                                 }
2005                                 if (first)
2006                                     count++;
2007                             } /* end handle mergable triable node */
2008                             else {
2009                                 /* handle unmergable node -
2010                                  * noper may either be a triable node which can
2011                                  * not be tried together with the current trie,
2012                                  * or a non triable node */
2013                                 if ( prev ) {
2014                                     /* If last is set and trietype is not
2015                                      * NOTHING then we have found at least two
2016                                      * triable branch sequences in a row of a
2017                                      * similar trietype so we can turn them
2018                                      * into a trie. If/when we allow NOTHING to
2019                                      * start a trie sequence this condition
2020                                      * will be required, and it isn't expensive
2021                                      * so we leave it in for now. */
2022                                     if ( trietype && trietype != NOTHING )
2023                                         make_trie( pRExC_state,
2024                                                 startbranch, first, cur, tail,
2025                                                 count, trietype, depth+1 );
2026                                     prev = NULL; /* note: we clear/update
2027                                                     first, trietype etc below,
2028                                                     so we dont do it here */
2029                                 }
2030                                 if ( noper_trietype
2031 #ifdef NOJUMPTRIE
2032                                      && noper_next >= tail
2033 #endif
2034                                 ){
2035                                     /* noper is triable, so we can start a new
2036                                      * trie sequence */
2037                                     count = 1;
2038                                     first = cur;
2039                                     trietype = noper_trietype;
2040                                 } else if (first) {
2041                                     /* if we already saw a first but the
2042                                      * current node is not triable then we have
2043                                      * to reset the first information. */
2044                                     count = 0;
2045                                     first = NULL;
2046                                     trietype = 0;
2047                                 }
2048                             } /* end handle unmergable node */
2049                         } /* loop over branches */
2050                         DEBUG_TRIE_COMPILE_r({
2051                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2052                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
2053                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2054                             Perl_re_printf( aTHX_  "(First == %d, Last == %d, Cur == %d, tt == %s)\n",
2055                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2056                                REGNODE_NAME(trietype)
2057                             );
2058
2059                         });
2060                         if ( prev && trietype ) {
2061                             if ( trietype != NOTHING ) {
2062                                 /* the last branch of the sequence was part of
2063                                  * a trie, so we have to construct it here
2064                                  * outside of the loop */
2065                                 made = make_trie( pRExC_state, startbranch,
2066                                                  first, scan, tail, count,
2067                                                  trietype, depth+1 );
2068 #ifdef TRIE_STUDY_OPT
2069                                 if ( ((made == MADE_EXACT_TRIE &&
2070                                      startbranch == first)
2071                                      || ( first_non_open == first )) &&
2072                                      depth == 0 ) {
2073                                     flags |= SCF_TRIE_RESTUDY;
2074                                     if ( startbranch == first
2075                                          && scan >= tail )
2076                                     {
2077                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
2078                                     }
2079                                 }
2080 #endif
2081                             } else {
2082                                 /* at this point we know whatever we have is a
2083                                  * NOTHING sequence/branch AND if 'startbranch'
2084                                  * is 'first' then we can turn the whole thing
2085                                  * into a NOTHING
2086                                  */
2087                                 if ( startbranch == first ) {
2088                                     regnode *opt;
2089                                     /* the entire thing is a NOTHING sequence,
2090                                      * something like this: (?:|) So we can
2091                                      * turn it into a plain NOTHING op. */
2092                                     DEBUG_TRIE_COMPILE_r({
2093                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2094                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
2095                                           depth+1,
2096                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2097
2098                                     });
2099                                     OP(startbranch)= NOTHING;
2100                                     NEXT_OFF(startbranch)= tail - startbranch;
2101                                     for ( opt = startbranch + 1; opt < tail ; opt++ )
2102                                         OP(opt)= OPTIMIZED;
2103                                 }
2104                             }
2105                         } /* end if ( prev) */
2106                     } /* TRIE_MAXBUF is non zero */
2107                 } /* do trie */
2108                 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
2109             }
2110             else
2111                 scan = REGNODE_AFTER_opcode(scan,code);
2112             continue;
2113         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
2114             I32 paren = 0;
2115             regnode *start = NULL;
2116             regnode *end = NULL;
2117             U32 my_recursed_depth = recursed_depth;
2118
2119             if (OP(scan) != SUSPEND) { /* GOSUB */
2120                 /* Do setup, note this code has side effects beyond
2121                  * the rest of this block. Specifically setting
2122                  * RExC_recurse[] must happen at least once during
2123                  * study_chunk(). */
2124                 paren = ARG1u(scan);
2125                 RExC_recurse[ARG2i(scan)] = scan;
2126                 start = REGNODE_p(RExC_open_parens[paren]);
2127                 end   = REGNODE_p(RExC_close_parens[paren]);
2128
2129                 /* NOTE we MUST always execute the above code, even
2130                  * if we do nothing with a GOSUB */
2131                 if (
2132                     ( flags & SCF_IN_DEFINE )
2133                     ||
2134                     (
2135                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
2136                         &&
2137                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
2138                     )
2139                 ) {
2140                     /* no need to do anything here if we are in a define. */
2141                     /* or we are after some kind of infinite construct
2142                      * so we can skip recursing into this item.
2143                      * Since it is infinite we will not change the maxlen
2144                      * or delta, and if we miss something that might raise
2145                      * the minlen it will merely pessimise a little.
2146                      *
2147                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
2148                      * might result in a minlen of 1 and not of 4,
2149                      * but this doesn't make us mismatch, just try a bit
2150                      * harder than we should.
2151                      *
2152                      * However we must assume this GOSUB is infinite, to
2153                      * avoid wrongly applying other optimizations in the
2154                      * enclosing scope - see GH 18096, for example.
2155                      */
2156                     is_inf = is_inf_internal = 1;
2157                     scan = regnext(scan);
2158                     continue;
2159                 }
2160
2161                 if (
2162                     !recursed_depth
2163                     || !PAREN_TEST(recursed_depth - 1, paren)
2164                 ) {
2165                     /* it is quite possible that there are more efficient ways
2166                      * to do this. We maintain a bitmap per level of recursion
2167                      * of which patterns we have entered so we can detect if a
2168                      * pattern creates a possible infinite loop. When we
2169                      * recurse down a level we copy the previous levels bitmap
2170                      * down. When we are at recursion level 0 we zero the top
2171                      * level bitmap. It would be nice to implement a different
2172                      * more efficient way of doing this. In particular the top
2173                      * level bitmap may be unnecessary.
2174                      */
2175                     if (!recursed_depth) {
2176                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
2177                     } else {
2178                         Copy(PAREN_OFFSET(recursed_depth - 1),
2179                              PAREN_OFFSET(recursed_depth),
2180                              RExC_study_chunk_recursed_bytes, U8);
2181                     }
2182                     /* we havent recursed into this paren yet, so recurse into it */
2183                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
2184                     PAREN_SET(recursed_depth, paren);
2185                     my_recursed_depth = recursed_depth + 1;
2186                 } else {
2187                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
2188                     /* some form of infinite recursion, assume infinite length
2189                      * */
2190                     if (flags & SCF_DO_SUBSTR) {
2191                         scan_commit(pRExC_state, data, minlenp, is_inf);
2192                         data->cur_is_floating = 1;
2193                     }
2194                     is_inf = is_inf_internal = 1;
2195                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2196                         ssc_anything(data->start_class);
2197                     flags &= ~SCF_DO_STCLASS;
2198
2199                     start = NULL; /* reset start so we dont recurse later on. */
2200                 }
2201             } else {
2202                 paren = stopparen;
2203                 start = scan + 2;
2204                 end = regnext(scan);
2205             }
2206             if (start) {
2207                 scan_frame *newframe;
2208                 assert(end);
2209                 if (!RExC_frame_last) {
2210                     Newxz(newframe, 1, scan_frame);
2211                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
2212                     RExC_frame_head = newframe;
2213                     RExC_frame_count++;
2214                 } else if (!RExC_frame_last->next_frame) {
2215                     Newxz(newframe, 1, scan_frame);
2216                     RExC_frame_last->next_frame= newframe;
2217                     newframe->prev_frame= RExC_frame_last;
2218                     RExC_frame_count++;
2219                 } else {
2220                     newframe = RExC_frame_last->next_frame;
2221                 }
2222                 RExC_frame_last = newframe;
2223
2224                 newframe->next_regnode = regnext(scan);
2225                 newframe->last_regnode = last;
2226                 newframe->stopparen = stopparen;
2227                 newframe->prev_recursed_depth = recursed_depth;
2228                 newframe->this_prev_frame= frame;
2229                 newframe->in_gosub = (
2230                     (frame && frame->in_gosub) || OP(scan) == GOSUB
2231                 );
2232
2233                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
2234                 DEBUG_PEEP("fnew", scan, depth, flags);
2235
2236                 frame = newframe;
2237                 scan =  start;
2238                 stopparen = paren;
2239                 last = end;
2240                 depth = depth + 1;
2241                 recursed_depth = my_recursed_depth;
2242
2243                 continue;
2244             }
2245         }
2246         else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
2247             SSize_t bytelen = STR_LEN(scan), charlen;
2248             UV uc;
2249             assert(bytelen);
2250             if (UTF) {
2251                 const U8 * const s = (U8*)STRING(scan);
2252                 uc = utf8_to_uv_or_die(s, s + bytelen, NULL);
2253                 charlen = utf8_length(s, s + bytelen);
2254             } else {
2255                 uc = *((U8*)STRING(scan));
2256                 charlen = bytelen;
2257             }
2258             min += charlen;
2259             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2260                 /* The code below prefers earlier match for fixed
2261                    offset, later match for variable offset.  */
2262                 if (data->last_end == -1) { /* Update the start info. */
2263                     data->last_start_min = data->pos_min;
2264                     data->last_start_max =
2265                         is_inf ? OPTIMIZE_INFTY
2266                         : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
2267                             ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
2268                 }
2269                 sv_catpvn(data->last_found, STRING(scan), bytelen);
2270                 if (UTF)
2271                     SvUTF8_on(data->last_found);
2272                 {
2273                     SV * const sv = data->last_found;
2274                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2275                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
2276                     if (mg && mg->mg_len >= 0)
2277                         mg->mg_len += charlen;
2278                 }
2279                 data->last_end = data->pos_min + charlen;
2280                 data->pos_min += charlen; /* As in the first entry. */
2281                 data->flags &= ~SF_BEFORE_EOL;
2282             }
2283
2284             /* ANDing the code point leaves at most it, and not in locale, and
2285              * can't match null string */
2286             if (flags & SCF_DO_STCLASS_AND) {
2287                 ssc_cp_and(data->start_class, uc);
2288                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2289                 ssc_clear_locale(data->start_class);
2290             }
2291             else if (flags & SCF_DO_STCLASS_OR) {
2292                 ssc_add_cp(data->start_class, uc);
2293                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2294
2295                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2296                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2297             }
2298             flags &= ~SCF_DO_STCLASS;
2299             DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
2300         }
2301         else if (REGNODE_TYPE(OP(scan)) == EXACT) {
2302             /* But OP != EXACT!, so is EXACTFish */
2303             SSize_t bytelen = STR_LEN(scan), charlen;
2304             const U8 * s = (U8*)STRING(scan);
2305
2306             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
2307              * with the mask set to the complement of the bit that differs
2308              * between upper and lower case, and the lowest code point of the
2309              * pair (which the '&' forces) */
2310             if (     bytelen == 1
2311                 &&   isALPHA_A(*s)
2312                 &&  (         OP(scan) == EXACTFAA
2313                      || (     OP(scan) == EXACTFU
2314                          && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
2315                 &&   mutate_ok
2316             ) {
2317                 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
2318
2319                 OP(scan) = ANYOFM;
2320                 ARG1u_SET(scan, *s & mask);
2321                 FLAGS(scan) = mask;
2322                 /* We're not EXACTFish any more, so restudy.
2323                  * Search for "restudy" in this file to find
2324                  * a comment with details. */
2325                 continue;
2326             }
2327
2328             /* Search for fixed substrings supports EXACT only. */
2329             if (flags & SCF_DO_SUBSTR) {
2330                 assert(data);
2331                 scan_commit(pRExC_state, data, minlenp, is_inf);
2332             }
2333             charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
2334             if (unfolded_multi_char) {
2335                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
2336             }
2337             min += charlen - min_subtract;
2338             assert (min >= 0);
2339             if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2340                 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2341             ) {
2342                 delta += min_subtract;
2343             } else {
2344                 delta = OPTIMIZE_INFTY;
2345             }
2346             if (flags & SCF_DO_SUBSTR) {
2347                 data->pos_min += charlen - min_subtract;
2348                 if (data->pos_min < 0) {
2349                     data->pos_min = 0;
2350                 }
2351                 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2352                     && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2353                 ) {
2354                     data->pos_delta += min_subtract;
2355                 } else {
2356                     data->pos_delta = OPTIMIZE_INFTY;
2357                 }
2358                 if (min_subtract) {
2359                     data->cur_is_floating = 1; /* float */
2360                 }
2361             }
2362
2363             if (flags & SCF_DO_STCLASS) {
2364                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
2365
2366                 assert(EXACTF_invlist);
2367                 if (flags & SCF_DO_STCLASS_AND) {
2368                     if (OP(scan) != EXACTFL)
2369                         ssc_clear_locale(data->start_class);
2370                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2371                     ANYOF_POSIXL_ZERO(data->start_class);
2372                     ssc_intersection(data->start_class, EXACTF_invlist, false);
2373                 }
2374                 else {  /* SCF_DO_STCLASS_OR */
2375                     ssc_union(data->start_class, EXACTF_invlist, false);
2376                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2377
2378                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2379                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2380                 }
2381                 flags &= ~SCF_DO_STCLASS;
2382                 SvREFCNT_dec(EXACTF_invlist);
2383             }
2384             DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
2385         }
2386         else if (REGNODE_VARIES(OP(scan))) {
2387             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
2388             I32 fl = 0;
2389             U32 f = flags;
2390             regnode * const oscan = scan;
2391             regnode_ssc this_class;
2392             regnode_ssc *oclass = NULL;
2393             I32 next_is_eval = 0;
2394
2395             switch (REGNODE_TYPE(OP(scan))) {
2396             case WHILEM:                /* End of (?:...)* . */
2397                 scan = REGNODE_AFTER(scan);
2398                 goto finish;
2399             case PLUS:
2400                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2401                     next = REGNODE_AFTER(scan);
2402                     if (   (     REGNODE_TYPE(OP(next)) == EXACT
2403                             && ! isEXACTFish(OP(next)))
2404                         || (flags & SCF_DO_STCLASS))
2405                     {
2406                         mincount = 1;
2407                         maxcount = REG_INFTY;
2408                         next = regnext(scan);
2409                         scan = REGNODE_AFTER(scan);
2410                         goto do_curly;
2411                     }
2412                 }
2413                 if (flags & SCF_DO_SUBSTR)
2414                     data->pos_min++;
2415                 /* This will bypass the formal 'min += minnext * mincount'
2416                  * calculation in the do_curly path, so assumes min width
2417                  * of the PLUS payload is exactly one. */
2418                 min++;
2419                 /* FALLTHROUGH */
2420             case STAR:
2421                 next = REGNODE_AFTER(scan);
2422
2423                 /* This temporary node can now be turned into EXACTFU, and
2424                  * must, as regexec.c doesn't handle it */
2425                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
2426                     OP(next) = EXACTFU;
2427                 }
2428
2429                 if (     STR_LEN(next) == 1
2430                     &&   isALPHA_A(* STRING(next))
2431                     && (         OP(next) == EXACTFAA
2432                         || (     OP(next) == EXACTFU
2433                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
2434                     &&   mutate_ok
2435                 ) {
2436                     /* These differ in just one bit */
2437                     U8 mask = ~ ('A' ^ 'a');
2438
2439                     assert(isALPHA_A(* STRING(next)));
2440
2441                     /* Then replace it by an ANYOFM node, with
2442                     * the mask set to the complement of the
2443                     * bit that differs between upper and lower
2444                     * case, and the lowest code point of the
2445                     * pair (which the '&' forces) */
2446                     OP(next) = ANYOFM;
2447                     ARG1u_SET(next, *STRING(next) & mask);
2448                     FLAGS(next) = mask;
2449                 }
2450
2451                 if (flags & SCF_DO_STCLASS) {
2452                     mincount = 0;
2453                     maxcount = REG_INFTY;
2454                     next = regnext(scan);
2455                     scan = REGNODE_AFTER(scan);
2456                     goto do_curly;
2457                 }
2458                 if (flags & SCF_DO_SUBSTR) {
2459                     scan_commit(pRExC_state, data, minlenp, is_inf);
2460                     /* Cannot extend fixed substrings */
2461                     data->cur_is_floating = 1; /* float */
2462                 }
2463                 is_inf = is_inf_internal = 1;
2464                 scan = regnext(scan);
2465                 goto optimize_curly_tail;
2466             case CURLY:
2467                 if (stopparen > 0 && (OP(scan) == CURLYN || OP(scan) == CURLYM)
2468                     && (FLAGS(scan) == stopparen))
2469                 {
2470                     mincount = 1;
2471                     maxcount = 1;
2472                 } else {
2473                     mincount = ARG1i(scan);
2474                     maxcount = ARG2i(scan);
2475                 }
2476                 next = regnext(scan);
2477                 if (OP(scan) == CURLYX) {
2478                     I32 lp = (data ? *(data->last_closep) : 0);
2479                     FLAGS(scan) = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2480                 }
2481                 scan = REGNODE_AFTER(scan);
2482                 next_is_eval = (OP(scan) == EVAL);
2483               do_curly:
2484                 if (flags & SCF_DO_SUBSTR) {
2485                     if (mincount == 0)
2486                         scan_commit(pRExC_state, data, minlenp, is_inf);
2487                     /* Cannot extend fixed substrings */
2488                     pos_before = data->pos_min;
2489                 }
2490                 if (data) {
2491                     fl = data->flags;
2492                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2493                     if (is_inf)
2494                         data->flags |= SF_IS_INF;
2495                 }
2496                 if (flags & SCF_DO_STCLASS) {
2497                     ssc_init(pRExC_state, &this_class);
2498                     oclass = data->start_class;
2499                     data->start_class = &this_class;
2500                     f |= SCF_DO_STCLASS_AND;
2501                     f &= ~SCF_DO_STCLASS_OR;
2502                 }
2503                 /* Exclude from super-linear cache processing any {n,m}
2504                    regops for which the combination of input pos and regex
2505                    pos is not enough information to determine if a match
2506                    will be possible.
2507
2508                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
2509                    regex pos at the \s*, the prospects for a match depend not
2510                    only on the input position but also on how many (bar\s*)
2511                    repeats into the {4,8} we are. */
2512                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
2513                     f &= ~SCF_WHILEM_VISITED_POS;
2514
2515                 /* This will finish on WHILEM, setting scan, or on NULL: */
2516                 /* recurse study_chunk() on loop bodies */
2517                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2518                                   last, data, stopparen, recursed_depth, NULL,
2519                                   (mincount == 0
2520                                    ? (f & ~SCF_DO_SUBSTR)
2521                                    : f)
2522                                   , depth+1, mutate_ok);
2523
2524                 if (data && data->flags & SCF_SEEN_ACCEPT) {
2525                     if (mincount > 1)
2526                         mincount = 1;
2527                 }
2528
2529                 if (flags & SCF_DO_STCLASS)
2530                     data->start_class = oclass;
2531                 if (mincount == 0 || minnext == 0) {
2532                     if (flags & SCF_DO_STCLASS_OR) {
2533                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2534                     }
2535                     else if (flags & SCF_DO_STCLASS_AND) {
2536                         /* Switch to OR mode: cache the old value of
2537                          * data->start_class */
2538                         INIT_AND_WITHP;
2539                         StructCopy(data->start_class, and_withp, regnode_ssc);
2540                         flags &= ~SCF_DO_STCLASS_AND;
2541                         StructCopy(&this_class, data->start_class, regnode_ssc);
2542                         flags |= SCF_DO_STCLASS_OR;
2543                         ANYOF_FLAGS(data->start_class)
2544                                                 |= SSC_MATCHES_EMPTY_STRING;
2545                     }
2546                 } else {                /* Non-zero len */
2547                     if (flags & SCF_DO_STCLASS_OR) {
2548                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2549                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2550                     }
2551                     else if (flags & SCF_DO_STCLASS_AND)
2552                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2553                     flags &= ~SCF_DO_STCLASS;
2554                 }
2555                 if (!scan)              /* It was not CURLYX, but CURLY. */
2556                     scan = next;
2557                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
2558                     /* ? quantifier ok, except for (?{ ... }) */
2559                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
2560                     && (minnext == 0) && (deltanext == 0)
2561                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2562                     && maxcount <= REG_INFTY/3) /* Complement check for big
2563                                                    count */
2564                 {
2565                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
2566                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
2567                             "Quantifier unexpected on zero-length expression "
2568                             "in regex m/%" UTF8f "/",
2569                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
2570                                   RExC_precomp)));
2571                 }
2572
2573                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
2574                     || min >= SSize_t_MAX - minnext * mincount )
2575                 {
2576                     FAIL("Regexp out of space");
2577                 }
2578
2579                 min += minnext * mincount;
2580                 is_inf_internal |= deltanext == OPTIMIZE_INFTY
2581                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
2582                 is_inf |= is_inf_internal;
2583                 if (is_inf) {
2584                     delta = OPTIMIZE_INFTY;
2585                 } else {
2586                     delta += (minnext + deltanext) * maxcount
2587                              - minnext * mincount;
2588                 }
2589
2590                 if (data && data->flags & SCF_SEEN_ACCEPT) {
2591                     if (flags & SCF_DO_SUBSTR) {
2592                         scan_commit(pRExC_state, data, minlenp, is_inf);
2593                         flags &= ~SCF_DO_SUBSTR;
2594                     }
2595                     if (stopmin > min)
2596                         stopmin = min;
2597                     DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
2598                 }
2599                 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2600                 /* Try powerful optimization CURLYX => CURLYN. */
2601                 if ( RE_OPTIMIZE_CURLYX_TO_CURLYN
2602                      && OP(oscan) == CURLYX
2603                      && data
2604                      && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2605                                                             non optimistic eval is seen
2606                                                             anywhere.*/
2607                      && ( data->flags & SF_IN_PAR ) /* has parens */
2608                      && !deltanext
2609                      && minnext == 1
2610                      && mutate_ok
2611                 ) {
2612                     DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2613                     /* Try to optimize to CURLYN.  */
2614                     regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
2615                     regnode * const nxt1 = nxt;
2616 #ifdef DEBUGGING
2617                     regnode *nxt2;
2618 #endif
2619                     /* Skip open. */
2620                     nxt = regnext(nxt);
2621                     if (!REGNODE_SIMPLE(OP(nxt))
2622                         && !(REGNODE_TYPE(OP(nxt)) == EXACT
2623                              && STR_LEN(nxt) == 1))
2624                         goto nogo;
2625 #ifdef DEBUGGING
2626                     nxt2 = nxt;
2627 #endif
2628                     nxt = regnext(nxt);
2629                     if (OP(nxt) != CLOSE)
2630                         goto nogo;
2631                     if (RExC_open_parens) {
2632
2633                         /*open->CURLYM*/
2634                         RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2635
2636                         /*close->while*/
2637                         RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
2638                     }
2639                     /* Now we know that nxt2 is the only contents: */
2640                     FLAGS(oscan) = (U8)PARNO(nxt);
2641                     OP(oscan) = CURLYN;
2642                     OP(nxt1) = NOTHING; /* was OPEN. */
2643
2644 #ifdef DEBUGGING
2645                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2646                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
2647                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
2648                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
2649                     OP(nxt + 1) = OPTIMIZED; /* was count. */
2650                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
2651 #endif
2652                 }
2653               nogo:
2654
2655                 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2656
2657                 /* Try optimization CURLYX => CURLYM. */
2658                 if ( RE_OPTIMIZE_CURLYX_TO_CURLYM
2659                      && OP(oscan) == CURLYX
2660                      && data
2661                      && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2662                                                             non optimistic eval is seen
2663                                                             anywhere.*/
2664                      && !(data->flags & SF_HAS_PAR) /* no parens! */
2665                      && !deltanext     /* atom is fixed width */
2666                      && minnext != 0  /* CURLYM can't handle zero width */
2667                          /* Nor characters whose fold at run-time may be
2668                           * multi-character */
2669                      && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN)
2670                      && mutate_ok
2671                 ) {
2672                     DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2673                     /* XXXX How to optimize if data == 0? */
2674                     /* Optimize to a simpler form.  */
2675                     regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
2676                     regnode *nxt2;
2677
2678                     OP(oscan) = CURLYM;
2679                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2680                             && (OP(nxt2) != WHILEM))
2681                         nxt = nxt2;
2682                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
2683                     /* Need to optimize away parenths. */
2684                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
2685                         /* Set the parenth number.  */
2686                         /* note that we have changed the type of oscan to CURLYM here */
2687                         regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
2688
2689                         FLAGS(oscan) = (U8)PARNO(nxt);
2690                         if (RExC_open_parens) {
2691                              /*open->CURLYM*/
2692                             RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2693
2694                             /*close->NOTHING*/
2695                             RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
2696                                                          + 1;
2697                         }
2698                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
2699                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
2700
2701 #ifdef DEBUGGING
2702                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2703                         OP(nxt + 1) = OPTIMIZED; /* was count. */
2704                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
2705                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
2706 #endif
2707 #if 0
2708                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
2709                             regnode *nnxt = regnext(nxt1);
2710                             if (nnxt == nxt) {
2711                                 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
2712                                     ARG1u_SET(nxt1, nxt2 - nxt1);
2713                                 else if (nxt2 - nxt1 < U16_MAX)
2714                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
2715                                 else
2716                                     OP(nxt) = NOTHING;  /* Cannot beautify */
2717                             }
2718                             nxt1 = nnxt;
2719                         }
2720 #endif
2721                         /* Optimize again: */
2722                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
2723                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2724                                     NULL, stopparen, recursed_depth, NULL, 0,
2725                                     depth+1, mutate_ok);
2726                     }
2727                     else
2728                         FLAGS(oscan) = 0;
2729                 }
2730                 else if ((OP(oscan) == CURLYX)
2731                          && (flags & SCF_WHILEM_VISITED_POS)
2732                          /* See the comment on a similar expression above.
2733                             However, this time it's not a subexpression
2734                             we care about, but the expression itself. */
2735                          && (maxcount == REG_INFTY)
2736                          && data) {
2737                     /* This stays as CURLYX, we can put the count/of pair. */
2738                     /* Find WHILEM (as in regexec.c) */
2739                     regnode *nxt = oscan + NEXT_OFF(oscan);
2740
2741                     if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
2742                         nxt += ARG1u(nxt);
2743                     nxt = REGNODE_BEFORE(nxt);
2744                     if (FLAGS(nxt) & 0xf) {
2745                         /* we've already set whilem count on this node */
2746                     } else if (++data->whilem_c < 16) {
2747                         assert(data->whilem_c <= RExC_whilem_seen);
2748                         FLAGS(nxt) = (U8)(data->whilem_c
2749                             | (RExC_whilem_seen << 4)); /* On WHILEM */
2750                     }
2751                 }
2752                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2753                     pars++;
2754                 if (flags & SCF_DO_SUBSTR) {
2755                     SV *last_str = NULL;
2756                     STRLEN last_chrs = 0;
2757                     int counted = mincount != 0;
2758
2759                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
2760                                                                   string. */
2761                         SSize_t b = pos_before >= data->last_start_min
2762                             ? pos_before : data->last_start_min;
2763                         STRLEN l;
2764                         const char * const s = SvPV_const(data->last_found, l);
2765                         SSize_t old = b - data->last_start_min;
2766                         assert(old >= 0);
2767
2768                         if (UTF)
2769                             old = utf8_hop_forward((U8*)s, old,
2770                                                (U8 *) SvEND(data->last_found))
2771                                 - (U8*)s;
2772                         l -= old;
2773                         /* Get the added string: */
2774                         last_str = newSVpvn_utf8(s  + old, l, UTF);
2775                         last_chrs = UTF ? utf8_length((U8*)(s + old),
2776                                             (U8*)(s + old + l)) : l;
2777                         if (deltanext == 0 && pos_before == b) {
2778                             /* What was added is a constant string */
2779                             if (mincount > 1) {
2780
2781                                 SvGROW(last_str, (mincount * l) + 1);
2782                                 repeatcpy(SvPVX(last_str) + l,
2783                                           SvPVX_const(last_str), l,
2784                                           mincount - 1);
2785                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2786                                 /* Add additional parts. */
2787                                 SvCUR_set(data->last_found,
2788                                           SvCUR(data->last_found) - l);
2789                                 sv_catsv(data->last_found, last_str);
2790                                 {
2791                                     SV * sv = data->last_found;
2792                                     MAGIC *mg =
2793                                         SvUTF8(sv) && SvMAGICAL(sv) ?
2794                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
2795                                     if (mg && mg->mg_len >= 0)
2796                                         mg->mg_len += last_chrs * (mincount-1);
2797                                 }
2798                                 last_chrs *= mincount;
2799                                 data->last_end += l * (mincount - 1);
2800                             }
2801                         } else {
2802                             /* start offset must point into the last copy */
2803                             data->last_start_min += minnext * (mincount - 1);
2804                             data->last_start_max =
2805                               is_inf
2806                                ? OPTIMIZE_INFTY
2807                                : data->last_start_max +
2808                                  (maxcount - 1) * (minnext + data->pos_delta);
2809                         }
2810                     }
2811                     /* It is counted once already... */
2812                     data->pos_min += minnext * (mincount - counted);
2813 #if 0
2814     Perl_re_printf( aTHX_  "counted = %" UVuf " deltanext = %" UVuf
2815                               " OPTIMIZE_INFTY = %" UVuf " minnext = %" UVuf
2816                               " maxcount = %" UVuf " mincount = %" UVuf
2817                               " data->pos_delta = %" UVuf "\n",
2818         (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
2819         (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
2820     if (deltanext != OPTIMIZE_INFTY)
2821         Perl_re_printf( aTHX_  "LHS = %" UVuf " RHS = %" UVuf "\n",
2822             (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
2823             - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
2824 #endif
2825                     if (deltanext == OPTIMIZE_INFTY
2826                         || data->pos_delta == OPTIMIZE_INFTY
2827                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
2828                         data->pos_delta = OPTIMIZE_INFTY;
2829                     else
2830                         data->pos_delta += - counted * deltanext +
2831                         (minnext + deltanext) * maxcount - minnext * mincount;
2832                     if (mincount != maxcount) {
2833                          /* Cannot extend fixed substrings found inside
2834                             the group.  */
2835                         scan_commit(pRExC_state, data, minlenp, is_inf);
2836                         if (mincount && last_str) {
2837                             SV * const sv = data->last_found;
2838                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2839                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2840
2841                             if (mg)
2842                                 mg->mg_len = -1;
2843                             sv_setsv(sv, last_str);
2844                             data->last_end = data->pos_min;
2845                             data->last_start_min = data->pos_min - last_chrs;
2846                             data->last_start_max = is_inf
2847                                 ? OPTIMIZE_INFTY
2848                                 : data->pos_min + data->pos_delta - last_chrs;
2849                         }
2850                         data->cur_is_floating = 1; /* float */
2851                     }
2852                     SvREFCNT_dec(last_str);
2853                 }
2854                 if (data && (fl & SF_HAS_EVAL))
2855                     data->flags |= SF_HAS_EVAL;
2856               optimize_curly_tail:
2857                 rck_elide_nothing(oscan);
2858                 continue;
2859
2860             default:
2861                 croak("panic: unexpected varying REx opcode %d",
2862                                                                     OP(scan));
2863             case REF:
2864             case CLUMP:
2865                 if (flags & SCF_DO_SUBSTR) {
2866                     /* Cannot expect anything... */
2867                     scan_commit(pRExC_state, data, minlenp, is_inf);
2868                     data->cur_is_floating = 1; /* float */
2869                 }
2870                 is_inf = is_inf_internal = 1;
2871                 if (flags & SCF_DO_STCLASS_OR) {
2872                     if (OP(scan) == CLUMP) {
2873                         /* Actually is any start char, but very few code points
2874                          * aren't start characters */
2875                         ssc_match_all_cp(data->start_class);
2876                     }
2877                     else {
2878                         ssc_anything(data->start_class);
2879                     }
2880                 }
2881                 flags &= ~SCF_DO_STCLASS;
2882                 break;
2883             }
2884         }
2885         else if (OP(scan) == LNBREAK) {
2886             if (flags & SCF_DO_STCLASS) {
2887                 if (flags & SCF_DO_STCLASS_AND) {
2888                     ssc_intersection(data->start_class,
2889                                     PL_XPosix_ptrs[CC_VERTSPACE_], false);
2890                     ssc_clear_locale(data->start_class);
2891                     ANYOF_FLAGS(data->start_class)
2892                                                 &= ~SSC_MATCHES_EMPTY_STRING;
2893                 }
2894                 else if (flags & SCF_DO_STCLASS_OR) {
2895                     ssc_union(data->start_class,
2896                               PL_XPosix_ptrs[CC_VERTSPACE_],
2897                               false);
2898                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2899
2900                     /* See commit msg for
2901                      * 749e076fceedeb708a624933726e7989f2302f6a */
2902                     ANYOF_FLAGS(data->start_class)
2903                                                 &= ~SSC_MATCHES_EMPTY_STRING;
2904                 }
2905                 flags &= ~SCF_DO_STCLASS;
2906             }
2907             min++;
2908             if (delta != OPTIMIZE_INFTY)
2909                 delta++;    /* Because of the 2 char string cr-lf */
2910             if (flags & SCF_DO_SUBSTR) {
2911                 /* Cannot expect anything... */
2912                 scan_commit(pRExC_state, data, minlenp, is_inf);
2913                 data->pos_min += 1;
2914                 if (data->pos_delta != OPTIMIZE_INFTY) {
2915                     data->pos_delta += 1;
2916                 }
2917                 data->cur_is_floating = 1; /* float */
2918             }
2919         }
2920         else if (REGNODE_SIMPLE(OP(scan))) {
2921
2922             if (flags & SCF_DO_SUBSTR) {
2923                 scan_commit(pRExC_state, data, minlenp, is_inf);
2924                 data->pos_min++;
2925             }
2926             min++;
2927             if (flags & SCF_DO_STCLASS) {
2928                 bool invert = 0;
2929                 SV* my_invlist = NULL;
2930                 U8 namedclass;
2931
2932                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2933                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2934
2935                 /* Some of the logic below assumes that switching
2936                    locale on will only add false positives. */
2937                 switch (OP(scan)) {
2938
2939                 default:
2940 #ifdef DEBUGGING
2941                    croak("panic: unexpected simple REx opcode %d",
2942                                                                      OP(scan));
2943 #endif
2944                 case SANY:
2945                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2946                         ssc_match_all_cp(data->start_class);
2947                     break;
2948
2949                 case REG_ANY:
2950                     {
2951                         SV* REG_ANY_invlist = _new_invlist(2);
2952                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
2953                                                             '\n');
2954                         if (flags & SCF_DO_STCLASS_OR) {
2955                             ssc_union(data->start_class,
2956                                       REG_ANY_invlist,
2957                                       true /* true => invert, hence all but \n
2958                                             */
2959                                       );
2960                         }
2961                         else if (flags & SCF_DO_STCLASS_AND) {
2962                             ssc_intersection(data->start_class,
2963                                              REG_ANY_invlist,
2964                                              true  /* true => invert */
2965                                              );
2966                             ssc_clear_locale(data->start_class);
2967                         }
2968                         SvREFCNT_dec_NN(REG_ANY_invlist);
2969                     }
2970                     break;
2971
2972                 case ANYOFD:
2973                 case ANYOFL:
2974                 case ANYOFPOSIXL:
2975                 case ANYOFH:
2976                 case ANYOFHb:
2977                 case ANYOFHr:
2978                 case ANYOFHs:
2979                 case ANYOF:
2980                     if (flags & SCF_DO_STCLASS_AND)
2981                         ssc_and(pRExC_state, data->start_class,
2982                                 (regnode_charclass *) scan);
2983                     else
2984                         ssc_or(pRExC_state, data->start_class,
2985                                                           (regnode_charclass *) scan);
2986                     break;
2987
2988                 case ANYOFHbbm:
2989                   {
2990                     SV* cp_list = get_ANYOFHbbm_contents(scan);
2991
2992                     if (flags & SCF_DO_STCLASS_OR) {
2993                         ssc_union(data->start_class, cp_list, invert);
2994                     }
2995                     else if (flags & SCF_DO_STCLASS_AND) {
2996                         ssc_intersection(data->start_class, cp_list, invert);
2997                     }
2998
2999                     SvREFCNT_dec_NN(cp_list);
3000                     break;
3001                   }
3002
3003                 case NANYOFM: /* NANYOFM already contains the inversion of the
3004                                  input ANYOF data, so, unlike things like
3005                                  NPOSIXA, don't change 'invert' to true */
3006                     /* FALLTHROUGH */
3007                 case ANYOFM:
3008                   {
3009                     SV* cp_list = get_ANYOFM_contents(scan);
3010
3011                     if (flags & SCF_DO_STCLASS_OR) {
3012                         ssc_union(data->start_class, cp_list, invert);
3013                     }
3014                     else if (flags & SCF_DO_STCLASS_AND) {
3015                         ssc_intersection(data->start_class, cp_list, invert);
3016                     }
3017
3018                     SvREFCNT_dec_NN(cp_list);
3019                     break;
3020                   }
3021
3022                 case ANYOFR:
3023                 case ANYOFRb:
3024                   {
3025                     SV* cp_list = NULL;
3026
3027                     cp_list = _add_range_to_invlist(cp_list,
3028                                         ANYOFRbase(scan),
3029                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
3030
3031                     if (flags & SCF_DO_STCLASS_OR) {
3032                         ssc_union(data->start_class, cp_list, invert);
3033                     }
3034                     else if (flags & SCF_DO_STCLASS_AND) {
3035                         ssc_intersection(data->start_class, cp_list, invert);
3036                     }
3037
3038                     SvREFCNT_dec_NN(cp_list);
3039                     break;
3040                   }
3041
3042                 case NPOSIXL:
3043                     invert = 1;
3044                     /* FALLTHROUGH */
3045
3046                 case POSIXL:
3047                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
3048                     if (flags & SCF_DO_STCLASS_AND) {
3049                         bool was_there = cBOOL(
3050                                           ANYOF_POSIXL_TEST(data->start_class,
3051                                                                  namedclass));
3052                         ANYOF_POSIXL_ZERO(data->start_class);
3053                         if (was_there) {    /* Do an AND */
3054                             ANYOF_POSIXL_SET(data->start_class, namedclass);
3055                         }
3056                         /* No individual code points can now match */
3057                         data->start_class->invlist
3058                                                 = sv_2mortal(_new_invlist(0));
3059                     }
3060                     else {
3061                         int complement = namedclass + ((invert) ? -1 : 1);
3062
3063                         assert(flags & SCF_DO_STCLASS_OR);
3064
3065                         /* If the complement of this class was already there,
3066                          * the result is that they match all code points,
3067                          * (\d + \D == everything).  Remove the classes from
3068                          * future consideration.  Locale is not relevant in
3069                          * this case */
3070                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
3071                             ssc_match_all_cp(data->start_class);
3072                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
3073                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
3074                         }
3075                         else {  /* The usual case; just add this class to the
3076                                    existing set */
3077                             ANYOF_POSIXL_SET(data->start_class, namedclass);
3078                         }
3079                     }
3080                     break;
3081
3082                 case NPOSIXA:   /* For these, we always know the exact set of
3083                                    what's matched */
3084                     invert = 1;
3085                     /* FALLTHROUGH */
3086                 case POSIXA:
3087                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
3088                     goto join_posix_and_ascii;
3089
3090                 case NPOSIXD:
3091                 case NPOSIXU:
3092                     invert = 1;
3093                     /* FALLTHROUGH */
3094                 case POSIXD:
3095                 case POSIXU:
3096                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
3097
3098                     /* NPOSIXD matches all upper Latin1 code points unless the
3099                      * target string being matched is UTF-8, which is
3100                      * unknowable until match time.  Since we are going to
3101                      * invert, we want to get rid of all of them so that the
3102                      * inversion will match all */
3103                     if (OP(scan) == NPOSIXD) {
3104                         _invlist_subtract(my_invlist, PL_UpperLatin1,
3105                                           &my_invlist);
3106                     }
3107
3108                   join_posix_and_ascii:
3109
3110                     if (flags & SCF_DO_STCLASS_AND) {
3111                         ssc_intersection(data->start_class, my_invlist, invert);
3112                         ssc_clear_locale(data->start_class);
3113                     }
3114                     else {
3115                         assert(flags & SCF_DO_STCLASS_OR);
3116                         ssc_union(data->start_class, my_invlist, invert);
3117                     }
3118                     SvREFCNT_dec(my_invlist);
3119                 }
3120                 if (flags & SCF_DO_STCLASS_OR)
3121                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3122                 flags &= ~SCF_DO_STCLASS;
3123             }
3124         }
3125         else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
3126             data->flags |= (OP(scan) == MEOL
3127                             ? SF_BEFORE_MEOL
3128                             : SF_BEFORE_SEOL);
3129             scan_commit(pRExC_state, data, minlenp, is_inf);
3130
3131         }
3132         else if (  REGNODE_TYPE(OP(scan)) == BRANCHJ
3133                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3134                    && (FLAGS(scan) || data || (flags & SCF_DO_STCLASS))
3135                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
3136         {
3137             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3138                 || OP(scan) == UNLESSM )
3139             {
3140                 /* Negative Lookahead/lookbehind
3141                    In this case we can't do fixed string optimisation.
3142                 */
3143
3144                 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
3145                 SSize_t deltanext, minnext;
3146                 SSize_t fake_last_close = 0;
3147                 regnode *fake_last_close_op = NULL;
3148                 regnode *cur_last_close_op;
3149                 regnode *nscan;
3150                 regnode_ssc intrnl;
3151                 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3152
3153                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3154                 if (data) {
3155                     data_fake.whilem_c = data->whilem_c;
3156                     data_fake.last_closep = data->last_closep;
3157                     data_fake.last_close_opp = data->last_close_opp;
3158                 }
3159                 else {
3160                     data_fake.last_closep = &fake_last_close;
3161                     data_fake.last_close_opp = &fake_last_close_op;
3162                 }
3163
3164                 /* remember the last_close_op we saw so we can see if
3165                  * we are dealing with variable length lookbehind that
3166                  * contains capturing buffers, which are considered
3167                  * experimental */
3168                 cur_last_close_op = *(data_fake.last_close_opp);
3169
3170                 data_fake.pos_delta = delta;
3171                 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3172                      && OP(scan) == IFMATCH ) { /* Lookahead */
3173                     ssc_init(pRExC_state, &intrnl);
3174                     data_fake.start_class = &intrnl;
3175                     f |= SCF_DO_STCLASS_AND;
3176                 }
3177                 if (flags & SCF_WHILEM_VISITED_POS)
3178                     f |= SCF_WHILEM_VISITED_POS;
3179                 next = regnext(scan);
3180                 nscan = REGNODE_AFTER(scan);
3181
3182                 /* recurse study_chunk() for lookahead body */
3183                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3184                                       last, &data_fake, stopparen,
3185                                       recursed_depth, NULL, f, depth+1,
3186                                       mutate_ok);
3187
3188                 if (FLAGS(scan)) {
3189                     if (   deltanext < 0
3190                         || deltanext > (I32) U8_MAX
3191                         || minnext > (I32)U8_MAX
3192                         || minnext + deltanext > (I32)U8_MAX)
3193                     {
3194                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
3195                               (UV)U8_MAX);
3196                     }
3197
3198                     /* The 'next_off' field has been repurposed to count the
3199                      * additional starting positions to try beyond the initial
3200                      * one.  (This leaves it at 0 for non-variable length
3201                      * matches to avoid breakage for those not using this
3202                      * extension) */
3203                     if (deltanext)  {
3204                         NEXT_OFF(scan) = deltanext;
3205                         if (
3206                             /* See a CLOSE op inside this lookbehind? */
3207                             cur_last_close_op != *(data_fake.last_close_opp)
3208                             /* and not doing restudy. see: restudied */
3209                             && !(flags & SCF_TRIE_DOING_RESTUDY)
3210                         ) {
3211                             /* this is positive variable length lookbehind with
3212                              * capture buffers inside of it */
3213                             ckWARNexperimental_with_arg(RExC_parse,
3214                                 WARN_EXPERIMENTAL__VLB,
3215                                 "Variable length %s lookbehind with capturing is experimental",
3216                                 is_positive ? "positive" : "negative");
3217                         }
3218                     }
3219                     FLAGS(scan) = (U8)minnext + deltanext;
3220                 }
3221                 if (data) {
3222                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3223                         pars++;
3224                     if (data_fake.flags & SF_HAS_EVAL)
3225                         data->flags |= SF_HAS_EVAL;
3226                     data->whilem_c = data_fake.whilem_c;
3227                 }
3228                 if (f & SCF_DO_STCLASS_AND) {
3229                     if (flags & SCF_DO_STCLASS_OR) {
3230                         /* OR before, AND after: ideally we would recurse with
3231                          * data_fake to get the AND applied by study of the
3232                          * remainder of the pattern, and then derecurse;
3233                          * *** HACK *** for now just treat as "no information".
3234                          * See [perl #56690].
3235                          */
3236                         ssc_init(pRExC_state, data->start_class);
3237                     }  else {
3238                         /* AND before and after: combine and continue.  These
3239                          * assertions are zero-length, so can match an EMPTY
3240                          * string */
3241                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3242                         ANYOF_FLAGS(data->start_class)
3243                                                    |= SSC_MATCHES_EMPTY_STRING;
3244                     }
3245                 }
3246                 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
3247             }
3248 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3249             else {
3250                 /* Positive Lookahead/lookbehind
3251                    In this case we can do fixed string optimisation,
3252                    but we must be careful about it. Note in the case of
3253                    lookbehind the positions will be offset by the minimum
3254                    length of the pattern, something we won't know about
3255                    until after the recurse.
3256                 */
3257                 SSize_t deltanext, fake_last_close = 0;
3258                 regnode *last_close_op = NULL;
3259                 regnode *nscan;
3260                 regnode_ssc intrnl;
3261                 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3262                 /* We use SAVEFREEPV so that when the full compile
3263                     is finished perl will clean up the allocated
3264                     minlens when it's all done. This way we don't
3265                     have to worry about freeing them when we know
3266                     they wont be used, which would be a pain.
3267                  */
3268                 SSize_t *minnextp;
3269                 Newx( minnextp, 1, SSize_t );
3270                 SAVEFREEPV(minnextp);
3271
3272                 if (data) {
3273                     StructCopy(data, &data_fake, scan_data_t);
3274                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3275                         f |= SCF_DO_SUBSTR;
3276                         if (FLAGS(scan))
3277                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
3278                         data_fake.last_found = newSVsv(data->last_found);
3279                     }
3280                 }
3281                 else {
3282                     data_fake.last_closep = &fake_last_close;
3283                     data_fake.last_close_opp = &fake_last_close_opp;
3284                 }
3285                 data_fake.flags = 0;
3286                 data_fake.substrs[0].flags = 0;
3287                 data_fake.substrs[1].flags = 0;
3288                 data_fake.pos_delta = delta;
3289                 if (is_inf)
3290                     data_fake.flags |= SF_IS_INF;
3291                 if ( flags & SCF_DO_STCLASS && !FLAGS(scan)
3292                      && OP(scan) == IFMATCH ) { /* Lookahead */
3293                     ssc_init(pRExC_state, &intrnl);
3294                     data_fake.start_class = &intrnl;
3295                     f |= SCF_DO_STCLASS_AND;
3296                 }
3297                 if (flags & SCF_WHILEM_VISITED_POS)
3298                     f |= SCF_WHILEM_VISITED_POS;
3299                 next = regnext(scan);
3300                 nscan = REGNODE_AFTER(scan);
3301
3302                 /* positive lookahead study_chunk() recursion */
3303                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
3304                                         &deltanext, last, &data_fake,
3305                                         stopparen, recursed_depth, NULL,
3306                                         f, depth+1, mutate_ok);
3307                 if (FLAGS(scan)) {
3308                     assert(0);  /* This code has never been tested since this
3309                                    is normally not compiled */
3310                     if (   deltanext < 0
3311                         || deltanext > (I32) U8_MAX
3312                         || *minnextp > (I32)U8_MAX
3313                         || *minnextp + deltanext > (I32)U8_MAX)
3314                     {
3315                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
3316                               (UV)U8_MAX);
3317                     }
3318
3319                     if (deltanext) {
3320                         NEXT_OFF(scan) = deltanext;
3321                     }
3322                     FLAGS(scan) = (U8)*minnextp + deltanext;
3323                 }
3324
3325                 *minnextp += min;
3326
3327                 if (f & SCF_DO_STCLASS_AND) {
3328                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3329                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
3330                 }
3331                 if (data) {
3332                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3333                         pars++;
3334                     if (data_fake.flags & SF_HAS_EVAL)
3335                         data->flags |= SF_HAS_EVAL;
3336                     data->whilem_c = data_fake.whilem_c;
3337                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3338                         int i;
3339                         if (RExC_rx->minlen < *minnextp)
3340                             RExC_rx->minlen = *minnextp;
3341                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
3342                         SvREFCNT_dec_NN(data_fake.last_found);
3343
3344                         for (i = 0; i < 2; i++) {
3345                             if (data_fake.substrs[i].minlenp != minlenp) {
3346                                 data->substrs[i].min_offset =
3347                                             data_fake.substrs[i].min_offset;
3348                                 data->substrs[i].max_offset =
3349                                             data_fake.substrs[i].max_offset;
3350                                 data->substrs[i].minlenp =
3351                                             data_fake.substrs[i].minlenp;
3352                                 data->substrs[i].lookbehind += FLAGS(scan);
3353                             }
3354                         }
3355                     }
3356                 }
3357             }
3358 #endif
3359         }
3360         else if (OP(scan) == OPEN) {
3361             if (stopparen != (I32)PARNO(scan))
3362                 pars++;
3363         }
3364         else if (OP(scan) == CLOSE) {
3365             if (stopparen == (I32)PARNO(scan)) {
3366                 break;
3367             }
3368             if ((I32)PARNO(scan) == is_par) {
3369                 next = regnext(scan);
3370
3371                 if ( next && (OP(next) != WHILEM) && next < last)
3372                     is_par = 0;         /* Disable optimization */
3373             }
3374             if (data) {
3375                 *(data->last_closep) = PARNO(scan);
3376                 *(data->last_close_opp) = scan;
3377             }
3378         }
3379         else if (OP(scan) == EVAL) {
3380             if (data && !(FLAGS(scan) & EVAL_OPTIMISTIC_FLAG) )
3381                 data->flags |= SF_HAS_EVAL;
3382         }
3383         else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
3384             if (flags & SCF_DO_SUBSTR) {
3385                 scan_commit(pRExC_state, data, minlenp, is_inf);
3386                 flags &= ~SCF_DO_SUBSTR;
3387             }
3388             if (OP(scan)==ACCEPT) {
3389                 /* m{(*ACCEPT)x} does not have to start with 'x' */
3390                 flags &= ~SCF_DO_STCLASS;
3391                 if (data)
3392                     data->flags |= SCF_SEEN_ACCEPT;
3393                 if (stopmin > min)
3394                     stopmin = min;
3395             }
3396         }
3397         else if (OP(scan) == COMMIT) {
3398             /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
3399              * must not end up with "abcxyz" as a fixed substring else we'll
3400              * skip straight to attempting to match at offset 4.
3401              */
3402             if (flags & SCF_DO_SUBSTR) {
3403                 scan_commit(pRExC_state, data, minlenp, is_inf);
3404                 flags &= ~SCF_DO_SUBSTR;
3405             }
3406         }
3407         else if (OP(scan) == LOGICAL && FLAGS(scan) == 2) /* Embedded follows */
3408         {
3409                 if (flags & SCF_DO_SUBSTR) {
3410                     scan_commit(pRExC_state, data, minlenp, is_inf);
3411                     data->cur_is_floating = 1; /* float */
3412                 }
3413                 is_inf = is_inf_internal = 1;
3414                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3415                     ssc_anything(data->start_class);
3416                 flags &= ~SCF_DO_STCLASS;
3417         }
3418         else if (OP(scan) == GPOS) {
3419             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
3420                 !(delta || is_inf || (data && data->pos_delta)))
3421             {
3422                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
3423                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
3424                 if (RExC_rx->gofs < (STRLEN)min)
3425                     RExC_rx->gofs = min;
3426             } else {
3427                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
3428                 RExC_rx->gofs = 0;
3429             }
3430         }
3431 #ifdef TRIE_STUDY_OPT
3432 #ifdef FULL_TRIE_STUDY
3433         else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3434             /* NOTE - There is similar code to this block above for handling
3435                BRANCH nodes on the initial study.  If you change stuff here
3436                check there too. */
3437             regnode *trie_node= scan;
3438             regnode *tail= regnext(scan);
3439             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3440             SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
3441             regnode_ssc accum;
3442
3443             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
3444                 /* Cannot merge strings after this. */
3445                 scan_commit(pRExC_state, data, minlenp, is_inf);
3446             }
3447             if (flags & SCF_DO_STCLASS)
3448                 ssc_init_zero(pRExC_state, &accum);
3449
3450             if (!trie->jump) {
3451                 min1 = trie->minlen;
3452                 max1 = trie->maxlen;
3453             } else {
3454                 const regnode *nextbranch= NULL;
3455                 U32 word;
3456
3457                 for ( word = 1 ; word <= trie->wordcount ; word++)
3458                 {
3459                     SSize_t deltanext = 0, minnext = 0;
3460                     U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3461                     SSize_t fake_last_close = 0;
3462                     regnode *fake_last_close_op = NULL;
3463                     regnode_ssc this_class;
3464
3465                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3466                     if (data) {
3467                         data_fake.whilem_c = data->whilem_c;
3468                         data_fake.last_closep = data->last_closep;
3469                         data_fake.last_close_opp = data->last_close_opp;
3470                     }
3471                     else {
3472                         data_fake.last_closep = &fake_last_close;
3473                         data_fake.last_close_opp = &fake_last_close_op;
3474                     }
3475                     data_fake.pos_delta = delta;
3476                     if (flags & SCF_DO_STCLASS) {
3477                         ssc_init(pRExC_state, &this_class);
3478                         data_fake.start_class = &this_class;
3479                         f |= SCF_DO_STCLASS_AND;
3480                     }
3481                     if (flags & SCF_WHILEM_VISITED_POS)
3482                         f |= SCF_WHILEM_VISITED_POS;
3483
3484                     if (trie->jump[word]) {
3485                         if (!nextbranch)
3486                             nextbranch = trie_node + trie->jump[0];
3487                         scan = trie_node + trie->jump[word];
3488                         /* We go from the jump point to the branch that follows
3489                            it. Note this means we need the vestigal unused
3490                            branches even though they arent otherwise used. */
3491                         /* optimise study_chunk() for TRIE */
3492                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3493                             &deltanext, (regnode *)nextbranch, &data_fake,
3494                             stopparen, recursed_depth, NULL, f, depth+1,
3495                             mutate_ok);
3496                     }
3497                     if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
3498                         nextbranch = regnext((regnode*)nextbranch);
3499
3500                     if (min1 > (SSize_t)(minnext + trie->minlen))
3501                         min1 = minnext + trie->minlen;
3502                     if (deltanext == OPTIMIZE_INFTY) {
3503                         is_inf = is_inf_internal = 1;
3504                         max1 = OPTIMIZE_INFTY;
3505                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
3506                         max1 = minnext + deltanext + trie->maxlen;
3507
3508                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3509                         pars++;
3510                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3511                         if ( stopmin > min + min1)
3512                             stopmin = min + min1;
3513                         flags &= ~SCF_DO_SUBSTR;
3514                         if (data)
3515                             data->flags |= SCF_SEEN_ACCEPT;
3516                     }
3517                     if (data) {
3518                         if (data_fake.flags & SF_HAS_EVAL)
3519                             data->flags |= SF_HAS_EVAL;
3520                         data->whilem_c = data_fake.whilem_c;
3521                     }
3522                     if (flags & SCF_DO_STCLASS)
3523                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
3524                 }
3525                 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
3526             }
3527             if (flags & SCF_DO_SUBSTR) {
3528                 data->pos_min += min1;
3529                 data->pos_delta += max1 - min1;
3530                 if (max1 != min1 || is_inf)
3531                     data->cur_is_floating = 1; /* float */
3532             }
3533             min += min1;
3534             if (delta != OPTIMIZE_INFTY) {
3535                 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
3536                     delta += max1 - min1;
3537                 else
3538                     delta = OPTIMIZE_INFTY;
3539             }
3540             if (flags & SCF_DO_STCLASS_OR) {
3541                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3542                 if (min1) {
3543                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3544                     flags &= ~SCF_DO_STCLASS;
3545                 }
3546             }
3547             else if (flags & SCF_DO_STCLASS_AND) {
3548                 if (min1) {
3549                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3550                     flags &= ~SCF_DO_STCLASS;
3551                 }
3552                 else {
3553                     /* Switch to OR mode: cache the old value of
3554                      * data->start_class */
3555                     INIT_AND_WITHP;
3556                     StructCopy(data->start_class, and_withp, regnode_ssc);
3557                     flags &= ~SCF_DO_STCLASS_AND;
3558                     StructCopy(&accum, data->start_class, regnode_ssc);
3559                     flags |= SCF_DO_STCLASS_OR;
3560                 }
3561             }
3562             scan = tail;
3563             DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
3564             continue;
3565         }
3566 #else
3567         else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3568             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG1u(scan) ];
3569             U8 *bang = NULL;
3570
3571             min += trie->minlen;
3572             delta += (trie->maxlen - trie->minlen);
3573             flags &= ~SCF_DO_STCLASS; /* xxx */
3574             if (flags & SCF_DO_SUBSTR) {
3575                 /* Cannot expect anything... */
3576                 scan_commit(pRExC_state, data, minlenp, is_inf);
3577                 data->pos_min += trie->minlen;
3578                 data->pos_delta += (trie->maxlen - trie->minlen);
3579                 if (trie->maxlen != trie->minlen)
3580                     data->cur_is_floating = 1; /* float */
3581             }
3582             if (trie->jump) /* no more substrings -- for now /grr*/
3583                flags &= ~SCF_DO_SUBSTR;
3584         }
3585
3586 #endif /* old or new */
3587 #endif /* TRIE_STUDY_OPT */
3588
3589         else if (OP(scan) == REGEX_SET) {
3590             croak("panic: %s regnode should be resolved"
3591                              " before optimization", REGNODE_NAME(REGEX_SET));
3592         }
3593
3594         /* Else: zero-length, ignore. */
3595         scan = regnext(scan);
3596     }
3597
3598   finish:
3599     if (frame) {
3600         /* we need to unwind recursion. */
3601         depth = depth - 1;
3602
3603         DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
3604         DEBUG_PEEP("fend", scan, depth, flags);
3605
3606         /* restore previous context */
3607         last = frame->last_regnode;
3608         scan = frame->next_regnode;
3609         stopparen = frame->stopparen;
3610         recursed_depth = frame->prev_recursed_depth;
3611
3612         RExC_frame_last = frame->prev_frame;
3613         frame = frame->this_prev_frame;
3614         goto fake_study_recurse;
3615     }
3616
3617     assert(!frame);
3618     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
3619
3620     /* is this pattern infinite? Eg, consider /(a|b+)/ */
3621     if (is_inf_internal)
3622         delta = OPTIMIZE_INFTY;
3623
3624     /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
3625     if (min > stopmin) {
3626         /*
3627         At this point 'min' represents the minimum length string we can
3628         match while *ignoring* the implication of ACCEPT, and 'delta'
3629         represents the difference between the minimum length and maximum
3630         length, and if the pattern matches an infinitely long string
3631         (consider the + and * quantifiers) then we use the special delta
3632         value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
3633         minimum length that can be matched *and* accepted.
3634
3635         A pattern is accepted when matching was successful *and*
3636         complete, and thus there is no further matching needing to be
3637         done, no backtracking to occur, etc. Prior to the introduction
3638         of ACCEPT the only opcode that signaled acceptance was the END
3639         opcode, which is always the very last opcode in a regex program.
3640         ACCEPT is thus conceptually an early successful return out of
3641         the matching process. stopmin starts out as OPTIMIZE_INFTY to
3642         represent "the entire pattern", and is ratched down to the
3643         "current min" if necessary when an ACCEPT opcode is encountered.
3644
3645         Thus stopmin might be smaller than min if we saw an (*ACCEPT),
3646         and we now need to account for it in both min and delta.
3647         Consider that in a pattern /AB/ normally the min length it can
3648         match can be computed as min(A)+min(B). But (*ACCEPT) means
3649         that it might be something else, not even neccesarily min(A) at
3650         all. Consider
3651
3652              A  = /(foo(*ACCEPT)|x+)/
3653              B  = /whop/
3654              AB = /(foo(*ACCEPT)|x+)whop/
3655
3656         The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
3657         for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
3658         "whop", and the delta of 0 as the pattern is of fixed length, the
3659         stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
3660         When handling AB we expect to see a min of 5 for "xwhop", and a
3661         delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
3662         for "foo". This should result in a final min of 3 for "foo", and
3663         a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
3664
3665         In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
3666         min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
3667         stop min would be 4 for "dude". This should result in a final
3668         min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
3669
3670         When min is smaller than stopmin then we can ignore it. In the
3671         fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
3672         and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
3673         the ACCEPT doesn't reduce the minimum length of the string that
3674         might be matched, nor affect the maximum length.
3675
3676         In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
3677         for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
3678         "foo". We currently turn this into a min of 3 for "foo" and a
3679         delta of 3 for "foobar" even though technically "foobar" isn't
3680         possible. ACCEPT affects some aspects of the optimizer, like
3681         length computations and mandatory substring optimizations, but
3682         there are other optimzations this routine perfoms that are not
3683         affected and this compromise simplifies implementation.
3684
3685         It might be helpful to consider that this C function is called
3686         recursively on the pattern in a bottom up fashion, and that the
3687         min returned by a nested call may be marked as coming from an
3688         ACCEPT, causing its callers to treat the returned min as a
3689         stopmin as the recursion unwinds. Thus a single ACCEPT can affect
3690         multiple calls into this function in different ways.
3691         */
3692
3693         if (OPTIMIZE_INFTY - delta >= min - stopmin)
3694             delta += min - stopmin;
3695         else
3696             delta = OPTIMIZE_INFTY;
3697         min = stopmin;
3698     }
3699
3700     *scanp = scan;
3701     *deltap = delta;
3702
3703     if (flags & SCF_DO_SUBSTR && is_inf)
3704         data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
3705     if (is_par > (I32)U8_MAX)
3706         is_par = 0;
3707     if (is_par && pars == 1 && data) {
3708         data->flags |= SF_IN_PAR;
3709         data->flags &= ~SF_HAS_PAR;
3710     }
3711     else if (pars && data) {
3712         data->flags |= SF_HAS_PAR;
3713         data->flags &= ~SF_IN_PAR;
3714     }
3715     if (flags & SCF_DO_STCLASS_OR)
3716         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3717     if (flags & SCF_TRIE_RESTUDY)
3718         data->flags |=  SCF_TRIE_RESTUDY;
3719
3720
3721     if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
3722         if (min > OPTIMIZE_INFTY - delta)
3723             RExC_maxlen = OPTIMIZE_INFTY;
3724         else if (RExC_maxlen < min + delta)
3725             RExC_maxlen = min + delta;
3726     }
3727     DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
3728     return min;
3729 }