diff options
author | Tom Lane | 2012-02-24 06:40:18 +0000 |
---|---|---|
committer | Tom Lane | 2012-02-24 06:41:03 +0000 |
commit | 173e29aa5deefd9e71c183583ba37805c8102a72 (patch) | |
tree | f7e997faabaa7aa3e19e00ee561096404817d092 /src/backend/regex/regcomp.c | |
parent | 0c9e5d5e0d407013bf66af01942a7b2dd3342546 (diff) |
Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression that
is quantified have never worked in Spencer's regex engine, because
he used a compile-time transformation that neglected the need to
check the back-reference match in iterations before the last one.
(That was okay for capturing parens, and we still do it if the
regex has *only* capturing parens ... but it's not okay for backrefs.)
To make this work properly, we have to add an "iteration" node type
to the regex engine's vocabulary of sub-regex nodes. Since this is a
moderately large change with a fair risk of introducing new bugs of its
own, apply to HEAD only, even though it's a fix for a longstanding bug.
Diffstat (limited to 'src/backend/regex/regcomp.c')
-rw-r--r-- | src/backend/regex/regcomp.c | 88 |
1 files changed, 52 insertions, 36 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 6b80140e909..b84d0c3af55 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1036,11 +1036,17 @@ parseqatom(struct vars * v, /*---------- * Prepare a general-purpose state skeleton. * - * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] - * / / - * [lp] ----> [s2] ----bypass--------------------- + * In the no-backrefs case, we want this: * - * where bypass is an empty, and prefix is some repetitions of atom + * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp] + * + * where prefix is some repetitions of atom. In the general case we need + * + * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp] + * + * where the iterator wraps around [begin] ---atom---> [end] + * + * We make the s state here for both cases; s2 is made below if needed *---------- */ s = newstate(v->nfa); /* first, new endpoints for the atom */ @@ -1051,11 +1057,9 @@ parseqatom(struct vars * v, NOERR(); atom->begin = s; atom->end = s2; - s = newstate(v->nfa); /* and spots for prefix and bypass */ - s2 = newstate(v->nfa); + s = newstate(v->nfa); /* set up starting state */ NOERR(); EMPTYARC(lp, s); - EMPTYARC(lp, s2); NOERR(); /* break remaining subRE into x{...} and what follows */ @@ -1089,28 +1093,9 @@ parseqatom(struct vars * v, } /* - * It's quantifier time. If the atom is just a BACKREF, we'll let it deal - * with quantifiers internally. Otherwise, the first step is to turn - * x{0,...} into x{1,...}|empty + * It's quantifier time. If the atom is just a backref, we'll let it deal + * with quantifiers internally. */ - if (m == 0 && atomtype != BACKREF) - { - EMPTYARC(s2, atom->end); /* the bypass */ - assert(PREF(qprefer) != 0); - f = COMBINE(qprefer, atom->flags); - t = subre(v, '|', f, lp, atom->end); - NOERR(); - t->left = atom; - t->right = subre(v, '|', PREF(f), s2, atom->end); - NOERR(); - t->right->left = subre(v, '=', 0, s2, atom->end); - NOERR(); - *atomp = t; - atomp = &t->left; - m = 1; - } - - /* deal with the rest of the quantifier */ if (atomtype == BACKREF) { /* special case: backrefs have internal quantifiers */ @@ -1120,17 +1105,25 @@ parseqatom(struct vars * v, atom->min = (short) m; atom->max = (short) n; atom->flags |= COMBINE(qprefer, atom->flags); + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; } else if (m == 1 && n == 1) { /* no/vacuous quantifier: done */ EMPTYARC(s, atom->begin); /* empty prefix */ + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; } - else + else if (m > 0 && !(atom->flags & BACKR)) { /* - * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the - * second x + * If there's no backrefs involved, we can turn x{m,n} into + * x{m-1,n-1}x, with capturing parens in only the second x. This + * is valid because we only care about capturing matches from the + * final iteration of the quantifier. It's a win because we can + * implement the backref-free left side as a plain DFA node, since + * we don't really care where its submatches are. */ dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); assert(m >= 1 && m != INFINITY && n >= 1); @@ -1142,16 +1135,36 @@ parseqatom(struct vars * v, NOERR(); t->right = atom; *atomp = t; + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else + { + /* general case: need an iteration node */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, atom->end, s2); + NOERR(); + dupnfa(v->nfa, atom->begin, atom->end, s, s2); + repeat(v, s, s2, m, n); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '*', f, s, s2); + NOERR(); + t->min = (short) m; + t->max = (short) n; + t->left = atom; + *atomp = t; + /* rest of branch is to be strung from iteration's end state */ } /* and finally, look after that postponed recursion */ t = top->right; if (!(SEE('|') || SEE(stopper) || SEE(EOS))) - t->right = parsebranch(v, stopper, type, atom->end, rp, 1); + t->right = parsebranch(v, stopper, type, s2, rp, 1); else { - EMPTYARC(atom->end, rp); - t->right = subre(v, '=', 0, atom->end, rp); + EMPTYARC(s2, rp); + t->right = subre(v, '=', 0, s2, rp); } assert(SEE('|') || SEE(stopper) || SEE(EOS)); t->flags |= COMBINE(t->flags, t->right->flags); @@ -1214,6 +1227,9 @@ scannum(struct vars * v) /* * repeat - replicate subNFA for quantifiers * + * The sub-NFA strung from lp to rp is modified to represent m to n + * repetitions of its initial contents. + * * The duplication sequences used here are chosen carefully so that any * pointers starting out pointing into the subexpression end up pointing into * the last occurrence. (Note that it may not be strung between the same @@ -1229,7 +1245,7 @@ repeat(struct vars * v, int n) { #define SOME 2 -#define INF 3 +#define INF 3 #define PAIR(x, y) ((x)*4 + (y)) #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) const int rm = REDUCE(m); @@ -1603,7 +1619,7 @@ subre(struct vars * v, v->treechain = ret; } - assert(strchr("|.b(=", op) != NULL); + assert(strchr("=b|.*(", op) != NULL); ret->op = op; ret->flags = flags; |