summaryrefslogtreecommitdiff
path: root/src/backend/parser/parser.c
diff options
context:
space:
mode:
author Tom Lane 2015-02-24 22:53:42 +0000
committer Tom Lane 2015-02-24 22:53:45 +0000
commit d809fd0008a2e26de463f47b7aba0365264078f3 (patch)
tree d965a051f8ad5ef8d6579408cd13e392f6701de2 /src/backend/parser/parser.c
parent 23a78352c0a0dc21d6120bd868f0b2d07395b537 (diff)
Improve parser's one-extra-token lookahead mechanism.
There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way. In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple of extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster. I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs. I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST, unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, e.g., "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than on the basis of the previous one.
Diffstat (limited to 'src/backend/parser/parser.c')
-rw-r--r-- src/backend/parser/parser.c 105
1 files changed, 58 insertions, 47 deletions
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index db49275e00a..b17771d4cca 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -64,13 +64,13 @@ raw_parser(const char *str)
/*
* Intermediate filter between parser and core lexer (core_yylex in scan.l).
*
- * The filter is needed because in some cases the standard SQL grammar
+ * This filter is needed because in some cases the standard SQL grammar
* requires more than one token lookahead. We reduce these cases to one-token
- * lookahead by combining tokens here, in order to keep the grammar LALR(1).
+ * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
*
* Using a filter is simpler than trying to recognize multiword tokens
* directly in scan.l, because we'd have to allow for comments between the
- * words. Furthermore it's not clear how to do it without re-introducing
+ * words. Furthermore it's not clear how to do that without re-introducing
* scanner backtrack, which would cost more performance than this filter
* layer does.
*
@@ -84,7 +84,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
int cur_token;
int next_token;
- core_YYSTYPE cur_yylval;
+ int cur_token_length;
YYLTYPE cur_yylloc;
/* Get next token --- we might already have it */
@@ -93,74 +93,85 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
cur_token = yyextra->lookahead_token;
lvalp->core_yystype = yyextra->lookahead_yylval;
*llocp = yyextra->lookahead_yylloc;
+ *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
yyextra->have_lookahead = false;
}
else
cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
- /* Do we need to look ahead for a possible multiword token? */
+ /*
+ * If this token isn't one that requires lookahead, just return it. If it
+ * does, determine the token length. (We could get that via strlen(), but
+ * since we have such a small set of possibilities, hardwiring seems
+ * feasible and more efficient.)
+ */
switch (cur_token)
{
case NULLS_P:
+ cur_token_length = 5;
+ break;
+ case WITH:
+ cur_token_length = 4;
+ break;
+ default:
+ return cur_token;
+ }
- /*
- * NULLS FIRST and NULLS LAST must be reduced to one token
- */
- cur_yylval = lvalp->core_yystype;
- cur_yylloc = *llocp;
- next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
+ /*
+ * Identify end+1 of current token. core_yylex() has temporarily stored a
+ * '\0' here, and will undo that when we call it again. We need to redo
+ * it to fully revert the lookahead call for error reporting purposes.
+ */
+ yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
+ *llocp + cur_token_length;
+ Assert(*(yyextra->lookahead_end) == '\0');
+
+ /*
+ * Save and restore *llocp around the call. It might look like we could
+ * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
+ * does not work because flex actually holds onto the last-passed pointer
+ * internally, and will use that for error reporting. We need any error
+ * reports to point to the current token, not the next one.
+ */
+ cur_yylloc = *llocp;
+
+ /* Get next token, saving outputs into lookahead variables */
+ next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
+ yyextra->lookahead_token = next_token;
+ yyextra->lookahead_yylloc = *llocp;
+
+ *llocp = cur_yylloc;
+
+ /* Now revert the un-truncation of the current token */
+ yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
+ *(yyextra->lookahead_end) = '\0';
+
+ yyextra->have_lookahead = true;
+
+ /* Replace cur_token if needed, based on lookahead */
+ switch (cur_token)
+ {
+ case NULLS_P:
+ /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
switch (next_token)
{
case FIRST_P:
- cur_token = NULLS_FIRST;
- break;
case LAST_P:
- cur_token = NULLS_LAST;
- break;
- default:
- /* save the lookahead token for next time */
- yyextra->lookahead_token = next_token;
- yyextra->lookahead_yylval = lvalp->core_yystype;
- yyextra->lookahead_yylloc = *llocp;
- yyextra->have_lookahead = true;
- /* and back up the output info to cur_token */
- lvalp->core_yystype = cur_yylval;
- *llocp = cur_yylloc;
+ cur_token = NULLS_LA;
break;
}
break;
case WITH:
-
- /*
- * WITH TIME and WITH ORDINALITY must each be reduced to one token
- */
- cur_yylval = lvalp->core_yystype;
- cur_yylloc = *llocp;
- next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
+ /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
switch (next_token)
{
case TIME:
- cur_token = WITH_TIME;
- break;
case ORDINALITY:
- cur_token = WITH_ORDINALITY;
- break;
- default:
- /* save the lookahead token for next time */
- yyextra->lookahead_token = next_token;
- yyextra->lookahead_yylval = lvalp->core_yystype;
- yyextra->lookahead_yylloc = *llocp;
- yyextra->have_lookahead = true;
- /* and back up the output info to cur_token */
- lvalp->core_yystype = cur_yylval;
- *llocp = cur_yylloc;
+ cur_token = WITH_LA;
break;
}
break;
-
- default:
- break;
}
return cur_token;