From: David Rowley Date: Mon, 8 Jul 2024 21:54:59 +0000 (+1200) Subject: Teach planner how to estimate rows for timestamp generate_series X-Git-Tag: REL_18_BETA1~2443 X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=036bdcec9f9998a6e50711fadce69e482ff18f55;p=postgresql.git Teach planner how to estimate rows for timestamp generate_series This provides the planner with row estimates for generate_series(TIMESTAMP, TIMESTAMP, INTERVAL), generate_series(TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL) and generate_series(TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL, TEXT) when the input parameter values can be estimated during planning. Author: David Rowley Reviewed-by: jian he Discussion: https://postgr.es/m/CAApHDvrBE%3D%2BASo_sGYmQJ3GvO8GPvX5yxXhRS%3Dt_ybd4odFkhQ%40mail.gmail.com --- diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index cdc7e43b930..69fe7860ede 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -27,6 +27,7 @@ #include "funcapi.h" #include "libpq/pqformat.h" #include "miscadmin.h" +#include "optimizer/optimizer.h" #include "nodes/nodeFuncs.h" #include "nodes/supportnodes.h" #include "parser/scansup.h" @@ -6680,6 +6681,93 @@ generate_series_timestamptz_at_zone(PG_FUNCTION_ARGS) return generate_series_timestamptz_internal(fcinfo); } +/* + * Planner support function for generate_series(timestamp, timestamp, interval) + */ +Datum +generate_series_timestamp_support(PG_FUNCTION_ARGS) +{ + Node *rawreq = (Node *) PG_GETARG_POINTER(0); + Node *ret = NULL; + + if (IsA(rawreq, SupportRequestRows)) + { + /* Try to estimate the number of rows returned */ + SupportRequestRows *req = (SupportRequestRows *) rawreq; + + if (is_funcclause(req->node)) /* be paranoid */ + { + List *args = ((FuncExpr *) req->node)->args; + Node *arg1, + *arg2, + *arg3; + + /* We can use estimated argument values here */ + arg1 = estimate_expression_value(req->root, linitial(args)); + arg2 = estimate_expression_value(req->root, lsecond(args)); + arg3 = estimate_expression_value(req->root, lthird(args)); + + /* + * If any argument is constant NULL, we can safely assume that + * zero rows are returned. Otherwise, if they're all non-NULL + * constants, we can calculate the number of rows that will be + * returned. + */ + if ((IsA(arg1, Const) && ((Const *) arg1)->constisnull) || + (IsA(arg2, Const) && ((Const *) arg2)->constisnull) || + (IsA(arg3, Const) && ((Const *) arg3)->constisnull)) + { + req->rows = 0; + ret = (Node *) req; + } + else if (IsA(arg1, Const) && IsA(arg2, Const) && IsA(arg3, Const)) + { + Timestamp start, + finish; + Interval *step; + Datum diff; + double dstep; + int64 dummy; + + start = DatumGetTimestamp(((Const *) arg1)->constvalue); + finish = DatumGetTimestamp(((Const *) arg2)->constvalue); + step = DatumGetIntervalP(((Const *) arg3)->constvalue); + + /* + * Perform some prechecks which could cause timestamp_mi to + * raise an ERROR. It's much better to just return some + * default estimate than error out in a support function. + */ + if (!TIMESTAMP_NOT_FINITE(start) && !TIMESTAMP_NOT_FINITE(finish) && + !pg_sub_s64_overflow(finish, start, &dummy)) + { + diff = DirectFunctionCall2(timestamp_mi, + TimestampGetDatum(finish), + TimestampGetDatum(start)); + +#define INTERVAL_TO_MICROSECONDS(i) ((((double) (i)->month * DAYS_PER_MONTH + (i)->day)) * USECS_PER_DAY + (i)->time) + + dstep = INTERVAL_TO_MICROSECONDS(step); + + /* This equation works for either sign of step */ + if (dstep != 0.0) + { + Interval *idiff = DatumGetIntervalP(diff); + double ddiff = INTERVAL_TO_MICROSECONDS(idiff); + + req->rows = floor(ddiff / dstep + 1.0); + ret = (Node *) req; + } +#undef INTERVAL_TO_MICROSECONDS + } + } + } + } + + PG_RETURN_POINTER(ret); +} + + /* timestamp_at_local() * timestamptz_at_local() * diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 0bf413fe054..e899ed5e77e 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8331,19 +8331,25 @@ prorettype => 'numeric', proargtypes => 'numeric numeric', prosrc => 'generate_series_numeric' }, { oid => '938', descr => 'non-persistent series generator', - proname => 'generate_series', prorows => '1000', proretset => 't', + proname => 'generate_series', prorows => '1000', + prosupport => 'generate_series_timestamp_support', proretset => 't', prorettype => 'timestamp', proargtypes => 'timestamp timestamp interval', prosrc => 'generate_series_timestamp' }, { oid => '939', descr => 'non-persistent series generator', - proname => 'generate_series', prorows => '1000', proretset => 't', + proname => 'generate_series', prorows => '1000', + prosupport => 'generate_series_timestamp_support', proretset => 't', provolatile => 's', prorettype => 'timestamptz', proargtypes => 'timestamptz timestamptz interval', prosrc => 'generate_series_timestamptz' }, { oid => '6274', descr => 'non-persistent series generator', - proname => 'generate_series', prorows => '1000', proretset => 't', + proname => 'generate_series', prorows => '1000', + prosupport => 'generate_series_timestamp_support', proretset => 't', prorettype => 'timestamptz', proargtypes => 'timestamptz timestamptz interval text', prosrc => 'generate_series_timestamptz_at_zone' }, +{ oid => '8402', descr => 'planner support for generate_series', + proname => 'generate_series_timestamp_support', prorettype => 'internal', + proargtypes => 'internal', prosrc => 'generate_series_timestamp_support' }, # boolean aggregates { oid => '2515', descr => 'aggregate transition function', diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index d94056862ae..78c91eff013 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -2,6 +2,43 @@ \getenv libdir PG_LIBDIR \getenv dlsuffix PG_DLSUFFIX \set regresslib :libdir '/regress' :dlsuffix +-- Function to assist with verifying EXPLAIN which includes costs. A series +-- of bool flags allows control over which portions are masked out +CREATE FUNCTION explain_mask_costs(query text, do_analyze bool, + hide_costs bool, hide_row_est bool, hide_width bool) RETURNS setof text +LANGUAGE plpgsql AS +$$ +DECLARE + ln text; + analyze_str text; +BEGIN + IF do_analyze = true THEN + analyze_str := 'on'; + ELSE + analyze_str := 'off'; + END IF; + + FOR ln IN + EXECUTE format('explain (analyze %s, costs on, summary off, timing off) %s', + analyze_str, query) + LOOP + IF hide_costs = true THEN + ln := regexp_replace(ln, 'cost=\d+\.\d\d\.\.\d+\.\d\d', 'cost=N..N'); + END IF; + + IF hide_row_est = true THEN + -- don't use 'g' so that we leave the actual rows intact + ln := regexp_replace(ln, 'rows=\d+', 'rows=N'); + END IF; + + IF hide_width = true THEN + ln := regexp_replace(ln, 'width=\d+', 'width=N'); + END IF; + + RETURN NEXT ln; + END LOOP; +END; +$$; -- -- num_nulls() -- @@ -594,6 +631,78 @@ SELECT * FROM tenk1 a JOIN my_gen_series(1,10) g ON a.unique1 = g; Index Cond: (unique1 = g.g) (4 rows) +-- +-- Test the SupportRequestRows support function for generate_series_timestamp() +-- +-- Ensure the row estimate matches the actual rows +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day') g(s);$$, +true, true, false, true); + explain_mask_costs +------------------------------------------------------------------------------------------ + Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1) +(1 row) + +-- As above but with generate_series_timestamp +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMP '2024-02-01', TIMESTAMP '2024-03-01', INTERVAL '1 day') g(s);$$, +true, true, false, true); + explain_mask_costs +------------------------------------------------------------------------------------------ + Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1) +(1 row) + +-- As above but with generate_series_timestamptz_at_zone() +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day', 'UTC') g(s);$$, +true, true, false, true); + explain_mask_costs +------------------------------------------------------------------------------------------ + Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1) +(1 row) + +-- Ensure the estimated and actual row counts match when the range isn't +-- evenly divisible by the step +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '7 day') g(s);$$, +true, true, false, true); + explain_mask_costs +---------------------------------------------------------------------------------------- + Function Scan on generate_series g (cost=N..N rows=5 width=N) (actual rows=5 loops=1) +(1 row) + +-- Ensure the estimates match when step is decreasing +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '-1 day') g(s);$$, +true, true, false, true); + explain_mask_costs +------------------------------------------------------------------------------------------ + Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1) +(1 row) + +-- Ensure an empty range estimates 1 row +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '1 day') g(s);$$, +true, true, false, true); + explain_mask_costs +---------------------------------------------------------------------------------------- + Function Scan on generate_series g (cost=N..N rows=1 width=N) (actual rows=0 loops=1) +(1 row) + +-- Ensure we get the default row estimate for infinity values +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '-infinity', TIMESTAMPTZ 'infinity', INTERVAL '1 day') g(s);$$, +false, true, false, true); + explain_mask_costs +------------------------------------------------------------------- + Function Scan on generate_series g (cost=N..N rows=1000 width=N) +(1 row) + +-- Ensure the row estimate behaves correctly when step size is zero. +-- We expect generate_series_timestamp() to throw the error rather than in +-- the support function. +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '0 day') g(s); +ERROR: step size cannot equal zero -- Test functions for control data SELECT count(*) > 0 AS ok FROM pg_control_checkpoint(); ok @@ -706,3 +815,4 @@ SELECT pg_column_toast_chunk_id(a) IS NULL, (1 row) DROP TABLE test_chunk_id; +DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool); diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index 76470fcb3f6..f30387f54ab 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -4,6 +4,44 @@ \set regresslib :libdir '/regress' :dlsuffix +-- Function to assist with verifying EXPLAIN which includes costs. A series +-- of bool flags allows control over which portions are masked out +CREATE FUNCTION explain_mask_costs(query text, do_analyze bool, + hide_costs bool, hide_row_est bool, hide_width bool) RETURNS setof text +LANGUAGE plpgsql AS +$$ +DECLARE + ln text; + analyze_str text; +BEGIN + IF do_analyze = true THEN + analyze_str := 'on'; + ELSE + analyze_str := 'off'; + END IF; + + FOR ln IN + EXECUTE format('explain (analyze %s, costs on, summary off, timing off) %s', + analyze_str, query) + LOOP + IF hide_costs = true THEN + ln := regexp_replace(ln, 'cost=\d+\.\d\d\.\.\d+\.\d\d', 'cost=N..N'); + END IF; + + IF hide_row_est = true THEN + -- don't use 'g' so that we leave the actual rows intact + ln := regexp_replace(ln, 'rows=\d+', 'rows=N'); + END IF; + + IF hide_width = true THEN + ln := regexp_replace(ln, 'width=\d+', 'width=N'); + END IF; + + RETURN NEXT ln; + END LOOP; +END; +$$; + -- -- num_nulls() -- @@ -224,6 +262,51 @@ SELECT * FROM tenk1 a JOIN my_gen_series(1,1000) g ON a.unique1 = g; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 a JOIN my_gen_series(1,10) g ON a.unique1 = g; +-- +-- Test the SupportRequestRows support function for generate_series_timestamp() +-- + +-- Ensure the row estimate matches the actual rows +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day') g(s);$$, +true, true, false, true); + +-- As above but with generate_series_timestamp +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMP '2024-02-01', TIMESTAMP '2024-03-01', INTERVAL '1 day') g(s);$$, +true, true, false, true); + +-- As above but with generate_series_timestamptz_at_zone() +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day', 'UTC') g(s);$$, +true, true, false, true); + +-- Ensure the estimated and actual row counts match when the range isn't +-- evenly divisible by the step +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '7 day') g(s);$$, +true, true, false, true); + +-- Ensure the estimates match when step is decreasing +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '-1 day') g(s);$$, +true, true, false, true); + +-- Ensure an empty range estimates 1 row +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '1 day') g(s);$$, +true, true, false, true); + +-- Ensure we get the default row estimate for infinity values +SELECT explain_mask_costs($$ +SELECT * FROM generate_series(TIMESTAMPTZ '-infinity', TIMESTAMPTZ 'infinity', INTERVAL '1 day') g(s);$$, +false, true, false, true); + +-- Ensure the row estimate behaves correctly when step size is zero. +-- We expect generate_series_timestamp() to throw the error rather than in +-- the support function. +SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '0 day') g(s); + -- Test functions for control data SELECT count(*) > 0 AS ok FROM pg_control_checkpoint(); SELECT count(*) > 0 AS ok FROM pg_control_init(); @@ -273,3 +356,4 @@ SELECT pg_column_toast_chunk_id(a) IS NULL, pg_column_toast_chunk_id(b) IN (SELECT chunk_id FROM pg_toast.:toastrel) FROM test_chunk_id; DROP TABLE test_chunk_id; +DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool);