Skip to content

Commit 0c882e5

Browse files
committed
Improve ineq_histogram_selectivity's behavior for non-default orderings.
ineq_histogram_selectivity() can be invoked in situations where the ordering we care about is not that of the column's histogram. We could be considering some other collation, or even more drastically, the query operator might not agree at all with what was used to construct the histogram. (We'll get here for anything using scalarineqsel-based estimators, so that's quite likely to happen for extension operators.) Up to now we just ignored this issue and assumed we were dealing with an operator/collation whose sort order exactly matches the histogram, possibly resulting in junk estimates if the binary search gets confused. It's past time to improve that, since the use of nondefault collations is increasing. What we can do is verify that the given operator and collation match what's recorded in pg_statistic, and use the existing code only if so. When they don't match, instead execute the operator against each histogram entry, and take the fraction of successes as our selectivity estimate. This gives an estimate that is probably good to about 1/histogram_size, with no assumptions about ordering. (The quality of the estimate is likely to degrade near the ends of the value range, since the two orderings probably don't agree on what is an extremal value; but this is surely going to be more reliable than what we did before.) At some point we might further improve matters by storing more than one histogram calculated according to different orderings. But this code would still be good fallback logic when no matches exist, so that is not an argument for not doing this. While here, also improve get_variable_range() to deal more honestly with non-default collations. This isn't back-patchable, because it requires adding another argument to ineq_histogram_selectivity, and because it might have significant impact on the estimation results for extension operators relying on scalarineqsel --- mostly for the better, one hopes, but in any case destabilizing plan choices in back branches is best avoided. Per investigation of a report from James Lucas. Discussion: https://postgr.es/m/CAAFmbbOvfi=wMM=3qRsPunBSLb8BFREno2oOzSBS=mzfLPKABw@mail.gmail.com
1 parent 87fb04a commit 0c882e5

File tree

7 files changed

+205
-77
lines changed

7 files changed

+205
-77
lines changed

src/backend/utils/adt/like_support.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -1217,7 +1217,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
12171217
fmgr_info(get_opcode(geopr), &opproc);
12181218

12191219
prefixsel = ineq_histogram_selectivity(root, vardata,
1220-
&opproc, true, true,
1220+
geopr, &opproc, true, true,
12211221
collation,
12221222
prefixcon->constvalue,
12231223
prefixcon->consttype);
@@ -1238,7 +1238,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
12381238
Selectivity topsel;
12391239

12401240
topsel = ineq_histogram_selectivity(root, vardata,
1241-
&opproc, false, false,
1241+
ltopr, &opproc, false, false,
12421242
collation,
12431243
greaterstrcon->constvalue,
12441244
greaterstrcon->consttype);

src/backend/utils/adt/selfuncs.c

+145-61
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,10 @@ static void examine_simple_variable(PlannerInfo *root, Var *var,
192192
static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata,
193193
Oid sortop, Oid collation,
194194
Datum *min, Datum *max);
195+
static void get_stats_slot_range(AttStatsSlot *sslot,
196+
Oid opfuncoid, FmgrInfo *opproc,
197+
Oid collation, int16 typLen, bool typByVal,
198+
Datum *min, Datum *max, bool *p_have_data);
195199
static bool get_actual_variable_range(PlannerInfo *root,
196200
VariableStatData *vardata,
197201
Oid sortop, Oid collation,
@@ -679,7 +683,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, bool iseq,
679683
* compute the resulting contribution to selectivity.
680684
*/
681685
hist_selec = ineq_histogram_selectivity(root, vardata,
682-
&opproc, isgt, iseq,
686+
operator, &opproc, isgt, iseq,
683687
collation,
684688
constval, consttype);
685689

@@ -1019,6 +1023,9 @@ generic_restriction_selectivity(PlannerInfo *root, Oid oproid, Oid collation,
10191023
* satisfies the inequality condition, ie, VAR < (or <=, >, >=) CONST.
10201024
* The isgt and iseq flags distinguish which of the four cases apply.
10211025
*
1026+
* While opproc could be looked up from the operator OID, common callers
1027+
* also need to call it separately, so we make the caller pass both.
1028+
*
10221029
* Returns -1 if there is no histogram (valid results will always be >= 0).
10231030
*
10241031
* Note that the result disregards both the most-common-values (if any) and
@@ -1030,7 +1037,7 @@ generic_restriction_selectivity(PlannerInfo *root, Oid oproid, Oid collation,
10301037
double
10311038
ineq_histogram_selectivity(PlannerInfo *root,
10321039
VariableStatData *vardata,
1033-
FmgrInfo *opproc, bool isgt, bool iseq,
1040+
Oid opoid, FmgrInfo *opproc, bool isgt, bool iseq,
10341041
Oid collation,
10351042
Datum constval, Oid consttype)
10361043
{
@@ -1042,31 +1049,30 @@ ineq_histogram_selectivity(PlannerInfo *root,
10421049
/*
10431050
* Someday, ANALYZE might store more than one histogram per rel/att,
10441051
* corresponding to more than one possible sort ordering defined for the
1045-
* column type. However, to make that work we will need to figure out
1046-
* which staop to search for --- it's not necessarily the one we have at
1047-
* hand! (For example, we might have a '<=' operator rather than the '<'
1048-
* operator that will appear in staop.) The collation might not agree
1049-
* either. For now, just assume that whatever appears in pg_statistic is
1050-
* sorted the same way our operator sorts, or the reverse way if isgt is
1051-
* true. This could result in a bogus estimate, but it still seems better
1052-
* than falling back to the default estimate.
1052+
* column type. Right now, we know there is only one, so just grab it and
1053+
* see if it matches the query.
1054+
*
1055+
* Note that we can't use opoid as search argument; the staop appearing in
1056+
* pg_statistic will be for the relevant '<' operator, but what we have
1057+
* might be some other inequality operator such as '>='. (Even if opoid
1058+
* is a '<' operator, it could be cross-type.) Hence we must use
1059+
* comparison_ops_are_compatible() to see if the operators match.
10531060
*/
10541061
if (HeapTupleIsValid(vardata->statsTuple) &&
10551062
statistic_proc_security_check(vardata, opproc->fn_oid) &&
10561063
get_attstatsslot(&sslot, vardata->statsTuple,
10571064
STATISTIC_KIND_HISTOGRAM, InvalidOid,
10581065
ATTSTATSSLOT_VALUES))
10591066
{
1060-
if (sslot.nvalues > 1)
1067+
if (sslot.nvalues > 1 &&
1068+
sslot.stacoll == collation &&
1069+
comparison_ops_are_compatible(sslot.staop, opoid))
10611070
{
10621071
/*
10631072
* Use binary search to find the desired location, namely the
10641073
* right end of the histogram bin containing the comparison value,
10651074
* which is the leftmost entry for which the comparison operator
1066-
* succeeds (if isgt) or fails (if !isgt). (If the given operator
1067-
* isn't actually sort-compatible with the histogram, you'll get
1068-
* garbage results ... but probably not any more garbage-y than
1069-
* you would have from the old linear search.)
1075+
* succeeds (if isgt) or fails (if !isgt).
10701076
*
10711077
* In this loop, we pay no attention to whether the operator iseq
10721078
* or not; that detail will be mopped up below. (We cannot tell,
@@ -1332,6 +1338,50 @@ ineq_histogram_selectivity(PlannerInfo *root,
13321338
hist_selec = 1.0 - cutoff;
13331339
}
13341340
}
1341+
else if (sslot.nvalues > 1)
1342+
{
1343+
/*
1344+
* If we get here, we have a histogram but it's not sorted the way
1345+
* we want. Do a brute-force search to see how many of the
1346+
* entries satisfy the comparison condition, and take that
1347+
* fraction as our estimate. (This is identical to the inner loop
1348+
* of histogram_selectivity; maybe share code?)
1349+
*/
1350+
LOCAL_FCINFO(fcinfo, 2);
1351+
int nmatch = 0;
1352+
1353+
InitFunctionCallInfoData(*fcinfo, opproc, 2, collation,
1354+
NULL, NULL);
1355+
fcinfo->args[0].isnull = false;
1356+
fcinfo->args[1].isnull = false;
1357+
fcinfo->args[1].value = constval;
1358+
for (int i = 0; i < sslot.nvalues; i++)
1359+
{
1360+
Datum fresult;
1361+
1362+
fcinfo->args[0].value = sslot.values[i];
1363+
fcinfo->isnull = false;
1364+
fresult = FunctionCallInvoke(fcinfo);
1365+
if (!fcinfo->isnull && DatumGetBool(fresult))
1366+
nmatch++;
1367+
}
1368+
hist_selec = ((double) nmatch) / ((double) sslot.nvalues);
1369+
1370+
/*
1371+
* As above, clamp to a hundredth of the histogram resolution.
1372+
* This case is surely even less trustworthy than the normal one,
1373+
* so we shouldn't believe exact 0 or 1 selectivity. (Maybe the
1374+
* clamp should be more restrictive in this case?)
1375+
*/
1376+
{
1377+
double cutoff = 0.01 / (double) (sslot.nvalues - 1);
1378+
1379+
if (hist_selec < cutoff)
1380+
hist_selec = cutoff;
1381+
else if (hist_selec > 1.0 - cutoff)
1382+
hist_selec = 1.0 - cutoff;
1383+
}
1384+
}
13351385

13361386
free_attstatsslot(&sslot);
13371387
}
@@ -5363,8 +5413,8 @@ get_variable_range(PlannerInfo *root, VariableStatData *vardata,
53635413
int16 typLen;
53645414
bool typByVal;
53655415
Oid opfuncoid;
5416+
FmgrInfo opproc;
53665417
AttStatsSlot sslot;
5367-
int i;
53685418

53695419
/*
53705420
* XXX It's very tempting to try to use the actual column min and max, if
@@ -5395,78 +5445,56 @@ get_variable_range(PlannerInfo *root, VariableStatData *vardata,
53955445
(opfuncoid = get_opcode(sortop))))
53965446
return false;
53975447

5448+
opproc.fn_oid = InvalidOid; /* mark this as not looked up yet */
5449+
53985450
get_typlenbyval(vardata->atttype, &typLen, &typByVal);
53995451

54005452
/*
5401-
* If there is a histogram, grab the first and last values.
5402-
*
5403-
* If there is a histogram that is sorted with some other operator than
5404-
* the one we want, fail --- this suggests that there is data we can't
5405-
* use. XXX consider collation too.
5453+
* If there is a histogram with the ordering we want, grab the first and
5454+
* last values.
54065455
*/
54075456
if (get_attstatsslot(&sslot, vardata->statsTuple,
54085457
STATISTIC_KIND_HISTOGRAM, sortop,
54095458
ATTSTATSSLOT_VALUES))
54105459
{
5411-
if (sslot.nvalues > 0)
5460+
if (sslot.stacoll == collation && sslot.nvalues > 0)
54125461
{
54135462
tmin = datumCopy(sslot.values[0], typByVal, typLen);
54145463
tmax = datumCopy(sslot.values[sslot.nvalues - 1], typByVal, typLen);
54155464
have_data = true;
54165465
}
54175466
free_attstatsslot(&sslot);
54185467
}
5419-
else if (get_attstatsslot(&sslot, vardata->statsTuple,
5420-
STATISTIC_KIND_HISTOGRAM, InvalidOid,
5421-
0))
5468+
5469+
/*
5470+
* Otherwise, if there is a histogram with some other ordering, scan it
5471+
* and get the min and max values according to the ordering we want. This
5472+
* of course may not find values that are really extremal according to our
5473+
* ordering, but it beats ignoring available data.
5474+
*/
5475+
if (!have_data &&
5476+
get_attstatsslot(&sslot, vardata->statsTuple,
5477+
STATISTIC_KIND_HISTOGRAM, InvalidOid,
5478+
ATTSTATSSLOT_VALUES))
54225479
{
5480+
get_stats_slot_range(&sslot, opfuncoid, &opproc,
5481+
collation, typLen, typByVal,
5482+
&tmin, &tmax, &have_data);
54235483
free_attstatsslot(&sslot);
5424-
return false;
54255484
}
54265485

54275486
/*
54285487
* If we have most-common-values info, look for extreme MCVs. This is
54295488
* needed even if we also have a histogram, since the histogram excludes
5430-
* the MCVs. However, usually the MCVs will not be the extreme values, so
5431-
* avoid unnecessary data copying.
5489+
* the MCVs.
54325490
*/
54335491
if (get_attstatsslot(&sslot, vardata->statsTuple,
54345492
STATISTIC_KIND_MCV, InvalidOid,
54355493
ATTSTATSSLOT_VALUES))
54365494
{
5437-
bool tmin_is_mcv = false;
5438-
bool tmax_is_mcv = false;
5439-
FmgrInfo opproc;
5440-
5441-
fmgr_info(opfuncoid, &opproc);
5442-
5443-
for (i = 0; i < sslot.nvalues; i++)
5444-
{
5445-
if (!have_data)
5446-
{
5447-
tmin = tmax = sslot.values[i];
5448-
tmin_is_mcv = tmax_is_mcv = have_data = true;
5449-
continue;
5450-
}
5451-
if (DatumGetBool(FunctionCall2Coll(&opproc,
5452-
collation,
5453-
sslot.values[i], tmin)))
5454-
{
5455-
tmin = sslot.values[i];
5456-
tmin_is_mcv = true;
5457-
}
5458-
if (DatumGetBool(FunctionCall2Coll(&opproc,
5459-
collation,
5460-
tmax, sslot.values[i])))
5461-
{
5462-
tmax = sslot.values[i];
5463-
tmax_is_mcv = true;
5464-
}
5465-
}
5466-
if (tmin_is_mcv)
5467-
tmin = datumCopy(tmin, typByVal, typLen);
5468-
if (tmax_is_mcv)
5469-
tmax = datumCopy(tmax, typByVal, typLen);
5495+
get_stats_slot_range(&sslot, opfuncoid, &opproc,
5496+
collation, typLen, typByVal,
5497+
&tmin, &tmax, &have_data);
54705498
free_attstatsslot(&sslot);
54715499
}
54725500

@@ -5475,6 +5503,62 @@ get_variable_range(PlannerInfo *root, VariableStatData *vardata,
54755503
return have_data;
54765504
}
54775505

5506+
/*
5507+
* get_stats_slot_range: scan sslot for min/max values
5508+
*
5509+
* Subroutine for get_variable_range: update min/max/have_data according
5510+
* to what we find in the statistics array.
5511+
*/
5512+
static void
5513+
get_stats_slot_range(AttStatsSlot *sslot, Oid opfuncoid, FmgrInfo *opproc,
5514+
Oid collation, int16 typLen, bool typByVal,
5515+
Datum *min, Datum *max, bool *p_have_data)
5516+
{
5517+
Datum tmin = *min;
5518+
Datum tmax = *max;
5519+
bool have_data = *p_have_data;
5520+
bool found_tmin = false;
5521+
bool found_tmax = false;
5522+
5523+
/* Look up the comparison function, if we didn't already do so */
5524+
if (opproc->fn_oid != opfuncoid)
5525+
fmgr_info(opfuncoid, opproc);
5526+
5527+
/* Scan all the slot's values */
5528+
for (int i = 0; i < sslot->nvalues; i++)
5529+
{
5530+
if (!have_data)
5531+
{
5532+
tmin = tmax = sslot->values[i];
5533+
found_tmin = found_tmax = true;
5534+
*p_have_data = have_data = true;
5535+
continue;
5536+
}
5537+
if (DatumGetBool(FunctionCall2Coll(opproc,
5538+
collation,
5539+
sslot->values[i], tmin)))
5540+
{
5541+
tmin = sslot->values[i];
5542+
found_tmin = true;
5543+
}
5544+
if (DatumGetBool(FunctionCall2Coll(opproc,
5545+
collation,
5546+
tmax, sslot->values[i])))
5547+
{
5548+
tmax = sslot->values[i];
5549+
found_tmax = true;
5550+
}
5551+
}
5552+
5553+
/*
5554+
* Copy the slot's values, if we found new extreme values.
5555+
*/
5556+
if (found_tmin)
5557+
*min = datumCopy(tmin, typByVal, typLen);
5558+
if (found_tmax)
5559+
*max = datumCopy(tmax, typByVal, typLen);
5560+
}
5561+
54785562

54795563
/*
54805564
* get_actual_variable_range

0 commit comments

Comments
 (0)