Omit null rows when applying the Haas-Stokes estimator for ndistinct.

author Tom Lane <[email protected]>
Fri, 1 Apr 2016 19:47:52 +0000 (15:47 -0400)
committer Tom Lane <[email protected]>
Fri, 1 Apr 2016 19:48:24 +0000 (15:48 -0400)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 8a5f07c957c2dd7d6eee4ba033afcf8991509f0b..b0c65650ee742c07ec1762c0bfbf42399ee8fa2e 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2072,6 +2072,12 @@ compute_distinct_stats(VacAttrStatsP stats,
              * recommend are considerably more complex, and are numerically
              * very unstable when n is much smaller than N.
              *
+            * In this calculation, we consider only non-nulls.  We used to
+            * include rows with null values in the n and N counts, but that
+            * leads to inaccurate answers in columns with many nulls, and
+            * it's intuitively bogus anyway considering the desired result is
+            * the number of distinct non-null values.
+            *
              * We assume (not very reliably!) that all the multiply-occurring
              * values are reflected in the final track[] list, and the other
              * nonnull values all appeared but once.  (XXX this usually
@@ -2081,21 +2087,22 @@ compute_distinct_stats(VacAttrStatsP stats,
              */
             int         f1 = nonnull_cnt - summultiple;
             int         d = f1 + nmultiple;
-           double      numer,
-                       denom,
-                       stadistinct;
-
-           numer = (double) samplerows *(double) d;
+           double      n = samplerows - null_cnt;
+           double      N = totalrows * (1.0 - stats->stanullfrac);
+           double      stadistinct;
  
-           denom = (double) (samplerows - f1) +
-               (double) f1 *(double) samplerows / totalrows;
+           /* N == 0 shouldn't happen, but just in case ... */
+           if (N > 0)
+               stadistinct = (n * d) / ((n - f1) + f1 * n / N);
+           else
+               stadistinct = 0;
  
-           stadistinct = numer / denom;
             /* Clamp to sane range in case of roundoff error */
-           if (stadistinct < (double) d)
-               stadistinct = (double) d;
-           if (stadistinct > totalrows)
-               stadistinct = totalrows;
+           if (stadistinct < d)
+               stadistinct = d;
+           if (stadistinct > N)
+               stadistinct = N;
+           /* And round to integer */
             stats->stadistinct = floor(stadistinct + 0.5);
         }
  
@@ -2425,26 +2432,33 @@ compute_scalar_stats(VacAttrStatsP stats,
              * recommend are considerably more complex, and are numerically
              * very unstable when n is much smaller than N.
              *
+            * In this calculation, we consider only non-nulls.  We used to
+            * include rows with null values in the n and N counts, but that
+            * leads to inaccurate answers in columns with many nulls, and
+            * it's intuitively bogus anyway considering the desired result is
+            * the number of distinct non-null values.
+            *
              * Overwidth values are assumed to have been distinct.
              *----------
              */
             int         f1 = ndistinct - nmultiple + toowide_cnt;
             int         d = f1 + nmultiple;
-           double      numer,
-                       denom,
-                       stadistinct;
-
-           numer = (double) samplerows *(double) d;
+           double      n = samplerows - null_cnt;
+           double      N = totalrows * (1.0 - stats->stanullfrac);
+           double      stadistinct;
  
-           denom = (double) (samplerows - f1) +
-               (double) f1 *(double) samplerows / totalrows;
+           /* N == 0 shouldn't happen, but just in case ... */
+           if (N > 0)
+               stadistinct = (n * d) / ((n - f1) + f1 * n / N);
+           else
+               stadistinct = 0;
  
-           stadistinct = numer / denom;
             /* Clamp to sane range in case of roundoff error */
-           if (stadistinct < (double) d)
-               stadistinct = (double) d;
-           if (stadistinct > totalrows)
-               stadistinct = totalrows;
+           if (stadistinct < d)
+               stadistinct = d;
+           if (stadistinct > N)
+               stadistinct = N;
+           /* And round to integer */
             stats->stadistinct = floor(stadistinct + 0.5);
         }
author	Tom Lane <[email protected]>
	Fri, 1 Apr 2016 19:47:52 +0000 (15:47 -0400)
committer	Tom Lane <[email protected]>
	Fri, 1 Apr 2016 19:48:24 +0000 (15:48 -0400)