* recommend are considerably more complex, and are numerically
* very unstable when n is much smaller than N.
*
+ * In this calculation, we consider only non-nulls. We used to
+ * include rows with null values in the n and N counts, but that
+ * leads to inaccurate answers in columns with many nulls, and
+ * it's intuitively bogus anyway considering the desired result is
+ * the number of distinct non-null values.
+ *
* We assume (not very reliably!) that all the multiply-occurring
* values are reflected in the final track[] list, and the other
* nonnull values all appeared but once. (XXX this usually
*/
int f1 = nonnull_cnt - summultiple;
int d = f1 + nmultiple;
- double numer,
- denom,
- stadistinct;
-
- numer = (double) samplerows *(double) d;
+ double n = samplerows - null_cnt;
+ double N = totalrows * (1.0 - stats->stanullfrac);
+ double stadistinct;
- denom = (double) (samplerows - f1) +
- (double) f1 *(double) samplerows / totalrows;
+ /* N == 0 shouldn't happen, but just in case ... */
+ if (N > 0)
+ stadistinct = (n * d) / ((n - f1) + f1 * n / N);
+ else
+ stadistinct = 0;
- stadistinct = numer / denom;
/* Clamp to sane range in case of roundoff error */
- if (stadistinct < (double) d)
- stadistinct = (double) d;
- if (stadistinct > totalrows)
- stadistinct = totalrows;
+ if (stadistinct < d)
+ stadistinct = d;
+ if (stadistinct > N)
+ stadistinct = N;
+ /* And round to integer */
stats->stadistinct = floor(stadistinct + 0.5);
}
* recommend are considerably more complex, and are numerically
* very unstable when n is much smaller than N.
*
+ * In this calculation, we consider only non-nulls. We used to
+ * include rows with null values in the n and N counts, but that
+ * leads to inaccurate answers in columns with many nulls, and
+ * it's intuitively bogus anyway considering the desired result is
+ * the number of distinct non-null values.
+ *
* Overwidth values are assumed to have been distinct.
*----------
*/
int f1 = ndistinct - nmultiple + toowide_cnt;
int d = f1 + nmultiple;
- double numer,
- denom,
- stadistinct;
-
- numer = (double) samplerows *(double) d;
+ double n = samplerows - null_cnt;
+ double N = totalrows * (1.0 - stats->stanullfrac);
+ double stadistinct;
- denom = (double) (samplerows - f1) +
- (double) f1 *(double) samplerows / totalrows;
+ /* N == 0 shouldn't happen, but just in case ... */
+ if (N > 0)
+ stadistinct = (n * d) / ((n - f1) + f1 * n / N);
+ else
+ stadistinct = 0;
- stadistinct = numer / denom;
/* Clamp to sane range in case of roundoff error */
- if (stadistinct < (double) d)
- stadistinct = (double) d;
- if (stadistinct > totalrows)
- stadistinct = totalrows;
+ if (stadistinct < d)
+ stadistinct = d;
+ if (stadistinct > N)
+ stadistinct = N;
+ /* And round to integer */
stats->stadistinct = floor(stadistinct + 0.5);
}