diff options
author | Simon Riggs | 2015-05-15 02:02:54 +0000 |
---|---|---|
committer | Simon Riggs | 2015-05-15 02:02:54 +0000 |
commit | 83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c (patch) | |
tree | 6c92ec7402b4ac14a585c41d960e35bd283a878b /src/backend/commands | |
parent | 5a3022fde018aca9b23b7f7506233b437d943de2 (diff) |
Separate block sampling functions
Refactoring ahead of tablesample patch
Requested and reviewed by Michael Paquier
Petr Jelinek
Diffstat (limited to 'src/backend/commands')
-rw-r--r-- | src/backend/commands/analyze.c | 225 |
1 files changed, 6 insertions, 219 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 15ec0ad551c..952cf204a0a 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -50,23 +50,13 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_rusage.h" +#include "utils/sampling.h" #include "utils/sortsupport.h" #include "utils/syscache.h" #include "utils/timestamp.h" #include "utils/tqual.h" -/* Data structure for Algorithm S from Knuth 3.4.2 */ -typedef struct -{ - BlockNumber N; /* number of blocks, known in advance */ - int n; /* desired sample size */ - BlockNumber t; /* current block number */ - int m; /* blocks selected so far */ -} BlockSamplerData; - -typedef BlockSamplerData *BlockSampler; - /* Per-index data for ANALYZE */ typedef struct AnlIndexData { @@ -89,10 +79,6 @@ static void do_analyze_rel(Relation onerel, int options, VacuumParams *params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel); -static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, - int samplesize); -static bool BlockSampler_HasMore(BlockSampler bs); -static BlockNumber BlockSampler_Next(BlockSampler bs); static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData *indexdata, int nindexes, HeapTuple *rows, int numrows, @@ -951,94 +937,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr) } /* - * BlockSampler_Init -- prepare for random sampling of blocknumbers - * - * BlockSampler is used for stage one of our new two-stage tuple - * sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject - * "Large DB"). It selects a random sample of samplesize blocks out of - * the nblocks blocks in the table. If the table has less than - * samplesize blocks, all blocks are selected. - * - * Since we know the total number of blocks in advance, we can use the - * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's - * algorithm. - */ -static void -BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize) -{ - bs->N = nblocks; /* measured table size */ - - /* - * If we decide to reduce samplesize for tables that have less or not much - * more than samplesize blocks, here is the place to do it. - */ - bs->n = samplesize; - bs->t = 0; /* blocks scanned so far */ - bs->m = 0; /* blocks selected so far */ -} - -static bool -BlockSampler_HasMore(BlockSampler bs) -{ - return (bs->t < bs->N) && (bs->m < bs->n); -} - -static BlockNumber -BlockSampler_Next(BlockSampler bs) -{ - BlockNumber K = bs->N - bs->t; /* remaining blocks */ - int k = bs->n - bs->m; /* blocks still to sample */ - double p; /* probability to skip block */ - double V; /* random */ - - Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */ - - if ((BlockNumber) k >= K) - { - /* need all the rest */ - bs->m++; - return bs->t++; - } - - /*---------- - * It is not obvious that this code matches Knuth's Algorithm S. - * Knuth says to skip the current block with probability 1 - k/K. - * If we are to skip, we should advance t (hence decrease K), and - * repeat the same probabilistic test for the next block. The naive - * implementation thus requires an anl_random_fract() call for each block - * number. But we can reduce this to one anl_random_fract() call per - * selected block, by noting that each time the while-test succeeds, - * we can reinterpret V as a uniform random number in the range 0 to p. - * Therefore, instead of choosing a new V, we just adjust p to be - * the appropriate fraction of its former value, and our next loop - * makes the appropriate probabilistic test. - * - * We have initially K > k > 0. If the loop reduces K to equal k, - * the next while-test must fail since p will become exactly zero - * (we assume there will not be roundoff error in the division). - * (Note: Knuth suggests a "<=" loop condition, but we use "<" just - * to be doubly sure about roundoff error.) Therefore K cannot become - * less than k, which means that we cannot fail to select enough blocks. - *---------- - */ - V = anl_random_fract(); - p = 1.0 - (double) k / (double) K; - while (V < p) - { - /* skip */ - bs->t++; - K--; /* keep K == N - t */ - - /* adjust p to be new cutoff point in reduced range */ - p *= 1.0 - (double) k / (double) K; - } - - /* select */ - bs->m++; - return bs->t++; -} - -/* * acquire_sample_rows -- acquire a random sample of rows from the table * * Selected rows are returned in the caller-allocated array rows[], which @@ -1084,7 +982,7 @@ acquire_sample_rows(Relation onerel, int elevel, BlockNumber totalblocks; TransactionId OldestXmin; BlockSamplerData bs; - double rstate; + ReservoirStateData rstate; Assert(targrows > 0); @@ -1094,9 +992,9 @@ acquire_sample_rows(Relation onerel, int elevel, OldestXmin = GetOldestXmin(onerel, true); /* Prepare for sampling block numbers */ - BlockSampler_Init(&bs, totalblocks, targrows); + BlockSampler_Init(&bs, totalblocks, targrows, random()); /* Prepare for sampling rows */ - rstate = anl_init_selection_state(targrows); + reservoir_init_selection_state(&rstate, targrows); /* Outer loop over blocks to sample */ while (BlockSampler_HasMore(&bs)) @@ -1244,8 +1142,7 @@ acquire_sample_rows(Relation onerel, int elevel, * t. */ if (rowstoskip < 0) - rowstoskip = anl_get_next_S(samplerows, targrows, - &rstate); + rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); if (rowstoskip <= 0) { @@ -1253,7 +1150,7 @@ acquire_sample_rows(Relation onerel, int elevel, * Found a suitable tuple, so save it, replacing one * old tuple at random */ - int k = (int) (targrows * anl_random_fract()); + int k = (int) (targrows * sampler_random_fract()); Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); @@ -1312,116 +1209,6 @@ acquire_sample_rows(Relation onerel, int elevel, return numrows; } -/* Select a random value R uniformly distributed in (0 - 1) */ -double -anl_random_fract(void) -{ - return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2); -} - -/* - * These two routines embody Algorithm Z from "Random sampling with a - * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1 - * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms - * of the count S of records to skip before processing another record. - * It is computed primarily based on t, the number of records already read. - * The only extra state needed between calls is W, a random state variable. - * - * anl_init_selection_state computes the initial W value. - * - * Given that we've already read t records (t >= n), anl_get_next_S - * determines the number of records to skip before the next record is - * processed. - */ -double -anl_init_selection_state(int n) -{ - /* Initial value of W (for use when Algorithm Z is first applied) */ - return exp(-log(anl_random_fract()) / n); -} - -double -anl_get_next_S(double t, int n, double *stateptr) -{ - double S; - - /* The magic constant here is T from Vitter's paper */ - if (t <= (22.0 * n)) - { - /* Process records using Algorithm X until t is large enough */ - double V, - quot; - - V = anl_random_fract(); /* Generate V */ - S = 0; - t += 1; - /* Note: "num" in Vitter's code is always equal to t - n */ - quot = (t - (double) n) / t; - /* Find min S satisfying (4.1) */ - while (quot > V) - { - S += 1; - t += 1; - quot *= (t - (double) n) / t; - } - } - else - { - /* Now apply Algorithm Z */ - double W = *stateptr; - double term = t - (double) n + 1; - - for (;;) - { - double numer, - numer_lim, - denom; - double U, - X, - lhs, - rhs, - y, - tmp; - - /* Generate U and X */ - U = anl_random_fract(); - X = t * (W - 1.0); - S = floor(X); /* S is tentatively set to floor(X) */ - /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ - tmp = (t + 1) / term; - lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n); - rhs = (((t + X) / (term + S)) * term) / t; - if (lhs <= rhs) - { - W = rhs / lhs; - break; - } - /* Test if U <= f(S)/cg(X) */ - y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X); - if ((double) n < S) - { - denom = t; - numer_lim = term + S; - } - else - { - denom = t - (double) n + S; - numer_lim = t + 1; - } - for (numer = t + S; numer >= numer_lim; numer -= 1) - { - y *= numer / denom; - denom -= 1; - } - W = exp(-log(anl_random_fract()) / n); /* Generate W in advance */ - if (exp(log(y) / n) <= (t + X) / t) - break; - } - *stateptr = W; - } - return S; -} - /* * qsort comparator for sorting rows[] array */ |