--
-- RANDOM
-- Test random() and allies
--
-- Tests in this file may have a small probability of failure,
-- since we are dealing with randomness.  Try to keep the failure
-- risk for any one test case under 1e-9.
--
-- There should be no duplicates in 1000 random() values.
-- (Assuming 52 random bits in the float8 results, we could
-- take as many as 3000 values and still have less than 1e-9 chance
-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem)
SELECT r, count(*)
FROM (SELECT random() r FROM generate_series(1, 1000)) ss
GROUP BY r HAVING count(*) > 1;
 r | count 
---+-------
(0 rows)

-- The range should be [0, 1).  We can expect that at least one out of 2000
-- random values is in the lowest or highest 1% of the range with failure
-- probability less than about 1e-9.
SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range,
       (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small,
       (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large
FROM (SELECT random() r FROM generate_series(1, 2000)) ss;
 out_of_range | has_small | has_large 
--------------+-----------+-----------
            0 | t         | t
(1 row)

-- Check for uniform distribution using the Kolmogorov-Smirnov test.
CREATE FUNCTION ks_test_uniform_random()
RETURNS boolean AS
$$
DECLARE
  n int := 1000;        -- Number of samples
  c float8 := 1.94947;  -- Critical value for 99.9% confidence
  ok boolean;
BEGIN
  ok := (
    WITH samples AS (
      SELECT random() r FROM generate_series(1, n) ORDER BY 1
    ), indexed_samples AS (
      SELECT (row_number() OVER())-1.0 i, r FROM samples
    )
    SELECT max(abs(i/n-r)) < c / sqrt(n) FROM indexed_samples
  );
  RETURN ok;
END
$$
LANGUAGE plpgsql;
-- As written, ks_test_uniform_random() returns true about 99.9%
-- of the time.  To get down to a roughly 1e-9 test failure rate,
-- just run it 3 times and accept if any one of them passes.
SELECT ks_test_uniform_random() OR
       ks_test_uniform_random() OR
       ks_test_uniform_random() AS uniform;
 uniform 
---------
 t
(1 row)

-- now test random_normal()
-- As above, there should be no duplicates in 1000 random_normal() values.
SELECT r, count(*)
FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss
GROUP BY r HAVING count(*) > 1;
 r | count 
---+-------
(0 rows)

-- ... unless we force the range (standard deviation) to zero.
-- This is a good place to check that the mean input does something, too.
SELECT r, count(*)
FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss
GROUP BY r;
 r  | count 
----+-------
 10 |   100
(1 row)

SELECT r, count(*)
FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss
GROUP BY r;
  r  | count 
-----+-------
 -10 |   100
(1 row)

-- Check standard normal distribution using the Kolmogorov-Smirnov test.
CREATE FUNCTION ks_test_normal_random()
RETURNS boolean AS
$$
DECLARE
  n int := 1000;        -- Number of samples
  c float8 := 1.94947;  -- Critical value for 99.9% confidence
  ok boolean;
BEGIN
  ok := (
    WITH samples AS (
      SELECT random_normal() r FROM generate_series(1, n) ORDER BY 1
    ), indexed_samples AS (
      SELECT (row_number() OVER())-1.0 i, r FROM samples
    )
    SELECT max(abs((1+erf(r/sqrt(2)))/2 - i/n)) < c / sqrt(n)
    FROM indexed_samples
  );
  RETURN ok;
END
$$
LANGUAGE plpgsql;
-- As above, ks_test_normal_random() returns true about 99.9%
-- of the time, so try it 3 times and accept if any test passes.
SELECT ks_test_normal_random() OR
       ks_test_normal_random() OR
       ks_test_normal_random() AS standard_normal;
 standard_normal 
-----------------
 t
(1 row)

-- setseed() should produce a reproducible series of random() values.
SELECT setseed(0.5);
 setseed 
---------
 
(1 row)

SELECT random() FROM generate_series(1, 10);
       random        
---------------------
  0.9851677175347999
   0.825301858027981
 0.12974610012450416
 0.16356291958601088
     0.6476186144084
  0.8822771983038762
  0.1404566845227775
 0.15619865764623442
  0.5145227426983392
  0.7712969548127826
(10 rows)

-- Likewise for random_normal(); however, since its implementation relies
-- on libm functions that have different roundoff behaviors on different
-- machines, we have to round off the results a bit to get consistent output.
SET extra_float_digits = -1;
SELECT random_normal() FROM generate_series(1, 10);
   random_normal   
-------------------
  0.20853464493838
  0.26453024054096
 -0.60675246790043
  0.82579942785265
   1.7011161173536
 -0.22344546371619
    0.249712419191
  -1.2494722990669
  0.12562715204368
  0.47539161454401
(10 rows)

SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);
        r         
------------------
  1.0060597281173
    1.09685453015
  1.0286920613201
 0.90947567671234
 0.98372476313426
 0.93934454957762
  1.1871350020636
 0.96225768429293
 0.91444120680041
 0.96403105557543
(10 rows)