Improve isolationtester's timeout management.

isolationtester.c had a hard-wired limit of 3 minutes per test step. It now emerges that this isn't quite enough for some of the slowest buildfarm animals. This isn't the first time we've had to raise this limit (cf. 1db439ad4), so let's make it configurable. This patch raises the default to 5 minutes, and introduces an environment variable PGISOLATIONTIMEOUT that can be set if more time is needed, following the precedent of PGCTLTIMEOUT. Also, modify isolationtester so that when the timeout is hit, it explicitly reports having sent a cancel. This makes the regression failure log considerably more intelligible. (In the worst case, a timed-out test might actually be reported as "passing" without this extra output, so arguably this is a bug fix in itself.) In passing, update the README file, which had apparently not gotten touched when we added "make check" support here. Back-patch to 9.6; older versions don't have comparable timeout logic. Discussion: https://postgr.es/m/[email protected]
author: Tom Lane 2019-12-09 19:31:57 +0000
committer: Tom Lane 2019-12-09 19:31:57 +0000
commit: 99351a8b5a3882ee5d131bd37d03a67ab07b4fea (patch)
tree: 9a30925d473b935fb7766426caf6e82294b9ce29 /src/test/isolation/isolationtester.c
parent: 2d0fdfaccec8b314895e026018874dcc5565b43e (diff)
1 files changed, 27 insertions, 6 deletions
diff --git a/src/test/isolation/isolationtester.c b/src/test/isolation/isolationtester.c
index b50c1897324..f80261c0229 100644
--- a/src/test/isolation/isolationtester.c
+++ b/src/test/isolation/isolationtester.c
@@ -30,6 +30,10 @@ static int *backend_pids = NULL;
 static const char **backend_pid_strs = NULL;
 static int	nconns = 0;
 
+/* Maximum time to wait before giving up on a step (in usec) */
+static int64 max_step_wait = 300 * USECS_PER_SEC;
+
+
 static void run_testspec(TestSpec *testspec);
 static void run_all_permutations(TestSpec *testspec);
 static void run_all_permutations_recurse(TestSpec *testspec, int nsteps,
@@ -62,6 +66,7 @@ int
 main(int argc, char **argv)
 {
 	const char *conninfo;
+	const char *env_wait;
 	TestSpec   *testspec;
 	int			i,
 				j;
@@ -103,6 +108,14 @@ main(int argc, char **argv)
 	else
 		conninfo = "dbname = postgres";
 
+	/*
+	 * If PGISOLATIONTIMEOUT is set in the environment, adopt its value (given
+	 * in seconds) as the max time to wait for any one step to complete.
+	 */
+	env_wait = getenv("PGISOLATIONTIMEOUT");
+	if (env_wait != NULL)
+		max_step_wait = ((int64) atoi(env_wait)) * USECS_PER_SEC;
+
 	/* Read the test spec from stdin */
 	spec_yyparse();
 	testspec = &parseresult;
@@ -766,7 +779,7 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 			td += (int64) current_time.tv_usec - (int64) start_time.tv_usec;
 
 			/*
-			 * After 180 seconds, try to cancel the query.
+			 * After max_step_wait microseconds, try to cancel the query.
 			 *
 			 * If the user tries to test an invalid permutation, we don't want
 			 * to hang forever, especially when this is running in the
@@ -774,7 +787,7 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 			 * failing, but remaining permutations and tests should still be
 			 * OK.
 			 */
-			if (td > 180 * USECS_PER_SEC && !canceled)
+			if (td > max_step_wait && !canceled)
 			{
 				PGcancel   *cancel = PQgetCancel(conn);
 
@@ -783,7 +796,15 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 					char		buf[256];
 
 					if (PQcancel(cancel, buf, sizeof(buf)))
+					{
+						/*
+						 * print to stdout not stderr, as this should appear
+						 * in the test case's results
+						 */
+						printf("isolationtester: canceling step %s after %d seconds\n",
+							   step->name, (int) (td / USECS_PER_SEC));
 						canceled = true;
+					}
 					else
 						fprintf(stderr, "PQcancel failed: %s\n", buf);
 					PQfreeCancel(cancel);
@@ -791,16 +812,16 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 			}
 
 			/*
-			 * After 200 seconds, just give up and die.
+			 * After twice max_step_wait, just give up and die.
 			 *
 			 * Since cleanup steps won't be run in this case, this may cause
 			 * later tests to fail.  That stinks, but it's better than waiting
 			 * forever for the server to respond to the cancel.
 			 */
-			if (td > 200 * USECS_PER_SEC)
+			if (td > 2 * max_step_wait)
 			{
-				fprintf(stderr, "step %s timed out after 200 seconds\n",
-						step->name);
+				fprintf(stderr, "step %s timed out after %d seconds\n",
+						step->name, (int) (td / USECS_PER_SEC));
 				exit(1);
 			}
 		}
author	Tom Lane	2019-12-09 19:31:57 +0000
committer	Tom Lane	2019-12-09 19:31:57 +0000
commit	99351a8b5a3882ee5d131bd37d03a67ab07b4fea (patch)
tree	9a30925d473b935fb7766426caf6e82294b9ce29 /src/test/isolation/isolationtester.c
parent	2d0fdfaccec8b314895e026018874dcc5565b43e (diff)