diff options
author | Fujii Masao | 2015-02-23 11:55:17 +0000 |
---|---|---|
committer | Fujii Masao | 2015-02-23 11:55:17 +0000 |
commit | 5d2b45e3f78a85639f30431181c06d4c3221c5a1 (patch) | |
tree | 845775ab6e3021da1a16b7a552d9e6989f6061a0 /src/backend | |
parent | 2a3f6e368babdac7b586a7d43105af60fc08b1a3 (diff) |
Add GUC to control the time to wait before retrieving WAL after failed attempt.
Previously, when the standby server failed to retrieve WAL files from any source
(i.e., streaming replication, the local pg_xlog directory or the WAL archive), it
always waited for five seconds (hard-coded) before the next attempt. This is
problematic, for example, in a warm-standby setup, because restore_command can
fail every five seconds even while a new WAL file is expected to be unavailable
for a long time, flooding the log files with its error messages.
This commit adds a new parameter, wal_retrieve_retry_interval, to control that
wait time.
Alexey Vasiliev and Michael Paquier, reviewed by Andres Freund and me.
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/access/transam/xlog.c | 37 | ||||
-rw-r--r-- | src/backend/utils/misc/guc.c | 12 | ||||
-rw-r--r-- | src/backend/utils/misc/postgresql.conf.sample | 2 |
3 files changed, 38 insertions, 13 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 629a457965f..f68f82b255c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -93,6 +93,7 @@ int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; int CommitDelay = 0; /* precommit delay in microseconds */ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ +int wal_retrieve_retry_interval = 5000; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -10340,8 +10341,8 @@ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr) { - static pg_time_t last_fail_time = 0; - pg_time_t now; + static TimestampTz last_fail_time = 0; + TimestampTz now; /*------- * Standby mode is implemented by a state machine: @@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * 2. Check trigger file * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) * 4. Rescan timelines - * 5. Sleep 5 seconds, and loop back to 1. + * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. * * Failure to read from the current source advances the state machine to * the next state. @@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * machine, so we've exhausted all the options for * obtaining the requested WAL. We're going to loop back * and retry from the archive, but if it hasn't been long - * since last attempt, sleep 5 seconds to avoid - * busy-waiting. + * since last attempt, sleep wal_retrieve_retry_interval + * milliseconds to avoid busy-waiting. 
*/ - now = (pg_time_t) time(NULL); - if ((now - last_fail_time) < 5) + now = GetCurrentTimestamp(); + if (!TimestampDifferenceExceeds(last_fail_time, now, + wal_retrieve_retry_interval)) { - pg_usleep(1000000L * (5 - (now - last_fail_time))); - now = (pg_time_t) time(NULL); + long secs, wait_time; + int usecs; + + TimestampDifference(last_fail_time, now, &secs, &usecs); + wait_time = wal_retrieve_retry_interval - + (secs * 1000 + usecs / 1000); + + WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + wait_time); + ResetLatch(&XLogCtl->recoveryWakeupLatch); + now = GetCurrentTimestamp(); } last_fail_time = now; currentSource = XLOG_FROM_ARCHIVE; @@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, } /* - * Wait for more WAL to arrive. Time out after 5 seconds, - * like when polling the archive, to react to a trigger - * file promptly. + * Wait for more WAL to arrive. Time out after 5 seconds + * to react to a trigger file promptly. 
*/ WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, 5000L); ResetLatch(&XLogCtl->recoveryWakeupLatch); break; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 95727776d38..cf401d3cf03 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2364,6 +2364,18 @@ static struct config_int ConfigureNamesInt[] = }, { + {"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Sets the time to wait before retrying to retrieve WAL" + "after a failed attempt."), + NULL, + GUC_UNIT_MS + }, + &wal_retrieve_retry_interval, + 5000, 1, INT_MAX, + NULL, NULL, NULL + }, + + { {"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS, gettext_noop("Shows the number of pages per write ahead log segment."), NULL, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index b053659f88e..29d8485964d 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -260,6 +260,8 @@ #wal_receiver_timeout = 60s # time that receiver waits for # communication from master # in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to + # retrieve WAL after a failed attempt #------------------------------------------------------------------------------ |