author     Noah Misch    2020-08-15 17:15:53 +0000
committer  Noah Misch    2020-08-15 17:15:53 +0000
commit     566372b3d6435639e4cc4476d79b8505a0297c87 (patch)
tree       cb23c4c35d85e463569d0759b275f484c240cc47 /src/backend
parent     d4d443b3bbbb3eb9cdc511564ef3c57fde7dd3ac (diff)
Prevent concurrent SimpleLruTruncate() for any given SLRU.
The SimpleLruTruncate() header comment states the new coding rule.  To
achieve this, add locktype "frozenid" and two LWLocks.  This closes a rare
opportunity for data loss, which manifested as "apparent wraparound" or
"could not access status of transaction" errors.  Data loss is more likely
in pg_multixact, due to released branches' thin margin between
multiStopLimit and multiWrapLimit.  If a user's physical replication
primary logged ": apparent wraparound" messages, the user should rebuild
standbys of that primary regardless of symptoms.  At less risk is a cluster
having emitted "not accepting commands" errors or "must be vacuumed"
warnings at some point.  One can test a cluster for this data loss by
running VACUUM FREEZE in every database.

Back-patch to 9.5 (all supported versions).

Discussion: https://postgr.es/m/[email protected]
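To make the failure mode concrete, here is a minimal, self-contained C model of the race the new coding rule closes. Everything in it (seg_data, write_limit, compute_cutoff, truncate_before, the linear segment array) is invented for illustration and is not backend code; the wraparound that re-opens old segments in a real SLRU is collapsed into a single store.

#include <stdio.h>

#define NSEGMENTS 8

/* 'O' = old data, 'F' = fresh data, '.' = segment deleted */
static char seg_data[NSEGMENTS];
static int write_limit = 6;        /* writers must stay below this segment */

/* Truncation step 1: compute the cutoff from current shared state. */
static int
compute_cutoff(void)
{
    return write_limit;            /* everything before this is presumed dead */
}

/* Truncation step 2: unlink every segment preceding the cutoff. */
static void
truncate_before(int cutoff)
{
    for (int i = 0; i < cutoff; i++)
        seg_data[i] = '.';
}

int
main(void)
{
    for (int i = 0; i < NSEGMENTS; i++)
        seg_data[i] = 'O';

    int cutoff = compute_cutoff(); /* truncator decides: cutoff = 6 */

    /*
     * Racing limit update: with no mutual exclusion, another backend's
     * limit update (via wraparound in the real SLRU) re-opens the segment
     * just before the cutoff, and fresh data lands there...
     */
    seg_data[cutoff - 1] = 'F';

    /* ...which the truncator now deletes: silent data loss. */
    truncate_before(cutoff);

    printf("segment %d holds '%c' ('F' was expected; '.' means data loss)\n",
           cutoff - 1, seg_data[cutoff - 1]);
    return 0;
}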
Diffstat (limited to 'src/backend')
-rw-r--r--  src/backend/access/transam/slru.c          8
-rw-r--r--  src/backend/access/transam/subtrans.c      4
-rw-r--r--  src/backend/commands/async.c              37
-rw-r--r--  src/backend/commands/vacuum.c             13
-rw-r--r--  src/backend/storage/lmgr/lmgr.c           20
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.txt   3
-rw-r--r--  src/backend/utils/adt/lockfuncs.c         12
7 files changed, 85 insertions, 12 deletions
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index d1dbb43e096..7640f153c22 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -1191,6 +1191,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
/*
* Remove all segments before the one holding the passed page number
+ *
+ * All SLRUs prevent concurrent calls to this function, either with an LWLock
+ * or by calling it only as part of a checkpoint. Mutual exclusion must begin
+ * before computing cutoffPage. Mutual exclusion must end after any limit
+ * update that would permit other backends to write fresh data into the
+ * segment immediately preceding the one containing cutoffPage. Otherwise,
+ * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
+ * after it has accrued freshly-written data.
*/
void
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
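The same toy model with the rule applied, as a hedged sketch: a single pthread mutex stands in for the new LWLocks (e.g. WrapLimitsVacuumLock), and the truncator holds it from before the cutoff computation until after the limit update, so a writer can no longer slip fresh data into a segment that is about to be unlinked. All names here are invented for illustration; compile with -pthread.

#include <pthread.h>
#include <stdio.h>

/* One mutex standing in for the new LWLocks. */
static pthread_mutex_t truncate_lock = PTHREAD_MUTEX_INITIALIZER;
static int write_limit = 6;        /* shared limit, also advanced by writers */

static void
checkpoint_truncate(void)
{
    /* Mutual exclusion begins BEFORE the cutoff is computed... */
    pthread_mutex_lock(&truncate_lock);

    int cutoff = write_limit;      /* compute cutoffPage */
    /* ... unlink all segments preceding cutoff here ... */
    write_limit = cutoff + 1;      /* the limit update */

    /* ...and ends only AFTER the limit update. */
    pthread_mutex_unlock(&truncate_lock);
}

static void
writer_advance_limit(void)
{
    /*
     * Writers serialize on the same lock, so no fresh data can enter a
     * doomed segment between the truncator's cutoff computation and its
     * limit update.
     */
    pthread_mutex_lock(&truncate_lock);
    write_limit++;
    pthread_mutex_unlock(&truncate_lock);
}

int
main(void)
{
    checkpoint_truncate();
    writer_advance_limit();
    printf("write_limit = %d\n", write_limit);
    return 0;
}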
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index a087a555421..a50f60b99af 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -349,8 +349,8 @@ ExtendSUBTRANS(TransactionId newestXact)
/*
* Remove all SUBTRANS segments before the one holding the passed transaction ID
*
- * This is normally called during checkpoint, with oldestXact being the
- * oldest TransactionXmin of any running transaction.
+ * oldestXact is the oldest TransactionXmin of any running transaction. This
+ * is called only during checkpoint.
*/
void
TruncateSUBTRANS(TransactionId oldestXact)
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 71b7577afc0..4c1286eb988 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -244,19 +244,22 @@ typedef struct QueueBackendStatus
/*
* Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff)
*
- * The AsyncQueueControl structure is protected by the NotifyQueueLock.
+ * The AsyncQueueControl structure is protected by the NotifyQueueLock and
+ * NotifyQueueTailLock.
*
- * When holding the lock in SHARED mode, backends may only inspect their own
- * entries as well as the head and tail pointers. Consequently we can allow a
- * backend to update its own record while holding only SHARED lock (since no
- * other backend will inspect it).
+ * When holding NotifyQueueLock in SHARED mode, backends may only inspect
+ * their own entries as well as the head and tail pointers. Consequently we
+ * can allow a backend to update its own record while holding only SHARED lock
+ * (since no other backend will inspect it).
*
- * When holding the lock in EXCLUSIVE mode, backends can inspect the entries
- * of other backends and also change the head and tail pointers.
+ * When holding NotifyQueueLock in EXCLUSIVE mode, backends can inspect the
+ * entries of other backends and also change the head pointer. When holding
+ * both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends
+ * can change the tail pointer.
*
* NotifySLRULock is used as the control lock for the pg_notify SLRU buffers.
- * In order to avoid deadlocks, whenever we need both locks, we always first
- * get NotifyQueueLock and then NotifySLRULock.
+ * In order to avoid deadlocks, whenever we need multiple locks, we first get
+ * NotifyQueueTailLock, then NotifyQueueLock, and lastly NotifySLRULock.
*
* Each backend uses the backend[] array entry with index equal to its
* BackendId (which can range from 1 to MaxBackends). We rely on this to make
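An aside on the ordering rule just stated: taking multiple locks in one fixed global order is the standard deadlock-avoidance discipline, and the reworked asyncQueueAdvanceTail() below follows it by acquiring NotifyQueueTailLock before, and holding it across, both of its NotifyQueueLock critical sections. A toy pthread illustration, with invented names standing in for the real LWLocks:

#include <pthread.h>

/* Invented stand-ins for NotifyQueueTailLock and NotifyQueueLock. */
static pthread_mutex_t tail_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Every path needing both locks takes tail_lock first.  If some other
 * path took queue_lock first, two backends could each hold one lock
 * while waiting forever for the other: a deadlock.
 */
static void
advance_tail(void)
{
    pthread_mutex_lock(&tail_lock);    /* outermost lock, per the rule */

    pthread_mutex_lock(&queue_lock);
    /* ... compute the new tail ... */
    pthread_mutex_unlock(&queue_lock);

    /* ... truncate SLRU segments with no inner lock held ... */

    pthread_mutex_lock(&queue_lock);
    /* ... advertise the new tail ... */
    pthread_mutex_unlock(&queue_lock);

    pthread_mutex_unlock(&tail_lock);
}

int
main(void)
{
    advance_tail();
    return 0;
}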
@@ -2177,6 +2180,10 @@ asyncQueueAdvanceTail(void)
int newtailpage;
int boundary;
+ /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+ LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
+
+ /* Compute the new tail. */
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
min = QUEUE_HEAD;
for (BackendId i = QUEUE_FIRST_LISTENER; i > 0; i = QUEUE_NEXT_LISTENER(i))
@@ -2185,7 +2192,6 @@ asyncQueueAdvanceTail(void)
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
}
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
- QUEUE_TAIL = min;
LWLockRelease(NotifyQueueLock);
/*
@@ -2205,6 +2211,17 @@ asyncQueueAdvanceTail(void)
*/
SimpleLruTruncate(NotifyCtl, newtailpage);
}
+
+ /*
+ * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
+ * the segment immediately prior to the new tail, allowing fresh data into
+ * that segment.
+ */
+ LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
+ QUEUE_TAIL = min;
+ LWLockRelease(NotifyQueueLock);
+
+ LWLockRelease(NotifyQueueTailLock);
}
/*
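The reordering above is the heart of the async.c change: advertising the new tail is the "limit update" in SimpleLruTruncate()'s sense, because asyncQueueIsFull() consults QUEUE_TAIL to decide whether the segment just before the tail may accept fresh notifications. Storing QUEUE_TAIL only after SimpleLruTruncate() returns, all under NotifyQueueTailLock, satisfies the new rule that mutual exclusion must end only after the limit update.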
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index aba13c31d1b..5189a5ad5e3 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1362,6 +1362,14 @@ vac_update_datfrozenxid(void)
bool dirty = false;
/*
+ * Restrict this task to one backend per database. This avoids race
+ * conditions that would move datfrozenxid or datminmxid backward. It
+ * avoids calling vac_truncate_clog() with a datfrozenxid preceding a
+ * datfrozenxid passed to an earlier vac_truncate_clog() call.
+ */
+ LockDatabaseFrozenIds(ExclusiveLock);
+
+ /*
* Initialize the "min" calculation with
* GetOldestNonRemovableTransactionId(), which is a reasonable
* approximation to the minimum relfrozenxid for not-yet-committed
@@ -1551,6 +1559,9 @@ vac_truncate_clog(TransactionId frozenXID,
bool bogus = false;
bool frozenAlreadyWrapped = false;
+ /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+ LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
+
/* init oldest datoids to sync with my frozenXID/minMulti values */
oldestxid_datoid = MyDatabaseId;
minmulti_datoid = MyDatabaseId;
@@ -1660,6 +1671,8 @@ vac_truncate_clog(TransactionId frozenXID,
*/
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+
+ LWLockRelease(WrapLimitsVacuumLock);
}
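Note how the new acquire/release pair brackets the whole function: WrapLimitsVacuumLock is taken before the scan that computes the oldest datfrozenxid and datminmxid (the inputs to the truncation cutoffs) and released only after SetTransactionIdLimit() and SetMultiXactIdLimit() have published the new limits, which is exactly the window the SimpleLruTruncate() header comment requires.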
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 20103200952..7409de94059 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -461,6 +461,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
}
/*
+ * LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
* LockPage
*
* Obtain a page-level lock. This is currently used by some index access
@@ -1098,6 +1113,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
tag->locktag_field2,
tag->locktag_field1);
break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ appendStringInfo(buf,
+ _("pg_database.datfrozenxid of database %u"),
+ tag->locktag_field1);
+ break;
case LOCKTAG_PAGE:
appendStringInfo(buf,
_("page %u of relation %u of database %u"),
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index e6985e8eedf..774292fd942 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,6 @@ MultiXactTruncationLock 41
OldSnapshotTimeMapLock 42
LogicalRepWorkerLock 43
XactTruncationLock 44
+# 45 was XactTruncationLock until removal of BackendRandomLock
+WrapLimitsVacuumLock 46
+NotifyQueueTailLock 47
diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index e992d1bbfce..f592292d067 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -29,6 +29,7 @@
const char *const LockTagTypeNames[] = {
"relation",
"extend",
+ "frozenid",
"page",
"tuple",
"transactionid",
@@ -254,6 +255,17 @@ pg_lock_status(PG_FUNCTION_ARGS)
nulls[8] = true;
nulls[9] = true;
break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
+ nulls[2] = true;
+ nulls[3] = true;
+ nulls[4] = true;
+ nulls[5] = true;
+ nulls[6] = true;
+ nulls[7] = true;
+ nulls[8] = true;
+ nulls[9] = true;
+ break;
case LOCKTAG_PAGE:
values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1);
values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2);
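With the LockTagTypeNames entry and the pg_lock_status() case above, the new lock surfaces to administrators in the pg_locks view as locktype "frozenid", carrying only the database column (the other object columns are NULL, per the nulls[] assignments). A query such as SELECT * FROM pg_locks WHERE locktype = 'frozenid' should therefore show the backend currently running vac_update_datfrozenxid() for a database.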