-rw-r--r--  src/backend/access/nbtree/README     | 23
-rw-r--r--  src/backend/access/nbtree/nbtxlog.c  | 58
2 files changed, 35 insertions, 46 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 9d5fc424a57..abce31a5a96 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -572,23 +572,12 @@ replay of page deletion records does not hold a write lock on the target
leaf page throughout; only the primary needs to block out concurrent
writers that insert on to the page being deleted.)
-There are also locking differences between the primary and WAL replay
-for the first stage of a page split (i.e. same-level differences in
-locking). Replay of the first phase of a page split can get away with
-locking and updating the original right sibling page (which is also the
-new right sibling page's right sibling) after locks on the original page
-and its new right sibling have been released. Again, this is okay
-because there are no writers. Page deletion WAL replay cannot get away
-with being lax about same-level locking during replay, though -- doing
-so risks confusing concurrent backwards scans.
-
-Page deletion's second phase locks the left sibling page, target page,
-and right page in order on the standby, just like on the primary. This
-allows backwards scans running on a standby to reason about page
-deletion on the leaf level; a page cannot appear deleted without that
-being reflected in the sibling pages. It's probably possible to be more
-lax about how locks are acquired on the standby during the second phase
-of page deletion, but that hardly seems worth it.
+WAL replay holds same-level locks in a way that matches the approach
+taken during original execution, though. This prevents readers from
+observing same-level inconsistencies. It's probably possible to be more
+lax about how same-level locks are acquired during recovery (most kinds
+of readers could still move right to recover if we didn't couple
+same-level locks), but we prefer to be conservative here.
During recovery all index scans start with ignore_killed_tuples = false
and we never set kill_prior_tuple. We do this because the oldest xmin
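
To make the same-level coupling concrete: the rule is simply that the lock on
the next page is taken before the lock on the current page is given up, so
there is no window in which a reader can slip between the two pages. A minimal
standalone sketch of that pattern in plain C, using pthread mutexes rather
than PostgreSQL buffer content locks (all types and names below are
illustrative, not backend APIs):

    #include <pthread.h>

    /* Toy stand-in for a B-tree page and its buffer content lock */
    typedef struct ToyPage
    {
        pthread_mutex_t lock;
        struct ToyPage *rightsib;
    } ToyPage;

    /*
     * Step one page to the right with lock coupling: acquire the next
     * page's lock before releasing the current page's lock, so a
     * concurrent reader can never observe the two same-level pages in
     * an inconsistent in-between state.
     */
    static void
    step_right_coupled(ToyPage **current)
    {
        ToyPage *next = (*current)->rightsib;

        pthread_mutex_lock(&next->lock);          /* couple: next first */
        pthread_mutex_unlock(&(*current)->lock);  /* only then drop current */
        *current = next;
    }

Cross-level (parent/child) locks are a different matter: as the nbtxlog.c
comments below note, REDO routines never need to couple those.
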
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 1fd63924632..dbec58d5249 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -172,10 +172,10 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
* Insertion to an internal page finishes an incomplete split at the child
* level. Clear the incomplete-split flag in the child. Note: during
* normal operation, the child and parent pages are locked at the same
- * time, so that clearing the flag and inserting the downlink appear
- * atomic to other backends. We don't bother with that during replay,
- * because readers don't care about the incomplete-split flag and there
- * cannot be updates happening.
+ * time (the locks are coupled), so that clearing the flag and inserting
+ * the downlink appear atomic to other backends. We don't bother with
+ * that during replay, because readers don't care about the
+ * incomplete-split flag and there cannot be updates happening.
*/
if (!isleaf)
_bt_clear_incomplete_split(record, 1);
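
_bt_clear_incomplete_split()'s body is not part of this diff; roughly, it
locks just the child page named by the given block reference, clears the
flag, and releases the lock immediately, with no parent-level lock held at
any point. The same helper is also what the btree_xlog_split() hunk below
calls (with block 3). A from-memory sketch of the helper, shown only to make
the "no cross-level coupling during replay" point concrete (treat the details
as approximate):

    static void
    _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
    {
        XLogRecPtr  lsn = record->EndRecPtr;
        Buffer      buf;

        if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
        {
            Page        page = (Page) BufferGetPage(buf);
            BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

            /*
             * The child is locked, updated, and released entirely on its
             * own; the parent's lock is never held at the same time.
             */
            Assert(P_INCOMPLETE_SPLIT(pageop));
            pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

            PageSetLSN(page, lsn);
            MarkBufferDirty(buf);
        }
        if (BufferIsValid(buf))
            UnlockReleaseBuffer(buf);
    }
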
@@ -272,9 +272,17 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record)
spagenumber = P_NONE;
/*
- * Clear the incomplete split flag on the left sibling of the child page
- * this is a downlink for. (Like in btree_xlog_insert, this can be done
- * before locking the other pages)
+ * Clear the incomplete split flag on the appropriate child page one level
+ * down when origpage/buf is an internal page (there must have been
+ * cascading page splits during original execution in the event of an
+ * internal page split). This is like the corresponding btree_xlog_insert
+ * call for internal pages. We're not clearing the incomplete split flag
+ * for the current page split here (you can think of this as part of the
+ * insert of newitem that the page split action needs to perform in
+ * passing).
+ *
+ * Like in btree_xlog_insert, this can be done before locking other pages.
+ * We never need to couple cross-level locks in REDO routines.
*/
if (!isleaf)
_bt_clear_incomplete_split(record, 3);
@@ -427,22 +435,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record)
MarkBufferDirty(buf);
}
- /*
- * We no longer need the buffers. They must be released together, so that
- * readers cannot observe two inconsistent halves.
- */
- if (BufferIsValid(buf))
- UnlockReleaseBuffer(buf);
- UnlockReleaseBuffer(rbuf);
-
- /*
- * Fix left-link of the page to the right of the new right sibling.
- *
- * Note: in normal operation, we do this while still holding lock on the
- * two split pages. However, that's not necessary for correctness in WAL
- * replay, because no other index update can be in progress, and readers
- * will cope properly when following an obsolete left-link.
- */
+ /* Fix left-link of the page to the right of the new right sibling */
if (spagenumber != P_NONE)
{
Buffer sbuf;
@@ -460,6 +453,14 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record)
if (BufferIsValid(sbuf))
UnlockReleaseBuffer(sbuf);
}
+
+ /*
+ * Finally, release the remaining buffers. sbuf, rbuf, and buf must be
+ * released together, so that readers cannot observe inconsistencies.
+ */
+ UnlockReleaseBuffer(rbuf);
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
}
static void
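
Putting the two btree_xlog_split() hunks together, the resulting same-level
ordering is: update origpage (buf) and its new right sibling (rbuf), then fix
the old right sibling's left-link while buf and rbuf are still locked, and
only then release everything. A condensed skeleton of that ordering (page
update details elided; block 2 is assumed here to be the record's reference
to the old right sibling):

        /* buf (origpage) and rbuf (new right sibling) are still locked */

        /* Fix left-link of the page to the right of the new right sibling */
        if (spagenumber != P_NONE)
        {
            Buffer      sbuf;

            if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO)
            {
                /* point sbuf's left-link at the new right sibling */
            }
            if (BufferIsValid(sbuf))
                UnlockReleaseBuffer(sbuf);
        }

        /* Only now give up the locks on the two halves of the split */
        UnlockReleaseBuffer(rbuf);
        if (BufferIsValid(buf))
            UnlockReleaseBuffer(buf);
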
@@ -733,6 +734,11 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
+
+ /*
+ * Don't need to couple cross-level locks in REDO routines, so release
+ * lock on internal page immediately
+ */
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
@@ -789,12 +795,6 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
* the pages in the same standard left-to-right order (leftsib, target,
* rightsib), and don't release the sibling locks until the target is
* marked deleted.
- *
- * btree_xlog_split() can get away with fixing its right sibling page's
- * left link last of all, after dropping all other locks. We prefer to
- * avoid dropping locks on same-level pages early compared to normal
- * operation. This keeps things simple for backwards scans. See
- * nbtree/README.
*/
/* Fix right-link of left sibling, if any */
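
For the same reason, the second phase of page deletion is replayed with the
primary's left-to-right ordering: left sibling, target, right sibling, and the
sibling locks are kept until the target is marked deleted. A toy standalone
sketch of that ordering (illustrative names only; the real redo routine also
deals with the leaf page and the metapage):

    #include <pthread.h>

    typedef struct ToyPage
    {
        pthread_mutex_t lock;
        int             deleted;    /* stands in for the deleted-page flag */
    } ToyPage;

    /*
     * Toy version of page deletion's second phase: lock left sibling,
     * target, and right sibling in left-to-right order, unlink the target,
     * and hold the sibling locks until the target is marked deleted.  A
     * backwards scan can therefore never find the target half-unlinked.
     */
    static void
    unlink_page_toy(ToyPage *leftsib, ToyPage *target, ToyPage *rightsib)
    {
        pthread_mutex_lock(&leftsib->lock);
        pthread_mutex_lock(&target->lock);
        pthread_mutex_lock(&rightsib->lock);

        /* ... fix leftsib's right-link and rightsib's left-link ... */
        target->deleted = 1;

        pthread_mutex_unlock(&rightsib->lock);
        pthread_mutex_unlock(&target->lock);
        pthread_mutex_unlock(&leftsib->lock);
    }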