summaryrefslogtreecommitdiff
path: root/src/backend/access
diff options
context:
space:
mode:
authorRobert Haas2017-02-08 20:45:30 +0000
committerRobert Haas2017-02-08 20:45:30 +0000
commita507b86900f695aacc8d52b7d2cfcb65f58862a2 (patch)
tree1cc25aaddb8613d2744e803e3ea970fd50d1e309 /src/backend/access
parent115cb31597fac8a17202d1e41da8baf33fcb60cf (diff)
Add WAL consistency checking facility.
When the new GUC wal_consistency_checking is set to a non-empty value, it triggers recording of additional full-page images, which are compared on the standby against the results of applying the WAL record (without regard to those full-page images). Allowable differences such as hints are masked out, and the resulting pages are compared; any difference results in a FATAL error on the standby. Kuntal Ghosh, based on earlier patches by Michael Paquier and Heikki Linnakangas. Extensively reviewed and revised by Michael Paquier and by me, with additional reviews and comments from Amit Kapila, Álvaro Herrera, Simon Riggs, and Peter Eisentraut.
Diffstat (limited to 'src/backend/access')
-rw-r--r--src/backend/access/brin/brin_xlog.c20
-rw-r--r--src/backend/access/common/Makefile4
-rw-r--r--src/backend/access/common/bufmask.c128
-rw-r--r--src/backend/access/gin/ginxlog.c32
-rw-r--r--src/backend/access/gist/gistxlog.c43
-rw-r--r--src/backend/access/heap/heapam.c79
-rw-r--r--src/backend/access/nbtree/nbtxlog.c50
-rw-r--r--src/backend/access/rmgrdesc/gindesc.c14
-rw-r--r--src/backend/access/spgist/spgxlog.c21
-rw-r--r--src/backend/access/transam/generic_xlog.c12
-rw-r--r--src/backend/access/transam/rmgr.c4
-rw-r--r--src/backend/access/transam/xlog.c120
-rw-r--r--src/backend/access/transam/xloginsert.c38
-rw-r--r--src/backend/access/transam/xlogreader.c8
-rw-r--r--src/backend/access/transam/xlogutils.c11
15 files changed, 568 insertions, 16 deletions
diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c
index b698c9b58c5..f416bacc3f7 100644
--- a/src/backend/access/brin/brin_xlog.c
+++ b/src/backend/access/brin/brin_xlog.c
@@ -13,6 +13,7 @@
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
+#include "access/bufmask.h"
#include "access/xlogutils.h"
@@ -279,3 +280,22 @@ brin_redo(XLogReaderState *record)
elog(PANIC, "brin_redo: unknown op code %u", info);
}
}
+
+/*
+ * Mask a BRIN page before doing consistency checks.
+ */
+void
+brin_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+
+ if (BRIN_IS_REGULAR_PAGE(page))
+ {
+ /* Regular brin pages contain unused space which needs to be masked. */
+ mask_unused_space(page);
+ }
+}
diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile
index d4b8132a973..fb27944b891 100644
--- a/src/backend/access/common/Makefile
+++ b/src/backend/access/common/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/access/common
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = heaptuple.o indextuple.o printsimple.o printtup.o reloptions.o \
- scankey.o tupconvert.o tupdesc.o
+OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \
+ reloptions.o scankey.o tupconvert.o tupdesc.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c
new file mode 100644
index 00000000000..3b06115e035
--- /dev/null
+++ b/src/backend/access/common/bufmask.c
@@ -0,0 +1,128 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmask.c
+ * Routines for buffer masking. Used to mask certain bits
+ * in a page which can be different when the WAL is generated
+ * and when the WAL is applied.
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * Contains common routines required for masking a page.
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/bufmask.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/bufmask.h"
+
+/*
+ * mask_page_lsn
+ *
+ * In consistency checks, the LSN of the two pages compared will likely be
+ * different because of concurrent operations when the WAL is generated
+ * and the state of the page when WAL is applied.
+ */
+void
+mask_page_lsn(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+
+ PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER);
+}
+
+/*
+ * mask_page_hint_bits
+ *
+ * Mask hint bits in PageHeader. We want to ignore differences in hint bits,
+ * since they can be set without emitting any WAL.
+ */
+void
+mask_page_hint_bits(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+
+ /* Ignore prune_xid (it's like a hint-bit) */
+ phdr->pd_prune_xid = MASK_MARKER;
+
+ /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints. */
+ PageClearFull(page);
+ PageClearHasFreeLinePointers(page);
+
+ /*
+ * During replay, if the page LSN has advanced past our XLOG record's LSN,
+ * we don't mark the page all-visible. See heap_xlog_visible() for
+ * details.
+ */
+ PageClearAllVisible(page);
+}
+
+/*
+ * mask_unused_space
+ *
+ * Mask the unused space of a page between pd_lower and pd_upper.
+ */
+void
+mask_unused_space(Page page)
+{
+ int pd_lower = ((PageHeader) page)->pd_lower;
+ int pd_upper = ((PageHeader) page)->pd_upper;
+ int pd_special = ((PageHeader) page)->pd_special;
+
+ /* Sanity check */
+ if (pd_lower > pd_upper || pd_special < pd_upper ||
+ pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ)
+ {
+ elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n",
+ pd_lower, pd_upper, pd_special);
+ }
+
+ memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower);
+}
+
+/*
+ * mask_lp_flags
+ *
+ * In some index AMs, line pointer flags can be modified in master without
+ * emitting any WAL record.
+ */
+void
+mask_lp_flags(Page page)
+{
+ OffsetNumber offnum,
+ maxoff;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsUsed(itemId))
+ itemId->lp_flags = LP_UNUSED;
+ }
+}
+
+/*
+ * mask_page_content
+ *
+ * In some index AMs, the contents of deleted pages need to be almost
+ * completely ignored.
+ */
+void
+mask_page_content(Page page)
+{
+ /* Mask Page Content */
+ memset(page + SizeOfPageHeaderData, MASK_MARKER,
+ BLCKSZ - SizeOfPageHeaderData);
+
+ /* Mask pd_lower and pd_upper */
+ memset(&((PageHeader) page)->pd_lower, MASK_MARKER,
+ sizeof(uint16));
+ memset(&((PageHeader) page)->pd_upper, MASK_MARKER,
+ sizeof(uint16));
+}
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index 8468fe825cf..2995e7b06a7 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/gin_private.h"
#include "access/xlogutils.h"
#include "utils/memutils.h"
@@ -758,3 +759,34 @@ gin_xlog_cleanup(void)
MemoryContextDelete(opCtx);
opCtx = NULL;
}
+
+/*
+ * Mask a GIN page before running consistency checks on it.
+ */
+void
+gin_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ GinPageOpaque opaque;
+
+ mask_page_lsn(page);
+ opaque = GinPageGetOpaque(page);
+
+ mask_page_hint_bits(page);
+
+ /*
+ * GIN metapage doesn't use pd_lower/pd_upper. Other page types do. Hence,
+ * we need to apply masking for those pages.
+ */
+ if (opaque->flags != GIN_META)
+ {
+ /*
+ * For GIN_DELETED page, the page is initialized to empty. Hence, mask
+ * the page content.
+ */
+ if (opaque->flags & GIN_DELETED)
+ mask_page_content(page);
+ else
+ mask_unused_space(page);
+ }
+}
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 88b97a4e487..cbda9e705cc 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
@@ -343,6 +344,48 @@ gist_xlog_cleanup(void)
}
/*
+ * Mask a Gist page before running consistency checks on it.
+ */
+void
+gist_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ /*
+ * NSN is nothing but a special purpose LSN. Hence, mask it for the same
+ * reason as mask_page_lsn.
+ */
+ GistPageSetNSN(page, (uint64) MASK_MARKER);
+
+ /*
+ * We update F_FOLLOW_RIGHT flag on the left child after writing WAL
+ * record. Hence, mask this flag. See gistplacetopage() for details.
+ */
+ GistMarkFollowRight(page);
+
+ if (GistPageIsLeaf(page))
+ {
+ /*
+ * In gist leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * gistkillitems() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * During gist redo, we never mark a page as garbage. Hence, mask it to
+ * ignore any differences.
+ */
+ GistClearPageHasGarbage(page);
+}
+
+/*
* Write WAL record of a page split.
*/
XLogRecPtr
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 5fd7f1e1a20..0be48fb3ee1 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -38,6 +38,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/hio.h"
@@ -9142,3 +9143,81 @@ heap_sync(Relation rel)
heap_close(toastrel, AccessShareLock);
}
}
+
+/*
+ * Mask a heap page before performing consistency checks on it.
+ */
+void
+heap_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ OffsetNumber off;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
+ {
+ ItemId iid = PageGetItemId(page, off);
+ char *page_item;
+
+ page_item = (char *) (page + ItemIdGetOffset(iid));
+
+ if (ItemIdIsNormal(iid))
+ {
+
+ HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
+
+ /*
+ * If xmin of a tuple is not yet frozen, we should ignore
+ * differences in hint bits, since they can be set without
+ * emitting WAL.
+ */
+ if (!HeapTupleHeaderXminFrozen(page_htup))
+ page_htup->t_infomask &= ~HEAP_XACT_MASK;
+ else
+ {
+ /* Still we need to mask xmax hint bits. */
+ page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
+ page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
+ }
+
+ /*
+ * During replay, we set Command Id to FirstCommandId. Hence, mask
+ * it. See heap_xlog_insert() for details.
+ */
+ page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
+
+ /*
+ * For a speculative tuple, heap_insert() does not set ctid in the
+ * caller-passed heap tuple itself, leaving the ctid field to
+ * contain a speculative token value - a per-backend monotonically
+ * increasing identifier. Besides, it does not WAL-log ctid under
+ * any circumstances.
+ *
+ * During redo, heap_xlog_insert() sets t_ctid to current block
+ * number and self offset number. It doesn't care about any
+ * speculative insertions in master. Hence, we set t_ctid to
+ * current block number and self offset number to ignore any
+ * inconsistency.
+ */
+ if (HeapTupleHeaderIsSpeculative(page_htup))
+ ItemPointerSet(&page_htup->t_ctid, blkno, off);
+ }
+
+ /*
+ * Ignore any padding bytes after the tuple, when the length of the
+ * item is not MAXALIGNed.
+ */
+ if (ItemIdHasStorage(iid))
+ {
+ int len = ItemIdGetLength(iid);
+ int padlen = MAXALIGN(len) - len;
+
+ if (padlen > 0)
+ memset(page_item + len, MASK_MARKER, padlen);
+ }
+ }
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index efad745c57e..a9ca279d813 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -14,6 +14,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/heapam_xlog.h"
#include "access/nbtree.h"
#include "access/transam.h"
@@ -1028,3 +1029,52 @@ btree_redo(XLogReaderState *record)
elog(PANIC, "btree_redo: unknown op code %u", info);
}
}
+
+/*
+ * Mask a btree page before performing consistency checks on it.
+ */
+void
+btree_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ BTPageOpaque maskopaq;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_ISDELETED(maskopaq))
+ {
+ /*
+ * Mask page content on a DELETED page since it will be re-initialized
+ * during replay. See btree_xlog_unlink_page() for details.
+ */
+ mask_page_content(page);
+ }
+ else if (P_ISLEAF(maskopaq))
+ {
+ /*
+ * In btree leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ /*
+ * During replay of a btree page split, we don't set the BTP_SPLIT_END
+ * flag of the right sibling and initialize the cycle_id to 0 for the same
+ * page. See btree_xlog_split() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_SPLIT_END;
+ maskopaq->btpo_cycleid = 0;
+}
diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c
index 9e488b359af..d4ed7f9c0ab 100644
--- a/src/backend/access/rmgrdesc/gindesc.c
+++ b/src/backend/access/rmgrdesc/gindesc.c
@@ -105,7 +105,12 @@ gin_desc(StringInfo buf, XLogReaderState *record)
leftChildBlkno, rightChildBlkno);
}
if (XLogRecHasBlockImage(record, 0))
- appendStringInfoString(buf, " (full page image)");
+ {
+ if (XLogRecBlockImageApply(record, 0))
+ appendStringInfoString(buf, " (full page image)");
+ else
+ appendStringInfoString(buf, " (full page image, for WAL verification)");
+ }
else
{
char *payload = XLogRecGetBlockData(record, 0, NULL);
@@ -145,7 +150,12 @@ gin_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
{
if (XLogRecHasBlockImage(record, 0))
- appendStringInfoString(buf, " (full page image)");
+ {
+ if (XLogRecBlockImageApply(record, 0))
+ appendStringInfoString(buf, " (full page image)");
+ else
+ appendStringInfoString(buf, " (full page image, for WAL verification)");
+ }
else
{
ginxlogVacuumDataLeafPage *xlrec =
diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c
index 3dc6a5ab881..596b266ba64 100644
--- a/src/backend/access/spgist/spgxlog.c
+++ b/src/backend/access/spgist/spgxlog.c
@@ -14,6 +14,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/spgist_private.h"
#include "access/transam.h"
#include "access/xlog.h"
@@ -1023,3 +1024,23 @@ spg_xlog_cleanup(void)
MemoryContextDelete(opCtx);
opCtx = NULL;
}
+
+/*
+ * Mask a SpGist page before performing consistency checks on it.
+ */
+void
+spg_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+
+ /*
+ * Any SpGist page other than meta contains unused space which needs to be
+ * masked.
+ */
+ if (!SpGistPageIsMeta(page))
+ mask_unused_space(page);
+}
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index eddec9bc548..fbc6810c2f2 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/generic_xlog.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
@@ -533,3 +534,14 @@ generic_redo(XLogReaderState *record)
UnlockReleaseBuffer(buffers[block_id]);
}
}
+
+/*
+ * Mask a generic page before performing consistency checks on it.
+ */
+void
+generic_mask(char *page, BlockNumber blkno)
+{
+ mask_page_lsn(page);
+
+ mask_unused_space(page);
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 9bb136218d5..eae75242fea 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -30,8 +30,8 @@
#include "utils/relmapper.h"
/* must be kept in sync with RmgrData definition in xlog_internal.h */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
- { name, redo, desc, identify, startup, cleanup },
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+ { name, redo, desc, identify, startup, cleanup, mask },
const RmgrData RmgrTable[RM_MAX_ID + 1] = {
#include "access/rmgrlist.h"
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 2f5d6030660..cc8b83fa8d6 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -95,6 +95,8 @@ bool EnableHotStandby = false;
bool fullPageWrites = true;
bool wal_log_hints = false;
bool wal_compression = false;
+char *wal_consistency_checking_string = NULL;
+bool *wal_consistency_checking = NULL;
bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL;
@@ -245,6 +247,10 @@ bool InArchiveRecovery = false;
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
+/* Buffers dedicated to consistency checks of size BLCKSZ */
+static char *replay_image_masked = NULL;
+static char *master_image_masked = NULL;
+
/* options taken from recovery.conf for archive recovery */
char *recoveryRestoreCommand = NULL;
static char *recoveryEndCommand = NULL;
@@ -903,6 +909,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
+static void checkXLogConsistency(XLogReaderState *record);
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
@@ -1315,6 +1322,103 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
}
/*
+ * Checks whether the current buffer page and backup page stored in the
+ * WAL record are consistent or not. Before comparing the two pages, a
+ * masking can be applied to the pages to ignore certain areas like hint bits,
+ * unused space between pd_lower and pd_upper among other things. This
+ * function should be called once WAL replay has been completed for a
+ * given record.
+ */
+static void
+checkXLogConsistency(XLogReaderState *record)
+{
+ RmgrId rmid = XLogRecGetRmid(record);
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blkno;
+ int block_id;
+
+ /* Records with no backup blocks have no need for consistency checks. */
+ if (!XLogRecHasAnyBlockRefs(record))
+ return;
+
+ Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
+
+ for (block_id = 0; block_id <= record->max_block_id; block_id++)
+ {
+ Buffer buf;
+ Page page;
+
+ if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+ {
+ /*
+ * WAL record doesn't contain a block reference with the given id.
+ * Do nothing.
+ */
+ continue;
+ }
+
+ Assert(XLogRecHasBlockImage(record, block_id));
+
+ /*
+ * Read the contents from the current buffer and store it in a
+ * temporary page.
+ */
+ buf = XLogReadBufferExtended(rnode, forknum, blkno,
+ RBM_NORMAL_NO_LOG);
+ if (!BufferIsValid(buf))
+ continue;
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buf);
+
+ /*
+ * Take a copy of the local page where WAL has been applied to have a
+ * comparison base before masking it...
+ */
+ memcpy(replay_image_masked, page, BLCKSZ);
+
+ /* No need for this page anymore now that a copy is in. */
+ UnlockReleaseBuffer(buf);
+
+ /*
+ * If the block LSN is already ahead of this WAL record, we can't
+ * expect contents to match. This can happen if recovery is restarted.
+ */
+ if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
+ continue;
+
+ /*
+ * Read the contents from the backup copy, stored in WAL record and
+ * store it in a temporary page. There is not need to allocate a new
+ * page here, a local buffer is fine to hold its contents and a mask
+ * can be directly applied on it.
+ */
+ if (!RestoreBlockImage(record, block_id, master_image_masked))
+ elog(ERROR, "failed to restore block image");
+
+ /*
+ * If masking function is defined, mask both the master and replay
+ * images
+ */
+ if (RmgrTable[rmid].rm_mask != NULL)
+ {
+ RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
+ RmgrTable[rmid].rm_mask(master_image_masked, blkno);
+ }
+
+ /* Time to compare the master and replay images. */
+ if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
+ {
+ elog(FATAL,
+ "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ forknum, blkno);
+ }
+ }
+}
+
+/*
* Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
* area in the WAL.
*/
@@ -6200,6 +6304,13 @@ StartupXLOG(void)
errdetail("Failed while allocating an XLog reading processor.")));
xlogreader->system_identifier = ControlFile->system_identifier;
+ /*
+ * Allocate pages dedicated to WAL consistency checks, those had better
+ * be aligned.
+ */
+ replay_image_masked = (char *) palloc(BLCKSZ);
+ master_image_masked = (char *) palloc(BLCKSZ);
+
if (read_backup_label(&checkPointLoc, &backupEndRequired,
&backupFromStandby))
{
@@ -7000,6 +7111,15 @@ StartupXLOG(void)
/* Now apply the WAL record itself */
RmgrTable[record->xl_rmid].rm_redo(xlogreader);
+ /*
+ * After redo, check whether the backup pages associated with
+ * the WAL record are consistent with the existing pages. This
+ * check is done only if consistency check is enabled for this
+ * record.
+ */
+ if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
+ checkXLogConsistency(xlogreader);
+
/* Pop the error context stack */
error_context_stack = errcallback.previous;
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a5aa58d845d..797e68cd901 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -421,10 +421,12 @@ XLogInsert(RmgrId rmid, uint8 info)
elog(ERROR, "XLogBeginInsert was not called");
/*
- * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are
- * reserved for use by me.
+ * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and
+ * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me.
*/
- if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0)
+ if ((info & ~(XLR_RMGR_INFO_MASK |
+ XLR_SPECIAL_REL_UPDATE |
+ XLR_CHECK_CONSISTENCY)) != 0)
elog(PANIC, "invalid xlog info mask %02X", info);
TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
@@ -505,6 +507,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
hdr_rdt.data = hdr_scratch;
/*
+ * Enforce consistency checks for this record if user is looking for
+ * it. Do this before at the beginning of this routine to give the
+ * possibility for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY
+ * directly for a record.
+ */
+ if (wal_consistency_checking[rmid])
+ info |= XLR_CHECK_CONSISTENCY;
+
+ /*
* Make an rdata chain containing all the data portions of all block
* references. This includes the data for full-page images. Also append
* the headers for the block references in the scratch buffer.
@@ -520,6 +531,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
XLogRecordBlockCompressHeader cbimg = {0};
bool samerel;
bool is_compressed = false;
+ bool include_image;
if (!regbuf->in_use)
continue;
@@ -563,7 +575,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT)
bkpb.fork_flags |= BKPBLOCK_WILL_INIT;
- if (needs_backup)
+ /*
+ * If needs_backup is true or WAL checking is enabled for
+ * current resource manager, log a full-page write for the current
+ * block.
+ */
+ include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0;
+
+ if (include_image)
{
Page page = regbuf->page;
uint16 compressed_len;
@@ -625,6 +644,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE;
+ /*
+ * If WAL consistency checking is enabled for the resource manager of
+ * this WAL record, a full-page image is included in the record
+ * for the block modified. During redo, the full-page is replayed
+ * only if BKPIMAGE_APPLY is set.
+ */
+ if (needs_backup)
+ bimg.bimg_info |= BKPIMAGE_APPLY;
+
if (is_compressed)
{
bimg.length = compressed_len;
@@ -687,7 +715,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
/* Ok, copy the header to the scratch buffer */
memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader);
scratch += SizeOfXLogRecordBlockHeader;
- if (needs_backup)
+ if (include_image)
{
memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader);
scratch += SizeOfXLogRecordBlockImageHeader;
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index b528745fe85..f077662946f 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -997,6 +997,7 @@ ResetDecoder(XLogReaderState *state)
state->blocks[block_id].in_use = false;
state->blocks[block_id].has_image = false;
state->blocks[block_id].has_data = false;
+ state->blocks[block_id].apply_image = false;
}
state->max_block_id = -1;
}
@@ -1089,6 +1090,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
blk = &state->blocks[block_id];
blk->in_use = true;
+ blk->apply_image = false;
COPY_HEADER_FIELD(&fork_flags, sizeof(uint8));
blk->forknum = fork_flags & BKPBLOCK_FORK_MASK;
@@ -1120,6 +1122,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16));
COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8));
+
+ blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0);
+
if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED)
{
if (blk->bimg_info & BKPIMAGE_HAS_HOLE)
@@ -1243,6 +1248,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
if (!blk->in_use)
continue;
+
+ Assert(blk->has_image || !blk->apply_image);
+
if (blk->has_image)
{
blk->bkp_image = ptr;
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 0de2419e54b..6627f5498b9 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -275,9 +275,9 @@ XLogCheckInvalidPages(void)
* will complain if we don't have the lock. In hot standby mode it's
* definitely necessary.)
*
- * Note: when a backup block is available in XLOG, we restore it
- * unconditionally, even if the page in the database appears newer. This is
- * to protect ourselves against database pages that were partially or
+ * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
+ * set, we restore it, even if the page in the database appears newer. This
+ * is to protect ourselves against database pages that were partially or
* incorrectly written during a crash. We assume that the XLOG data must be
* good because it has passed a CRC check, while the database page might not
* be. This will force us to replay all subsequent modifications of the page
@@ -352,9 +352,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
if (!willinit && zeromode)
elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
- /* If it's a full-page image, restore it. */
- if (XLogRecHasBlockImage(record, block_id))
+ /* If it has a full-page image and it should be restored, do it. */
+ if (XLogRecBlockImageApply(record, block_id))
{
+ Assert(XLogRecHasBlockImage(record, block_id));
*buf = XLogReadBufferExtended(rnode, forknum, blkno,
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
page = BufferGetPage(*buf);