summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/backend/access/transam/xlog.c 120
-rw-r--r-- src/backend/access/transam/xlogreader.c 3
-rw-r--r-- src/backend/replication/walsender.c 12
-rw-r--r-- src/include/access/xlog.h 3
4 files changed, 134 insertions, 4 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 478377c4a23..4e14c242b15 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1706,6 +1706,126 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
}
/*
+ * Read WAL data directly from WAL buffers, if available. Returns the number
+ * of bytes read successfully.
+ *
+ * Fewer than 'count' bytes may be read if some of the requested WAL data has
+ * already been evicted from the WAL buffers, or if the caller requests data
+ * that is not yet available.
+ *
+ * No locks are taken.
+ *
+ * The 'tli' argument is only used as a convenient safety check so that
+ * callers do not read from WAL buffers on a historical timeline.
+ */
+Size
+WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
+				   TimeLineID tli)
+{
+	char	   *pdst = dstbuf;
+	XLogRecPtr	recptr = startptr;
+	XLogRecPtr	upto;
+	Size		nbytes;
+
+	if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
+		return 0;
+
+	Assert(!XLogRecPtrIsInvalid(startptr));
+
+	/*
+	 * Don't read past the available WAL data.
+	 *
+	 * Check using local copy of LogwrtResult. Ordinarily it's been updated by
+	 * the caller when determining how far to read; but if not, we must read
+	 * less data, or none at all when the copy has not even reached startptr
+	 * (the unsigned subtraction below would otherwise wrap around).
+	 * XXX: the available WAL could be extended to the WAL insert pointer by
+	 * calling WaitXLogInsertionsToFinish().
+	 */
+	upto = Min(startptr + count, LogwrtResult.Write);
+	nbytes = (upto > startptr) ? upto - startptr : 0;
+
+	/*
+	 * Loop through the buffers without a lock. For each buffer, atomically
+	 * read and verify the end pointer, then copy the data out, and finally
+	 * re-read and re-verify the end pointer.
+	 *
+	 * Once a page is evicted, it never returns to the WAL buffers, so if the
+	 * end pointer matches the expected end pointer before and after we copy
+	 * the data, then the right page must have been present during the data
+	 * copy. Read barriers are necessary to ensure that the data copy actually
+	 * happens between the two verification steps.
+	 *
+	 * If either verification fails, we simply terminate the loop and return
+	 * with the data that had been already copied out successfully.
+	 */
+	while (nbytes > 0)
+	{
+		uint32		offset = recptr % XLOG_BLCKSZ;
+		int			idx = XLogRecPtrToBufIdx(recptr);
+		XLogRecPtr	expectedEndPtr;
+		XLogRecPtr	endptr;
+		const char *page;
+		const char *psrc;
+		Size		npagebytes;
+
+		/*
+		 * Calculate the end pointer we expect in the xlblocks array if the
+		 * correct page is present.
+		 */
+		expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
+
+		/*
+		 * First verification step: check that the correct page is present in
+		 * the WAL buffers.
+		 */
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
+		if (expectedEndPtr != endptr)
+			break;
+
+		/*
+		 * The correct page is present (or was at the time the endptr was
+		 * read; must re-verify later). Calculate pointer to source data and
+		 * determine how much data to read from this page.
+		 */
+		page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
+		psrc = page + offset;
+		npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
+
+		/*
+		 * Ensure that the data copy and the first verification step are not
+		 * reordered.
+		 */
+		pg_read_barrier();
+
+		/* data copy */
+		memcpy(pdst, psrc, npagebytes);
+
+		/*
+		 * Ensure that the data copy and the second verification step are not
+		 * reordered.
+		 */
+		pg_read_barrier();
+
+		/*
+		 * Second verification step: check that the page we read from wasn't
+		 * evicted while we were copying the data.
+		 */
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
+		if (expectedEndPtr != endptr)
+			break;
+
+		pdst += npagebytes;
+		recptr += npagebytes;
+		nbytes -= npagebytes;
+	}
+
+	Assert(pdst - dstbuf <= count);
+
+	return pdst - dstbuf;
+}
+
+/*
* Converts a "usable byte position" to XLogRecPtr. A usable byte position
* is the position starting from the beginning of WAL, excluding all WAL
* page headers.
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 7190156f2fe..74a6b118669 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1500,9 +1500,6 @@ err:
*
* Returns true if succeeded, false if an error occurs, in which case
* 'errinfo' receives error details.
- *
- * XXX probably this should be improved to suck data directly from the
- * WAL buffers when possible.
*/
bool
WALRead(XLogReaderState *state,
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 77c8baa32a4..146826d5db9 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -2966,6 +2966,7 @@ XLogSendPhysical(void)
Size nbytes;
XLogSegNo segno;
WALReadError errinfo;
+ Size rbytes;
/* If requested switch the WAL sender to the stopping state. */
if (got_STOPPING)
@@ -3181,7 +3182,16 @@ XLogSendPhysical(void)
enlargeStringInfo(&output_message, nbytes);
retry:
- if (!WALRead(xlogreader,
+ /* attempt to read WAL from WAL buffers first */
+ rbytes = WALReadFromBuffers(&output_message.data[output_message.len],
+ startptr, nbytes, xlogreader->seg.ws_tli);
+ output_message.len += rbytes;
+ startptr += rbytes;
+ nbytes -= rbytes;
+
+ /* now read the remaining WAL from WAL file */
+ if (nbytes > 0 &&
+ !WALRead(xlogreader,
&output_message.data[output_message.len],
startptr,
nbytes,
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 301c5fa11fb..76787a82673 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -252,6 +252,9 @@ extern XLogRecPtr GetLastImportantRecPtr(void);
extern void SetWalWriterSleeping(bool sleeping);
+extern Size WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
+ TimeLineID tli);
+
/*
* Routines used by xlogrecovery.c to call back into xlog.c during recovery.
*/