diff options
-rw-r--r-- | src/backend/access/transam/xlog.c | 120 | ||||
-rw-r--r-- | src/backend/access/transam/xlogreader.c | 3 | ||||
-rw-r--r-- | src/backend/replication/walsender.c | 12 | ||||
-rw-r--r-- | src/include/access/xlog.h | 3 |
4 files changed, 134 insertions, 4 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 478377c4a23..4e14c242b15 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1706,6 +1706,126 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) } /* + * Read WAL data directly from WAL buffers, if available. Returns the number + * of bytes read successfully. + * + * Fewer than 'count' bytes may be read if some of the requested WAL data has + * already been evicted from the WAL buffers, or if the caller requests data + * that is not yet available. + * + * No locks are taken. + * + * The 'tli' argument is only used as a convenient safety check so that + * callers do not read from WAL buffers on a historical timeline. + */ +Size +WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, + TimeLineID tli) +{ + char *pdst = dstbuf; + XLogRecPtr recptr = startptr; + XLogRecPtr upto; + Size nbytes; + + if (RecoveryInProgress() || tli != GetWALInsertionTimeLine()) + return 0; + + Assert(!XLogRecPtrIsInvalid(startptr)); + + /* + * Don't read past the available WAL data. + * + * Check using local copy of LogwrtResult. Ordinarily it's been updated by + * the caller when determining how far to read; but if not, it just means + * we'll read less data. + * + * XXX: the available WAL could be extended to the WAL insert pointer by + * calling WaitXLogInsertionsToFinish(). + */ + upto = Min(startptr + count, LogwrtResult.Write); + nbytes = upto - startptr; + + /* + * Loop through the buffers without a lock. For each buffer, atomically + * read and verify the end pointer, then copy the data out, and finally + * re-read and re-verify the end pointer. + * + * Once a page is evicted, it never returns to the WAL buffers, so if the + * end pointer matches the expected end pointer before and after we copy + * the data, then the right page must have been present during the data + * copy. Read barriers are necessary to ensure that the data copy actually + * happens between the two verification steps. + * + * If either verification fails, we simply terminate the loop and return + * with the data that had been already copied out successfully. + */ + while (nbytes > 0) + { + uint32 offset = recptr % XLOG_BLCKSZ; + int idx = XLogRecPtrToBufIdx(recptr); + XLogRecPtr expectedEndPtr; + XLogRecPtr endptr; + const char *page; + const char *psrc; + Size npagebytes; + + /* + * Calculate the end pointer we expect in the xlblocks array if the + * correct page is present. + */ + expectedEndPtr = recptr + (XLOG_BLCKSZ - offset); + + /* + * First verification step: check that the correct page is present in + * the WAL buffers. + */ + endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); + if (expectedEndPtr != endptr) + break; + + /* + * The correct page is present (or was at the time the endptr was + * read; must re-verify later). Calculate pointer to source data and + * determine how much data to read from this page. + */ + page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; + psrc = page + offset; + npagebytes = Min(nbytes, XLOG_BLCKSZ - offset); + + /* + * Ensure that the data copy and the first verification step are not + * reordered. + */ + pg_read_barrier(); + + /* data copy */ + memcpy(pdst, psrc, npagebytes); + + /* + * Ensure that the data copy and the second verification step are not + * reordered. + */ + pg_read_barrier(); + + /* + * Second verification step: check that the page we read from wasn't + * evicted while we were copying the data. + */ + endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); + if (expectedEndPtr != endptr) + break; + + pdst += npagebytes; + recptr += npagebytes; + nbytes -= npagebytes; + } + + Assert(pdst - dstbuf <= count); + + return pdst - dstbuf; +} + +/* * Converts a "usable byte position" to XLogRecPtr. A usable byte position * is the position starting from the beginning of WAL, excluding all WAL * page headers. diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 7190156f2fe..74a6b118669 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1500,9 +1500,6 @@ err: * * Returns true if succeeded, false if an error occurs, in which case * 'errinfo' receives error details. - * - * XXX probably this should be improved to suck data directly from the - * WAL buffers when possible. */ bool WALRead(XLogReaderState *state, diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 77c8baa32a4..146826d5db9 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2966,6 +2966,7 @@ XLogSendPhysical(void) Size nbytes; XLogSegNo segno; WALReadError errinfo; + Size rbytes; /* If requested switch the WAL sender to the stopping state. */ if (got_STOPPING) @@ -3181,7 +3182,16 @@ XLogSendPhysical(void) enlargeStringInfo(&output_message, nbytes); retry: - if (!WALRead(xlogreader, + /* attempt to read WAL from WAL buffers first */ + rbytes = WALReadFromBuffers(&output_message.data[output_message.len], + startptr, nbytes, xlogreader->seg.ws_tli); + output_message.len += rbytes; + startptr += rbytes; + nbytes -= rbytes; + + /* now read the remaining WAL from WAL file */ + if (nbytes > 0 && + !WALRead(xlogreader, &output_message.data[output_message.len], startptr, nbytes, diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 301c5fa11fb..76787a82673 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -252,6 +252,9 @@ extern XLogRecPtr GetLastImportantRecPtr(void); extern void SetWalWriterSleeping(bool sleeping); +extern Size WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, + TimeLineID tli); + /* * Routines used by xlogrecovery.c to call back into xlog.c during recovery. */ |