| author | Robert Haas | 2014-03-03 21:32:18 +0000 |
|---|---|---|
| committer | Robert Haas | 2014-03-03 21:32:18 +0000 |
| commit | b89e151054a05f0f6d356ca52e3b725dd0505e53 | |
| tree | 9b9193e808625a381003650ff68b66cdb5f9f46e | /src/backend |
| parent | de94b47c0a92faeddab5ac980449d3fa877b4a4f | |
Introduce logical decoding.
This feature, building on previous commits, allows the write-ahead log
stream to be decoded into a series of logical changes; that is,
inserts, updates, and deletes, and the transactions which contain them.
It is capable of handling decoding even across changes to the schema
of the affected tables. The output format is controlled by a
so-called "output plugin"; an example is included. To make use of
this in a real replication system, the output plugin will need to be
modified to produce output in the format appropriate to that system,
and to perform filtering.
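To make the plugin workflow concrete, here is a minimal, hedged SQL sketch of setting up a slot that uses the included example plugin. The creation function pg_create_logical_replication_slot, the plugin name test_decoding, and the wal_level = logical / max_replication_slots prerequisites are taken from the released feature and are not visible in the src/backend-limited diff below, so treat them as illustrative assumptions rather than part of this hunk.

```sql
-- Assumes wal_level = logical and max_replication_slots > 0 in postgresql.conf.
-- Create a logical slot bound to the example output plugin shipped with this
-- feature (plugin name assumed to be 'test_decoding').
SELECT * FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding');

-- The pg_replication_slots columns added in this commit (plugin, catalog_xmin)
-- show up alongside the existing slot metadata.
SELECT slot_name, plugin, slot_type, database, active, catalog_xmin, restart_lsn
FROM pg_replication_slots;
```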
Currently, information can be extracted from the logical decoding
system only via SQL; future commits will add the ability to stream
changes via walsender.
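The SQL-level extraction interface corresponds to the functions added to system_views.sql further down (pg_logical_slot_get_changes and its peek/binary variants). A brief hedged usage sketch, reusing the illustrative slot name from above:

```sql
-- Peek at pending changes without consuming them, then consume them.
-- upto_lsn and upto_nchanges are NULL meaning "no limit"; the VARIADIC
-- options array is passed through to the output plugin.
SELECT location, xid, data
FROM pg_logical_slot_peek_changes('regression_slot', NULL, NULL);

SELECT location, xid, data
FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL);
```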
Andres Freund, with review and other contributions from many other
people, including Álvaro Herrera, Abhijit Menon-Sen, Peter Geoghegan,
Kevin Grittner, Robert Haas, Heikki Linnakangas, Fujii Masao, Michael
Paquier, Simon Riggs, Craig Ringer, and Steve Singer.
Diffstat (limited to 'src/backend')
35 files changed, 8865 insertions, 172 deletions
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index de4befa93f4..71ec74015cd 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -347,8 +347,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) /* * Prune and repair fragmentation for the whole page, if possible. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); - heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + heap_page_prune_opt(scan->rs_rd, buffer); /* * We must hold share lock on the buffer content while examining tuple @@ -1750,10 +1749,22 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, */ if (!skip) { + /* + * For the benefit of logical decoding, have t_self point at the + * element of the HOT chain we're currently investigating instead + * of the root tuple of the HOT chain. This is important because + * the *Satisfies routine for historical mvcc snapshots needs the + * correct tid to decide about the visibility in some cases. + */ + ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum); + /* If it's visible per the snapshot, we must return it */ valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); CheckForSerializableConflictOut(valid, relation, heapTuple, buffer, snapshot); + /* reset to original, non-redirected, tid */ + heapTuple->t_self = *tid; + if (valid) { ItemPointerSetOffsetNumber(tid, offnum); @@ -8207,6 +8218,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) * decoding. */ break; + case XLOG_HEAP2_REWRITE: + heap_xlog_logical_rewrite(lsn, record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 27cbac85256..3c69e1badac 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -18,13 +18,14 @@ #include "access/heapam_xlog.h" #include "access/transam.h" #include "access/htup_details.h" +#include "catalog/catalog.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "utils/snapmgr.h" #include "utils/rel.h" #include "utils/tqual.h" - /* Working data for heap_page_prune and subroutines */ typedef struct { @@ -70,10 +71,34 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). */ void -heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) +heap_page_prune_opt(Relation relation, Buffer buffer) { Page page = BufferGetPage(buffer); Size minfree; + TransactionId OldestXmin; + + /* + * We can't write WAL in recovery mode, so there's no point trying to + * clean the page. The master will likely issue a cleaning WAL record soon + * anyway, so this is no particular loss. + */ + if (RecoveryInProgress()) + return; + + /* + * Use the appropriate xmin horizon for this relation. If it's a proper + * catalog relation or a user defined, additional, catalog relation, we + * need to use the horizon that includes slots, otherwise the data-only + * horizon can be used. Note that the toast relation of user defined + * relations are *not* considered catalog relations. + */ + if (IsCatalogRelation(relation) || + RelationIsAccessibleInLogicalDecoding(relation)) + OldestXmin = RecentGlobalXmin; + else + OldestXmin = RecentGlobalDataXmin; + + Assert(TransactionIdIsValid(OldestXmin)); /* * Let's see if we really need pruning. 
@@ -85,14 +110,6 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) return; /* - * We can't write WAL in recovery mode, so there's no point trying to - * clean the page. The master will likely issue a cleaning WAL record soon - * anyway, so this is no particular loss. - */ - if (RecoveryInProgress()) - return; - - /* * We prune when a previous UPDATE failed to find enough space on the page * for a new tuple version, or when free space falls below the relation's * fill-factor target (but not less than 10%). diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index c34ab9865f8..239c7dad0c9 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -102,17 +102,34 @@ */ #include "postgres.h" +#include <sys/stat.h> +#include <unistd.h> + +#include "miscadmin.h" + #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/rewriteheap.h" #include "access/transam.h" #include "access/tuptoaster.h" +#include "access/xact.h" + +#include "catalog/catalog.h" + +#include "lib/ilist.h" + +#include "replication/logical.h" +#include "replication/slot.h" + #include "storage/bufmgr.h" +#include "storage/fd.h" #include "storage/smgr.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/tqual.h" +#include "storage/procarray.h" /* * State associated with a rewrite operation. This is opaque to the user @@ -120,21 +137,28 @@ */ typedef struct RewriteStateData { + Relation rs_old_rel; /* source heap */ Relation rs_new_rel; /* destination heap */ Page rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ bool rs_buffer_valid; /* T if any tuples in buffer */ bool rs_use_wal; /* must we WAL-log inserts? */ + bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to * determine tuple visibility */ TransactionId rs_freeze_xid;/* Xid that will be used as freeze cutoff * point */ + TransactionId rs_logical_xmin; /* Xid that will be used as cutoff + * point for logical rewrites */ MultiXactId rs_cutoff_multi;/* MultiXactId that will be used as cutoff * point for multixacts */ MemoryContext rs_cxt; /* for hash tables and entries and tuples in * them */ + XLogRecPtr rs_begin_lsn; /* XLogInsertLsn when starting the rewrite */ HTAB *rs_unresolved_tups; /* unmatched A tuples */ HTAB *rs_old_new_tid_map; /* unmatched B tuples */ + HTAB *rs_logical_mappings; /* logical remapping files */ + uint32 rs_num_rewrite_mappings; /* # in memory mappings */ } RewriteStateData; /* @@ -169,14 +193,45 @@ typedef struct typedef OldToNewMappingData *OldToNewMapping; +/* + * In-Memory data for a xid that might need logical remapping entries + * to be logged. + */ +typedef struct RewriteMappingFile +{ + TransactionId xid; /* xid that might need to see the row */ + int vfd; /* fd of mappings file */ + off_t off; /* how far have we written yet */ + uint32 num_mappings; /* number of in-memory mappings */ + dlist_head mappings; /* list of in-memory mappings */ + char path[MAXPGPATH]; /* path, for error messages */ +} RewriteMappingFile; + +/* + * A single In-Memeory logical rewrite mapping, hanging of + * RewriteMappingFile->mappings. 
+ */ +typedef struct RewriteMappingDataEntry +{ + LogicalRewriteMappingData map; /* map between old and new location of + * the tuple */ + dlist_node node; +} RewriteMappingDataEntry; + /* prototypes for internal functions */ static void raw_heap_insert(RewriteState state, HeapTuple tup); +/* internal logical remapping prototypes */ +static void logical_begin_heap_rewrite(RewriteState state); +static void logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple); +static void logical_end_heap_rewrite(RewriteState state); + /* * Begin a rewrite of a table * + * old_heap old, locked heap relation tuples will be read from * new_heap new, locked heap relation to insert tuples to * oldest_xmin xid used by the caller to determine which tuples are dead * freeze_xid xid before which tuples will be frozen @@ -187,7 +242,7 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup); * to be used in subsequent calls to the other functions. */ RewriteState -begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, +begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, TransactionId freeze_xid, MultiXactId cutoff_multi, bool use_wal) { @@ -210,6 +265,7 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, /* Create and fill in the state struct */ state = palloc0(sizeof(RewriteStateData)); + state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; state->rs_buffer = (Page) palloc(BLCKSZ); /* new_heap needn't be empty, just locked */ @@ -244,6 +300,8 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, MemoryContextSwitchTo(old_cxt); + logical_begin_heap_rewrite(state); + return state; } @@ -301,6 +359,8 @@ end_heap_rewrite(RewriteState state) if (RelationNeedsWAL(state->rs_new_rel)) heap_sync(state->rs_new_rel); + logical_end_heap_rewrite(state); + /* Deleting the context frees everything */ MemoryContextDelete(state->rs_cxt); } @@ -429,6 +489,8 @@ rewrite_heap_tuple(RewriteState state, raw_heap_insert(state, new_tuple); new_tid = new_tuple->t_self; + logical_rewrite_heap_tuple(state, old_tid, new_tuple); + /* * If the tuple is the updated version of a row, and the prior version * wouldn't be DEAD yet, then we need to either resolve the prior @@ -678,3 +740,545 @@ raw_heap_insert(RewriteState state, HeapTuple tup) if (heaptup != tup) heap_freetuple(heaptup); } + +/* ------------------------------------------------------------------------ + * Logical rewrite support + * + * When doing logical decoding - which relies on using cmin/cmax of catalog + * tuples, via xl_heap_new_cid records - heap rewrites have to log enough + * information to allow the decoding backend to updates its internal mapping + * of (relfilenode,ctid) => (cmin, cmax) to be correct for the rewritten heap. + * + * For that, every time we find a tuple that's been modified in a catalog + * relation within the xmin horizon of any decoding slot, we log a mapping + * from the old to the new location. + * + * To deal with rewrites that abort the filename of a mapping file contains + * the xid of the transaction performing the rewrite, which then can be + * checked before being read in. + * + * For efficiency we don't immediately spill every single map mapping for a + * row to disk but only do so in batches when we've collected several of them + * in memory or when end_heap_rewrite() has been called. 
+ * + * Crash-Safety: This module diverts from the usual patterns of doing WAL + * since it cannot rely on checkpoint flushing out all buffers and thus + * waiting for exlusive locks on buffers. Usually the XLogInsert() covering + * buffer modifications is performed while the buffer(s) that are being + * modified are exlusively locked guaranteeing that both the WAL record and + * the modified heap are on either side of the checkpoint. But since the + * mapping files we log aren't in shared_buffers that interlock doesn't work. + * + * Instead we simply write the mapping files out to disk, *before* the + * XLogInsert() is performed. That guarantees that either the XLogInsert() is + * inserted after the checkpoint's redo pointer or that the checkpoint (via + * LogicalRewriteHeapCheckpoint()) has flushed the (partial) mapping file to + * disk. That leaves the tail end that has not yet been flushed open to + * corruption, which is solved by including the current offset in the + * xl_heap_rewrite_mapping records and truncating the mapping file to it + * during replay. Every time a rewrite is finished all generated mapping files + * are synced to disk. + * + * Note that if we were only concerned about crash safety we wouldn't have to + * deal with WAL logging at all - an fsync() at the end of a rewrite would be + * sufficient for crash safety. Any mapping that hasn't been safely flushed to + * disk has to be by an aborted (explicitly or via a crash) transaction and is + * ignored by virtue of the xid in it's name being subject to a + * TransactionDidCommit() check. But we want to support having standbys via + * physical replication, both for availability and to to do logical decoding + * there. + * ------------------------------------------------------------------------ + */ + +/* + * Do preparations for logging logical mappings during a rewrite if + * necessary. If we detect that we don't need to log anything we'll prevent + * any further action by the various logical rewrite functions. + */ +static void +logical_begin_heap_rewrite(RewriteState state) +{ + HASHCTL hash_ctl; + TransactionId logical_xmin; + + /* + * We only need to persist these mappings if the rewritten table can be + * accessed during logical decoding, if not, we can skip doing any + * additional work. + */ + state->rs_logical_rewrite = + RelationIsAccessibleInLogicalDecoding(state->rs_old_rel); + + if (!state->rs_logical_rewrite) + return; + + Assert(ReplicationSlotCtl != NULL); + + ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin); + + /* + * If there are no logical slots in progress we don't need to do anything, + * there cannot be any remappings for relevant rows yet. The relation's + * lock protects us against races. + */ + if (logical_xmin == InvalidTransactionId) + { + state->rs_logical_rewrite = false; + return; + } + + state->rs_logical_xmin = logical_xmin; + state->rs_begin_lsn = GetXLogInsertRecPtr(); + state->rs_num_rewrite_mappings = 0; + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(RewriteMappingFile); + hash_ctl.hcxt = state->rs_cxt; + hash_ctl.hash = tag_hash; + + state->rs_logical_mappings = + hash_create("Logical rewrite mapping", + 128, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); +} + +/* + * Flush all logical in-memory mappings to disk, but don't fsync them yet. 
+ */ +static void +logical_heap_rewrite_flush_mappings(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + dlist_mutable_iter iter; + + Assert(state->rs_logical_rewrite); + + /* no logical rewrite in progress, no need to iterate over mappings */ + if (state->rs_num_rewrite_mappings == 0) + return; + + elog(DEBUG1, "flushing %u logical rewrite mapping entries", + state->rs_num_rewrite_mappings); + + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + XLogRecData rdata[2]; + char *waldata; + char *waldata_start; + xl_heap_rewrite_mapping xlrec; + Oid dboid; + uint32 len; + int written; + + /* this file hasn't got any new mappings */ + if (src->num_mappings == 0) + continue; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + xlrec.num_mappings = src->num_mappings; + xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel); + xlrec.mapped_xid = src->xid; + xlrec.mapped_db = dboid; + xlrec.offset = src->off; + xlrec.start_lsn = state->rs_begin_lsn; + + rdata[0].data = (char *) (&xlrec); + rdata[0].len = sizeof(xlrec); + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + + /* write all mappings consecutively */ + len = src->num_mappings * sizeof(LogicalRewriteMappingData); + waldata = palloc(len); + waldata_start = waldata; + + /* + * collect data we need to write out, but don't modify ondisk data yet + */ + dlist_foreach_modify(iter, &src->mappings) + { + RewriteMappingDataEntry *pmap; + + pmap = dlist_container(RewriteMappingDataEntry, node, iter.cur); + + memcpy(waldata, &pmap->map, sizeof(pmap->map)); + waldata += sizeof(pmap->map); + + /* remove from the list and free */ + dlist_delete(&pmap->node); + pfree(pmap); + + /* update bookkeeping */ + state->rs_num_rewrite_mappings--; + src->num_mappings--; + } + + /* + * Note that we deviate from the usual WAL coding practices here, + * check the above "Logical rewrite support" comment for reasoning. + */ + written = FileWrite(src->vfd, waldata_start, len); + if (written != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, + written, len))); + src->off += len; + + Assert(src->num_mappings == 0); + + rdata[1].data = waldata_start; + rdata[1].len = len; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + /* write xlog record */ + XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE, rdata); + + } + Assert(state->rs_num_rewrite_mappings == 0); +} + +/* + * Logical remapping part of end_heap_rewrite(). + */ +static void +logical_end_heap_rewrite(RewriteState state) +{ + HASH_SEQ_STATUS seq_status; + RewriteMappingFile *src; + + /* done, no logical rewrite in progress */ + if (!state->rs_logical_rewrite) + return; + + /* writeout remaining in-memory entries */ + if (state->rs_num_rewrite_mappings > 0 ) + logical_heap_rewrite_flush_mappings(state); + + /* Iterate over all mappings we have written and fsync the files. */ + hash_seq_init(&seq_status, state->rs_logical_mappings); + while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) + { + if(FileSync(src->vfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", src->path))); + FileClose(src->vfd); + } + /* memory context cleanup will deal with the rest */ +} + +/* + * Log a single (old->new) mapping for 'xid'. 
+ */ +static void +logical_rewrite_log_mapping(RewriteState state, TransactionId xid, + LogicalRewriteMappingData *map) +{ + RewriteMappingFile *src; + RewriteMappingDataEntry *pmap; + Oid relid; + bool found; + + relid = RelationGetRelid(state->rs_old_rel); + + /* look for existing mappings for this 'mapped' xid */ + src = hash_search(state->rs_logical_mappings, &xid, + HASH_ENTER, &found); + + /* + * We haven't yet had the need to map anything for this xid, create + * per-xid data structures. + */ + if (!found) + { + char path[MAXPGPATH]; + Oid dboid; + + if (state->rs_old_rel->rd_rel->relisshared) + dboid = InvalidOid; + else + dboid = MyDatabaseId; + + snprintf(path, MAXPGPATH, + "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT, + dboid, relid, + (uint32) (state->rs_begin_lsn >> 32), + (uint32) state->rs_begin_lsn, + xid, GetCurrentTransactionId()); + + dlist_init(&src->mappings); + src->num_mappings = 0; + src->off = 0; + memcpy(src->path, path, sizeof(path)); + src->vfd = PathNameOpenFile(path, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + if (src->vfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + } + + pmap = MemoryContextAlloc(state->rs_cxt, + sizeof(RewriteMappingDataEntry)); + memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData)); + dlist_push_tail(&src->mappings, &pmap->node); + src->num_mappings++; + state->rs_num_rewrite_mappings++; + + /* + * Write out buffer every time we've too many in-memory entries across all + * mapping files. + */ + if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */) + logical_heap_rewrite_flush_mappings(state); +} + +/* + * Perform logical remapping for a tuple that's mapped from old_tid to + * new_tuple->t_self by rewrite_heap_tuple() iff necessary for the tuple. + */ +static void +logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, + HeapTuple new_tuple) +{ + ItemPointerData new_tid = new_tuple->t_self; + TransactionId cutoff = state->rs_logical_xmin; + TransactionId xmin; + TransactionId xmax; + bool do_log_xmin = false; + bool do_log_xmax = false; + LogicalRewriteMappingData map; + + /* no logical rewrite in progress, we don't need to log anything */ + if (!state->rs_logical_rewrite) + return; + + xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + /* use *GetUpdateXid to correctly deal with multixacts */ + xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data); + + /* + * Log the mapping iff the tuple has been created recently. + */ + if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff)) + do_log_xmin = true; + + if (!TransactionIdIsNormal(xmax)) + { + /* + * no xmax is set, can't have any permanent ones, so this check is + * sufficient + */ + } + else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask)) + { + /* only locked, we don't care */ + } + else if (!TransactionIdPrecedes(xmax, cutoff)) + { + /* tuple has been deleted recently, log */ + do_log_xmax = true; + } + + /* if neither needs to be logged, we're done */ + if (!do_log_xmin && !do_log_xmax) + return; + + /* fill out mapping information */ + map.old_node = state->rs_old_rel->rd_node; + map.old_tid = old_tid; + map.new_node = state->rs_new_rel->rd_node; + map.new_tid = new_tid; + + /* --- + * Now persist the mapping for the individual xids that are affected. We + * need to log for both xmin and xmax if they aren't the same transaction + * since the mapping files are per "affected" xid. 
+ * We don't muster all that much effort detecting whether xmin and xmax + * are actually the same transaction, we just check whether the xid is the + * same disregarding subtransactions. Logging too much is relatively + * harmless and we could never do the check fully since subtransaction + * data is thrown away during restarts. + * --- + */ + if (do_log_xmin) + logical_rewrite_log_mapping(state, xmin, &map); + /* separately log mapping for xmax unless it'd be redundant */ + if (do_log_xmax && !TransactionIdEquals(xmin, xmax)) + logical_rewrite_log_mapping(state, xmax, &map); +} + +/* + * Replay XLOG_HEAP2_REWRITE records + */ +void +heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r) +{ + char path[MAXPGPATH]; + int fd; + xl_heap_rewrite_mapping *xlrec; + uint32 len; + char *data; + + xlrec = (xl_heap_rewrite_mapping *) XLogRecGetData(r); + + snprintf(path, MAXPGPATH, + "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT, + xlrec->mapped_db, xlrec->mapped_rel, + (uint32) (xlrec->start_lsn >> 32), + (uint32) xlrec->start_lsn, + xlrec->mapped_xid, r->xl_xid); + + fd = OpenTransientFile(path, + O_CREAT | O_WRONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + /* + * Truncate all data that's not guaranteed to have been safely fsynced (by + * previous record or by the last checkpoint). + */ + if (ftruncate(fd, xlrec->offset) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\" to %u: %m", + path, (uint32) xlrec->offset))); + + /* now seek to the position we want to write our data to */ + if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to the end of file \"%s\": %m", + path))); + + data = XLogRecGetData(r) + sizeof(*xlrec); + + len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData); + + /* write out tail end of mapping file (again) */ + if (write(fd, data, len) != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + /* + * Now fsync all previously written data. We could improve things and only + * do this for the last write to a file, but the required bookkeeping + * doesn't seem worth the trouble. + */ + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + + CloseTransientFile(fd); +} + +/* --- + * Perform a checkpoint for logical rewrite mappings + * + * This serves two tasks: + * 1) Remove all mappings not needed anymore based on the logical restart LSN + * 2) Flush all remaining mappings to disk, so that replay after a checkpoint + * only has to deal with the parts of a mapping that have been written out + * after the checkpoint started. + * --- + */ +void +CheckPointLogicalRewriteHeap(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *mappings_dir; + struct dirent *mapping_de; + char path[MAXPGPATH]; + + /* + * We start of with a minimum of the last redo pointer. No new decoding + * slot will start before that, so that's a safe upper bound for removal. 
+ */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (cutoff != InvalidXLogRecPtr && redo < cutoff) + cutoff = redo; + + mappings_dir = AllocateDir("pg_llog/mappings"); + while ((mapping_de = ReadDir(mappings_dir, "pg_llog/mappings")) != NULL) + { + struct stat statbuf; + Oid dboid; + Oid relid; + XLogRecPtr lsn; + TransactionId rewrite_xid; + TransactionId create_xid; + uint32 hi, lo; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_llog/mappings/%s", mapping_de->d_name); + if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + continue; + + /* Skip over files that cannot be ours. */ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6) + elog(ERROR,"could not parse filename \"%s\"", mapping_de->d_name); + + lsn = ((uint64) hi) << 32 | lo; + + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing logical rewrite file \"%s\"", path); + if (unlink(path) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", path))); + } + else + { + int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + + /* + * The file cannot vanish due to concurrency since this function + * is the only one removing logical mappings and it's run while + * CheckpointLock is held exclusively. + */ + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* + * We could try to avoid fsyncing files that either haven't + * changed or have only been created since the checkpoint's start, + * but it's currently not deemed worth the effort. + */ + else if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + CloseTransientFile(fd); + } + } + FreeDir(mappings_dir); +} diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index 97c9f238a7b..9a821d3e1cf 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -44,32 +44,6 @@ #undef TOAST_DEBUG -/* - * Testing whether an externally-stored value is compressed now requires - * comparing extsize (the actual length of the external data) to rawsize - * (the original uncompressed datum's size). The latter includes VARHDRSZ - * overhead, the former doesn't. We never use compression unless it actually - * saves space, so we expect either equality or less-than. - */ -#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \ - ((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ) - -/* - * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum - * into a local "struct varatt_external" toast pointer. This should be - * just a memcpy, but some versions of gcc seem to produce broken code - * that assumes the datum contents are aligned. Introducing an explicit - * intermediate "varattrib_1b_e *" variable seems to fix it. 
- */ -#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \ -do { \ - varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \ - Assert(VARATT_IS_EXTERNAL(attre)); \ - Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \ - memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \ -} while (0) - - static void toast_delete_datum(Relation rel, Datum value); static Datum toast_save_datum(Relation rel, Datum value, struct varlena * oldexternal, int options); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 1aba2f04cc4..a4b5f3d698e 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -67,7 +67,10 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/xlog.h" + #include "catalog/index.h" +#include "catalog/catalog.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -520,8 +523,7 @@ index_fetch_heap(IndexScanDesc scan) * Prune page, but only if we weren't already on this page */ if (prev_buf != scan->xs_cbuf) - heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, - RecentGlobalXmin); + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf); } /* Obtain share-lock on the buffer so we can examine visibility */ diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 89ba09a206f..c8a61669dd2 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -149,6 +149,10 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.relNode, xlrec->block, xlrec->cutoff_xid, xlrec->ntuples); } + else if (info == XLOG_HEAP2_REWRITE) + { + appendStringInfoString(buf, "heap rewrite:"); + } else if (info == XLOG_HEAP2_CLEANUP_INFO) { xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 0487be17df7..b20d9732e78 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1074,8 +1074,16 @@ RecordTransactionCommit(void) /* * Do we need the long commit record? If not, use the compact format. + * + * For now always use the non-compact version if wal_level=logical, so + * we can hide commits from other databases. TODO: In the future we + * should merge compact and non-compact commits and use a flags + * variable to determine if it contains subxacts, relations or + * invalidation messages, that's more extensible and degrades more + * gracefully. Till then, it's just 20 bytes of overhead. 
*/ - if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit) + if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit || + XLogLogicalInfoActive()) { XLogRecData rdata[4]; int lastrdata = 0; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ad46eb0cebf..53a20b1e606 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -23,6 +23,7 @@ #include "access/clog.h" #include "access/multixact.h" +#include "access/rewriteheap.h" #include "access/subtrans.h" #include "access/timeline.h" #include "access/transam.h" @@ -39,7 +40,9 @@ #include "pgstat.h" #include "postmaster/bgwriter.h" #include "postmaster/startup.h" +#include "replication/logical.h" #include "replication/slot.h" +#include "replication/snapbuild.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/barrier.h" @@ -4016,6 +4019,27 @@ CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) } /* + * Return the last WAL segment removed, or 0 if no segment has been removed + * since startup. + * + * NB: the result can be out of date arbitrarily fast, the caller has to deal + * with that. + */ +XLogSegNo +XLogGetLastRemovedSegno(void) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&xlogctl->info_lck); + lastRemovedSegNo = xlogctl->lastRemovedSegNo; + SpinLockRelease(&xlogctl->info_lck); + + return lastRemovedSegNo; +} + +/* * Update the last removed segno pointer in shared memory, to reflect * that the given XLOG file has been removed. */ @@ -6559,6 +6583,12 @@ StartupXLOG(void) StartupReplicationSlots(checkPoint.redo); /* + * Startup logical state, needs to be setup now so we have proper data + * during crash recovery. + */ + StartupReorderBuffer(); + + /* * Startup MultiXact. We need to do this early for two reasons: one * is that we might try to access multixacts when we do tuple freezing, * and the other is we need its state initialized because we attempt @@ -8589,7 +8619,7 @@ CreateCheckPoint(int flags) * StartupSUBTRANS hasn't been called yet. */ if (!RecoveryInProgress()) - TruncateSUBTRANS(GetOldestXmin(true, false)); + TruncateSUBTRANS(GetOldestXmin(NULL, false)); /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -8674,6 +8704,8 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointPredicate(); CheckPointRelationMap(); CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); CheckPointBuffers(flags); /* performs all required fsyncs */ /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); @@ -8965,7 +8997,7 @@ CreateRestartPoint(int flags) * this because StartupSUBTRANS hasn't been called yet. */ if (EnableHotStandby) - TruncateSUBTRANS(GetOldestXmin(true, false)); + TruncateSUBTRANS(GetOldestXmin(NULL, false)); /* Real work is done, but log and update before releasing lock. 
*/ LogCheckpointEnd(true); diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index cebca95ac8d..877d7678f7a 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2156,7 +2156,7 @@ IndexBuildHeapScan(Relation heapRelation, { snapshot = SnapshotAny; /* okay to ignore lazy VACUUMs here */ - OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared, true); + OldestXmin = GetOldestXmin(heapRelation, true); } scan = heap_beginscan_strat(heapRelation, /* relation */ diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 04dfbb0ee54..0500a73e1ba 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -619,11 +619,13 @@ CREATE VIEW pg_stat_replication AS CREATE VIEW pg_replication_slots AS SELECT L.slot_name, + L.plugin, L.slot_type, L.datoid, D.datname AS database, L.active, L.xmin, + L.catalog_xmin, L.restart_lsn FROM pg_get_replication_slots() AS L LEFT JOIN pg_database D ON (L.datoid = D.oid); @@ -822,3 +824,35 @@ CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION json_populate_recordset(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) RETURNS SETOF anyelement LANGUAGE internal STABLE ROWS 100 AS 'json_populate_recordset'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_get_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data text) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_get_changes'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_peek_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data text) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_peek_changes'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_get_binary_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data bytea) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_get_binary_changes'; + +CREATE OR REPLACE FUNCTION pg_logical_slot_peek_binary_changes( + IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', + OUT location pg_lsn, OUT xid xid, OUT data bytea) +RETURNS SETOF RECORD +LANGUAGE INTERNAL +VOLATILE ROWS 1000 COST 1000 +AS 'pg_logical_slot_peek_binary_changes'; diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index e7fcb558684..a04adeaac75 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -22,6 +22,7 @@ #include "access/tuptoaster.h" #include "access/visibilitymap.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/pg_collation.h" @@ -1081,7 +1082,7 @@ acquire_sample_rows(Relation onerel, int elevel, totalblocks = RelationGetNumberOfBlocks(onerel); /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ - OldestXmin = GetOldestXmin(onerel->rd_rel->relisshared, true); + OldestXmin = GetOldestXmin(onerel, true); /* Prepare for sampling block numbers */ BlockSampler_Init(&bs, totalblocks, targrows); diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 8b18e4acb72..b6b40e724e7 100644 --- a/src/backend/commands/cluster.c +++ 
b/src/backend/commands/cluster.c @@ -850,7 +850,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, * Since we're going to rewrite the whole table anyway, there's no reason * not to be aggressive about this. */ - vacuum_set_xid_limits(0, 0, 0, 0, OldHeap->rd_rel->relisshared, + vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0, &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff, NULL); @@ -869,7 +869,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, is_system_catalog = IsSystemRelation(OldHeap); /* Initialize the rewrite operation */ - rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, + rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid, MultiXactCutoff, use_wal); /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 5d540aa3a01..4996a2e7cd2 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -45,6 +45,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" +#include "replication/slot.h" #include "storage/copydir.h" #include "storage/fd.h" #include "storage/lmgr.h" @@ -750,6 +751,7 @@ dropdb(const char *dbname, bool missing_ok) HeapTuple tup; int notherbackends; int npreparedxacts; + int nslots, nslots_active; /* * Look up the target database's OID, and get exclusive lock on it. We @@ -807,6 +809,19 @@ dropdb(const char *dbname, bool missing_ok) errmsg("cannot drop the currently open database"))); /* + * Check whether there are, possibly unconnected, logical slots that refer + * to the to-be-dropped database. The database lock we are holding + * prevents the creation of new slots using the database. + */ + if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is used by a logical decoding slot", + dbname), + errdetail("There are %d slot(s), %d of them active", + nslots, nslots_active))); + + /* * Check for other backends in the target database. (Because we hold the * database lock, no new ones can start after this.) * diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5ae7763534b..ded1841dc65 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -398,11 +398,11 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) * not interested. */ void -vacuum_set_xid_limits(int freeze_min_age, +vacuum_set_xid_limits(Relation rel, + int freeze_min_age, int freeze_table_age, int multixact_freeze_min_age, int multixact_freeze_table_age, - bool sharedRel, TransactionId *oldestXmin, TransactionId *freezeLimit, TransactionId *xidFullScanLimit, @@ -425,7 +425,7 @@ vacuum_set_xid_limits(int freeze_min_age, * working on a particular table at any time, and that each vacuum is * always an independent transaction. */ - *oldestXmin = GetOldestXmin(sharedRel, true); + *oldestXmin = GetOldestXmin(rel, true); Assert(TransactionIdIsNormal(*oldestXmin)); @@ -795,7 +795,7 @@ vac_update_datfrozenxid(void) * committed pg_class entries for new tables; see AddNewRelationTuple(). * So we cannot produce a wrong minimum by starting with this. 
*/ - newFrozenXid = GetOldestXmin(true, true); + newFrozenXid = GetOldestXmin(NULL, true); /* * Similarly, initialize the MultiXact "min" with the value that would be diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index d77892ee7f8..d5db917d97f 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -44,6 +44,7 @@ #include "access/multixact.h" #include "access/transam.h" #include "access/visibilitymap.h" +#include "catalog/catalog.h" #include "catalog/storage.h" #include "commands/dbcommands.h" #include "commands/vacuum.h" @@ -204,10 +205,10 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vac_strategy = bstrategy; - vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, + vacuum_set_xid_limits(onerel, + vacstmt->freeze_min_age, vacstmt->freeze_table_age, vacstmt->multixact_freeze_min_age, vacstmt->multixact_freeze_table_age, - onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit, &xidFullScanLimit, &MultiXactCutoff, &mxactFullScanLimit); diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 1a8d4e51430..7d8a3f2c248 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -336,8 +336,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) /* * Prune and repair fragmentation for the whole page, if possible. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); - heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + heap_page_prune_opt(scan->rs_rd, buffer); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index 7941cb8d5e7..6f17b08a6a5 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -17,6 +17,8 @@ override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) OBJS = walsender.o walreceiverfuncs.o walreceiver.o basebackup.o \ repl_gram.o slot.o slotfuncs.o syncrep.o +SUBDIRS = logical + include $(top_srcdir)/src/backend/common.mk # repl_scanner is compiled as part of repl_gram diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile new file mode 100644 index 00000000000..310a45c5c05 --- /dev/null +++ b/src/backend/replication/logical/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/logical +# +# IDENTIFICATION +# src/backend/replication/logical/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/logical +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) + +OBJS = decode.o logical.o logicalfuncs.o reorderbuffer.o snapbuild.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c new file mode 100644 index 00000000000..e8949aab324 --- /dev/null +++ b/src/backend/replication/logical/decode.c @@ -0,0 +1,826 @@ +/* ------------------------------------------------------------------------- + * + * decode.c + * This module decodes WAL records read using xlogreader.h's APIs for the + * purpose of logical decoding by passing information to the + * reorderbuffer module (containing the actual changes) and to the + * snapbuild module to build a fitting catalog snapshot (to be able to + * properly decode the changes in the reorderbuffer). + * + * NOTE: + * This basically tries to handle all low level xlog stuff for + * reorderbuffer.c and snapbuild.c. There's some minor leakage where a + * specific record's struct is used to pass data along, but those just + * happen to contain the right amount of data in a convenient + * format. There isn't and shouldn't be much intelligence about the + * contents of records in here except turning them into a more usable + * format. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/replication/logical/decode.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" + +#include "catalog/pg_control.h" + +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" + +#include "storage/standby.h" + +typedef struct XLogRecordBuffer +{ + XLogRecPtr origptr; + XLogRecPtr endptr; + XLogRecord record; + char *record_data; +} XLogRecordBuffer; + +/* RMGR Handlers */ +static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* individual record(group)'s handlers */ +static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + TransactionId xid, Oid dboid, + TimestampTz commit_time, + int nsubxacts, TransactionId *sub_xids, + int ninval_msgs, SharedInvalidationMessage *msg); +static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, + TransactionId xid, TransactionId *sub_xids, int nsubxacts); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup); + +/* + * Take every XLogReadRecord()ed record and perform the actions required to + * decode it using the 
output plugin already setup in the logical decoding + * context. + */ +void +LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record) +{ + XLogRecordBuffer buf; + + buf.origptr = ctx->reader->ReadRecPtr; + buf.endptr = ctx->reader->EndRecPtr; + buf.record = *record; + buf.record_data = XLogRecGetData(record); + + /* cast so we get a warning when new rmgrs are added */ + switch ((RmgrIds) buf.record.xl_rmid) + { + /* + * Rmgrs we care about for logical decoding. Add new rmgrs in + * rmgrlist.h's order. + */ + case RM_XLOG_ID: + DecodeXLogOp(ctx, &buf); + break; + + case RM_XACT_ID: + DecodeXactOp(ctx, &buf); + break; + + case RM_STANDBY_ID: + DecodeStandbyOp(ctx, &buf); + break; + + case RM_HEAP2_ID: + DecodeHeap2Op(ctx, &buf); + break; + + case RM_HEAP_ID: + DecodeHeapOp(ctx, &buf); + break; + + /* + * Rmgrs irrelevant for logical decoding; they describe stuff not + * represented in logical decoding. Add new rmgrs in rmgrlist.h's + * order. + */ + case RM_SMGR_ID: + case RM_CLOG_ID: + case RM_DBASE_ID: + case RM_TBLSPC_ID: + case RM_MULTIXACT_ID: + case RM_RELMAP_ID: + case RM_BTREE_ID: + case RM_HASH_ID: + case RM_GIN_ID: + case RM_GIST_ID: + case RM_SEQ_ID: + case RM_SPGIST_ID: + break; + case RM_NEXT_ID: + elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid); + } +} + +/* + * Handle rmgr XLOG_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + uint8 info = buf->record.xl_info & ~XLR_INFO_MASK; + + switch (info) + { + /* this is also used in END_OF_RECOVERY checkpoints */ + case XLOG_CHECKPOINT_SHUTDOWN: + case XLOG_END_OF_RECOVERY: + SnapBuildSerializationPoint(builder, buf->origptr); + + break; + case XLOG_CHECKPOINT_ONLINE: + /* + * a RUNNING_XACTS record will have been logged near to this, we + * can restart from there. + */ + break; + case XLOG_NOOP: + case XLOG_NEXTOID: + case XLOG_SWITCH: + case XLOG_BACKUP_END: + case XLOG_PARAMETER_CHANGE: + case XLOG_RESTORE_POINT: + case XLOG_FPW_CHANGE: + case XLOG_FPI: + break; + default: + elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); + } +} + +/* + * Handle rmgr XACT_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + ReorderBuffer *reorder = ctx->reorder; + XLogRecord *r = &buf->record; + uint8 info = r->xl_info & ~XLR_INFO_MASK; + + /* no point in doing anything yet, data could not be decoded anyway */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_XACT_COMMIT: + { + xl_xact_commit *xlrec; + TransactionId *subxacts = NULL; + SharedInvalidationMessage *invals = NULL; + + xlrec = (xl_xact_commit *) buf->record_data; + + subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]); + + DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId, + xlrec->xact_time, + xlrec->nsubxacts, subxacts, + xlrec->nmsgs, invals); + + break; + } + case XLOG_XACT_COMMIT_PREPARED: + { + xl_xact_commit_prepared *prec; + xl_xact_commit *xlrec; + TransactionId *subxacts; + SharedInvalidationMessage *invals = NULL; + + /* Prepared commits contain a normal commit record... 
*/ + prec = (xl_xact_commit_prepared *) buf->record_data; + xlrec = &prec->crec; + + subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]); + + DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId, + xlrec->xact_time, + xlrec->nsubxacts, subxacts, + xlrec->nmsgs, invals); + + break; + } + case XLOG_XACT_COMMIT_COMPACT: + { + xl_xact_commit_compact *xlrec; + + xlrec = (xl_xact_commit_compact *) buf->record_data; + + DecodeCommit(ctx, buf, r->xl_xid, InvalidOid, + xlrec->xact_time, + xlrec->nsubxacts, xlrec->subxacts, + 0, NULL); + break; + } + case XLOG_XACT_ABORT: + { + xl_xact_abort *xlrec; + TransactionId *sub_xids; + + xlrec = (xl_xact_abort *) buf->record_data; + + sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + + DecodeAbort(ctx, buf->origptr, r->xl_xid, + sub_xids, xlrec->nsubxacts); + break; + } + case XLOG_XACT_ABORT_PREPARED: + { + xl_xact_abort_prepared *prec; + xl_xact_abort *xlrec; + TransactionId *sub_xids; + + /* prepared abort contain a normal commit abort... */ + prec = (xl_xact_abort_prepared *) buf->record_data; + xlrec = &prec->arec; + + sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + + /* r->xl_xid is committed in a separate record */ + DecodeAbort(ctx, buf->origptr, prec->xid, + sub_xids, xlrec->nsubxacts); + break; + } + + case XLOG_XACT_ASSIGNMENT: + { + xl_xact_assignment *xlrec; + int i; + TransactionId *sub_xid; + + xlrec = (xl_xact_assignment *) buf->record_data; + + sub_xid = &xlrec->xsub[0]; + + for (i = 0; i < xlrec->nsubxacts; i++) + { + ReorderBufferAssignChild(reorder, xlrec->xtop, + *(sub_xid++), buf->origptr); + } + break; + } + case XLOG_XACT_PREPARE: + /* + * Currently decoding ignores PREPARE TRANSACTION and will just + * decode the transaction when the COMMIT PREPARED is sent or + * throw away the transaction's contents when a ROLLBACK PREPARED + * is received. In the future we could add code to expose prepared + * transactions in the changestream allowing for a kind of + * distributed 2PC. + */ + break; + default: + elog(ERROR, "unexpected RM_XACT_ID record type: %u", info); + } +} + +/* + * Handle rmgr STANDBY_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogRecord *r = &buf->record; + uint8 info = r->xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_RUNNING_XACTS: + { + xl_running_xacts *running = (xl_running_xacts *) buf->record_data; + SnapBuildProcessRunningXacts(builder, buf->origptr, running); + /* + * Abort all transactions that we keep track of, that are + * older than the record's oldestRunningXid. This is the most + * convenient spot for doing so since, in contrast to shutdown + * or end-of-recovery checkpoints, we have information about + * all running transactions which includes prepared ones, + * while shutdown checkpoints just know that no non-prepared + * transactions are in progress. + */ + ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid); + } + break; + case XLOG_STANDBY_LOCK: + break; + default: + elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP2_ID records for DecodeRecordIntoReorderBuffer(). 
+ */ +static void +DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK; + TransactionId xid = buf->record.xl_xid; + SnapBuild *builder = ctx->snapshot_builder; + + /* no point in doing anything yet */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_HEAP2_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeMultiInsert(ctx, buf); + break; + case XLOG_HEAP2_NEW_CID: + { + xl_heap_new_cid *xlrec; + xlrec = (xl_heap_new_cid *) buf->record_data; + SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec); + + break; + } + case XLOG_HEAP2_REWRITE: + /* + * Although these records only exist to serve the needs of logical + * decoding, all the work happens as part of crash or archive + * recovery, so we don't need to do anything here. + */ + break; + /* + * Everything else here is just low level physical stuff we're + * not interested in. + */ + case XLOG_HEAP2_FREEZE_PAGE: + case XLOG_HEAP2_CLEAN: + case XLOG_HEAP2_CLEANUP_INFO: + case XLOG_HEAP2_VISIBLE: + case XLOG_HEAP2_LOCK_UPDATED: + break; + default: + elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK; + TransactionId xid = buf->record.xl_xid; + SnapBuild *builder = ctx->snapshot_builder; + + /* no point in doing anything yet */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeInsert(ctx, buf); + break; + + /* + * Treat HOT update as normal updates. There is no useful + * information in the fact that we could make it a HOT update + * locally and the WAL layout is compatible. + */ + case XLOG_HEAP_HOT_UPDATE: + case XLOG_HEAP_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeUpdate(ctx, buf); + break; + + case XLOG_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeDelete(ctx, buf); + break; + + case XLOG_HEAP_NEWPAGE: + /* + * This is only used in places like indexams and CLUSTER which + * don't contain changes relevant for logical replication. + */ + break; + + case XLOG_HEAP_INPLACE: + /* + * Inplace updates are only ever performed on catalog tuples and + * can, per definition, not change tuple visibility. Since we + * don't decode catalog tuples, we're not interested in the + * record's contents. + * + * In-place updates can be used either by XID-bearing transactions + * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less + * transactions (e.g. VACUUM). In the former case, the commit + * record will include cache invalidations, so we mark the + * transaction as catalog modifying here. Currently that's + * redundant because the commit will do that as well, but once we + * support decoding in-progress relations, this will be important. 
+ */
+ if (!TransactionIdIsValid(xid))
+ break;
+
+ SnapBuildProcessChange(builder, xid, buf->origptr);
+ ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
+ break;
+
+ case XLOG_HEAP_LOCK:
+ /* we don't care about row level locks for now */
+ break;
+
+ default:
+ elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info);
+ break;
+ }
+}
+
+/*
+ * Consolidated commit record handling between the different forms of commit
+ * records.
+ */
+static void
+DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
+ TransactionId xid, Oid dboid,
+ TimestampTz commit_time,
+ int nsubxacts, TransactionId *sub_xids,
+ int ninval_msgs, SharedInvalidationMessage *msgs)
+{
+ int i;
+
+ /*
+ * Process invalidation messages, even if we're not interested in the
+ * transaction's contents, since the various caches need to always be
+ * consistent.
+ */
+ if (ninval_msgs > 0)
+ {
+ ReorderBufferAddInvalidations(ctx->reorder, xid, buf->origptr,
+ ninval_msgs, msgs);
+ ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
+ }
+
+ SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid,
+ nsubxacts, sub_xids);
+
+ /* ----
+ * Check whether we are interested in this specific transaction, and tell
+ * the reorderbuffer to forget the content of the (sub-)transactions
+ * if not.
+ *
+ * There are basically two reasons we might not be interested in this
+ * transaction:
+ * 1) We might not be interested in decoding transactions up to this
+ * LSN. This can happen because we previously decoded it and now just
+ * are restarting or if we haven't assembled a consistent snapshot yet.
+ * 2) The transaction happened in another database.
+ *
+ * We can't just use ReorderBufferAbort() here, because we need to execute
+ * the transaction's invalidations. This currently won't be needed if
+ * we're just skipping over the transaction because currently we only do
+ * so during startup, to get to the first transaction the client needs. As
+ * we have reset the catalog caches before starting to read WAL, and we
+ * haven't yet touched any catalogs, there can't be anything to invalidate.
+ * But if we're "forgetting" this commit because it happened in
+ * another database, the invalidations might be important, because they
+ * could be for shared catalogs and we might have loaded data into the
+ * relevant syscaches.
+ * ---
+ */
+ if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) ||
+ (dboid != InvalidOid && dboid != ctx->slot->data.database))
+ {
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferForget(ctx->reorder, *sub_xids, buf->origptr);
+ sub_xids++;
+ }
+ ReorderBufferForget(ctx->reorder, xid, buf->origptr);
+
+ return;
+ }
+
+ /* tell the reorderbuffer about the surviving subtransactions */
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferCommitChild(ctx->reorder, xid, *sub_xids,
+ buf->origptr, buf->endptr);
+ sub_xids++;
+ }
+
+ /* replay actions of all transaction + subtransactions in order */
+ ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr,
+ commit_time);
+}
+
+/*
+ * Get the data from the various forms of abort records and pass it on to
+ * snapbuild.c and reorderbuffer.c
+ */
+static void
+DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ TransactionId *sub_xids, int nsubxacts)
+{
+ int i;
+
+ SnapBuildAbortTxn(ctx->snapshot_builder, lsn, xid, nsubxacts, sub_xids);
+
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferAbort(ctx->reorder, *sub_xids, lsn);
+ sub_xids++;
+ }
+
+ ReorderBufferAbort(ctx->reorder, xid, lsn);
+}
+
+/*
+ * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
+ *
+ * Inserts can contain the new tuple.
+ */
+static void
+DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_insert *xlrec;
+ ReorderBufferChange *change;
+
+ xlrec = (xl_heap_insert *) buf->record_data;
+
+ /* only interested in our database */
+ if (xlrec->target.node.dbNode != ctx->slot->data.database)
+ return;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_INSERT;
+ memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+ {
+ Assert(r->xl_len > (SizeOfHeapInsert + SizeOfHeapHeader));
+
+ change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ DecodeXLogTuple((char *) xlrec + SizeOfHeapInsert,
+ r->xl_len - SizeOfHeapInsert,
+ change->tp.newtuple);
+ }
+
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
+ * in the record, from wal into proper tuplebufs.
+ *
+ * Updates can possibly contain a new tuple and the old primary key.
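+ *
+ * The layout parsed below is, in order: the xl_heap_update struct itself,
+ * then an xl_heap_header_len plus tuple data for the new tuple (when
+ * XLOG_HEAP_CONTAINS_NEW_TUPLE is set), then another xl_heap_header_len
+ * plus tuple data for the old primary key (when XLOG_HEAP_CONTAINS_OLD is
+ * set).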
+ */ +static void +DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogRecord *r = &buf->record; + xl_heap_update *xlrec; + xl_heap_header_len *xlhdr; + ReorderBufferChange *change; + char *data; + + xlrec = (xl_heap_update *) buf->record_data; + xlhdr = (xl_heap_header_len *) (buf->record_data + SizeOfHeapUpdate); + + /* only interested in our database */ + if (xlrec->target.node.dbNode != ctx->slot->data.database) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); + + data = (char *) &xlhdr->header; + + if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + { + Assert(r->xl_len > (SizeOfHeapUpdate + SizeOfHeapHeaderLen)); + + change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); + + DecodeXLogTuple(data, + xlhdr->t_len + SizeOfHeapHeader, + change->tp.newtuple); + /* skip over the rest of the tuple header */ + data += SizeOfHeapHeader; + /* skip over the tuple data */ + data += xlhdr->t_len; + } + + if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) + { + xlhdr = (xl_heap_header_len *) data; + change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder); + DecodeXLogTuple((char *) &xlhdr->header, + xlhdr->t_len + SizeOfHeapHeader, + change->tp.oldtuple); + data = (char *) &xlhdr->header; + data += SizeOfHeapHeader; + data += xlhdr->t_len; + } + + ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. + */ +static void +DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogRecord *r = &buf->record; + xl_heap_delete *xlrec; + ReorderBufferChange *change; + + xlrec = (xl_heap_delete *) buf->record_data; + + /* only interested in our database */ + if (xlrec->target.node.dbNode != ctx->slot->data.database) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_DELETE; + + memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); + + /* old primary key stored */ + if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) + { + Assert(r->xl_len > (SizeOfHeapDelete + SizeOfHeapHeader)); + + change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder); + + DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete, + r->xl_len - SizeOfHeapDelete, + change->tp.oldtuple); + } + ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. + */ +static void +DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogRecord *r = &buf->record; + xl_heap_multi_insert *xlrec; + int i; + char *data; + bool isinit = (r->xl_info & XLOG_HEAP_INIT_PAGE) != 0; + + xlrec = (xl_heap_multi_insert *) buf->record_data; + + /* only interested in our database */ + if (xlrec->node.dbNode != ctx->slot->data.database) + return; + + data = buf->record_data + SizeOfHeapMultiInsert; + + /* + * OffsetNumbers (which are not of interest to us) are stored when + * XLOG_HEAP_INIT_PAGE is not set -- skip over them. 
+ */ + if (!isinit) + data += sizeof(OffsetNumber) * xlrec->ntuples; + + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_multi_insert_tuple *xlhdr; + int datalen; + ReorderBufferTupleBuf *tuple; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + memcpy(&change->tp.relnode, &xlrec->node, sizeof(RelFileNode)); + + /* + * CONTAINS_NEW_TUPLE will always be set currently as multi_insert + * isn't used for catalogs, but better be future proof. + * + * We decode the tuple in pretty much the same way as DecodeXLogTuple, + * but since the layout is slightly different, we can't use it here. + */ + if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) + { + change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); + + tuple = change->tp.newtuple; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfMultiInsertTuple; + datalen = xlhdr->datalen; + + /* + * We can only figure this out after reassembling the + * transactions. + */ + tuple->tuple.t_tableOid = InvalidOid; + tuple->tuple.t_data = &tuple->header; + tuple->tuple.t_len = datalen + + offsetof(HeapTupleHeaderData, t_bits); + + memset(&tuple->header, 0, sizeof(HeapTupleHeaderData)); + + memcpy((char *) &tuple->header + + offsetof(HeapTupleHeaderData, t_bits), + (char *) data, + datalen); + data += datalen; + + tuple->header.t_infomask = xlhdr->t_infomask; + tuple->header.t_infomask2 = xlhdr->t_infomask2; + tuple->header.t_hoff = xlhdr->t_hoff; + } + + ReorderBufferQueueChange(ctx->reorder, r->xl_xid, + buf->origptr, change); + } +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. 
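+ *
+ * For example, DecodeInsert above calls this as
+ *     DecodeXLogTuple((char *) xlrec + SizeOfHeapInsert,
+ *                     r->xl_len - SizeOfHeapInsert,
+ *                     change->tp.newtuple);
+ * i.e. 'data' points just past the record's fixed-size struct and 'len' is
+ * the remainder of the record.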
+ */
+static void
+DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
+{
+ xl_heap_header xlhdr;
+ int datalen = len - SizeOfHeapHeader;
+
+ Assert(datalen >= 0);
+ Assert(datalen <= MaxHeapTupleSize);
+
+ tuple->tuple.t_len = datalen + offsetof(HeapTupleHeaderData, t_bits);
+
+ /* not a disk based tuple */
+ ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+ /* we can only figure this out after reassembling the transactions */
+ tuple->tuple.t_tableOid = InvalidOid;
+ tuple->tuple.t_data = &tuple->header;
+
+ /* data is not stored aligned, copy to aligned storage */
+ memcpy((char *) &xlhdr,
+ data,
+ SizeOfHeapHeader);
+
+ memset(&tuple->header, 0, sizeof(HeapTupleHeaderData));
+
+ memcpy((char *) &tuple->header + offsetof(HeapTupleHeaderData, t_bits),
+ data + SizeOfHeapHeader,
+ datalen);
+
+ tuple->header.t_infomask = xlhdr.t_infomask;
+ tuple->header.t_infomask2 = xlhdr.t_infomask2;
+ tuple->header.t_hoff = xlhdr.t_hoff;
+}
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c new file mode 100644 index 00000000000..4fb0974f297 --- /dev/null +++ b/src/backend/replication/logical/logical.c @@ -0,0 +1,920 @@
+/*-------------------------------------------------------------------------
+ * logical.c
+ * PostgreSQL logical decoding coordination
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/logical.c
+ *
+ * NOTES
+ * This file coordinates interaction between the various modules that
+ * together provide logical decoding, primarily by providing so-called
+ * LogicalDecodingContexts. The goal is to encapsulate most of the
+ * internal complexity for consumers of logical decoding, so they can
+ * create and consume a changestream with a low amount of code.
+ *
+ * The idea is that a consumer provides three callbacks, one to read WAL,
+ * one to prepare a data write, and a final one for actually writing since
+ * their implementation depends on the type of consumer. Check
+ * logicalfuncs.c for an example implementation of a fairly simple consumer
+ * and an implementation of a WAL reading callback that's suitable for
+ * simpler consumers.
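+ *
+ * As a sketch, the write-side callbacks can be as small as the ones
+ * logicalfuncs.c uses for the SQL interface; both callbacks share the
+ * signature (the name below is only a placeholder):
+ *
+ *     static void
+ *     my_write(LogicalDecodingContext *ctx, XLogRecPtr lsn,
+ *              TransactionId xid, bool last_write);
+ *
+ * where the prepare callback typically just resets ctx->out and the write
+ * callback hands ctx->out->data (ctx->out->len bytes) to the transport.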
+ *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "miscadmin.h" + +#include "access/xact.h" + +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" + +#include "storage/proc.h" +#include "storage/procarray.h" + +#include "utils/memutils.h" + +/* data for errcontext callback */ +typedef struct LogicalErrorCallbackState +{ + LogicalDecodingContext *ctx; + const char *callback_name; + XLogRecPtr report_location; +} LogicalErrorCallbackState; + +/* wrappers around output plugin callbacks */ +static void output_plugin_error_callback(void *arg); +static void startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init); +static void shutdown_cb_wrapper(LogicalDecodingContext *ctx); +static void begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn); +static void commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); + +static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin); + +/* + * Make sure the current settings & environment are capable of doing logical + * decoding. + */ +void +CheckLogicalDecodingRequirements(void) +{ + CheckSlotRequirements(); + + if (wal_level < WAL_LEVEL_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires wal_level >= logical"))); + + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires a database connection"))); + + /* ---- + * TODO: We got to change that someday soon... + * + * There's basically three things missing to allow this: + * 1) We need to be able to correctly and quickly identify the timeline a + * LSN belongs to + * 2) We need to force hot_standby_feedback to be enabled at all times so + * the primary cannot remove rows we need. + * 3) support dropping replication slots referring to a database, in + * dbase_redo. There can't be any active ones due to HS recovery + * conflicts, so that should be relatively easy. + * ---- + */ + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("logical decoding cannot be used while in recovery"))); +} + +/* + * Helper function for CreateInitialDecodingContext() and + * CreateDecodingContext() performing common tasks. + */ +static LogicalDecodingContext * +StartupDecodingContext(List *output_plugin_options, + XLogRecPtr start_lsn, + TransactionId xmin_horizon, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write) +{ + ReplicationSlot *slot; + MemoryContext context, old_context; + LogicalDecodingContext *ctx; + + /* shorter lines... */ + slot = MyReplicationSlot; + + context = AllocSetContextCreate(CurrentMemoryContext, + "Changeset Extraction Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + old_context = MemoryContextSwitchTo(context); + ctx = palloc0(sizeof(LogicalDecodingContext)); + + ctx->context = context; + + /* (re-)load output plugins, so we detect a bad (removed) output plugin now. 
*/ + LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin)); + + /* + * Now that the slot's xmin has been set, we can announce ourselves as a + * logical decoding backend which doesn't need to be checked individually + * when computing the xmin horizon because the xmin is enforced via + * replication slots. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyPgXact->vacuumFlags |= PROC_IN_LOGICAL_DECODING; + LWLockRelease(ProcArrayLock); + + ctx->slot = slot; + + ctx->reader = XLogReaderAllocate(read_page, ctx); + ctx->reader->private_data = ctx; + + ctx->reorder = ReorderBufferAllocate(); + ctx->snapshot_builder = + AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn); + + ctx->reorder->private_data = ctx; + + /* wrap output plugin callbacks, so we can add error context information */ + ctx->reorder->begin = begin_cb_wrapper; + ctx->reorder->apply_change = change_cb_wrapper; + ctx->reorder->commit = commit_cb_wrapper; + + ctx->out = makeStringInfo(); + ctx->prepare_write = prepare_write; + ctx->write = do_write; + + ctx->output_plugin_options = output_plugin_options; + + MemoryContextSwitchTo(old_context); + + return ctx; +} + +/* + * Create a new decoding context, for a new logical slot. + * + * plugin contains the name of the output plugin + * output_plugin_options contains options passed to the output plugin + * read_page, prepare_write, do_write are callbacks that have to be filled to + * perform the use-case dependent, actual, work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. + */ +LogicalDecodingContext * +CreateInitDecodingContext(char *plugin, + List *output_plugin_options, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write) +{ + TransactionId xmin_horizon = InvalidTransactionId; + ReplicationSlot *slot; + LogicalDecodingContext *ctx; + MemoryContext old_context; + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without a acquired slot"); + + if (plugin == NULL) + elog(ERROR, "cannot initialize logical decoding without a specified plugin"); + + /* Make sure the passed slot is suitable. These are user facing errors. */ + if (slot->data.database == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use physical replication slot created for logical decoding"))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name)))); + + if (IsTransactionState() && + GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot create logical replication slot in transaction that has performed writes"))); + + /* register output plugin name with slot */ + SpinLockAcquire(&slot->mutex); + strncpy(NameStr(slot->data.plugin), plugin, + NAMEDATALEN); + NameStr(slot->data.plugin)[NAMEDATALEN - 1] = '\0'; + SpinLockRelease(&slot->mutex); + + /* + * The replication slot mechanism is used to prevent removal of required + * WAL. 
As there is no interlock between this and checkpoints required WAL + * could be removed before ReplicationSlotsComputeRequiredLSN() has been + * called to prevent that. In the very unlikely case that this happens + * we'll just retry. + */ + while (true) + { + XLogSegNo segno; + + /* + * Let's start with enough information if we can, so log a standby + * snapshot and start decoding at exactly that position. + */ + if (!RecoveryInProgress()) + { + XLogRecPtr flushptr; + + /* start at current insert position*/ + slot->data.restart_lsn = GetXLogInsertRecPtr(); + + /* make sure we have enough information to start */ + flushptr = LogStandbySnapshot(); + + /* and make sure it's fsynced to disk */ + XLogFlush(flushptr); + } + else + slot->data.restart_lsn = GetRedoRecPtr(); + + /* prevent WAL removal as fast as possible */ + ReplicationSlotsComputeRequiredLSN(); + + /* + * If all required WAL is still there, great, otherwise retry. The + * slot should prevent further removal of WAL, unless there's a + * concurrent ReplicationSlotsComputeRequiredLSN() after we've written + * the new restart_lsn above, so normally we should never need to loop + * more than twice. + */ + XLByteToSeg(slot->data.restart_lsn, segno); + if (XLogGetLastRemovedSegno() < segno) + break; + } + + + /* ---- + * This is a bit tricky: We need to determine a safe xmin horizon to start + * decoding from, to avoid starting from a running xacts record referring + * to xids whose rows have been vacuumed or pruned + * already. GetOldestSafeDecodingTransactionId() returns such a value, but + * without further interlock it's return value might immediately be out of + * date. + * + * So we have to acquire the ProcArrayLock to prevent computation of new + * xmin horizons by other backends, get the safe decoding xid, and inform + * the slot machinery about the new limit. Once that's done the + * ProcArrayLock can be be released as the slot machinery now is + * protecting against vacuum. + * ---- + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + slot->effective_catalog_xmin = GetOldestSafeDecodingTransactionId(); + slot->data.catalog_xmin = slot->effective_catalog_xmin; + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + /* + * tell the snapshot builder to only assemble snapshot once reaching + * the a running_xact's record with the respective xmin. + */ + xmin_horizon = slot->data.catalog_xmin; + + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + + ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr, xmin_horizon, + read_page, prepare_write, do_write); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, true); + MemoryContextSwitchTo(old_context); + + return ctx; +} + +/* + * Create a new decoding context, for a logical slot that has previously been + * used already. + * + * start_lsn contains the LSN of the last received data or InvalidXLogRecPtr + * output_plugin_options contains options passed to the output plugin + * read_page, prepare_write, do_write are callbacks that have to be filled to + * perform the use-case dependent, actual, work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. 
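+ *
+ * A rough sketch of how this is driven, modelled on the SQL functions in
+ * logicalfuncs.c (the slot name, loop condition, and write callback names
+ * are placeholders supplied by the caller):
+ *
+ *     ReplicationSlotAcquire(slotname);
+ *     ctx = CreateDecodingContext(InvalidXLogRecPtr, options,
+ *                                 logical_read_local_xlog_page,
+ *                                 prepare_write, do_write);
+ *     while (more_wal)
+ *     {
+ *         record = XLogReadRecord(ctx->reader, startptr, &errm);
+ *         LogicalDecodingProcessRecord(ctx, record);
+ *         startptr = InvalidXLogRecPtr;
+ *     }
+ *     FreeDecodingContext(ctx);
+ *     ReplicationSlotRelease();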
+ */ +LogicalDecodingContext * +CreateDecodingContext(XLogRecPtr start_lsn, + List *output_plugin_options, + XLogPageReadCB read_page, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write) +{ + LogicalDecodingContext *ctx; + ReplicationSlot *slot; + MemoryContext old_context; + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without a acquired slot"); + + /* make sure the passed slot is suitable, these are user facing errors */ + if (slot->data.database == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("cannot use physical replication slot for logical decoding")))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name))))); + + if (start_lsn == InvalidXLogRecPtr) + { + /* continue from last position */ + start_lsn = slot->data.confirmed_flush; + } + else if (start_lsn < slot->data.confirmed_flush) + { + /* + * It might seem like we should error out in this case, but it's + * pretty common for a client to acknowledge a LSN it doesn't have to + * do anything for, and thus didn't store persistently, because the + * xlog records didn't result in anything relevant for logical + * decoding. Clients have to be able to do that to support + * synchronous replication. + */ + start_lsn = slot->data.confirmed_flush; + elog(DEBUG1, "cannot stream from %X/%X, minimum is %X/%X, forwarding", + (uint32)(start_lsn >> 32), (uint32)start_lsn, + (uint32)(slot->data.confirmed_flush >> 32), + (uint32)slot->data.confirmed_flush); + } + + ctx = StartupDecodingContext(output_plugin_options, + start_lsn, InvalidTransactionId, + read_page, prepare_write, do_write); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, true); + MemoryContextSwitchTo(old_context); + + ereport(LOG, + (errmsg("starting logical decoding for slot %s", + NameStr(slot->data.name)), + errdetail("streaming transactions committing after %X/%X, reading WAL from %X/%X", + (uint32)(slot->data.confirmed_flush >> 32), + (uint32)slot->data.confirmed_flush, + (uint32)(slot->data.restart_lsn >> 32), + (uint32)slot->data.restart_lsn))); + + return ctx; +} + +/* + * Returns true if an consistent initial decoding snapshot has been built. + */ +bool +DecodingContextReady(LogicalDecodingContext *ctx) +{ + return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT; +} + +/* + * Read from the decoding slot, until it is ready to start extracting changes. + */ +void +DecodingContextFindStartpoint(LogicalDecodingContext *ctx) +{ + XLogRecPtr startptr; + + /* Initialize from where to start reading WAL. */ + startptr = ctx->slot->data.restart_lsn; + + elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X", + (uint32)(ctx->slot->data.restart_lsn >> 32), + (uint32)ctx->slot->data.restart_lsn); + + /* Wait for a consistent starting point */ + for (;;) + { + XLogRecord *record; + char *err = NULL; + + /* + * If the caller requires that interrupts be checked, the read_page + * callback should do so, as those will often wait. 
+ */ + + /* the read_page callback waits for new WAL */ + record = XLogReadRecord(ctx->reader, startptr, &err); + if (err) + elog(ERROR, "%s", err); + + Assert(record); + + startptr = InvalidXLogRecPtr; + + LogicalDecodingProcessRecord(ctx, record); + + /* only continue till we found a consistent spot */ + if (DecodingContextReady(ctx)) + break; + } + + ctx->slot->data.confirmed_flush = ctx->reader->EndRecPtr; +} + +/* + * Free a previously allocated decoding context, invoking the shutdown + * callback if necessary. + */ +void +FreeDecodingContext(LogicalDecodingContext *ctx) +{ + if (ctx->callbacks.shutdown_cb != NULL) + shutdown_cb_wrapper(ctx); + + ReorderBufferFree(ctx->reorder); + FreeSnapshotBuilder(ctx->snapshot_builder); + XLogReaderFree(ctx->reader); + MemoryContextDelete(ctx->context); +} + +/* + * Prepare a write using the context's output routine. + */ +void +OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->accept_writes) + elog(ERROR, "writes are only accepted in commit, begin and change callbacks"); + + ctx->prepare_write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = true; +} + +/* + * Perform a write using the context's output routine. + */ +void +OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->prepared_write) + elog(ERROR, "OutputPluginPrepareWrite needs to be called before OutputPluginWrite"); + + ctx->write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = false; +} + +/* + * Load the output plugin, lookup its output plugin init function, and check + * that it provides the required callbacks. + */ +static void +LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin) +{ + LogicalOutputPluginInit plugin_init; + + plugin_init = (LogicalOutputPluginInit) + load_external_function(plugin, "_PG_output_plugin_init", false, NULL); + + if (plugin_init == NULL) + elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol"); + + /* ask the output plugin to fill the callback struct */ + plugin_init(callbacks); + + if (callbacks->begin_cb == NULL) + elog(ERROR, "output plugins have to register a begin callback"); + if (callbacks->change_cb == NULL) + elog(ERROR, "output plugins have to register a change callback"); + if (callbacks->commit_cb == NULL) + elog(ERROR, "output plugins have to register a commit callback"); +} + +static void +output_plugin_error_callback(void *arg) +{ + LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg; + /* not all callbacks have an associated LSN */ + if (state->report_location != InvalidXLogRecPtr) + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name, + (uint32)(state->report_location >> 32), + (uint32)state->report_location); + else + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name); +} + +static void +startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "startup"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) 
&state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + + /* do the actual work: call callback */ + ctx->callbacks.startup_cb(ctx, opt, is_init); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +shutdown_cb_wrapper(LogicalDecodingContext *ctx) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "shutdown"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + + /* do the actual work: call callback */ + ctx->callbacks.shutdown_cb(ctx); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + + +/* + * Callbacks for ReorderBuffer which add in some more information and then call + * output_plugin.h plugins. + */ +static void +begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + + /* do the actual work: call callback */ + ctx->callbacks.begin_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + + /* do the actual work: call callback */ + ctx->callbacks.commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + 
ctx->write_xid = txn->xid; + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->callbacks.change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * Set the required catalog xmin horizon for historic snapshots in the current + * replication slot. + * + * Note that in the most cases, we won't be able to immediately use the xmin + * to increase the xmin horizon, we need to wait till the client has confirmed + * receiving current_lsn with LogicalConfirmReceivedLocation(). + */ +void +LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) +{ + bool updated_xmin = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + + SpinLockAcquire(&slot->mutex); + + /* + * don't overwrite if we already have a newer xmin. This can + * happen if we restart decoding in a slot. + */ + if (TransactionIdPrecedesOrEquals(xmin, slot->data.catalog_xmin)) + { + } + /* + * If the client has already confirmed up to this lsn, we directly + * can mark this as accepted. This can happen if we restart + * decoding in a slot. + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + + /* our candidate can directly be used */ + updated_xmin = true; + } + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. + */ + else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + } + SpinLockRelease(&slot->mutex); + + /* candidate already valid with the current flush position, apply */ + if (updated_xmin) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Mark the minimal LSN (restart_lsn) we need to read to replay all + * transactions that have not yet committed at current_lsn. + * + * Just like IncreaseRestartDecodingForSlot this nly takes effect when the + * client has confirmed to have received current_lsn. + */ +void +LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn) +{ + bool updated_lsn = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(restart_lsn != InvalidXLogRecPtr); + Assert(current_lsn != InvalidXLogRecPtr); + + SpinLockAcquire(&slot->mutex); + + /* don't overwrite if have a newer restart lsn*/ + if (restart_lsn <= slot->data.restart_lsn) + { + } + /* + * We might have already flushed far enough to directly accept this lsn, in + * this case there is no need to check for existing candidate LSNs + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + + /* our candidate can directly be used */ + updated_lsn = true; + } + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. A missed + * value here will just cause some extra effort after reconnecting. 
+ */
+ if (slot->candidate_restart_valid == InvalidXLogRecPtr)
+ {
+ slot->candidate_restart_valid = current_lsn;
+ slot->candidate_restart_lsn = restart_lsn;
+
+ elog(DEBUG1, "got new restart lsn %X/%X at %X/%X",
+ (uint32) (restart_lsn >> 32), (uint32) restart_lsn,
+ (uint32) (current_lsn >> 32), (uint32) current_lsn);
+ }
+ else
+ {
+ elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X",
+ (uint32) (restart_lsn >> 32), (uint32) restart_lsn,
+ (uint32) (current_lsn >> 32), (uint32) current_lsn,
+ (uint32) (slot->candidate_restart_lsn >> 32),
+ (uint32) slot->candidate_restart_lsn,
+ (uint32) (slot->candidate_restart_valid >> 32),
+ (uint32) slot->candidate_restart_valid,
+ (uint32) (slot->data.confirmed_flush >> 32),
+ (uint32) slot->data.confirmed_flush
+ );
+ }
+ SpinLockRelease(&slot->mutex);
+
+ /* candidates are already valid with the current flush position, apply */
+ if (updated_lsn)
+ LogicalConfirmReceivedLocation(slot->data.confirmed_flush);
+}
+
+/*
+ * Handle a consumer's confirmation of having received all changes up to lsn.
+ */
+void
+LogicalConfirmReceivedLocation(XLogRecPtr lsn)
+{
+ Assert(lsn != InvalidXLogRecPtr);
+
+ /* Do an unlocked check for candidate_lsn first. */
+ if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr ||
+ MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr)
+ {
+ bool updated_xmin = false;
+ bool updated_restart = false;
+
+ /* use volatile pointer to prevent code rearrangement */
+ volatile ReplicationSlot *slot = MyReplicationSlot;
+
+ SpinLockAcquire(&slot->mutex);
+
+ slot->data.confirmed_flush = lsn;
+
+ /* if we're past the location required for bumping xmin, do so */
+ if (slot->candidate_xmin_lsn != InvalidXLogRecPtr &&
+ slot->candidate_xmin_lsn <= lsn)
+ {
+ /*
+ * We have to write the changed xmin to disk *before* we change
+ * the in-memory value, otherwise after a crash we wouldn't know
+ * that some catalog tuples might have been removed already.
+ *
+ * Ensure that by first writing to ->xmin and only updating
+ * ->effective_xmin once the new state is synced to disk. After a
+ * crash ->effective_xmin is set to ->xmin.
+ */
+ if (TransactionIdIsValid(slot->candidate_catalog_xmin) &&
+ slot->data.catalog_xmin != slot->candidate_catalog_xmin)
+ {
+ slot->data.catalog_xmin = slot->candidate_catalog_xmin;
+ slot->candidate_catalog_xmin = InvalidTransactionId;
+ slot->candidate_xmin_lsn = InvalidXLogRecPtr;
+ updated_xmin = true;
+ }
+ }
+
+ if (slot->candidate_restart_valid != InvalidXLogRecPtr &&
+ slot->candidate_restart_valid <= lsn)
+ {
+ Assert(slot->candidate_restart_lsn != InvalidXLogRecPtr);
+
+ slot->data.restart_lsn = slot->candidate_restart_lsn;
+ slot->candidate_restart_lsn = InvalidXLogRecPtr;
+ slot->candidate_restart_valid = InvalidXLogRecPtr;
+ updated_restart = true;
+ }
+
+ SpinLockRelease(&slot->mutex);
+
+ /* first write new xmin to disk, so we know what's up after a crash */
+ if (updated_xmin || updated_restart)
+ {
+ ReplicationSlotMarkDirty();
+ ReplicationSlotSave();
+ elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart);
+ }
+ /*
+ * Now the new xmin is safely on disk, we can let the global value
+ * advance. We do not take ProcArrayLock or similar since we only
+ * advance xmin here and there's not much harm done by a concurrent
+ * computation missing that.
+ */
+ if (updated_xmin)
+ {
+ SpinLockAcquire(&slot->mutex);
+ slot->effective_catalog_xmin = slot->data.catalog_xmin;
+ SpinLockRelease(&slot->mutex);
+
+ ReplicationSlotsComputeRequiredXmin(false);
+ ReplicationSlotsComputeRequiredLSN();
+ }
+ }
+ else
+ {
+ volatile ReplicationSlot *slot = MyReplicationSlot;
+
+ SpinLockAcquire(&slot->mutex);
+ slot->data.confirmed_flush = lsn;
+ SpinLockRelease(&slot->mutex);
+ }
+}
diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c new file mode 100644 index 00000000000..3b8ae3853ba --- /dev/null +++ b/src/backend/replication/logical/logicalfuncs.c @@ -0,0 +1,509 @@
+/*-------------------------------------------------------------------------
+ *
+ * logicalfuncs.c
+ *
+ * Support functions for using logical decoding and management of
+ * logical replication slots via SQL.
+ *
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/logicalfuncs.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+
+#include "catalog/pg_type.h"
+
+#include "nodes/makefuncs.h"
+
+#include "mb/pg_wchar.h"
+
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+#include "utils/pg_lsn.h"
+#include "utils/resowner.h"
+#include "utils/lsyscache.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
+
+#include "storage/fd.h"
+
+/* private data for writing out data */
+typedef struct DecodingOutputState {
+ Tuplestorestate *tupstore;
+ TupleDesc tupdesc;
+ bool binary_output;
+ int64 returned_rows;
+} DecodingOutputState;
+
+/*
+ * Prepare for an output plugin write.
+ */
+static void
+LogicalOutputPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ bool last_write)
+{
+ resetStringInfo(ctx->out);
+}
+
+/*
+ * Perform output plugin write into tuplestore.
+ */
+static void
+LogicalOutputWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ bool last_write)
+{
+ Datum values[3];
+ bool nulls[3];
+ DecodingOutputState *p;
+
+ /* SQL Datums can only be of a limited length... */
+ if (ctx->out->len > MaxAllocSize - VARHDRSZ)
+ elog(ERROR, "too much output for sql interface");
+
+ p = (DecodingOutputState *) ctx->output_writer_private;
+
+ memset(nulls, 0, sizeof(nulls));
+ values[0] = LSNGetDatum(lsn);
+ values[1] = TransactionIdGetDatum(xid);
+
+ /*
+ * Assert ctx->out is in database encoding when we're writing textual
+ * output.
+ */
+ if (!p->binary_output)
+ Assert(pg_verify_mbstr(GetDatabaseEncoding(),
+ ctx->out->data, ctx->out->len,
+ false));
+
+ /* ick, but cstring_to_text_with_len works for bytea perfectly fine */
+ values[2] = PointerGetDatum(
+ cstring_to_text_with_len(ctx->out->data, ctx->out->len));
+
+ tuplestore_putvalues(p->tupstore, p->tupdesc, values, nulls);
+ p->returned_rows++;
+}
+
+/*
+ * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but
+ * we currently don't have the infrastructure (elog!) to share it.
+ */ +static void +XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + static int sendFile = -1; + static XLogSegNo sendSegNo = 0; + static uint32 sendOff = 0; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = recptr % XLogSegSize; + + if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo)) + { + char path[MAXPGPATH]; + + /* Switch to another logfile segment */ + if (sendFile >= 0) + close(sendFile); + + XLByteToSeg(recptr, sendSegNo); + + XLogFilePath(path, tli, sendSegNo); + + sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); + + if (sendFile < 0) + { + if (errno == ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + path))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + sendOff = 0; + } + + /* Need to seek in the file? */ + if (sendOff != startoff) + { + if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) + { + char path[MAXPGPATH]; + + XLogFilePath(path, tli, sendSegNo); + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek in log segment %s to offset %u: %m", + path, startoff))); + } + sendOff = startoff; + } + + /* How many bytes are within this segment? */ + if (nbytes > (XLogSegSize - startoff)) + segbytes = XLogSegSize - startoff; + else + segbytes = nbytes; + + readbytes = read(sendFile, p, segbytes); + if (readbytes <= 0) + { + char path[MAXPGPATH]; + + XLogFilePath(path, tli, sendSegNo); + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %u, length %lu: %m", + path, sendOff, (unsigned long) segbytes))); + } + + /* Update state for read */ + recptr += readbytes; + + sendOff += readbytes; + nbytes -= readbytes; + p += readbytes; + } +} + +static void +check_permissions(void) +{ + if (!superuser() && !has_rolreplication(GetUserId())) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser or replication role to use replication slots")))); +} + +/* + * read_page callback for logical decoding contexts. + * + * Public because it would likely be very helpful for someone writing another + * output method outside walsender, e.g. in a bgworker. + * + * TODO: The walsender has it's own version of this, but it relies on the + * walsender's latch being set whenever WAL is flushed. No such infrastructure + * exists for normal backends, so we have to do a check/sleep/repeat style of + * loop for now. + */ +int +logical_read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *cur_page, TimeLineID *pageTLI) +{ + XLogRecPtr flushptr, + loc; + int count; + + loc = targetPagePtr + reqLen; + while (1) + { + /* + * TODO: we're going to have to do something more intelligent about + * timelines on standbys. Use readTimeLineHistory() and + * tliOfPointInHistory() to get the proper LSN? For now we'll catch + * that case earlier, but the code and TODO is left in here for when + * that changes. 
+ */ + if (!RecoveryInProgress()) + { + *pageTLI = ThisTimeLineID; + flushptr = GetFlushRecPtr(); + } + else + flushptr = GetXLogReplayRecPtr(pageTLI); + + if (loc <= flushptr) + break; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + } + + /* more than one block available */ + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; + /* not enough data there */ + else if (targetPagePtr + reqLen > flushptr) + return -1; + /* part of the page available */ + else + count = flushptr - targetPagePtr; + + XLogRead(cur_page, *pageTLI, targetPagePtr, XLOG_BLCKSZ); + + return count; +} + +/* + * Helper function for the various SQL callable logical decoding functions. + */ +static Datum +pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary) +{ + Name name = PG_GETARG_NAME(0); + XLogRecPtr upto_lsn; + int32 upto_nchanges; + + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + XLogRecPtr end_of_wal; + XLogRecPtr startptr; + + LogicalDecodingContext *ctx; + + ResourceOwner old_resowner = CurrentResourceOwner; + ArrayType *arr; + Size ndim; + List *options = NIL; + DecodingOutputState *p; + + if (PG_ARGISNULL(1)) + upto_lsn = InvalidXLogRecPtr; + else + upto_lsn = PG_GETARG_LSN(1); + + if (PG_ARGISNULL(2)) + upto_nchanges = InvalidXLogRecPtr; + else + upto_nchanges = PG_GETARG_INT32(2); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* state to write output to */ + p = palloc0(sizeof(DecodingOutputState)); + + p->binary_output = binary; + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &p->tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + check_permissions(); + + CheckLogicalDecodingRequirements(); + + arr = PG_GETARG_ARRAYTYPE_P(3); + ndim = ARR_NDIM(arr); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + if (ndim > 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must be one-dimensional"))); + } + else if (array_contains_nulls(arr)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must not contain nulls"))); + } + else if (ndim == 1) + { + int nelems; + Datum *datum_opts; + int i; + + Assert(ARR_ELEMTYPE(arr) == TEXTOID); + + deconstruct_array(arr, TEXTOID, -1, false, 'i', + &datum_opts, NULL, &nelems); + + if (nelems % 2 != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must have even number of elements"))); + + for (i = 0; i < nelems; i += 2) + { + char *name = TextDatumGetCString(datum_opts[i]); + char *opt = TextDatumGetCString(datum_opts[i + 1]); + + options = lappend(options, makeDefElem(name, (Node *) makeString(opt))); + } + } + + p->tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = p->tupstore; + rsinfo->setDesc = p->tupdesc; + + /* compute the current end-of-wal */ + if (!RecoveryInProgress()) + end_of_wal = GetFlushRecPtr(); + else + end_of_wal = 
GetXLogReplayRecPtr(NULL); + + CheckLogicalDecodingRequirements(); + ReplicationSlotAcquire(NameStr(*name)); + + PG_TRY(); + { + ctx = CreateDecodingContext(InvalidXLogRecPtr, + options, + logical_read_local_xlog_page, + LogicalOutputPrepareWrite, + LogicalOutputWrite); + + MemoryContextSwitchTo(oldcontext); + + /* + * Check whether the output pluggin writes textual output if that's + * what we need. + */ + if (!binary && + ctx->options.output_type != OUTPUT_PLUGIN_TEXTUAL_OUTPUT) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("output plugin cannot produce text output"))); + + ctx->output_writer_private = p; + + startptr = MyReplicationSlot->data.restart_lsn; + + CurrentResourceOwner = ResourceOwnerCreate(CurrentResourceOwner, "logical decoding"); + + /* invalidate non-timetravel entries */ + InvalidateSystemCaches(); + + while ((startptr != InvalidXLogRecPtr && startptr < end_of_wal) || + (ctx->reader->EndRecPtr && ctx->reader->EndRecPtr < end_of_wal)) + { + XLogRecord *record; + char *errm = NULL; + + record = XLogReadRecord(ctx->reader, startptr, &errm); + if (errm) + elog(ERROR, "%s", errm); + + startptr = InvalidXLogRecPtr; + + /* + * The {begin_txn,change,commit_txn}_wrapper callbacks above will + * store the description into our tuplestore. + */ + if (record != NULL) + LogicalDecodingProcessRecord(ctx, record); + + /* check limits */ + if (upto_lsn != InvalidXLogRecPtr && + upto_lsn <= ctx->reader->EndRecPtr) + break; + if (upto_nchanges != 0 && + upto_nchanges <= p->returned_rows) + break; + } + } + PG_CATCH(); + { + /* clear all timetravel entries */ + InvalidateSystemCaches(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + tuplestore_donestoring(tupstore); + + CurrentResourceOwner = old_resowner; + + /* + * Next time, start where we left off. (Hunting things, the family + * business..) + */ + if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm) + LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr); + + /* free context, call shutdown callback */ + FreeDecodingContext(ctx); + + ReplicationSlotRelease(); + InvalidateSystemCaches(); + + return (Datum) 0; +} + +/* + * SQL function returning the changestream as text, consuming the data. + */ +Datum +pg_logical_slot_get_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, false); + return ret; +} + +/* + * SQL function returning the changestream as text, only peeking ahead. + */ +Datum +pg_logical_slot_peek_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, false); + return ret; +} + +/* + * SQL function returning the changestream in binary, consuming the data. + */ +Datum +pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, true); + return ret; +} + +/* + * SQL function returning the changestream in binary, only peeking ahead. 
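+ *
+ * For example (the slot name is a placeholder), at the SQL level
+ *     SELECT * FROM pg_logical_slot_peek_binary_changes('myslot', NULL, NULL);
+ * returns the pending changes without consuming them.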
+ */ +Datum +pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS) +{ + Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, true); + return ret; +} diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c new file mode 100644 index 00000000000..e7182338b89 --- /dev/null +++ b/src/backend/replication/logical/reorderbuffer.c @@ -0,0 +1,3059 @@ +/*------------------------------------------------------------------------- + * + * reorderbuffer.c + * PostgreSQL logical replay/reorder buffer management + * + * + * Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/reorderbuffer.c + * + * NOTES + * This module gets handed individual pieces of transactions in the order + * they are written to the WAL and is responsible to reassemble them into + * toplevel transaction sized pieces. When a transaction is completely + * reassembled - signalled by reading the transaction commit record - it + * will then call the output plugin (c.f. ReorderBufferCommit()) with the + * individual changes. The output plugins rely on snapshots built by + * snapbuild.c which hands them to us. + * + * Transactions and subtransactions/savepoints in postgres are not + * immediately linked to each other from outside the performing + * backend. Only at commit/abort (or special xact_assignment records) they + * are linked together. Which means that we will have to splice together a + * toplevel transaction from its subtransactions. To do that efficiently we + * build a binary heap indexed by the smallest current lsn of the individual + * subtransactions' changestreams. As the individual streams are inherently + * ordered by LSN - since that is where we build them from - the transaction + * can easily be reassembled by always using the subtransaction with the + * smallest current LSN from the heap. + * + * In order to cope with large transactions - which can be several times as + * big as the available memory - this module supports spooling the contents + * of a large transactions to disk. When the transaction is replayed the + * contents of individual (sub-)transactions will be read from disk in + * chunks. + * + * This module also has to deal with reassembling toast records from the + * individual chunks stored in WAL. When a new (or initial) version of a + * tuple is stored in WAL it will always be preceded by the toast chunks + * emitted for the columns stored out of line. Within a single toplevel + * transaction there will be no other data carrying records between a row's + * toast chunks and the row data itself. See ReorderBufferToast* for + * details. 
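+ *
+ * The typical call sequence driven by decode.c is roughly (sketch):
+ *
+ *     change = ReorderBufferGetChange(rb);
+ *     change->action = REORDER_BUFFER_CHANGE_INSERT;
+ *     ReorderBufferQueueChange(rb, xid, lsn, change);
+ *     ...
+ *     ReorderBufferCommit(rb, xid, commit_lsn, end_lsn, commit_time);
+ *
+ * with ReorderBufferAbort() or ReorderBufferForget() used instead when the
+ * transaction aborts or turns out not to be of interest.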
+ * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "miscadmin.h" + +#include "access/rewriteheap.h" +#include "access/transam.h" +#include "access/tuptoaster.h" +#include "access/xact.h" + +#include "catalog/catalog.h" + +#include "common/relpath.h" + +#include "lib/binaryheap.h" + +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */ + +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/sinval.h" + +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" +#include "utils/relcache.h" +#include "utils/relfilenodemap.h" +#include "utils/tqual.h" + +/* + * For efficiency and simplicity reasons we want to keep Snapshots, CommandIds + * and ComboCids in the same list with the user visible INSERT/UPDATE/DELETE + * changes. We don't want to leak those internal values to external users + * though (they would just use switch()...default:) because that would make it + * harder to add to new user visible values. + * + * This needs to be synchronized with ReorderBufferChangeType! Adjust the + * StaticAssertExpr's in ReorderBufferAllocate if you add anything! + */ +typedef enum +{ + REORDER_BUFFER_CHANGE_INTERNAL_INSERT, + REORDER_BUFFER_CHANGE_INTERNAL_UPDATE, + REORDER_BUFFER_CHANGE_INTERNAL_DELETE, + REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT, + REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID, + REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID +} ReorderBufferChangeTypeInternal; + +/* entry for a hash table we use to map from xid to our transaction state */ +typedef struct ReorderBufferTXNByIdEnt +{ + TransactionId xid; + ReorderBufferTXN *txn; +} ReorderBufferTXNByIdEnt; + +/* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */ +typedef struct ReorderBufferTupleCidKey +{ + RelFileNode relnode; + ItemPointerData tid; +} ReorderBufferTupleCidKey; + +typedef struct ReorderBufferTupleCidEnt +{ + ReorderBufferTupleCidKey key; + CommandId cmin; + CommandId cmax; + CommandId combocid; /* just for debugging */ +} ReorderBufferTupleCidEnt; + +/* k-way in-order change iteration support structures */ +typedef struct ReorderBufferIterTXNEntry +{ + XLogRecPtr lsn; + ReorderBufferChange *change; + ReorderBufferTXN *txn; + int fd; + XLogSegNo segno; +} ReorderBufferIterTXNEntry; + +typedef struct ReorderBufferIterTXNState +{ + binaryheap *heap; + Size nr_txns; + dlist_head old_change; + ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; +} ReorderBufferIterTXNState; + +/* toast datastructures */ +typedef struct ReorderBufferToastEnt +{ + Oid chunk_id; /* toast_table.chunk_id */ + int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we + * have seen */ + Size num_chunks; /* number of chunks we've already seen */ + Size size; /* combined size of chunks seen */ + dlist_head chunks; /* linked list of chunks */ + struct varlena *reconstructed; /* reconstructed varlena now pointed + * to in main tup */ +} ReorderBufferToastEnt; + +/* Disk serialization support datastructures */ +typedef struct ReorderBufferDiskChange +{ + Size size; + ReorderBufferChange change; + /* data follows */ +} ReorderBufferDiskChange; + +/* + * Maximum number of changes kept in memory, per transaction. After that, + * changes are spooled to disk. 
+ * + * The current value should be sufficient to decode the entire transaction + * without hitting disk in OLTP workloads, while starting to spool to disk in + * other workloads reasonably fast. + * + * At some point in the future it probaly makes sense to have a more elaborate + * resource management here, but it's not entirely clear what that would look + * like. + */ +static const Size max_changes_in_memory = 4096; + +/* + * We use a very simple form of a slab allocator for frequently allocated + * objects, simply keeping a fixed number in a linked list when unused, + * instead pfree()ing them. Without that in many workloads aset.c becomes a + * major bottleneck, especially when spilling to disk while decoding batch + * workloads. + */ +static const Size max_cached_changes = 4096 * 2; +static const Size max_cached_tuplebufs = 4096 * 2; /* ~8MB */ +static const Size max_cached_transactions = 512; + + +/* --------------------------------------- + * primary reorderbuffer support routines + * --------------------------------------- + */ +static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb); +static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb, + TransactionId xid, bool create, bool *is_new, + XLogRecPtr lsn, bool create_as_top); + +static void AssertTXNLsnOrder(ReorderBuffer *rb); + +/* --------------------------------------- + * support functions for lsn-order iterating over the ->changes of a + * transaction and its subtransactions + * + * used for iteration over the k-way heap merge of a transaction and its + * subtransactions + * --------------------------------------- + */ +static ReorderBufferIterTXNState *ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn); +static ReorderBufferChange * + ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state); +static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state); +static void ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn); + +/* + * --------------------------------------- + * Disk serialization support functions + * --------------------------------------- + */ +static void ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd, ReorderBufferChange *change); +static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + int *fd, XLogSegNo *segno); +static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *change); +static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); + +static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); +static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid); + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ +static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, 
ReorderBufferChange *change); + + +/* + * Allocate a new ReorderBuffer + */ +ReorderBuffer * +ReorderBufferAllocate(void) +{ + ReorderBuffer *buffer; + HASHCTL hash_ctl; + MemoryContext new_ctx; + + StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_INSERT == (int) REORDER_BUFFER_CHANGE_INSERT, "out of sync enums"); + StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_UPDATE == (int) REORDER_BUFFER_CHANGE_UPDATE, "out of sync enums"); + StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_DELETE == (int) REORDER_BUFFER_CHANGE_DELETE, "out of sync enums"); + + /* allocate memory in own context, to have better accountability */ + new_ctx = AllocSetContextCreate(CurrentMemoryContext, + "ReorderBuffer", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + buffer = + (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer)); + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + + buffer->context = new_ctx; + + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = buffer->context; + + buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + + buffer->by_txn_last_xid = InvalidTransactionId; + buffer->by_txn_last_txn = NULL; + + buffer->nr_cached_transactions = 0; + buffer->nr_cached_changes = 0; + buffer->nr_cached_tuplebufs = 0; + + buffer->outbuf = NULL; + buffer->outbufsize = 0; + + buffer->current_restart_decoding_lsn = InvalidXLogRecPtr; + + dlist_init(&buffer->toplevel_by_lsn); + dlist_init(&buffer->cached_transactions); + dlist_init(&buffer->cached_changes); + slist_init(&buffer->cached_tuplebufs); + + return buffer; +} + +/* + * Free a ReorderBuffer + */ +void +ReorderBufferFree(ReorderBuffer *rb) +{ + MemoryContext context = rb->context; + + /* + * We free separately allocated data by entirely scrapping reorderbuffer's + * memory context. + */ + MemoryContextDelete(context); +} + +/* + * Get a unused, possibly preallocated, ReorderBufferTXN. + */ +static ReorderBufferTXN * +ReorderBufferGetTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + /* check the slab cache */ + if (rb->nr_cached_transactions > 0) + { + rb->nr_cached_transactions--; + txn = (ReorderBufferTXN *) + dlist_container(ReorderBufferTXN, node, + dlist_pop_head_node(&rb->cached_transactions)); + } + else + { + txn = (ReorderBufferTXN *) + MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN)); + } + + memset(txn, 0, sizeof(ReorderBufferTXN)); + + dlist_init(&txn->changes); + dlist_init(&txn->tuplecids); + dlist_init(&txn->subtxns); + + return txn; +} + +/* + * Free a ReorderBufferTXN. + * + * Deallocation might be delayed for efficiency purposes, for details check + * the comments above max_cached_changes's definition. 
+ */ +void +ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* clean the lookup cache if we were cached (quite likely) */ + if (rb->by_txn_last_xid == txn->xid) + { + rb->by_txn_last_xid = InvalidTransactionId; + rb->by_txn_last_txn = NULL; + } + + /* free data that's contained */ + + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + if (txn->invalidations) + { + pfree(txn->invalidations); + txn->invalidations = NULL; + } + + /* check whether to put into the slab cache */ + if (rb->nr_cached_transactions < max_cached_transactions) + { + rb->nr_cached_transactions++; + dlist_push_head(&rb->cached_transactions, &txn->node); + VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN)); + VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node)); + } + else + { + pfree(txn); + } +} + +/* + * Get a unused, possibly preallocated, ReorderBufferChange. + */ +ReorderBufferChange * +ReorderBufferGetChange(ReorderBuffer *rb) +{ + ReorderBufferChange *change; + + /* check the slab cache */ + if (rb->nr_cached_changes) + { + rb->nr_cached_changes--; + change = (ReorderBufferChange *) + dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&rb->cached_changes)); + } + else + { + change = (ReorderBufferChange *) + MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange)); + } + + memset(change, 0, sizeof(ReorderBufferChange)); + return change; +} + +/* + * Free an ReorderBufferChange. + * + * Deallocation might be delayed for efficiency purposes, for details check + * the comments above max_cached_changes's definition. + */ +void +ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change) +{ + /* free contained data */ + switch ((ReorderBufferChangeTypeInternal) change->action_internal) + { + case REORDER_BUFFER_CHANGE_INTERNAL_INSERT: + case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE: + case REORDER_BUFFER_CHANGE_INTERNAL_DELETE: + if (change->tp.newtuple) + { + ReorderBufferReturnTupleBuf(rb, change->tp.newtuple); + change->tp.newtuple = NULL; + } + + if (change->tp.oldtuple) + { + ReorderBufferReturnTupleBuf(rb, change->tp.oldtuple); + change->tp.oldtuple = NULL; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + if (change->snapshot) + { + ReorderBufferFreeSnap(rb, change->snapshot); + change->snapshot = NULL; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + break; + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + /* check whether to put into the slab cache */ + if (rb->nr_cached_changes < max_cached_changes) + { + rb->nr_cached_changes++; + dlist_push_head(&rb->cached_changes, &change->node); + VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange)); + VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node)); + } + else + { + pfree(change); + } +} + + +/* + * Get a unused, possibly preallocated, ReorderBufferTupleBuf + */ +ReorderBufferTupleBuf * +ReorderBufferGetTupleBuf(ReorderBuffer *rb) +{ + ReorderBufferTupleBuf *tuple; + + /* check the slab cache */ + if (rb->nr_cached_tuplebufs) + { + rb->nr_cached_tuplebufs--; + tuple = slist_container(ReorderBufferTupleBuf, node, + slist_pop_head_node(&rb->cached_tuplebufs)); +#ifdef USE_ASSERT_CHECKING + memset(tuple, 0xdeadbeef, sizeof(ReorderBufferTupleBuf)); +#endif + } + else + { + tuple = (ReorderBufferTupleBuf *) + MemoryContextAlloc(rb->context, sizeof(ReorderBufferTupleBuf)); + } + + return tuple; +} + +/* + * Free an ReorderBufferTupleBuf. 
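The Get and Return routines above follow a simple bounded free-list scheme: a released object is kept on a linked list (up to a fixed count) and handed out again by the next Get call, so the general-purpose allocator is only hit when the cache is empty or already full. A minimal sketch of that pattern, using plain malloc/free instead of memory contexts, might look as follows; the object type and cache size are made up.

#include <stdlib.h>
#include <string.h>

typedef struct Obj
{
	struct Obj *next;			/* free-list link, only meaningful while cached */
	char		payload[64];
} Obj;

#define MAX_CACHED 8

static Obj *freelist;
static int	ncached;

static Obj *
get_obj(void)
{
	Obj		   *obj;

	if (freelist != NULL)
	{
		/* reuse a cached object */
		obj = freelist;
		freelist = obj->next;
		ncached--;
	}
	else
	{
		obj = malloc(sizeof(Obj));
		if (obj == NULL)
			abort();
	}

	memset(obj, 0, sizeof(Obj));
	return obj;
}

static void
return_obj(Obj *obj)
{
	if (ncached < MAX_CACHED)
	{
		/* keep it around for the next get_obj() call */
		obj->next = freelist;
		freelist = obj;
		ncached++;
	}
	else
		free(obj);				/* cache already full, really release it */
}

int
main(void)
{
	Obj		   *a = get_obj();

	return_obj(a);				/* cached ... */
	a = get_obj();				/* ... and handed out again without malloc() */
	return_obj(a);
	return 0;
}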
+ * + * Deallocation might be delayed for efficiency purposes, for details check + * the comments above max_cached_changes's definition. + */ +void +ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple) +{ + /* check whether to put into the slab cache */ + if (rb->nr_cached_tuplebufs < max_cached_tuplebufs) + { + rb->nr_cached_tuplebufs++; + slist_push_head(&rb->cached_tuplebufs, &tuple->node); + VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf)); + VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node)); + } + else + { + pfree(tuple); + } +} + +/* + * Return the ReorderBufferTXN from the given buffer, specified by Xid. + * If create is true, and a transaction doesn't already exist, create it + * (with the given LSN, and as top transaction if that's specified); + * when this happens, is_new is set to true. + */ +static ReorderBufferTXN * +ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, + bool *is_new, XLogRecPtr lsn, bool create_as_top) +{ + ReorderBufferTXN *txn; + ReorderBufferTXNByIdEnt *ent; + bool found; + + Assert(TransactionIdIsValid(xid)); + Assert(!create || lsn != InvalidXLogRecPtr); + + /* + * Check the one-entry lookup cache first + */ + if (TransactionIdIsValid(rb->by_txn_last_xid) && + rb->by_txn_last_xid == xid) + { + txn = rb->by_txn_last_txn; + + if (txn != NULL) + { + /* found it, and it's valid */ + if (is_new) + *is_new = false; + return txn; + } + + /* + * cached as non-existant, and asked not to create? Then nothing else + * to do. + */ + if (!create) + return NULL; + /* otherwise fall through to create it */ + } + + /* + * If the cache wasn't hit or it yielded an "does-not-exist" and we want + * to create an entry. + */ + + /* search the lookup table */ + ent = (ReorderBufferTXNByIdEnt *) + hash_search(rb->by_txn, + (void *) &xid, + create ? HASH_ENTER : HASH_FIND, + &found); + if (found) + txn = ent->txn; + else if (create) + { + /* initialize the new entry, if creation was requested */ + Assert(ent != NULL); + + ent->txn = ReorderBufferGetTXN(rb); + ent->txn->xid = xid; + txn = ent->txn; + txn->first_lsn = lsn; + txn->restart_decoding_lsn = rb->current_restart_decoding_lsn; + + if (create_as_top) + { + dlist_push_tail(&rb->toplevel_by_lsn, &txn->node); + AssertTXNLsnOrder(rb); + } + } + else + txn = NULL; /* not found and not asked to create */ + + /* update cache */ + rb->by_txn_last_xid = xid; + rb->by_txn_last_txn = txn; + + if (is_new) + *is_new = !found; + + Assert(!create || !!txn); + return txn; +} + +/* + * Queue a change into a transaction so it can be replayed upon commit. 
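ReorderBufferTXNByXid() above fronts its hash table with a one-entry cache (by_txn_last_xid / by_txn_last_txn), which pays off because WAL records for the same xid tend to arrive in runs. A minimal sketch of that last-hit cache, with toy types and a stand-in for the real hash lookup, could look like this; unlike the real function, the sketch only short-circuits positive hits and does not remember "does not exist" results.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t TxId;

typedef struct Txn
{
	TxId		xid;
	/* per-transaction state would live here */
} Txn;

static Txn	sample = {42};		/* stands in for an entry in the hash table */

static TxId last_xid;			/* one-entry cache: last xid looked up */
static Txn *last_txn;			/* and the transaction it mapped to */

static Txn *
hash_lookup_txn(TxId xid)
{
	/* stand-in for the real dynahash lookup */
	return xid == sample.xid ? &sample : NULL;
}

static Txn *
lookup_txn(TxId xid)
{
	Txn		   *txn;

	/* fast path: same xid as last time and it existed */
	if (last_txn != NULL && last_xid == xid)
		return last_txn;

	txn = hash_lookup_txn(xid);

	/* remember the result for the next call */
	last_xid = xid;
	last_txn = txn;
	return txn;
}

int
main(void)
{
	printf("%p\n", (void *) lookup_txn(42));	/* goes to the hash table */
	printf("%p\n", (void *) lookup_txn(42));	/* served from the cache */
	return 0;
}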
+ */ +void +ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, + ReorderBufferChange *change) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + change->lsn = lsn; + Assert(InvalidXLogRecPtr != lsn); + dlist_push_tail(&txn->changes, &change->node); + txn->nentries++; + txn->nentries_mem++; + + ReorderBufferCheckSerializeTXN(rb, txn); +} + +static void +AssertTXNLsnOrder(ReorderBuffer *rb) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_first_lsn = InvalidXLogRecPtr; + + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, iter.cur); + Assert(cur_txn->first_lsn != InvalidXLogRecPtr); + + if (cur_txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_txn->first_lsn <= cur_txn->end_lsn); + + if (prev_first_lsn != InvalidXLogRecPtr) + Assert(prev_first_lsn < cur_txn->first_lsn); + + Assert(!cur_txn->is_known_as_subxact); + prev_first_lsn = cur_txn->first_lsn; + } +#endif +} + +ReorderBufferTXN * +ReorderBufferGetOldestTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + if (dlist_is_empty(&rb->toplevel_by_lsn)) + return NULL; + + AssertTXNLsnOrder(rb); + + txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn); + + Assert(!txn->is_known_as_subxact); + Assert(txn->first_lsn != InvalidXLogRecPtr); + return txn; +} + +void +ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr) +{ + rb->current_restart_decoding_lsn = ptr; +} + +void +ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *subtxn; + bool new_top; + bool new_sub; + + txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true); + subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false); + + if (new_sub) + { + /* + * we assign subtransactions to top level transaction even if we don't + * have data for it yet, assignment records frequently reference xids + * that have not yet produced any records. Knowing those aren't top + * level xids allows us to make processing cheaper in some places. + */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + } + else if (!subtxn->is_known_as_subxact) + { + subtxn->is_known_as_subxact = true; + Assert(subtxn->nsubtxns == 0); + + /* remove from lsn order list of top-level transactions */ + dlist_delete(&subtxn->node); + + /* add to toplevel transaction */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + } + else if (new_top) + { + elog(ERROR, "existing subxact assigned to unknown toplevel xact"); + } +} + +/* + * Associate a subtransaction with its toplevel transaction at commit + * time. There may be no further changes added after this. + */ +void +ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr commit_lsn, + XLogRecPtr end_lsn) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *subtxn; + + subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL, + InvalidXLogRecPtr, false); + + /* + * No need to do anything if that subtxn didn't contain any changes + */ + if (!subtxn) + return; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, true); + + if (txn == NULL) + elog(ERROR, "subxact logged without previous toplevel record"); + + /* + * Pass the our base snapshot to the parent transaction if it doesn't have + * one, or ours is older. 
That can happen if there are no changes in the + * toplevel transaction but in one of the child transactions. This allows + * the parent to simply use it's base snapshot initially. + */ + if (txn->base_snapshot == NULL || + txn->base_snapshot_lsn > subtxn->base_snapshot_lsn) + { + txn->base_snapshot = subtxn->base_snapshot; + txn->base_snapshot_lsn = subtxn->base_snapshot_lsn; + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + } + + subtxn->final_lsn = commit_lsn; + subtxn->end_lsn = end_lsn; + + if (!subtxn->is_known_as_subxact) + { + subtxn->is_known_as_subxact = true; + Assert(subtxn->nsubtxns == 0); + + /* remove from lsn order list of top-level transactions */ + dlist_delete(&subtxn->node); + + /* add to subtransaction list */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + } +} + + +/* + * Support for efficiently iterating over a transaction's and its + * subtransactions' changes. + * + * We do by doing a k-way merge between transactions/subtransactions. For that + * we model the current heads of the different transactions as a binary heap + * so we easily know which (sub-)transaction has the change with the smallest + * lsn next. + * + * We assume the changes in individual transactions are already sorted by LSN. + */ + +/* + * Binary heap comparison function. + */ +static int +ReorderBufferIterCompare(Datum a, Datum b, void *arg) +{ + ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg; + XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn; + XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn; + + if (pos_a < pos_b) + return 1; + else if (pos_a == pos_b) + return 0; + return -1; +} + +/* + * Allocate & initialize an iterator which iterates in lsn order over a + * transaction and all its subtransactions. + */ +static ReorderBufferIterTXNState * +ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + Size nr_txns = 0; + ReorderBufferIterTXNState *state; + dlist_iter cur_txn_i; + int32 off; + + /* + * Calculate the size of our heap: one element for every transaction that + * contains changes. (Besides the transactions already in the reorder + * buffer, we count the one we were directly passed.) + */ + if (txn->nentries > 0) + nr_txns++; + + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + if (cur_txn->nentries > 0) + nr_txns++; + } + + /* + * TODO: Consider adding fastpath for the rather common nr_txns=1 case, no + * need to allocate/build a heap then. + */ + + /* allocate iteration state */ + state = (ReorderBufferIterTXNState *) + MemoryContextAllocZero(rb->context, + sizeof(ReorderBufferIterTXNState) + + sizeof(ReorderBufferIterTXNEntry) * nr_txns); + + state->nr_txns = nr_txns; + dlist_init(&state->old_change); + + for (off = 0; off < state->nr_txns; off++) + { + state->entries[off].fd = -1; + state->entries[off].segno = 0; + } + + /* allocate heap */ + state->heap = binaryheap_allocate(state->nr_txns, + ReorderBufferIterCompare, + state); + + /* + * Now insert items into the binary heap, in an unordered fashion. (We + * will run a heap assembly step at the end; this is more efficient.) 
+ */ + + off = 0; + + /* add toplevel transaction if it contains changes */ + if (txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (txn->nentries != txn->nentries_mem) + ReorderBufferRestoreChanges(rb, txn, &state->entries[off].fd, + &state->entries[off].segno); + + cur_change = dlist_head_element(ReorderBufferChange, node, + &txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + + /* add subtransactions if they contain changes */ + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + if (cur_txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (txn->nentries != txn->nentries_mem) + ReorderBufferRestoreChanges(rb, cur_txn, + &state->entries[off].fd, + &state->entries[off].segno); + + cur_change = dlist_head_element(ReorderBufferChange, node, + &cur_txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = cur_txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + } + + /* assemble a valid binary heap */ + binaryheap_build(state->heap); + + return state; +} + +/* + * Return the next change when iterating over a transaction and its + * subtransactions. + * + * Returns NULL when no further changes exist. + */ +static ReorderBufferChange * +ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) +{ + ReorderBufferChange *change; + ReorderBufferIterTXNEntry *entry; + int32 off; + + /* nothing there anymore */ + if (state->heap->bh_size == 0) + return NULL; + + off = DatumGetInt32(binaryheap_first(state->heap)); + entry = &state->entries[off]; + + /* free memory we might have "leaked" in the previous *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change); + Assert(dlist_is_empty(&state->old_change)); + } + + change = entry->change; + + /* + * update heap with information about which transaction has the next + * relevant change in LSN order + */ + + /* there are in-memory changes */ + if (dlist_has_next(&entry->txn->changes, &entry->change->node)) + { + dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node); + ReorderBufferChange *next_change = + dlist_container(ReorderBufferChange, node, next); + + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + return change; + } + + /* try to load changes from disk */ + if (entry->txn->nentries != entry->txn->nentries_mem) + { + /* + * Ugly: restoring changes will reuse *Change records, thus delete the + * current one from the per-tx list and only free in the next call. 
+ */ + dlist_delete(&change->node); + dlist_push_tail(&state->old_change, &change->node); + + if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->fd, + &state->entries[off].segno)) + { + /* successfully restored changes from disk */ + ReorderBufferChange *next_change = + dlist_head_element(ReorderBufferChange, node, + &entry->txn->changes); + + elog(DEBUG2, "restored %u/%u changes from disk", + (uint32) entry->txn->nentries_mem, + (uint32) entry->txn->nentries); + + Assert(entry->txn->nentries_mem); + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + + return change; + } + } + + /* ok, no changes there anymore, remove */ + binaryheap_remove_first(state->heap); + + return change; +} + +/* + * Deallocate the iterator + */ +static void +ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state) +{ + int32 off; + + for (off = 0; off < state->nr_txns; off++) + { + if (state->entries[off].fd != -1) + CloseTransientFile(state->entries[off].fd); + } + + /* free memory we might have "leaked" in the last *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change); + Assert(dlist_is_empty(&state->old_change)); + } + + binaryheap_free(state->heap); + pfree(state); +} + +/* + * Cleanup the contents of a transaction, usually after the transaction + * committed or aborted. + */ +static void +ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + bool found; + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(subtxn->is_known_as_subxact); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferCleanupTXN(rb, subtxn); + } + + /* cleanup changes in the toplevel txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + ReorderBufferReturnChange(rb, change); + } + + /* + * Cleanup the tuplecids we stored for decoding catalog snapshot + * access. They are always stored in the toplevel transaction. 
+ */ + dlist_foreach_modify(iter, &txn->tuplecids) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + ReorderBufferReturnChange(rb, change); + } + + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + txn->base_snapshot = NULL; + txn->base_snapshot_lsn = InvalidXLogRecPtr; + } + + /* delete from list of known subxacts */ + if (txn->is_known_as_subxact) + { + /* NB: nsubxacts count of parent will be too high now */ + dlist_delete(&txn->node); + } + /* delete from LSN ordered list of toplevel TXNs */ + else + { + dlist_delete(&txn->node); + } + + /* now remove reference from buffer */ + hash_search(rb->by_txn, + (void *) &txn->xid, + HASH_REMOVE, + &found); + Assert(found); + + /* remove entries spilled to disk */ + if (txn->nentries != txn->nentries_mem) + ReorderBufferRestoreCleanup(rb, txn); + + /* deallocate */ + ReorderBufferReturnTXN(rb, txn); +} + +/* + * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by + * tqual.c's HeapTupleSatisfiesHistoricMVCC. + */ +static void +ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter iter; + HASHCTL hash_ctl; + + if (!txn->has_catalog_changes || dlist_is_empty(&txn->tuplecids)) + return; + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + + hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey); + hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = rb->context; + + /* + * create the hash with the exact number of to-be-stored tuplecids from + * the start + */ + txn->tuplecid_hash = + hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + + dlist_foreach(iter, &txn->tuplecids) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + bool found; + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + key.relnode = change->tuplecid.node; + + ItemPointerCopy(&change->tuplecid.tid, + &key.tid); + + ent = (ReorderBufferTupleCidEnt *) + hash_search(txn->tuplecid_hash, + (void *) &key, + HASH_ENTER | HASH_FIND, + &found); + if (!found) + { + ent->cmin = change->tuplecid.cmin; + ent->cmax = change->tuplecid.cmax; + ent->combocid = change->tuplecid.combocid; + } + else + { + Assert(ent->cmin == change->tuplecid.cmin); + Assert(ent->cmax == InvalidCommandId || + ent->cmax == change->tuplecid.cmax); + + /* + * if the tuple got valid in this transaction and now got deleted + * we already have a valid cmin stored. The cmax will be + * InvalidCommandId though. + */ + ent->cmax = change->tuplecid.cmax; + } + } +} + +/* + * Copy a provided snapshot so we can modify it privately. This is needed so + * that catalog modifying transactions can look into intermediate catalog + * states. 
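The memset of the key struct above (marked "be careful about padding") matters because the struct is hashed and compared as raw bytes: compiler-inserted padding would otherwise contain garbage and make logically equal keys hash or compare differently. A minimal standalone illustration with a toy key type:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef struct Key
{
	uint16_t	a;				/* typically followed by 2 padding bytes */
	uint32_t	b;
} Key;

int
main(void)
{
	Key			k1;
	Key			k2;

	/* zero everything, including padding, then set the real fields */
	memset(&k1, 0, sizeof(Key));
	memset(&k2, 0, sizeof(Key));
	k1.a = k2.a = 1;
	k1.b = k2.b = 2;

	/* with the memset, byte-wise comparison (or hashing) is reliable */
	printf("equal as raw bytes: %s\n",
		   memcmp(&k1, &k2, sizeof(Key)) == 0 ? "yes" : "no");
	return 0;
}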
+ */ +static Snapshot +ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid) +{ + Snapshot snap; + dlist_iter iter; + int i = 0; + Size size; + + size = sizeof(SnapshotData) + + sizeof(TransactionId) * orig_snap->xcnt + + sizeof(TransactionId) * (txn->nsubtxns + 1); + + snap = MemoryContextAllocZero(rb->context, size); + memcpy(snap, orig_snap, sizeof(SnapshotData)); + + snap->copied = true; + snap->active_count = 0; + snap->regd_count = 1; + snap->xip = (TransactionId *) (snap + 1); + + memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt); + + /* + * snap->subxip contains all txids that belong to our transaction which we + * need to check via cmin/cmax. Thats why we store the toplevel + * transaction in there as well. + */ + snap->subxip = snap->xip + snap->xcnt; + snap->subxip[i++] = txn->xid; + + /* + * nsubxcnt isn't decreased when subtransactions abort, so count + * manually. Since it's an upper boundary it is safe to use it for the + * allocation above. + */ + snap->subxcnt = 1; + + dlist_foreach(iter, &txn->subtxns) + { + ReorderBufferTXN *sub_txn; + + sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur); + snap->subxip[i++] = sub_txn->xid; + snap->subxcnt++; + } + + /* sort so we can bsearch() later */ + qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator); + + /* store the specified current CommandId */ + snap->curcid = cid; + + return snap; +} + +/* + * Free a previously ReorderBufferCopySnap'ed snapshot + */ +static void +ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap) +{ + if (snap->copied) + pfree(snap); + else + SnapBuildSnapDecRefcount(snap); +} + +/* + * Perform the replay of a transaction and it's non-aborted subtransactions. + * + * Subtransactions previously have to be processed by + * ReorderBufferCommitChild(), even if previously assigned to the toplevel + * transaction with ReorderBufferAssignChild. + * + * We currently can only decode a transaction's contents in when their commit + * record is read because that's currently the only place where we know about + * cache invalidations. Thus, once a toplevel commit is read, we iterate over + * the top and subtransactions (using a k-way merge) and replay the changes in + * lsn order. + */ +void +ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time) +{ + ReorderBufferTXN *txn; + ReorderBufferIterTXNState *iterstate = NULL; + ReorderBufferChange *change; + + volatile CommandId command_id = FirstCommandId; + volatile Snapshot snapshot_now = NULL; + volatile bool txn_started = false; + volatile bool subtxn_started = false; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->commit_time = commit_time; + + /* serialize the last bunch of changes if we need start earlier anyway */ + if (txn->nentries_mem != txn->nentries) + ReorderBufferSerializeTXN(rb, txn); + + /* + * If this transaction didn't have any real changes in our database, it's + * OK not to have a snapshot. Note that ReorderBufferCommitChild will have + * transferred its snapshot to this transaction if it had one and the + * toplevel tx didn't. 
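ReorderBufferCopySnap() above fills subxip with the transaction's own xids and sorts the array with xidComparator precisely so that later visibility checks can use bsearch(). A minimal sketch of that sort-then-search pattern follows; the comparator is a simplified stand-in for xidComparator and ignores xid wraparound.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef uint32_t TxId;

static int
xid_cmp(const void *a, const void *b)
{
	TxId		xa = *(const TxId *) a;
	TxId		xb = *(const TxId *) b;

	if (xa < xb)
		return -1;
	if (xa > xb)
		return 1;
	return 0;
}

int
main(void)
{
	TxId		subxip[] = {907, 903, 905, 904};
	size_t		subxcnt = sizeof(subxip) / sizeof(subxip[0]);
	TxId		probe = 905;
	TxId	   *found;

	/* sort once ... */
	qsort(subxip, subxcnt, sizeof(TxId), xid_cmp);

	/* ... so membership tests can use bsearch() */
	found = bsearch(&probe, subxip, subxcnt, sizeof(TxId), xid_cmp);
	printf("xid %u %s part of the transaction\n",
		   probe, found ? "is" : "is not");
	return 0;
}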
+ */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + ReorderBufferCleanupTXN(rb, txn); + return; + } + + snapshot_now = txn->base_snapshot; + + /* build data to be able to lookup the CommandIds of catalog tuples */ + ReorderBufferBuildTupleCidHash(rb, txn); + + /* setup the initial snapshot */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + + PG_TRY(); + { + txn_started = false; + + /* + * Decoding needs access to syscaches et al., which in turn use + * heavyweight locks and such. Thus we need to have enough state around + * to keep track of those. The easiest way is to simply use a + * transaction internally. That also allows us to easily enforce that + * nothing writes to the database by checking for xid assignments. + * + * When we're called via the SQL SRF there's already a transaction + * started, so start an explicit subtransaction there. + */ + if (IsTransactionOrTransactionBlock()) + { + BeginInternalSubTransaction("replay"); + subtxn_started = true; + } + else + { + StartTransactionCommand(); + txn_started = true; + } + + rb->begin(rb, txn); + + iterstate = ReorderBufferIterTXNInit(rb, txn); + while ((change = ReorderBufferIterTXNNext(rb, iterstate))) + { + Relation relation = NULL; + Oid reloid; + + switch ((ReorderBufferChangeTypeInternal) change->action_internal) + { + case REORDER_BUFFER_CHANGE_INTERNAL_INSERT: + case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE: + case REORDER_BUFFER_CHANGE_INTERNAL_DELETE: + Assert(snapshot_now); + + reloid = RelidByRelfilenode(change->tp.relnode.spcNode, + change->tp.relnode.relNode); + + /* + * Catalog tuple without data, emitted while catalog was + * in the process of being rewritten. + */ + if (reloid == InvalidOid && + change->tp.newtuple == NULL && + change->tp.oldtuple == NULL) + continue; + else if (reloid == InvalidOid) + elog(ERROR, "could not lookup relation %s", + relpathperm(change->tp.relnode, MAIN_FORKNUM)); + + relation = RelationIdGetRelation(reloid); + + if (relation == NULL) + elog(ERROR, "could open relation descriptor %s", + relpathperm(change->tp.relnode, MAIN_FORKNUM)); + + if (RelationIsLogicallyLogged(relation)) + { + /* + * For now ignore sequence changes entirely. Most of + * the time they don't log changes using records we + * understand, so it doesn't make sense to handle the + * few cases we do. + */ + if (relation->rd_rel->relkind == RELKIND_SEQUENCE) + { + } + /* user-triggered change */ + else if (!IsToastRelation(relation)) + { + ReorderBufferToastReplace(rb, txn, relation, change); + rb->apply_change(rb, txn, relation, change); + ReorderBufferToastReset(rb, txn); + } + /* we're not interested in toast deletions */ + else if (change->action == REORDER_BUFFER_CHANGE_INSERT) + { + /* + * Need to reassemble the full toasted Datum in + * memory, to ensure the chunks don't get reused + * till we're done remove it from the list of this + * transaction's changes. Otherwise it will get + * freed/reused while restoring spooled data from + * disk. + */ + dlist_delete(&change->node); + ReorderBufferToastAppendChunk(rb, txn, relation, + change); + } + + } + RelationClose(relation); + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + /* get rid of the old */ + TeardownHistoricSnapshot(false); + + if (snapshot_now->copied) + { + ReorderBufferFreeSnap(rb, snapshot_now); + snapshot_now = + ReorderBufferCopySnap(rb, change->snapshot, + txn, command_id); + } + /* + * Restored from disk, need to be careful not to double + * free. 
We could introduce refcounting for that, but for + * now this seems infrequent enough not to care. + */ + else if (change->snapshot->copied) + { + snapshot_now = + ReorderBufferCopySnap(rb, change->snapshot, + txn, command_id); + } + else + { + snapshot_now = change->snapshot; + } + + + /* and continue with the new one */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + Assert(change->command_id != InvalidCommandId); + + if (command_id < change->command_id) + { + command_id = change->command_id; + + if (!snapshot_now->copied) + { + /* we don't use the global one anymore */ + snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); + } + + snapshot_now->curcid = command_id; + + TeardownHistoricSnapshot(false); + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + + /* + * Every time the CommandId is incremented, we could + * see new catalog contents, so execute all + * invalidations. + */ + ReorderBufferExecuteInvalidations(rb, txn); + } + + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + elog(ERROR, "tuplecid value in changequeue"); + break; + } + } + + ReorderBufferIterTXNFinish(rb, iterstate); + + /* call commit callback */ + rb->commit(rb, txn, commit_lsn); + + /* this is just a sanity check against bad output plugin behaviour */ + if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) + elog(ERROR, "output plugin used xid %u", + GetCurrentTransactionId()); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(rb, txn); + + /* cleanup */ + TeardownHistoricSnapshot(false); + + /* + * Abort subtransaction or the transaction as a whole has the right + * semantics. We want all locks acquired in here to be released, not + * reassigned to the parent and we do not want any database access + * have persistent effects. + */ + if (subtxn_started) + RollbackAndReleaseCurrentSubTransaction(); + else if (txn_started) + AbortCurrentTransaction(); + + if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); + } + PG_CATCH(); + { + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ + if (iterstate) + ReorderBufferIterTXNFinish(rb, iterstate); + + TeardownHistoricSnapshot(true); + + if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + + if (subtxn_started) + RollbackAndReleaseCurrentSubTransaction(); + else if (txn_started) + AbortCurrentTransaction(); + + /* + * Invalidations in an aborted transactions aren't allowed to do + * catalog access, so we don't need to still have the snapshot setup. + */ + ReorderBufferExecuteInvalidations(rb, txn); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); + + PG_RE_THROW(); + } + PG_END_TRY(); +} + +/* + * Abort a transaction that possibly has previous changes. Needs to be first + * called for subtransactions and then for the toplevel xid. + * + * NB: Transactions handled here have to have actively aborted (i.e. have + * produced an abort record). Implicitly aborted transactions are handled via + * ReorderBufferAbortOld(); transactions we're just not interesteded in, but + * which have committed are handled in ReorderBufferForget(). + * + * This function purges this transaction and its contents from memory and + * disk. 
+ */
+void
+ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+
+	/* unknown, nothing to remove */
+	if (txn == NULL)
+		return;
+
+	/* cosmetic... */
+	txn->final_lsn = lsn;
+
+	/* remove potential on-disk data, and deallocate */
+	ReorderBufferCleanupTXN(rb, txn);
+}
+
+/*
+ * Abort all transactions that aren't actually running anymore because the
+ * server restarted.
+ *
+ * NB: These really have to be transactions that have aborted due to a server
+ * crash/immediate restart, as we don't deal with invalidations here.
+ */
+void
+ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
+{
+	dlist_mutable_iter it;
+
+	/*
+	 * Iterate through all (potential) toplevel TXNs and abort all that are
+	 * older than what possibly can be running. Once we've found the first
+	 * that is alive we stop; there might be some that acquired an xid earlier
+	 * but started writing later, but it's unlikely and they will be cleaned
+	 * up in a later call to ReorderBufferAbortOld().
+	 */
+	dlist_foreach_modify(it, &rb->toplevel_by_lsn)
+	{
+		ReorderBufferTXN *txn;
+
+		txn = dlist_container(ReorderBufferTXN, node, it.cur);
+
+		if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
+		{
+			elog(DEBUG1, "aborting old transaction %u", txn->xid);
+
+			/* remove potential on-disk data, and deallocate this tx */
+			ReorderBufferCleanupTXN(rb, txn);
+		}
+		else
+			return;
+	}
+}
+
+/*
+ * Forget the contents of a transaction if we aren't interested in its
+ * contents. Needs to be first called for subtransactions and then for the
+ * toplevel xid.
+ *
+ * This is significantly different from ReorderBufferAbort() because
+ * transactions that have committed need to be treated differently from
+ * aborted ones since they may have modified the catalog.
+ *
+ * Note that this is only allowed to be called at the moment a transaction
+ * commit has just been read, not earlier; otherwise later records referring
+ * to this xid might re-create the transaction incompletely.
+ */
+void
+ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+	ReorderBufferTXN *txn;
+
+	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+								false);
+
+	/* unknown, nothing to forget */
+	if (txn == NULL)
+		return;
+
+	/* cosmetic... */
+	txn->final_lsn = lsn;
+
+	/*
+	 * Process cache invalidation messages if there are any. Even if we're
+	 * not interested in the transaction's contents, it could have manipulated
+	 * the catalog and we need to update the caches according to that.
+ */ + if (txn->base_snapshot != NULL && txn->ninvalidations > 0) + { + /* setup snapshot to perform the invalidations in */ + SetupHistoricSnapshot(txn->base_snapshot, txn->tuplecid_hash); + PG_TRY(); + { + ReorderBufferExecuteInvalidations(rb, txn); + TeardownHistoricSnapshot(false); + } + PG_CATCH(); + { + /* cleanup */ + TeardownHistoricSnapshot(true); + PG_RE_THROW(); + } + PG_END_TRY(); + } + else + Assert(txn->ninvalidations == 0); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + + +/* + * Check whether a transaction is already known in this module.xs + */ +bool +ReorderBufferIsXidKnown(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + return txn != NULL; +} + +/* + * Add a new snapshot to this transaction that may only used after lsn 'lsn' + * because the previous snapshot doesn't describe the catalog correctly for + * following rows. + */ +void +ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->snapshot = snap; + change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT; + + ReorderBufferQueueChange(rb, xid, lsn, change); +} + +/* + * Setup the base snapshot of a transaction. The base snapshot is the snapshot + * that is used to decode all changes until either this transaction modifies + * the catalog or another catalog modifying transaction commits. + * + * Needs to be called before any changes are added with + * ReorderBufferQueueChange(). + */ +void +ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferTXN *txn; + bool is_new; + + txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true); + Assert(txn->base_snapshot == NULL); + Assert(snap != NULL); + + txn->base_snapshot = snap; + txn->base_snapshot_lsn = lsn; +} + +/* + * Access the catalog with this CommandId at this point in the changestream. + * + * May only be called for command ids > 1 + */ +void +ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, CommandId cid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->command_id = cid; + change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID; + + ReorderBufferQueueChange(rb, xid, lsn, change); +} + + +/* + * Add new (relfilenode, tid) -> (cmin, cmax) mappings. + */ +void +ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, RelFileNode node, + ItemPointerData tid, CommandId cmin, + CommandId cmax, CommandId combocid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + change->tuplecid.node = node; + change->tuplecid.tid = tid; + change->tuplecid.cmin = cmin; + change->tuplecid.cmax = cmax; + change->tuplecid.combocid = combocid; + change->lsn = lsn; + change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID; + + dlist_push_tail(&txn->tuplecids, &change->node); + txn->ntuplecids++; +} + +/* + * Setup the invalidation of the toplevel transaction. + * + * This needs to be done before ReorderBufferCommit is called! 
+ */ +void +ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + if (txn->ninvalidations != 0) + elog(ERROR, "only ever add one set of invalidations"); + + Assert(nmsgs > 0); + + txn->ninvalidations = nmsgs; + txn->invalidations = (SharedInvalidationMessage *) + MemoryContextAlloc(rb->context, + sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(txn->invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); +} + +/* + * Apply all invalidations we know. Possibly we only need parts at this point + * in the changestream but we don't know which those are. + */ +static void +ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + int i; + + for (i = 0; i < txn->ninvalidations; i++) + LocalExecuteInvalidationMessage(&txn->invalidations[i]); +} + +/* + * Mark a transaction as containing catalog changes + */ +void +ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + txn->has_catalog_changes = true; +} + +/* + * Query whether a transaction is already *known* to contain catalog + * changes. This can be wrong until directly before the commit! + */ +bool +ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + if (txn == NULL) + return false; + + return txn->has_catalog_changes; +} + +/* + * Have we already added the first snapshot? + */ +bool +ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* transaction isn't known yet, ergo no snapshot */ + if (txn == NULL) + return false; + + /* + * TODO: It would be a nice improvement if we would check the toplevel + * transaction in subtransactions, but we'd need to keep track of a bit + * more state. + */ + return txn->base_snapshot != NULL; +} + + +/* + * --------------------------------------- + * Disk serialization support + * --------------------------------------- + */ + +/* + * Ensure the IO buffer is >= sz. + */ +static void +ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz) +{ + if (!rb->outbufsize) + { + rb->outbuf = MemoryContextAlloc(rb->context, sz); + rb->outbufsize = sz; + } + else if (rb->outbufsize < sz) + { + rb->outbuf = repalloc(rb->outbuf, sz); + rb->outbufsize = sz; + } +} + +/* + * Check whether the transaction tx should spill its data to disk. + */ +static void +ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* + * TODO: improve accounting so we cheaply can take subtransactions into + * account here. + */ + if (txn->nentries_mem >= max_changes_in_memory) + { + ReorderBufferSerializeTXN(rb, txn); + Assert(txn->nentries_mem == 0); + } +} + +/* + * Spill data of a large transaction (and its subtransactions) to disk. 
+ */
+static void
+ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+	dlist_iter	subtxn_i;
+	dlist_mutable_iter change_i;
+	int			fd = -1;
+	XLogSegNo	curOpenSegNo = 0;
+	Size		spilled = 0;
+	char		path[MAXPGPATH];
+
+	elog(DEBUG2, "spill %u changes in tx %u to disk",
+		 (uint32) txn->nentries_mem, txn->xid);
+
+	/* do the same to all child TXs */
+	dlist_foreach(subtxn_i, &txn->subtxns)
+	{
+		ReorderBufferTXN *subtxn;
+
+		subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
+		ReorderBufferSerializeTXN(rb, subtxn);
+	}
+
+	/* serialize changestream */
+	dlist_foreach_modify(change_i, &txn->changes)
+	{
+		ReorderBufferChange *change;
+
+		change = dlist_container(ReorderBufferChange, node, change_i.cur);
+
+		/*
+		 * store in segment in which it belongs by start lsn, don't split over
+		 * multiple segments though
+		 */
+		if (fd == -1 || !XLByteInSeg(change->lsn, curOpenSegNo))
+		{
+			XLogRecPtr	recptr;
+
+			if (fd != -1)
+				CloseTransientFile(fd);
+
+			XLByteToSeg(change->lsn, curOpenSegNo);
+			XLogSegNoOffsetToRecPtr(curOpenSegNo, 0, recptr);
+
+			/*
+			 * No need to care about TLIs here, only used during a single run,
+			 * so each LSN only maps to a specific WAL record.
+			 */
+			sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
+					NameStr(MyReplicationSlot->data.name), txn->xid,
+					(uint32) (recptr >> 32), (uint32) recptr);
+
+			/* open segment, create it if necessary */
+			fd = OpenTransientFile(path,
+								   O_CREAT | O_WRONLY | O_APPEND | PG_BINARY,
+								   S_IRUSR | S_IWUSR);
+
+			if (fd < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not open file \"%s\": %m",
+								path)));
+		}
+
+		ReorderBufferSerializeChange(rb, txn, fd, change);
+		dlist_delete(&change->node);
+		ReorderBufferReturnChange(rb, change);
+
+		spilled++;
+	}
+
+	Assert(spilled == txn->nentries_mem);
+	Assert(dlist_is_empty(&txn->changes));
+	txn->nentries_mem = 0;
+
+	if (fd != -1)
+		CloseTransientFile(fd);
+}
+
+/*
+ * Serialize individual change to disk.
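The spill files written above are named per slot, xid, and WAL segment, with the segment's start LSN printed as two 32-bit halves (the usual %X/%X style convention for LSNs). A small standalone sketch of that name construction, using a made-up slot name, xid, and LSN:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const char *slot_name = "myslot";	/* hypothetical slot name */
	uint32_t	xid = 1234;
	uint64_t	seg_start_lsn = UINT64_C(0x000000011C000000);
	char		path[256];

	snprintf(path, sizeof(path), "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
			 slot_name, xid,
			 (uint32_t) (seg_start_lsn >> 32), (uint32_t) seg_start_lsn);

	/* prints: pg_replslot/myslot/xid-1234-lsn-1-1C000000.snap */
	puts(path);
	return 0;
}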
+ */
+static void
+ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+							 int fd, ReorderBufferChange *change)
+{
+	ReorderBufferDiskChange *ondisk;
+	Size		sz = sizeof(ReorderBufferDiskChange);
+
+	ReorderBufferSerializeReserve(rb, sz);
+
+	ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+	memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
+
+	switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+	{
+		case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+			/* fall through */
+		case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+			/* fall through */
+		case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+			{
+				char	   *data;
+				Size		oldlen = 0;
+				Size		newlen = 0;
+
+				if (change->tp.oldtuple)
+					oldlen = offsetof(ReorderBufferTupleBuf, data)
+						+ change->tp.oldtuple->tuple.t_len
+						- offsetof(HeapTupleHeaderData, t_bits);
+
+				if (change->tp.newtuple)
+					newlen = offsetof(ReorderBufferTupleBuf, data)
+						+ change->tp.newtuple->tuple.t_len
+						- offsetof(HeapTupleHeaderData, t_bits);
+
+				sz += oldlen;
+				sz += newlen;
+
+				/* make sure we have enough space */
+				ReorderBufferSerializeReserve(rb, sz);
+
+				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+				/* might have been reallocated above */
+				ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+				if (oldlen)
+				{
+					memcpy(data, change->tp.oldtuple, oldlen);
+					data += oldlen;
+					Assert(&change->tp.oldtuple->header == change->tp.oldtuple->tuple.t_data);
+				}
+
+				if (newlen)
+				{
+					memcpy(data, change->tp.newtuple, newlen);
+					data += newlen;
+					Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data);
+				}
+				break;
+			}
+		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+			{
+				char	   *data;
+
+				sz += sizeof(SnapshotData) +
+					sizeof(TransactionId) * change->snapshot->xcnt +
+					sizeof(TransactionId) * change->snapshot->subxcnt;
+
+				/* make sure we have enough space */
+				ReorderBufferSerializeReserve(rb, sz);
+				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+				/* might have been reallocated above */
+				ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+				memcpy(data, change->snapshot, sizeof(SnapshotData));
+				data += sizeof(SnapshotData);
+
+				if (change->snapshot->xcnt)
+				{
+					memcpy(data, change->snapshot->xip,
+						   sizeof(TransactionId) * change->snapshot->xcnt);
+					data += sizeof(TransactionId) * change->snapshot->xcnt;
+				}
+
+				if (change->snapshot->subxcnt)
+				{
+					memcpy(data, change->snapshot->subxip,
+						   sizeof(TransactionId) * change->snapshot->subxcnt);
+					data += sizeof(TransactionId) * change->snapshot->subxcnt;
+				}
+				break;
+			}
+		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+			/* ReorderBufferChange contains everything important */
+			break;
+		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+			/* ReorderBufferChange contains everything important */
+			break;
+	}
+
+	ondisk->size = sz;
+
+	if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
+	{
+		CloseTransientFile(fd);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to xid %u's data file: %m",
+						txn->xid)));
+	}
+
+	Assert(ondisk->change.action_internal == change->action_internal);
+}
+
+/*
+ * Restore a number of changes spilled to disk back into memory.
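Both the serialize path above and the restore path below rely on the same length-prefixed layout: a fixed-size header whose size field covers the whole record, followed by the variable-length data, so the reader first reads the fixed part and then exactly the remaining bytes. A minimal sketch of such a record round-trip, with a toy header instead of the real ReorderBufferDiskChange:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct DiskRecord
{
	size_t		size;			/* total record size, header included */
	/* variable-length payload follows */
} DiskRecord;

int
main(void)
{
	const char *payload = "some change data";
	size_t		total = sizeof(DiskRecord) + strlen(payload) + 1;
	char	   *buf = malloc(total);
	DiskRecord *rec;
	DiskRecord	hdr;

	if (buf == NULL)
		return 1;

	/* "serialize": header first, payload right after it */
	rec = (DiskRecord *) buf;
	rec->size = total;
	memcpy(buf + sizeof(DiskRecord), payload, strlen(payload) + 1);

	/* "restore": read the fixed part, then use hdr.size for the rest */
	memcpy(&hdr, buf, sizeof(DiskRecord));
	printf("record of %zu bytes, payload: %s\n",
		   hdr.size, buf + sizeof(DiskRecord));

	free(buf);
	return 0;
}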
+ */ +static Size +ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + int *fd, XLogSegNo *segno) +{ + Size restored = 0; + XLogSegNo last_segno; + dlist_mutable_iter cleanup_iter; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* free current entries, so we have memory for more */ + dlist_foreach_modify(cleanup_iter, &txn->changes) + { + ReorderBufferChange *cleanup = + dlist_container(ReorderBufferChange, node, cleanup_iter.cur); + + dlist_delete(&cleanup->node); + ReorderBufferReturnChange(rb, cleanup); + } + txn->nentries_mem = 0; + Assert(dlist_is_empty(&txn->changes)); + + XLByteToSeg(txn->final_lsn, last_segno); + + while (restored < max_changes_in_memory && *segno <= last_segno) + { + int readBytes; + ReorderBufferDiskChange *ondisk; + + if (*fd == -1) + { + XLogRecPtr recptr; + char path[MAXPGPATH]; + + /* first time in */ + if (*segno == 0) + { + XLByteToSeg(txn->first_lsn, *segno); + } + + Assert(*segno != 0 || dlist_is_empty(&txn->changes)); + XLogSegNoOffsetToRecPtr(*segno, 0, recptr); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. + */ + sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap", + NameStr(MyReplicationSlot->data.name), txn->xid, + (uint32) (recptr >> 32), (uint32) recptr); + + *fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + if (*fd < 0 && errno == ENOENT) + { + *fd = -1; + (*segno)++; + continue; + } + else if (*fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + + } + + ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange)); + + + /* + * Read the statically sized part of a change which has information + * about the total size. If we couldn't read a record, we're at the + * end of this file. + */ + + readBytes = read(*fd, rb->outbuf, sizeof(ReorderBufferDiskChange)); + + /* eof */ + if (readBytes == 0) + { + CloseTransientFile(*fd); + *fd = -1; + (*segno)++; + continue; + } + else if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("incomplete read from reorderbuffer spill file: read %d instead of %u", + readBytes, + (uint32) sizeof(ReorderBufferDiskChange)))); + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + ReorderBufferSerializeReserve(rb, + sizeof(ReorderBufferDiskChange) + ondisk->size); + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + readBytes = read(*fd, rb->outbuf + sizeof(ReorderBufferDiskChange), + ondisk->size - sizeof(ReorderBufferDiskChange)); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u", + readBytes, + (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange))))); + + /* + * ok, read a full change from disk, now restore it into proper + * in-memory format + */ + ReorderBufferRestoreChange(rb, txn, rb->outbuf); + restored++; + } + + return restored; +} + +/* + * Convert change from its on-disk format to in-memory format and queue it onto + * the TXN's ->changes list. 
+ */ +static void +ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *data) +{ + ReorderBufferDiskChange *ondisk; + ReorderBufferChange *change; + + ondisk = (ReorderBufferDiskChange *) data; + + change = ReorderBufferGetChange(rb); + + /* copy static part */ + memcpy(change, &ondisk->change, sizeof(ReorderBufferChange)); + + data += sizeof(ReorderBufferDiskChange); + + /* restore individual stuff */ + switch ((ReorderBufferChangeTypeInternal) change->action_internal) + { + case REORDER_BUFFER_CHANGE_INTERNAL_INSERT: + /* fall through */ + case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE: + /* fall through */ + case REORDER_BUFFER_CHANGE_INTERNAL_DELETE: + if (change->tp.newtuple) + { + Size len = offsetof(ReorderBufferTupleBuf, data) + +((ReorderBufferTupleBuf *) data)->tuple.t_len + - offsetof(HeapTupleHeaderData, t_bits); + + change->tp.newtuple = ReorderBufferGetTupleBuf(rb); + memcpy(change->tp.newtuple, data, len); + change->tp.newtuple->tuple.t_data = &change->tp.newtuple->header; + + data += len; + } + + if (change->tp.oldtuple) + { + Size len = offsetof(ReorderBufferTupleBuf, data) + +((ReorderBufferTupleBuf *) data)->tuple.t_len + - offsetof(HeapTupleHeaderData, t_bits); + + change->tp.oldtuple = ReorderBufferGetTupleBuf(rb); + memcpy(change->tp.oldtuple, data, len); + change->tp.oldtuple->tuple.t_data = &change->tp.oldtuple->header; + data += len; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot oldsnap = (Snapshot) data; + Size size = sizeof(SnapshotData) + + sizeof(TransactionId) * oldsnap->xcnt + + sizeof(TransactionId) * (oldsnap->subxcnt + 0) + ; + + Assert(change->snapshot != NULL); + + change->snapshot = MemoryContextAllocZero(rb->context, size); + + memcpy(change->snapshot, data, size); + change->snapshot->xip = (TransactionId *) + (((char *) change->snapshot) + sizeof(SnapshotData)); + change->snapshot->subxip = + change->snapshot->xip + change->snapshot->xcnt + 0; + change->snapshot->copied = true; + break; + } + /* the base struct contains all the data, easy peasy */ + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + dlist_push_tail(&txn->changes, &change->node); + txn->nentries_mem++; +} + +/* + * Remove all on-disk stored for the passed in transaction. + */ +static void +ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + XLogSegNo first; + XLogSegNo cur; + XLogSegNo last; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + XLByteToSeg(txn->first_lsn, first); + XLByteToSeg(txn->final_lsn, last); + + /* iterate over all possible filenames, and delete them */ + for (cur = first; cur <= last; cur++) + { + char path[MAXPGPATH]; + XLogRecPtr recptr; + + XLogSegNoOffsetToRecPtr(cur, 0, recptr); + + sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap", + NameStr(MyReplicationSlot->data.name), txn->xid, + (uint32) (recptr >> 32), (uint32) recptr); + if (unlink(path) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", path))); + } +} + +/* + * Delete all data spilled to disk after we've restarted/crashed. It will be + * recreated when the respective slots are reused. 
+ */ +void +StartupReorderBuffer(void) +{ + DIR *logical_dir; + struct dirent *logical_de; + + DIR *spill_dir; + struct dirent *spill_de; + + logical_dir = AllocateDir("pg_replslot"); + while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL) + { + struct stat statbuf; + char path[MAXPGPATH]; + + if (strcmp(logical_de->d_name, ".") == 0 || + strcmp(logical_de->d_name, "..") == 0) + continue; + + /* if it cannot be a slot, skip the directory */ + if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + continue; + + /* + * ok, has to be a surviving logical slot, iterate and delete + * everythign starting with xid-* + */ + sprintf(path, "pg_replslot/%s", logical_de->d_name); + + /* we're only creating directories here, skip if it's not our's */ + if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) + continue; + + spill_dir = AllocateDir(path); + while ((spill_de = ReadDir(spill_dir, path)) != NULL) + { + if (strcmp(spill_de->d_name, ".") == 0 || + strcmp(spill_de->d_name, "..") == 0) + continue; + + /* only look at names that can be ours */ + if (strncmp(spill_de->d_name, "xid", 3) == 0) + { + sprintf(path, "pg_replslot/%s/%s", logical_de->d_name, + spill_de->d_name); + + if (unlink(path) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", + path))); + } + } + FreeDir(spill_dir); + } + FreeDir(logical_dir); +} + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ + +/* + * Initialize per tuple toast reconstruction support. + */ +static void +ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASHCTL hash_ctl; + + Assert(txn->toast_hash == NULL); + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(ReorderBufferToastEnt); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = rb->context; + txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); +} + +/* + * Per toast-chunk handling for toast reconstruction + * + * Appends a toast chunk so we can reconstruct it when the tuple "owning" the + * toasted Datum comes along. 
+ */ +static void +ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + ReorderBufferToastEnt *ent; + bool found; + int32 chunksize; + bool isnull; + Pointer chunk; + TupleDesc desc = RelationGetDescr(relation); + Oid chunk_id; + Oid chunk_seq; + + if (txn->toast_hash == NULL) + ReorderBufferToastInitHash(rb, txn); + + Assert(IsToastRelation(relation)); + + chunk_id = DatumGetObjectId(fastgetattr(&change->tp.newtuple->tuple, 1, desc, &isnull)); + Assert(!isnull); + chunk_seq = DatumGetInt32(fastgetattr(&change->tp.newtuple->tuple, 2, desc, &isnull)); + Assert(!isnull); + + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + (void *) &chunk_id, + HASH_ENTER, + &found); + + if (!found) + { + Assert(ent->chunk_id == chunk_id); + ent->num_chunks = 0; + ent->last_chunk_seq = 0; + ent->size = 0; + ent->reconstructed = NULL; + dlist_init(&ent->chunks); + + if (chunk_seq != 0) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0", + chunk_seq, chunk_id); + } + else if (found && chunk_seq != ent->last_chunk_seq + 1) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d", + chunk_seq, chunk_id, ent->last_chunk_seq + 1); + + chunk = DatumGetPointer(fastgetattr(&change->tp.newtuple->tuple, 3, desc, &isnull)); + Assert(!isnull); + + /* calculate size so we can allocate the right size at once later */ + if (!VARATT_IS_EXTENDED(chunk)) + chunksize = VARSIZE(chunk) - VARHDRSZ; + else if (VARATT_IS_SHORT(chunk)) + /* could happen due to heap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + else + elog(ERROR, "unexpected type of toast chunk"); + + ent->size += chunksize; + ent->last_chunk_seq = chunk_seq; + ent->num_chunks++; + dlist_push_tail(&ent->chunks, &change->node); +} + +/* + * Rejigger change->newtuple to point to in-memory toast tuples instead to + * on-disk toast tuples that may not longer exist (think DROP TABLE or VACUUM). + * + * We cannot replace unchanged toast tuples though, so those will still point + * to on-disk toast data. + */ +static void +ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + TupleDesc desc; + int natt; + Datum *attrs; + bool *isnull; + bool *free; + HeapTuple newtup; + Relation toast_rel; + TupleDesc toast_desc; + MemoryContext oldcontext; + + /* no toast tuples changed */ + if (txn->toast_hash == NULL) + return; + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* we should only have toast tuples in an INSERT or UPDATE */ + Assert(change->tp.newtuple); + + desc = RelationGetDescr(relation); + + toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid); + toast_desc = RelationGetDescr(toast_rel); + + /* should we allocate from stack instead? 
*/ + attrs = palloc0(sizeof(Datum) * desc->natts); + isnull = palloc0(sizeof(bool) * desc->natts); + free = palloc0(sizeof(bool) * desc->natts); + + heap_deform_tuple(&change->tp.newtuple->tuple, desc, + attrs, isnull); + + for (natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute attr = desc->attrs[natt]; + ReorderBufferToastEnt *ent; + struct varlena *varlena; + + /* va_rawsize is the size of the original datum -- including header */ + struct varatt_external toast_pointer; + struct varatt_indirect redirect_pointer; + struct varlena *new_datum = NULL; + struct varlena *reconstructed; + dlist_iter it; + Size data_done = 0; + + /* system columns aren't toasted */ + if (attr->attnum < 0) + continue; + + if (attr->attisdropped) + continue; + + /* not a varlena datatype */ + if (attr->attlen != -1) + continue; + + /* no data */ + if (isnull[natt]) + continue; + + /* ok, we know we have a toast datum */ + varlena = (struct varlena *) DatumGetPointer(attrs[natt]); + + /* no need to do anything if the tuple isn't external */ + if (!VARATT_IS_EXTERNAL(varlena)) + continue; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena); + + /* + * Check whether the toast tuple changed, replace if so. + */ + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + (void *) &toast_pointer.va_valueid, + HASH_FIND, + NULL); + if (ent == NULL) + continue; + + new_datum = + (struct varlena *) palloc0(INDIRECT_POINTER_SIZE); + + free[natt] = true; + + reconstructed = palloc0(toast_pointer.va_rawsize); + + ent->reconstructed = reconstructed; + + /* stitch toast tuple back together from its parts */ + dlist_foreach(it, &ent->chunks) + { + bool isnull; + ReorderBufferTupleBuf *tup = + dlist_container(ReorderBufferChange, node, it.cur)->tp.newtuple; + Pointer chunk = + DatumGetPointer(fastgetattr(&tup->tuple, 3, toast_desc, &isnull)); + + Assert(!isnull); + Assert(!VARATT_IS_EXTERNAL(chunk)); + Assert(!VARATT_IS_SHORT(chunk)); + + memcpy(VARDATA(reconstructed) + data_done, + VARDATA(chunk), + VARSIZE(chunk) - VARHDRSZ); + data_done += VARSIZE(chunk) - VARHDRSZ; + } + Assert(data_done == toast_pointer.va_extsize); + + /* make sure its marked as compressed or not */ + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ); + else + SET_VARSIZE(reconstructed, data_done + VARHDRSZ); + + memset(&redirect_pointer, 0, sizeof(redirect_pointer)); + redirect_pointer.pointer = reconstructed; + + SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT); + memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer, + sizeof(redirect_pointer)); + + attrs[natt] = PointerGetDatum(new_datum); + } + + /* + * Build tuple in separate memory & copy tuple back into the tuplebuf + * passed to the output plugin. We can't directly heap_fill_tuple() into + * the tuplebuf because attrs[] will point back into the current content. + */ + newtup = heap_form_tuple(desc, attrs, isnull); + Assert(change->tp.newtuple->tuple.t_len <= MaxHeapTupleSize); + Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data); + + memcpy(change->tp.newtuple->tuple.t_data, + newtup->t_data, + newtup->t_len); + change->tp.newtuple->tuple.t_len = newtup->t_len; + + /* + * free resources we won't further need, more persistent stuff will be + * free'd in ReorderBufferToastReset(). 
+ */ + RelationClose(toast_rel); + pfree(newtup); + for (natt = 0; natt < desc->natts; natt++) + { + if (free[natt]) + pfree(DatumGetPointer(attrs[natt])); + } + pfree(attrs); + pfree(free); + pfree(isnull); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Free all resources allocated for toast reconstruction. + */ +static void +ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferToastEnt *ent; + + if (txn->toast_hash == NULL) + return; + + /* sequentially walk over the hash and free everything */ + hash_seq_init(&hstat, txn->toast_hash); + while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL) + { + dlist_mutable_iter it; + + if (ent->reconstructed != NULL) + pfree(ent->reconstructed); + + dlist_foreach_modify(it, &ent->chunks) + { + ReorderBufferChange *change = + dlist_container(ReorderBufferChange, node, it.cur); + + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change); + } + } + + hash_destroy(txn->toast_hash); + txn->toast_hash = NULL; +} + + +/* --------------------------------------- + * Visibility support for logical decoding + * + * + * Lookup actual cmin/cmax values when using decoding snapshot. We can't + * always rely on stored cmin/cmax values because of two scenarios: + * + * * A tuple got changed multiple times during a single transaction and thus + * has got a combocid. Combocid's are only valid for the duration of a + * single transaction. + * * A tuple with a cmin but no cmax (and thus no combocid) got + * deleted/updated in another transaction than the one which created it + * which we are looking at right now. As only one of cmin, cmax or combocid + * is actually stored in the heap we don't have access to the the value we + * need anymore. + * + * To resolve those problems we have a per-transaction hash of (cmin, + * cmax) tuples keyed by (relfilenode, ctid) which contains the actual + * (cmin, cmax) values. That also takes care of combocids by simply + * not caring about them at all. As we have the real cmin/cmax values + * combocids aren't interesting. + * + * As we only care about catalog tuples here the overhead of this + * hashtable should be acceptable. + * + * Heap rewrites complicate this a bit, check rewriteheap.c for + * details. + * ------------------------------------------------------------------------- + */ + +/* struct for qsort()ing mapping files by lsn somewhat efficiently */ +typedef struct RewriteMappingFile +{ + XLogRecPtr lsn; + char fname[MAXPGPATH]; +} RewriteMappingFile; + +#if NOT_USED +static void +DisplayMapping(HTAB *tuplecid_data) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferTupleCidEnt *ent; + + hash_seq_init(&hstat, tuplecid_data); + while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL) + { + elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u", + ent->key.relnode.dbNode, + ent->key.relnode.spcNode, + ent->key.relnode.relNode, + BlockIdGetBlockNumber(&ent->key.tid.ip_blkid), + ent->key.tid.ip_posid, + ent->cmin, + ent->cmax + ); + } +} +#endif + +/* + * Apply a single mapping file to tuplecid_data. + * + * The mapping file has to have been verified to be a) committed b) for our + * transaction c) applied in LSN order. 
+ */ +static void +ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname) +{ + char path[MAXPGPATH]; + int fd; + int readBytes; + LogicalRewriteMappingData map; + + sprintf(path, "pg_llog/mappings/%s", fname); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + ereport(ERROR, + (errmsg("could not open file \"%s\": %m", path))); + + while (true) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ReorderBufferTupleCidEnt *new_ent; + bool found; + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + /* read all mappings till the end of the file */ + readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData)); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else if (readBytes == 0) /* EOF */ + break; + else if (readBytes != sizeof(LogicalRewriteMappingData)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d instead of %d", + path, readBytes, + (int32) sizeof(LogicalRewriteMappingData)))); + + key.relnode = map.old_node; + ItemPointerCopy(&map.old_tid, + &key.tid); + + + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_FIND, + NULL); + + /* no existing mapping, no need to update */ + if (!ent) + continue; + + key.relnode = map.new_node; + ItemPointerCopy(&map.new_tid, + &key.tid); + + new_ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_ENTER, + &found); + + if (found) + { + /* + * Make sure the existing mapping makes sense. We sometime update + * old records that did not yet have a cmax (e.g. pg_class' own + * entry while rewriting it) during rewrites, so allow that. + */ + Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin); + Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax); + } + else + { + /* update mapping */ + new_ent->cmin = ent->cmin; + new_ent->cmax = ent->cmax; + new_ent->combocid = ent->combocid; + } + } +} + + +/* + * Check whether the TransactionOId 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return bsearch(&xid, xip, num, + sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * qsort() comparator for sorting RewriteMappingFiles in LSN order. + */ +static int +file_sort_by_lsn(const void *a_p, const void *b_p) +{ + RewriteMappingFile *a = *(RewriteMappingFile **)a_p; + RewriteMappingFile *b = *(RewriteMappingFile **)b_p; + + if (a->lsn < b->lsn) + return -1; + else if (a->lsn > b->lsn) + return 1; + return 0; +} + +/* + * Apply any existing logical remapping files if there are any targeted at our + * transaction for relid. + */ +static void +UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) +{ + DIR *mapping_dir; + struct dirent *mapping_de; + List *files = NIL; + ListCell *file; + RewriteMappingFile **files_a; + size_t off; + Oid dboid = IsSharedRelation(relid) ? 
InvalidOid : MyDatabaseId; + + mapping_dir = AllocateDir("pg_llog/mappings"); + while ((mapping_de = ReadDir(mapping_dir, "pg_llog/mappings")) != NULL) + { + Oid f_dboid; + Oid f_relid; + TransactionId f_mapped_xid; + TransactionId f_create_xid; + XLogRecPtr f_lsn; + uint32 f_hi, f_lo; + RewriteMappingFile *f; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + /* Ignore files that aren't ours*/ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &f_dboid, &f_relid, &f_hi, &f_lo, + &f_mapped_xid, &f_create_xid) != 6) + elog(ERROR, "could not parse fname %s", mapping_de->d_name); + + f_lsn = ((uint64) f_hi) << 32 | f_lo; + + /* mapping for another database */ + if (f_dboid != dboid) + continue; + + /* mapping for another relation */ + if (f_relid != relid) + continue; + + /* did the creating transaction abort? */ + if (!TransactionIdDidCommit(f_create_xid)) + continue; + + /* not for our transaction */ + if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt)) + continue; + + /* ok, relevant, queue for apply */ + f = palloc(sizeof(RewriteMappingFile)); + f->lsn = f_lsn; + strcpy(f->fname, mapping_de->d_name); + files = lappend(files, f); + } + FreeDir(mapping_dir); + + /* build array we can easily sort */ + files_a = palloc(list_length(files) * sizeof(RewriteMappingFile *)); + off = 0; + foreach(file, files) + { + files_a[off++] = lfirst(file); + } + + /* sort files so we apply them in LSN order */ + qsort(files_a, list_length(files), sizeof(RewriteMappingFile *), + file_sort_by_lsn); + + for(off = 0; off < list_length(files); off++) + { + RewriteMappingFile *f = files_a[off]; + elog(DEBUG1, "applying mapping: %s in %u", f->fname, + snapshot->subxip[0]); + ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); + pfree(f); + } +} + +/* + * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on + * combocids. + */ +bool +ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, Buffer buffer, + CommandId *cmin, CommandId *cmax) +{ + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ForkNumber forkno; + BlockNumber blockno; + bool updated_mapping = false; + + /* be careful about padding */ + memset(&key, 0, sizeof(key)); + + Assert(!BufferIsLocal(buffer)); + + /* + * get relfilenode from the buffer, no convenient way to access it other + * than that. + */ + BufferGetTag(buffer, &key.relnode, &forkno, &blockno); + + /* tuples can only be in the main fork */ + Assert(forkno == MAIN_FORKNUM); + Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self)); + + ItemPointerCopy(&htup->t_self, + &key.tid); + +restart: + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_FIND, + NULL); + + /* + * failed to find a mapping, check whether the table was rewritten and + * apply mapping if so, but only do that once - there can be no new + * mappings while we are in here since we have to hold a lock on the + * relation. 
+ */
+	if (ent == NULL && !updated_mapping)
+	{
+		UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
+		/* now check but don't update for a mapping again */
+		updated_mapping = true;
+		goto restart;
+	}
+	else if (ent == NULL)
+		return false;
+
+	if (cmin)
+		*cmin = ent->cmin;
+	if (cmax)
+		*cmax = ent->cmax;
+	return true;
+}
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
new file mode 100644
index 00000000000..28f9a8a1a6f
--- /dev/null
+++ b/src/backend/replication/logical/snapbuild.c
@@ -0,0 +1,1885 @@
+/*-------------------------------------------------------------------------
+ *
+ * snapbuild.c
+ *
+ *	  Infrastructure for building historic catalog snapshots based on contents
+ *	  of the WAL, for the purpose of decoding heapam.c style values in the
+ *	  WAL.
+ *
+ * NOTES:
+ *
+ * We build snapshots which can *only* be used to read catalog contents and we
+ * do so by reading and interpreting the WAL stream. The aim is to build a
+ * snapshot that behaves the same as a freshly taken MVCC snapshot would have
+ * at the time the XLogRecord was generated.
+ *
+ * To build the snapshots we reuse the infrastructure built for Hot
+ * Standby. The in-memory snapshots we build look different than HS' because
+ * we have different needs. To successfully decode data from the WAL we only
+ * need to access catalog tables and (sys|rel|cat)cache, not the actual user
+ * tables since the data we decode is wholly contained in the WAL
+ * records. Also, our snapshots need to be different in comparison to normal
+ * MVCC ones because in contrast to those we cannot fully rely on the clog and
+ * pg_subtrans for information about committed transactions because they might
+ * commit in the future from the POV of the WAL entry we're currently
+ * decoding. This definition has the advantage that we only need to prevent
+ * removal of catalog rows, while normal tables' rows can still be
+ * removed. This is achieved by using the replication slot mechanism.
+ *
+ * As the percentage of transactions modifying the catalog normally is fairly
+ * small in comparison to ones only manipulating user data, we keep track of
+ * the committed catalog-modifying ones inside (xmin, xmax) instead of keeping
+ * track of all running transactions like it's done in a normal snapshot. Note
+ * that we're generally only looking at transactions that have acquired an
+ * xid. That is, we keep a list of transactions between snapshot->(xmin, xmax)
+ * that we consider committed; everything else is considered aborted/in
+ * progress. That also allows us not to care about subtransactions before they
+ * have committed, which means this module, in contrast to HS, doesn't have to
+ * care about suboverflowed subtransactions and similar.
+ *
+ * One complexity of doing this is that to e.g. handle mixed DDL/DML
+ * transactions we need Snapshots that see intermediate versions of the
+ * catalog in a transaction. During normal operation this is achieved by using
+ * CommandIds/cmin/cmax. The problem with that however is that for space
+ * efficiency reasons only one value of that is stored
+ * (c.f. combocid.c). Since ComboCids are only available in memory we log
+ * additional information which allows us to get the original (cmin, cmax)
+ * pair during visibility checks. Check reorderbuffer.c's comment above
+ * ResolveCminCmaxDuringDecoding() for details.
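For illustration, a visibility check built on top of the logged (cmin, cmax) pairs might consume ResolveCminCmaxDuringDecoding() (defined above in reorderbuffer.c) roughly as in the sketch below. The function name and the exact cutoff rules are assumptions made for the example, not part of this patch.

    /*
     * Illustrative sketch only: decide catalog-tuple visibility using the
     * logged cmin/cmax instead of the (possibly combo) cid stored on the
     * tuple. "tuplecid_data" is the per-transaction hash described above.
     */
    static bool
    CatalogTupleVisibleForDecoding(HTAB *tuplecid_data, Snapshot snapshot,
                                   HeapTuple htup, Buffer buffer)
    {
        CommandId   cmin;
        CommandId   cmax;

        /* look up the real cmin/cmax logged for this (relfilenode, ctid) */
        if (!ResolveCminCmaxDuringDecoding(tuplecid_data, snapshot,
                                           htup, buffer, &cmin, &cmax))
            return false;       /* nothing was logged for this tuple */

        /* inserted at or after the command currently being decoded */
        if (cmin >= snapshot->curcid)
            return false;

        /* already deleted before the command currently being decoded */
        if (cmax != InvalidCommandId && cmax < snapshot->curcid)
            return false;

        return true;
    }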
+ *
+ * To facilitate all this we need our own visibility routine, as the normal
+ * ones are optimized for different use cases.
+ *
+ * To replace the normal catalog snapshots with decoding ones use the
+ * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
+ *
+ *
+ *
+ * The snapbuild machinery starts up in several stages, as illustrated
+ * by the following graph:
+ *
+ *          +-------------------------+
+ *     +----|SNAPBUILD_START          |-------------+
+ *     |    +-------------------------+             |
+ *     |                |                           |
+ *     |                |                           |
+ *     |    running_xacts with running xacts        |
+ *     |                |                           |
+ *     |                |                           |
+ *     |                v                           |
+ *     |    +-------------------------+             v
+ *     |    |SNAPBUILD_FULL_SNAPSHOT  |------------>|
+ *     |    +-------------------------+             |
+ * running_xacts        |                    saved snapshot
+ * with zero xacts      |               at running_xacts's lsn
+ *     |                |                           |
+ *     |    all running toplevel TXNs finished      |
+ *     |                |                           |
+ *     |                v                           |
+ *     |    +-------------------------+             |
+ *     +--->|SNAPBUILD_CONSISTENT     |<------------+
+ *          +-------------------------+
+ *
+ * Initially the machinery is in the START stage. When an xl_running_xacts
+ * record is read that is sufficiently new (above the safe xmin horizon),
+ * there's a state transition. If there were no running xacts when the
+ * running_xacts record was generated, we'll directly go into CONSISTENT
+ * state, otherwise we'll switch to the FULL_SNAPSHOT state. Having a full
+ * snapshot means that all transactions that start henceforth can be decoded
+ * in their entirety, but transactions that started previously can't. In
+ * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
+ * running transactions have committed or aborted.
+ *
+ * Only transactions that commit after CONSISTENT state has been reached will
+ * be replayed, even though they might have started while still in
+ * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
+ * changes have been exported, but all the following ones will be. That makes
+ * it a convenient point to initialize replication from, which is why we
+ * export a snapshot at that point, which *can* be used to read normal data.
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/snapbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xact.h"
+
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/snapshot.h"
+#include "utils/snapmgr.h"
+#include "utils/tqual.h"
+
+#include "storage/block.h"		/* debugging output */
+#include "storage/fd.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/standby.h"
+
+/*
+ * This struct contains the current state of the snapshot building
+ * machinery. Besides a forward declaration in the header, it is not exposed
+ * to the public, so we can easily change its contents.
+ */
+struct SnapBuild
+{
+	/* how far are we along building our first full snapshot */
+	SnapBuildState state;
+
+	/* private memory context used to allocate memory for this module.
*/ + MemoryContext context; + + /* all transactions < than this have committed/aborted */ + TransactionId xmin; + + /* all transactions >= than this are uncommitted */ + TransactionId xmax; + + /* + * Don't replay commits from an LSN <= this LSN. This can be set + * externally but it will also be advanced (never retreat) from within + * snapbuild.c. + */ + XLogRecPtr transactions_after; + + /* + * Don't start decoding WAL until the "xl_running_xacts" information + * indicates there are no running xids with a xid smaller than this. + */ + TransactionId initial_xmin_horizon; + + /* + * Snapshot that's valid to see the catalog state seen at this moment. + */ + Snapshot snapshot; + + /* + * LSN of the last location we are sure a snapshot has been serialized to. + */ + XLogRecPtr last_serialized_snapshot; + + /* + * The reorderbuffer we need to update with usable snapshots et al. + */ + ReorderBuffer *reorder; + + /* + * Information about initially running transactions + * + * When we start building a snapshot there already may be transactions in + * progress. Those are stored in running.xip. We don't have enough + * information about those to decode their contents, so until they are + * finished (xcnt=0) we cannot switch to a CONSISTENT state. + */ + struct + { + /* + * As long as running.xcnt all XIDs < running.xmin and > running.xmax + * have to be checked whether they still are running. + */ + TransactionId xmin; + TransactionId xmax; + + size_t xcnt; /* number of used xip entries */ + size_t xcnt_space; /* allocated size of xip */ + TransactionId *xip; /* running xacts array, xidComparator-sorted */ + } running; + + /* + * Array of transactions which could have catalog changes that committed + * between xmin and xmax. + */ + struct + { + /* number of committed transactions */ + size_t xcnt; + + /* available space for committed transactions */ + size_t xcnt_space; + + /* + * Until we reach a CONSISTENT state, we record commits of all + * transactions, not just the catalog changing ones. Record when that + * changes so we know we cannot export a snapshot safely anymore. + */ + bool includes_all_transactions; + + /* + * Array of committed transactions that have modified the catalog. + * + * As this array is frequently modified we do *not* keep it in + * xidComparator order. Instead we sort the array when building & + * distributing a snapshot. + * + * TODO: It's unclear whether that reasoning has much merit. Every + * time we add something here after becoming consistent will also + * require distributing a snapshot. Storing them sorted would + * potentially also make it easier to purge (but more complicated wrt + * wraparound?). Should be improved if sorting while building the + * snapshot shows up in profiles. + */ + TransactionId *xip; + } committed; +}; + +/* + * Starting a transaction -- which we need to do while exporting a snapshot -- + * removes knowledge about the previously used resowner, so we save it here. 
+ */
+ResourceOwner SavedResourceOwnerDuringExport = NULL;
+bool		ExportInProgress = false;
+
+/* transaction state manipulation functions */
+static void SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid);
+
+/* ->running manipulation */
+static bool SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid);
+
+/* ->committed manipulation */
+static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);
+
+/* snapshot building/manipulation/distribution functions */
+static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid);
+
+static void SnapBuildFreeSnapshot(Snapshot snap);
+
+static void SnapBuildSnapIncRefcount(Snapshot snap);
+
+static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
+
+/* xlog reading helper functions for SnapBuildProcessRecord */
+static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
+
+/* serialization functions */
+static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
+static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
+
+
+/*
+ * Allocate a new snapshot builder.
+ *
+ * xmin_horizon is the xid >= which we can be sure no catalog rows have been
+ * removed, start_lsn is the LSN at or past which we want to replay commits.
+ */
+SnapBuild *
+AllocateSnapshotBuilder(ReorderBuffer *reorder,
+						TransactionId xmin_horizon,
+						XLogRecPtr start_lsn)
+{
+	MemoryContext context;
+	MemoryContext oldcontext;
+	SnapBuild  *builder;
+
+	/* allocate memory in own context, to have better accountability */
+	context = AllocSetContextCreate(CurrentMemoryContext,
+									"snapshot builder context",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+	oldcontext = MemoryContextSwitchTo(context);
+
+	builder = palloc0(sizeof(SnapBuild));
+
+	builder->state = SNAPBUILD_START;
+	builder->context = context;
+	builder->reorder = reorder;
+	/* Other struct members initialized by zeroing via palloc0 above */
+
+	builder->committed.xcnt = 0;
+	builder->committed.xcnt_space = 128;		/* arbitrary number */
+	builder->committed.xip =
+		palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
+	builder->committed.includes_all_transactions = true;
+
+	builder->initial_xmin_horizon = xmin_horizon;
+	builder->transactions_after = start_lsn;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return builder;
+}
+
+/*
+ * Free a snapshot builder.
+ */
+void
+FreeSnapshotBuilder(SnapBuild *builder)
+{
+	MemoryContext context = builder->context;
+
+	/* free snapshot explicitly, that contains some error checking */
+	if (builder->snapshot != NULL)
+	{
+		SnapBuildSnapDecRefcount(builder->snapshot);
+		builder->snapshot = NULL;
+	}
+
+	/* other resources are deallocated via memory context reset */
+	MemoryContextDelete(context);
+}
+
+/*
+ * Free an unreferenced snapshot that has previously been built by us.
+ */ +static void +SnapBuildFreeSnapshot(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + Assert(snap->regd_count == 1); + + /* slightly more likely, so it's checked even without c-asserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + if (snap->active_count) + elog(ERROR, "cannot free an active snapshot"); + + pfree(snap); +} + +/* + * In which state of snapshot building are we? + */ +SnapBuildState +SnapBuildCurrentState(SnapBuild *builder) +{ + return builder->state; +} + +/* + * Should the contents of transaction ending at 'ptr' be decoded? + */ +bool +SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr) +{ + return ptr <= builder->transactions_after; +} + +/* + * Increase refcount of a snapshot. + * + * This is used when handing out a snapshot to some external resource or when + * adding a Snapshot as builder->snapshot. + */ +static void +SnapBuildSnapIncRefcount(Snapshot snap) +{ + snap->active_count++; +} + +/* + * Decrease refcount of a snapshot and free if the refcount reaches zero. + * + * Externally visible, so that external resources that have been handed an + * IncRef'ed Snapshot can adjust its refcount easily. + */ +void +SnapBuildSnapDecRefcount(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + + Assert(snap->regd_count == 1); + + Assert(snap->active_count); + + /* slightly more likely, so its checked even without casserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + snap->active_count--; + if (!snap->active_count) + SnapBuildFreeSnapshot(snap); +} + +/* + * Build a new snapshot, based on currently committed catalog-modifying + * transactions. + * + * In-progress transactions with catalog access are *not* allowed to modify + * these snapshots; they have to copy them and fill in appropriate ->curcid + * and ->subxip/subxcnt values. + */ +static Snapshot +SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid) +{ + Snapshot snapshot; + Size ssize; + + Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT); + + ssize = sizeof(SnapshotData) + + sizeof(TransactionId) * builder->committed.xcnt + + sizeof(TransactionId) * 1 /* toplevel xid */ ; + + snapshot = MemoryContextAllocZero(builder->context, ssize); + + snapshot->satisfies = HeapTupleSatisfiesHistoricMVCC; + + /* + * We misuse the original meaning of SnapshotData's xip and subxip fields + * to make the more fitting for our needs. + * + * In the 'xip' array we store transactions that have to be treated as + * committed. Since we will only ever look at tuples from transactions + * that have modified the catalog its more efficient to store those few + * that exist between xmin and xmax (frequently there are none). + * + * Snapshots that are used in transactions that have modified the catalog + * also use the 'subxip' array to store their toplevel xid and all the + * subtransaction xids so we can recognize when we need to treat rows as + * visible that are not in xip but still need to be visible. 
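For illustration, the commit test implied by this xip/subxip scheme could look roughly like the sketch below. TransactionIdInArray() is the bsearch() helper defined in reorderbuffer.c and would have to be shared or duplicated for real use, the function name is made up, and the cmin handling for the transaction's own rows is elided.

    static bool
    XidVisibleInDecodingSnapshot(Snapshot snapshot, TransactionId xid)
    {
        /* the decoding transaction's own (sub)xids are listed in subxip */
        if (TransactionIdInArray(xid, snapshot->subxip, snapshot->subxcnt))
            return true;        /* visibility then further depends on cmin */

        /* older than the tracked (xmin, xmax) window: ask the clog */
        if (TransactionIdPrecedes(xid, snapshot->xmin))
            return TransactionIdDidCommit(xid);

        /*
         * Inside the window only catalog-modifying committers are recorded,
         * so the xid counts as committed exactly when it appears in the
         * sorted xip array; everything else is treated as aborted/in
         * progress.
         */
        return TransactionIdInArray(xid, snapshot->xip, snapshot->xcnt);
    }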
Subxip only + * gets filled when the transaction is copied into the context of a + * catalog modifying transaction since we otherwise share a snapshot + * between transactions. As long as a txn hasn't modified the catalog it + * doesn't need to treat any uncommitted rows as visible, so there is no + * need for those xids. + * + * Both arrays are qsort'ed so that we can use bsearch() on them. + */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + snapshot->xmin = builder->xmin; + snapshot->xmax = builder->xmax; + + /* store all transactions to be treated as committed by this snapshot */ + snapshot->xip = + (TransactionId *) ((char *) snapshot + sizeof(SnapshotData)); + snapshot->xcnt = builder->committed.xcnt; + memcpy(snapshot->xip, + builder->committed.xip, + builder->committed.xcnt * sizeof(TransactionId)); + + /* sort so we can bsearch() */ + qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator); + + /* + * Initially, subxip is empty, i.e. it's a snapshot to be used by + * transactions that don't modify the catalog. Will be filled by + * ReorderBufferCopySnap() if necessary. + */ + snapshot->subxcnt = 0; + snapshot->subxip = NULL; + + snapshot->suboverflowed = false; + snapshot->takenDuringRecovery = false; + snapshot->copied = false; + snapshot->curcid = FirstCommandId; + snapshot->active_count = 0; + snapshot->regd_count = 1; /* mark as registered so nobody frees it */ + + return snapshot; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then. + * + * After that we convert a locally built snapshot into the normal variant + * understood by HeapTupleSatisfiesMVCC et al. + */ +const char * +SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + TransactionId xid; + TransactionId *newxip; + int newxcnt = 0; + + if (builder->state != SNAPBUILD_CONSISTENT) + elog(ERROR, "cannot export a snapshot before reaching a consistent state"); + + if (!builder->committed.includes_all_transactions) + elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore"); + + /* so we don't overwrite the existing value */ + if (TransactionIdIsValid(MyPgXact->xmin)) + elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid"); + + if (IsTransactionOrTransactionBlock()) + elog(ERROR, "cannot export a snapshot from within a transaction"); + + if (SavedResourceOwnerDuringExport) + elog(ERROR, "can only export one snapshot at a time"); + + SavedResourceOwnerDuringExport = CurrentResourceOwner; + ExportInProgress = true; + + StartTransactionCommand(); + + Assert(!FirstSnapshotSet); + + /* There doesn't seem to a nice API to set these */ + XactIsoLevel = XACT_REPEATABLE_READ; + XactReadOnly = true; + + snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId()); + + /* + * We know that snap->xmin is alive, enforced by the logical xmin + * mechanism. Due to that we can do this without locks, we're only + * changing our own value. 
+ */ + MyPgXact->xmin = snap->xmin; + + /* allocate in transaction context */ + newxip = (TransactionId *) + palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount()); + + /* + * snapbuild.c builds transactions in an "inverted" manner, which means it + * stores committed transactions in ->xip, not ones in progress. Build a + * classical snapshot by marking all non-committed transactions as + * in-progress. This can be expensive. + */ + for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);) + { + void *test; + + /* + * Check whether transaction committed using the decoding snapshot + * meaning of ->xip. + */ + test = bsearch(&xid, snap->xip, snap->xcnt, + sizeof(TransactionId), xidComparator); + + if (test == NULL) + { + if (newxcnt >= GetMaxSnapshotXidCount()) + elog(ERROR, "snapshot too large"); + + newxip[newxcnt++] = xid; + } + + TransactionIdAdvance(xid); + } + + snap->xcnt = newxcnt; + snap->xip = newxip; + + /* + * now that we've built a plain snapshot, use the normal mechanisms for + * exporting it + */ + snapname = ExportSnapshot(snap); + + ereport(LOG, + (errmsg("exported logical decoding snapshot: \"%s\" with %u xids", + snapname, snap->xcnt))); + return snapname; +} + +/* + * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is + * any. Aborts the previously started transaction and resets the resource + * owner back to it's original value. + */ +void +SnapBuildClearExportedSnapshot() +{ + /* nothing exported, thats the usual case */ + if (!ExportInProgress) + return; + + if (!IsTransactionState()) + elog(ERROR, "clearing exported snapshot in wrong transaction state"); + + /* make sure nothing could have ever happened */ + AbortCurrentTransaction(); + + CurrentResourceOwner = SavedResourceOwnerDuringExport; + SavedResourceOwnerDuringExport = NULL; + ExportInProgress = false; +} + +/* + * Handle the effects of a single heap change, appropriate to the current state + * of the snapshot builder and returns whether changes made at (xid, lsn) can + * be decoded. + */ +bool +SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn) +{ + bool is_old_tx; + + /* + * We can't handle data in transactions if we haven't built a snapshot + * yet, so don't store them. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return false; + + /* + * No point in keeping track of changes in transactions that we don't have + * enough information about to decode. This means that they started before + * we got into the SNAPBUILD_FULL_SNAPSHOT state. + */ + if (builder->state < SNAPBUILD_CONSISTENT && + SnapBuildTxnIsRunning(builder, xid)) + return false; + + /* + * If the reorderbuffer doesn't yet have a snapshot, add one now, it will + * be needed to decode the change we're currently processing. + */ + is_old_tx = ReorderBufferIsXidKnown(builder->reorder, xid); + + if (!is_old_tx || !ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + /* only build a new snapshot if we don't have a prebuilt one */ + if (builder->snapshot == NULL) + { + builder->snapshot = SnapBuildBuildSnapshot(builder, xid); + /* inrease refcount for the snapshot builder */ + SnapBuildSnapIncRefcount(builder->snapshot); + } + + /* + * Increase refcount for the transaction we're handing the snapshot + * out to. + */ + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + return true; +} + +/* + * Do CommandId/ComboCid handling after reading a xl_heap_new_cid record. 
This + * implies that a transaction has done some form of write to system catalogs. + */ +void +SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, + XLogRecPtr lsn, xl_heap_new_cid *xlrec) +{ + CommandId cid; + + /* + * we only log new_cid's if a catalog tuple was modified, so mark + * the transaction as containing catalog modifications + */ + ReorderBufferXidSetCatalogChanges(builder->reorder, xid,lsn); + + ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn, + xlrec->target.node, xlrec->target.tid, + xlrec->cmin, xlrec->cmax, + xlrec->combocid); + + /* figure out new command id */ + if (xlrec->cmin != InvalidCommandId && + xlrec->cmax != InvalidCommandId) + cid = Max(xlrec->cmin, xlrec->cmax); + else if (xlrec->cmax != InvalidCommandId) + cid = xlrec->cmax; + else if (xlrec->cmin != InvalidCommandId) + cid = xlrec->cmin; + else + { + cid = InvalidCommandId; /* silence compiler */ + elog(ERROR, "xl_heap_new_cid record without a valid CommandId"); + } + + ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1); +} + +/* + * Check whether `xid` is currently 'running'. + * + * Running transactions in our parlance are transactions which we didn't + * observe from the start so we can't properly decode their contents. They + * only exist after we freshly started from an < CONSISTENT snapshot. + */ +static bool +SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid) +{ + Assert(builder->state < SNAPBUILD_CONSISTENT); + Assert(TransactionIdIsNormal(builder->running.xmin)); + Assert(TransactionIdIsNormal(builder->running.xmax)); + + if (builder->running.xcnt && + NormalTransactionIdFollows(xid, builder->running.xmin) && + NormalTransactionIdPrecedes(xid, builder->running.xmax)) + { + TransactionId *search = + bsearch(&xid, builder->running.xip, builder->running.xcnt_space, + sizeof(TransactionId), xidComparator); + + if (search != NULL) + { + Assert(*search == xid); + return true; + } + } + + return false; +} + +/* + * Add a new Snapshot to all transactions we're decoding that currently are + * in-progress so they can see new catalog contents made by the transaction + * that just committed. This is necessary because those in-progress + * transactions will use the new catalog's contents from here on (at the very + * least everything they do needs to be compatible with newer catalog + * contents). + */ +static void +SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn) +{ + dlist_iter txn_i; + ReorderBufferTXN *txn; + + /* + * Iterate through all toplevel transactions. This can include + * subtransactions which we just don't yet know to be that, but that's + * fine, they will just get an unneccesary snapshot queued. + */ + dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn) + { + txn = dlist_container(ReorderBufferTXN, node, txn_i.cur); + + Assert(TransactionIdIsValid(txn->xid)); + + /* + * If we don't have a base snapshot yet, there are no changes in this + * transaction which in turn implies we don't yet need a snapshot at + * all. We'll add add a snapshot when the first change gets queued. + * + * NB: This works correctly even for subtransactions because + * ReorderBufferCommitChild() takes care to pass the parent the base + * snapshot, and while iterating the changequeue we'll get the change + * from the subtxn. 
+ */
+		if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
+			continue;
+
+		elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
+			 txn->xid, (uint32) (lsn >> 32), (uint32) lsn);
+
+		/*
+		 * increase the snapshot's refcount for the transaction we are handing
+		 * it out to
+		 */
+		SnapBuildSnapIncRefcount(builder->snapshot);
+		ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
+								 builder->snapshot);
+	}
+}
+
+/*
+ * Keep track of a new catalog changing transaction that has committed.
+ */
+static void
+SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
+{
+	Assert(TransactionIdIsValid(xid));
+
+	if (builder->committed.xcnt == builder->committed.xcnt_space)
+	{
+		builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+
+		elog(DEBUG1, "increasing space for committed transactions to %u",
+			 (uint32) builder->committed.xcnt_space);
+
+		builder->committed.xip = repalloc(builder->committed.xip,
+					builder->committed.xcnt_space * sizeof(TransactionId));
+	}
+
+	/*
+	 * TODO: It might make sense to keep the array sorted here instead of
+	 * doing it every time we build a new snapshot. On the other hand this
+	 * gets called repeatedly when a transaction with subtransactions commits.
+	 */
+	builder->committed.xip[builder->committed.xcnt++] = xid;
+}
+
+/*
+ * Remove knowledge about transactions we treat as committed that are smaller
+ * than ->xmin. Those won't ever get checked via the ->committed array but via
+ * the clog machinery, so we don't need to waste memory on them.
+ */
+static void
+SnapBuildPurgeCommittedTxn(SnapBuild *builder)
+{
+	int			off;
+	TransactionId *workspace;
+	int			surviving_xids = 0;
+
+	/* not ready yet */
+	if (!TransactionIdIsNormal(builder->xmin))
+		return;
+
+	/* TODO: Neater algorithm than just copying and iterating? */
+	workspace =
+		MemoryContextAlloc(builder->context,
+						   builder->committed.xcnt * sizeof(TransactionId));
+
+	/* copy xids that still are interesting to workspace */
+	for (off = 0; off < builder->committed.xcnt; off++)
+	{
+		if (NormalTransactionIdPrecedes(builder->committed.xip[off],
+										builder->xmin))
+			;					/* remove */
+		else
+			workspace[surviving_xids++] = builder->committed.xip[off];
+	}
+
+	/* copy workspace back to persistent state */
+	memcpy(builder->committed.xip, workspace,
+		   surviving_xids * sizeof(TransactionId));
+
+	elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
+		 (uint32) builder->committed.xcnt, (uint32) surviving_xids,
+		 builder->xmin, builder->xmax);
+	builder->committed.xcnt = surviving_xids;
+
+	pfree(workspace);
+}
+
+/*
+ * Common logic for SnapBuildAbortTxn and SnapBuildCommitTxn dealing with
+ * keeping track of the number of running transactions.
+ */
+static void
+SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid)
+{
+	if (builder->state == SNAPBUILD_CONSISTENT)
+		return;
+
+	/*
+	 * NB: This handles subtransactions correctly even if we started from
+	 * suboverflowed xl_running_xacts because we only keep track of toplevel
+	 * transactions. Since the latter are always allocated before their
+	 * subxids and since they end at the same time it's sufficient to deal
+	 * with them here.
+	 */
+	if (SnapBuildTxnIsRunning(builder, xid))
+	{
+		Assert(builder->running.xcnt > 0);
+
+		if (!--builder->running.xcnt)
+		{
+			/*
+			 * None of the originally running transactions is running anymore,
+			 * so our incrementally built snapshot now is consistent.
+ */ + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + (uint32)(lsn >> 32), (uint32)lsn), + errdetail("xid %u finished, no running transactions anymore", + xid))); + builder->state = SNAPBUILD_CONSISTENT; + } + } +} + +/* + * Abort a transaction, throw away all state we kept. + */ +void +SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn, + TransactionId xid, + int nsubxacts, TransactionId *subxacts) +{ + int i; + + for (i = 0; i < nsubxacts; i++) + { + TransactionId subxid = subxacts[i]; + + SnapBuildEndTxn(builder, lsn, subxid); + } + + SnapBuildEndTxn(builder, lsn, xid); +} + +/* + * Handle everything that needs to be done when a transaction commits + */ +void +SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, + int nsubxacts, TransactionId *subxacts) +{ + int nxact; + + bool forced_timetravel = false; + bool sub_needs_timetravel = false; + bool top_needs_timetravel = false; + + TransactionId xmax = xid; + + /* + * If we couldn't observe every change of a transaction because it was + * already running at the point we started to observe we have to assume it + * made catalog changes. + * + * This has the positive benefit that we afterwards have enough + * information to build an exportable snapshot that's usable by pg_dump et + * al. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* ensure that only commits after this are getting replayed */ + if (builder->transactions_after < lsn) + builder->transactions_after = lsn; + + /* + * We could avoid treating !SnapBuildTxnIsRunning transactions as + * timetravel ones, but we want to be able to export a snapshot when + * we reached consistency. + */ + forced_timetravel = true; + elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running to early", xid); + } + + for (nxact = 0; nxact < nsubxacts; nxact++) + { + TransactionId subxid = subxacts[nxact]; + + /* + * make sure txn is not tracked in running txn's anymore, switch state + */ + SnapBuildEndTxn(builder, lsn, subxid); + + /* + * If we're forcing timetravel we also need visibility information + * about subtransaction, so keep track of subtransaction's state. + */ + if (forced_timetravel) + { + SnapBuildAddCommittedTxn(builder, subxid); + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + + /* + * Add subtransaction to base snapshot if it DDL, we don't distinguish + * to toplevel transactions there. + */ + else if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid)) + { + sub_needs_timetravel = true; + + elog(DEBUG1, "found subtransaction %u:%u with catalog changes.", + xid, subxid); + + SnapBuildAddCommittedTxn(builder, subxid); + + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + } + + /* + * Make sure toplevel txn is not tracked in running txn's anymore, switch + * state to consistent if possible. 
+ */ + SnapBuildEndTxn(builder, lsn, xid); + + if (forced_timetravel) + { + elog(DEBUG2, "forced transaction %u to do timetravel.", xid); + + SnapBuildAddCommittedTxn(builder, xid); + } + /* add toplevel transaction to base snapshot */ + else if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid)) + { + elog(DEBUG2, "found top level transaction %u, with catalog changes!", + xid); + + top_needs_timetravel = true; + SnapBuildAddCommittedTxn(builder, xid); + } + else if (sub_needs_timetravel) + { + /* mark toplevel txn as timetravel as well */ + SnapBuildAddCommittedTxn(builder, xid); + } + + /* if there's any reason to build a historic snapshot, to so now */ + if (forced_timetravel || top_needs_timetravel || sub_needs_timetravel) + { + /* + * Adjust xmax of the snapshot builder, we only do that for committed, + * catalog modifying, transactions, everything else isn't interesting + * for us since we'll never look at the respective rows. + */ + if (!TransactionIdIsValid(builder->xmax) || + TransactionIdFollowsOrEquals(xmax, builder->xmax)) + { + builder->xmax = xmax; + TransactionIdAdvance(builder->xmax); + } + + /* + * If we haven't built a complete snapshot yet there's no need to hand + * it out, it wouldn't (and couldn't) be used anyway. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return; + + /* + * Decrease the snapshot builder's refcount of the old snapshot, note + * that it still will be used if it has been handed out to the + * reorderbuffer earlier. + */ + if (builder->snapshot) + SnapBuildSnapDecRefcount(builder->snapshot); + + builder->snapshot = SnapBuildBuildSnapshot(builder, xid); + + /* we might need to execute invalidations, add snapshot */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + /* refcount of the snapshot builder for the new snapshot */ + SnapBuildSnapIncRefcount(builder->snapshot); + + /* add a new SnapshotNow to all currently running transactions */ + SnapBuildDistributeNewCatalogSnapshot(builder, lsn); + } + else + { + /* record that we cannot export a general snapshot anymore */ + builder->committed.includes_all_transactions = false; + } +} + + +/* ----------------------------------- + * Snapshot building functions dealing with xlog records + * ----------------------------------- + */ + +/* + * Process a running xacts record, and use it's information to first build a + * historic snapshot and later to release resources that aren't needed + * anymore. + */ +void +SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + ReorderBufferTXN *txn; + + /* + * If we're not consistent yet, inspect the record to see whether it + * allows to get closer to being consistent. If we are consistent, dump + * our snapshot so others or we, after a restart, can use it. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* returns false if there's no point in performing cleanup just yet */ + if (!SnapBuildFindSnapshot(builder, lsn, running)) + return; + } + else + SnapBuildSerialize(builder, lsn); + + /* + * Update range of interesting xids base don the running xacts + * information. We don't increase ->xmax using it, because once we are in + * a consistent state we can do that ourselves and much more efficiently + * so, because we only need to do it for catalog transactions since we + * only ever look at those. 
+ *
+ * NB: Because of that xmax can be lower than xmin, because we only
+ * increase xmax when a catalog modifying transaction commits. While odd
+ * looking, it's correct and actually more efficient this way since we hit
+ * fast paths in tqual.c.
+ */
+	builder->xmin = running->oldestRunningXid;
+
+	/* Remove transactions we don't need to keep track of anymore */
+	SnapBuildPurgeCommittedTxn(builder);
+
+	elog(DEBUG3, "xmin: %u, xmax: %u, oldestrunning: %u",
+		 builder->xmin, builder->xmax,
+		 running->oldestRunningXid);
+
+	/*
+	 * Increase shared memory limits, so vacuum can work on tuples we
+	 * prevented from being pruned till now.
+	 */
+	LogicalIncreaseXminForSlot(lsn, running->oldestRunningXid);
+
+	/*
+	 * Also tell the slot where we can restart decoding from. We don't want to
+	 * do that after every commit because changing that implies an fsync of
+	 * the logical slot's state file, so we only do it every time we see a
+	 * running xacts record.
+	 *
+	 * Do so by looking for the oldest in progress transaction (determined by
+	 * the first LSN of any of its relevant records). Every transaction
+	 * remembers the last location we stored the snapshot to disk before its
+	 * beginning. That point is where we can restart from.
+	 */
+
+	/*
+	 * Can't know about a serialized snapshot's location if we're not
+	 * consistent.
+	 */
+	if (builder->state < SNAPBUILD_CONSISTENT)
+		return;
+
+	txn = ReorderBufferGetOldestTXN(builder->reorder);
+
+	/*
+	 * oldest ongoing txn might have started when we didn't yet serialize
+	 * anything because we hadn't reached a consistent state yet.
+	 */
+	if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
+		LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
+	/*
+	 * No in-progress transaction, can reuse the last serialized snapshot if
+	 * we have one.
+	 */
+	else if (txn == NULL &&
+			 builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
+			 builder->last_serialized_snapshot != InvalidXLogRecPtr)
+		LogicalIncreaseRestartDecodingForSlot(lsn,
+										   builder->last_serialized_snapshot);
+}
+
+
+/*
+ * Build the start of a snapshot that's capable of decoding the catalog.
+ *
+ * Helper function for SnapBuildProcessRunningXacts() while we're not yet
+ * consistent.
+ *
+ * Returns true if there is a point in performing internal maintenance/cleanup
+ * using the xl_running_xacts record.
+ */
+static bool
+SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+	/* ---
+	 * Build catalog decoding snapshot incrementally using information about
+	 * the currently running transactions. There are several ways to do that:
+	 *
+	 * a) There were no running transactions when the xl_running_xacts record
+	 *	  was inserted, jump to CONSISTENT immediately. We might find such a
+	 *	  state while waiting for b) or c).
+	 *
+	 * b) Wait for all toplevel transactions that were running to end. We
+	 *	  simply track the number of in-progress toplevel transactions and
+	 *	  lower it whenever one commits or aborts. When that number
+	 *	  (builder->running.xcnt) reaches zero, we can go from FULL_SNAPSHOT
+	 *	  to CONSISTENT.
+	 *	  NB: We need to search running.xip when seeing a transaction's end to
+	 *	  make sure it's a toplevel transaction and it's been one of the
+	 *	  initially running ones.
+	 *	  Interestingly, in contrast to HS, this allows us not to care about
+	 *	  subtransactions - and by extension suboverflowed xl_running_xacts -
+	 *	  at all.
+	 *
+	 * c) This (in a previous run) or another decoding slot serialized a
+	 *	  snapshot to disk that we can use.
+	 * ---
+	 */
+
+	/*
+	 * xl_running_xact record is older than what we can use, we might not have
+	 * all necessary catalog rows anymore.
+	 */
+	if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
+		NormalTransactionIdPrecedes(running->oldestRunningXid,
+									builder->initial_xmin_horizon))
+	{
+		ereport(DEBUG1,
+				(errmsg("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
+						(uint32) (lsn >> 32), (uint32) lsn),
+				 errdetail("initial xmin horizon of %u vs the snapshot's %u",
+						   builder->initial_xmin_horizon, running->oldestRunningXid)));
+		return true;
+	}
+
+	/*
+	 * a) No transactions were running, we can jump to consistent.
+	 *
+	 * NB: We might have already started to incrementally assemble a snapshot,
+	 * so we need to be careful to deal with that.
+	 */
+	if (running->xcnt == 0)
+	{
+		if (builder->transactions_after == InvalidXLogRecPtr ||
+			builder->transactions_after < lsn)
+			builder->transactions_after = lsn;
+
+		builder->xmin = running->oldestRunningXid;
+		builder->xmax = running->latestCompletedXid;
+		TransactionIdAdvance(builder->xmax);
+
+		Assert(TransactionIdIsNormal(builder->xmin));
+		Assert(TransactionIdIsNormal(builder->xmax));
+
+		/* no transactions running now */
+		builder->running.xcnt = 0;
+		builder->running.xmin = InvalidTransactionId;
+		builder->running.xmax = InvalidTransactionId;
+
+		builder->state = SNAPBUILD_CONSISTENT;
+
+		ereport(LOG,
+				(errmsg("logical decoding found consistent point at %X/%X",
+						(uint32) (lsn >> 32), (uint32) lsn),
+				 errdetail("running xacts with xcnt == 0")));
+
+		return false;
+	}
+	/* c) valid on disk state */
+	else if (SnapBuildRestore(builder, lsn))
+	{
+		/* there won't be any state to clean up */
+		return false;
+	}
+	/*
+	 * b) first encounter of a usable xl_running_xacts record. If we had
+	 * found one earlier we would either track running transactions
+	 * (i.e. builder->running.xcnt != 0) or be consistent (this function
+	 * wouldn't get called).
+	 */
+	else if (!builder->running.xcnt)
+	{
+		int			off;
+
+		/*
+		 * We only care about toplevel xids as those are the ones we
+		 * definitely see in the wal stream. As snapbuild.c tracks committed
+		 * instead of running transactions we don't need to know anything
+		 * about uncommitted subtransactions.
+ */ + builder->xmin = running->oldestRunningXid; + builder->xmax = running->latestCompletedXid; + TransactionIdAdvance(builder->xmax); + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + builder->running.xcnt = running->xcnt; + builder->running.xcnt_space = running->xcnt; + builder->running.xip = + MemoryContextAlloc(builder->context, + builder->running.xcnt * sizeof(TransactionId)); + memcpy(builder->running.xip, running->xids, + builder->running.xcnt * sizeof(TransactionId)); + + /* sort so we can do a binary search */ + qsort(builder->running.xip, builder->running.xcnt, + sizeof(TransactionId), xidComparator); + + builder->running.xmin = builder->running.xip[0]; + builder->running.xmax = builder->running.xip[running->xcnt - 1]; + + /* makes comparisons cheaper later */ + TransactionIdRetreat(builder->running.xmin); + TransactionIdAdvance(builder->running.xmax); + + builder->state = SNAPBUILD_FULL_SNAPSHOT; + + ereport(LOG, + (errmsg("logical decoding found initial starting point at %X/%X", + (uint32)(lsn >> 32), (uint32)lsn), + errdetail("%u xacts need to finish", (uint32) builder->running.xcnt))); + + /* + * Iterate through all xids, wait for them to finish. + * + * This isn't required for the correctness of decoding, but to allow + * isolationtester to notice that we're currently waiting for + * something. + */ + for(off = 0; off < builder->running.xcnt; off++) + { + TransactionId xid = builder->running.xip[off]; + + /* + * Upper layers should prevent that we ever need to wait on + * ourselves. Check anyway, since failing to do so would either + * result in an endless wait or an Assert() failure. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + elog(ERROR, "waiting for ourselves"); + + XactLockTableWait(xid); + } + + /* nothing could have built up so far, so don't perform cleanup */ + return false; + } + + /* + * We already started to track running xacts and need to wait for all + * in-progress ones to finish. We fall through to the normal processing of + * records so incremental cleanup can be performed. + */ + return true; +} + + +/* ----------------------------------- + * Snapshot serialization support + * ----------------------------------- + */ + +/* + * We store current state of struct SnapBuild on disk in the following manner: + * + * struct SnapBuildOnDisk; + * TransactionId * running.xcnt_space; + * TransactionId * committed.xcnt; (*not xcnt_space*) + * + */ +typedef struct SnapBuildOnDisk +{ + /* first part of this struct needs to be version independent */ + + /* data not covered by checksum */ + uint32 magic; + pg_crc32 checksum; + + /* data covered by checksum */ + + /* version, in case we want to support pg_upgrade */ + uint32 version; + /* how large is the on disk data, excluding the constant sized part */ + uint32 length; + + /* version dependent part */ + SnapBuild builder; + + /* variable amount of TransactionIds follows */ +} SnapBuildOnDisk; + +#define SnapBuildOnDiskConstantSize \ + offsetof(SnapBuildOnDisk, builder) +#define SnapBuildOnDiskNotChecksummedSize \ + offsetof(SnapBuildOnDisk, version) + +#define SNAPBUILD_MAGIC 0x51A1E001 +#define SNAPBUILD_VERSION 1 + +/* + * Store/Load a snapshot from disk, depending on the snapshot builder's state. + * + * Supposed to be used by external (i.e. not snapbuild.c) code that just reada + * record that's a potential location for a serialized snapshot. 
+ */ +void +SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn) +{ + if (builder->state < SNAPBUILD_CONSISTENT) + SnapBuildRestore(builder, lsn); + else + SnapBuildSerialize(builder, lsn); +} + +/* + * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already + * been done by another decoding process. + */ +static void +SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) +{ + Size needed_length; + SnapBuildOnDisk *ondisk; + char *ondisk_c; + int fd; + char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + int ret; + struct stat stat_buf; + uint32 sz; + + Assert(lsn != InvalidXLogRecPtr); + Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr || + builder->last_serialized_snapshot <= lsn); + + /* + * no point in serializing if we cannot continue to work immediately after + * restoring the snapshot + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + /* + * We identify snapshots by the LSN they are valid for. We don't need to + * include timelines in the name as each LSN maps to exactly one timeline + * unless the user used pg_resetxlog or similar. If a user did so, there's + * no hope continuing to decode anyway. + */ + sprintf(path, "pg_llog/snapshots/%X-%X.snap", + (uint32) (lsn >> 32), (uint32) lsn); + + /* + * first check whether some other backend already has written the snapshot + * for this LSN. It's perfectly fine if there's none, so we accept ENOENT + * as a valid state. Everything else is an unexpected error. + */ + ret = stat(path, &stat_buf); + + if (ret != 0 && errno != ENOENT) + ereport(ERROR, + (errmsg("could not stat file \"%s\": %m", path))); + + else if (ret == 0) + { + /* + * somebody else has already serialized to this point, don't overwrite + * but remember location, so we don't need to read old data again. + * + * To be sure it has been synced to disk after the rename() from the + * tempfile filename to the real filename, we just repeat the + * fsync. That ought to be cheap because in most scenarios it should + * already be safely on disk. + */ + fsync_fname(path, false); + fsync_fname("pg_llog/snapshots", true); + + builder->last_serialized_snapshot = lsn; + goto out; + } + + /* + * there is an obvious race condition here between the time we stat(2) the + * file and us writing the file. But we rename the file into place + * atomically and all files created need to contain the same data anyway, + * so this is perfectly fine, although a bit of a resource waste. Locking + * seems like pointless complication. + */ + elog(DEBUG1, "serializing snapshot to %s", path); + + /* to make sure only we will write to this tempfile, include pid */ + sprintf(tmppath, "pg_llog/snapshots/%X-%X.snap.%u.tmp", + (uint32) (lsn >> 32), (uint32) lsn, MyProcPid); + + /* + * Unlink temporary file if it already exists, needs to have been before a + * crash/error since we won't enter this function twice from within a + * single decoding slot/backend and the temporary file contains the pid of + * the current process. 
+ */
+    if (unlink(tmppath) != 0 && errno != ENOENT)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not unlink file \"%s\": %m", tmppath)));
+
+    needed_length = sizeof(SnapBuildOnDisk) +
+        sizeof(TransactionId) * builder->running.xcnt_space +
+        sizeof(TransactionId) * builder->committed.xcnt;
+
+    ondisk_c = MemoryContextAllocZero(builder->context, needed_length);
+    ondisk = (SnapBuildOnDisk *) ondisk_c;
+    ondisk->magic = SNAPBUILD_MAGIC;
+    ondisk->version = SNAPBUILD_VERSION;
+    ondisk->length = needed_length;
+    INIT_CRC32(ondisk->checksum);
+    COMP_CRC32(ondisk->checksum,
+               ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
+               SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
+    ondisk_c += sizeof(SnapBuildOnDisk);
+
+    memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
+    /* NULL-ify memory-only data */
+    ondisk->builder.context = NULL;
+    ondisk->builder.snapshot = NULL;
+    ondisk->builder.reorder = NULL;
+    ondisk->builder.running.xip = NULL;
+    ondisk->builder.committed.xip = NULL;
+
+    COMP_CRC32(ondisk->checksum,
+               &ondisk->builder,
+               sizeof(SnapBuild));
+
+    /* copy running xacts */
+    sz = sizeof(TransactionId) * builder->running.xcnt_space;
+    memcpy(ondisk_c, builder->running.xip, sz);
+    COMP_CRC32(ondisk->checksum, ondisk_c, sz);
+    ondisk_c += sz;
+
+    /* copy committed xacts */
+    sz = sizeof(TransactionId) * builder->committed.xcnt;
+    memcpy(ondisk_c, builder->committed.xip, sz);
+    COMP_CRC32(ondisk->checksum, ondisk_c, sz);
+    ondisk_c += sz;
+
+    /* we have valid data now, open tempfile and write it there */
+    fd = OpenTransientFile(tmppath,
+                           O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+                           S_IRUSR | S_IWUSR);
+    if (fd < 0)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not open file \"%s\": %m", tmppath)));
+
+    if ((write(fd, ondisk, needed_length)) != needed_length)
+    {
+        CloseTransientFile(fd);
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not write to file \"%s\": %m", tmppath)));
+    }
+
+    /*
+     * fsync the file before renaming so that even if we crash after this we
+     * have either a fully valid file or nothing.
+     *
+     * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
+     * some noticeable overhead since it's performed synchronously during
+     * decoding?
+     */
+    if (pg_fsync(fd) != 0)
+    {
+        CloseTransientFile(fd);
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not fsync file \"%s\": %m", tmppath)));
+    }
+    CloseTransientFile(fd);
+
+    fsync_fname("pg_llog/snapshots", true);
+
+    /*
+     * We may overwrite the work from some other backend, but that's ok, our
+     * snapshot is valid as well, we'll just have done some superfluous work.
+     */
+    if (rename(tmppath, path) != 0)
+    {
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not rename file \"%s\" to \"%s\": %m",
+                        tmppath, path)));
+    }
+
+    /* make sure we persist */
+    fsync_fname(path, false);
+    fsync_fname("pg_llog/snapshots", true);
+
+    /*
+     * Now there's no way we can lose the dumped state anymore, remember this
+     * as a serialization point.
+     */
+    builder->last_serialized_snapshot = lsn;
+
+out:
+    ReorderBufferSetRestartPoint(builder->reorder,
+                                 builder->last_serialized_snapshot);
+}
+
+/*
+ * Restore a snapshot into 'builder' if one has previously been stored at the
+ * location indicated by 'lsn'. Returns true if successful, false otherwise.
+ */ +static bool +SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) +{ + SnapBuildOnDisk ondisk; + int fd; + char path[MAXPGPATH]; + Size sz; + int readBytes; + pg_crc32 checksum; + + /* no point in loading a snapshot if we're already there */ + if (builder->state == SNAPBUILD_CONSISTENT) + return false; + + sprintf(path, "pg_llog/snapshots/%X-%X.snap", + (uint32) (lsn >> 32), (uint32) lsn); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); + + if (fd < 0 && errno == ENOENT) + return false; + else if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* ---- + * Make sure the snapshot had been stored safely to disk, that's normally + * cheap. + * Note that we do not need PANIC here, nobody will be able to use the + * slot without fsyncing, and saving it won't suceed without an fsync() + * either... + * ---- + */ + fsync_fname(path, false); + fsync_fname("pg_llog/snapshots", true); + + + /* read statically sized portion of snapshot */ + readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize); + if (readBytes != SnapBuildOnDiskConstantSize) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) SnapBuildOnDiskConstantSize))); + } + + if (ondisk.magic != SNAPBUILD_MAGIC) + ereport(ERROR, + (errmsg("snapbuild state file \"%s\" has wrong magic %u instead of %u", + path, ondisk.magic, SNAPBUILD_MAGIC))); + + if (ondisk.version != SNAPBUILD_VERSION) + ereport(ERROR, + (errmsg("snapbuild state file \"%s\" has unsupported version %u instead of %u", + path, ondisk.version, SNAPBUILD_VERSION))); + + INIT_CRC32(checksum); + COMP_CRC32(checksum, + ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + + /* read SnapBuild */ + readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild)); + if (readBytes != sizeof(SnapBuild)) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) sizeof(SnapBuild)))); + } + COMP_CRC32(checksum, &ondisk.builder, sizeof(SnapBuild)); + + /* restore running xacts information */ + sz = sizeof(TransactionId) * ondisk.builder.running.xcnt_space; + ondisk.builder.running.xip = MemoryContextAlloc(builder->context, sz); + readBytes = read(fd, ondisk.builder.running.xip, sz); + if (readBytes != sz) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) sz))); + } + COMP_CRC32(checksum, ondisk.builder.running.xip, sz); + + /* restore committed xacts information */ + sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt; + ondisk.builder.committed.xip = MemoryContextAlloc(builder->context, sz); + readBytes = read(fd, ondisk.builder.committed.xip, sz); + if (readBytes != sz) + { + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\", read %d of %d: %m", + path, readBytes, (int) sz))); + } + COMP_CRC32(checksum, ondisk.builder.committed.xip, sz); + + CloseTransientFile(fd); + + /* verify checksum of what we've read */ + if (!EQ_CRC32(checksum, ondisk.checksum)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("snapbuild state file %s: checksum mismatch, is %u, should be %u", + path, checksum, ondisk.checksum))); + + /* + * ok, we now have a sensible snapshot here, 
figure out if it has more + * information than we have. + */ + + /* + * We are only interested in consistent snapshots for now, comparing + * whether one imcomplete snapshot is more "advanced" seems to be + * unnecessarily complex. + */ + if (ondisk.builder.state < SNAPBUILD_CONSISTENT) + goto snapshot_not_interesting; + + /* + * Don't use a snapshot that requires an xmin that we cannot guarantee to + * be available. + */ + if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon)) + goto snapshot_not_interesting; + + + /* ok, we think the snapshot is sensible, copy over everything important */ + builder->xmin = ondisk.builder.xmin; + builder->xmax = ondisk.builder.xmax; + builder->state = ondisk.builder.state; + + builder->committed.xcnt = ondisk.builder.committed.xcnt; + /* We only allocated/stored xcnt, not xcnt_space xids ! */ + /* don't overwrite preallocated xip, if we don't have anything here */ + if (builder->committed.xcnt > 0) + { + pfree(builder->committed.xip); + builder->committed.xcnt_space = ondisk.builder.committed.xcnt; + builder->committed.xip = ondisk.builder.committed.xip; + } + ondisk.builder.committed.xip = NULL; + + builder->running.xcnt = ondisk.builder.committed.xcnt; + if (builder->running.xip) + pfree(builder->running.xip); + builder->running.xcnt_space = ondisk.builder.committed.xcnt_space; + builder->running.xip = ondisk.builder.running.xip; + + /* our snapshot is not interesting anymore, build a new one */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + } + builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId); + SnapBuildSnapIncRefcount(builder->snapshot); + + ReorderBufferSetRestartPoint(builder->reorder, lsn); + + Assert(builder->state == SNAPBUILD_CONSISTENT); + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + (uint32)(lsn >> 32), (uint32)lsn), + errdetail("found initial snapshot in snapbuild file"))); + return true; + +snapshot_not_interesting: + if (ondisk.builder.running.xip != NULL) + pfree(ondisk.builder.running.xip); + if (ondisk.builder.committed.xip != NULL) + pfree(ondisk.builder.committed.xip); + return false; +} + +/* + * Remove all serialized snapshots that are not required anymore because no + * slot can need them. This doesn't actually have to run during a checkpoint, + * but it's a convenient point to schedule this. + * + * NB: We run this during checkpoints even if logical decoding is disabled so + * we cleanup old slots at some point after it got disabled. + */ +void +CheckPointSnapBuild(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *snap_dir; + struct dirent *snap_de; + char path[MAXPGPATH]; + + /* + * We start of with a minimum of the last redo pointer. No new replication + * slot will start before that, so that's a safe upper bound for removal. 
+ */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (redo < cutoff) + cutoff = redo; + + snap_dir = AllocateDir("pg_llog/snapshots"); + while ((snap_de = ReadDir(snap_dir, "pg_llog/snapshots")) != NULL) + { + uint32 hi; + uint32 lo; + XLogRecPtr lsn; + struct stat statbuf; + + if (strcmp(snap_de->d_name, ".") == 0 || + strcmp(snap_de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_llog/snapshots/%s", snap_de->d_name); + + if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + { + elog(DEBUG1, "only regular files expected: %s", path); + continue; + } + + /* + * temporary filenames from SnapBuildSerialize() include the LSN and + * everything but are postfixed by .$pid.tmp. We can just remove them + * the same as other files because there can be none that are currently + * being written that are older than cutoff. + * + * We just log a message if a file doesn't fit the pattern, it's + * probably some editors lock/state file or similar... + */ + if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse filename \"%s\"", path))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + + /* check whether we still need it */ + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing snapbuild snapshot %s", path); + + /* + * It's not particularly harmful, though strange, if we can't + * remove the file here. Don't prevent the checkpoint from + * completing, that'd be cure worse than the disease. + */ + if (unlink(path) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", + path))); + continue; + } + } + } + FreeDir(snap_dir); +} diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 826c7f027e5..45ed7e40e89 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -43,6 +43,7 @@ #include "miscadmin.h" #include "replication/slot.h" #include "storage/fd.h" +#include "storage/proc.h" #include "storage/procarray.h" /* @@ -82,6 +83,8 @@ ReplicationSlot *MyReplicationSlot = NULL; /* GUCs */ int max_replication_slots = 0; /* the maximum number of replication slots */ +static void ReplicationSlotDropAcquired(void); + /* internal persistency functions */ static void RestoreSlotFromDisk(const char *name); static void CreateSlotOnDisk(ReplicationSlot *slot); @@ -190,11 +193,12 @@ ReplicationSlotValidateName(const char *name, int elevel) * Create a new replication slot and mark it as used by this backend. * * name: Name of the slot - * db_specific: changeset extraction is db specific, if the slot is going to + * db_specific: logical decoding is db specific; if the slot is going to * be used for that pass true, otherwise false. */ void -ReplicationSlotCreate(const char *name, bool db_specific) +ReplicationSlotCreate(const char *name, bool db_specific, + ReplicationSlotPersistency persistency) { ReplicationSlot *slot = NULL; int i; @@ -246,6 +250,7 @@ ReplicationSlotCreate(const char *name, bool db_specific) */ Assert(!slot->in_use); Assert(!slot->active); + slot->data.persistency = persistency; slot->data.xmin = InvalidTransactionId; slot->effective_xmin = InvalidTransactionId; strncpy(NameStr(slot->data.name), name, NAMEDATALEN); @@ -348,14 +353,30 @@ ReplicationSlotRelease(void) Assert(slot != NULL && slot->active); - /* Mark slot inactive. 
We're not freeing it, just disconnecting. */ + if (slot->data.persistency == RS_EPHEMERAL) + { + /* + * Delete the slot. There is no !PANIC case where this is allowed to + * fail, all that may happen is an incomplete cleanup of the on-disk + * data. + */ + ReplicationSlotDropAcquired(); + } + else { + /* Mark slot inactive. We're not freeing it, just disconnecting. */ volatile ReplicationSlot *vslot = slot; SpinLockAcquire(&slot->mutex); vslot->active = false; SpinLockRelease(&slot->mutex); - MyReplicationSlot = NULL; } + + MyReplicationSlot = NULL; + + /* might not have been set when we've been a plain slot */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyPgXact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING; + LWLockRelease(ProcArrayLock); } /* @@ -364,52 +385,36 @@ ReplicationSlotRelease(void) void ReplicationSlotDrop(const char *name) { - ReplicationSlot *slot = NULL; - int i; - bool active; + Assert(MyReplicationSlot == NULL); + + ReplicationSlotAcquire(name); + + ReplicationSlotDropAcquired(); +} + +/* + * Permanently drop the currently acquired replication slot which will be + * released by the point this function returns. + */ +static void +ReplicationSlotDropAcquired(void) +{ char path[MAXPGPATH]; char tmppath[MAXPGPATH]; + ReplicationSlot *slot = MyReplicationSlot; - ReplicationSlotValidateName(name, ERROR); + Assert(MyReplicationSlot != NULL); + + /* slot isn't acquired anymore */ + MyReplicationSlot = NULL; /* - * If some other backend ran this code currently with us, we might both - * try to free the same slot at the same time. Or we might try to delete - * a slot with a certain name while someone else was trying to create a - * slot with the same name. + * If some other backend ran this code concurrently with us, we might try + * to delete a slot with a certain name while someone else was trying to + * create a slot with the same name. */ LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); - /* Search for the named slot and mark it active if we find it. */ - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); - for (i = 0; i < max_replication_slots; i++) - { - ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; - - if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) - { - volatile ReplicationSlot *vslot = s; - - SpinLockAcquire(&s->mutex); - active = vslot->active; - vslot->active = true; - SpinLockRelease(&s->mutex); - slot = s; - break; - } - } - LWLockRelease(ReplicationSlotControlLock); - - /* If we did not find the slot or it was already active, error out. */ - if (slot == NULL) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("replication slot \"%s\" does not exist", name))); - if (active) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("replication slot \"%s\" is already active", name))); - /* Generate pathnames. */ sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); @@ -417,35 +422,41 @@ ReplicationSlotDrop(const char *name) /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the - * slot inactive again before bailing out. + * slot inactive before bailing out. If we're dropping a ephemeral slot, + * we better never fail hard as the caller won't expect the slot to + * survive and this might get called during error handling. 
*/ - if (rename(path, tmppath) != 0) + if (rename(path, tmppath) == 0) + { + /* + * We need to fsync() the directory we just renamed and its parent to + * make sure that our changes are on disk in a crash-safe fashion. If + * fsync() fails, we can't be sure whether the changes are on disk or + * not. For now, we handle that by panicking; + * StartupReplicationSlots() will try to straighten it out after + * restart. + */ + START_CRIT_SECTION(); + fsync_fname(tmppath, true); + fsync_fname("pg_replslot", true); + END_CRIT_SECTION(); + } + else { volatile ReplicationSlot *vslot = slot; + bool fail_softly = slot->data.persistency == RS_EPHEMERAL; SpinLockAcquire(&slot->mutex); vslot->active = false; SpinLockRelease(&slot->mutex); - ereport(ERROR, + ereport(fail_softly ? WARNING : ERROR, (errcode_for_file_access(), errmsg("could not rename \"%s\" to \"%s\": %m", path, tmppath))); } /* - * We need to fsync() the directory we just renamed and its parent to make - * sure that our changes are on disk in a crash-safe fashion. If fsync() - * fails, we can't be sure whether the changes are on disk or not. For - * now, we handle that by panicking; StartupReplicationSlots() will - * try to straighten it out after restart. - */ - START_CRIT_SECTION(); - fsync_fname(tmppath, true); - fsync_fname("pg_replslot", true); - END_CRIT_SECTION(); - - /* * The slot is definitely gone. Lock out concurrent scans of the array * long enough to kill it. It's OK to clear the active flag here without * grabbing the mutex because nobody else can be scanning the array here, @@ -461,7 +472,7 @@ ReplicationSlotDrop(const char *name) * Slot is dead and doesn't prevent resource removal anymore, recompute * limits. */ - ReplicationSlotsComputeRequiredXmin(); + ReplicationSlotsComputeRequiredXmin(false); ReplicationSlotsComputeRequiredLSN(); /* @@ -519,21 +530,49 @@ ReplicationSlotMarkDirty(void) } /* + * Convert a slot that's marked as RS_DROP_ON_ERROR to a RS_PERSISTENT slot, + * guaranteeing it will be there after a eventual crash. + */ +void +ReplicationSlotPersist(void) +{ + ReplicationSlot *slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(slot->data.persistency != RS_PERSISTENT); + + { + volatile ReplicationSlot *vslot = slot; + + SpinLockAcquire(&slot->mutex); + vslot->data.persistency = RS_PERSISTENT; + SpinLockRelease(&slot->mutex); + } + + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + +/* * Compute the oldest xmin across all slots and store it in the ProcArray. 
*/ void -ReplicationSlotsComputeRequiredXmin(void) +ReplicationSlotsComputeRequiredXmin(bool already_locked) { int i; TransactionId agg_xmin = InvalidTransactionId; + TransactionId agg_catalog_xmin = InvalidTransactionId; Assert(ReplicationSlotCtl != NULL); - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + if (!already_locked) + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; TransactionId effective_xmin; + TransactionId effective_catalog_xmin; if (!s->in_use) continue; @@ -543,6 +582,7 @@ ReplicationSlotsComputeRequiredXmin(void) SpinLockAcquire(&s->mutex); effective_xmin = vslot->effective_xmin; + effective_catalog_xmin = vslot->effective_catalog_xmin; SpinLockRelease(&s->mutex); } @@ -551,10 +591,18 @@ ReplicationSlotsComputeRequiredXmin(void) (!TransactionIdIsValid(agg_xmin) || TransactionIdPrecedes(effective_xmin, agg_xmin))) agg_xmin = effective_xmin; + + /* check the catalog xmin */ + if (TransactionIdIsValid(effective_catalog_xmin) && + (!TransactionIdIsValid(agg_catalog_xmin) || + TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin))) + agg_catalog_xmin = effective_catalog_xmin; } - LWLockRelease(ReplicationSlotControlLock); - ProcArraySetReplicationSlotXmin(agg_xmin); + if (!already_locked) + LWLockRelease(ReplicationSlotControlLock); + + ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked); } /* @@ -596,6 +644,110 @@ ReplicationSlotsComputeRequiredLSN(void) } /* + * Compute the oldest WAL LSN required by *logical* decoding slots.. + * + * Returns InvalidXLogRecPtr if logical decoding is disabled or no logicals + * slots exist. + * + * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it + * ignores physical replication slots. + * + * The results aren't required frequently, so we don't maintain a precomputed + * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin(). + */ +XLogRecPtr +ReplicationSlotsComputeLogicalRestartLSN(void) +{ + XLogRecPtr result = InvalidXLogRecPtr; + int i; + + if (max_replication_slots <= 0) + return InvalidXLogRecPtr; + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + volatile ReplicationSlot *s; + XLogRecPtr restart_lsn; + + s = &ReplicationSlotCtl->replication_slots[i]; + + /* cannot change while ReplicationSlotCtlLock is held */ + if (!s->in_use) + continue; + + /* we're only interested in logical slots */ + if (s->data.database == InvalidOid) + continue; + + /* read once, it's ok if it increases while we're checking */ + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + SpinLockRelease(&s->mutex); + + if (result == InvalidXLogRecPtr || + restart_lsn < result) + result = restart_lsn; + } + + LWLockRelease(ReplicationSlotControlLock); + + return result; +} + +/* + * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the + * passed database oid. + * + * Returns true if there are any slots referencing the database. *nslots will + * be set to the absolute number of slots in the database, *nactive to ones + * currently active. 
+ */
+bool
+ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
+{
+    int         i;
+
+    *nslots = *nactive = 0;
+
+    if (max_replication_slots <= 0)
+        return false;
+
+    LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+    for (i = 0; i < max_replication_slots; i++)
+    {
+        volatile ReplicationSlot *s;
+
+        s = &ReplicationSlotCtl->replication_slots[i];
+
+        /* cannot change while ReplicationSlotCtlLock is held */
+        if (!s->in_use)
+            continue;
+
+        /* not database specific, skip */
+        if (s->data.database == InvalidOid)
+            continue;
+
+        /* not our database, skip */
+        if (s->data.database != dboid)
+            continue;
+
+        /* count slots with spinlock held */
+        SpinLockAcquire(&s->mutex);
+        (*nslots)++;
+        if (s->active)
+            (*nactive)++;
+        SpinLockRelease(&s->mutex);
+    }
+    LWLockRelease(ReplicationSlotControlLock);
+
+    if (*nslots > 0)
+        return true;
+    return false;
+}
+
+
+/*
 * Check whether the server's configuration supports using replication
 * slots.
 */
@@ -723,7 +875,7 @@ StartupReplicationSlots(XLogRecPtr checkPointRedo)
 		return;
 
 	/* Now that we have recovered all the data, compute replication xmin */
-	ReplicationSlotsComputeRequiredXmin();
+	ReplicationSlotsComputeRequiredXmin(false);
 	ReplicationSlotsComputeRequiredLSN();
 }
 
@@ -1050,8 +1202,19 @@ RestoreSlotFromDisk(const char *name)
 	memcpy(&slot->data, &cp.slotdata,
 		   sizeof(ReplicationSlotPersistentData));
 
+	/* Don't restore the slot if it's not marked as persistent. */
+	if (slot->data.persistency != RS_PERSISTENT)
+		return;
+
 	/* initialize in memory state */
 	slot->effective_xmin = cp.slotdata.xmin;
+	slot->effective_catalog_xmin = cp.slotdata.catalog_xmin;
+
+	slot->candidate_catalog_xmin = InvalidTransactionId;
+	slot->candidate_xmin_lsn = InvalidXLogRecPtr;
+	slot->candidate_restart_lsn = InvalidXLogRecPtr;
+	slot->candidate_restart_valid = InvalidXLogRecPtr;
+
 	slot->in_use = true;
 	slot->active = false;
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 5acd2bae19c..c9416b03eee 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -15,13 +15,13 @@
 
 #include "funcapi.h"
 #include "miscadmin.h"
+
 #include "access/htup_details.h"
+#include "replication/slot.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
-#include "replication/slot.h"
-
-Datum		pg_create_physical_replication_slot(PG_FUNCTION_ARGS);
-Datum		pg_drop_replication_slot(PG_FUNCTION_ARGS);
 
 static void
 check_permissions(void)
@@ -54,7 +54,7 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS)
 		elog(ERROR, "return type must be a row type");
 
 	/* acquire replication slot, this will check for conflicting names*/
-	ReplicationSlotCreate(NameStr(*name), false);
+	ReplicationSlotCreate(NameStr(*name), false, RS_PERSISTENT);
 
 	values[0] = NameGetDatum(&MyReplicationSlot->data.name);
 
@@ -69,6 +69,68 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS)
 	PG_RETURN_DATUM(result);
 }
 
+
+/*
+ * SQL function for creating a new logical replication slot.
+ */ +Datum +pg_create_logical_replication_slot(PG_FUNCTION_ARGS) +{ + Name name = PG_GETARG_NAME(0); + Name plugin = PG_GETARG_NAME(1); + + LogicalDecodingContext *ctx = NULL; + + TupleDesc tupdesc; + HeapTuple tuple; + Datum result; + Datum values[2]; + bool nulls[2]; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + check_permissions(); + + CheckLogicalDecodingRequirements(); + + Assert(!MyReplicationSlot); + + /* + * Acquire a logical decoding slot, this will check for conflicting + * names. + */ + ReplicationSlotCreate(NameStr(*name), true, RS_EPHEMERAL); + + /* + * Create logical decoding context, to build the initial snapshot. + */ + ctx = CreateInitDecodingContext( + NameStr(*plugin), NIL, + logical_read_local_xlog_page, NULL, NULL); + + /* build initial snapshot, might take a while */ + DecodingContextFindStartpoint(ctx); + + values[0] = CStringGetTextDatum(NameStr(MyReplicationSlot->data.name)); + values[1] = LSNGetDatum(MyReplicationSlot->data.confirmed_flush); + + /* don't need the decoding context anymore */ + FreeDecodingContext(ctx); + + memset(nulls, 0, sizeof(nulls)); + + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + /* ok, slot is now fully created, mark it as persistent */ + ReplicationSlotPersist(); + ReplicationSlotRelease(); + + PG_RETURN_DATUM(result); +} + + /* * SQL function for dropping a replication slot. */ @@ -92,7 +154,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS) Datum pg_get_replication_slots(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_REPLICATION_SLOTS_COLS 6 +#define PG_GET_REPLICATION_SLOTS_COLS 8 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; Tuplestorestate *tupstore; @@ -134,15 +196,16 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) for (slotno = 0; slotno < max_replication_slots; slotno++) { ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[slotno]; - Datum values[PG_STAT_GET_REPLICATION_SLOTS_COLS]; - bool nulls[PG_STAT_GET_REPLICATION_SLOTS_COLS]; + Datum values[PG_GET_REPLICATION_SLOTS_COLS]; + bool nulls[PG_GET_REPLICATION_SLOTS_COLS]; TransactionId xmin; + TransactionId catalog_xmin; XLogRecPtr restart_lsn; bool active; Oid database; NameData slot_name; - + NameData plugin; int i; SpinLockAcquire(&slot->mutex); @@ -154,9 +217,11 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) else { xmin = slot->data.xmin; + catalog_xmin = slot->data.catalog_xmin; database = slot->data.database; restart_lsn = slot->data.restart_lsn; namecpy(&slot_name, &slot->data.name); + namecpy(&plugin, &slot->data.plugin); active = slot->active; } @@ -166,19 +231,34 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) i = 0; values[i++] = NameGetDatum(&slot_name); + + if (database == InvalidOid) + nulls[i++] = true; + else + values[i++] = NameGetDatum(&plugin); + if (database == InvalidOid) values[i++] = CStringGetTextDatum("physical"); else values[i++] = CStringGetTextDatum("logical"); + if (database == InvalidOid) nulls[i++] = true; else values[i++] = database; + values[i++] = BoolGetDatum(active); + if (xmin != InvalidTransactionId) values[i++] = TransactionIdGetDatum(xmin); else nulls[i++] = true; + + if (catalog_xmin != InvalidTransactionId) + values[i++] = TransactionIdGetDatum(catalog_xmin); + else + nulls[i++] = true; + if (restart_lsn != InvalidTransactionId) values[i++] = LSNGetDatum(restart_lsn); else diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 
e31977eee02..43db10851c3 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1147,7 +1147,7 @@ XLogWalRcvSendHSFeedback(bool immed) * everything else has been checked. */ if (hot_standby_feedback) - xmin = GetOldestXmin(true, false); + xmin = GetOldestXmin(NULL, false); else xmin = InvalidTransactionId; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 048367af299..5227eab414f 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -55,6 +55,7 @@ #include "replication/basebackup.h" #include "replication/slot.h" #include "replication/syncrep.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -434,7 +435,7 @@ StartReplication(StartReplicationCmd *cmd) if (MyReplicationSlot->data.database != InvalidOid) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - (errmsg("cannot use a replication slot created for changeset extraction for streaming replication")))); + (errmsg("cannot use a logical replication slot for physical replication")))); } /* @@ -656,7 +657,9 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) sendTimeLineIsHistoric = false; sendTimeLine = ThisTimeLineID; - ReplicationSlotCreate(cmd->slotname, cmd->kind == REPLICATION_KIND_LOGICAL); + ReplicationSlotCreate(cmd->slotname, + cmd->kind == REPLICATION_KIND_LOGICAL, + RS_PERSISTENT); initStringInfo(&output_message); @@ -766,7 +769,7 @@ exec_replication_command(const char *cmd_string) if (cmd->kind == REPLICATION_KIND_PHYSICAL) StartReplication(cmd); else - elog(ERROR, "cannot handle changeset extraction yet"); + elog(ERROR, "cannot handle logical decoding yet"); break; } @@ -1017,7 +1020,7 @@ ProcessStandbyReplyMessage(void) if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr) { if (MyReplicationSlot->data.database != InvalidOid) - elog(ERROR, "cannot handle changeset extraction yet"); + elog(ERROR, "cannot handle logical decoding yet"); else PhysicalConfirmReceivedLocation(flushPtr); } @@ -1050,7 +1053,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin) if (changed) { ReplicationSlotMarkDirty(); - ReplicationSlotsComputeRequiredXmin(); + ReplicationSlotsComputeRequiredXmin(false); } } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index eac418442d3..3376a353a40 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -50,11 +50,13 @@ #include "access/transam.h" #include "access/xact.h" #include "access/twophase.h" +#include "catalog/catalog.h" #include "miscadmin.h" #include "storage/proc.h" #include "storage/procarray.h" #include "storage/spin.h" #include "utils/builtins.h" +#include "utils/rel.h" #include "utils/snapmgr.h" @@ -84,6 +86,8 @@ typedef struct ProcArrayStruct /* oldest xmin of any replication slot */ TransactionId replication_slot_xmin; + /* oldest catalog xmin of any replication slot */ + TransactionId replication_slot_catalog_xmin; /* * We declare pgprocnos[] as 1 entry because C wants a fixed-size array, @@ -1108,21 +1112,22 @@ TransactionIdIsActive(TransactionId xid) * GetOldestXmin -- returns oldest transaction that was running * when any current transaction was started. * - * If allDbs is TRUE then all backends are considered; if allDbs is FALSE - * then only backends running in my own database are considered. 
+ * If rel is NULL or a shared relation, all backends are considered, otherwise + * only backends running in this database are considered. * * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are * ignored. * - * This is used by VACUUM to decide which deleted tuples must be preserved - * in a table. allDbs = TRUE is needed for shared relations, but allDbs = - * FALSE is sufficient for non-shared relations, since only backends in my - * own database could ever see the tuples in them. Also, we can ignore - * concurrently running lazy VACUUMs because (a) they must be working on other - * tables, and (b) they don't need to do snapshot-based lookups. + * This is used by VACUUM to decide which deleted tuples must be preserved in + * the passed in table. For shared relations backends in all databases must be + * considered, but for non-shared relations that's not required, since only + * backends in my own database could ever see the tuples in them. Also, we can + * ignore concurrently running lazy VACUUMs because (a) they must be working + * on other tables, and (b) they don't need to do snapshot-based lookups. * - * This is also used to determine where to truncate pg_subtrans. allDbs - * must be TRUE for that case, and ignoreVacuum FALSE. + * This is also used to determine where to truncate pg_subtrans. For that + * backends in all databases have to be considered, so rel = NULL has to be + * passed in. * * Note: we include all currently running xids in the set of considered xids. * This ensures that if a just-started xact has not yet set its snapshot, @@ -1133,7 +1138,7 @@ TransactionIdIsActive(TransactionId xid) * backwards on repeated calls. The calculated value is conservative, so that * anything older is definitely not considered as running by anyone anymore, * but the exact value calculated depends on a number of things. For example, - * if allDbs is FALSE and there are no transactions running in the current + * if rel = NULL and there are no transactions running in the current * database, GetOldestXmin() returns latestCompletedXid. If a transaction * begins after that, its xmin will include in-progress transactions in other * databases that started earlier, so another call will return a lower value. @@ -1152,12 +1157,22 @@ TransactionIdIsActive(TransactionId xid) * GetOldestXmin() move backwards, with no consequences for data integrity. */ TransactionId -GetOldestXmin(bool allDbs, bool ignoreVacuum) +GetOldestXmin(Relation rel, bool ignoreVacuum) { ProcArrayStruct *arrayP = procArray; TransactionId result; int index; + bool allDbs; + volatile TransactionId replication_slot_xmin = InvalidTransactionId; + volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + + /* + * If we're not computing a relation specific limit, or if a shared + * relation has been passed in, backends in all databases have to be + * considered. + */ + allDbs = rel == NULL || rel->rd_rel->relisshared; /* Cannot look for individual databases during recovery */ Assert(allDbs || !RecoveryInProgress()); @@ -1180,6 +1195,13 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum) volatile PGPROC *proc = &allProcs[pgprocno]; volatile PGXACT *pgxact = &allPgXact[pgprocno]; + /* + * Backend is doing logical decoding which manages xmin separately, + * check below. 
+ */ + if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING) + continue; + if (ignoreVacuum && (pgxact->vacuumFlags & PROC_IN_VACUUM)) continue; @@ -1211,6 +1233,7 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum) /* fetch into volatile var while ProcArrayLock is held */ replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; if (RecoveryInProgress()) { @@ -1259,6 +1282,18 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum) NormalTransactionIdPrecedes(replication_slot_xmin, result)) result = replication_slot_xmin; + /* + * After locks have been released and defer_cleanup_age has been applied, + * check whether we need to back up further to make logical decoding + * possible. We need to do so if we're computing the global limit (rel = + * NULL) or if the passed relation is a catalog relation of some kind. + */ + if ((rel == NULL || + RelationIsAccessibleInLogicalDecoding(rel)) && + TransactionIdIsValid(replication_slot_catalog_xmin) && + NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result)) + result = replication_slot_catalog_xmin; + return result; } @@ -1313,6 +1348,8 @@ GetMaxSnapshotSubxidCount(void) * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all * running transactions, except those running LAZY VACUUM). This is * the same computation done by GetOldestXmin(true, true). + * RecentGlobalDataXmin: the global xmin for non-catalog tables + * >= RecentGlobalXmin * * Note: this function should probably not be called with an argument that's * not statically allocated (see xip allocation below). @@ -1329,6 +1366,7 @@ GetSnapshotData(Snapshot snapshot) int subcount = 0; bool suboverflowed = false; volatile TransactionId replication_slot_xmin = InvalidTransactionId; + volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId; Assert(snapshot != NULL); @@ -1397,6 +1435,13 @@ GetSnapshotData(Snapshot snapshot) volatile PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; + /* + * Backend is doing logical decoding which manages xmin + * separately, check below. + */ + if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING) + continue; + /* Ignore procs running LAZY VACUUM */ if (pgxact->vacuumFlags & PROC_IN_VACUUM) continue; @@ -1509,6 +1554,7 @@ GetSnapshotData(Snapshot snapshot) /* fetch into volatile var while ProcArrayLock is held */ replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; if (!TransactionIdIsValid(MyPgXact->xmin)) MyPgXact->xmin = TransactionXmin = xmin; @@ -1533,6 +1579,17 @@ GetSnapshotData(Snapshot snapshot) NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin)) RecentGlobalXmin = replication_slot_xmin; + /* Non-catalog tables can be vacuumed if older than this xid */ + RecentGlobalDataXmin = RecentGlobalXmin; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. + */ + if (TransactionIdIsNormal(replication_slot_catalog_xmin) && + NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin)) + RecentGlobalXmin = replication_slot_catalog_xmin; + RecentXmin = xmin; snapshot->xmin = xmin; @@ -1633,9 +1690,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid) * Similar to GetSnapshotData but returns more information. We include * all PGXACTs with an assigned TransactionId, even VACUUM processes. * - * We acquire XidGenLock, but the caller is responsible for releasing it. 
- * This ensures that no new XIDs enter the proc array until the caller has - * WAL-logged this snapshot, and releases the lock. + * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for + * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc + * array until the caller has WAL-logged this snapshot, and releases the + * lock. Acquiring ProcArrayLock ensures that no transactions commit until the + * lock is released. * * The returned data structure is statically allocated; caller should not * modify it, and must not assume it is valid past the next call. @@ -1770,6 +1829,15 @@ GetRunningTransactionData(void) } } + /* + * It's important *not* to include the limits set by slots here because + * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those + * were to be included here the initial value could never increase because + * of a circular dependency where slots only increase their limits when + * running xacts increases oldestRunningXid and running xacts only + * increases if slots do. + */ + CurrentRunningXacts->xcnt = count - subcount; CurrentRunningXacts->subxcnt = subcount; CurrentRunningXacts->subxid_overflow = suboverflowed; @@ -1777,13 +1845,12 @@ GetRunningTransactionData(void) CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; - /* We don't release XidGenLock here, the caller is responsible for that */ - LWLockRelease(ProcArrayLock); - Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid)); + /* We don't release the locks here, the caller is responsible for that */ + return CurrentRunningXacts; } @@ -1853,6 +1920,92 @@ GetOldestActiveTransactionId(void) } /* + * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum + * + * Returns the oldest xid that we can guarantee not to have been affected by + * vacuum, i.e. no rows >= that xid have been vacuumed away unless the + * transaction aborted. Note that the value can (and most of the time will) be + * much more conservative than what really has been affected by vacuum, but we + * currently don't have better data available. + * + * This is useful to initalize the cutoff xid after which a new changeset + * extraction replication slot can start decoding changes. + * + * Must be called with ProcArrayLock held either shared or exclusively, + * although most callers will want to use exclusive mode since it is expected + * that the caller will immediately use the xid to peg the xmin horizon. + */ +TransactionId +GetOldestSafeDecodingTransactionId(void) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId oldestSafeXid; + int index; + bool recovery_in_progress = RecoveryInProgress(); + + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Acquire XidGenLock, so no transactions can acquire an xid while we're + * running. If no transaction with xid were running concurrently a new xid + * could influence the the RecentXmin et al. + * + * We initialize the computation to nextXid since that's guaranteed to be + * a safe, albeit pessimal, value. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestSafeXid = ShmemVariableCache->nextXid; + + /* + * If there's already a slot pegging the xmin horizon, we can start with + * that value, it's guaranteed to be safe since it's computed by this + * routine initally and has been enforced since. 
+ */ + if (TransactionIdIsValid(procArray->replication_slot_catalog_xmin) && + TransactionIdPrecedes(procArray->replication_slot_catalog_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_catalog_xmin; + + /* + * If we're not in recovery, we walk over the procarray and collect the + * lowest xid. Since we're called with ProcArrayLock held and have + * acquired XidGenLock, no entries can vanish concurrently, since + * PGXACT->xid is only set with XidGenLock held and only cleared with + * ProcArrayLock held. + * + * In recovery we can't lower the safe value besides what we've computed + * above, so we'll have to wait a bit longer there. We unfortunately can + * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids + * machinery can miss values and return an older value than is safe. + */ + if (!recovery_in_progress) + { + /* + * Spin over procArray collecting all min(PGXACT->xid) + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + volatile PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = pgxact->xid; + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestSafeXid)) + oldestSafeXid = xid; + } + } + + LWLockRelease(XidGenLock); + + return oldestSafeXid; +} + +/* * GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are * delaying checkpoint because they have critical actions in progress. * @@ -2523,10 +2676,39 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) * replicaton slots. */ void -ProcArraySetReplicationSlotXmin(TransactionId xmin) +ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, + bool already_locked) { - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); + + if (!already_locked) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + procArray->replication_slot_xmin = xmin; + procArray->replication_slot_catalog_xmin = catalog_xmin; + + if (!already_locked) + LWLockRelease(ProcArrayLock); +} + +/* + * ProcArrayGetReplicationSlotXmin + * + * Return the current slot xmin limits. That's useful to be able to remove + * data that's older than those limits. + */ +void +ProcArrayGetReplicationSlotXmin(TransactionId *xmin, + TransactionId *catalog_xmin) +{ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (xmin != NULL) + *xmin = procArray->replication_slot_xmin; + + if (catalog_xmin != NULL) + *catalog_xmin = procArray->replication_slot_catalog_xmin; + LWLockRelease(ProcArrayLock); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index fb5f18edfc7..aa8bea5538b 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -800,7 +800,9 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record) /* * Log details of the current snapshot to WAL. This allows the snapshot state - * to be reconstructed on the standby. + * to be reconstructed on the standby and for logical decoding. + * + * This is used for Hot Standby as follows: * * We can move directly to STANDBY_SNAPSHOT_READY at startup if we * start from a shutdown checkpoint because we know nothing was running @@ -854,6 +856,12 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record) * Zero xids should no longer be possible, but we may be replaying WAL * from a time when they were possible. 
* + * For logical decoding only the running xacts information is needed; + * there's no need to look at the locking information, but it's logged anyway, + * as there's no independent knob to just enable logical decoding. For + * details of how this is used, check snapbuild.c's introductory comment. + * + * * Returns the RecPtr of the last inserted record. */ XLogRecPtr @@ -879,8 +887,28 @@ LogStandbySnapshot(void) * record we write, because standby will open up when it sees this. */ running = GetRunningTransactionData(); + + /* + * GetRunningTransactionData() acquired ProcArrayLock, we must release + * it. For Hot Standby this can be done before inserting the WAL record + * because ProcArrayApplyRecoveryInfo() rechecks the commit status using + * the clog. For logical decoding, though, the lock can't be released + * early becuase the clog might be "in the future" from the POV of the + * historic snapshot. This would allow for situations where we're waiting + * for the end of a transaction listed in the xl_running_xacts record + * which, according to the WAL, have commit before the xl_running_xacts + * record. Fortunately this routine isn't executed frequently, and it's + * only a shared lock. + */ + if (wal_level < WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + recptr = LogCurrentRunningXacts(running); + /* Release lock if we kept it longer ... */ + if (wal_level >= WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + /* GetRunningTransactionData() acquired XidGenLock, we must release it */ LWLockRelease(XidGenLock); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index fa460ca82eb..f595a0747c1 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -781,10 +781,6 @@ ProcKill(int code, Datum arg) /* Make sure we're out of the sync rep lists */ SyncRepCleanupAtProcExit(); - /* Make sure active replication slots are released */ - if (MyReplicationSlot != NULL) - ReplicationSlotRelease(); - #ifdef USE_ASSERT_CHECKING if (assert_enabled) { @@ -803,6 +799,10 @@ ProcKill(int code, Datum arg) */ LWLockReleaseAll(); + /* Make sure active replication slots are released */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + /* * Clear MyProc first; then disown the process latch. This is so that * signal handlers won't try to clear the process latch after it's no diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index a230d7eda69..be961017d66 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -55,6 +55,7 @@ #include "pg_getopt.h" #include "postmaster/autovacuum.h" #include "postmaster/postmaster.h" +#include "replication/slot.h" #include "replication/walsender.h" #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" @@ -3854,6 +3855,16 @@ PostgresMain(int argc, char *argv[], WalSndErrorCleanup(); /* + * We can't release replication slots inside AbortTransaction() as we + * need to be able to start and abort transactions while having a slot + * acquired. But we never need to hold them across top level errors, + * so releasing here is fine. There's another cleanup in ProcKill() + * ensuring we'll correctly cleanup on FATAL errors as well. + */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + + /* * Now return to normal top-level context and clear ErrorContext for * next time. 
 */ diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 4423fe01bdd..115bcac5d23 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -512,7 +512,7 @@ RegisterSnapshotInvalidation(Oid dbId, Oid relId) * Only the local caches are flushed; this does not transmit the message * to other backends. */ -static void +void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) { if (msg->id >= 0) @@ -596,7 +596,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) * since that tells us we've lost some shared-inval messages and hence * don't know what needs to be invalidated. */ -static void +void InvalidateSystemCaches(void) { int i; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 2810b35eea1..32313244adb 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -73,6 +73,7 @@ #include "utils/memutils.h" #include "utils/relmapper.h" #include "utils/resowner_private.h" +#include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -235,7 +236,7 @@ static void formrdesc(const char *relationName, Oid relationReltype, bool isshared, bool hasoids, int natts, const FormData_pg_attribute *attrs); -static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK); +static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic); static Relation AllocateRelationDesc(Form_pg_class relp); static void RelationParseRelOptions(Relation relation, HeapTuple tuple); static void RelationBuildTupleDesc(Relation relation); @@ -274,12 +275,13 @@ static void unlink_initfile(const char *initfilename); * and must eventually be freed with heap_freetuple. */ static HeapTuple -ScanPgRelation(Oid targetRelId, bool indexOK) +ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic) { HeapTuple pg_class_tuple; Relation pg_class_desc; SysScanDesc pg_class_scan; ScanKeyData key[1]; + Snapshot snapshot; /* * If something goes wrong during backend startup, we might find ourselves @@ -305,9 +307,20 @@ ScanPgRelation(Oid targetRelId, bool indexOK) * scan by setting indexOK == false. */ pg_class_desc = heap_open(RelationRelationId, AccessShareLock); + + /* + * The caller might need a tuple that's newer than the one visible to the + * historic snapshot; currently the only case requiring this is looking up + * the relfilenode of non-mapped system relations during decoding. + */ + if (force_non_historic) + snapshot = GetNonHistoricCatalogSnapshot(RelationRelationId); + else + snapshot = GetCatalogSnapshot(RelationRelationId); + pg_class_scan = systable_beginscan(pg_class_desc, ClassOidIndexId, indexOK && criticalRelcachesBuilt, - NULL, + snapshot, 1, key); pg_class_tuple = systable_getnext(pg_class_scan); @@ -836,7 +849,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) /* * find the tuple in pg_class corresponding to the given relation id */ - pg_class_tuple = ScanPgRelation(targetRelId, true); + pg_class_tuple = ScanPgRelation(targetRelId, true, false); /* * if no such tuple exists, return NULL @@ -989,8 +1002,42 @@ RelationInitPhysicalAddr(Relation relation) relation->rd_node.dbNode = InvalidOid; else relation->rd_node.dbNode = MyDatabaseId; + if (relation->rd_rel->relfilenode) + { + /* + * Even if we are using a decoding snapshot that doesn't represent + * the current state of the catalog we need to make sure the + * filenode points to the current file since the older file will + * be gone (or truncated). 
The new file will still contain older + * rows so lookups in them will work correctly. This wouldn't work + * correctly if rewrites were allowed to change the schema in an + * incompatible way, but those are prevented both on catalog + * tables and on user tables declared as additional catalog + * tables. + */ + if (HistoricSnapshotActive() + && RelationIsAccessibleInLogicalDecoding(relation) + && IsTransactionState()) + { + HeapTuple phys_tuple; + Form_pg_class physrel; + + phys_tuple = ScanPgRelation(RelationGetRelid(relation), + RelationGetRelid(relation) != ClassOidIndexId, + true); + if (!HeapTupleIsValid(phys_tuple)) + elog(ERROR, "could not find pg_class entry for %u", + RelationGetRelid(relation)); + physrel = (Form_pg_class) GETSTRUCT(phys_tuple); + + relation->rd_rel->reltablespace = physrel->reltablespace; + relation->rd_rel->relfilenode = physrel->relfilenode; + heap_freetuple(phys_tuple); + } + relation->rd_node.relNode = relation->rd_rel->relfilenode; + } else { /* Consult the relation mapper */ @@ -1742,7 +1789,7 @@ RelationReloadIndexInfo(Relation relation) * for pg_class_oid_index ... */ indexOK = (RelationGetRelid(relation) != ClassOidIndexId); - pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK); + pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK, false); if (!HeapTupleIsValid(pg_class_tuple)) elog(ERROR, "could not find pg_class tuple for index %u", RelationGetRelid(relation)); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 4c0e0accc1c..4146527d2fd 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -19,6 +19,10 @@ * have regd_count = 1 and are counted in RegisteredSnapshots, but are not * tracked by any resource owner. * + * The same is true for historic snapshots used during logical decoding; + * their lifetime is managed separately (as they live longer than one xact.c + * transaction). + * * These arrangements let us reset MyPgXact->xmin when there are no snapshots * referenced by this transaction. (One possible improvement would be to be * able to advance Xmin when the snapshot with the earliest Xmin is no longer @@ -69,12 +73,13 @@ */ static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC}; static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC}; -static SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC}; +SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC}; /* Pointers to valid snapshots */ static Snapshot CurrentSnapshot = NULL; static Snapshot SecondarySnapshot = NULL; static Snapshot CatalogSnapshot = NULL; +static Snapshot HistoricSnapshot = NULL; /* * Staleness detection for CatalogSnapshot. @@ -86,13 +91,18 @@ static bool CatalogSnapshotStale = true; * for the convenience of TransactionIdIsInProgress: even in bootstrap * mode, we don't want it to say that BootstrapTransactionId is in progress. * - * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no - * one tries to use a stale value. Readers should ensure that it has been set - * to something else before using it. + * RecentGlobalXmin and RecentGlobalDataXmin are initialized to + * InvalidTransactionId, to ensure that no one tries to use a stale + * value. Readers should ensure that they have been set to something else + * before using them. 
 */ TransactionId TransactionXmin = FirstNormalTransactionId; TransactionId RecentXmin = FirstNormalTransactionId; TransactionId RecentGlobalXmin = InvalidTransactionId; +TransactionId RecentGlobalDataXmin = InvalidTransactionId; + +/* (table, ctid) => (cmin, cmax) mapping during timetravel */ +static HTAB *tuplecid_data = NULL; /* * Elements of the active snapshot stack. @@ -158,6 +168,18 @@ static void SnapshotResetXmin(void); Snapshot GetTransactionSnapshot(void) { + /* + * Return the historic snapshot if doing logical decoding. We'll never + * need a non-historic transaction snapshot in this (sub-)transaction, so + * there's no need to be careful to set one up for later calls to + * GetTransactionSnapshot(). + */ + if (HistoricSnapshotActive()) + { + Assert(!FirstSnapshotSet); + return HistoricSnapshot; + } + /* First call in transaction? */ if (!FirstSnapshotSet) { @@ -214,6 +236,13 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { + /* + * So far there are no cases requiring support for GetLatestSnapshot() + * during logical decoding, but it wouldn't be hard to add if + * required. + */ + Assert(!HistoricSnapshotActive()); + /* If first call in transaction, go ahead and set the xact snapshot */ if (!FirstSnapshotSet) return GetTransactionSnapshot(); @@ -232,6 +261,26 @@ Snapshot GetCatalogSnapshot(Oid relid) { /* + * Return the historic snapshot if we're doing logical decoding, but + * return a non-historic snapshot if we're temporarily doing up-to-date + * lookups. + */ + if (HistoricSnapshotActive()) + return HistoricSnapshot; + + return GetNonHistoricCatalogSnapshot(relid); +} + +/* + * GetNonHistoricCatalogSnapshot + * Get a snapshot that is sufficiently up-to-date for scan of the system + * catalog with the specified OID, even while historic snapshots are set + * up. + */ +Snapshot +GetNonHistoricCatalogSnapshot(Oid relid) +{ + /* * If the caller is trying to scan a relation that has no syscache, * no catcache invalidations will be sent when it is updated. For a * a few key relations, snapshot invalidations are sent instead. If @@ -303,6 +352,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) Assert(RegisteredSnapshots == 0); Assert(FirstXactSnapshot == NULL); + Assert(!HistoricSnapshotActive()); /* * Even though we are not going to use the snapshot it computes, we must @@ -796,7 +846,7 @@ AtEOXact_Snapshot(bool isCommit) * Returns the token (the file name) that can be used to import this * snapshot. */ -static char * +char * ExportSnapshot(Snapshot snapshot) { TransactionId topXid; @@ -1258,3 +1308,45 @@ ThereAreNoPriorRegisteredSnapshots(void) return false; } + +/* + * Set up a snapshot that replaces normal catalog snapshots and allows catalog + * access to behave just as it did at a certain point in the past. + * + * Needed for logical decoding. + */ +void +SetupHistoricSnapshot(Snapshot historic_snapshot, HTAB *tuplecids) +{ + Assert(historic_snapshot != NULL); + + /* setup the timetravel snapshot */ + HistoricSnapshot = historic_snapshot; + + /* setup (cmin, cmax) lookup hash */ + tuplecid_data = tuplecids; +} + + +/* + * Make catalog snapshots behave normally again. 
+ */ +void +TeardownHistoricSnapshot(bool is_error) +{ + HistoricSnapshot = NULL; + tuplecid_data = NULL; +} + +bool +HistoricSnapshotActive(void) +{ + return HistoricSnapshot != NULL; +} + +HTAB * +HistoricSnapshotGetTupleCids(void) +{ + Assert(HistoricSnapshotActive()); + return tuplecid_data; +} diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index f6267552573..c4732ed3110 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -62,6 +62,9 @@ #include "access/xact.h" #include "storage/bufmgr.h" #include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/snapmgr.h" #include "utils/tqual.h" @@ -73,7 +76,6 @@ SnapshotData SnapshotToastData = {HeapTupleSatisfiesToast}; /* local functions */ static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); - /* * SetHintBits() * @@ -1545,3 +1547,163 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) */ return true; } + +/* + * check whether the transaction id 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return bsearch(&xid, xip, num, + sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * See the comments for HeapTupleSatisfiesMVCC for the semantics this function + * obeys. + * + * Only usable on tuples from catalog tables! + * + * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support + * reading catalog pages which couldn't have been created in an older version. + * + * We don't set any hint bits in here as it seems unlikely to be beneficial as + * those should already be set by normal access and it seems to be too + * dangerous to do so as the semantics of doing so during timetravel are more + * complicated than when dealing "only" with the present. + */ +bool +HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, + Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple); + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + /* inserting transaction aborted */ + if (HeapTupleHeaderXminInvalid(tuple)) + { + Assert(!TransactionIdDidCommit(xmin)); + return false; + } + /* check if its one of our txids, toplevel is also in there */ + else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple); + CommandId cmax = InvalidCommandId; + + /* + * another transaction might have (tried to) delete this tuple or + * cmin/cmax was stored in a combocid. So we need to look up the + * actual values externally. + */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + if (!resolved) + elog(ERROR, "could not resolve cmin/cmax of catalog tuple"); + + Assert(cmin != InvalidCommandId); + + if (cmin >= snapshot->curcid) + return false; /* inserted after scan started */ + /* fall through */ + } + /* committed before our xmin horizon. Do a normal visibility check. */ + else if (TransactionIdPrecedes(xmin, snapshot->xmin)) + { + Assert(!(HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin))); + + /* check for hint bit first, consult clog afterwards */ + if (!HeapTupleHeaderXminCommitted(tuple) && + !TransactionIdDidCommit(xmin)) + return false; + /* fall through */ + } + /* beyond our xmax horizon, i.e. 
invisible */ + else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax)) + { + return false; + } + /* check if it's a committed transaction in [xmin, xmax) */ + else if(TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt)) + { + /* fall through */ + } + /* + * none of the above, i.e. between [xmin, xmax) but hasn't + * committed. I.e. invisible. + */ + else + { + return false; + } + + /* at this point we know xmin is visible, go on to check xmax */ + + /* xid invalid or aborted */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return true; + /* locked tuples are always visible */ + else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + /* + * We can see multis here if we're looking at user tables or if + * somebody SELECT ... FOR SHARE/UPDATE a system table. + */ + else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + xmax = HeapTupleGetUpdateXid(tuple); + } + + /* check if its one of our txids, toplevel is also in there */ + if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt)) + { + bool resolved; + CommandId cmin; + CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple); + + /* Lookup actual cmin/cmax values */ + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + htup, buffer, + &cmin, &cmax); + + if (!resolved) + elog(ERROR, "could not resolve combocid to cmax"); + + Assert(cmax != InvalidCommandId); + + if (cmax >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + /* below xmin horizon, normal transaction state is valid */ + else if (TransactionIdPrecedes(xmax, snapshot->xmin)) + { + Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED && + !TransactionIdDidCommit(xmax))); + + /* check hint bit first */ + if (tuple->t_infomask & HEAP_XMAX_COMMITTED) + return false; + + /* check clog */ + return !TransactionIdDidCommit(xmax); + } + /* above xmax horizon, we cannot possibly see the deleting transaction */ + else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax)) + return true; + /* xmax is between [xmin, xmax), check known committed array */ + else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt)) + return false; + /* xmax is between [xmin, xmax), but known not to have committed yet */ + else + return true; +} |
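To make the snapmgr.c additions above concrete, here is a minimal sketch, not part of the commit, of how decoding code might bracket a catalog lookup with the new historic-snapshot hooks so that catalog access sees the historic state. SetupHistoricSnapshot(), TeardownHistoricSnapshot(), HistoricSnapshotActive() and GetCatalogSnapshot() are the functions added by this patch; the wrapper function, its arguments and the PG_TRY-based cleanup are hypothetical.

#include "postgres.h"
#include "utils/hsearch.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"

/*
 * Hypothetical helper: look up a pg_class row as of the historic point in
 * time represented by 'snap'.  'snap' and 'tuplecids' would be provided by
 * the snapshot builder during decoding.
 */
static HeapTuple
lookup_class_tuple_historically(Snapshot snap, HTAB *tuplecids, Oid relid)
{
	HeapTuple	tup = NULL;

	/* make catalog access behave as of the historic point in time */
	SetupHistoricSnapshot(snap, tuplecids);

	PG_TRY();
	{
		/* GetCatalogSnapshot() now hands back the historic snapshot */
		Assert(HistoricSnapshotActive());
		tup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
	}
	PG_CATCH();
	{
		/* on error, make catalog snapshots behave normally again */
		TeardownHistoricSnapshot(true);
		PG_RE_THROW();
	}
	PG_END_TRY();

	TeardownHistoricSnapshot(false);
	return tup;					/* NULL if no such relation at that time */
}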
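The procarray changes above expose the slot-derived horizons through ProcArrayGetReplicationSlotXmin(). The sketch below, again not part of the commit and with a hypothetical function and caller, shows how such horizons could clamp a proposed removal cutoff so that rows still needed by a slot survive, mirroring the RecentGlobalXmin/RecentGlobalDataXmin split introduced in snapmgr.c above.

#include "postgres.h"
#include "access/transam.h"
#include "storage/procarray.h"

/*
 * Hypothetical: clamp a proposed removal horizon by the xmins pinned by
 * replication slots.  'for_catalog' says whether the relation is a catalog
 * (or a user relation treated as an additional catalog for decoding).
 */
static TransactionId
clamp_horizon_for_slots(TransactionId cutoff, bool for_catalog)
{
	TransactionId slot_xmin;
	TransactionId slot_catalog_xmin;

	ProcArrayGetReplicationSlotXmin(&slot_xmin, &slot_catalog_xmin);

	/* data in any relation must survive up to the slots' data xmin */
	if (TransactionIdIsValid(slot_xmin) &&
		TransactionIdPrecedes(slot_xmin, cutoff))
		cutoff = slot_xmin;

	/* catalog rows must additionally survive up to the catalog xmin */
	if (for_catalog &&
		TransactionIdIsValid(slot_catalog_xmin) &&
		TransactionIdPrecedes(slot_catalog_xmin, cutoff))
		cutoff = slot_catalog_xmin;

	return cutoff;
}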
