From 5031ac86adb6487be4ce2b8b11b99015bea73597 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Thu, 27 Feb 2025 21:03:39 +1300 Subject: [PATCH 1/9] Improve buffer manager API for backend pin limits. Previously the support functions assumed that the caller needed one pin to make progress, and could optionally use some more. Add a couple more functions for callers that want to know: * what the maximum possible number could be irrespective of currently held pins, for space planning purposes, called the "soft pin limit" * how many additional pins they could acquire right now, without the special case allowing one pin, for users that already hold pins and could make progress even if zero extra pins are available These APIs are better suited to read_stream.c, which will be improved in a follow-up patch. Also compute MaxProportionalPins up front, to avoid performing division whenever we check the balance. Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com --- src/backend/storage/buffer/bufmgr.c | 85 +++++++++++++++++++-------- src/backend/storage/buffer/localbuf.c | 16 +++++ src/include/storage/bufmgr.h | 4 ++ 3 files changed, 80 insertions(+), 25 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 7915ed624c12..a6138e79306f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -211,6 +211,8 @@ static int32 PrivateRefCountOverflowed = 0; static uint32 PrivateRefCountClock = 0; static PrivateRefCountEntry *ReservedRefCountEntry = NULL; +static uint32 MaxProportionalPins; + static void ReservePrivateRefCountEntry(void); static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); @@ -2097,43 +2099,67 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) return buf; } +/* + * Return the maximum number of buffer than this backend should try to pin at + * once, to avoid pinning more than its fair share. This is the highest value + * that GetAdditionalPinLimit() and LimitAdditionalPins() could ever return. + * + * It's called a soft limit because nothing stops a backend from trying to + * acquire more pins than this if it needs them to make progress, but code that + * wants optional extra buffers for optimizations should respect this + * per-backend limit. + */ +uint32 +GetSoftPinLimit(void) +{ + return MaxProportionalPins; +} + +/* + * Return the maximum number of additional buffers that this backend should + * pin if it wants to stay under the per-backend soft limit, considering the + * number of buffers it has already pinned. Unlike LimitAdditionalPins(), the + * result can be zero, so the caller is expected to adjust it if required to + * make progress. + */ +uint32 +GetAdditionalPinLimit(void) +{ + uint32 estimated_pins_held; + + /* + * We get the number of "overflowed" pins for free, but don't know the + * number of pins in PrivateRefCountArray. The cost of calculating that + * exactly doesn't seem worth it, so just assume the max. + */ + estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES; + + /* Is this backend already holding more than its fair share? 
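+	 * The fair share is MaxProportionalPins, computed once at startup as
+	 * NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS).  For example (purely
+	 * hypothetical numbers), with 16384 shared buffers and 128 possible
+	 * backends, each backend's share is 128 pins.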
*/ + if (estimated_pins_held > MaxProportionalPins) + return 0; + + return MaxProportionalPins - estimated_pins_held; +} + /* * Limit the number of pins a batch operation may additionally acquire, to * avoid running out of pinnable buffers. * - * One additional pin is always allowed, as otherwise the operation likely - * cannot be performed at all. - * - * The number of allowed pins for a backend is computed based on - * shared_buffers and the maximum number of connections possible. That's very - * pessimistic, but outside of toy-sized shared_buffers it should allow - * sufficient pins. + * One additional pin is always allowed, on the assumption that the operation + * requires at least one to make progress. */ void LimitAdditionalPins(uint32 *additional_pins) { - uint32 max_backends; - int max_proportional_pins; + uint32 limit; if (*additional_pins <= 1) return; - max_backends = MaxBackends + NUM_AUXILIARY_PROCS; - max_proportional_pins = NBuffers / max_backends; - - /* - * Subtract the approximate number of buffers already pinned by this - * backend. We get the number of "overflowed" pins for free, but don't - * know the number of pins in PrivateRefCountArray. The cost of - * calculating that exactly doesn't seem worth it, so just assume the max. - */ - max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES; - - if (max_proportional_pins <= 0) - max_proportional_pins = 1; - - if (*additional_pins > max_proportional_pins) - *additional_pins = max_proportional_pins; + limit = GetAdditionalPinLimit(); + limit = Max(limit, 1); + if (limit < *additional_pins) + *additional_pins = limit; } /* @@ -3575,6 +3601,15 @@ InitBufferManagerAccess(void) { HASHCTL hash_ctl; + /* + * The soft limit on the number of pins each backend should respect, based + * on shared_buffers and the maximum number of connections possible. + * That's very pessimistic, but outside toy-sized shared_buffers it should + * allow plenty of pins. LimitAdditionalPins() or GetAdditionalPinLimit() + * can be used to check the remaining balance. + */ + MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS); + memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray)); hash_ctl.keysize = sizeof(int32); diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 80b83444eb2d..5378ba843162 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -286,6 +286,22 @@ GetLocalVictimBuffer(void) return BufferDescriptorGetBuffer(bufHdr); } +/* see GetSoftPinLimit() */ +uint32 +GetSoftLocalPinLimit(void) +{ + /* Every backend has its own temporary buffers, and can pin them all. 
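+	 * num_temp_buffers is this backend's own temp_buffers setting, so unlike
+	 * the shared-buffers case there is no pool to divide between backends.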
*/ + return num_temp_buffers; +} + +/* see GetAdditionalPinLimit() */ +uint32 +GetAdditionalLocalPinLimit(void) +{ + Assert(NLocalPinnedBuffers <= num_temp_buffers); + return num_temp_buffers - NLocalPinnedBuffers; +} + /* see LimitAdditionalPins() */ void LimitAdditionalLocalPins(uint32 *additional_pins) diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b204e4731c18..74b5afe8a1ad 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -290,6 +290,10 @@ extern bool HoldingBufferPinThatDelaysRecovery(void); extern bool BgBufferSync(struct WritebackContext *wb_context); +extern uint32 GetSoftPinLimit(void); +extern uint32 GetSoftLocalPinLimit(void); +extern uint32 GetAdditionalPinLimit(void); +extern uint32 GetAdditionalLocalPinLimit(void); extern void LimitAdditionalPins(uint32 *additional_pins); extern void LimitAdditionalLocalPins(uint32 *additional_pins); From 76915dbd48770b1e0679e29afd175ac32481c5b6 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Thu, 27 Feb 2025 21:42:05 +1300 Subject: [PATCH 2/9] Respect pin limits accurately in read_stream.c. To avoid pinning too much of the buffer pool at once, we previously used LimitAdditionalBuffers(). The coding was naive, and only considered the available buffers at stream construction time. This commit checks at the time of use with new buffer manager APIs. The result might change dynamically due to pins acquired outside this stream by the same backend. No extra CPU cycles are added to the all-buffered fast-path code, but the I/O-starting path now considers the up-to-date remaining buffer limit when making look-ahead decisions. In practice it was very difficult to exceed the limit in v17, so no back-patch, but changes due to land soon make it easy. Per code review from Andres, in the course of testing his AIO patches. Reviewed-by: Andres Freund Reported-by: Andres Freund Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com --- src/backend/storage/aio/read_stream.c | 104 ++++++++++++++++++++++---- 1 file changed, 90 insertions(+), 14 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 36fb9fe152cf..11ee16ec2287 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -116,6 +116,7 @@ struct ReadStream int16 pinned_buffers; int16 distance; bool advice_enabled; + bool temporary; /* * One-block buffer to support 'ungetting' a block number, to resolve flow @@ -225,7 +226,17 @@ read_stream_unget_block(ReadStream *stream, BlockNumber blocknum) stream->buffered_blocknum = blocknum; } -static void +/* + * Start as much of the current pending read as we can. If we have to split it + * because of the per-backend buffer limit, or the buffer manager decides to + * split it, then the pending read is adjusted to hold the remaining portion. + * + * We can always start a read of at least size one if we have no progress yet. + * Otherwise it's possible that we can't start a read at all because of a lack + * of buffers, and then false is returned. Buffer shortages also reduce the + * distance to a level that prevents look-ahead until buffers are released. 
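+ *
+ * In other words, the return value reports whether any progress was made:
+ * true if at least a prefix of the pending read was started, false if the
+ * caller should wait for already-pinned buffers to be consumed before trying
+ * again.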
+ */ +static bool read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) { bool need_wait; @@ -234,12 +245,13 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) int16 io_index; int16 overflow; int16 buffer_index; + int16 buffer_limit; /* This should only be called with a pending read. */ Assert(stream->pending_read_nblocks > 0); Assert(stream->pending_read_nblocks <= stream->io_combine_limit); - /* We had better not exceed the pin limit by starting this read. */ + /* We had better not exceed the per-stream buffer limit with this read. */ Assert(stream->pinned_buffers + stream->pending_read_nblocks <= stream->max_pinned_buffers); @@ -260,10 +272,39 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) else flags = 0; - /* We say how many blocks we want to read, but may be smaller on return. */ + /* Compute the remaining portion of the per-backend buffer limit. */ + if (stream->temporary) + buffer_limit = Min(GetAdditionalLocalPinLimit(), PG_INT16_MAX); + else + buffer_limit = Min(GetAdditionalPinLimit(), PG_INT16_MAX); + if (buffer_limit == 0 && stream->pinned_buffers == 0) + buffer_limit = 1; /* guarantee progress */ + + /* Does the per-backend buffer limit affect this read? */ + nblocks = stream->pending_read_nblocks; + if (buffer_limit < nblocks) + { + int16 new_distance; + + /* Shrink distance: no more look-ahead until buffers are released. */ + new_distance = stream->pinned_buffers + buffer_limit; + if (stream->distance > new_distance) + stream->distance = new_distance; + + /* If we've already made progress, just give up and wait for buffers. */ + if (stream->pinned_buffers > 0) + return false; + + /* A short read is required to make progress. */ + nblocks = buffer_limit; + } + + /* + * We say how many blocks we want to read, but it may be smaller on return + * if the buffer manager decides it needs a short read at its level. + */ buffer_index = stream->next_buffer_index; io_index = stream->next_io_index; - nblocks = stream->pending_read_nblocks; need_wait = StartReadBuffers(&stream->ios[io_index].op, &stream->buffers[buffer_index], stream->pending_read_blocknum, @@ -313,6 +354,8 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) /* Adjust the pending read to cover the remaining portion, if any. */ stream->pending_read_blocknum += nblocks; stream->pending_read_nblocks -= nblocks; + + return true; } static void @@ -361,14 +404,15 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) /* We have to start the pending read before we can build another. */ while (stream->pending_read_nblocks > 0) { - read_stream_start_pending_read(stream, suppress_advice); - suppress_advice = false; - if (stream->ios_in_progress == stream->max_ios) + if (!read_stream_start_pending_read(stream, suppress_advice) || + stream->ios_in_progress == stream->max_ios) { - /* And we've hit the limit. Rewind, and stop here. */ + /* And we've hit a buffer or I/O limit. Rewind and wait. */ read_stream_unget_block(stream, blocknum); return; } + + suppress_advice = false; } /* This is the start of a new pending read. */ @@ -382,15 +426,25 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) * io_combine_limit size once more buffers have been consumed. However, * if we've already reached io_combine_limit, or we've reached the * distance limit and there isn't anything pinned yet, or the callback has - * signaled end-of-stream, we start the read immediately. 
+ * signaled end-of-stream, we start the read immediately. Note that the + * pending read could even exceed the distance goal, if the latter was + * reduced on buffer limit exhaustion. */ if (stream->pending_read_nblocks > 0 && (stream->pending_read_nblocks == stream->io_combine_limit || - (stream->pending_read_nblocks == stream->distance && + (stream->pending_read_nblocks >= stream->distance && stream->pinned_buffers == 0) || stream->distance == 0) && stream->ios_in_progress < stream->max_ios) read_stream_start_pending_read(stream, suppress_advice); + + /* + * There should always be something pinned when we leave this function, + * whether started by this call or not, unless we've hit the end of the + * stream. In the worst case we can always make progress one buffer at a + * time. + */ + Assert(stream->pinned_buffers > 0 || stream->distance == 0); } /* @@ -420,6 +474,7 @@ read_stream_begin_impl(int flags, int max_ios; int strategy_pin_limit; uint32 max_pinned_buffers; + uint32 max_possible_buffer_limit; Oid tablespace_id; /* @@ -475,12 +530,23 @@ read_stream_begin_impl(int flags, strategy_pin_limit = GetAccessStrategyPinLimit(strategy); max_pinned_buffers = Min(strategy_pin_limit, max_pinned_buffers); - /* Don't allow this backend to pin more than its share of buffers. */ + /* + * Also limit our queue to the maximum number of pins we could possibly + * ever be allowed to acquire according to the buffer manager. We may not + * really be able to use them all due to other pins held by this backend, + * but we'll check that later in read_stream_start_pending_read(). + */ if (SmgrIsTemp(smgr)) - LimitAdditionalLocalPins(&max_pinned_buffers); + max_possible_buffer_limit = GetSoftLocalPinLimit(); else - LimitAdditionalPins(&max_pinned_buffers); - Assert(max_pinned_buffers > 0); + max_possible_buffer_limit = GetSoftPinLimit(); + max_pinned_buffers = Min(max_pinned_buffers, max_possible_buffer_limit); + + /* + * The soft limit might be zero on a system configured with more + * connections than buffers. We need at least one to make progress. + */ + max_pinned_buffers = Max(1, max_pinned_buffers); /* * We need one extra entry for buffers and per-buffer data, because users @@ -546,6 +612,7 @@ read_stream_begin_impl(int flags, stream->callback = callback; stream->callback_private_data = callback_private_data; stream->buffered_blocknum = InvalidBlockNumber; + stream->temporary = SmgrIsTemp(smgr); /* * Skip the initial ramp-up phase if the caller says we're going to be @@ -674,6 +741,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) * arbitrary I/O entry (they're all free). We don't have to * adjust pinned_buffers because we're transferring one to caller * but pinning one more. + * + * In the fast path we don't need to check the pin limit. We're + * always allowed at least one pin so that progress can be made, + * and that's all we need here. Although two pins are momentarily + * held at the same time, the model used here is that the stream + * holds only one, and the other now belongs to the caller. */ if (likely(!StartReadBuffer(&stream->ios[0].op, &stream->buffers[oldest_buffer_index], @@ -874,6 +947,9 @@ read_stream_reset(ReadStream *stream) stream->buffered_blocknum = InvalidBlockNumber; stream->fast_path = false; + /* There is no point in reading whatever was pending. */ + stream->pending_read_nblocks = 0; + /* Unpin anything that wasn't consumed. 
*/ while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer) ReleaseBuffer(buffer); From 6658d7847db2b49abee2c0d85a1ef7ec6c74df60 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Tue, 18 Feb 2025 15:59:13 +1300 Subject: [PATCH 3/9] Improve read stream advice for large random chunks. read_stream.c tries not to issue advice when it thinks the kernel's readahead should be active, ie when using buffered I/O and reading sequential blocks. It previously gave up a little too easily: it should issue advice until it has started running sequential pread() calls, not just when it's planning to. The simpler strategy worked for random chunks of size <= io_combine_limit and entirely sequential streams, but so not well when reading random chunks > io_combine limit. For example, a 256kB chunk of sequential data would benefit from only one fadvise(), but (assuming io_combine_limit=128kB) could suffer an I/O stall for the second half of it. Keep issuing advice until the pread() calls catch up with the start of the region we're currently issuing advice for, if ever. In practice, if there are any jumps in the lookahead window, we'll never stop issuing advice, and if the whole lookahead window becomes sequential we'll finally stop issuing advice. Discovered by Tomas Vondra's regression testing of many data clustering patterns using Melanie Plageman's streaming Bitmap Heap Scan patch, with analysis of the I/O stall-producing pattern from Andres Freund. Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGJ3HSWciQCz8ekP1Zn7N213RfA4nbuotQawfpq23%2Bw-5Q%40mail.gmail.com --- src/backend/storage/aio/read_stream.c | 71 +++++++++++++++++++-------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 11ee16ec2287..a8a96baf8c16 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -133,6 +133,7 @@ struct ReadStream /* Next expected block, for detecting sequential access. */ BlockNumber seq_blocknum; + BlockNumber seq_until_processed; /* The read operation we are currently preparing. */ BlockNumber pending_read_blocknum; @@ -237,11 +238,11 @@ read_stream_unget_block(ReadStream *stream, BlockNumber blocknum) * distance to a level that prevents look-ahead until buffers are released. */ static bool -read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) +read_stream_start_pending_read(ReadStream *stream) { bool need_wait; int nblocks; - int flags; + int flags = 0; int16 io_index; int16 overflow; int16 buffer_index; @@ -261,16 +262,36 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) else Assert(stream->next_buffer_index == stream->oldest_buffer_index); - /* - * If advice hasn't been suppressed, this system supports it, and this - * isn't a strictly sequential pattern, then we'll issue advice. - */ - if (!suppress_advice && - stream->advice_enabled && - stream->pending_read_blocknum != stream->seq_blocknum) - flags = READ_BUFFERS_ISSUE_ADVICE; - else - flags = 0; + /* Do we need to issue read-ahead advice? */ + if (stream->advice_enabled) + { + bool no_wait; + + /* + * We only issue advice if we won't immediately have to call + * WaitReadBuffers(). 
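+		 * Otherwise the advice would be followed almost immediately by a
+		 * blocking pread(), leaving the kernel no time to act on it, so we
+		 * would pay for a system call without hiding any I/O latency.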
+ */ + no_wait = stream->pinned_buffers > 0 || + stream->pending_read_nblocks < stream->distance; + + if (stream->pending_read_blocknum == stream->seq_blocknum) + { + /* + * Sequential: issue advice only until the WaitReadBuffers() calls + * catch up with the first advice issued for this sequential + * region, so the kernel can see sequential access. + */ + if (stream->seq_until_processed != InvalidBlockNumber && no_wait) + flags = READ_BUFFERS_ISSUE_ADVICE; + } + else + { + /* Random jump: start tracking new region. */ + stream->seq_until_processed = stream->pending_read_blocknum; + if (no_wait) + flags = READ_BUFFERS_ISSUE_ADVICE; + } + } /* Compute the remaining portion of the per-backend buffer limit. */ if (stream->temporary) @@ -359,7 +380,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice) } static void -read_stream_look_ahead(ReadStream *stream, bool suppress_advice) +read_stream_look_ahead(ReadStream *stream) { while (stream->ios_in_progress < stream->max_ios && stream->pinned_buffers + stream->pending_read_nblocks < stream->distance) @@ -370,8 +391,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) if (stream->pending_read_nblocks == stream->io_combine_limit) { - read_stream_start_pending_read(stream, suppress_advice); - suppress_advice = false; + read_stream_start_pending_read(stream); continue; } @@ -404,15 +424,13 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) /* We have to start the pending read before we can build another. */ while (stream->pending_read_nblocks > 0) { - if (!read_stream_start_pending_read(stream, suppress_advice) || + if (!read_stream_start_pending_read(stream) || stream->ios_in_progress == stream->max_ios) { /* And we've hit a buffer or I/O limit. Rewind and wait. */ read_stream_unget_block(stream, blocknum); return; } - - suppress_advice = false; } /* This is the start of a new pending read. */ @@ -436,7 +454,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice) stream->pinned_buffers == 0) || stream->distance == 0) && stream->ios_in_progress < stream->max_ios) - read_stream_start_pending_read(stream, suppress_advice); + read_stream_start_pending_read(stream); /* * There should always be something pinned when we leave this function, @@ -612,6 +630,8 @@ read_stream_begin_impl(int flags, stream->callback = callback; stream->callback_private_data = callback_private_data; stream->buffered_blocknum = InvalidBlockNumber; + stream->seq_blocknum = InvalidBlockNumber; + stream->seq_until_processed = InvalidBlockNumber; stream->temporary = SmgrIsTemp(smgr); /* @@ -792,7 +812,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) * space for more, but if we're just starting up we'll need to crank * the handle to get started. */ - read_stream_look_ahead(stream, true); + read_stream_look_ahead(stream); /* End of stream reached? */ if (stream->pinned_buffers == 0) @@ -837,6 +857,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) distance = stream->distance * 2; distance = Min(distance, stream->max_pinned_buffers); stream->distance = distance; + + /* + * If we've caught up with the first advice issued for the current + * sequential region, cancel further advice until the next random + * jump. The kernel should be able to see the pattern now that + * we're actually making sequential preadv() calls. 
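+			 * From here on this sequential region relies on kernel
+			 * read-ahead alone; advice resumes only after the next random
+			 * jump sets a new seq_until_processed target.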
+ */ + if (stream->ios[io_index].op.blocknum == stream->seq_until_processed) + stream->seq_until_processed = InvalidBlockNumber; } else { @@ -898,7 +927,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->oldest_buffer_index = 0; /* Prepare for the next call. */ - read_stream_look_ahead(stream, false); + read_stream_look_ahead(stream); #ifndef READ_STREAM_DISABLE_FAST_PATH /* See if we can take the fast path for all-cached scans next time. */ From 977125f89e3c0b5d1313384f7a1559911c7dffa7 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 19 Feb 2025 01:25:40 +1300 Subject: [PATCH 4/9] Look ahead more when sequential in read_stream.c. Previously, sequential reads would cause the look-ahead distance to fall back to io_combine_limit, on the basis that kernel read-ahead should start helping. It also meant that we'd have to ramp the distance back up when a sequential region was followed by a burst of random jumps, with little hope of avoiding a stall, which is not a good trade-off and is incompatible with AIO plans (you have to look ahead if you have to start real I/O). Simplify the algorithm: now only cache hits make the look-ahead distance drop off, and cache misses still make it grow rapidly. Random vs sequential heuristics are no longer taken into consideration while making that decision. Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com --- src/backend/storage/aio/read_stream.c | 92 ++++++++++----------------- 1 file changed, 33 insertions(+), 59 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index a8a96baf8c16..57cde89cfdc6 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -17,30 +17,12 @@ * pending read. When that isn't possible, the existing pending read is sent * to StartReadBuffers() so that a new one can begin to form. * - * The algorithm for controlling the look-ahead distance tries to classify the - * stream into three ideal behaviors: + * The algorithm for controlling the look-ahead distance is based on recent + * cache hits and misses: * - * A) No I/O is necessary, because the requested blocks are fully cached - * already. There is no benefit to looking ahead more than one block, so - * distance is 1. This is the default initial assumption. - * - * B) I/O is necessary, but read-ahead advice is undesirable because the - * access is sequential and we can rely on the kernel's read-ahead heuristics, - * or impossible because direct I/O is enabled, or the system doesn't support - * read-ahead advice. There is no benefit in looking ahead more than - * io_combine_limit, because in this case the only goal is larger read system - * calls. Looking further ahead would pin many buffers and perform - * speculative work for no benefit. - * - * C) I/O is necessary, it appears to be random, and this system supports - * read-ahead advice. We'll look further ahead in order to reach the - * configured level of I/O concurrency. - * - * The distance increases rapidly and decays slowly, so that it moves towards - * those levels as different I/O patterns are discovered. For example, a - * sequential scan of fully cached data doesn't bother looking ahead, but a - * sequential scan that hits a region of uncached blocks will start issuing - * increasingly wide read calls until it plateaus at io_combine_limit. 
+ * When no I/O is necessary, there is no point in looking ahead more than one + * block. This is the default initial assumption. Otherwise rapidly increase + * the distance to try to benefit from I/O combining and I/O concurrency. * * The main data structure is a circular queue of buffers of size * max_pinned_buffers plus some extra space for technical reasons, ready to be @@ -336,7 +318,7 @@ read_stream_start_pending_read(ReadStream *stream) /* Remember whether we need to wait before returning this buffer. */ if (!need_wait) { - /* Look-ahead distance decays, no I/O necessary (behavior A). */ + /* Look-ahead distance decays, no I/O necessary. */ if (stream->distance > 1) stream->distance--; } @@ -517,6 +499,15 @@ read_stream_begin_impl(int flags, else max_ios = get_tablespace_io_concurrency(tablespace_id); + /* + * XXX Since we don't have asynchronous I/O yet, if direct I/O is enabled + * then just behave as though I/O concurrency is set to 0. Otherwise we + * would look ahead pinning many buffers for no benefit, for lack of + * advice and AIO. + */ + if (io_direct_flags & IO_DIRECT_DATA) + max_ios = 0; + /* Cap to INT16_MAX to avoid overflowing below */ max_ios = Min(max_ios, PG_INT16_MAX); @@ -637,7 +628,7 @@ read_stream_begin_impl(int flags, /* * Skip the initial ramp-up phase if the caller says we're going to be * reading the whole relation. This way we start out assuming we'll be - * doing full io_combine_limit sized reads (behavior B). + * doing full io_combine_limit sized reads. */ if (flags & READ_STREAM_FULL) stream->distance = Min(max_pinned_buffers, stream->io_combine_limit); @@ -728,10 +719,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) #ifndef READ_STREAM_DISABLE_FAST_PATH /* - * A fast path for all-cached scans (behavior A). This is the same as the - * usual algorithm, but it is specialized for no I/O and no per-buffer - * data, so we can skip the queue management code, stay in the same buffer - * slot and use singular StartReadBuffer(). + * A fast path for all-cached scans. This is the same as the usual + * algorithm, but it is specialized for no I/O and no per-buffer data, so + * we can skip the queue management code, stay in the same buffer slot and + * use singular StartReadBuffer(). */ if (likely(stream->fast_path)) { @@ -851,37 +842,20 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) if (++stream->oldest_io_index == stream->max_ios) stream->oldest_io_index = 0; - if (stream->ios[io_index].op.flags & READ_BUFFERS_ISSUE_ADVICE) - { - /* Distance ramps up fast (behavior C). */ - distance = stream->distance * 2; - distance = Min(distance, stream->max_pinned_buffers); - stream->distance = distance; + /* Look-ahead distance ramps up quickly after we do I/O. */ + distance = stream->distance * 2; + distance = Min(distance, stream->max_pinned_buffers); + stream->distance = distance; - /* - * If we've caught up with the first advice issued for the current - * sequential region, cancel further advice until the next random - * jump. The kernel should be able to see the pattern now that - * we're actually making sequential preadv() calls. - */ - if (stream->ios[io_index].op.blocknum == stream->seq_until_processed) - stream->seq_until_processed = InvalidBlockNumber; - } - else - { - /* No advice; move towards io_combine_limit (behavior B). 
*/ - if (stream->distance > stream->io_combine_limit) - { - stream->distance--; - } - else - { - distance = stream->distance * 2; - distance = Min(distance, stream->io_combine_limit); - distance = Min(distance, stream->max_pinned_buffers); - stream->distance = distance; - } - } + /* + * If we've caught up with the first advice issued for the current + * sequential region, cancel further advice until the next random + * jump. The kernel should be able to see the pattern now that we're + * actually making sequential preadv() calls. + */ + if (stream->advice_enabled && + stream->ios[io_index].op.blocknum == stream->seq_until_processed) + stream->seq_until_processed = InvalidBlockNumber; } #ifdef CLOBBER_FREED_MEMORY From 5d04d960e94009405c6013a437630b3102340d0c Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Thu, 30 Jan 2025 11:42:03 +1300 Subject: [PATCH 5/9] Support buffer forwarding in read_stream.c. In preparation for a following change to the buffer manager, teach read stream to keep track of buffers that were "forwarded" from one call to StartReadBuffers() to the next. Since StartReadBuffers() buffers argument will become an in/out argument, we need to initialize the buffer queue entries with InvalidBuffer. We don't want to do that up front, because we try to keep stream initialization cheap and code that uses the fast path stays in one single buffer queue element. Satisfy both goals by initializing the queue incrementally on the first cycle. Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com --- src/backend/storage/aio/read_stream.c | 102 +++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 10 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 57cde89cfdc6..51c15330117c 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -95,8 +95,10 @@ struct ReadStream int16 ios_in_progress; int16 queue_size; int16 max_pinned_buffers; + int16 forwarded_buffers; int16 pinned_buffers; int16 distance; + int16 initialized_buffers; bool advice_enabled; bool temporary; @@ -223,8 +225,10 @@ static bool read_stream_start_pending_read(ReadStream *stream) { bool need_wait; + int requested_nblocks; int nblocks; int flags = 0; + int forwarded; int16 io_index; int16 overflow; int16 buffer_index; @@ -275,11 +279,19 @@ read_stream_start_pending_read(ReadStream *stream) } } - /* Compute the remaining portion of the per-backend buffer limit. */ + /* + * Compute the remaining portion of the per-backend buffer limit. If we + * already have some forwarded buffers, we can certainly use those. They + * are already pinned, and are mapped to the starting blocks of the pending + * read, they just don't have any I/O started yet and are not counted in + * stream->pinned_buffers. + */ if (stream->temporary) buffer_limit = Min(GetAdditionalLocalPinLimit(), PG_INT16_MAX); else buffer_limit = Min(GetAdditionalPinLimit(), PG_INT16_MAX); + Assert(stream->forwarded_buffers <= stream->pending_read_nblocks); + buffer_limit += stream->forwarded_buffers; if (buffer_limit == 0 && stream->pinned_buffers == 0) buffer_limit = 1; /* guarantee progress */ @@ -306,8 +318,31 @@ read_stream_start_pending_read(ReadStream *stream) * We say how many blocks we want to read, but it may be smaller on return * if the buffer manager decides it needs a short read at its level. 
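+	 * requested_nblocks remembers how much we asked for, so that buffers
+	 * that come back pinned beyond the returned nblocks can be recognized
+	 * below as forwarded buffers.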
*/ + requested_nblocks = Min(buffer_limit, stream->pending_read_nblocks); + nblocks = requested_nblocks; buffer_index = stream->next_buffer_index; io_index = stream->next_io_index; + + /* + * The first time around the queue we initialize it as we go, including + * the overflow zone, because otherwise the entries would appear as + * forwarded buffers. This avoids initializing the whole queue up front + * in cases where it is large but we don't ever use it due to the + * all-cached fast path or small scans. + */ + while (stream->initialized_buffers < buffer_index + nblocks) + stream->buffers[stream->initialized_buffers++] = InvalidBuffer; + + /* + * Start the I/O. Any buffers that are not InvalidBuffer will be + * interpreted as already pinned, forwarded by an earlier call to + * StartReadBuffers(), and must map to the expected blocks. The nblocks + * value may be smaller on return indicating the size of the I/O that + * could be started. Buffers beyond the output nblocks number may also + * have been pinned without starting I/O due to various edge cases. In + * that case we'll just leave them in the queue ahead of us, "forwarded" + * to the next call, avoiding the need to unpin/repin. + */ need_wait = StartReadBuffers(&stream->ios[io_index].op, &stream->buffers[buffer_index], stream->pending_read_blocknum, @@ -336,16 +371,35 @@ read_stream_start_pending_read(ReadStream *stream) stream->seq_blocknum = stream->pending_read_blocknum + nblocks; } + /* + * How many pins were acquired but forwarded to the next call? These need + * to be passed to the next StartReadBuffers() call, or released if the + * stream ends early. We need the number for accounting purposes, since + * they are not counted in stream->pinned_buffers but we already hold + * them. + */ + forwarded = 0; + while (nblocks + forwarded < requested_nblocks && + stream->buffers[buffer_index + nblocks + forwarded] != InvalidBuffer) + forwarded++; + stream->forwarded_buffers = forwarded; + /* * We gave a contiguous range of buffer space to StartReadBuffers(), but - * we want it to wrap around at queue_size. Slide overflowing buffers to - * the front of the array. + * we want it to wrap around at queue_size. Copy overflowing buffers to + * the front of the array where they'll be consumed, but also leave a copy + * in the overflow zone which the I/O operation has a pointer to (it needs + * a contiguous array). Both copies will be cleared when the buffers are + * handed to the consumer. */ - overflow = (buffer_index + nblocks) - stream->queue_size; + overflow = (buffer_index + nblocks + forwarded) - stream->queue_size; if (overflow > 0) - memmove(&stream->buffers[0], - &stream->buffers[stream->queue_size], - sizeof(stream->buffers[0]) * overflow); + { + Assert(overflow < stream->queue_size); /* can't overlap */ + memcpy(&stream->buffers[0], + &stream->buffers[stream->queue_size], + sizeof(stream->buffers[0]) * overflow); + } /* Compute location of start of next read, without using % operator. */ buffer_index += nblocks; @@ -730,10 +784,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) /* Fast path assumptions. */ Assert(stream->ios_in_progress == 0); + Assert(stream->forwarded_buffers == 0); Assert(stream->pinned_buffers == 1); Assert(stream->distance == 1); Assert(stream->pending_read_nblocks == 0); Assert(stream->per_buffer_data_size == 0); + Assert(stream->initialized_buffers > stream->oldest_buffer_index); /* We're going to return the buffer we pinned last time. 
*/ oldest_buffer_index = stream->oldest_buffer_index; @@ -782,6 +838,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->distance = 0; stream->oldest_buffer_index = stream->next_buffer_index; stream->pinned_buffers = 0; + stream->buffers[oldest_buffer_index] = InvalidBuffer; } stream->fast_path = false; @@ -858,10 +915,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) stream->seq_until_processed = InvalidBlockNumber; } -#ifdef CLOBBER_FREED_MEMORY - /* Clobber old buffer for debugging purposes. */ + /* + * We must zap this queue entry, or else it would appear as a forwarded + * buffer. If it's potentially in the overflow zone (ie it wrapped around + * the queue), also zap that copy. + */ stream->buffers[oldest_buffer_index] = InvalidBuffer; -#endif + if (oldest_buffer_index < io_combine_limit - 1) + stream->buffers[stream->queue_size + oldest_buffer_index] = + InvalidBuffer; #if defined(CLOBBER_FREED_MEMORY) || defined(USE_VALGRIND) @@ -906,6 +968,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) #ifndef READ_STREAM_DISABLE_FAST_PATH /* See if we can take the fast path for all-cached scans next time. */ if (stream->ios_in_progress == 0 && + stream->forwarded_buffers == 0 && stream->pinned_buffers == 1 && stream->distance == 1 && stream->pending_read_nblocks == 0 && @@ -941,6 +1004,7 @@ read_stream_next_block(ReadStream *stream, BufferAccessStrategy *strategy) void read_stream_reset(ReadStream *stream) { + int16 index; Buffer buffer; /* Stop looking ahead. */ @@ -957,6 +1021,24 @@ read_stream_reset(ReadStream *stream) while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer) ReleaseBuffer(buffer); + /* Unpin any unused forwarded buffers. */ + index = stream->next_buffer_index; + while (index < stream->initialized_buffers && + (buffer = stream->buffers[index]) != InvalidBuffer) + { + Assert(stream->forwarded_buffers > 0); + stream->forwarded_buffers--; + ReleaseBuffer(buffer); + + stream->buffers[index] = InvalidBuffer; + if (index < io_combine_limit - 1) + stream->buffers[stream->queue_size + index] = InvalidBuffer; + + if (++index == stream->queue_size) + index = 0; + } + + Assert(stream->forwarded_buffers == 0); Assert(stream->pinned_buffers == 0); Assert(stream->ios_in_progress == 0); From 0ba3a9cc2af75f44387e82167397049ae6856c45 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 10 Feb 2025 21:55:40 +1300 Subject: [PATCH 6/9] Support buffer forwarding in StartReadBuffers(). Sometimes we have to perform a short read because we hit a cached block that ends a contiguous run of blocks requiring I/O. We don't want StartReadBuffers() to have to start more than one I/O, so we stop there. We also don't want to have to unpin the cached block (and repin it later), so previously we'd silently pretend the hit was part of the I/O, and just leave it out of the read from disk. Now, we'll "forward" it to the next call. We still write it to the buffers[] array for the caller to pass back to us later, but it's not included in *nblocks. This policy means that we no longer mix hits and misses in a single operation's results, so we avoid the requirement to call WaitReadBuffers(), which might stall, before the caller can make use of the hits. The caller will get the hit in the next call instead, and know that it doesn't have to wait. That's important for later work on out-of-order read streams that minimize I/O stalls. 
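
For example (made-up block numbers): if a call covering blocks 10..17 finds
block 13 already valid in the buffer pool, it starts one I/O for blocks
10..12, returns *nblocks = 3, and leaves the still-pinned buffer for block 13
in buffers[3].  The next call receives that buffer back at the start of its
buffers array, treats it as a hit, and returns it without requiring
WaitReadBuffers().
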
This also makes life easier for proposed work on true AIO, which occasionally needs to split a large I/O after pinning all the buffers, while the current coding only ever forwards a single bookending hit. This API is natural for read_stream.c: it just leaves forwarded buffers where they are in its circular queue, where the next call will pick them up and continue, minimizing pin churn. If we ever think of a good reason to disable this feature, i.e. for other users of StartReadBuffers() that don't want to deal with forwarded buffers, then we could add a flag for that. For now read_steam.c is the only user. Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com --- src/backend/storage/buffer/bufmgr.c | 128 ++++++++++++++++++++-------- src/include/storage/bufmgr.h | 1 - 2 files changed, 91 insertions(+), 38 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a6138e79306f..d56bff96cec4 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1257,10 +1257,10 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, - int flags) + int flags, + bool allow_forwarding) { int actual_nblocks = *nblocks; - int io_buffers_len = 0; int maxcombine = 0; Assert(*nblocks > 0); @@ -1270,30 +1270,80 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, { bool found; - buffers[i] = PinBufferForBlock(operation->rel, - operation->smgr, - operation->persistence, - operation->forknum, - blockNum + i, - operation->strategy, - &found); + if (allow_forwarding && buffers[i] != InvalidBuffer) + { + BufferDesc *bufHdr; + + /* + * This is a buffer that was pinned by an earlier call to + * StartReadBuffers(), but couldn't be handled in one operation at + * that time. The operation was split, and the caller has passed + * an already pinned buffer back to us to handle the rest of the + * operation. It must continue at the expected block number. + */ + Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i); + + /* + * It might be an already valid buffer (a hit) that followed the + * final contiguous block of an earlier I/O (a miss) marking the + * end of it, or a buffer that some other backend has since made + * valid by performing the I/O for us, in which case we can handle + * it as a hit now. It is safe to check for a BM_VALID flag with + * a relaxed load, because we got a fresh view of it while pinning + * it in the previous call. + * + * On the other hand if we don't see BM_VALID yet, it must be an + * I/O that was split by the previous call and we need to try to + * start a new I/O from this block. We're also racing against any + * other backend that might start the I/O or even manage to mark + * it BM_VALID after this check, BM_VALID after this check, but + * StartBufferIO() will handle those cases. + */ + if (BufferIsLocal(buffers[i])) + bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1); + else + bufHdr = GetBufferDescriptor(buffers[i] - 1); + found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID; + } + else + { + buffers[i] = PinBufferForBlock(operation->rel, + operation->smgr, + operation->persistence, + operation->forknum, + blockNum + i, + operation->strategy, + &found); + } if (found) { /* - * Terminate the read as soon as we get a hit. It could be a - * single buffer hit, or it could be a hit that follows a readable - * range. 
We don't want to create more than one readable range, - * so we stop here. + * We have a hit. If it's the first block in the requested range, + * we can return it immediately and report that WaitReadBuffers() + * does not need to be called. If the initial value of *nblocks + * was larger, the caller will have to call again for the rest. */ - actual_nblocks = i + 1; + if (i == 0) + { + *nblocks = 1; + return false; + } + + /* + * Otherwise we already have an I/O to perform, but this block + * can't be included as it is already valid. Split the I/O here. + * There may or may not be more blocks requiring I/O after this + * one, we haven't checked, but it can't be contiguous with this + * hit in the way. We'll leave this buffer pinned, forwarding it + * to the next call, avoiding the need to unpin it here and re-pin + * it in the next call. + */ + actual_nblocks = i; break; } else { - /* Extend the readable range to cover this block. */ - io_buffers_len++; - /* * Check how many blocks we can cover with the same IO. The smgr * implementation might e.g. be limited due to a segment boundary. @@ -1314,15 +1364,11 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, } *nblocks = actual_nblocks; - if (likely(io_buffers_len == 0)) - return false; - /* Populate information needed for I/O. */ operation->buffers = buffers; operation->blocknum = blockNum; operation->flags = flags; operation->nblocks = actual_nblocks; - operation->io_buffers_len = io_buffers_len; if (flags & READ_BUFFERS_ISSUE_ADVICE) { @@ -1337,7 +1383,7 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, smgrprefetch(operation->smgr, operation->forknum, blockNum, - operation->io_buffers_len); + actual_nblocks); } /* Indicate that WaitReadBuffers() should be called. */ @@ -1351,11 +1397,21 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, * actual number, which may be fewer than requested. Caller sets some of the * members of operation; see struct definition. * + * The initial contents of the elements of buffers up to *nblocks should + * either be InvalidBuffer or an already-pinned buffer that was left by an + * preceding call to StartReadBuffers() that had to be split. On return, some + * elements of buffers may hold pinned buffers beyond the number indicated by + * the updated value of *nblocks. Operations are split on boundaries known to + * smgr (eg md.c segment boundaries that require crossing into a different + * underlying file), or when already cached blocks are found in the buffer + * that prevent the formation of a contiguous read. + * * If false is returned, no I/O is necessary. If true is returned, one I/O * has been started, and WaitReadBuffers() must be called with the same * operation object before the buffers are accessed. Along with the operation * object, the caller-supplied array of buffers must remain valid until - * WaitReadBuffers() is called. + * WaitReadBuffers() is called, and any forwarded buffers must also be + * preserved for a future call unless explicitly released. * * Currently the I/O is only started with optional operating system advice if * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O @@ -1369,13 +1425,18 @@ StartReadBuffers(ReadBuffersOperation *operation, int *nblocks, int flags) { - return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags); + return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags, + true /* expect forwarded buffers */ ); } /* * Single block version of the StartReadBuffers(). 
This might save a few * instructions when called from another translation unit, because it is * specialized for nblocks == 1. + * + * This version does not support "forwarded" buffers: they cannot be created + * by reading only one block, and the current contents of *buffer is ignored + * on entry. */ bool StartReadBuffer(ReadBuffersOperation *operation, @@ -1386,7 +1447,8 @@ StartReadBuffer(ReadBuffersOperation *operation, int nblocks = 1; bool result; - result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags); + result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags, + false /* single block, no forwarding */ ); Assert(nblocks == 1); /* single block can't be short */ return result; @@ -1416,24 +1478,16 @@ WaitReadBuffers(ReadBuffersOperation *operation) IOObject io_object; char persistence; - /* - * Currently operations are only allowed to include a read of some range, - * with an optional extra buffer that is already pinned at the end. So - * nblocks can be at most one more than io_buffers_len. - */ - Assert((operation->nblocks == operation->io_buffers_len) || - (operation->nblocks == operation->io_buffers_len + 1)); - /* Find the range of the physical read we need to perform. */ - nblocks = operation->io_buffers_len; - if (nblocks == 0) - return; /* nothing to do */ - + nblocks = operation->nblocks; buffers = &operation->buffers[0]; blocknum = operation->blocknum; forknum = operation->forknum; persistence = operation->persistence; + Assert(nblocks > 0); + Assert(nblocks <= MAX_IO_COMBINE_LIMIT); + if (persistence == RELPERSISTENCE_TEMP) { io_context = IOCONTEXT_NORMAL; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 74b5afe8a1ad..307f36af3849 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -130,7 +130,6 @@ struct ReadBuffersOperation BlockNumber blocknum; int flags; int16 nblocks; - int16 io_buffers_len; }; typedef struct ReadBuffersOperation ReadBuffersOperation; From f850d35d70f187b7dc6e807195ed0aca5707a6c9 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 20 Feb 2025 12:30:26 -0500 Subject: [PATCH 7/9] Separate TBM[Shared|Private]Iterator and TBMIterateResult Remove the TBMIterateResult member from the TBMPrivateIterator and TBMSharedIterator and make tbm_[shared|private_]iterate() take a TBMIterateResult as a parameter. This allows tidbitmap API users to manage multiple TBMIterateResults per scan. This is required for bitmap heap scan to use the read stream API, with which there may be multiple I/Os in flight at once, each one with a TBMIterateResult. 
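
In caller terms the new shape is roughly this (a sketch, not verbatim from
the patch):

    TBMIterateResult tbmres;    /* caller-owned storage */

    while (tbm_private_iterate(iterator, &tbmres))
    {
        /* use tbmres.blockno, tbmres.lossy, tbmres.recheck, ... */
    }

rather than repeatedly receiving a pointer to the iterator's single internal
result.
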
Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/d4bb26c9-fe07-439e-ac53-c0e244387e01%40vondra.me --- src/backend/access/gin/ginget.c | 33 ++++---- src/backend/access/gin/ginscan.c | 2 +- src/backend/access/heap/heapam_handler.c | 28 +++---- src/backend/executor/nodeBitmapHeapscan.c | 39 +++++---- src/backend/nodes/tidbitmap.c | 96 ++++++++++++----------- src/include/access/gin_private.h | 7 +- src/include/nodes/tidbitmap.h | 6 +- 7 files changed, 110 insertions(+), 101 deletions(-) diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 4a56f19390dc..f29ccd3c2d1f 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -332,8 +332,8 @@ startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot) entry->list = NULL; entry->nlist = 0; entry->matchBitmap = NULL; - entry->matchResult = NULL; entry->matchNtuples = -1; + entry->matchResult.blockno = InvalidBlockNumber; entry->reduceResult = false; entry->predictNumberResult = 0; @@ -827,20 +827,19 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, { /* * If we've exhausted all items on this block, move to next block - * in the bitmap. + * in the bitmap. tbm_private_iterate() sets matchResult.blockno + * to InvalidBlockNumber when the bitmap is exhausted. */ - while (entry->matchResult == NULL || - (!entry->matchResult->lossy && + while ((!BlockNumberIsValid(entry->matchResult.blockno)) || + (!entry->matchResult.lossy && entry->offset >= entry->matchNtuples) || - entry->matchResult->blockno < advancePastBlk || + entry->matchResult.blockno < advancePastBlk || (ItemPointerIsLossyPage(&advancePast) && - entry->matchResult->blockno == advancePastBlk)) + entry->matchResult.blockno == advancePastBlk)) { - entry->matchResult = - tbm_private_iterate(entry->matchIterator); - - if (entry->matchResult == NULL) + if (!tbm_private_iterate(entry->matchIterator, &entry->matchResult)) { + Assert(!BlockNumberIsValid(entry->matchResult.blockno)); ItemPointerSetInvalid(&entry->curItem); tbm_end_private_iterate(entry->matchIterator); entry->matchIterator = NULL; @@ -849,14 +848,14 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, } /* Exact pages need their tuple offsets extracted. */ - if (!entry->matchResult->lossy) - entry->matchNtuples = tbm_extract_page_tuple(entry->matchResult, + if (!entry->matchResult.lossy) + entry->matchNtuples = tbm_extract_page_tuple(&entry->matchResult, entry->matchOffsets, TBM_MAX_TUPLES_PER_PAGE); /* * Reset counter to the beginning of entry->matchResult. Note: - * entry->offset is still greater than entry->matchNtuples if + * entry->offset is still greater than matchResult.ntuples if * matchResult is lossy. So, on next call we will get next * result from TIDBitmap. */ @@ -869,10 +868,10 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, * We're now on the first page after advancePast which has any * items on it. If it's a lossy result, return that. */ - if (entry->matchResult->lossy) + if (entry->matchResult.lossy) { ItemPointerSetLossyPage(&entry->curItem, - entry->matchResult->blockno); + entry->matchResult.blockno); /* * We might as well fall out of the loop; we could not @@ -889,7 +888,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, Assert(entry->matchNtuples > -1); /* Skip over any offsets <= advancePast, and return that. 
*/ - if (entry->matchResult->blockno == advancePastBlk) + if (entry->matchResult.blockno == advancePastBlk) { Assert(entry->matchNtuples > 0); @@ -910,7 +909,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, } ItemPointerSet(&entry->curItem, - entry->matchResult->blockno, + entry->matchResult.blockno, entry->matchOffsets[entry->offset]); entry->offset++; diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index f6cdd098a028..c2d1771bd77b 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -111,7 +111,7 @@ ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum, ItemPointerSetMin(&scanEntry->curItem); scanEntry->matchBitmap = NULL; scanEntry->matchIterator = NULL; - scanEntry->matchResult = NULL; + scanEntry->matchResult.blockno = InvalidBlockNumber; scanEntry->matchNtuples = -1; scanEntry->list = NULL; scanEntry->nlist = 0; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index d74f0fbc5cd1..bd49ff969f51 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2126,7 +2126,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, Buffer buffer; Snapshot snapshot; int ntup; - TBMIterateResult *tbmres; + TBMIterateResult tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; @@ -2142,14 +2142,12 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, { CHECK_FOR_INTERRUPTS(); - tbmres = tbm_iterate(&scan->st.rs_tbmiterator); - - if (tbmres == NULL) + if (!tbm_iterate(&scan->st.rs_tbmiterator, &tbmres)) return false; /* Exact pages need their tuple offsets extracted. */ - if (!tbmres->lossy) - noffsets = tbm_extract_page_tuple(tbmres, offsets, + if (!tbmres.lossy) + noffsets = tbm_extract_page_tuple(&tbmres, offsets, TBM_MAX_TUPLES_PER_PAGE); /* @@ -2161,11 +2159,11 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, * reachable by the index. */ } while (!IsolationIsSerializable() && - tbmres->blockno >= hscan->rs_nblocks); + tbmres.blockno >= hscan->rs_nblocks); /* Got a valid block */ - *blockno = tbmres->blockno; - *recheck = tbmres->recheck; + *blockno = tbmres.blockno; + *recheck = tbmres.recheck; /* * We can skip fetching the heap page if we don't need any fields from the @@ -2173,11 +2171,11 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, * page are visible to our transaction. */ if (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmres->recheck && - VM_ALL_VISIBLE(scan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer)) + !tbmres.recheck && + VM_ALL_VISIBLE(scan->rs_rd, tbmres.blockno, &bscan->rs_vmbuffer)) { /* can't be lossy in the skip_fetch case */ - Assert(!tbmres->lossy); + Assert(!tbmres.lossy); Assert(bscan->rs_empty_tuples_pending >= 0); Assert(noffsets > -1); @@ -2186,7 +2184,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, return true; } - block = tbmres->blockno; + block = tbmres.blockno; /* * Acquire pin on the target heap page, trading in any pin we held before. @@ -2215,7 +2213,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, /* * We need two separate strategies for lossy and non-lossy cases. 
*/ - if (!tbmres->lossy) + if (!tbmres.lossy) { /* * Bitmap is non-lossy, so we just look through the offsets listed in @@ -2279,7 +2277,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, Assert(ntup <= MaxHeapTuplesPerPage); hscan->rs_ntuples = ntup; - if (tbmres->lossy) + if (tbmres.lossy) (*lossy_pages)++; else (*exact_pages)++; diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index be0d24d901b5..3b4ea0f61446 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -317,7 +317,7 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) { #ifdef USE_PREFETCH ParallelBitmapHeapState *pstate = node->pstate; - TBMIterateResult *tbmpre; + TBMIterateResult tbmpre; if (pstate == NULL) { @@ -330,9 +330,8 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) } else if (!tbm_exhausted(prefetch_iterator)) { - tbmpre = tbm_iterate(prefetch_iterator); - node->prefetch_blockno = tbmpre ? tbmpre->blockno : - InvalidBlockNumber; + tbm_iterate(prefetch_iterator, &tbmpre); + node->prefetch_blockno = tbmpre.blockno; } return; } @@ -371,9 +370,8 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) */ if (!tbm_exhausted(prefetch_iterator)) { - tbmpre = tbm_iterate(prefetch_iterator); - node->prefetch_blockno = tbmpre ? tbmpre->blockno : - InvalidBlockNumber; + tbm_iterate(prefetch_iterator, &tbmpre); + node->prefetch_blockno = tbmpre.blockno; } } } @@ -441,17 +439,18 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) { while (node->prefetch_pages < node->prefetch_target) { - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + TBMIterateResult tbmpre; bool skip_fetch; - if (tbmpre == NULL) + if (!tbm_iterate(prefetch_iterator, &tbmpre)) { /* No more pages to prefetch */ + Assert(!BlockNumberIsValid(tbmpre.blockno)); tbm_end_iterate(prefetch_iterator); break; } node->prefetch_pages++; - node->prefetch_blockno = tbmpre->blockno; + node->prefetch_blockno = tbmpre.blockno; /* * If we expect not to have to actually read this heap page, @@ -460,13 +459,13 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) * prefetch_pages?) 
*/ skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre->recheck && + !tbmpre.recheck && VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, + tbmpre.blockno, &node->pvmbuffer)); if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno); } } @@ -481,7 +480,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) { while (1) { - TBMIterateResult *tbmpre; + TBMIterateResult tbmpre; bool do_prefetch = false; bool skip_fetch; @@ -500,25 +499,25 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) if (!do_prefetch) return; - tbmpre = tbm_iterate(prefetch_iterator); - if (tbmpre == NULL) + if (!tbm_iterate(prefetch_iterator, &tbmpre)) { + Assert(!BlockNumberIsValid(tbmpre.blockno)); /* No more pages to prefetch */ tbm_end_iterate(prefetch_iterator); break; } - node->prefetch_blockno = tbmpre->blockno; + node->prefetch_blockno = tbmpre.blockno; /* As above, skip prefetch if we expect not to need page */ skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre->recheck && + !tbmpre.recheck && VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, + tbmpre.blockno, &node->pvmbuffer)); if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno); } } } diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 3d835024caa2..41031aa8f2fa 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -172,7 +172,6 @@ struct TBMPrivateIterator int spageptr; /* next spages index */ int schunkptr; /* next schunks index */ int schunkbit; /* next bit to check in current schunk */ - TBMIterateResult output; }; /* @@ -213,7 +212,6 @@ struct TBMSharedIterator PTEntryArray *ptbase; /* pagetable element array */ PTIterationArray *ptpages; /* sorted exact page index list */ PTIterationArray *ptchunks; /* sorted lossy page index list */ - TBMIterateResult output; }; /* Local function prototypes */ @@ -957,21 +955,28 @@ tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp) /* * tbm_private_iterate - scan through next page of a TIDBitmap * - * Returns a TBMIterateResult representing one page, or NULL if there are - * no more pages to scan. Pages are guaranteed to be delivered in numerical - * order. If lossy is true, then the bitmap is "lossy" and failed to - * remember the exact tuples to look at on this page --- the caller must - * examine all tuples on the page and check if they meet the intended - * condition. result->ntuples is set to -1 when the bitmap is lossy. - * If result->recheck is true, only the indicated tuples need - * be examined, but the condition must be rechecked anyway. (For ease of - * testing, recheck is always set true when lossy is true.) + * Caller must pass in a TBMIterateResult to be filled. + * + * Pages are guaranteed to be delivered in numerical order. + * + * Returns false when there are no more pages to scan and true otherwise. When + * there are no more pages to scan, tbmres->blockno is set to + * InvalidBlockNumber. + * + * If lossy is true, then the bitmap is "lossy" and failed to remember + * the exact tuples to look at on this page --- the caller must examine all + * tuples on the page and check if they meet the intended condition. If lossy + * is false, the caller must later extract the tuple offsets from the page + * pointed to by internal_page with tbm_extract_page_tuple. 
+ * + * If tbmres->recheck is true, only the indicated tuples need be examined, but + * the condition must be rechecked anyway. (For ease of testing, recheck is + * always set true when lossy is true.) */ -TBMIterateResult * -tbm_private_iterate(TBMPrivateIterator *iterator) +bool +tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres) { TIDBitmap *tbm = iterator->tbm; - TBMIterateResult *output = &(iterator->output); Assert(tbm->iterating == TBM_ITERATING_PRIVATE); @@ -1009,12 +1014,12 @@ tbm_private_iterate(TBMPrivateIterator *iterator) chunk_blockno < tbm->spages[iterator->spageptr]->blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->lossy = true; - output->recheck = true; - output->internal_page = NULL; + tbmres->blockno = chunk_blockno; + tbmres->lossy = true; + tbmres->recheck = true; + tbmres->internal_page = NULL; iterator->schunkbit++; - return output; + return true; } } @@ -1028,16 +1033,17 @@ tbm_private_iterate(TBMPrivateIterator *iterator) else page = tbm->spages[iterator->spageptr]; - output->internal_page = page; - output->blockno = page->blockno; - output->lossy = false; - output->recheck = page->recheck; + tbmres->internal_page = page; + tbmres->blockno = page->blockno; + tbmres->lossy = false; + tbmres->recheck = page->recheck; iterator->spageptr++; - return output; + return true; } /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; + return false; } /* @@ -1047,10 +1053,9 @@ tbm_private_iterate(TBMPrivateIterator *iterator) * across multiple processes. We need to acquire the iterator LWLock, * before accessing the shared members. */ -TBMIterateResult * -tbm_shared_iterate(TBMSharedIterator *iterator) +bool +tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres) { - TBMIterateResult *output = &iterator->output; TBMSharedIteratorState *istate = iterator->state; PagetableEntry *ptbase = NULL; int *idxpages = NULL; @@ -1101,14 +1106,14 @@ tbm_shared_iterate(TBMSharedIterator *iterator) chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->lossy = true; - output->recheck = true; - output->internal_page = NULL; + tbmres->blockno = chunk_blockno; + tbmres->lossy = true; + tbmres->recheck = true; + tbmres->internal_page = NULL; istate->schunkbit++; LWLockRelease(&istate->lock); - return output; + return true; } } @@ -1116,21 +1121,22 @@ tbm_shared_iterate(TBMSharedIterator *iterator) { PagetableEntry *page = &ptbase[idxpages[istate->spageptr]]; - output->internal_page = page; - output->blockno = page->blockno; - output->lossy = false; - output->recheck = page->recheck; + tbmres->internal_page = page; + tbmres->blockno = page->blockno; + tbmres->lossy = false; + tbmres->recheck = page->recheck; istate->spageptr++; LWLockRelease(&istate->lock); - return output; + return true; } LWLockRelease(&istate->lock); /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; + return false; } /* @@ -1604,15 +1610,17 @@ tbm_end_iterate(TBMIterator *iterator) } /* - * Get the next TBMIterateResult from the shared or private bitmap iterator. + * Populate the next TBMIterateResult using the shared or private bitmap + * iterator. Returns false when there is nothing more to scan. 
*/ -TBMIterateResult * -tbm_iterate(TBMIterator *iterator) +bool +tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres) { Assert(iterator); + Assert(tbmres); if (iterator->shared) - return tbm_shared_iterate(iterator->i.shared_iterator); + return tbm_shared_iterate(iterator->i.shared_iterator, tbmres); else - return tbm_private_iterate(iterator->i.private_iterator); + return tbm_private_iterate(iterator->i.private_iterator, tbmres); } diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 95d8805b66f2..aee1f70c22ee 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -354,7 +354,12 @@ typedef struct GinScanEntryData /* for a partial-match or full-scan query, we accumulate all TIDs here */ TIDBitmap *matchBitmap; TBMPrivateIterator *matchIterator; - TBMIterateResult *matchResult; + + /* + * If blockno is InvalidBlockNumber, all of the other fields in the + * matchResult are meaningless. + */ + TBMIterateResult matchResult; OffsetNumber matchOffsets[TBM_MAX_TUPLES_PER_PAGE]; int matchNtuples; diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index e185635c10be..99f795ceab52 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -101,8 +101,8 @@ extern bool tbm_is_empty(const TIDBitmap *tbm); extern TBMPrivateIterator *tbm_begin_private_iterate(TIDBitmap *tbm); extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm); -extern TBMIterateResult *tbm_private_iterate(TBMPrivateIterator *iterator); -extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator); +extern bool tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres); +extern bool tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres); extern void tbm_end_private_iterate(TBMPrivateIterator *iterator); extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa, @@ -113,7 +113,7 @@ extern TBMIterator tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp); extern void tbm_end_iterate(TBMIterator *iterator); -extern TBMIterateResult *tbm_iterate(TBMIterator *iterator); +extern bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres); static inline bool tbm_exhausted(TBMIterator *iterator) From 1455c2ea34433beff2fee77a9b45bfe2639775e6 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Fri, 21 Feb 2025 15:15:47 -0500 Subject: [PATCH 8/9] BitmapHeapScan uses the read stream API Make Bitmap Heap Scan use the read stream API instead of invoking ReadBuffer() for each block indicated by the bitmap. The read stream API handles prefetching, so remove all of the explicit prefetching from bitmap heap scan code. Now, heap table AM implements a read stream callback which uses the bitmap iterator to return the next required block to the read stream code. 
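To make the new control flow concrete, a minimal sketch of the two halves of
that interaction follows. It is illustrative only: ExampleBitmapScan,
example_read_next() and example_consume_page() are invented names for this
sketch (the real code keeps the iterator and read stream in the heap scan
descriptor), but the tidbitmap and read stream calls are the ones this patch
series uses, including the new bool-returning tbm_iterate() that fills a
caller-provided TBMIterateResult.

typedef struct ExampleBitmapScan
{
	TBMIterator	tbmiterator;	/* shared or private bitmap iterator */
	ReadStream *read_stream;	/* stream created with example_read_next */
} ExampleBitmapScan;

/*
 * Read stream callback: advance the bitmap iterator and hand the next block
 * number to the stream.  The TBMIterateResult is written into the per-buffer
 * data, so the lossy/recheck information travels with the buffer.
 */
static BlockNumber
example_read_next(ReadStream *stream, void *private_data,
				  void *per_buffer_data)
{
	ExampleBitmapScan *scan = (ExampleBitmapScan *) private_data;
	TBMIterateResult *tbmres = (TBMIterateResult *) per_buffer_data;

	if (!tbm_iterate(&scan->tbmiterator, tbmres))
		return InvalidBlockNumber;	/* bitmap exhausted, stream ends */

	return tbmres->blockno;
}

/*
 * Consumer side: instead of calling tbm_iterate() and ReadBuffer() itself,
 * the scan pulls already-pinned buffers from the stream and reads the
 * matching TBMIterateResult back out of the per-buffer data.
 */
static bool
example_consume_page(ExampleBitmapScan *scan, Buffer *buffer,
					 TBMIterateResult **tbmres)
{
	void	   *per_buffer_data;

	*buffer = read_stream_next_buffer(scan->read_stream, &per_buffer_data);
	if (BufferIsInvalid(*buffer))
		return false;			/* stream (and therefore bitmap) exhausted */

	*tbmres = (TBMIterateResult *) per_buffer_data;
	return true;
}

The stream would be created with sizeof(TBMIterateResult) as the per-buffer
data size, as heap_beginscan() does with read_stream_begin_relation() in the
diff below.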
Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/d4bb26c9-fe07-439e-ac53-c0e244387e01%40vondra.me --- src/backend/access/heap/heapam.c | 80 +++++ src/backend/access/heap/heapam_handler.c | 90 +++--- src/backend/executor/nodeBitmapHeapscan.c | 341 +--------------------- src/include/access/tableam.h | 25 +- src/include/nodes/execnodes.h | 23 +- 5 files changed, 124 insertions(+), 435 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index fa7935a0ed39..65e44b25b74e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -279,6 +279,72 @@ heap_scan_stream_read_next_serial(ReadStream *stream, return scan->rs_prefetch_block; } +/* + * Read stream API callback for bitmap heap scans. + * Returns the next block the caller wants from the read stream or + * InvalidBlockNumber when done. + */ +static BlockNumber +bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data, + void *per_buffer_data) +{ + TBMIterateResult *tbmres = per_buffer_data; + BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) private_data; + HeapScanDesc hscan = (HeapScanDesc) bscan; + TableScanDesc sscan = &hscan->rs_base; + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + /* no more entries in the bitmap */ + if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres)) + return InvalidBlockNumber; + + /* + * Ignore any claimed entries past what we think is the end of the + * relation. It may have been extended after the start of our scan (we + * only hold an AccessShareLock, and it could be inserts from this + * backend). We don't take this optimization in SERIALIZABLE + * isolation though, as we need to examine all invisible tuples + * reachable by the index. + */ + if (!IsolationIsSerializable() && + tbmres->blockno >= hscan->rs_nblocks) + continue; + + /* + * We can skip fetching the heap page if we don't need any fields from + * the heap, the bitmap entries don't need rechecking, and all tuples + * on the page are visible to our transaction. + */ + if (!(sscan->rs_flags & SO_NEED_TUPLES) && + !tbmres->recheck && + VM_ALL_VISIBLE(sscan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer)) + { + OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; + int noffsets; + + /* can't be lossy in the skip_fetch case */ + Assert(!tbmres->lossy); + Assert(bscan->rs_empty_tuples_pending >= 0); + + /* + * We throw away the offsets, but this is the easiest way to get a + * count of tuples. + */ + noffsets = tbm_extract_page_tuple(tbmres, offsets, TBM_MAX_TUPLES_PER_PAGE); + bscan->rs_empty_tuples_pending += noffsets; + continue; + } + + return tbmres->blockno; + } + + /* not reachable */ + Assert(false); +} + /* ---------------- * initscan - scan code common to heap_beginscan and heap_rescan * ---------------- @@ -1067,6 +1133,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan->rs_base.rs_flags = flags; scan->rs_base.rs_parallel = parallel_scan; scan->rs_strategy = NULL; /* set in initscan */ + scan->rs_cbuf = InvalidBuffer; /* * Disable page-at-a-time mode if it's not a MVCC-safe snapshot. 
@@ -1146,6 +1213,16 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan, 0); } + else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN) + { + scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_DEFAULT, + scan->rs_strategy, + scan->rs_base.rs_rd, + MAIN_FORKNUM, + bitmapheap_stream_read_next, + scan, + sizeof(TBMIterateResult)); + } return (TableScanDesc) scan; @@ -1180,7 +1257,10 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, * unpin scan buffers */ if (BufferIsValid(scan->rs_cbuf)) + { ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + } if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN) { diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index bd49ff969f51..f0da9b6fb845 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2117,82 +2117,72 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths, static bool heapam_scan_bitmap_next_block(TableScanDesc scan, - BlockNumber *blockno, bool *recheck, + bool *recheck, uint64 *lossy_pages, uint64 *exact_pages) { BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan; HeapScanDesc hscan = (HeapScanDesc) bscan; BlockNumber block; + void *per_buffer_data; Buffer buffer; Snapshot snapshot; int ntup; - TBMIterateResult tbmres; + TBMIterateResult *tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); + Assert(hscan->rs_read_stream); hscan->rs_cindex = 0; hscan->rs_ntuples = 0; - *blockno = InvalidBlockNumber; - *recheck = true; - - do + /* Release buffer containing previous block. */ + if (BufferIsValid(hscan->rs_cbuf)) { - CHECK_FOR_INTERRUPTS(); + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + } - if (!tbm_iterate(&scan->st.rs_tbmiterator, &tbmres)) - return false; + hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream, + &per_buffer_data); - /* Exact pages need their tuple offsets extracted. */ - if (!tbmres.lossy) - noffsets = tbm_extract_page_tuple(&tbmres, offsets, - TBM_MAX_TUPLES_PER_PAGE); + if (BufferIsInvalid(hscan->rs_cbuf)) + { + if (BufferIsValid(bscan->rs_vmbuffer)) + { + ReleaseBuffer(bscan->rs_vmbuffer); + bscan->rs_vmbuffer = InvalidBuffer; + } /* - * Ignore any claimed entries past what we think is the end of the - * relation. It may have been extended after the start of our scan (we - * only hold an AccessShareLock, and it could be inserts from this - * backend). We don't take this optimization in SERIALIZABLE - * isolation though, as we need to examine all invisible tuples - * reachable by the index. + * Bitmap is exhausted. Time to emit empty tuples if relevant. We emit + * all empty tuples at the end instead of emitting them per block we + * skip fetching. This is necessary because the streaming read API + * will only return TBMIterateResults for blocks actually fetched. + * When we skip fetching a block, we keep track of how many empty + * tuples to emit at the end of the BitmapHeapScan. We do not recheck + * all NULL tuples. */ - } while (!IsolationIsSerializable() && - tbmres.blockno >= hscan->rs_nblocks); + *recheck = false; + return bscan->rs_empty_tuples_pending > 0; + } - /* Got a valid block */ - *blockno = tbmres.blockno; - *recheck = tbmres.recheck; + Assert(per_buffer_data); - /* - * We can skip fetching the heap page if we don't need any fields from the - * heap, the bitmap entries don't need rechecking, and all tuples on the - * page are visible to our transaction. 
- */ - if (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmres.recheck && - VM_ALL_VISIBLE(scan->rs_rd, tbmres.blockno, &bscan->rs_vmbuffer)) - { - /* can't be lossy in the skip_fetch case */ - Assert(!tbmres.lossy); - Assert(bscan->rs_empty_tuples_pending >= 0); - Assert(noffsets > -1); + tbmres = per_buffer_data; - bscan->rs_empty_tuples_pending += noffsets; + Assert(BlockNumberIsValid(tbmres->blockno)); + Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno); - return true; - } + /* Exact pages need their tuple offsets extracted. */ + if (!tbmres->lossy) + noffsets = tbm_extract_page_tuple(tbmres, offsets, + TBM_MAX_TUPLES_PER_PAGE); - block = tbmres.blockno; + *recheck = tbmres->recheck; - /* - * Acquire pin on the target heap page, trading in any pin we held before. - */ - hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, - scan->rs_rd, - block); - hscan->rs_cblock = block; + block = hscan->rs_cblock = tbmres->blockno; buffer = hscan->rs_cbuf; snapshot = scan->rs_snapshot; @@ -2213,7 +2203,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, /* * We need two separate strategies for lossy and non-lossy cases. */ - if (!tbmres.lossy) + if (!tbmres->lossy) { /* * Bitmap is non-lossy, so we just look through the offsets listed in @@ -2277,7 +2267,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, Assert(ntup <= MaxHeapTuplesPerPage); hscan->rs_ntuples = ntup; - if (tbmres.lossy) + if (tbmres->lossy) (*lossy_pages)++; else (*exact_pages)++; diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 3b4ea0f61446..6df34094a130 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -51,10 +51,6 @@ static void BitmapTableScanSetup(BitmapHeapScanState *node); static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node); static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate); -static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node); -static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node); -static inline void BitmapPrefetch(BitmapHeapScanState *node, - TableScanDesc scan); static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); @@ -62,14 +58,6 @@ static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); * Do the underlying index scan, build the bitmap, set up the parallel state * needed for parallel workers to iterate through the bitmap, and set up the * underlying table scan descriptor. - * - * For prefetching, we use *two* iterators, one for the pages we are actually - * scanning and another that runs ahead of the first for prefetching. - * node->prefetch_pages tracks exactly how many pages ahead the prefetch - * iterator is. Also, node->prefetch_target tracks the desired prefetch - * distance, which starts small and increases up to the - * node->prefetch_maximum. This is to avoid doing a lot of prefetching in a - * scan that stops after a few tuples because of a LIMIT. */ static void BitmapTableScanSetup(BitmapHeapScanState *node) @@ -102,14 +90,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ pstate->tbmiterator = tbm_prepare_shared_iterate(node->tbm); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(node->tbm); - } -#endif /* USE_PREFETCH */ - /* We have initialized the shared state so wake up others. 
*/ BitmapDoneInitializingSharedState(pstate); } @@ -119,15 +99,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node) pstate->tbmiterator : InvalidDsaPointer); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - node->prefetch_iterator = - tbm_begin_iterate(node->tbm, dsa, - pstate ? - pstate->prefetch_iterator : - InvalidDsaPointer); -#endif /* USE_PREFETCH */ - /* * If this is the first scan of the underlying table, create the table * scan descriptor and begin the scan. @@ -158,7 +129,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node) node->initialized = true; } - /* ---------------------------------------------------------------- * BitmapHeapNext * @@ -172,10 +142,6 @@ BitmapHeapNext(BitmapHeapScanState *node) TableScanDesc scan; TupleTableSlot *slot; -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; -#endif - /* * extract necessary information from index scan node */ @@ -204,37 +170,6 @@ BitmapHeapNext(BitmapHeapScanState *node) CHECK_FOR_INTERRUPTS(); -#ifdef USE_PREFETCH - - /* - * Try to prefetch at least a few pages even before we get to the - * second page if we don't stop reading after the first tuple. - */ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } -#endif /* USE_PREFETCH */ - - /* - * We issue prefetch requests *after* fetching the current page to - * try to avoid having prefetching interfere with the main I/O. - * Also, this should happen only when we have determined there is - * still something to do on the current page, else we may - * uselessly prefetch the same page we are just about to request - * for real. - */ - BitmapPrefetch(node, scan); - /* * If we are using lossy info, we have to recheck the qual * conditions at every tuple. @@ -257,31 +192,15 @@ BitmapHeapNext(BitmapHeapScanState *node) new_page: - BitmapAdjustPrefetchIterator(node); - /* * Returns false if the bitmap is exhausted and there are no further * blocks we need to scan. */ - if (!table_scan_bitmap_next_block(scan, &node->blockno, + if (!table_scan_bitmap_next_block(scan, &node->recheck, &node->stats.lossy_pages, &node->stats.exact_pages)) break; - - /* - * If serial, we can error out if the prefetch block doesn't stay - * ahead of the current block. - */ - if (node->pstate == NULL && - !tbm_exhausted(&node->prefetch_iterator) && - node->prefetch_blockno < node->blockno) - elog(ERROR, - "prefetch and main iterators are out of sync. pfblockno: %d. blockno: %d", - node->prefetch_blockno, node->blockno); - - /* Adjust the prefetch target */ - BitmapAdjustPrefetchTarget(node); } /* @@ -305,225 +224,6 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate) ConditionVariableBroadcast(&pstate->cv); } -/* - * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator - * - * We keep track of how far the prefetch iterator is ahead of the main - * iterator in prefetch_pages. For each block the main iterator returns, we - * decrement prefetch_pages. 
- */ -static inline void -BitmapAdjustPrefetchIterator(BitmapHeapScanState *node) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - TBMIterateResult tbmpre; - - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (!tbm_exhausted(prefetch_iterator)) - { - tbm_iterate(prefetch_iterator, &tbmpre); - node->prefetch_blockno = tbmpre.blockno; - } - return; - } - - /* - * XXX: There is a known issue with keeping the prefetch and current block - * iterators in sync for parallel bitmap table scans. This can lead to - * prefetching blocks that have already been read. See the discussion - * here: - * https://postgr.es/m/20240315211449.en2jcmdqxv5o6tlz%40alap3.anarazel.de - * Note that moving the call site of BitmapAdjustPrefetchIterator() - * exacerbates the effects of this bug. - */ - if (node->prefetch_maximum > 0) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); - - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. - */ - if (!tbm_exhausted(prefetch_iterator)) - { - tbm_iterate(prefetch_iterator, &tbmpre); - node->prefetch_blockno = tbmpre.blockno; - } - } - } -#endif /* USE_PREFETCH */ -} - -/* - * BitmapAdjustPrefetchTarget - Adjust the prefetch target - * - * Increase prefetch target if it's not yet at the max. Note that - * we will increase it to zero after fetching the very first - * page/tuple, then to one after the second tuple is fetched, then - * it doubles as later pages are fetched. - */ -static inline void -BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. 
*/ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } -#endif /* USE_PREFETCH */ -} - -/* - * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target - */ -static inline void -BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - if (!tbm_exhausted(prefetch_iterator)) - { - while (node->prefetch_pages < node->prefetch_target) - { - TBMIterateResult tbmpre; - bool skip_fetch; - - if (!tbm_iterate(prefetch_iterator, &tbmpre)) - { - /* No more pages to prefetch */ - Assert(!BlockNumberIsValid(tbmpre.blockno)); - tbm_end_iterate(prefetch_iterator); - break; - } - node->prefetch_pages++; - node->prefetch_blockno = tbmpre.blockno; - - /* - * If we expect not to have to actually read this heap page, - * skip this prefetch call, but continue to run the prefetch - * logic normally. (Would it be better not to increment - * prefetch_pages?) - */ - skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre.recheck && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre.blockno, - &node->pvmbuffer)); - - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno); - } - } - - return; - } - - if (pstate->prefetch_pages < pstate->prefetch_target) - { - TBMIterator *prefetch_iterator = &node->prefetch_iterator; - - if (!tbm_exhausted(prefetch_iterator)) - { - while (1) - { - TBMIterateResult tbmpre; - bool do_prefetch = false; - bool skip_fetch; - - /* - * Recheck under the mutex. If some other process has already - * done enough prefetching then we need not to do anything. - */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); - - if (!do_prefetch) - return; - - if (!tbm_iterate(prefetch_iterator, &tbmpre)) - { - Assert(!BlockNumberIsValid(tbmpre.blockno)); - /* No more pages to prefetch */ - tbm_end_iterate(prefetch_iterator); - break; - } - - node->prefetch_blockno = tbmpre.blockno; - - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) && - !tbmpre.recheck && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre.blockno, - &node->pvmbuffer)); - - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno); - } - } - } -#endif /* USE_PREFETCH */ -} - /* * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual */ @@ -580,24 +280,12 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) table_rescan(node->ss.ss_currentScanDesc, NULL); } - /* If we did not already clean up the prefetch iterator, do so now. 
*/ - if (!tbm_exhausted(&node->prefetch_iterator)) - tbm_end_iterate(&node->prefetch_iterator); - /* release bitmaps and buffers if any */ if (node->tbm) tbm_free(node->tbm); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); node->tbm = NULL; node->initialized = false; - node->pvmbuffer = InvalidBuffer; node->recheck = true; - /* Only used for serial BHS */ - node->blockno = InvalidBlockNumber; - node->prefetch_blockno = InvalidBlockNumber; - node->prefetch_pages = 0; - node->prefetch_target = -1; ExecScanReScan(&node->ss); @@ -666,17 +354,11 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) table_endscan(scanDesc); } - /* If we did not already clean up the prefetch iterator, do so now. */ - if (!tbm_exhausted(&node->prefetch_iterator)) - tbm_end_iterate(&node->prefetch_iterator); - /* * release bitmaps and buffers if any */ if (node->tbm) tbm_free(node->tbm); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); } /* ---------------------------------------------------------------- @@ -709,18 +391,13 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan; scanstate->tbm = NULL; - scanstate->pvmbuffer = InvalidBuffer; /* Zero the statistics counters */ memset(&scanstate->stats, 0, sizeof(BitmapHeapScanInstrumentation)); - scanstate->prefetch_pages = 0; - scanstate->prefetch_target = -1; scanstate->initialized = false; scanstate->pstate = NULL; scanstate->recheck = true; - scanstate->blockno = InvalidBlockNumber; - scanstate->prefetch_blockno = InvalidBlockNumber; /* * Miscellaneous initialization @@ -760,13 +437,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->bitmapqualorig = ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate); - /* - * Maximum number of prefetches for the tablespace if configured, - * otherwise the current value of the effective_io_concurrency GUC. - */ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); - scanstate->ss.ss_currentRelation = currentRelation; /* @@ -870,12 +540,9 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, sinstrument = (SharedBitmapHeapInstrumentation *) ptr; pstate->tbmiterator = 0; - pstate->prefetch_iterator = 0; /* Initialize the mutex */ SpinLockInit(&pstate->mutex); - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; pstate->state = BM_INITIAL; ConditionVariableInit(&pstate->cv); @@ -912,17 +579,11 @@ ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, return; pstate->state = BM_INITIAL; - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; if (DsaPointerIsValid(pstate->tbmiterator)) tbm_free_shared_area(dsa, pstate->tbmiterator); - if (DsaPointerIsValid(pstate->prefetch_iterator)) - tbm_free_shared_area(dsa, pstate->prefetch_iterator); - pstate->tbmiterator = InvalidDsaPointer; - pstate->prefetch_iterator = InvalidDsaPointer; } /* ---------------------------------------------------------------- diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 131c050c15f1..507d4ebe68f8 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -797,28 +797,12 @@ typedef struct TableAmRoutine * always need to be rechecked, but some non-lossy pages' tuples may also * require recheck. * - * `blockno` is the current block and is set by the table AM. 
The table AM - * is responsible for advancing the main iterator, but the bitmap table - * scan code still advances the prefetch iterator. `blockno` is used by - * bitmap table scan code to validate that the prefetch block stays ahead - * of the current block. - * - * XXX: Currently this may only be implemented if the AM uses md.c as its - * storage manager, and uses ItemPointer->ip_blkid in a manner that maps - * blockids directly to the underlying storage. nodeBitmapHeapscan.c - * performs prefetching directly using that interface. This probably - * needs to be rectified at a later point. - * - * XXX: Currently this may only be implemented if the AM uses the - * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to - * perform prefetching. This probably needs to be rectified at a later - * point. + * Prefetching additional data from the bitmap is left to the table AM. * * Optional callback, but either both scan_bitmap_next_block and * scan_bitmap_next_tuple need to exist, or neither. */ bool (*scan_bitmap_next_block) (TableScanDesc scan, - BlockNumber *blockno, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages); @@ -1966,16 +1950,11 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths, * `recheck` is set by the table AM to indicate whether or not the tuples * from this block should be rechecked. * - * `blockno` is the current block and is set by the table AM and is used by - * bitmap table scan code to validate that the prefetch block stays ahead of - * the current block. - * * Note, this is an optionally implemented function, therefore should only be * used after verifying the presence (at plan time or such). */ static inline bool table_scan_bitmap_next_block(TableScanDesc scan, - BlockNumber *blockno, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages) @@ -1989,7 +1968,7 @@ table_scan_bitmap_next_block(TableScanDesc scan, elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, - blockno, recheck, + recheck, lossy_pages, exact_pages); } diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 575b0b1bd246..d4d4e655180c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1840,11 +1840,7 @@ typedef enum /* ---------------- * ParallelBitmapHeapState information * tbmiterator iterator for scanning current pages - * prefetch_iterator iterator for prefetching ahead of current page - * mutex mutual exclusion for the prefetching variable - * and state - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target current target prefetch distance + * mutex mutual exclusion for state * state current state of the TIDBitmap * cv conditional wait variable * ---------------- @@ -1852,10 +1848,7 @@ typedef enum typedef struct ParallelBitmapHeapState { dsa_pointer tbmiterator; - dsa_pointer prefetch_iterator; slock_t mutex; - int prefetch_pages; - int prefetch_target; SharedBitmapState state; ConditionVariable cv; } ParallelBitmapHeapState; @@ -1879,18 +1872,11 @@ typedef struct SharedBitmapHeapInstrumentation * * bitmapqualorig execution state for bitmapqualorig expressions * tbm bitmap obtained from child index scan(s) - * pvmbuffer buffer for visibility-map lookups of prefetched pages * stats execution statistics - * prefetch_iterator iterator for prefetching ahead of current page - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target current target prefetch distance - 
* prefetch_maximum maximum value for prefetch_target * initialized is node is ready to iterate * pstate shared state for parallel bitmap scan * sinstrument statistics for parallel workers * recheck do current page's tuples need recheck - * blockno used to validate pf and current block stay in sync - * prefetch_blockno used to validate pf stays ahead of current block * ---------------- */ typedef struct BitmapHeapScanState @@ -1898,18 +1884,11 @@ typedef struct BitmapHeapScanState ScanState ss; /* its first field is NodeTag */ ExprState *bitmapqualorig; TIDBitmap *tbm; - Buffer pvmbuffer; BitmapHeapScanInstrumentation stats; - TBMIterator prefetch_iterator; - int prefetch_pages; - int prefetch_target; - int prefetch_maximum; bool initialized; ParallelBitmapHeapState *pstate; SharedBitmapHeapInstrumentation *sinstrument; bool recheck; - BlockNumber blockno; - BlockNumber prefetch_blockno; } BitmapHeapScanState; /* ---------------- From 08cba161c4770fe3e865887992acb6b5c5cba06b Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Fri, 21 Feb 2025 15:32:33 -0500 Subject: [PATCH 9/9] Remove table AM callback scan_bitmap_next_block After pushing the bitmap iterator into table-AM specific code (as part of making bitmap heap scan use the read stream API), scan_bitmap_next_block() no longer returns the current block number. Since scan_bitmap_next_block() isn't returning any relevant information to bitmap table scan code, it makes more sense to get rid of it. Now, bitmap table scan code only calls table_scan_bitmap_next_tuple(), and the heap AM implementation of scan_bitmap_next_block() is a local helper in heapam_handler.c. Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/d4bb26c9-fe07-439e-ac53-c0e244387e01%40vondra.me ci-os-only: --- src/backend/access/heap/heapam_handler.c | 372 ++++++++++++---------- src/backend/access/table/tableamapi.c | 3 - src/backend/executor/nodeBitmapHeapscan.c | 73 ++--- src/backend/optimizer/util/plancat.c | 2 +- src/include/access/tableam.h | 90 ++---- 5 files changed, 244 insertions(+), 296 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index f0da9b6fb845..49267ce8cec2 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -56,6 +56,10 @@ static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan); +static bool BitmapHeapScanNextBlock(TableScanDesc scan, + bool *recheck, + uint64 *lossy_pages, uint64 *exact_pages); + /* ------------------------------------------------------------------------ * Slot related callbacks for heap AM @@ -2116,198 +2120,44 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths, */ static bool -heapam_scan_bitmap_next_block(TableScanDesc scan, +heapam_scan_bitmap_next_tuple(TableScanDesc scan, + TupleTableSlot *slot, bool *recheck, - uint64 *lossy_pages, uint64 *exact_pages) + uint64 *lossy_pages, + uint64 *exact_pages) { BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan; HeapScanDesc hscan = (HeapScanDesc) bscan; - BlockNumber block; - void *per_buffer_data; - Buffer buffer; - Snapshot snapshot; - int ntup; - TBMIterateResult *tbmres; - OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; - int noffsets = -1; - - Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); - Assert(hscan->rs_read_stream); - - hscan->rs_cindex = 0; - hscan->rs_ntuples = 0; - - /* Release buffer containing previous block. 
*/ - if (BufferIsValid(hscan->rs_cbuf)) - { - ReleaseBuffer(hscan->rs_cbuf); - hscan->rs_cbuf = InvalidBuffer; - } - - hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream, - &per_buffer_data); - - if (BufferIsInvalid(hscan->rs_cbuf)) - { - if (BufferIsValid(bscan->rs_vmbuffer)) - { - ReleaseBuffer(bscan->rs_vmbuffer); - bscan->rs_vmbuffer = InvalidBuffer; - } - - /* - * Bitmap is exhausted. Time to emit empty tuples if relevant. We emit - * all empty tuples at the end instead of emitting them per block we - * skip fetching. This is necessary because the streaming read API - * will only return TBMIterateResults for blocks actually fetched. - * When we skip fetching a block, we keep track of how many empty - * tuples to emit at the end of the BitmapHeapScan. We do not recheck - * all NULL tuples. - */ - *recheck = false; - return bscan->rs_empty_tuples_pending > 0; - } - - Assert(per_buffer_data); - - tbmres = per_buffer_data; - - Assert(BlockNumberIsValid(tbmres->blockno)); - Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno); - - /* Exact pages need their tuple offsets extracted. */ - if (!tbmres->lossy) - noffsets = tbm_extract_page_tuple(tbmres, offsets, - TBM_MAX_TUPLES_PER_PAGE); - - *recheck = tbmres->recheck; - - block = hscan->rs_cblock = tbmres->blockno; - buffer = hscan->rs_cbuf; - snapshot = scan->rs_snapshot; - - ntup = 0; - - /* - * Prune and repair fragmentation for the whole page, if possible. - */ - heap_page_prune_opt(scan->rs_rd, buffer); - - /* - * We must hold share lock on the buffer content while examining tuple - * visibility. Afterwards, however, the tuples we have found to be - * visible are guaranteed good as long as we hold the buffer pin. - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); + OffsetNumber targoffset; + Page page; + ItemId lp; /* - * We need two separate strategies for lossy and non-lossy cases. + * Out of range? If so, nothing more to look at on this page */ - if (!tbmres->lossy) - { - /* - * Bitmap is non-lossy, so we just look through the offsets listed in - * tbmres; but we have to follow any HOT chain starting at each such - * offset. - */ - int curslot; - - /* We must have extracted the tuple offsets by now */ - Assert(noffsets > -1); - - for (curslot = 0; curslot < noffsets; curslot++) - { - OffsetNumber offnum = offsets[curslot]; - ItemPointerData tid; - HeapTupleData heapTuple; - - ItemPointerSet(&tid, block, offnum); - if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, - &heapTuple, NULL, true)) - hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); - } - } - else + while (hscan->rs_cindex >= hscan->rs_ntuples) { /* - * Bitmap is lossy, so we must examine each line pointer on the page. - * But we can ignore HOT chains, since we'll check each tuple anyway. 
+ * Emit empty tuples before advancing to the next block */ - Page page = BufferGetPage(buffer); - OffsetNumber maxoff = PageGetMaxOffsetNumber(page); - OffsetNumber offnum; - - for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + if (bscan->rs_empty_tuples_pending > 0) { - ItemId lp; - HeapTupleData loctup; - bool valid; - - lp = PageGetItemId(page, offnum); - if (!ItemIdIsNormal(lp)) - continue; - loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); - loctup.t_len = ItemIdGetLength(lp); - loctup.t_tableOid = scan->rs_rd->rd_id; - ItemPointerSet(&loctup.t_self, block, offnum); - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - if (valid) - { - hscan->rs_vistuples[ntup++] = offnum; - PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, - HeapTupleHeaderGetXmin(loctup.t_data)); - } - HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, - buffer, snapshot); + /* + * If we don't have to fetch the tuple, just return nulls. + */ + ExecStoreAllNullTuple(slot); + bscan->rs_empty_tuples_pending--; + return true; } - } - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - Assert(ntup <= MaxHeapTuplesPerPage); - hscan->rs_ntuples = ntup; - - if (tbmres->lossy) - (*lossy_pages)++; - else - (*exact_pages)++; - - /* - * Return true to indicate that a valid block was found and the bitmap is - * not exhausted. If there are no visible tuples on this page, - * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will - * return false returning control to this function to advance to the next - * block in the bitmap. - */ - return true; -} - -static bool -heapam_scan_bitmap_next_tuple(TableScanDesc scan, - TupleTableSlot *slot) -{ - BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan; - HeapScanDesc hscan = (HeapScanDesc) bscan; - OffsetNumber targoffset; - Page page; - ItemId lp; - - if (bscan->rs_empty_tuples_pending > 0) - { /* - * If we don't have to fetch the tuple, just return nulls. + * Returns false if the bitmap is exhausted and there are no further + * blocks we need to scan. */ - ExecStoreAllNullTuple(slot); - bscan->rs_empty_tuples_pending--; - return true; + if (!BitmapHeapScanNextBlock(scan, recheck, lossy_pages, exact_pages)) + return false; } - /* - * Out of range? If so, nothing more to look at on this page - */ - if (hscan->rs_cindex >= hscan->rs_ntuples) - return false; - targoffset = hscan->rs_vistuples[hscan->rs_cindex]; page = BufferGetPage(hscan->rs_cbuf); lp = PageGetItemId(page, targoffset); @@ -2614,6 +2464,177 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +/* + * Helper function get the next block of a bitmap heap scan. Returns true when + * it got the next block and saved it in the scan descriptor and false when + * the bitmap and or relation are exhausted. + */ +static bool +BitmapHeapScanNextBlock(TableScanDesc scan, + bool *recheck, + uint64 *lossy_pages, uint64 *exact_pages) +{ + BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan; + HeapScanDesc hscan = (HeapScanDesc) bscan; + BlockNumber block; + void *per_buffer_data; + Buffer buffer; + Snapshot snapshot; + int ntup; + TBMIterateResult *tbmres; + OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; + int noffsets = -1; + + Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); + Assert(hscan->rs_read_stream); + + hscan->rs_cindex = 0; + hscan->rs_ntuples = 0; + + /* Release buffer containing previous block. 
*/ + if (BufferIsValid(hscan->rs_cbuf)) + { + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + } + + hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream, + &per_buffer_data); + + if (BufferIsInvalid(hscan->rs_cbuf)) + { + if (BufferIsValid(bscan->rs_vmbuffer)) + { + ReleaseBuffer(bscan->rs_vmbuffer); + bscan->rs_vmbuffer = InvalidBuffer; + } + + /* + * Bitmap is exhausted. Time to emit empty tuples if relevant. We emit + * all empty tuples at the end instead of emitting them per block we + * skip fetching. This is necessary because the streaming read API + * will only return TBMIterateResults for blocks actually fetched. + * When we skip fetching a block, we keep track of how many empty + * tuples to emit at the end of the BitmapHeapScan. We do not recheck + * all NULL tuples. + */ + *recheck = false; + return bscan->rs_empty_tuples_pending > 0; + } + + Assert(per_buffer_data); + + tbmres = per_buffer_data; + + Assert(BlockNumberIsValid(tbmres->blockno)); + Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno); + + /* Exact pages need their tuple offsets extracted. */ + if (!tbmres->lossy) + noffsets = tbm_extract_page_tuple(tbmres, offsets, + TBM_MAX_TUPLES_PER_PAGE); + + *recheck = tbmres->recheck; + + block = hscan->rs_cblock = tbmres->blockno; + buffer = hscan->rs_cbuf; + snapshot = scan->rs_snapshot; + + ntup = 0; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer); + + /* + * We must hold share lock on the buffer content while examining tuple + * visibility. Afterwards, however, the tuples we have found to be + * visible are guaranteed good as long as we hold the buffer pin. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + /* + * We need two separate strategies for lossy and non-lossy cases. + */ + if (!tbmres->lossy) + { + /* + * Bitmap is non-lossy, so we just look through the offsets listed in + * tbmres; but we have to follow any HOT chain starting at each such + * offset. + */ + int curslot; + + /* We must have extracted the tuple offsets by now */ + Assert(noffsets > -1); + + for (curslot = 0; curslot < noffsets; curslot++) + { + OffsetNumber offnum = offsets[curslot]; + ItemPointerData tid; + HeapTupleData heapTuple; + + ItemPointerSet(&tid, block, offnum); + if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, + &heapTuple, NULL, true)) + hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + } + } + else + { + /* + * Bitmap is lossy, so we must examine each line pointer on the page. + * But we can ignore HOT chains, since we'll check each tuple anyway. 
+ */ + Page page = BufferGetPage(buffer); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + OffsetNumber offnum; + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId lp; + HeapTupleData loctup; + bool valid; + + lp = PageGetItemId(page, offnum); + if (!ItemIdIsNormal(lp)) + continue; + loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + loctup.t_len = ItemIdGetLength(lp); + loctup.t_tableOid = scan->rs_rd->rd_id; + ItemPointerSet(&loctup.t_self, block, offnum); + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + if (valid) + { + hscan->rs_vistuples[ntup++] = offnum; + PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, + HeapTupleHeaderGetXmin(loctup.t_data)); + } + HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, + buffer, snapshot); + } + } + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(ntup <= MaxHeapTuplesPerPage); + hscan->rs_ntuples = ntup; + + if (tbmres->lossy) + (*lossy_pages)++; + else + (*exact_pages)++; + + /* + * Return true to indicate that a valid block was found and the bitmap is + * not exhausted. If there are no visible tuples on this page, + * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will + * return false returning control to this function to advance to the next + * block in the bitmap. + */ + return true; +} /* ------------------------------------------------------------------------ * Definition of the heap table access method. @@ -2673,7 +2694,6 @@ static const TableAmRoutine heapam_methods = { .relation_estimate_size = heapam_estimate_rel_size, - .scan_bitmap_next_block = heapam_scan_bitmap_next_block, .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, .scan_sample_next_block = heapam_scan_sample_next_block, .scan_sample_next_tuple = heapam_scan_sample_next_tuple diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 760a36fd2a17..476663b66aad 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -91,9 +91,6 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->relation_estimate_size != NULL); - /* optional, but one callback implies presence of the other */ - Assert((routine->scan_bitmap_next_block == NULL) == - (routine->scan_bitmap_next_tuple == NULL)); Assert(routine->scan_sample_next_block != NULL); Assert(routine->scan_sample_next_tuple != NULL); diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 6df34094a130..3e33360c0fce 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -138,69 +138,44 @@ BitmapTableScanSetup(BitmapHeapScanState *node) static TupleTableSlot * BitmapHeapNext(BitmapHeapScanState *node) { - ExprContext *econtext; - TableScanDesc scan; - TupleTableSlot *slot; - - /* - * extract necessary information from index scan node - */ - econtext = node->ss.ps.ps_ExprContext; - slot = node->ss.ss_ScanTupleSlot; - scan = node->ss.ss_currentScanDesc; + ExprContext *econtext = node->ss.ps.ps_ExprContext; + TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; /* * If we haven't yet performed the underlying index scan, do it, and begin * the iteration over the bitmap. 
*/ if (!node->initialized) - { BitmapTableScanSetup(node); - scan = node->ss.ss_currentScanDesc; - goto new_page; - } - for (;;) + while (table_scan_bitmap_next_tuple(node->ss.ss_currentScanDesc, + slot, &node->recheck, + &node->stats.lossy_pages, + &node->stats.exact_pages)) { - while (table_scan_bitmap_next_tuple(scan, slot)) - { - /* - * Continuing in previously obtained page. - */ - - CHECK_FOR_INTERRUPTS(); + /* + * Continuing in previously obtained page. + */ + CHECK_FOR_INTERRUPTS(); - /* - * If we are using lossy info, we have to recheck the qual - * conditions at every tuple. - */ - if (node->recheck) + /* + * If we are using lossy info, we have to recheck the qual conditions + * at every tuple. + */ + if (node->recheck) + { + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->bitmapqualorig, econtext)) { - econtext->ecxt_scantuple = slot; - if (!ExecQualAndReset(node->bitmapqualorig, econtext)) - { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - ExecClearTuple(slot); - continue; - } + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + ExecClearTuple(slot); + continue; } - - /* OK to return this tuple */ - return slot; } -new_page: - - /* - * Returns false if the bitmap is exhausted and there are no further - * blocks we need to scan. - */ - if (!table_scan_bitmap_next_block(scan, - &node->recheck, - &node->stats.lossy_pages, - &node->stats.exact_pages)) - break; + /* OK to return this tuple */ + return slot; } /* diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 71abb01f6558..0489ad366446 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -325,7 +325,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amcanparallel = amroutine->amcanparallel; info->amhasgettuple = (amroutine->amgettuple != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && - relation->rd_tableam->scan_bitmap_next_block != NULL; + relation->rd_tableam->scan_bitmap_next_tuple != NULL; info->amcanmarkpos = (amroutine->ammarkpos != NULL && amroutine->amrestrpos != NULL); info->amcostestimate = amroutine->amcostestimate; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 507d4ebe68f8..b8cb1e744ad1 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -780,43 +780,23 @@ typedef struct TableAmRoutine */ /* - * Prepare to fetch / check / return tuples from `blockno` as part of a - * bitmap table scan. `scan` was started via table_beginscan_bm(). Return - * false if the bitmap is exhausted and true otherwise. - * - * This will typically read and pin the target block, and do the necessary - * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might - * make sense to perform tuple visibility checks at this time). - * - * `lossy_pages` and `exact_pages` are EXPLAIN counters that can be - * incremented by the table AM to indicate whether or not the block's - * representation in the bitmap is lossy. + * Fetch the next tuple of a bitmap table scan into `slot` and return true + * if a visible tuple was found, false otherwise. * - * `recheck` is set by the table AM to indicate whether or not the tuples - * from this block should be rechecked. Tuples from lossy pages will - * always need to be rechecked, but some non-lossy pages' tuples may also - * require recheck. 
+ * `lossy_pages` is incremented if the bitmap is lossy for the selected + * page; otherwise, `exact_pages` is incremented. These are tracked for + * display in EXPLAIN ANALYZE output. * * Prefetching additional data from the bitmap is left to the table AM. * - * Optional callback, but either both scan_bitmap_next_block and - * scan_bitmap_next_tuple need to exist, or neither. + * This is an optional callback. */ - bool (*scan_bitmap_next_block) (TableScanDesc scan, + bool (*scan_bitmap_next_tuple) (TableScanDesc scan, + TupleTableSlot *slot, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages); - /* - * Fetch the next tuple of a bitmap table scan into `slot` and return true - * if a visible tuple was found, false otherwise. - * - * Optional callback, but either both scan_bitmap_next_block and - * scan_bitmap_next_tuple need to exist, or neither. - */ - bool (*scan_bitmap_next_tuple) (TableScanDesc scan, - TupleTableSlot *slot); - /* * Prepare to fetch tuples from the next block in a sample scan. Return * false if the sample scan is finished, true otherwise. `scan` was @@ -1939,51 +1919,24 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths, */ /* - * Prepare to fetch / check / return tuples as part of a bitmap table scan. - * `scan` needs to have been started via table_beginscan_bm(). Returns false - * if there are no more blocks in the bitmap, true otherwise. - * - * `lossy_pages` and `exact_pages` are EXPLAIN counters that can be - * incremented by the table AM to indicate whether or not the block's - * representation in the bitmap is lossy. + * Fetch / check / return tuples as part of a bitmap table scan. `scan` needs + * to have been started via table_beginscan_bm(). Fetch the next tuple of a + * bitmap table scan into `slot` and return true if a visible tuple was found, + * false otherwise. * - * `recheck` is set by the table AM to indicate whether or not the tuples - * from this block should be rechecked. + * `recheck` is set by the table AM to indicate whether or not the tuple in + * `slot` should be rechecked. Tuples from lossy pages will always need to be + * rechecked, but some non-lossy pages' tuples may also require recheck. * - * Note, this is an optionally implemented function, therefore should only be - * used after verifying the presence (at plan time or such). + * `lossy_pages` is incremented if the block's representation in the bitmap is + * lossy; otherwise, `exact_pages` is incremented. */ static inline bool -table_scan_bitmap_next_block(TableScanDesc scan, +table_scan_bitmap_next_tuple(TableScanDesc scan, + TupleTableSlot *slot, bool *recheck, uint64 *lossy_pages, uint64 *exact_pages) -{ - /* - * We don't expect direct calls to table_scan_bitmap_next_block with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); - - return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, - recheck, - lossy_pages, - exact_pages); -} - -/* - * Fetch the next tuple of a bitmap table scan into `slot` and return true if - * a visible tuple was found, false otherwise. - * table_scan_bitmap_next_block() needs to previously have selected a - * block (i.e. returned true), and no previous - * table_scan_bitmap_next_tuple() for the same block may have - * returned false. 
- */ -static inline bool -table_scan_bitmap_next_tuple(TableScanDesc scan, - TupleTableSlot *slot) { /* * We don't expect direct calls to table_scan_bitmap_next_tuple with valid @@ -1994,7 +1947,10 @@ table_scan_bitmap_next_tuple(TableScanDesc scan, elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding"); return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, - slot); + slot, + recheck, + lossy_pages, + exact_pages); } /*