@@ -136,6 +136,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
	{
		/* Switch to correct buffer if we don't have it already */
		Buffer		prev_buf = hscan->xs_cbuf;
+		bool		release_prev = true;

		/*
		 * Read the block for the requested TID. With a read stream, simply
@@ -157,7 +158,56 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
		 * API.
		 */
		if (scan->rs)
-			hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
+		{
+			/*
+			 * If we're trying to read the same block as the last time, don't
+			 * try reading it from the stream again, but just return the last
+			 * buffer. We need to check if the previous buffer is still pinned
+			 * and contains the correct block (it might have been unpinned,
+			 * used for a different block, so we need to be careful).
+			 *
+			 * The place scheduling the blocks (index_scan_stream_read_next)
+			 * needs to do the same thing and not schedule the blocks if it
+			 * matches the previous one. Otherwise the stream will get out of
+			 * sync, causing confusion.
+			 *
+			 * This is what ReleaseAndReadBuffer does too, but it does not
+			 * have a queue of requests scheduled from somewhere else, so it
+			 * does not need to worry about that.
+			 *
+			 * XXX Maybe we should remember the block in IndexFetchTableData,
+			 * so that we can make the check even cheaper, without looking at
+			 * the buffer descriptor? But that assumes the buffer was not
+			 * unpinned (or repinned) elsewhere, before we got back here. But
+			 * can that even happen? If yes, I guess we shouldn't be releasing
+			 * the prev buffer anyway.
+			 *
+			 * XXX This has undesired impact on prefetch distance. The read
+			 * stream schedules reads for a certain number of future blocks,
+			 * but if we skip duplicate blocks, the prefetch distance may get
+			 * unexpectedly large (e.g. for correlated indexes, with long runs
+			 * of TIDs from the same heap page). This may spend a lot of CPU
+			 * time in the index_scan_stream_read_next callback, but more
+			 * importantly it may require reading (and keeping) a lot of leaf
+			 * pages from the index.
+			 *
+			 * XXX What if we pinned the buffer twice (increase the refcount),
+			 * so that if the caller unpins the buffer, we still keep the
+			 * second pin. Wouldn't that mean we don't need to worry about the
+			 * possibility someone loaded another page into the buffer?
+			 *
+			 * XXX We might also keep a longer history of recent blocks, not
+			 * just the immediately preceding one. But that makes it harder,
+			 * because the two places (read_next callback and here) need to
+			 * have a slightly different view.
+			 */
+			if (BufferMatches(hscan->xs_cbuf,
+							  hscan->xs_base.rel,
+							  ItemPointerGetBlockNumber(tid)))
+				release_prev = false;
+			else
+				hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
+		}
		else
			hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
												  hscan->xs_base.rel,
@@ -181,7 +231,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
			heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);

		/*
-		 * When using the read stream, release the old buffer.
+		 * When using the read stream, release the old buffer - but only if
+		 * we're reading a different block.
		 *
		 * XXX Not sure this is really needed, or maybe this is not the right
		 * place to do this, and buffers should be released elsewhere. The
@@ -199,7 +250,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
		 * XXX Does this do the right thing when reading the same page? That
		 * should return the same buffer, so won't we release it prematurely?
		 */
-		if (scan->rs && (prev_buf != InvalidBuffer))
+		if (scan->rs && (prev_buf != InvalidBuffer) && release_prev)
		{
			ReleaseBuffer(prev_buf);
		}