Skip to content

Commit e852dd3

Browse files
tvondraCommitfest Bot
authored and
Commitfest Bot
committed
WIP: Don't read the same block repeatedly
1 parent 11d3f18 commit e852dd3

File tree

5 files changed

+109
-3
lines changed

5 files changed

+109
-3
lines changed

src/backend/access/heap/heapam_handler.c

+54-3
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
136136
{
137137
/* Switch to correct buffer if we don't have it already */
138138
Buffer prev_buf = hscan->xs_cbuf;
139+
bool release_prev = true;
139140

140141
/*
141142
* Read the block for the requested TID. With a read stream, simply
@@ -157,7 +158,56 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
157158
* API.
158159
*/
159160
if (scan->rs)
160-
hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
161+
{
162+
/*
163+
* If we're trying to read the same block as the last time, don't
164+
* try reading it from the stream again, but just return the last
165+
* buffer. We need to check if the previous buffer is still pinned
166+
* and contains the correct block (it might have been unpinned,
167+
* used for a different block, so we need to be careful).
168+
*
169+
* The place scheduling the blocks (index_scan_stream_read_next)
170+
* needs to do the same thing and not schedule the blocks if it
171+
* matches the previous one. Otherwise the stream will get out of
172+
* sync, causing confusion.
173+
*
174+
* This is what ReleaseAndReadBuffer does too, but it does not
175+
* have a queue of requests scheduled from somewhere else, so it
176+
* does not need to worry about that.
177+
*
178+
* XXX Maybe we should remember the block in IndexFetchTableData,
179+
* so that we can make the check even cheaper, without looking at
180+
* the buffer descriptor? But that assumes the buffer was not
181+
* unpinned (or repinned) elsewhere, before we got back here. But
182+
* can that even happen? If yes, I guess we shouldn't be releasing
183+
* the prev buffer anyway.
184+
*
185+
* XXX This has undesired impact on prefetch distance. The read
186+
* stream schedules reads for a certain number of future blocks,
187+
* but if we skip duplicate blocks, the prefetch distance may get
188+
* unexpectedly large (e.g. for correlated indexes, with long runs
189+
* of TIDs from the same heap page). This may spend a lot of CPU
190+
* time in the index_scan_stream_read_next callback, but more
191+
* importantly it may require reading (and keeping) a lot of leaf
192+
* pages from the index.
193+
*
194+
* XXX What if we pinned the buffer twice (increase the refcount),
195+
* so that if the caller unpins the buffer, we still keep the
196+
* second pin. Wouldn't that mean we don't need to worry about the
197+
* possibility someone loaded another page into the buffer?
198+
*
199+
* XXX We might also keep a longer history of recent blocks, not
200+
* just the immediately preceding one. But that makes it harder,
201+
* because the two places (read_next callback and here) need to
202+
* have a slightly different view.
203+
*/
204+
if (BufferMatches(hscan->xs_cbuf,
205+
hscan->xs_base.rel,
206+
ItemPointerGetBlockNumber(tid)))
207+
release_prev = false;
208+
else
209+
hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
210+
}
161211
else
162212
hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
163213
hscan->xs_base.rel,
@@ -181,7 +231,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
181231
heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
182232

183233
/*
184-
* When using the read stream, release the old buffer.
234+
* When using the read stream, release the old buffer - but only if
235+
* we're reading a different block.
185236
*
186237
* XXX Not sure this is really needed, or maybe this is not the right
187238
* place to do this, and buffers should be released elsewhere. The
@@ -199,7 +250,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
199250
* XXX Does this do the right thing when reading the same page? That
200251
* should return the same buffer, so won't we release it prematurely?
201252
*/
202-
if (scan->rs && (prev_buf != InvalidBuffer))
253+
if (scan->rs && (prev_buf != InvalidBuffer) && release_prev)
203254
{
204255
ReleaseBuffer(prev_buf);
205256
}

src/backend/access/index/indexam.c

+11
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,15 @@ index_scan_stream_read_next(ReadStream *stream,
19081908
continue;
19091909
}
19101910

1911+
/* same block as before, don't need to read it */
1912+
if (scan->xs_batches->lastBlock == ItemPointerGetBlockNumber(tid))
1913+
{
1914+
DEBUG_LOG("index_scan_stream_read_next: skip block (lastBlock)");
1915+
continue;
1916+
}
1917+
1918+
scan->xs_batches->lastBlock = ItemPointerGetBlockNumber(tid);
1919+
19111920
return ItemPointerGetBlockNumber(tid);
19121921
}
19131922

@@ -2268,6 +2277,7 @@ index_batch_init(IndexScanDesc scan)
22682277
index_batch_pos_reset(scan, &scan->xs_batches->markPos);
22692278

22702279
// scan->xs_batches->currentBatch = NULL;
2280+
scan->xs_batches->lastBlock = InvalidBlockNumber;
22712281
}
22722282

22732283
/*
@@ -2350,6 +2360,7 @@ index_batch_reset(IndexScanDesc scan, bool complete)
23502360
batches->finished = false;
23512361
batches->reset = false;
23522362
// batches->currentBatch = NULL;
2363+
batches->lastBlock = InvalidBlockNumber;
23532364

23542365
AssertCheckBatches(scan);
23552366
}

src/backend/storage/buffer/bufmgr.c

+40
Original file line numberDiff line numberDiff line change
@@ -3045,6 +3045,46 @@ ReleaseAndReadBuffer(Buffer buffer,
30453045
return ReadBuffer(relation, blockNum);
30463046
}
30473047

3048+
/*
3049+
* BufferMatches
3050+
* Check if the buffer (still) contains the expected page.
3051+
*
3052+
* Check if the buffer contains the expected page. The buffer may be invalid,
3053+
* or valid and pinned.
3054+
*/
3055+
bool
3056+
BufferMatches(Buffer buffer,
3057+
Relation relation,
3058+
BlockNumber blockNum)
3059+
{
3060+
ForkNumber forkNum = MAIN_FORKNUM;
3061+
BufferDesc *bufHdr;
3062+
3063+
if (BufferIsValid(buffer))
3064+
{
3065+
Assert(BufferIsPinned(buffer));
3066+
if (BufferIsLocal(buffer))
3067+
{
3068+
bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3069+
if (bufHdr->tag.blockNum == blockNum &&
3070+
BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3071+
BufTagGetForkNum(&bufHdr->tag) == forkNum)
3072+
return true;
3073+
}
3074+
else
3075+
{
3076+
bufHdr = GetBufferDescriptor(buffer - 1);
3077+
/* we have pin, so it's ok to examine tag without spinlock */
3078+
if (bufHdr->tag.blockNum == blockNum &&
3079+
BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3080+
BufTagGetForkNum(&bufHdr->tag) == forkNum)
3081+
return true;
3082+
}
3083+
}
3084+
3085+
return false;
3086+
}
3087+
30483088
/*
30493089
* PinBuffer -- make buffer unavailable for replacement.
30503090
*

src/include/access/relscan.h

+2
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ typedef struct IndexScanBatches
242242
bool finished;
243243
bool reset;
244244

245+
BlockNumber lastBlock;
246+
245247
/*
246248
* Current scan direction, for the currently loaded batches. This is used
247249
* to load data in the read stream API callback, etc.

src/include/storage/bufmgr.h

+2
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,8 @@ extern void IncrBufferRefCount(Buffer buffer);
237237
extern void CheckBufferIsPinnedOnce(Buffer buffer);
238238
extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
239239
BlockNumber blockNum);
240+
extern bool BufferMatches(Buffer buffer, Relation relation,
241+
BlockNumber blockNum);
240242

241243
extern Buffer ExtendBufferedRel(BufferManagerRelation bmr,
242244
ForkNumber forkNum,

0 commit comments

Comments
 (0)