diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ac082fefa77a..326d5fed681e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -79,11 +79,12 @@ heapam_slot_callbacks(Relation relation) */ static IndexFetchTableData * -heapam_index_fetch_begin(Relation rel) +heapam_index_fetch_begin(Relation rel, ReadStream *rs) { IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); hscan->xs_base.rel = rel; + hscan->xs_base.rs = rs; hscan->xs_cbuf = InvalidBuffer; return &hscan->xs_base; @@ -94,6 +95,9 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) { IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + if (scan->rs) + read_stream_reset(scan->rs); + if (BufferIsValid(hscan->xs_cbuf)) { ReleaseBuffer(hscan->xs_cbuf); @@ -108,6 +112,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan) heapam_index_fetch_reset(scan); + if (scan->rs) + read_stream_end(scan->rs); + pfree(hscan); } @@ -129,16 +136,124 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, { /* Switch to correct buffer if we don't have it already */ Buffer prev_buf = hscan->xs_cbuf; + bool release_prev = true; + + /* + * Read the block for the requested TID. With a read stream, simply + * read the next block we queued earlier (from the callback). + * Otherwise just do the regular read using the TID. + * + * XXX It's a bit fragile to just read buffers, expecting the right + * block, which we queued from the callback sometime much earlier. If + * the two streams get out of sync in any way (which can happen + * easily, due to some optimization heuristics), it may misbehave in + * strange ways. + * + * XXX We need to support both the old ReadBuffer and ReadStream, as + * some places are unlikely to benefit from a read stream - e.g. + * because they only fetch a single tuple. So better to support this. + * + * XXX Another reason is that some index AMs may not support the + * batching interface, which is a prerequisite for using read_stream + * API. + */ + if (scan->rs) + { + /* + * If we're trying to read the same block as the last time, don't + * try reading it from the stream again, but just return the last + * buffer. We need to check if the previous buffer is still pinned + * and contains the correct block (it might have been unpinned, + * used for a different block, so we need to be careful). + * + * The place scheduling the blocks (index_scan_stream_read_next) + * needs to do the same thing and not schedule the blocks if it + * matches the previous one. Otherwise the stream will get out of + * sync, causing confusion. + * + * This is what ReleaseAndReadBuffer does too, but it does not + * have a queue of requests scheduled from somewhere else, so it + * does not need to worry about that. + * + * XXX Maybe we should remember the block in IndexFetchTableData, + * so that we can make the check even cheaper, without looking at + * the buffer descriptor? But that assumes the buffer was not + * unpinned (or repinned) elsewhere, before we got back here. But + * can that even happen? If yes, I guess we shouldn't be releasing + * the prev buffer anyway. + * + * XXX This has undesired impact on prefetch distance. The read + * stream schedules reads for a certain number of future blocks, + * but if we skip duplicate blocks, the prefetch distance may get + * unexpectedly large (e.g. for correlated indexes, with long runs + * of TIDs from the same heap page). 
This may spend a lot of CPU + * time in the index_scan_stream_read_next callback, but more + * importantly it may require reading (and keeping) a lot of leaf + * pages from the index. + * + * XXX What if we pinned the buffer twice (increase the refcount), + * so that if the caller unpins the buffer, we still keep the + * second pin. Wouldn't that mean we don't need to worry about the + * possibility someone loaded another page into the buffer? + * + * XXX We might also keep a longer history of recent blocks, not + * just the immediately preceding one. But that makes it harder, + * because the two places (read_next callback and here) need to + * have a slightly different view. + */ + if (BufferMatches(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid))) + release_prev = false; + else + hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + } + else + hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid)); - hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, - hscan->xs_base.rel, - ItemPointerGetBlockNumber(tid)); + /* We should always get a valid buffer for a valid TID. */ + Assert(BufferIsValid(hscan->xs_cbuf)); + + /* + * Did we read the expected block number (per the TID)? For the + * regular buffer reads this should always match, but with the read + * stream it might disagree due to a bug elsewhere (happened + * repeatedly). + */ + Assert(BufferGetBlockNumber(hscan->xs_cbuf) == ItemPointerGetBlockNumber(tid)); /* * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + + /* + * When using the read stream, release the old buffer - but only if + * we're reading a different block. + * + * XXX Not sure this is really needed, or maybe this is not the right + * place to do this, and buffers should be released elsewhere. The + * problem is that other place may not really know if the index scan + * uses read stream API. + * + * XXX We need to do this, because otherwise the caller would need to + * do different things depending on whether the read_stream was used + * or not. With the read_stream it'd have to also explicitly release + * the buffers, but doing that for every caller seems error prone + * (easy to forget). It's also not clear whether it would free the + * buffer before or after the index_fetch_tuple call (we don't know if + * the buffer changed until *after* the call, etc.). + * + * XXX Does this do the right thing when reading the same page? That + * should return the same buffer, so won't we release it prematurely? + */ + if (scan->rs && (prev_buf != InvalidBuffer) && release_prev) + { + ReleaseBuffer(prev_buf); + } } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -753,7 +868,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tableScan = NULL; heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0); + + /* + * XXX Maybe enable batching/prefetch for clustering? Seems like it + * might be a pretty substantial win if the table is not yet well + * clustered by the index. 
+ */ + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0, + false); index_rescan(indexScan, NULL, 0, NULL, 0); } else diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 8f532e14590e..8266d5e0e872 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -446,8 +446,21 @@ systable_beginscan(Relation heapRelation, elog(ERROR, "column is not in index"); } + /* + * No batching/prefetch for catalogs. We don't expect that to help + * very much, because we usually need just one row, and even if we + * need multiple rows, they tend to be colocated in heap. + * + * XXX Maybe we could do that, the prefetching only ramps up over time + * anyway? There was a problem with infinite recursion when looking up + * effective_io_concurrency for a tablespace (which may do an index + * scan internally), but the read_stream should take care of that. Still, + * we don't expect this to help a lot. + * + * XXX This also means scans on catalogs won't use read_stream. + */ sysscan->iscan = index_beginscan(heapRelation, irel, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, false); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -707,8 +720,21 @@ systable_beginscan_ordered(Relation heapRelation, elog(ERROR, "column is not in index"); } + /* + * No batching/prefetch for catalogs. We don't expect that to help very + * much, because we usually need just one row, and even if we need + * multiple rows, they tend to be colocated in heap. + * + * XXX Maybe we could do that, the prefetching only ramps up over time + * anyway? There was a problem with infinite recursion when looking up + * effective_io_concurrency for a tablespace (which may do an index scan + * internally), but the read_stream should take care of that. Still, we don't + * expect this to help a lot. + * + * XXX This also means scans on catalogs won't use read_stream. 
+ */ sysscan->iscan = index_beginscan(heapRelation, indexRelation, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, false); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 219df1971da6..ae4f3ffb0cac 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -44,6 +44,7 @@ #include "postgres.h" #include "access/amapi.h" +#include "access/nbtree.h" /* XXX for MaxTIDsPerBTreePage (should remove) */ #include "access/relation.h" #include "access/reloptions.h" #include "access/relscan.h" @@ -58,6 +59,8 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +/* enable batching / prefetching during index scans */ +bool enable_indexscan_batching = false; /* ---------------------------------------------------------------- * macros used in index_ routines @@ -109,6 +112,36 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation, ParallelIndexScanDesc pscan, bool temp_snap); static inline void validate_relation_kind(Relation r); +/* index batching */ +static void index_batch_init(IndexScanDesc scan); +static void index_batch_reset(IndexScanDesc scan, bool complete); +static void index_batch_end(IndexScanDesc scan); +static bool index_batch_getnext(IndexScanDesc scan); +static void index_batch_free(IndexScanDesc scan, IndexScanBatch batch); +static ItemPointer index_batch_getnext_tid(IndexScanDesc scan, + ScanDirection direction); + +static BlockNumber index_scan_stream_read_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); + +static bool index_batch_pos_advance(IndexScanDesc scan, IndexScanBatchPos *pos); +static void index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos); +static void index_batch_kill_item(IndexScanDesc scan); + +static void AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos); +static void AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch); +static void AssertCheckBatches(IndexScanDesc scan); + + +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->xs_batches->batches[(idx) % (scan)->xs_batches->maxBatches]) + +#ifdef INDEXAM_DEBUG +#define DEBUG_LOG(...) elog(WARNING, __VA_ARGS__) +#else +#define DEBUG_LOG(...) +#endif /* ---------------------------------------------------------------- * index_ interface functions @@ -250,6 +283,10 @@ index_insert_cleanup(Relation indexRelation, /* * index_beginscan - start a scan of an index with amgettuple * + * enable_batching determines whether the scan should try using the batching + * interface (amgetbatch/amfreebatch), if supported by the index AM, or the + * regular amgettuple interface. + * * Caller must be holding suitable locks on the heap and the index. */ IndexScanDesc @@ -257,8 +294,10 @@ index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys) + int nkeys, int norderbys, + bool enable_batching) { + ReadStream *rs = NULL; IndexScanDesc scan; Assert(snapshot != InvalidSnapshot); @@ -273,8 +312,45 @@ index_beginscan(Relation heapRelation, scan->xs_snapshot = snapshot; scan->instrument = instrument; + /* + * If explicitly requested and supported by both the index AM and the + * plan, initialize batching info. We only use stream read API with + * batching enabled (so not with systable scans). But maybe we should + * change that, and just use different read_next callbacks (or something + * like that)? 
+ * + * XXX Maybe we should have a separate "amcanbatch" call, to let the AM + * decide if batching is supported depending on the scan details. That + * might be needed for certain index AMs, that can do batching only for + * some scans (I'm thinking about GiST/SP-GiST indexes, with ORDER BY). + * + * XXX Do this before initializing xs_heapfetch, so that we can pass the + * read stream to it. + */ + if ((indexRelation->rd_indam->amgetbatch != NULL) && + enable_batching && + enable_indexscan_batching) + { + /* + * XXX We do this after index_beginscan_internal(), which means we + * can't init the batch state in there (it doesn't even know if + * batching will be used at that point). We can't init the read_stream + * there, because it needs the heapRelation. + */ + index_batch_init(scan); + + /* initialize stream */ + rs = read_stream_begin_relation(READ_STREAM_DEFAULT, + NULL, + heapRelation, + MAIN_FORKNUM, + index_scan_stream_read_next, + scan, + 0); + } + /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + scan->xs_heapfetch = table_index_fetch_begin(heapRelation, rs); return scan; } @@ -337,6 +413,12 @@ index_beginscan_internal(Relation indexRelation, scan->parallel_scan = pscan; scan->xs_temp_snap = temp_snap; + /* + * No batching by default, so set it to NULL. Will be initialized later if + * batching is requested and AM supports it. + */ + scan->xs_batches = NULL; + return scan; } @@ -370,6 +452,19 @@ index_rescan(IndexScanDesc scan, scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; + /* + * Reset the batching. This makes it look like there are no batches, + * discards reads already scheduled to the read stream, etc. + * + * XXX We do this before calling amrescan, so that it could reinitialize + * everything (this probably does not matter very much, now that we've + * moved all the batching logic to indexam.c, it was more important when + * the index AM was responsible for more of it). + * + * XXX Maybe this should also happen before table_index_fetch_reset? + */ + index_batch_reset(scan, true); + scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, orderbys, norderbys); } @@ -384,6 +479,9 @@ index_endscan(IndexScanDesc scan) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amendscan); + /* Cleanup batching, so that the AM can release pins and so on. */ + index_batch_end(scan); + /* Release resources (like buffer pins) from table accesses */ if (scan->xs_heapfetch) { @@ -414,7 +512,46 @@ index_markpos(IndexScanDesc scan) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(ammarkpos); - scan->indexRelation->rd_indam->ammarkpos(scan); + /* + * Without batching, just use the ammarkpos() callback. With batching + * everything is handled at this layer, without calling the AM. + */ + if (scan->xs_batches == NULL) + { + scan->indexRelation->rd_indam->ammarkpos(scan); + } + else + { + IndexScanBatches *batches = scan->xs_batches; + IndexScanBatchPos *pos = &batches->markPos; + IndexScanBatchData *batch = batches->markBatch; + + /* + * Free the previous mark batch (if any), but only if the batch is no + * longer valid (in the current first/next range). This means that if + * we're marking the same batch (different item), we don't really do + * anything. + * + * XXX Should have some macro for this check, I guess. 
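+ * + * For illustration, such a macro (hypothetical, not defined anywhere in this + * patch) could look like: + * + * #define INDEX_SCAN_BATCH_IN_QUEUE(batches, b) \ + * (((b) >= (batches)->firstBatch) && ((b) < (batches)->nextBatch)) + * + * i.e. simply the negation of the range check below.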
+ */ + if ((batch != NULL) && + (pos->batch < batches->firstBatch || pos->batch >= batches->nextBatch)) + { + batches->markBatch = NULL; + index_batch_free(scan, batch); + } + + /* just copy the read position (which has to be valid) */ + batches->markPos = batches->readPos; + batches->markBatch = INDEX_SCAN_BATCH(scan, batches->markPos.batch); + + /* + * FIXME we need to make sure the batch does not get freed during the + * regular advances. + */ + + AssertCheckBatchPosValid(scan, &batches->markPos); + } } /* ---------------- @@ -447,7 +584,58 @@ index_restrpos(IndexScanDesc scan) scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; - scan->indexRelation->rd_indam->amrestrpos(scan); + /* + * Without batching, just use the amrestrpos() callback. With batching + * everything is handled at this layer, without calling the AM. + */ + if (scan->xs_batches == NULL) + scan->indexRelation->rd_indam->amrestrpos(scan); + else + { + IndexScanBatches *batches = scan->xs_batches; + IndexScanBatchPos *pos = &batches->markPos; + IndexScanBatchData *batch = scan->xs_batches->markBatch; + + Assert(batch != NULL); + + /* + * XXX The pos can be invalid, if we already advanced past the the + * marked batch (and stashed it in markBatch instead of freeing). So + * this assert would be incorrect. + */ + /* AssertCheckBatchPosValid(scan, &pos); */ + + /* FIXME we should still check the batch was not freed yet */ + + /* + * Reset the batching state, except for the marked batch, and make it + * look like we have a single batch - the marked one. + * + * XXX This seems a bit ugly / hacky, maybe there's a more elegant way + * to do this? + */ + index_batch_reset(scan, false); + + batches->markPos = *pos; + batches->readPos = *pos; + batches->firstBatch = pos->batch; + batches->nextBatch = (batches->firstBatch + 1); + + INDEX_SCAN_BATCH(scan, batches->markPos.batch) = batch; + + /* + * XXX I really dislike that we have so many definitions of "current" + * batch. We have readPos, streamPos, currentBatch, ... seems very ad + * hoc - I just added a new "current" field when I needed one. We + * should make that somewhat more consistent, or at least explain it + * clearly somewhere. + * + * XXX Do we even need currentBatch? It's not accessed anywhere, at + * least not in this patch. + */ + // batches->currentBatch = batch; + batches->markBatch = batch; /* also remember this */ + } } /* @@ -569,6 +757,18 @@ index_parallelrescan(IndexScanDesc scan) if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); + /* + * Reset the batching. This makes it look like there are no batches, + * discards reads already scheduled to the read stream, etc. We Do this + * before calling amrescan, so that it can reinitialize everything. + * + * XXX We do this before calling amparallelrescan, so that it could + * reinitialize everything (this probably does not matter very much, now + * that we've moved all the batching logic to indexam.c, it was more + * important when the index AM was responsible for more of it). 
+ */ + index_batch_reset(scan, true); + /* amparallelrescan is optional; assume no-op if not provided by AM */ if (scan->indexRelation->rd_indam->amparallelrescan != NULL) scan->indexRelation->rd_indam->amparallelrescan(scan); @@ -583,10 +783,12 @@ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, - ParallelIndexScanDesc pscan) + ParallelIndexScanDesc pscan, + bool enable_batching) { Snapshot snapshot; IndexScanDesc scan; + ReadStream *rs = NULL; Assert(RelFileLocatorEquals(heaprel->rd_locator, pscan->ps_locator)); Assert(RelFileLocatorEquals(indexrel->rd_locator, pscan->ps_indexlocator)); @@ -604,8 +806,48 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, scan->xs_snapshot = snapshot; scan->instrument = instrument; + /* + * If explicitly requested and supported by both the index AM and the + * plan, initialize batching info. We only use stream read API with + * batching enabled (so not with systable scans). But maybe we should + * change that, and just use different read_next callbacks (or something + * like that)? + * + * XXX Maybe we should have a separate "amcanbatch" call, to let the AM + * decide if batching is supported depending on the scan details. That + * might be needed for certain index AMs, that can do batching only for + * some scans (I'm thinking about GiST/SP-GiST indexes, with ORDER BY). + * + * XXX Do this before initializing xs_heapfetch, so that we can pass the + * read stream to it. + * + * XXX Pretty duplicate with the code in index_beginscan(), so maybe move + * into a shared function. + */ + if ((indexrel->rd_indam->amgetbatch != NULL) && + enable_batching && + enable_indexscan_batching) + { + /* + * XXX We do this after index_beginscan_internal(), which means we + * can't init the batch state in there (it doesn't even know if + * batching will be used at that point). We can't init the read_stream + * there, because it needs the heapRelation. + */ + index_batch_init(scan); + + /* initialize stream */ + rs = read_stream_begin_relation(READ_STREAM_DEFAULT, + NULL, + heaprel, + MAIN_FORKNUM, + index_scan_stream_read_next, + scan, + 0); + } + /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel); + scan->xs_heapfetch = table_index_fetch_begin(heaprel, rs); return scan; } @@ -628,6 +870,27 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* XXX: we should assert that a snapshot is pushed or registered */ Assert(TransactionIdIsValid(RecentXmin)); + /* + * When using batching (which may be disabled for various reasons - e.g. + * through a GUC, the index AM not supporting it), redirect the code to + * the "batch" variant. If needed (e.g. for the first call) the call may + * read the next batch (leaf page) from the index (but that's driven by + * the read stream). + * + * XXX Maybe we should enable batching based on the plan too, so that we + * don't do batching when it's probably useless (e.g. semijoins or queries + * with LIMIT 1 etc.). The amcanbatch() callback might consider things + * like that, or maybe that should be considered outside AM. However, the + * slow ramp-up (starting with small batches) in read_stream should handle + * this well enough. + * + * XXX Perhaps it'd be possible to do both in index_getnext_slot(), i.e. + * call either the original code without batching, or the new batching + * code if supported/enabled. It's not great to have duplicated code. 
+ */ + if (scan->xs_batches != NULL) + return index_batch_getnext_tid(scan, direction); + /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_heaptid. It should also set @@ -694,9 +957,22 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) * amgettuple call, in index_getnext_tid). We do not do this when in * recovery because it may violate MVCC to do so. See comments in * RelationGetIndexScan(). + * + * XXX For scans using batching, record the flag in the batch (we will + * pass it to the AM later, when freeing it). Otherwise just pass it to + * the AM using the kill_prior_tuple field. */ if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; + { + if (scan->xs_batches == NULL) + { + scan->kill_prior_tuple = all_dead; + } + else if (all_dead) + { + index_batch_kill_item(scan); + } + } return found; } @@ -1084,3 +1360,1105 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, return build_local_reloptions(&relopts, attoptions, validate); } + +/* + * INDEX BATCHING (AND PREFETCHING) + * + * The traditional AM interface (amgettuple) is designed to walk the index one + * leaf page at a time, and the state (representing the leaf page) is managed + * by the AM implementation. Before advancing to the next leaf page, the index + * AM forgets the "current" leaf page. This makes it impossible to implement + * features that operate on multiple leaf pages - like for example prefetch. + * + * The batching relaxes this by extending the AM API with two new methods, + * amgetbatch and amfreebatch, that separate the "advance" to the next leaf + * page, and "forgetting" the previous one. This means there may be multiple + * leaf pages loaded at once, if necessary. It's a bit like having multiple + * "positions" within the index. + * + * The AM is no longer responsible for management of these "batches" - once + * a batch is returned from amgetbatch(), it's up to indexam.c to determine + * when it's no longer necessary, and call amfreebatch(). That is, the AM + * can no longer discard a leaf page when advancing to the next one. + * + * This allows operating on "future" index entries, e.g. to prefetch tuples + * from the table. Without the batching, we could only do this within a single + * leaf page, which has limitations, e.g. the inability to prefetch beyond the + * end of the current leaf page, and the prefetch distance dropping to 0. (Most + * indexes have many index items per leaf page, so the prefetching would + * be beneficial even with this limitation, but it's not great either.) + * + * Moving the batch management to indexam.c also means defining a common + * batch state, instead of each index AM defining its own opaque state. The + * AM merely "fills" the batch, and everything else is handled by code in + * indexam.c (so not AM-specific). Including prefetching. + * + * Without this "common" batch definition, each AM would need to do a fair + * bit of the prefetching on its own. + * + * + * note: Strictly speaking, the AM may keep a second leaf page because of + * mark/restore, but that's a minor detail. + * + * note: There are different definitions of "batch" - I use it as a synonym + * for a leaf page, or the index tuples read from one leaf page. Others use + * "batch" when talking about all the leaf pages kept in memory at a given + * moment in time (so in a way, there's a single batch, changing over time). 
+ * It's not my ambition to present a binding definition of a batch, but it's + * good to consider this when reading comments by other people. + * + * note: In theory, how the batch maps to leaf pages is mostly up to the index + * AM - as long as it can "advance" between batches, etc. it could use batches + * that represent a subset of a leaf page, or multiple leaf pages at once. + * + * note: Or maybe it doesn't need to map to leaf pages at all, at least not + * in a simple way. Consider for example ordered scans on SP-GiST indexes, + * or similar cases. I think that could be handled by having "abstract" + * batches - such indexes don't support mark/restore or changing direction, + * so this should be OK. + * + * note: When thinking about an index AM, think about BTREE, unless another + * AM is mentioned explicitly. Most AMs are based on / derived from BTREE, + * and everything about BTREE directly extends to them. + * + * note: In the following text "index AM" refers to an implementation of a + * particular index AM (e.g. BTREE), i.e. code src/backend/access/nbtree), + * while "indexam.c" is the shared executor level used to interact with + * indexes. + * + * + * index scan state + * ---------------- + * With the traditional API (amgettuple), index scan state is stored at the + * scan-level in AM-specific structs - e.g. in BTScanOpaque for BTREE). So + * there can be only a single leaf page "loaded" for a scan at a time. + * + * With the new API (amgetbatch/amfreebatch), an index scan needs to store + * multiple batches - but not in private "scan opaque" struct. Instead, + * the queue of batches and some of the other information was moved to the + * IndexScanDesc, into a common struct. So the AM-specific scan-opaque + * structs get split and moved into three places: + * + * 1) scan-opaque - Fields that are truly related to the scan as a whole + * remain in the struct (which is AM-specific, i.e. each AM method may + * keep something different). Example: scankeys/arraykeys are still + * kept in BTScanOpaque. + * + * 2) batch-opaque - AM-specific information related to a particular leaf + * page are moved to a new batch-level struct. A good example are for + * example the position of the leaf page / batch in the index (current + * page, left/righ pages, etc.). + * + * 3) batch - A significant part of the patch is introducing a common + * representation of a batch, common to all the index AMs. Until now + * each AM had it's own way of representing tuples from a leaf page, + * and accessing it required going through the AM again. The common + * representation allows accessing the batches through the indexam.c + * layer, without having to go through the AM. + * + * + * amgetbatch/amfreebatch + * ---------------------- + * To support batching, the index AM needs to implement two optional + * callbacks - amgetbatch() and amfreebatch(), which load data from the + * "next" leaf page, and then free it when the batch is no longer needed. + * + * For now the amgettuple() callback is still required even for AMs that + * support batching, so that we can fall-back to the non-batched scan + * for cases when batching is not supported (e.g. scans of system tables) + * or when batching is disabled using the enable_indexscan_batching GUC. + * + * + * batch + * ---------------------- + * A good way to visualize batching is a sliding window over the key space of + * an index. 
At any given moment, we have a "window" representing a range of + * the keys, consisting of one or more batches, each with items from a single + * leaf page. + * + * For now, each batch is exactly one whole leaf page. We might allow batches + * to be smaller or larger, but that doesn't seem very useful. It would make + * things more complex, without providing much benefit. Ultimately it's up to + * the index AM - it can produce any batches it wants, as long as it keeps + * necessary information in the batch-opaque struct, and handles this in the + * amgetbatch/amfreebatch callbacks. + * + * + * prefetching: leaf pages vs. heap pages + * -------------------------------------- + * This patch is only about prefetching pages from the indexed relation (e.g. + * heap), not about prefetching index leaf pages etc. The read_next callback + * does read leaf pages when needed (after reaching the end of the current + * batch), but this is synchronous, and the callback will block until the leaf + * page is read. + * + * + * gradual ramp up + * --------------- + * The prefetching is driven by the read_stream API / implementation. There + * are no explicit fadvise calls in the index code, that all happens in the + * read stream. The read stream does the usual gradual ramp up to not regress + * LIMIT 1 queries etc. + * + * + * kill_prior_tuples + * ----------------- + * If we decide a tuple should be "killed" in the index, a flag is used to + * pass this information to indexam.c - the item is recorded in the batch, and + * the actual killing is postponed until the batch is freed using amfreebatch(). + * The scan flag is reset to false, so that the index AM does not get confused + * and does not do something for a different "current" item. + * + * That is, this is very similar to what happens without batching, except that + * the killed items are accumulated in indexam.c, not in the AM. + */ + +/* + * Maximum number of batches (leaf pages) we can keep in memory. + * + * The value 64 is arbitrary, it's about 1MB of data with 8KB pages. We + * should not really need this many batches - we need a certain number of TIDs + * to satisfy the prefetch distance, and there usually are many index tuples + * per page. In the worst case we might have one index tuple per leaf page, + * but even that may not quite work in some cases. + * + * But there may be cases when this does not work - some examples: + * + * a) the index may be bloated, with many pages having only a single index item + * + * b) the index is correlated, and we skip prefetches of duplicate blocks + * + * c) we may be doing an index-only scan, and we don't prefetch all-visible pages + * + * So we might need to load a huge number of batches before we find the first + * block to load from the table. Or enough pages to satisfy the prefetch + * distance. + * + * XXX Currently, once we hit this number of batches, we fail in the stream + * callback (or rather in index_batch_getnext), because that's where we load + * batches. It'd be nice to "pause" the read stream for a bit instead, but + * there's no built-in way to do that. So we can only "stop" the stream by + * returning InvalidBlockNumber. But we could also remember this, and do + * read_stream_reset() to continue, after consuming all the already scheduled + * blocks. + * + * XXX Maybe 64 is too high - it also defines the maximum amount of overhead + * allowed. In the worst case, reading a single row might trigger reading this + * many leaf pages (e.g. with IOS). 
Which might be an issue with LIMIT queries, + * when we actually won't need most of the leaf pages. + * + * XXX We could/should use a lower value for testing, to make it more likely + * we hit this issue. With 64 the whole check-world passes without hitting + * the limit, wo we wouldn't test it's handled correctly. + */ +#define INDEX_SCAN_MAX_BATCHES 64 + +#define INDEX_SCAN_BATCH_COUNT(scan) \ + ((scan)->xs_batches->nextBatch - (scan)->xs_batches->firstBatch) + +#define INDEX_SCAN_BATCH_LOADED(scan, idx) \ + ((idx) < (scan)->xs_batches->nextBatch) + +#define INDEX_SCAN_BATCH_FULL(scan) \ + (INDEX_SCAN_BATCH_COUNT(scan) == scan->xs_batches->maxBatches) + +/* + * Check that a position (batch,item) is valid with respect to the batches we + * have currently loaded. + * + * XXX The "marked" batch is an exception. The marked batch may get outside + * the range of current batches, so make sure to never check the position + * for that. + */ +static void +AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos) +{ +#ifdef USE_ASSERT_CHECKING + IndexScanBatches *batch = scan->xs_batches; + + /* make sure the position is valid for currently loaded batches */ + Assert(pos->batch >= batch->firstBatch); + Assert(pos->batch < batch->nextBatch); +#endif +} + +/* + * Check a single batch is valid. + */ +static void +AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch) +{ +#ifdef USE_ASSERT_CHECKING + /* there must be valid range of items */ + Assert(batch->firstItem <= batch->lastItem); + Assert(batch->firstItem >= 0); + Assert(batch->lastItem <= MaxTIDsPerBTreePage); /* XXX tied to BTREE */ + + /* we should have items (buffer and pointers) */ + Assert(batch->items != NULL); + // Assert(batch->currTuples != NULL); + + /* + * The number of killed items must be valid, and there must be an array of + * indexes if there are items. + */ + Assert(batch->numKilled >= 0); + Assert(batch->numKilled <= MaxTIDsPerBTreePage); /* XXX tied to BTREE */ + Assert(!((batch->numKilled > 0) && (batch->killedItems == NULL))); + + /* XXX can we check some of the other batch fields? */ +#endif +} + +/* + * Check invariants on current batches + * + * Makes sure the indexes are set as expected, the buffer size is within + * limits, and so on. + */ +static void +AssertCheckBatches(IndexScanDesc scan) +{ +#ifdef USE_ASSERT_CHECKING + IndexScanBatches *batches = scan->xs_batches; + + /* we should have batches initialized */ + Assert(batches != NULL); + + /* We should not have too many batches. */ + Assert((batches->maxBatches > 0) && + (batches->maxBatches <= INDEX_SCAN_MAX_BATCHES)); + + /* + * The first/next indexes should define a valid range (in the cyclic + * buffer, and should not overflow maxBatches. 
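+ * + * For example, with maxBatches = 64 the batch with index 70 is stored in + * slot 70 % 64 = 6 (see INDEX_SCAN_BATCH). The firstBatch/nextBatch counters + * keep growing as the scan advances (until the next reset), while the + * underlying slots get reused cyclically.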
+ */ + Assert((batches->firstBatch >= 0) && + (batches->firstBatch <= batches->nextBatch)); + Assert((batches->nextBatch - batches->firstBatch) <= batches->maxBatches); + + /* Check all current batches */ + for (int i = batches->firstBatch; i < batches->nextBatch; i++) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, i); + + AssertCheckBatch(scan, batch); + } +#endif +} + +/* debug: print info about current batches */ +static void +index_batch_print(const char *label, IndexScanDesc scan) +{ +#ifdef INDEXAM_DEBUG + IndexScanBatches *batches = scan->xs_batches; + + if (!scan->xs_batches) + return; + + DEBUG_LOG("%s: batches firstBatch %d nextBatch %d maxBatches %d", + label, + batches->firstBatch, batches->nextBatch, batches->maxBatches); + + for (int i = batches->firstBatch; i < batches->nextBatch; i++) + { + IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, i); + + DEBUG_LOG("%s: batch %d %p first %d last %d item %d killed %d", + label, i, batch, batch->firstItem, batch->lastItem, + batch->itemIndex, batch->numKilled); + } +#endif +} + +/* + * index_batch_pos_advance + * Advance the position to the next item, depending on scan direction. + * + * Advance the position to the next item, either in the same batch or the + * following one (if already available). + * + * We can advance only if we already have some batches loaded, and there are + * either enough items in the current batch, or some more items in the + * subsequent batches. + * + * If this is the first advance, right after loading the first batch, the + * position is still undefined. Otherwise we expect the position to be + * valid. + * + * Returns true if the position was advanced, false otherwise. + * + * The position is guaranteed to be valid only after an advance. + */ +static bool +index_batch_pos_advance(IndexScanDesc scan, IndexScanBatchPos *pos) +{ + IndexScanBatchData *batch; + ScanDirection direction = scan->xs_batches->direction; + + /* make sure we have batching initialized and consistent */ + AssertCheckBatches(scan); + + /* should know direction by now */ + Assert(direction != NoMovementScanDirection); + + /* We can't advance if there are no batches available. */ + if (INDEX_SCAN_BATCH_COUNT(scan) == 0) + return false; + + /* + * If the position has not been advanced yet, it has to be right after we + * loaded the first batch. In that case just initialize it to the first + * item in the batch (or the last item, if it's a backwards scan). + * + * XXX Maybe we should just explicitly initialize the position after + * loading the first batch, without having to go through the advance. + * + * XXX Add a macro INDEX_SCAN_POS_DEFINED() or something like this, to + * make this easier to understand. + */ + if ((pos->batch == -1) && (pos->index == -1)) + { + /* we should have loaded the very first batch */ + Assert(scan->xs_batches->firstBatch == 0); + + batch = INDEX_SCAN_BATCH(scan, scan->xs_batches->firstBatch); + Assert(batch != NULL); + + pos->batch = 0; + + if (ScanDirectionIsForward(direction)) + pos->index = batch->firstItem; + else + pos->index = batch->lastItem; + + /* the position we just set has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + + /* + * The position is already defined, so we should have some batches loaded + * and the position has to be valid with respect to those. + */ + AssertCheckBatchPosValid(scan, pos); + + /* + * Advance to the next item in the same batch. If the position is for the + * last item in the batch, try advancing to the next batch (if loaded). 
+ */ + batch = INDEX_SCAN_BATCH(scan, pos->batch); + + if (ScanDirectionIsForward(direction)) + { + if (pos->index < batch->lastItem) + { + pos->index++; + + /* the position has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + } + else /* ScanDirectionIsBackward */ + { + if (pos->index > batch->firstItem) + { + pos->index--; + + /* the position has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + } + + /* + * We couldn't advance within the same batch, try advancing to the next + * batch, if it's already loaded. + */ + if (INDEX_SCAN_BATCH_LOADED(scan, pos->batch + 1)) + { + /* advance to the next batch */ + pos->batch++; + + batch = INDEX_SCAN_BATCH(scan, pos->batch); + Assert(batch != NULL); + + if (ScanDirectionIsForward(direction)) + pos->index = batch->firstItem; + else + pos->index = batch->lastItem; + + /* the position has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + + /* can't advance */ + return false; +} + +/* + * index_batch_pos_reset + * Reset the position, so that it looks as if never advanced. + */ +static void +index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos) +{ + pos->batch = -1; + pos->index = -1; +} + +/* + * index_scan_stream_read_next + * return the next block to pass to the read stream + * + * This assumes the "current" scan direction, requested by the caller. If + * that changes before consuming all buffers, we'll reset the stream and + * start from scratch. Which may seem inefficient, but it's no worse than + * what we do now, and it's not a very common case. + * + * The position of the read_stream is stored in streamPos, which may be + * ahead of the current readPos (which is what got consumed by the scan). + * + * The scan direction change is checked / handled elsewhere. Here we rely + * on having the correct value in xs_batches->direction. + */ +static BlockNumber +index_scan_stream_read_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data) +{ + IndexScanDesc scan = (IndexScanDesc) callback_private_data; + IndexScanBatchPos *pos = &scan->xs_batches->streamPos; + + /* we should have set the direction already */ + Assert(scan->xs_batches->direction != NoMovementScanDirection); + + /* + * The read position has to be valid, because we initialize/advance it + * before maybe even attempting to read the heap tuple. And it lags behind + * the stream position, so it can't be invalid yet. If this is the first + * time for this callback, we will use the readPos to init streamPos, so + * better check it's valid. + */ + AssertCheckBatchPosValid(scan, &scan->xs_batches->readPos); + + /* + * Try to advance to the next item, and if there's none in the current + * batch, try loading the next batch. + * + * XXX This loop shouldn't happen more than twice, because if we fail to + * advance the position, we'll try to load the next batch and then in the + * next loop the advance has to succeed. + */ + while (true) + { + bool advanced = false; + + /* + * If the stream position is undefined, just use the read position. + * + * It's possible we got here only fairly late in the scan, e.g. if + * many tuples got skipped in the index-only scan, etc. In this case + * just use the read position as a starting point. + * + * The first batch is loaded from index_batch_getnext_tid(), because + * we don't get here until the first index_fetch_heap() call - only + * then can read_stream start loading more batches. 
It's also possible + * to disable prefetching (effective_io_concurrency=0), in which case + * all batches get loaded in index_batch_getnext_tid. + */ + if ((pos->batch == -1) && (pos->index == -1)) + { + *pos = scan->xs_batches->readPos; + advanced = true; + } + else if (index_batch_pos_advance(scan, pos)) + { + advanced = true; + } + + /* FIXME maybe check the streamPos is not behind readPos? */ + + /* If we advanced the position, return the block for the TID. */ + if (advanced) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, pos->batch); + ItemPointer tid = &batch->items[pos->index].heapTid; + + DEBUG_LOG("index_scan_stream_read_next: index %d TID (%u,%u)", + pos->index, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + /* + * if there's a prefetch callback, use it to decide if we will + * need to read the block + */ + if (scan->xs_batches->prefetchCallback && + !scan->xs_batches->prefetchCallback(scan, scan->xs_batches->prefetchArgument, pos)) + { + DEBUG_LOG("index_scan_stream_read_next: skip block (callback)"); + continue; + } + + /* same block as before, don't need to read it */ + if (scan->xs_batches->lastBlock == ItemPointerGetBlockNumber(tid)) + { + DEBUG_LOG("index_scan_stream_read_next: skip block (lastBlock)"); + continue; + } + + scan->xs_batches->lastBlock = ItemPointerGetBlockNumber(tid); + + return ItemPointerGetBlockNumber(tid); + } + + /* + * Couldn't advance the position, so either there are no more items in + * the current batch, or maybe we don't have any batches yet (if is + * the first time through). Try loading the next batch - if that + * succeeds, try the advance again (and this time the advance should + * work). + * + * If we fail to load the next batch, we're done. + */ + if (!index_batch_getnext(scan)) + break; + } + + /* no more items in this scan */ + return InvalidBlockNumber; +} + +/* ---------------- + * index_batch_getnext - get the next batch of TIDs from a scan + * + * Returns true if we managed to read at least some TIDs into the batch, or + * false if there are no more TIDs in the scan. The batch load may fail for + * multiple reasons - there really may not be more batches in the scan, or + * maybe we reached INDEX_SCAN_MAX_BATCHES. + * + * Returns true if the batch was loaded successfully, false otherwise. + * + * XXX This only loads the TIDs and resets the various batch fields to + * fresh state. It does not set xs_heaptid/xs_itup/xs_hitup, that's the + * responsibility of the following index_batch_getnext_tid() calls. + * ---------------- + */ +static bool +index_batch_getnext(IndexScanDesc scan) +{ + IndexScanBatchData *batch; + ItemPointerData tid; + ScanDirection direction = scan->xs_batches->direction; + IndexTuple itup; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgetbatch); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * If we already used the maximum number of batch slots available, it's + * pointless to try loading another one. This can happen for various + * reasons, e.g. for index-only scans on all-visible table, or skipping + * duplicate blocks on perfectly correlated indexes, etc. + * + * We could enlarge the array to allow more batches, but that's futile, we + * can always construct a case using more memory. Not only it would risk + * OOM, it'd also be inefficient because this happens early in the scan + * (so it'd interfere with LIMIT queries). 
+ * + * XXX For now we just error out, but the correct solution is to pause the + * stream by returning InvalidBlockNumber and then unpause it by doing + * read_stream_reset. + */ + if (INDEX_SCAN_BATCH_FULL(scan)) + { + DEBUG_LOG("index_batch_getnext: ran out of space for batches"); + scan->xs_batches->reset = true; + } + + /* + * Did we fill the batch queue, either in this or some earlier call? + * If yes, we have to consume everything from currently loaded batch + * before we reset the stream and continue. It's a bit like 'finished' + * but it's only a temporary pause, not the end of the stream. + */ + if (scan->xs_batches->reset) + return NULL; + + /* + * Did we already read the last batch for this scan? + * + * We may read the batches in two places, so we need to remember that, + * otherwise the retry restarts the scan. + * + * XXX This comment might be obsolete, from before using the read_stream. + * + * XXX Also, maybe we should do this before calling INDEX_SCAN_BATCH_FULL? + */ + if (scan->xs_batches->finished) + return NULL; + + index_batch_print("index_batch_getnext / start", scan); + + /* + * FIXME btgetbatch calls _bt_returnitem, which however sets xs_heaptid, + * and so would interfere with index scans (because this may get executed + * from the read_stream_next_buffer callback during the scan (fetching + * heap tuples in heapam_index_fetch_tuple). Ultimately we should not do + * _bt_returnitem at all, just functions like _bt_steppage etc. while + * loading the next batch. + * + * XXX I think this is no longer true, the amgetbatch does not do that I + * believe (_bt_returnitem_batch should not set these fields). + */ + tid = scan->xs_heaptid; + itup = scan->xs_itup; + + batch = scan->indexRelation->rd_indam->amgetbatch(scan, direction); + if (batch != NULL) + { + /* + * We got the batch from the AM, but we need to add it to the queue. + * Maybe that should be part of the "batch allocation" that happens in + * the AM? + */ + int batchIndex = scan->xs_batches->nextBatch; + + INDEX_SCAN_BATCH(scan, batchIndex) = batch; + + scan->xs_batches->nextBatch++; + + /* + * XXX Why do we need currentBatch, actually? It doesn't seem to be + * used anywhere, just set ... + */ + // scan->xs_batches->currentBatch = batch; + + DEBUG_LOG("index_batch_getnext firstBatch %d nextBatch %d batch %p", + scan->xs_batches->firstBatch, scan->xs_batches->nextBatch, batch); + } + else + scan->xs_batches->finished = true; + + /* XXX see FIXME above */ + scan->xs_heaptid = tid; + scan->xs_itup = itup; + + AssertCheckBatches(scan); + + index_batch_print("index_batch_getnext / end", scan); + + return (batch != NULL); +} + +/* ---------------- + * index_getnext_batch_tid - get the next TID from the current batch + * + * The calling convention is similar to index_getnext_tid() - NULL means no + * more items in the current batch, and no more batches. + * + * If we advance to the next batch, we release the previous one (unless it's + * tracked for mark/restore). + * + * Returns the next TID, or NULL if no more items (or batches). + * + * FIXME This only sets xs_heaptid and xs_itup (if requested). Not sure if + * we need to do something with xs_hitup. Should this set xs_hitup? + * + * XXX Maybe if we advance the position to the next batch, we could keep the + * batch for a bit more, in case the scan direction changes (as long as it + * fits into maxBatches)? But maybe that's unnecessary complexity for too + * little gain, we'd need to be careful about releasing the batches lazily. 
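+ * + * Rough caller flow, for orientation: index_getnext_tid() sees that + * scan->xs_batches is set and redirects here; the TID we return is then + * passed to index_fetch_heap(), which pulls the corresponding heap block + * from the read stream that index_scan_stream_read_next() keeps filling.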
+ * ---------------- + */ +static ItemPointer +index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + IndexScanBatchPos *pos; + + /* shouldn't get here without batching */ + AssertCheckBatches(scan); + + /* read the next TID from the index */ + pos = &scan->xs_batches->readPos; + + /* FIXME handle change of scan direction (reset stream, ...) */ + scan->xs_batches->direction = direction; + + DEBUG_LOG("index_batch_getnext_tid pos %d %d direction %d", + pos->batch, pos->index, direction); + + /* + * Try advancing the batch position. If that doesn't succeed, it means we + * don't have more items in the current batch, and there's no future batch + * loaded. So try loading another batch, and maybe retry. + * + * FIXME This loop shouldn't happen more than twice. Maybe we should have + * some protection against infinite loops? If the advance/getnext + * functions get to disagree? + */ + while (true) + { + /* + * If we manage to advance to the next items, return it and we're + * done. Otherwise try loading another batch. + */ + if (index_batch_pos_advance(scan, pos)) + { + IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, pos->batch); + + Assert(batch != NULL); + + /* set the TID / itup for the scan */ + scan->xs_heaptid = batch->items[pos->index].heapTid; + scan->xs_itup = (IndexTuple) (batch->currTuples + batch->items[pos->index].tupleOffset); + + DEBUG_LOG("pos batch %p first %d last %d pos %d/%d TID (%u,%u)", + batch, batch->firstItem, batch->lastItem, + pos->batch, pos->index, + ItemPointerGetBlockNumber(&scan->xs_heaptid), + ItemPointerGetOffsetNumber(&scan->xs_heaptid)); + + /* + * If we advanced to the next batch, release the batch we no + * longer need. The positions is the "read" position, and we can + * compare it to firstBatch. + */ + if (pos->batch != scan->xs_batches->firstBatch) + { + batch = INDEX_SCAN_BATCH(scan, scan->xs_batches->firstBatch); + Assert(batch != NULL); + + /* + * XXX When advancing readPos, the streamPos may get behind as + * we're only advancing it when actually requesting heap blocks. + * But we may not do that often enough - e.g. IOS may not need + * to access all-visible heap blocks, so the read_next callback + * does not get invoked for a long time. It's possible the + * stream gets so mucu behind the position gets invalid, as we + * already removed the batch. But that means we don't need any + * heap blocks until the current read position - if we did, we + * would not be in this situation (or it's a sign of a bug, as + * those two places are expected to be in sync). So if the + * streamPos still points at the batch we're about to free, + * just reset the position - we'll set it to readPos in the + * read_next callback later. + * + * XXX This can happen after the queue gets full, we "pause" + * the stream, and then reset it to continue. But I think that + * just increases the probability of hitting the issue, it's + * just more chance to to not advance the streamPos, which + * depends on when we try to fetch the first heap block after + * calling read_stream_reset(). + */ + if (scan->xs_batches->streamPos.batch == scan->xs_batches->firstBatch) + { + index_batch_pos_reset(scan, &scan->xs_batches->streamPos); + } + + DEBUG_LOG("index_batch_getnext_tid free batch %p firstBatch %d nextBatch %d", + batch, + scan->xs_batches->firstBatch, + scan->xs_batches->nextBatch); + + /* Free the batch (except when it's needed for mark/restore). 
*/ + index_batch_free(scan, batch); + + /* + * In any case, remove the batch from the regular queue, even + * if we kept it for mark/restore. + */ + scan->xs_batches->firstBatch++; + + DEBUG_LOG("index_batch_getnext_tid batch freed firstBatch %d nextBatch %d", + scan->xs_batches->firstBatch, + scan->xs_batches->nextBatch); + + index_batch_print("index_batch_getnext_tid / free old batch", scan); + + /* we can't skip any batches */ + Assert(scan->xs_batches->firstBatch == pos->batch); + } + + return &scan->xs_heaptid; + } + + /* + * We failed to advance, i.e. we ran out of currently loaded batches. + * So if we filled the queue, this is a good time to reset the stream + * (before we try loading the next batch). + */ + if (scan->xs_batches->reset) + { + DEBUG_LOG("resetting read stream pos %d,%d", + scan->xs_batches->readPos.batch, scan->xs_batches->readPos.index); + + scan->xs_batches->reset = false; + + /* + * Need to reset the stream position, it might be too far behind. + * Ultimately we want to set it to readPos, but we can't do that + * yet - readPos still points at the old batch, so just reset it + * and we'll init it to readPos later in the callback. + */ + index_batch_pos_reset(scan, &scan->xs_batches->streamPos); + + read_stream_reset(scan->xs_heapfetch->rs); + } + + /* + * Failed to advance the read position, so try reading the next batch. + * If this fails, we're done - there's nothing more to load. + * + * Most of the batches should be loaded from read_stream_next_buffer, + * but we need to call index_batch_getnext here too, for two reasons. + * First, the read_stream only starts working after we try fetching the + * first heap tuple, so we need to load the first batch from here. + * Second, while most batches will be preloaded by the stream thanks + * to prefetching, it's possible to set effective_io_concurrency=0, in + * which case all the batch loads happen from here. + */ + if (!index_batch_getnext(scan)) + break; + + DEBUG_LOG("loaded next batch, retry to advance position"); + } + + /* + * If we get here, we failed to advance the position and there are no more + * batches, so we're done. + */ + DEBUG_LOG("no more batches to process"); + + return NULL; +} + +/* + * index_batch_init + * Initialize various fields / arrays needed by batching. + * + * FIXME This is a bit ad-hoc hodge podge, due to how I was adding more and + * more pieces. Some of the fields may not be quite necessary, needs cleanup. + */ +static void +index_batch_init(IndexScanDesc scan) +{ + /* init batching info, assume batching is supported by the AM */ + Assert(scan->indexRelation->rd_indam->amgetbatch != NULL); + Assert(scan->indexRelation->rd_indam->amfreebatch != NULL); + + scan->xs_batches = palloc0(sizeof(IndexScanBatches)); + + /* We don't know direction of the scan yet. 
*/ + scan->xs_batches->direction = NoMovementScanDirection; + + /* Initialize the batch */ + scan->xs_batches->maxBatches = INDEX_SCAN_MAX_BATCHES; + scan->xs_batches->firstBatch = 0; /* first batch */ + scan->xs_batches->nextBatch = 0; /* first batch is empty */ + + scan->xs_batches->batches + = palloc(sizeof(IndexScanBatchData *) * scan->xs_batches->maxBatches); + + /* positions in the queue of batches */ + index_batch_pos_reset(scan, &scan->xs_batches->readPos); + index_batch_pos_reset(scan, &scan->xs_batches->streamPos); + index_batch_pos_reset(scan, &scan->xs_batches->markPos); + + // scan->xs_batches->currentBatch = NULL; + scan->xs_batches->lastBlock = InvalidBlockNumber; +} + +/* + * index_batch_reset + * Reset the batch before reading the next chunk of data. + * + * complete - true means we reset even marked batch + * + * XXX Should this reset the batch memory context, xs_itup, xs_hitup, etc? + */ +static void +index_batch_reset(IndexScanDesc scan, bool complete) +{ + IndexScanBatches *batches = scan->xs_batches; + + /* bail out if batching not enabled */ + if (!batches) + return; + + AssertCheckBatches(scan); + + index_batch_print("index_batch_reset", scan); + + /* With batching enabled, we should have a read stream. Reset it. */ + Assert(scan->xs_heapfetch); + read_stream_reset(scan->xs_heapfetch->rs); + + /* reset the positions */ + index_batch_pos_reset(scan, &batches->readPos); + index_batch_pos_reset(scan, &batches->streamPos); + + /* + * With "complete" reset, make sure to also free the marked batch, either + * by just forgetting it (if it's still in the queue), or by explicitly + * freeing it. + * + * XXX Do this before the loop, so that it calls the amfreebatch(). + */ + if (complete && (batches->markBatch != NULL)) + { + IndexScanBatchPos *pos = &batches->markPos; + IndexScanBatch batch = batches->markBatch; + + /* always reset the position, forget the marked batch */ + batches->markBatch = NULL; + + /* + * If we've already moved past the marked batch (it's not in the + * current queue), free it explicitly. Otherwise it'll be in the freed + * later. + */ + if ((pos->batch < batches->firstBatch) || + (pos->batch >= batches->nextBatch)) + { + index_batch_free(scan, batch); + } + + /* reset position only after the queue range check */ + index_batch_pos_reset(scan, &batches->markPos); + } + + /* release all currently loaded batches */ + while (batches->firstBatch < batches->nextBatch) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, batches->firstBatch); + + DEBUG_LOG("freeing batch %d %p", batches->firstBatch, batch); + + index_batch_free(scan, batch); + + /* update the valid range, so that asserts / debugging works */ + batches->firstBatch++; + } + + /* reset relevant IndexScanBatches fields */ + batches->maxBatches = INDEX_SCAN_MAX_BATCHES; + batches->firstBatch = 0; /* first batch */ + batches->nextBatch = 0; /* first batch is empty */ + + batches->finished = false; + batches->reset = false; + // batches->currentBatch = NULL; + batches->lastBlock = InvalidBlockNumber; + + AssertCheckBatches(scan); +} + +static void +index_batch_kill_item(IndexScanDesc scan) +{ + IndexScanBatchPos *pos = &scan->xs_batches->readPos; + IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, pos->batch); + + /* FIXME mark item at current readPos as deleted */ + AssertCheckBatchPosValid(scan, pos); + + /* + * XXX Too tied to btree (through MaxTIDsPerBTreePage), we should make + * this AM agnostic. We could maybe even replace this with Bitmapset. 
It + * might be more expensive if we only kill items at the end of the page + * (in which case we still have to walk the first part to find the bits at + * the end). But given the lower memory usage it still sees like a good + * tradeoff overall. + */ + if (batch->killedItems == NULL) + batch->killedItems = (int *) + palloc(MaxTIDsPerBTreePage * sizeof(int)); + if (batch->numKilled < MaxTIDsPerBTreePage) + batch->killedItems[batch->numKilled++] = pos->index; + + /* elog(WARNING, "index_batch_kill_item (%d,%d)", pos->batch, pos->index); */ + /* FIXME index_batch_kill_item not implemented */ +} + +static void +index_batch_free(IndexScanDesc scan, IndexScanBatch batch) +{ + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amfreebatch); + + AssertCheckBatch(scan, batch); + + /* don't free the batch that is marked */ + if (batch == scan->xs_batches->markBatch) + return; + + scan->indexRelation->rd_indam->amfreebatch(scan, batch); +} + +/* */ +static void +index_batch_end(IndexScanDesc scan) +{ + index_batch_reset(scan, true); +} + +IndexScanBatch +index_batch_alloc(int maxitems, bool want_itup) +{ + IndexScanBatch batch = palloc(sizeof(IndexScanBatchData)); + + batch->firstItem = -1; + batch->lastItem = -1; + batch->itemIndex = -1; + + batch->killedItems = NULL; /* FIXME allocate an array, actually */ + batch->numKilled = 0; /* nothing killed yet */ + + /* + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the currPos and markPos respectively. Each is of size + * BLCKSZ, so it can hold as much as a full page's worth of tuples. + * + * XXX allocate + */ + batch->currTuples = NULL; /* tuple storage for currPos */ + if (want_itup) + batch->currTuples = palloc(BLCKSZ); + + /* + * XXX Maybe don't size to MaxTIDsPerBTreePage? We don't reuse batches + * (unlike currPos), so we can size it for just what we need. + */ + batch->items = palloc0(sizeof(IndexScanBatchPosItem) * maxitems); + + /* + * batch contents (TIDs, index tuples, kill bitmap, ...) + * + * XXX allocate as needed? + */ + batch->itups = NULL; /* IndexTuples, if requested */ + batch->htups = NULL; /* HeapTuples, if requested */ + batch->recheck = NULL; /* recheck flags */ + batch->privateData = NULL; /* private data for batch */ + + /* xs_orderbyvals / xs_orderbynulls */ + batch->orderbyvals = NULL; + batch->orderbynulls = NULL; + + /* AM-specific per-batch state */ + batch->opaque = NULL; + + return batch; +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 765659887af7..405c601d3ffd 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -159,6 +159,8 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = btbeginscan; amroutine->amrescan = btrescan; amroutine->amgettuple = btgettuple; + amroutine->amgetbatch = btgetbatch; + amroutine->amfreebatch = btfreebatch; amroutine->amgetbitmap = btgetbitmap; amroutine->amendscan = btendscan; amroutine->ammarkpos = btmarkpos; @@ -279,6 +281,158 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) return res; } +/* FIXME duplicate from indexam.c */ +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->xs_batches->batches[(idx) % (scan)->xs_batches->maxBatches]) + +/* + * btgetbatch() -- Get the next batch of tuples in the scan. + * + * XXX Simplified version of btgettuple(), but for batches of tuples. 
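+ *
+ * XXX For orientation, the expected caller-side shape is roughly the
+ * following (editorial sketch only; the real consumer is the batch queue
+ * in indexam.c, which also drives the read stream):
+ *
+ *     while ((batch = scan->indexRelation->rd_indam->amgetbatch(scan, dir)) != NULL)
+ *     {
+ *         ... add batch->items[] to the queue, fetch heap tuples ...
+ *         scan->indexRelation->rd_indam->amfreebatch(scan, batch);
+ *     }
+ *
+ * with amfreebatch() called only once the batch is no longer needed (it
+ * may stay in the queue for a while, or be retained for mark/restore).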
+ */ +IndexScanBatch +btgetbatch(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + IndexScanBatch res; + BTBatchScanPos pos = NULL; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* btree indexes are never lossy */ + scan->xs_recheck = false; + + if (scan->xs_batches->firstBatch < scan->xs_batches->nextBatch) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, scan->xs_batches->nextBatch-1); + pos = (BTBatchScanPos) batch->opaque; + } + + /* Each loop iteration performs another primitive index scan */ + do + { + /* + * If we've already initialized this scan, we can just advance it in + * the appropriate direction. If we haven't done so yet, we call + * _bt_first_batch() to get the first batch in the scan. + */ + if (pos == NULL) + res = _bt_first_batch(scan, dir); + else + { + /* + * Now continue the scan. + */ + res = _bt_next_batch(scan, pos, dir); + } + + /* If we have a batch, return it ... */ + if (res) + break; + + /* + * XXX We need to invoke _bt_first_batch on the next iteration, to + * advance SAOP keys etc. indexam.c already does this, but only after + * this returns, so maybe this should be done some other way; it's not + * clear who should be responsible for setting currentBatch. + * + * XXX Maybe we don't even need that field? What is a current batch + * anyway? There seem to be multiple concepts of "current" batch, one + * for the read stream, another for the executor ... + */ + // scan->xs_batches->currentBatch = res; + + /* + * We may do a new scan, depending on what _bt_start_prim_scan says. + * In that case we need to start from scratch, not from the position + * of the last batch. In regular non-batched scans we have currPos, + * because we have just one leaf page for the whole scan, and we + * invalidate it before loading the next one. But with batching that + * doesn't work - we have many leaf pages, it's not clear which one is + * 'current' (well, it's the last one), and we can't invalidate it, + * that's up to amfreebatch(). For now we deduce the position from the + * last batch (above) and reset it to NULL here, to indicate the same + * thing. + * + * XXX Maybe we should have something like 'currentBatch'? But then + * that probably should be in BTScanOpaque, not in the generic + * indexam.c part? Or is it a sufficiently generic thing? How would + * we keep it in sync with the batch queue? If freeing batches is + * up to indexam, how do we ensure the currentBatch does not point + * to an already-removed batch? + */ + pos = NULL; + + /* ... otherwise see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir)); + + return res; +} + +/* + * btfreebatch() -- Free a batch returned by btgetbatch(). + * + * XXX Counterpart of btgetbatch(): gives _bt_kill_batch() a chance to record + * killed items, then releases the batch's memory and any buffer pin still + * held by its position. + */ +void +btfreebatch(IndexScanDesc scan, IndexScanBatch batch) +{ + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* + * Check to see if we should kill tuples from the previous batch.
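+ * batch->killedItems[] holds indexes into batch->items[] (collected by
+ * index_batch_kill_item() in indexam.c); _bt_kill_batch() forwards them
+ * to _bt_killitems_batch() before the batch's memory is released below.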
+ */ + _bt_kill_batch(scan, batch); + + /* free all the stuff that might be allocated */ + + if (batch->items) + pfree(batch->items); + + if (batch->itups) + pfree(batch->itups); + + if (batch->htups) + pfree(batch->htups); + + if (batch->recheck) + pfree(batch->recheck); + + if (batch->privateData) + pfree(batch->privateData); + + if (batch->orderbyvals) + pfree(batch->orderbyvals); + + if (batch->orderbynulls) + pfree(batch->orderbynulls); + + if (batch->currTuples) + pfree(batch->currTuples); + + if (batch->opaque) + { + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + + BTBatchScanPosIsValid(*pos); + BTBatchScanPosIsPinned(*pos); + + BTBatchScanPosUnpinIfPinned(*pos); + + pfree(batch->opaque); + } + + /* and finally free the batch itself */ + pfree(batch); + + return; +} + /* * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap */ @@ -376,6 +530,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) /* * btrescan() -- rescan an index relation + * + * Batches should have been freed from indexam using btfreebatch() before we + * get here, but then some of the generic scan stuff needs to be reset here. + * But we shouldn't need to do anything particular here, I think. */ void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, @@ -400,6 +558,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); + /* FIXME should be in indexam.c I think */ + // if (scan->xs_batches) + // scan->xs_batches->currentBatch = NULL; + /* * Allocate tuple workspace arrays, if needed for an index-only scan and * not already done in a previous rescan call. To save on palloc @@ -433,6 +595,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, /* * btendscan() -- close down a scan + * + * Batches should have been freed from indexam using btfreebatch() before we + * get here, but then some of the generic scan stuff needs to be reset here. + * But we shouldn't need to do anything particular here, I think. */ void btendscan(IndexScanDesc scan) @@ -469,12 +635,18 @@ btendscan(IndexScanDesc scan) /* * btmarkpos() -- save current scan position + * + * With batching, all the interesting markpos() stuff happens in indexam.c. We + * should not even get here. */ void btmarkpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + /* with batching, mark/restore is handled in indexam */ + Assert(scan->xs_batches == NULL); + /* There may be an old mark with a pin (but no lock). */ BTScanPosUnpinIfPinned(so->markPos); @@ -495,12 +667,18 @@ btmarkpos(IndexScanDesc scan) /* * btrestrpos() -- restore scan to last saved position + * + * With batching, all the interesting restrpos() stuff happens in indexam.c. We + * should not even get here. */ void btrestrpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + /* with batching, mark/restore is handled in indexam */ + Assert(scan->xs_batches == NULL); + if (so->markItemIndex >= 0) { /* @@ -900,6 +1078,147 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, return status; } +/* + * _bt_parallel_seize() -- Begin the process of advancing the scan to a new + * page. Other scans must wait until we call _bt_parallel_release() + * or _bt_parallel_done(). + * + * The return value is true if we successfully seized the scan and false + * if we did not. 
The latter case occurs when no pages remain, or when + * another primitive index scan is scheduled that caller's backend cannot + * start just yet (only backends that call from _bt_first are capable of + * starting primitive index scans, which they indicate by passing first=true). + * + * If the return value is true, *next_scan_page returns the next page of the + * scan, and *last_curr_page returns the page that *next_scan_page came from. + * An invalid *next_scan_page means the scan hasn't yet started, or that + * caller needs to start the next primitive index scan (if it's the latter + * case we'll set so.needPrimScan). + * + * Callers should ignore the value of *next_scan_page and *last_curr_page if + * the return value is false. + */ +bool +_bt_parallel_seize_batch(IndexScanDesc scan, BTBatchScanPos pos, + BlockNumber *next_scan_page, + BlockNumber *last_curr_page, bool first) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool exit_loop = false, + status = true, + endscan = false; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + *next_scan_page = InvalidBlockNumber; + *last_curr_page = InvalidBlockNumber; + + /* + * Reset so->currPos, and initialize moreLeft/moreRight such that the next + * call to _bt_readnextpage treats this backend similarly to a serial + * backend that steps from *last_curr_page to *next_scan_page (unless this + * backend's so->currPos is initialized by _bt_readfirstpage before then). + */ + BTScanPosInvalidate(so->currPos); + pos->moreLeft = pos->moreRight = true; + + if (first) + { + /* + * Initialize array related state when called from _bt_first, assuming + * that this will be the first primitive index scan for the scan + */ + so->needPrimScan = false; + so->scanBehind = false; + so->oppositeDirCheck = false; + } + else + { + /* + * Don't attempt to seize the scan when it requires another primitive + * index scan, since caller's backend cannot start it right now + */ + if (so->needPrimScan) + return false; + } + + btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan, + parallel_scan->ps_offset_am); + + while (1) + { + LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE); + + if (btscan->btps_pageStatus == BTPARALLEL_DONE) + { + /* We're done with this parallel index scan */ + status = false; + } + else if (btscan->btps_pageStatus == BTPARALLEL_IDLE && + btscan->btps_nextScanPage == P_NONE) + { + /* End this parallel index scan */ + status = false; + endscan = true; + } + else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) + { + Assert(so->numArrayKeys); + + if (first) + { + /* Can start scheduled primitive scan right away, so do so */ + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + + /* Restore scan's array keys from serialized values */ + _bt_parallel_restore_arrays(rel, btscan, so); + exit_loop = true; + } + else + { + /* + * Don't attempt to seize the scan when it requires another + * primitive index scan, since caller's backend cannot start + * it right now + */ + status = false; + } + + /* + * Either way, update backend local state to indicate that a + * pending primitive scan is required + */ + so->needPrimScan = true; + so->scanBehind = false; + so->oppositeDirCheck = false; + } + else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING) + { + /* + * We have successfully seized control of the scan for the purpose + * of advancing it to a new page! 
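+ * Other workers looping in this function keep sleeping on btps_cv until
+ * this backend hands the scan back via _bt_parallel_release() (or ends
+ * it with _bt_parallel_done()).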
+ */ + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + Assert(btscan->btps_nextScanPage != P_NONE); + *next_scan_page = btscan->btps_nextScanPage; + *last_curr_page = btscan->btps_lastCurrPage; + exit_loop = true; + } + LWLockRelease(&btscan->btps_lock); + if (exit_loop || !status) + break; + ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); + } + ConditionVariableCancelSleep(); + + /* When the scan has reached the rightmost (or leftmost) page, end it */ + if (endscan) + _bt_parallel_done(scan); + + return status; +} + /* * _bt_parallel_release() -- Complete the process of advancing the scan to a * new page. We now have the new value btps_nextScanPage; another backend diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 77264ddeecb5..10b28a76c0f6 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -24,8 +24,20 @@ #include "utils/lsyscache.h" #include "utils/rel.h" +/* + * XXX A lot of the new functions are copies of the non-batching version, with + * changes to make it work with batching (which means with position provided + * by the caller, not from the BTScanOpaque). The duplication is not great, + * but it's a bit unclear what to do about it. One option would be to remove + * the amgettuple() interface altogether, once the batching API works, but we + * may also choose to keep both (e.g. for cases that don't support batching, + * like scans of catalogs). In that case we'd need to do some refactoring to + * share as much code as possible. + */ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); + +/* static void _bt_drop_lock_and_maybe_pin_batch(IndexScanDesc scan, BTBatchScanPos sp); */ static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); @@ -34,24 +46,44 @@ static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage); +static IndexScanBatch _bt_readpage_batch(IndexScanDesc scan, BTBatchScanPos pos, + ScanDirection dir, OffsetNumber offnum, + bool firstPage); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_saveitem_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, IndexTuple itup); static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup); +static int _bt_setuppostingitems_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset); +static inline void _bt_savepostingitem_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static IndexScanBatch _bt_steppage_batch(IndexScanDesc scan, BTBatchScanPos pos, + ScanDirection dir); static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir); +static IndexScanBatch _bt_readfirstpage_batch(IndexScanDesc scan, BTBatchScanPos pos, + OffsetNumber offnum, + ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection 
dir, bool seized); +static IndexScanBatch _bt_readnextpage_batch(IndexScanDesc scan, BTBatchScanPos pos, + BlockNumber blkno, BlockNumber lastcurrblkno, + ScanDirection dir, bool seized); static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); +static IndexScanBatch _bt_endpoint_batch(IndexScanDesc scan, ScanDirection dir); /* @@ -77,6 +109,20 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) } } +/* static void */ +/* _bt_drop_lock_and_maybe_pin_batch(IndexScanDesc scan, BTBatchScanPos sp) */ +/* { */ +/* _bt_unlockbuf(scan->indexRelation, sp->buf); */ +/* */ +/* / if (IsMVCCSnapshot(scan->xs_snapshot) && */ +/* RelationNeedsWAL(scan->indexRelation) && */ +/* !scan->xs_want_itup) */ +/* { */ +/* ReleaseBuffer(sp->buf); */ +/* sp->buf = InvalidBuffer; */ +/* } */ +/* } */ + /* * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. @@ -1570,136 +1616,1344 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) } /* - * _bt_readpage() -- Load data from current index page into so->currPos + * _bt_first_batch() -- Load the first batch in a scan. * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. All other fields of so->currPos are - * initialized from scratch here. + * A batch variant of _bt_first(). Most of the comments for that function + * apply here too. * - * We scan the current page starting at offnum and moving in the indicated - * direction. All items matching the scan keys are loaded into currPos.items. - * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports - * that there can be no more matching tuples in the current scan direction - * (could just be for the current primitive index scan when scan has arrays). + * XXX This only populates the batch, it does not set any other fields like + * scan->xs_heaptid or scan->xs_itup. That happens in getnext_tid() calls. * - * In the case of a parallel scan, caller must have called _bt_parallel_seize - * prior to calling this function; this function will invoke - * _bt_parallel_release before returning. + * XXX I'm not sure it works to mix batched and non-batches calls, e.g. get + * a TID and then a batch of TIDs. It probably should work as long as we + * update itemIndex correctly, but we need to be careful about killed items + * (right now the two places use different ways to communicate which items + * should be killed). * - * Returns true if any matching items found on the page, false if none. + * XXX We probably should not rely on _bt_first/_bt_steppage, because that + * very much relies on currPos, and it's just laziness to rely on that. For + * batching we probably need something else anyway. 
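+ *
+ * XXX Control flow, for orientation (editorial summary, no new behavior):
+ * _bt_first_batch() descends with _bt_search(), positions within the leaf
+ * page via _bt_binsrch(), and returns whatever _bt_readfirstpage_batch()
+ * -> _bt_readpage_batch() builds through index_batch_alloc(). Subsequent
+ * batches come from _bt_next_batch(), which continues from the previous
+ * batch's BTBatchScanPos via _bt_steppage_batch().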
*/ -static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, - bool firstpage) +IndexScanBatch +_bt_first_batch(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Page page; - BTPageOpaque opaque; - OffsetNumber minoff; - OffsetNumber maxoff; - BTReadPageState pstate; - bool arrayKeys; - int itemIndex, - indnatts; + BTStack stack; + OffsetNumber offnum; + BTScanInsertData inskey; + ScanKey startKeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + int keysz = 0; + StrategyNumber strat_total; + BlockNumber blkno = InvalidBlockNumber, + lastcurrblkno; + BTBatchScanPosData pos; - /* save the page/buffer block number, along with its sibling links */ - page = BufferGetPage(so->currPos.buf); - opaque = BTPageGetOpaque(page); - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); - so->currPos.prevPage = opaque->btpo_prev; - so->currPos.nextPage = opaque->btpo_next; + BTBatchScanPosInvalidate(pos); - Assert(!P_IGNORE(opaque)); - Assert(BTScanPosIsPinned(so->currPos)); - Assert(!so->needPrimScan); + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); - if (scan->parallel_scan) - { - /* allow next/prev page to be read by other worker without delay */ - if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, so->currPos.nextPage, - so->currPos.currPage); - else - _bt_parallel_release(scan, so->currPos.prevPage, - so->currPos.currPage); - } + /* FIXME maybe check there's no active batch yet */ + /* Assert(!BTScanPosIsValid(so->currPos)); */ - /* initialize remaining currPos fields related to current page */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - so->currPos.dir = dir; - so->currPos.nextTupleOffset = 0; - /* either moreLeft or moreRight should be set now (may be unset later) */ - Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : - so->currPos.moreLeft); + /* + * Examine the scan keys and eliminate any redundant keys; also mark the + * keys that must be matched to continue the scan. + */ + _bt_preprocess_keys(scan); - PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + /* + * Quit now if _bt_preprocess_keys() discovered that the scan keys can + * never be satisfied (eg, x == 1 AND x > 2). + */ + if (!so->qual_ok) + { + Assert(!so->needPrimScan); + _bt_parallel_done(scan); + return false; + } - /* initialize local variables */ - indnatts = IndexRelationGetNumberOfAttributes(rel); - arrayKeys = so->numArrayKeys != 0; - minoff = P_FIRSTDATAKEY(opaque); - maxoff = PageGetMaxOffsetNumber(page); + /* + * If this is a parallel scan, we must seize the scan. _bt_readfirstpage + * will likely release the parallel scan later on. 
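+ * (In the batch variant that release actually happens in
+ * _bt_readpage_batch(), which calls _bt_parallel_release() as soon as the
+ * page's sibling links have been copied into the batch position.)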
+ */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize_batch(scan, &pos, &blkno, &lastcurrblkno, true)) + return false; - /* initialize page-level state that we'll pass to _bt_checkkeys */ - pstate.minoff = minoff; - pstate.maxoff = maxoff; - pstate.finaltup = NULL; - pstate.page = page; - pstate.firstpage = firstpage; - pstate.forcenonrequired = false; - pstate.startikey = 0; - pstate.offnum = InvalidOffsetNumber; - pstate.skip = InvalidOffsetNumber; - pstate.continuescan = true; /* default assumption */ - pstate.rechecks = 0; - pstate.targetdistance = 0; - pstate.nskipadvances = 0; + /* + * Initialize the scan's arrays (if any) for the current scan direction + * (except when they were already set to later values as part of + * scheduling the primitive index scan that is now underway) + */ + if (so->numArrayKeys && !so->needPrimScan) + _bt_start_array_keys(scan, dir); - if (ScanDirectionIsForward(dir)) + if (blkno != InvalidBlockNumber) { - /* SK_SEARCHARRAY forward scans must provide high key up front */ - if (arrayKeys) - { - if (!P_RIGHTMOST(opaque)) - { - ItemId iid = PageGetItemId(page, P_HIKEY); + /* + * We anticipated calling _bt_search, but another worker bet us to it. + * _bt_readnextpage releases the scan for us (not _bt_readfirstpage). + */ + Assert(scan->parallel_scan != NULL); + Assert(!so->needPrimScan); + Assert(blkno != P_NONE); - pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + return _bt_readnextpage_batch(scan, &pos, blkno, lastcurrblkno, dir, true); + } - if (so->scanBehind && - !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) - { - /* Schedule another primitive index scan after all */ - so->currPos.moreRight = false; - so->needPrimScan = true; - if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); - return false; - } - } + /* + * Count an indexscan for stats, now that we know that we'll call + * _bt_search/_bt_endpoint below + */ + pgstat_count_index_scan(rel); + if (scan->instrument) + scan->instrument->nsearches++; - so->scanBehind = so->oppositeDirCheck = false; /* reset */ - } + /*---------- + * Examine the scan keys to discover where we need to start the scan. + * + * We want to identify the keys that can be used as starting boundaries; + * these are =, >, or >= keys for a forward scan or =, <, <= keys for + * a backwards scan. We can use keys for multiple attributes so long as + * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept + * a > or < boundary or find an attribute with no boundary (which can be + * thought of as the same as "> -infinity"), we can't use keys for any + * attributes to its right, because it would break our simplistic notion + * of what initial positioning strategy to use. + * + * When the scan keys include cross-type operators, _bt_preprocess_keys + * may not be able to eliminate redundant keys; in such cases we will + * arbitrarily pick a usable one for each attribute. This is correct + * but possibly not optimal behavior. (For example, with keys like + * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when + * x=5 would be more efficient.) Since the situation only arises given + * a poorly-worded query plus an incomplete opfamily, live with it. + * + * When both equality and inequality keys appear for a single attribute + * (again, only possible when cross-type operators appear), we *must* + * select one of the equality keys for the starting point, because + * _bt_checkkeys() will stop the scan as soon as an equality qual fails. 
+ * For example, if we have keys like "x >= 4 AND x = 10" and we elect to + * start at x=4, we will fail and stop before reaching x=10. If multiple + * equality quals survive preprocessing, however, it doesn't matter which + * one we use --- by definition, they are either redundant or + * contradictory. + * + * In practice we rarely see any "attribute boundary key gaps" here. + * Preprocessing can usually backfill skip array keys for any attributes + * that were omitted from the original scan->keyData[] input keys. All + * array keys are always considered = keys, but we'll sometimes need to + * treat the current key value as if we were using an inequality strategy. + * This happens with range skip arrays, which store inequality keys in the + * array's low_compare/high_compare fields (used to find the first/last + * set of matches, when = key will lack a usable sk_argument value). + * These are always preferred over any redundant "standard" inequality + * keys on the same column (per the usual rule about preferring = keys). + * Note also that any column with an = skip array key can never have an + * additional, contradictory = key. + * + * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP + * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. + * If the index stores nulls at the end of the index we'll be starting + * from, and we have no boundary key for the column (which means the key + * we deduced NOT NULL from is an inequality key that constrains the other + * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to + * use as a boundary key. If we didn't do this, we might find ourselves + * traversing a lot of null entries at the start of the scan. + * + * In this loop, row-comparison keys are treated the same as keys on their + * first (leftmost) columns. We'll add on lower-order columns of the row + * comparison below, if possible. + * + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. + * + * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start + * the next primitive index scan (for scans with array keys) based in part + * on an understanding of how it'll enable us to reposition the scan. + * They're directly aware of how we'll sometimes cons up an explicit + * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a + * symmetric "deduce NOT NULL" rule of their own. This allows top-level + * scans to skip large groups of NULLs through repeated deductions about + * key strictness (for a required inequality key) and whether NULLs in the + * key's index column are stored last or first (relative to non-NULLs). + * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might + * need to be kept in sync. + *---------- + */ + strat_total = BTEqualStrategyNumber; + if (so->numberOfKeys > 0) + { + AttrNumber curattr; + ScanKey chosen; + ScanKey impliesNN; + ScanKey cur; /* - * Consider pstate.startikey optimization once the ongoing primitive - * index scan has already read at least one page + * chosen is the so-far-chosen key for the current attribute, if any. + * We don't cast the decision in stone until we reach keys for the + * next attribute. 
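+ *
+ * For example, in a forward scan with "a >= 1 AND b = 2" both keys end
+ * up in startKeys[] (= and >= keep the prior attribute usable), whereas
+ * with "a > 1 AND b = 2" only "a > 1" is kept: after a strict inequality
+ * we stop adding boundary keys, per the rules described above.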
*/ - if (!pstate.firstpage && minoff < maxoff) - _bt_set_startikey(scan, &pstate); - - /* load items[] in ascending order */ - itemIndex = 0; - - offnum = Max(offnum, minoff); + cur = so->keyData; + curattr = 1; + chosen = NULL; + /* Also remember any scankey that implies a NOT NULL constraint */ + impliesNN = NULL; - while (offnum <= maxoff) + /* + * Loop iterates from 0 to numberOfKeys inclusive; we use the last + * pass to handle after-last-key processing. Actual exit from the + * loop is at one of the "break" statements below. + */ + for (int i = 0;; cur++, i++) { - ItemId iid = PageGetItemId(page, offnum); - IndexTuple itup; + if (i >= so->numberOfKeys || cur->sk_attno != curattr) + { + /* + * Done looking at keys for curattr. + * + * If this is a scan key for a skip array whose current + * element is MINVAL, choose low_compare (when scanning + * backwards it'll be MAXVAL, and we'll choose high_compare). + * + * Note: if the array's low_compare key makes 'chosen' NULL, + * then we behave as if the array's first element is -inf, + * except when !array->null_elem implies a usable NOT NULL + * constraint. + */ + if (chosen != NULL && + (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + { + int ikey = chosen - so->keyData; + ScanKey skipequalitykey = chosen; + BTArrayKeyInfo *array = NULL; + + for (int arridx = 0; arridx < so->numArrayKeys; arridx++) + { + array = &so->arrayKeys[arridx]; + if (array->scan_key == ikey) + break; + } + + if (ScanDirectionIsForward(dir)) + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); + chosen = array->low_compare; + } + else + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); + chosen = array->high_compare; + } + + Assert(chosen == NULL || + chosen->sk_attno == skipequalitykey->sk_attno); + + if (!array->null_elem) + impliesNN = skipequalitykey; + else + Assert(chosen == NULL && impliesNN == NULL); + } + + /* + * If we didn't find a usable boundary key, see if we can + * deduce a NOT NULL key + */ + if (chosen == NULL && impliesNN != NULL && + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + ScanDirectionIsForward(dir) : + ScanDirectionIsBackward(dir))) + { + /* Yes, so build the key in notnullkeys[keysz] */ + chosen = ¬nullkeys[keysz]; + ScanKeyEntryInitialize(chosen, + (SK_SEARCHNOTNULL | SK_ISNULL | + (impliesNN->sk_flags & + (SK_BT_DESC | SK_BT_NULLS_FIRST))), + curattr, + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + BTGreaterStrategyNumber : + BTLessStrategyNumber), + InvalidOid, + InvalidOid, + InvalidOid, + (Datum) 0); + } + + /* + * If we still didn't find a usable boundary key, quit; else + * save the boundary key pointer in startKeys. + */ + if (chosen == NULL) + break; + startKeys[keysz++] = chosen; + + /* + * We can only consider adding more boundary keys when the one + * that we just chose to add uses either the = or >= strategy + * (during backwards scans we can only do so when the key that + * we just added to startKeys[] uses the = or <= strategy) + */ + strat_total = chosen->sk_strategy; + if (strat_total == BTGreaterStrategyNumber || + strat_total == BTLessStrategyNumber) + break; + + /* + * If the key that we just added to startKeys[] is a skip + * array = key whose current element is marked NEXT or PRIOR, + * make strat_total > or < (and stop adding boundary keys). + * This can only happen with opclasses that lack skip support. 
+ */ + if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + { + Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(strat_total == BTEqualStrategyNumber); + + if (ScanDirectionIsForward(dir)) + { + Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + strat_total = BTGreaterStrategyNumber; + } + else + { + Assert(!(chosen->sk_flags & SK_BT_NEXT)); + strat_total = BTLessStrategyNumber; + } + + /* + * We're done. We'll never find an exact = match for a + * NEXT or PRIOR sentinel sk_argument value. There's no + * sense in trying to add more keys to startKeys[]. + */ + break; + } + + /* + * Done if that was the last scan key output by preprocessing. + * Also done if there is a gap index attribute that lacks a + * usable key (only possible when preprocessing was unable to + * generate a skip array key to "fill in the gap"). + */ + if (i >= so->numberOfKeys || + cur->sk_attno != curattr + 1) + break; + + /* + * Reset for next attr. + */ + curattr = cur->sk_attno; + chosen = NULL; + impliesNN = NULL; + } + + /* + * Can we use this key as a starting boundary for this attr? + * + * If not, does it imply a NOT NULL constraint? (Because + * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, + * *any* inequality key works for that; we need not test.) + */ + switch (cur->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsBackward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + case BTEqualStrategyNumber: + /* override any non-equality choice */ + chosen = cur; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsForward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + } + } + } + + /* + * If we found no usable boundary keys, we have to start from one end of + * the tree. Walk down that edge to the first or last key, and scan from + * there. + * + * Note: calls _bt_readfirstpage for us, which releases the parallel scan. + */ + if (keysz == 0) + return _bt_endpoint_batch(scan, dir); + + /* + * We want to start the scan somewhere within the index. Set up an + * insertion scankey we can use to search for the boundary point we + * identified above. The insertion scankey is built using the keys + * identified by startKeys[]. (Remaining insertion scankey fields are + * initialized after initial-positioning scan keys are finalized.) + */ + Assert(keysz <= INDEX_MAX_KEYS); + for (int i = 0; i < keysz; i++) + { + ScanKey cur = startKeys[i]; + + Assert(cur->sk_attno == i + 1); + + if (cur->sk_flags & SK_ROW_HEADER) + { + /* + * Row comparison header: look to the first row member instead + */ + ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + + /* + * Cannot be a NULL in the first row member: _bt_preprocess_keys + * would've marked the qual as unsatisfiable, preventing us from + * ever getting this far + */ + Assert(subkey->sk_flags & SK_ROW_MEMBER); + Assert(subkey->sk_attno == cur->sk_attno); + Assert(!(subkey->sk_flags & SK_ISNULL)); + + /* + * The member scankeys are already in insertion format (ie, they + * have sk_func = 3-way-comparison function) + */ + memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); + + /* + * If the row comparison is the last positioning key we accepted, + * try to add additional keys from the lower-order row members. + * (If we accepted independent conditions on additional index + * columns, we use those instead --- doesn't seem worth trying to + * determine which is more restrictive.) 
Note that this is OK + * even if the row comparison is of ">" or "<" type, because the + * condition applied to all but the last row member is effectively + * ">=" or "<=", and so the extra keys don't break the positioning + * scheme. But, by the same token, if we aren't able to use all + * the row members, then the part of the row comparison that we + * did use has to be treated as just a ">=" or "<=" condition, and + * so we'd better adjust strat_total accordingly. + */ + if (i == keysz - 1) + { + bool used_all_subkeys = false; + + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) + { + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != keysz + 1) + break; /* out-of-sequence, can't use it */ + if (subkey->sk_strategy != cur->sk_strategy) + break; /* wrong direction, can't use it */ + if (subkey->sk_flags & SK_ISNULL) + break; /* can't use null keys */ + Assert(keysz < INDEX_MAX_KEYS); + memcpy(inskey.scankeys + keysz, subkey, + sizeof(ScanKeyData)); + keysz++; + if (subkey->sk_flags & SK_ROW_END) + { + used_all_subkeys = true; + break; + } + } + if (!used_all_subkeys) + { + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } + } + break; /* done with outer loop */ + } + } + else + { + /* + * Ordinary comparison key. Transform the search-style scan key + * to an insertion scan key by replacing the sk_func with the + * appropriate btree comparison function. + * + * If scankey operator is not a cross-type comparison, we can use + * the cached comparison function; otherwise gotta look it up in + * the catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * We support the convention that sk_subtype == InvalidOid means + * the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). + */ + if (cur->sk_subtype == rel->rd_opcintype[i] || + cur->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + procinfo, + cur->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + cur->sk_subtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, + cur->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + cmp_proc, + cur->sk_argument); + } + } + } + + /*---------- + * Examine the selected initial-positioning strategy to determine exactly + * where we need to start the scan, and set flag variables to control the + * initial descent by _bt_search (and our _bt_binsrch call for the leaf + * page _bt_search returns). 
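+ *
+ * For instance, strat_total = BTGreaterEqualStrategyNumber maps to
+ * nextkey = false, backward = false (descend to the first item >= the
+ * boundary keys), while BTLessStrategyNumber maps to nextkey = false,
+ * backward = true (descend to the last item < them); see the switch
+ * statement below.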
+ *---------- + */ + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); + inskey.anynullkeys = false; /* unused */ + inskey.scantid = NULL; + inskey.keysz = keysz; + switch (strat_total) + { + case BTLessStrategyNumber: + + inskey.nextkey = false; + inskey.backward = true; + break; + + case BTLessEqualStrategyNumber: + + inskey.nextkey = true; + inskey.backward = true; + break; + + case BTEqualStrategyNumber: + + /* + * If a backward scan was specified, need to start with last equal + * item not first one. + */ + if (ScanDirectionIsBackward(dir)) + { + /* + * This is the same as the <= strategy + */ + inskey.nextkey = true; + inskey.backward = true; + } + else + { + /* + * This is the same as the >= strategy + */ + inskey.nextkey = false; + inskey.backward = false; + } + break; + + case BTGreaterEqualStrategyNumber: + + /* + * Find first item >= scankey + */ + inskey.nextkey = false; + inskey.backward = false; + break; + + case BTGreaterStrategyNumber: + + /* + * Find first item > scankey + */ + inskey.nextkey = true; + inskey.backward = false; + break; + + default: + /* can't get here, but keep compiler quiet */ + elog(ERROR, "unrecognized strat_total: %d", (int) strat_total); + return false; + } + + /* + * Use the manufactured insertion scan key to descend the tree and + * position ourselves on the target leaf page. + */ + Assert(ScanDirectionIsBackward(dir) == inskey.backward); + stack = _bt_search(rel, NULL, &inskey, &pos.buf, BT_READ); + + /* don't need to keep the stack around... */ + _bt_freestack(stack); + + if (!BufferIsValid(pos.buf)) + { + /* + * We only get here if the index is completely empty. Lock relation + * because nothing finer to lock exists. Without a buffer lock, it's + * possible for another transaction to insert data between + * _bt_search() and PredicateLockRelation(). We have to try again + * after taking the relation-level predicate lock, to close a narrow + * window where we wouldn't scan concurrently inserted tuples, but the + * writer wouldn't see our predicate lock. + */ + if (IsolationIsSerializable()) + { + PredicateLockRelation(rel, scan->xs_snapshot); + stack = _bt_search(rel, NULL, &inskey, &pos.buf, BT_READ); + _bt_freestack(stack); + } + + if (!BufferIsValid(pos.buf)) + { + Assert(!so->needPrimScan); + _bt_parallel_done(scan); + return false; + } + } + + /* position to the precise item on the page */ + offnum = _bt_binsrch(rel, &inskey, pos.buf); + + /* + * Now load data from the first page of the scan (usually the page + * currently in so->currPos.buf). + * + * If inskey.nextkey = false and inskey.backward = false, offnum is + * positioned at the first non-pivot tuple >= inskey.scankeys. + * + * If inskey.nextkey = false and inskey.backward = true, offnum is + * positioned at the last non-pivot tuple < inskey.scankeys. + * + * If inskey.nextkey = true and inskey.backward = false, offnum is + * positioned at the first non-pivot tuple > inskey.scankeys. + * + * If inskey.nextkey = true and inskey.backward = true, offnum is + * positioned at the last non-pivot tuple <= inskey.scankeys. + * + * It's possible that _bt_binsrch returned an offnum that is out of bounds + * for the page. For example, when inskey is both < the leaf page's high + * key and > all of its non-pivot tuples, offnum will be "maxoff + 1". + */ + return _bt_readfirstpage_batch(scan, &pos, offnum, dir); +} + +/* + * _bt_next_batch() -- Get the next batch of items in a scan. + * + * A batch variant of _bt_next(). Most of the comments for that function + * apply here too. 
+ * + * We should only get here when the current batch has no more items + * in the given direction. We don't get here with empty batches; that's + * handled by _bt_first_batch(). + * + * XXX See also the comments at _bt_first_batch() about returning a single + * batch for the page, etc. + */ +IndexScanBatch +_bt_next_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir) +{ + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + // BTBatchScanPos pos; + BTBatchScanPosData tmp; + // IndexScanBatch batch; + // int idx; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* + * restore the BTScanOpaque from the current batch + * + * XXX This is pretty ugly/expensive. Ideally we'd have all the fields + * needed to determine "location" in the index (essentially BTScanPosData) + * in the batch, without cloning all the other stuff. + */ + // Assert(scan->xs_batches->currentBatch != NULL); + + /* + * Use the last batch as the "current" batch. We use the streamPos if + * initialized, or the readPos as a fallback. Alternatively, we could + * simply use the last batch in the queue, i.e. (nextBatch - 1). + * + * Even better, we could pass the "correct" batch from indexam.c, and + * let that figure out which position to move from. + */ +/* + idx = scan->xs_batches->streamPos.batch; + if (idx == -1) + idx = scan->xs_batches->readPos.batch; + + batch = INDEX_SCAN_BATCH(scan, idx); + Assert(batch != NULL); + pos = (BTBatchScanPos) batch->opaque; +*/ + + Assert(BTBatchScanPosIsPinned(*pos)); + + memcpy(&tmp, pos, sizeof(tmp)); + + /* + * Advance to next page, load the data into the index batch. + * + * FIXME It may not be quite correct to just pass the position from the + * current batch; some of the functions scribble over it (e.g. + * _bt_readpage_batch). Maybe we should create a copy, or something? + * + * XXX For now we pass a local copy "tmp". + */ + return _bt_steppage_batch(scan, &tmp, dir); +} + +/* + * _bt_kill_batch() -- remember the items-to-be-killed from the current batch + * + * We simply translate the bitmap into the "regular" killedItems array, and + * let that drive which items are killed. + */ +void +_bt_kill_batch(IndexScanDesc scan, IndexScanBatch batch) +{ + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* we should only get here for scans with batching */ + Assert(scan->xs_batches); + + /* bail out if the batch has no killed items */ + if (batch->numKilled == 0) + return; + + /* + * XXX Now what? We don't have the currPos around anymore, so we should + * load that, and apply the killed items to that, somehow? + */ + /* FIXME: _bt_kill_batch not implemented */ + + /* + * XXX Maybe we should have a separate callback for this, and call it from + * indexam.c directly whenever we think it's appropriate? And not only + * from here when freeing the batch? + */ + _bt_killitems_batch(scan, batch); +} + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here.
+ * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. + * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction + * (could just be for the current primitive index scan when scan has arrays). + * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * + * Returns true if any matching items found on the page, false if none. + */ +static bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, + bool firstpage) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + BTReadPageState pstate; + bool arrayKeys; + int itemIndex, + indnatts; + + /* save the page/buffer block number, along with its sibling links */ + page = BufferGetPage(so->currPos.buf); + opaque = BTPageGetOpaque(page); + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + so->currPos.prevPage = opaque->btpo_prev; + so->currPos.nextPage = opaque->btpo_next; + + Assert(!P_IGNORE(opaque)); + Assert(BTScanPosIsPinned(so->currPos)); + Assert(!so->needPrimScan); + + if (scan->parallel_scan) + { + /* allow next/prev page to be read by other worker without delay */ + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, so->currPos.nextPage, + so->currPos.currPage); + else + _bt_parallel_release(scan, so->currPos.prevPage, + so->currPos.currPage); + } + + /* initialize remaining currPos fields related to current page */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + so->currPos.dir = dir; + so->currPos.nextTupleOffset = 0; + /* either moreLeft or moreRight should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? 
so->currPos.moreRight : + so->currPos.moreLeft); + + PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); + arrayKeys = so->numArrayKeys != 0; + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* initialize page-level state that we'll pass to _bt_checkkeys */ + pstate.minoff = minoff; + pstate.maxoff = maxoff; + pstate.finaltup = NULL; + pstate.page = page; + pstate.firstpage = firstpage; + pstate.forcenonrequired = false; + pstate.startikey = 0; + pstate.offnum = InvalidOffsetNumber; + pstate.skip = InvalidOffsetNumber; + pstate.continuescan = true; /* default assumption */ + pstate.rechecks = 0; + pstate.targetdistance = 0; + pstate.nskipadvances = 0; + + if (ScanDirectionIsForward(dir)) + { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (arrayKeys) + { + if (!P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (so->scanBehind && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + so->currPos.moreRight = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + so->currPos.currPage); + return false; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + + itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); + + pstate.offnum = offnum; + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); + + /* + * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) + */ + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum < pstate.skip); + Assert(!pstate.forcenonrequired); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + + if (passes_quals) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!pstate.continuescan) + break; + + offnum = OffsetNumberNext(offnum); + } + + /* + * We don't need to visit page to the right when the high key + * indicates that no more matches 
will be found there. + * + * Checking the high key like this works out more often than you might + * think. Leaf page splits pick a split point between the two most + * dissimilar tuples (this is weighed against the need to evenly share + * free space). Leaf pages with high key attribute values that can + * only appear on non-pivot tuples on the right sibling page are + * common. + */ + if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + IndexTuple itup = (IndexTuple) PageGetItem(page, iid); + int truncatt; + + truncatt = BTreeTupleGetNAtts(itup, rel); + pstate.forcenonrequired = false; + pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */ + _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); + } + + if (!pstate.continuescan) + so->currPos.moreRight = false; + + Assert(itemIndex <= MaxTIDsPerBTreePage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + /* SK_SEARCHARRAY backward scans must provide final tuple up front */ + if (arrayKeys) + { + if (minoff <= maxoff && !P_LEFTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, minoff); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (so->scanBehind && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + so->currPos.moreLeft = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + so->currPos.currPage); + return false; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in descending order */ + itemIndex = MaxTIDsPerBTreePage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool tuple_alive; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual. Most of the + * time, it's a win to not bother examining the tuple's index + * keys, but just skip to the next tuple (previous, actually, + * since we're scanning backwards). However, if this is the first + * tuple on the page, we do check the index keys, to prevent + * uselessly advancing to the page to the left. This is similar + * to the high key optimization used by forward scans. + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + if (offnum > minoff) + { + offnum = OffsetNumberPrev(offnum); + continue; + } + + tuple_alive = false; + } + else + tuple_alive = true; + + itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); + + pstate.offnum = offnum; + if (arrayKeys && offnum == minoff && pstate.forcenonrequired) + { + pstate.forcenonrequired = false; + pstate.startikey = 0; + } + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); + + if (arrayKeys && so->scanBehind) + { + /* + * Done scanning this page, but not done with the current + * primscan. + * + * Note: Forward scans don't check this explicitly, since they + * prefer to reuse pstate.skip for this instead. 
+ */ + Assert(!passes_quals && pstate.continuescan); + Assert(!pstate.forcenonrequired); + + break; + } + + /* + * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) + */ + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum > pstate.skip); + Assert(!pstate.forcenonrequired); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + + if (passes_quals && tuple_alive) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. + */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!pstate.continuescan) + break; + + offnum = OffsetNumberPrev(offnum); + } + + /* + * We don't need to visit page to the left when no more matches will + * be found there + */ + if (!pstate.continuescan) + so->currPos.moreLeft = false; + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + } + + /* + * If _bt_set_startikey told us to temporarily treat the scan's keys as + * nonrequired (possible only during scans with array keys), there must be + * no lasting consequences for the scan's array keys. The scan's arrays + * should now have exactly the same elements as they would have had if the + * nonrequired behavior had never been used. (In general, a scan's arrays + * are expected to track its progress through the index's key space.) + * + * We are required (by _bt_set_startikey) to call _bt_checkkeys against + * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's + * arrays to recover. Assert that that step hasn't been missed. + */ + Assert(!pstate.forcenonrequired); + + return (so->currPos.firstItem <= so->currPos.lastItem); +} + +static IndexScanBatch +_bt_readpage_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir, OffsetNumber offnum, + bool firstpage) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + BTReadPageState pstate; + bool arrayKeys; + int itemIndex, + indnatts; + + /* result */ + /* IndexScanBatch batch = ddd; */ + IndexScanBatch batch; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* + * FIXME fake for _bt_checkkeys, needs to be set properly elsewhere (not + * sure where) + */ + + /* + * XXX we shouldn't be passing this info through currPos but directly, I + * guess. 
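+ *
+ * XXX One possible cleanup (editorial sketch, not something this patch
+ * does): keep the direction in the batch position instead, i.e. rely on
+ * pos->dir (which is set a bit further down) and teach the key-checking
+ * code to look there, so the batch path stops touching so->currPos.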
+ */ + so->currPos.dir = dir; + + /* + * XXX We can pass the exact number if items from this page, by using + * maxoff + */ + batch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup); + + /* FIXME but we don't copy the contents until the end */ + batch->opaque = palloc0(sizeof(BTBatchScanPosData)); + + /* bogus values */ + batch->firstItem = -1; + batch->lastItem = -1; + batch->itemIndex = -1; + + /* if (so->currTuples) */ + /* { */ + /* batch->currTuples = (char *) palloc(BLCKSZ); */ + /* memcpy(batch->currTuples, so->currTuples, BLCKSZ); */ + /* } */ + + /* save the page/buffer block number, along with its sibling links */ + page = BufferGetPage(pos->buf); + opaque = BTPageGetOpaque(page); + pos->currPage = BufferGetBlockNumber(pos->buf); + pos->prevPage = opaque->btpo_prev; + pos->nextPage = opaque->btpo_next; + + Assert(!P_IGNORE(opaque)); + Assert(BTBatchScanPosIsPinned(*pos)); + Assert(!so->needPrimScan); + + if (scan->parallel_scan) + { + /* allow next/prev page to be read by other worker without delay */ + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, pos->nextPage, + pos->currPage); + else + _bt_parallel_release(scan, pos->prevPage, + pos->currPage); + } + + /* initialize remaining currPos fields related to current page */ + pos->lsn = BufferGetLSNAtomic(pos->buf); + pos->dir = dir; + pos->nextTupleOffset = 0; + /* either moreLeft or moreRight should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? pos->moreRight : pos->moreLeft); + + PredicateLockPage(rel, pos->currPage, scan->xs_snapshot); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); + arrayKeys = so->numArrayKeys != 0; + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* initialize page-level state that we'll pass to _bt_checkkeys */ + pstate.minoff = minoff; + pstate.maxoff = maxoff; + pstate.finaltup = NULL; + pstate.page = page; + pstate.firstpage = firstpage; + pstate.forcenonrequired = false; + pstate.startikey = 0; + pstate.offnum = InvalidOffsetNumber; + pstate.skip = InvalidOffsetNumber; + pstate.continuescan = true; /* default assumption */ + pstate.rechecks = 0; + pstate.targetdistance = 0; + pstate.nskipadvances = 0; + + if (ScanDirectionIsForward(dir)) + { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (arrayKeys) + { + if (!P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (so->scanBehind && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + pos->moreRight = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + pos->currPage); + return NULL; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; bool passes_quals; /* @@ -1740,7 +2994,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (!BTreeTupleIsPosting(itup)) { /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem_batch(batch, itemIndex, offnum, 
itup); itemIndex++; } else @@ -1752,16 +3006,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * TID */ tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); + _bt_setuppostingitems_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); itemIndex++; /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); + _bt_savepostingitem_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); itemIndex++; } } @@ -1792,17 +3046,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, truncatt = BTreeTupleGetNAtts(itup, rel); pstate.forcenonrequired = false; - pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */ + pstate.startikey = 0; /* _bt_set_startikey ignores HIKEY */ _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } if (!pstate.continuescan) - so->currPos.moreRight = false; + pos->moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + batch->firstItem = 0; + batch->lastItem = itemIndex - 1; + batch->itemIndex = 0; } else { @@ -1819,12 +3073,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) { /* Schedule another primitive index scan after all */ - so->currPos.moreLeft = false; + pos->moreLeft = false; so->needPrimScan = true; if (scan->parallel_scan) _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); - return false; + pos->currPage); + return NULL; } } @@ -1922,7 +3176,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { /* Remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem_batch(batch, itemIndex, offnum, itup); } else { @@ -1940,16 +3194,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, */ itemIndex--; tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); + _bt_setuppostingitems_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { itemIndex--; - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); + _bt_savepostingitem_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); } } } @@ -1965,12 +3219,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * be found there */ if (!pstate.continuescan) - so->currPos.moreLeft = false; + pos->moreLeft = false; Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxTIDsPerBTreePage - 1; - so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + batch->firstItem = itemIndex; + batch->lastItem = MaxTIDsPerBTreePage - 1; + batch->itemIndex = MaxTIDsPerBTreePage - 1; } /* @@ -1987,7 +3241,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, */ Assert(!pstate.forcenonrequired); - return (so->currPos.firstItem <= so->currPos.lastItem); + if (batch->firstItem > batch->lastItem) + return NULL; + + memcpy(batch->opaque, pos, sizeof(BTBatchScanPosData)); + + return batch; } /* Save an index item into so->currPos.items[itemIndex] */ @@ -2005,9 +3264,97 @@ _bt_saveitem(BTScanOpaque so, int 
itemIndex, { Size itupsz = IndexTupleSize(itup); - currItem->tupleOffset = so->currPos.nextTupleOffset; - memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); - so->currPos.nextTupleOffset += MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += MAXALIGN(itupsz); + } +} + +/* + * Setup state to save TIDs/items from a single posting list tuple. + * + * Saves an index item into so->currPos.items[itemIndex] for TID that is + * returned to scan first. Second or subsequent TIDs for posting list should + * be saved by calling _bt_savepostingitem(). + * + * Returns an offset into tuple storage space that main tuple is stored at if + * needed. + */ +static int +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(BTreeTupleIsPosting(itup)); + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + + return currItem->tupleOffset; + } + + return 0; +} + +/* + * Save an index item into so->currPos.items[itemIndex] for current posting + * tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. Caller passes its return value as tupleOffset. + */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every TID + * that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = tupleOffset; +} + +/* Save an index item into so->currPos.items[itemIndex] */ +static void +_bt_saveitem_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, IndexTuple itup) +{ + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + + Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + + /* copy the populated part of the items array */ + batch->items[itemIndex].heapTid = itup->t_tid; + batch->items[itemIndex].indexOffset = offnum; + + if (batch->currTuples) + { + Size itupsz = IndexTupleSize(itup); + + batch->items[itemIndex].tupleOffset = pos->nextTupleOffset; + memcpy(batch->currTuples + pos->nextTupleOffset, itup, itupsz); + pos->nextTupleOffset += MAXALIGN(itupsz); } } @@ -2022,31 +3369,34 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, * needed. 
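 *
 * For illustration only (not part of the patch itself): a posting list
 * tuple with three TIDs read by a forward scan ends up as three
 * consecutive batch items that share one base tuple in batch->currTuples:
 *
 *     off = _bt_setuppostingitems_batch(batch, 0, offnum,
 *                                       BTreeTupleGetPostingN(itup, 0), itup);
 *     _bt_savepostingitem_batch(batch, 1, offnum,
 *                               BTreeTupleGetPostingN(itup, 1), off);
 *     _bt_savepostingitem_batch(batch, 2, offnum,
 *                               BTreeTupleGetPostingN(itup, 2), off);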
*/ static int -_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, IndexTuple itup) +_bt_setuppostingitems_batch(IndexScanBatch batch, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + IndexScanBatchPosItem *item = &batch->items[itemIndex]; Assert(BTreeTupleIsPosting(itup)); - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; - if (so->currTuples) + /* copy the populated part of the items array */ + item->heapTid = *heapTid; + item->indexOffset = offnum; + + if (batch->currTuples) { /* Save base IndexTuple (truncate posting list) */ IndexTuple base; Size itupsz = BTreeTupleGetPostingOffset(itup); itupsz = MAXALIGN(itupsz); - currItem->tupleOffset = so->currPos.nextTupleOffset; - base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + item->tupleOffset = pos->nextTupleOffset; + base = (IndexTuple) (batch->currTuples + pos->nextTupleOffset); memcpy(base, itup, itupsz); /* Defensively reduce work area index tuple header size */ base->t_info &= ~INDEX_SIZE_MASK; base->t_info |= itupsz; - so->currPos.nextTupleOffset += itupsz; + pos->nextTupleOffset += itupsz; - return currItem->tupleOffset; + return item->tupleOffset; } return 0; @@ -2060,20 +3410,20 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, * posting list tuple. Caller passes its return value as tupleOffset. */ static inline void -_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset) +_bt_savepostingitem_batch(IndexScanBatch batch, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + IndexScanBatchPosItem *item = &batch->items[itemIndex]; - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; + item->heapTid = *heapTid; + item->indexOffset = offnum; /* * Have index-only scans return the same base IndexTuple for every TID * that originates from the same posting list */ - if (so->currTuples) - currItem->tupleOffset = tupleOffset; + if (batch->currTuples) + item->tupleOffset = tupleOffset; } /* @@ -2186,6 +3536,71 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } +/* + * a batching version of _bt_steppage(), ignoring irrelevant bits + */ +static IndexScanBatch +_bt_steppage_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno, + lastcurrblkno; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* Batching has a different concept of position, stored in the batch. */ + Assert(BTBatchScanPosIsValid(*pos)); + + /* + * killitems + * + * No need to handle killtuples here, that's going to be dealt with at the + * indexam.c level when freeing the batch, or possibly in when calling + * amfreebatch. + */ + + /* + * mark/restore + * + * Mark/restore shall also be handled at the indexam.c level, by keeping + * the correct batch around, etc. We don't discard the old batch here. + * + * In _bt_steppage this also handled primitive scans for array keys, but + * that probably would be handled at indexam.c level too. + */ + + /* Don't unpin the buffer here, keep the batch pinned until amfreebatch. 
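+	 *
+	 * For illustration only (an assumed caller shape, not code added here):
+	 * the indexam.c layer is expected to drive the batch lifecycle roughly as
+	 *
+	 *     batch = rel->rd_indam->amgetbatch(scan, dir);
+	 *     ... return TIDs from batch->items[firstItem .. lastItem] ...
+	 *     rel->rd_indam->amfreebatch(scan, batch);
+	 *
+	 * so the pin taken on the leaf page here is only dropped by amfreebatch
+	 * (btfreebatch for nbtree), which is also where killed items get handled.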
*/ + + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = pos->nextPage; + else + blkno = pos->prevPage; + + lastcurrblkno = pos->currPage; + + /* + * Cancel primitive index scans that were scheduled when the call to + * _bt_readpage for currPos happened to use the opposite direction to the + * one that we're stepping in now. (It's okay to leave the scan's array + * keys as-is, since the next _bt_readpage will advance them.) + * + * XXX Not sure this is correct. Can we combine the direction from some + * older batch (with mark/restore?) and the current needPrimScan from the + * latest batch we processed? But, the mark/restore code in indexam should + * reset this somehow. + * + * XXX However, aren't primitive scans very btree-specific code? How could + * indexam.c ever handle that? + */ + if (pos->dir != dir) + so->needPrimScan = false; + + return _bt_readnextpage_batch(scan, pos, blkno, lastcurrblkno, dir, false); +} + /* * _bt_readfirstpage() -- Read first page containing valid data for _bt_first * @@ -2265,6 +3680,77 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) return true; } +static IndexScanBatch +_bt_readfirstpage_batch(IndexScanDesc scan, BTBatchScanPos pos, OffsetNumber offnum, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + IndexScanBatch batch; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + so->numKilled = 0; /* just paranoia */ + so->markItemIndex = -1; /* ditto */ + + /* copy position info from BTScanOpaque */ + + /* Initialize so->currPos for the first page (page in so->currPos.buf) */ + if (so->needPrimScan) + { + Assert(so->numArrayKeys); + + pos->moreLeft = true; + pos->moreRight = true; + so->needPrimScan = false; + } + else if (ScanDirectionIsForward(dir)) + { + pos->moreLeft = false; + pos->moreRight = true; + } + else + { + pos->moreLeft = true; + pos->moreRight = false; + } + + /* + * Attempt to load matching tuples from the first page. + * + * Note that _bt_readpage will finish initializing the so->currPos fields. + * _bt_readpage also releases parallel scan (even when it returns false). + */ + if ((batch = _bt_readpage_batch(scan, pos, dir, offnum, true)) != NULL) + { + pos = (BTBatchScanPos) batch->opaque; + + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. + */ + Assert(BTBatchScanPosIsPinned(*pos)); + + /* _bt_drop_lock_and_maybe_pin_batch(scan, pos); */ + /* XXX drop just the lock, not the pin, that's up to btfreebatch */ + /* without this btfreebatch triggers an assert when unpinning the */ + /* buffer, because that checks we're not holding a lock on it */ + _bt_unlockbuf(scan->indexRelation, pos->buf); + return batch; + } + + /* There's no actually-matching data on the page in so->currPos.buf */ + _bt_unlockbuf(scan->indexRelation, pos->buf); + + /* XXX Not sure we can drop the pin before calling steppage_batch? But */ + /* without this, \d+ reports unreleased buffer ... */ + /* And the non-batch code doesn't need to do this. 
*/ + ReleaseBuffer(pos->buf); + + /* Call _bt_readnextpage using its _bt_steppage wrapper function */ + return _bt_steppage_batch(scan, pos, dir); +} + /* * _bt_readnextpage() -- Read next page containing valid data for _bt_next * @@ -2412,6 +3898,138 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, return true; } +static IndexScanBatch +_bt_readnextpage_batch(IndexScanDesc scan, BTBatchScanPos pos, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir, bool seized) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + + /* BTBatchScanPosData newpos; */ + IndexScanBatch newbatch = NULL; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + Assert(pos->currPage == lastcurrblkno || seized); + Assert(BTBatchScanPosIsPinned(*pos) || seized); + + /* initialize the new position to the old one, we'll modify it */ + /* newpos = *pos; */ + + /* pos->moreLeft = pos->moreRight = false; */ + + /* + * Remember that the scan already read lastcurrblkno, a page to the left + * of blkno (or remember reading a page to the right, for backwards scans) + */ + if (ScanDirectionIsForward(dir)) + pos->moreLeft = true; + else + pos->moreRight = true; + + for (;;) + { + Page page; + BTPageOpaque opaque; + + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !pos->moreRight : !pos->moreLeft)) + { + /* most recent _bt_readpage call (for lastcurrblkno) ended scan */ + Assert(pos->currPage == lastcurrblkno && !seized); + BTBatchScanPosInvalidate(*pos); + _bt_parallel_done(scan); /* iff !so->needPrimScan */ + return NULL; + } + + Assert(!so->needPrimScan); + + /* parallel scan must never actually visit so->currPos blkno */ + if (!seized && scan->parallel_scan != NULL && + !_bt_parallel_seize_batch(scan, pos, &blkno, &lastcurrblkno, false)) + { + /* whole scan is now done (or another primitive scan required) */ + BTBatchScanPosInvalidate(*pos); + return NULL; + } + + if (ScanDirectionIsForward(dir)) + { + /* read blkno, but check for interrupts first */ + CHECK_FOR_INTERRUPTS(); + pos->buf = _bt_getbuf(rel, blkno, BT_READ); + } + else + { + /* read blkno, avoiding race (also checks for interrupts) */ + pos->buf = _bt_lock_and_validate_left(rel, &blkno, + lastcurrblkno); + if (pos->buf == InvalidBuffer) + { + /* must have been a concurrent deletion of leftmost page */ + BTBatchScanPosInvalidate(*pos); + _bt_parallel_done(scan); + return NULL; + } + } + + page = BufferGetPage(pos->buf); + opaque = BTPageGetOpaque(page); + lastcurrblkno = blkno; + if (likely(!P_IGNORE(opaque))) + { + /* see if there are any matches on this page */ + if (ScanDirectionIsForward(dir)) + { + /* note that this will clear moreRight if we can stop */ + if ((newbatch = _bt_readpage_batch(scan, pos, dir, P_FIRSTDATAKEY(opaque), false)) != NULL) + break; + blkno = pos->nextPage; + } + else + { + /* note that this will clear moreLeft if we can stop */ + if ((newbatch = _bt_readpage_batch(scan, pos, dir, PageGetMaxOffsetNumber(page), false)) != NULL) + break; + blkno = pos->prevPage; + } + } + else + { + /* _bt_readpage not called, so do all this for ourselves */ + if (ScanDirectionIsForward(dir)) + blkno = opaque->btpo_next; + else + blkno = opaque->btpo_prev; + if (scan->parallel_scan != NULL) + _bt_parallel_release(scan, blkno, lastcurrblkno); + } + + /* no matching tuples on this page */ + _bt_relbuf(rel, pos->buf); + seized = false; /* released by 
_bt_readpage (or by us) */ + } + + /* */ + Assert(newbatch != NULL); + + pos = (BTBatchScanPos) newbatch->opaque; + + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. + */ + Assert(pos->currPage == blkno); + Assert(BTBatchScanPosIsPinned(*pos)); + /* _bt_drop_lock_and_maybe_pin_batch(scan, pos); */ + _bt_unlockbuf(scan->indexRelation, pos->buf); + + return newbatch; +} + /* * _bt_lock_and_validate_left() -- lock caller's left sibling blkno, * recovering from concurrent page splits/page deletions when necessary @@ -2693,3 +4311,79 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) _bt_returnitem(scan, so); return true; } + +/* + * _bt_endpoint() -- Find the first or last page in the index, and scan + * from there to the first key satisfying all the quals. + * + * This is used by _bt_first() to set up a scan when we've determined + * that the scan must start at the beginning or end of the index (for + * a forward or backward scan respectively). + * + * Parallel scan callers must have seized the scan before calling here. + * Exit conditions are the same as for _bt_first(). + */ +static IndexScanBatch +_bt_endpoint_batch(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber start; + BTBatchScanPosData pos; + + BTBatchScanPosInvalidate(pos); + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!so->needPrimScan); + + /* + * Scan down to the leftmost or rightmost leaf page. This is a simplified + * version of _bt_search(). + */ + pos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); + + if (!BufferIsValid(pos.buf)) + { + /* + * Empty index. Lock the whole relation, as nothing finer to lock + * exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + _bt_parallel_done(scan); + return false; + } + + page = BufferGetPage(pos.buf); + opaque = BTPageGetOpaque(page); + Assert(P_ISLEAF(opaque)); + + if (ScanDirectionIsForward(dir)) + { + /* There could be dead pages to the left, so not this: */ + /* Assert(P_LEFTMOST(opaque)); */ + + start = P_FIRSTDATAKEY(opaque); + } + else if (ScanDirectionIsBackward(dir)) + { + Assert(P_RIGHTMOST(opaque)); + + start = PageGetMaxOffsetNumber(page); + } + else + { + elog(ERROR, "invalid scan direction: %d", (int) dir); + start = 0; /* keep compiler quiet */ + } + + /* + * Now load data from the first page of the scan. 
+ */ + return _bt_readfirstpage_batch(scan, &pos, start, dir); +} diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 11802a4c2151..187f6fa5934b 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -3492,6 +3492,185 @@ _bt_killitems(IndexScanDesc scan) _bt_unlockbuf(scan->indexRelation, so->currPos.buf); } +/* + * _bt_killitems_batch + * a variant of _bt_killitems, using the batch-level killedItems + */ +void +_bt_killitems_batch(IndexScanDesc scan, IndexScanBatch batch) +{ + /* BTScanOpaque so = (BTScanOpaque) scan->opaque; */ + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int i; + int numKilled = batch->numKilled; + bool killedsomething = false; + bool droppedpin PG_USED_FOR_ASSERTS_ONLY; + + Assert(BTBatchScanPosIsValid(*pos)); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + batch->numKilled = 0; + + if (BTBatchScanPosIsPinned(*pos)) + { + /* + * We have held the pin on this page since we read the index tuples, + * so all we need to do is lock it. The pin will have prevented + * re-use of any TID on the page, so there is no need to check the + * LSN. + */ + droppedpin = false; + _bt_lockbuf(scan->indexRelation, pos->buf, BT_READ); + + page = BufferGetPage(pos->buf); + } + else + { + Buffer buf; + + droppedpin = true; + /* Attempt to re-read the buffer, getting pin and lock. */ + buf = _bt_getbuf(scan->indexRelation, pos->currPage, BT_READ); + + page = BufferGetPage(buf); + if (BufferGetLSNAtomic(buf) == pos->lsn) + pos->buf = buf; + else + { + /* Modified while not pinned means hinting is not safe. */ + _bt_relbuf(scan->indexRelation, buf); + return; + } + } + + opaque = BTPageGetOpaque(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) + { + int itemIndex = batch->killedItems[i]; + IndexScanBatchPosItem *kitem = &batch->items[itemIndex]; + OffsetNumber offnum = kitem->indexOffset; + + Assert(itemIndex >= batch->firstItem && + itemIndex <= batch->lastItem); + if (offnum < minoff) + continue; /* pure paranoia */ + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; + + if (BTreeTupleIsPosting(ituple)) + { + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + /* + * We rely on the convention that heap TIDs in the scanpos + * items array are stored in ascending heap TID order for a + * group of TIDs that originally came from a posting list + * tuple. This convention even applies during backwards + * scans, where returning the TIDs in descending order might + * seem more natural. This is about effectiveness, not + * correctness. + * + * Note that the page may have been modified in almost any way + * since we first read it (in the !droppedpin case), so it's + * possible that this posting list tuple wasn't a posting list + * tuple when we first encountered its heap TIDs. 
+ */ + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* + * kitem must have matching offnum when heap TIDs match, + * though only in the common case where the page can't + * have been concurrently modified + */ + Assert(kitem->indexOffset == offnum || !droppedpin); + + /* + * Read-ahead to later kitems here. + * + * We rely on the assumption that not advancing kitem here + * will prevent us from considering the posting list tuple + * fully dead by not matching its next heap TID in next + * loop iteration. + * + * If, on the other hand, this is the final heap TID in + * the posting list tuple, then tuple gets killed + * regardless (i.e. we handle the case where the last + * kitem is also the last heap TID in the last index tuple + * correctly -- posting tuple still gets killed). + */ + if (pi < numKilled) + kitem = &batch->items[batch->killedItems[pi++]]; + } + + /* + * Don't bother advancing the outermost loop's int iterator to + * avoid processing killed items that relate to the same + * offnum/posting list tuple. This micro-optimization hardly + * seems worth it. (Further iterations of the outermost loop + * will fail to match on this same posting list's first heap + * TID instead, so we'll advance to the next offnum/index + * tuple pretty quickly.) + */ + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + /* + * Mark index item as dead, if it isn't already. Since this + * happens while holding a buffer lock possibly in shared mode, + * it's possible that multiple processes attempt to do this + * simultaneously, leading to multiple full-page images being sent + * to WAL (if wal_log_hints or data checksums are enabled), which + * is undesirable. + */ + if (killtuple && !ItemIdIsDead(iid)) + { + /* found the item/all posting list items */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. + * + * Whenever we mark anything LP_DEAD, we also set the page's + * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we + * only rely on the page-level flag in !heapkeyspace indexes.) 
+ */ + if (killedsomething) + { + opaque->btpo_flags |= BTP_HAS_GARBAGE; + MarkBufferDirtyHint(pos->buf, true); + } + + _bt_unlockbuf(scan->indexRelation, pos->buf); +} /* * The following routines manage a shared-memory area in which we track diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index a56c5eceb14a..be8e02a9c452 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -217,7 +217,7 @@ table_index_fetch_tuple_check(Relation rel, bool found; slot = table_slot_create(rel, NULL); - scan = table_index_fetch_begin(rel); + scan = table_index_fetch_begin(rel, NULL); found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 3497a8221f29..8a5d79a27a66 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -106,7 +106,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ tmptid = checktid; { - IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); + IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation, + NULL); bool call_again = false; if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index bdf862b24062..1ec046adeffd 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -815,7 +815,17 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, retry: conflict = false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0); + + /* + * It doesn't seem very useful to allow batching/prefetching when checking + * exclusion/uniqueness constraints. We should only find either no or just + * one row, I think. + * + * XXX Maybe there are cases where we could find multiple "candidate" + * rows, e.g. with exclusion constraints? Not sure. + */ + index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0, + false); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42db..9c7df9b9ccbc 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -201,8 +201,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, /* Build scan key. */ skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); - /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0); + /* + * Start an index scan. + * + * XXX No prefetching for replication identity. We expect to find just one + * row, so prefetching would be pointless. 
+ */ + scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0, false); retry: found = false; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca9507a..1a14f5faa68c 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -49,7 +49,13 @@ static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc); +static bool ios_prefetch_block(IndexScanDesc scan, void *data, + IndexScanBatchPos *pos); +/* values stored in ios_prefetch_block in the batch cache */ +#define IOS_UNKNOWN_VISIBILITY 0 /* default value */ +#define IOS_ALL_VISIBLE 1 +#define IOS_NOT_ALL_VISIBLE 2 /* ---------------------------------------------------------------- * IndexOnlyNext @@ -94,15 +100,26 @@ IndexOnlyNext(IndexOnlyScanState *node) estate->es_snapshot, &node->ioss_Instrument, node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + node->ioss_NumOrderByKeys, + node->ioss_CanBatch); node->ioss_ScanDesc = scandesc; - /* Set it up for index-only scan */ node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; + /* + * Set the prefetch callback info, if the scan has batching enabled + * (we only know what after index_beginscan, which also checks which + * callbacks are defined for the AM. + */ + if (scandesc->xs_batches != NULL) + { + scandesc->xs_batches->prefetchCallback = ios_prefetch_block; + scandesc->xs_batches->prefetchArgument = (void *) node; + } + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -120,10 +137,42 @@ IndexOnlyNext(IndexOnlyScanState *node) */ while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + bool all_visible; bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + /* + * Without batching, inspect the VM directly. With batching, we need + * to retrieve the visibility information seen by the read_stream + * callback (or rather by ios_prefetch_block), otherwise the + * read_stream might get out of sync (if the VM got updated since + * then). + */ + if (scandesc->xs_batches == NULL) + { + all_visible = VM_ALL_VISIBLE(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer); + } + else + { + /* + * Reuse the previously determined page visibility info, or + * calculate it now. If we decided not to prefetch the block, the + * page had to be all-visible at that point. The VM bit might have + * changed since then, but the tuple visibility could not have. + * + * XXX It's a bit weird we use the visibility to decide if we + * should skip prefetching the block, and then deduce the + * visibility from that (even if it matches pretty clearly). But + * maybe we could/should have a more direct way to read the + * private state? + */ + all_visible = !ios_prefetch_block(scandesc, node, + &scandesc->xs_batches->readPos); + } + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -158,9 +207,7 @@ IndexOnlyNext(IndexOnlyScanState *node) * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) + if (!all_visible) { /* * Rats, we have to visit the heap to check visibility. 
@@ -596,6 +643,20 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->recheckqual = ExecInitQual(node->recheckqual, (PlanState *) indexstate); + /* + * All index scans can do batching. + * + * XXX Maybe this should check if the index AM supports batching, or even + * call something like "amcanbatch" (does not exist yet). Or check the + * enable_indexscan_batching GUC? + * + * XXX For now we only know if the scan gets to use batching after the + * index_beginscan() returns, so maybe this name is a bit misleading. It's + * more about "allow batching". But maybe this field is unnecessary - we + * check all the interesting stuff in index_beginscan() anyway. + */ + indexstate->ioss_CanBatch = true; + /* * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop * here. This allows an index-advisor plugin to EXPLAIN a plan containing @@ -783,13 +844,21 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, return; } + /* + * XXX Do we actually want prefetching for parallel index scans? Maybe + * not, but then we need to be careful not to call index_batch_getnext_tid + * (which now can happen, because we'll call IndexOnlyNext even for + * parallel plans). Although, that should not happen, because we only call + * that with (xs_batches != NULL). + */ node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, &node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + node->ioss_CanBatch); node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; @@ -849,13 +918,15 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, return; } + /* XXX Do we actually want prefetching for parallel index scans? */ node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, &node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + node->ioss_CanBatch); node->ioss_ScanDesc->xs_want_itup = true; /* @@ -889,3 +960,51 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node) node->ioss_SharedInfo = palloc(size); memcpy(node->ioss_SharedInfo, SharedInfo, size); } + +/* FIXME duplicate from indexam.c */ +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->xs_batches->batches[(idx) % (scan)->xs_batches->maxBatches]) + +/* + * ios_prefetch_block + * Callback to only prefetch blocks that are not all-visible. + * + * We don't want to inspect the visibility map repeatedly, so the result of + * VM_ALL_VISIBLE is stored in the batch private data. The values are set + * to 0 by default, so we use two constants to remember if all-visible or + * not all-visible. + * + * However, this is not merely a question of performance. The VM may get + * modified during the scan, and we need to make sure the two places (the + * read_next callback and the index_fetch_heap here) make the same decision, + * otherwise we might get out of sync with the stream. For example, the + * callback might find a page is all-visible (and skips reading the block), + * and then someone might update the page, resetting the VM bit. If this + * place attempts to read the page from the stream, it'll fail because it + * will probably receive an entirely different page. 
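+ *
+ * For example (illustrative only), the tuple-returning side in
+ * IndexOnlyNext derives page visibility from the same cached value rather
+ * than consulting the VM a second time:
+ *
+ *     all_visible = !ios_prefetch_block(scandesc, node,
+ *                                       &scandesc->xs_batches->readPos);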
+ */ +static bool +ios_prefetch_block(IndexScanDesc scan, void *arg, IndexScanBatchPos *pos) +{ + IndexOnlyScanState *node = (IndexOnlyScanState *) arg; + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, pos->batch); + + if (batch->privateData == NULL) + batch->privateData = palloc0(sizeof(Datum) * (batch->lastItem + 1)); + + if (batch->privateData[pos->index] == IOS_UNKNOWN_VISIBILITY) + { + bool all_visible; + ItemPointer tid = &batch->items[pos->index].heapTid; + + all_visible = VM_ALL_VISIBLE(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer); + + batch->privateData[pos->index] + = all_visible ? IOS_ALL_VISIBLE : IOS_NOT_ALL_VISIBLE; + } + + /* prefetch only blocks that are not all-visible */ + return (batch->privateData[pos->index] == IOS_NOT_ALL_VISIBLE); +} diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 7fcaa37fe625..177d74c2c273 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -111,7 +111,8 @@ IndexNext(IndexScanState *node) estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + node->iss_CanBatch); node->iss_ScanDesc = scandesc; @@ -201,13 +202,16 @@ IndexNextWithReorder(IndexScanState *node) /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. + * + * XXX Should we use batching here? Does it even work for reordering? */ scandesc = index_beginscan(node->ss.ss_currentRelation, node->iss_RelationDesc, estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + false); node->iss_ScanDesc = scandesc; @@ -965,6 +969,18 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->indexorderbyorig = ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate); + /* + * All index scans can do batching. + * + * XXX Maybe this should check if the index AM supports batching, or even + * call something like "amcanbatch" (does not exist yet). Or check the + * enable_indexscan_batching GUC? + * + * XXX Well, we disable batching for reordering, so maybe we should check + * that here instead? But maybe it's unnecessary limitation? + */ + indexstate->iss_CanBatch = true; + /* * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop * here. This allows an index-advisor plugin to EXPLAIN a plan containing @@ -1719,13 +1735,17 @@ ExecIndexScanInitializeDSM(IndexScanState *node, return; } + /* + * XXX Do we actually want prefetching for parallel index scans? + */ node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->iss_RelationDesc, &node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + node->iss_CanBatch); /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -1783,13 +1803,17 @@ ExecIndexScanInitializeWorker(IndexScanState *node, return; } + /* + * XXX Do we actually want prefetching for parallel index scans? 
+ */ node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->iss_RelationDesc, &node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + node->iss_CanBatch); /* * If no run-time keys to calculate or they are ready, go ahead and pass diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 0b317d2d809f..35c3526e2501 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3045,6 +3045,46 @@ ReleaseAndReadBuffer(Buffer buffer, return ReadBuffer(relation, blockNum); } +/* + * BufferMatches + * Check if the buffer (still) contains the expected page. + * + * Check if the buffer contains the expected page. The buffer may be invalid, + * or valid and pinned. + */ +bool +BufferMatches(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + ForkNumber forkNum = MAIN_FORKNUM; + BufferDesc *bufHdr; + + if (BufferIsValid(buffer)) + { + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + { + bufHdr = GetLocalBufferDescriptor(-buffer - 1); + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + else + { + bufHdr = GetBufferDescriptor(buffer - 1); + /* we have pin, so it's ok to examine tag without spinlock */ + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + } + + return false; +} + /* * PinBuffer -- make buffer unavailable for replacement. * diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index a96b1b9c0bc6..facc83bb83a5 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6719,9 +6719,14 @@ get_actual_variable_endpoint(Relation heapRel, InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); + /* + * XXX I'm not sure about batching/prefetching here. In most cases we + * expect to find the endpoints immediately, but sometimes we have a lot + * of dead tuples - and then prefetching might help. 
+ */ index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, NULL, - 1, 0); + 1, 0, false); /* Set it up for index-only scan */ index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 2f8cbd867599..36d2b7f1e68f 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -809,6 +809,16 @@ struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexscan_batching", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-scan batching."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_batching, + true, + NULL, NULL, NULL + }, { {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of index-only-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 34826d01380b..649df2b06a0d 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -415,6 +415,7 @@ #enable_hashjoin = on #enable_incremental_sort = on #enable_indexscan = on +#enable_indexscan_batching = on #enable_indexonlyscan = on #enable_material = on #enable_memoize = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 52916bab7a31..0028bb558436 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -196,6 +196,14 @@ typedef void (*amrescan_function) (IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* next batch of valid tuples */ +typedef IndexScanBatch(*amgetbatch_function) (IndexScanDesc scan, + ScanDirection direction); + +/* release batch of valid tuples */ +typedef void (*amfreebatch_function) (IndexScanDesc scan, + IndexScanBatch batch); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -307,6 +315,8 @@ typedef struct IndexAmRoutine ambeginscan_function ambeginscan; amrescan_function amrescan; amgettuple_function amgettuple; /* can be NULL */ + amgetbatch_function amgetbatch; /* can be NULL */ + amfreebatch_function amfreebatch; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 5b2ab181b5f8..8bef942b11d5 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -15,6 +15,7 @@ #define GENAM_H #include "access/htup.h" +#include "access/itup.h" #include "access/sdir.h" #include "access/skey.h" #include "nodes/tidbitmap.h" @@ -111,6 +112,7 @@ typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); /* struct definitions appear in relscan.h */ typedef struct IndexScanDescData *IndexScanDesc; +typedef struct IndexScanBatchData *IndexScanBatch; typedef struct SysScanDescData *SysScanDesc; typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc; @@ -155,6 +157,8 @@ typedef struct IndexOrderByDistance * generalized index_ interface routines (in indexam.c) */ +extern PGDLLIMPORT bool enable_indexscan_batching; + /* * IndexScanIsValid * True iff the index scan is valid. 
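 *
 * Illustrative usage sketch (not part of the patch): callers opt in to
 * batching via the new last argument of index_beginscan(); whether the
 * scan actually batches still depends on the index AM providing
 * amgetbatch/amfreebatch and on the enable_indexscan_batching GUC.
 *
 *     scan = index_beginscan(heapRel, indexRel, snapshot, NULL, nkeys, 0,
 *                            true);
 *     index_rescan(scan, scankeys, nkeys, NULL, 0);
 *     while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
 *         ... fetch heap tuple for tid ...
 *     index_endscan(scan);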
@@ -179,7 +183,8 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys); + int nkeys, int norderbys, + bool enable_batching); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -205,7 +210,8 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, - ParallelIndexScanDesc pscan); + ParallelIndexScanDesc pscan, + bool enable_batching); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; @@ -213,7 +219,6 @@ extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, struct TupleTableSlot *slot); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); - extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, IndexBulkDeleteResult *istat, IndexBulkDeleteCallback callback, @@ -231,7 +236,7 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan, bool recheckOrderBy); extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, bool validate); - +extern IndexScanBatch index_batch_alloc(int maxitems, bool want_itup); /* * index access method support routines (in genam.c) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index ebca02588d3e..a00a1108ba51 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1001,6 +1001,38 @@ typedef struct BTScanPosData typedef BTScanPosData *BTScanPos; +/* + * Minimal AM-specific concept of "position" for batching. + */ +typedef struct BTBatchScanPosData +{ + Buffer buf; /* currPage buf (invalid means unpinned) */ + + /* page details as of the saved position's call to _bt_readpage */ + BlockNumber currPage; /* page referenced by items array */ + BlockNumber prevPage; /* currPage's left link */ + BlockNumber nextPage; /* currPage's right link */ + XLogRecPtr lsn; /* currPage's LSN */ + + /* scan direction for the saved position's call to _bt_readpage */ + ScanDirection dir; + + /* + * If we are doing an index-only scan, nextTupleOffset is the first free + * location in the associated tuple storage workspace. + */ + int nextTupleOffset; + + /* + * moreLeft and moreRight track whether we think there may be matching + * index entries to the left and right of the current page, respectively. 
+ */ + bool moreLeft; + bool moreRight; +} BTBatchScanPosData; + +typedef BTBatchScanPosData *BTBatchScanPos; + #define BTScanPosIsPinned(scanpos) \ ( \ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ @@ -1017,7 +1049,6 @@ typedef BTScanPosData *BTScanPos; if (BTScanPosIsPinned(scanpos)) \ BTScanPosUnpin(scanpos); \ } while (0) - #define BTScanPosIsValid(scanpos) \ ( \ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ @@ -1030,6 +1061,35 @@ typedef BTScanPosData *BTScanPos; (scanpos).currPage = InvalidBlockNumber; \ } while (0) +#define BTBatchScanPosIsPinned(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BufferIsValid((scanpos).buf) \ +) +#define BTBatchScanPosUnpin(scanpos) \ + do { \ + ReleaseBuffer((scanpos).buf); \ + (scanpos).buf = InvalidBuffer; \ + } while (0) +#define BTBatchScanPosUnpinIfPinned(scanpos) \ + do { \ + if (BTBatchScanPosIsPinned(scanpos)) \ + BTBatchScanPosUnpin(scanpos); \ + } while (0) +#define BTBatchScanPosIsValid(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BlockNumberIsValid((scanpos).currPage) \ +) +#define BTBatchScanPosInvalidate(scanpos) \ + do { \ + (scanpos).buf = InvalidBuffer; \ + (scanpos).currPage = InvalidBlockNumber; \ + } while (0) + + /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo { @@ -1191,6 +1251,8 @@ extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys); extern void btinitparallelscan(void *target); extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch btgetbatch(IndexScanDesc scan, ScanDirection dir); +extern void btfreebatch(IndexScanDesc scan, IndexScanBatch batch); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); @@ -1215,6 +1277,9 @@ extern StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily); */ extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first); +extern bool _bt_parallel_seize_batch(IndexScanDesc scan, BTBatchScanPos pos, + BlockNumber *next_scan_page, + BlockNumber *last_curr_page, bool first); extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page); @@ -1308,6 +1373,10 @@ extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); +extern IndexScanBatch _bt_first_batch(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch _bt_next_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir); +extern void _bt_kill_batch(IndexScanDesc scan, IndexScanBatch batch); + /* * prototypes for functions in nbtutils.c */ @@ -1326,6 +1395,7 @@ extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); extern void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate); extern void _bt_killitems(IndexScanDesc scan); +extern void _bt_killitems_batch(IndexScanDesc scan, IndexScanBatch batch); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); extern void _bt_end_vacuum(Relation rel); diff --git a/src/include/access/relscan.h 
b/src/include/access/relscan.h index b5e0fb386c0a..2bbd0db0223a 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,9 +16,11 @@ #include "access/htup_details.h" #include "access/itup.h" +#include "access/sdir.h" #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" +#include "storage/read_stream.h" #include "storage/relfilelocator.h" #include "storage/spin.h" #include "utils/relcache.h" @@ -121,10 +123,164 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + ReadStream *rs; } IndexFetchTableData; struct IndexScanInstrumentation; +/* Forward declaration, the prefetch callback needs IndexScanDescData. */ +typedef struct IndexScanBatchData IndexScanBatchData; + +/* + * XXX parts of BTScanOpaqueData, BTScanPosItem and BTScanPosData relevant + * for one batch. + */ +typedef struct IndexScanBatchPosItem /* what we remember about each match */ +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ + LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ +} IndexScanBatchPosItem; + +/* + * Data about one batch of items returned by the index AM. This is similar + * to the AM-specific "opaque" structs, used by each AM to track items + * loaded from one leaf page, but generalized for all AMs. + * + * XXX Not sure which of there fields are 100% needed for all index AMs, + * most of this comes from nbtree. + * + * XXX Mostly a copy of BTScanPosData, but other AMs may need different (or + * only some of those) fields. + */ +typedef struct IndexScanBatchData +{ + /* + * AM-specific concept of position within the index, and other stuff the + * AM might need to store for each batch. + * + * XXX maybe "position" is not the best name, it can have other stuff the + * AM needs to keep per-batch (even only for reading the leaf items, like + * nextTupleOffset). + */ + void *opaque; + + /* + * The items array is always ordered in index order (ie, increasing + * indexoffset). When scanning backwards it is convenient to fill the + * array back-to-front, so we start at the last slot and fill downwards. + * Hence we need both a first-valid-entry and a last-valid-entry counter. + * itemIndex is a cursor showing which entry was last returned to caller. + * + * XXX Do we need all these indexes, or would it be enough to have just + * 0-indexed array with only itemIndex? + */ + int firstItem; /* first valid index in items[] */ + int lastItem; /* last valid index in items[] */ + int itemIndex; /* current index in items[] */ + + /* info about killed items if any (killedItems is NULL if never used) */ + int *killedItems; /* indexes of killed items */ + int numKilled; /* number of currently stored items */ + + /* + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the currPos and markPos respectively. Each is of size + * BLCKSZ, so it can hold as much as a full page's worth of tuples. + * + * XXX maybe currTuples should be part of the am-specific per-batch state + * stored in "position" field? + */ + char *currTuples; /* tuple storage for currPos */ + IndexScanBatchPosItem *items; /* XXX don't size to MaxTIDsPerBTreePage */ + + /* + * batch contents (TIDs, index tuples, kill bitmap, ...) + * + * XXX Shouldn't this be part of the "IndexScanBatchPosItem" struct? To + * keep everything in one place? Or why should we have separate arrays? 
+ * One advantage is that we don't need to allocate memory for arrays that + * we don't need ... e.g. if we don't need heap tuples, we don't allocate + * that. We couldn't do that with everything in one struct. + */ + IndexTuple *itups; /* IndexTuples, if requested */ + HeapTuple *htups; /* HeapTuples, if requested */ + bool *recheck; /* recheck flags */ + + /* XXX why do we need this on top of "opaque" pointer? */ + Datum *privateData; /* private data for batch */ + + /* xs_orderbyvals / xs_orderbynulls */ + Datum *orderbyvals; + bool *orderbynulls; + +} IndexScanBatchData; + +/* + * Position in the queue of batches - index of a batch, index of item in a batch. + */ +typedef struct IndexScanBatchPos +{ + int batch; + int index; +} IndexScanBatchPos; + +typedef struct IndexScanDescData IndexScanDescData; +typedef bool (*IndexPrefetchCallback) (IndexScanDescData * scan, void *arg, IndexScanBatchPos *pos); + +/* + * Queue + */ +typedef struct IndexScanBatches +{ + /* + * Did we read the last batch? The batches may be loaded from multiple + * places, and we need to remember when we fail to load the next batch in + * a given scan (which means "no more batches"). amgetbatch may restart + * the scan on the get call, so we need to remember it's over. + */ + bool finished; + bool reset; + + BlockNumber lastBlock; + + /* + * Current scan direction, for the currently loaded batches. This is used + * to load data in the read stream API callback, etc. + * + * XXX May need some work to use already loaded batches after change of + * direction, instead of just throwing everything away. May need to reset + * the stream but keep the batches? + */ + ScanDirection direction; + + /* positions in the queue of batches (batch + item) */ + IndexScanBatchPos readPos; /* read position */ + IndexScanBatchPos streamPos; /* prefetch position (for read stream API) */ + IndexScanBatchPos markPos; /* mark/restore position */ + + IndexScanBatchData *markBatch; + // IndexScanBatchData *currentBatch; + + /* + * Array of batches returned by the AM. The array has a capacity (but can + * be resized if needed). The firstBatch is an index of the first batch, + * but needs to be translated by (modulo maxBatches) into index in the + * batches array. + * + * FIXME Maybe these fields should be uint32, or something like that? + */ + int maxBatches; /* size of the batches array */ + int firstBatch; /* first used batch slot */ + int nextBatch; /* next empty batch slot */ + + IndexScanBatchData **batches; + + /* callback to skip prefetching in IOS etc. */ + IndexPrefetchCallback prefetchCallback; + void *prefetchArgument; +} IndexScanBatches; + /* * We use the same IndexScanDescData structure for both amgettuple-based * and amgetbitmap-based index scans. Some fields are only relevant in @@ -176,6 +332,12 @@ typedef struct IndexScanDescData bool xs_recheck; /* T means scan keys must be rechecked */ + /* + * Batches index scan keep a list of batches loaded from the index in a + * circular buffer. + */ + IndexScanBatches *xs_batches; + /* * When fetching with an ordering operator, the values of the ORDER BY * expressions of the last returned tuple, according to the index. If diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8713e12cbfb9..5bed359cf135 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -413,8 +413,14 @@ typedef struct TableAmRoutine * structure with additional information. * * Tuples for an index scan can then be fetched via index_fetch_tuple. 
+	 *
+	 * The ReadStream pointer is optional - NULL means the regular buffer
+	 * reads are used. If a valid ReadStream is provided, the callback
+	 * (generating the blocks to read) and index_fetch_tuple (consuming the
+	 * buffers) need to agree on the exact order.
 	 */
-	struct IndexFetchTableData *(*index_fetch_begin) (Relation rel);
+	struct IndexFetchTableData *(*index_fetch_begin) (Relation rel,
+													   ReadStream *rs);
 
 	/*
 	 * Reset index fetch. Typically this will release cross index fetch
@@ -1149,9 +1155,9 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
  * Tuples for an index scan can then be fetched via table_index_fetch_tuple().
  */
 static inline IndexFetchTableData *
-table_index_fetch_begin(Relation rel)
+table_index_fetch_begin(Relation rel, ReadStream *rs)
 {
-	return rel->rd_tableam->index_fetch_begin(rel);
+	return rel->rd_tableam->index_fetch_begin(rel, rs);
 }
 
 /*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 5b6cadb5a6c1..ef672e203d0e 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1697,6 +1697,7 @@ typedef struct
  *		OrderByTypByVals   is the datatype of order by expression pass-by-value?
  *		OrderByTypLens	   typlens of the datatypes of order by expressions
  *		PscanLen		   size of parallel index scan descriptor
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexScanState
@@ -1726,6 +1727,10 @@ typedef struct IndexScanState
 	bool	   *iss_OrderByTypByVals;
 	int16	   *iss_OrderByTypLens;
 	Size		iss_PscanLen;
+
+	/* batching/prefetching enabled? */
+	bool		iss_CanBatch;
+
 } IndexScanState;
 
 /* ----------------
@@ -1749,6 +1754,7 @@ typedef struct IndexScanState
  *		PscanLen		   size of parallel index-only scan descriptor
  *		NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN
  *		NameCStringCount   number of elements in the NameCStringAttNums array
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexOnlyScanState
@@ -1772,6 +1778,7 @@ typedef struct IndexOnlyScanState
 	Size		ioss_PscanLen;
 	AttrNumber *ioss_NameCStringAttNums;
 	int			ioss_NameCStringCount;
+	bool		ioss_CanBatch;
 } IndexOnlyScanState;
 
 /* ----------------
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 41fdc1e76938..3b7d4e6a6a28 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -237,6 +237,8 @@ extern void IncrBufferRefCount(Buffer buffer);
 extern void CheckBufferIsPinnedOnce(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 								   BlockNumber blockNum);
+extern bool BufferMatches(Buffer buffer, Relation relation,
+						  BlockNumber blockNum);
 
 extern Buffer ExtendBufferedRel(BufferManagerRelation bmr,
 								ForkNumber forkNum,
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index ae17d028ed3b..220b61fad2dc 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -158,6 +158,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexscan_batching      | on
  enable_material                | on
  enable_memoize                 | on
  enable_mergejoin               | on
@@ -172,7 +173,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(24 rows)
+(25 rows)
 
 -- There are always wait event descriptions for various types. InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e5879e00dffe..1e5548aacb93 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -195,6 +195,8 @@ BOOL
 BOOLEAN
 BOX
 BTArrayKeyInfo
+BTBatchInfo
+BTBatchScanPosData
 BTBuildState
 BTCallbackState
 BTCycleId
@@ -1260,6 +1262,10 @@ IndexOrderByDistance
 IndexPath
 IndexRuntimeKeyInfo
 IndexScan
+IndexScanBatchData
+IndexScanBatchPos
+IndexScanBatchPosItem
+IndexScanBatches
 IndexScanDesc
 IndexScanInstrumentation
 IndexScanState
@@ -3396,6 +3402,7 @@ amendscan_function
 amestimateparallelscan_function
 amgetbitmap_function
 amgettuple_function
+amgetbatch_function
 aminitparallelscan_function
 aminsert_function
 aminsertcleanup_function
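
Note on the batch queue in relscan.h: the IndexScanBatches comment describes a circular buffer where firstBatch and nextBatch keep growing and only the remainder modulo maxBatches picks a slot in the batches array. A minimal sketch of that translation follows; the helper name is illustrative and not part of the patch.

#include "postgres.h"

#include "access/relscan.h"

/*
 * Illustrative only (not part of the patch): map a logical batch number,
 * such as IndexScanBatches.firstBatch or IndexScanBatchPos.batch, to the
 * slot it occupies in the fixed-size circular "batches" array.
 */
static inline IndexScanBatchData *
index_scan_batch_slot(IndexScanBatches *queue, int batchNumber)
{
	/* only batches in [firstBatch, nextBatch) are currently loaded */
	Assert(batchNumber >= queue->firstBatch);
	Assert(batchNumber < queue->nextBatch);

	return queue->batches[batchNumber % queue->maxBatches];
}

Keeping the logical batch numbers monotonic means readPos, streamPos and markPos can be compared with plain integer comparisons; only the final array access wraps around.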
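
Note on the new ReadStream argument: index_fetch_begin / table_index_fetch_begin now take an optional ReadStream, with NULL preserving the old behavior. The sketch below shows how a caller might wire up either mode. read_stream_begin_relation and the callback signature come from storage/read_stream.h; heap_block_for_next_queued_tid and index_batch_next_queued_tid are hypothetical helpers standing in for the batch-queue logic, not functions defined by the patch.

#include "postgres.h"

#include "access/genam.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "storage/read_stream.h"

/* assumed helper: pop the next queued TID, or NULL when the queue is empty */
extern ItemPointer index_batch_next_queued_tid(IndexScanDesc scan);

/*
 * Hypothetical read stream callback: return the heap block of the next TID
 * queued in the scan's batch queue, or InvalidBlockNumber when exhausted.
 */
static BlockNumber
heap_block_for_next_queued_tid(ReadStream *stream,
							   void *callback_private_data,
							   void *per_buffer_data)
{
	IndexScanDesc scan = (IndexScanDesc) callback_private_data;
	ItemPointer tid = index_batch_next_queued_tid(scan);

	return tid ? ItemPointerGetBlockNumber(tid) : InvalidBlockNumber;
}

/*
 * Begin the heap fetch with or without a read stream. Passing NULL keeps
 * the plain ReleaseAndReadBuffer path.
 */
static IndexFetchTableData *
begin_heap_fetch(Relation heapRel, IndexScanDesc scan, bool use_batching)
{
	ReadStream *rs = NULL;

	if (use_batching)
		rs = read_stream_begin_relation(READ_STREAM_DEFAULT,
										NULL,	/* no buffer strategy */
										heapRel,
										MAIN_FORKNUM,
										heap_block_for_next_queued_tid,
										scan,	/* callback private data */
										0);		/* no per-buffer data */

	return table_index_fetch_begin(heapRel, rs);
}

The important invariant, as the tableam.h comment says, is that the callback and index_fetch_tuple agree on the exact block order; the sketch keeps both driven by the same queue to make that plausible.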
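
Note on prefetchCallback / prefetchArgument: the struct only says they exist to "skip prefetching in IOS etc.". One plausible shape for such a callback is sketched below, assuming that a true return value means "read this heap block" and assuming a helper that resolves a batch position to its heap TID; neither assumption is spelled out in this part of the patch.

#include "postgres.h"

#include "access/relscan.h"
#include "access/visibilitymap.h"
#include "nodes/execnodes.h"

/* assumed helper: TID stored at the given position in the batch queue */
extern ItemPointer index_batch_position_tid(IndexScanDescData *scan,
											IndexScanBatchPos *pos);

/*
 * Possible prefetch filter for index-only scans: skip scheduling a heap
 * read for TIDs whose heap page is all-visible, since those rows can be
 * answered from the index and the visibility map alone.
 */
static bool
ios_prefetch_block(IndexScanDescData *scan, void *arg, IndexScanBatchPos *pos)
{
	IndexOnlyScanState *node = (IndexOnlyScanState *) arg;
	ItemPointer tid = index_batch_position_tid(scan, pos);

	return !VM_ALL_VISIBLE(scan->heapRelation,
						   ItemPointerGetBlockNumber(tid),
						   &node->ioss_VMBuffer);
}

How the callback gets registered (presumably the index-only scan node filling prefetchCallback and prefetchArgument when ioss_CanBatch is set) is outside the portion of the diff shown here.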