diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index c0a8833e0683..be29c4e70e28 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -32,6 +32,7 @@ #include "storage/lmgr.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" +#include "utils/injection_point.h" #include "utils/memutils.h" @@ -86,7 +87,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc; static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid); -static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno); +static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf); static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, @@ -991,8 +992,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; - BlockNumber scanblkno; bool needLock; + BlockRangeReadStreamPrivate p; + ReadStream *stream = NULL; /* * Reset fields that track information about the entire index now. This @@ -1061,9 +1063,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, */ needLock = !RELATION_IS_LOCAL(rel); - scanblkno = BTREE_METAPAGE + 1; + p.current_blocknum = BTREE_METAPAGE + 1; + stream = read_stream_begin_relation(READ_STREAM_FULL, + info->strategy, + rel, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); for (;;) { + Buffer buf; + /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); @@ -1076,18 +1087,44 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, num_pages); /* Quit if we've scanned the whole relation */ - if (scanblkno >= num_pages) + if (p.current_blocknum >= num_pages) break; - /* Iterate over pages, then loop back to recheck length */ - for (; scanblkno < num_pages; scanblkno++) + + + p.last_exclusive = num_pages; + + /* Iterate over pages, then loop back to recheck relation length */ + while (true) { - btvacuumpage(&vstate, scanblkno); + BlockNumber current_block; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(false); + + buf = read_stream_next_buffer(stream, NULL); + + if (!BufferIsValid(buf)) + break; + + current_block = btvacuumpage(&vstate, buf); + if (info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, - scanblkno); + current_block); } + + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + + /* + * We have to reset the read stream to use it again. After returning + * InvalidBuffer, the read stream API won't invoke our callback again + * until the stream has been reset. + */ + read_stream_reset(stream); } + read_stream_end(stream); + /* Set statistics num_pages field to final size of index */ stats->num_pages = num_pages; @@ -1111,14 +1148,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we must - * backtrack to re-examine and VACUUM pages that were the scanblkno during + * backtrack to re-examine and VACUUM pages that were on buf's page during * a previous call here. This is how we handle page splits (that happened * after our cycleid was acquired) whose right half page happened to reuse * a block that we might have processed at some point before it was * recycled (i.e. before the page split). + * + * Returns BlockNumber of a scanned page (not backtracked). */ -static void -btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) +static BlockNumber +btvacuumpage(BTVacState *vstate, Buffer buf) { IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; @@ -1129,7 +1168,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) bool attempt_pagedel; BlockNumber blkno, backtrack_to; - Buffer buf; + BlockNumber scanblkno = BufferGetBlockNumber(buf); Page page; BTPageOpaque opaque; @@ -1140,17 +1179,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) attempt_pagedel = false; backtrack_to = P_NONE; - /* call vacuum_delay_point while not holding any buffer lock */ - vacuum_delay_point(false); - - /* - * We can't use _bt_getbuf() here because it always applies - * _bt_checkpage(), which will barf on an all-zero page. We want to - * recycle all-zero pages, not fail. Also, we want to use a nondefault - * buffer access strategy. - */ - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, - info->strategy); + INJECTION_POINT("nbtree-vacuum-page"); _bt_lockbuf(rel, buf, BT_READ); page = BufferGetPage(buf); opaque = NULL; @@ -1186,7 +1215,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"", blkno, scanblkno, RelationGetRelationName(rel)))); _bt_relbuf(rel, buf); - return; + return scanblkno; } /* @@ -1206,7 +1235,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) { /* Done with current scanblkno (and all lower split pages) */ _bt_relbuf(rel, buf); - return; + return scanblkno; } } @@ -1437,8 +1466,24 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) if (backtrack_to != P_NONE) { blkno = backtrack_to; + + /* check for vacuum delay while not holding any buffer lock */ + vacuum_delay_point(false); + + INJECTION_POINT("nbtree-vacuum-page-backtrack"); + + /* + * We can't use _bt_getbuf() here because it always applies + * _bt_checkpage(), which will barf on an all-zero page. We want to + * recycle all-zero pages, not fail. Also, we want to use a + * nondefault buffer access strategy. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); goto backtrack; } + + return scanblkno; } /* diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build index 9c50de7efb0f..ed0822f19eee 100644 --- a/src/test/modules/test_misc/meson.build +++ b/src/test/modules/test_misc/meson.build @@ -16,6 +16,7 @@ tests += { 't/005_timeouts.pl', 't/006_signal_autovacuum.pl', 't/007_catcache_inval.pl', + 't/008_vacuum_btree.pl', ], }, } diff --git a/src/test/modules/test_misc/t/008_vacuum_btree.pl b/src/test/modules/test_misc/t/008_vacuum_btree.pl new file mode 100644 index 000000000000..d393e1809ce3 --- /dev/null +++ b/src/test/modules/test_misc/t/008_vacuum_btree.pl @@ -0,0 +1,96 @@ +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test btree vacuum + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use Test::More; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node = PostgreSQL::Test::Cluster->new('node'); +$node->init; + +# Turn autovacuum off for the cluster. This could be done at the table level +# instead. However, since this file exercises vacuum, turn off autovacuum +# globally. This also allows use of non-local injection points in vacuum code. +$node->append_conf('postgresql.conf', 'autovacuum = off'); +$node->start; + +# Check if extension injection_points is available +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} + +$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); + +my $psql_session = $node->background_psql('postgres'); + +# Create a table with an index filled with dead index entries +$node->safe_psql('postgres', q[ + create table test_backtrack (col1 int); + insert into test_backtrack select generate_series(1,800); + create index on test_backtrack(col1); + delete from test_backtrack; + ] +); + +# Attach to two injection points. The first one will allow us to stop between +# vacuuming each index page. The second is our validation that we did backtrack. +$psql_session->query_safe( + qq[ + SELECT injection_points_set_local(); + SELECT injection_points_attach('nbtree-vacuum-page', 'wait'); + SELECT injection_points_attach('nbtree-vacuum-page-backtrack', 'wait'); +]); + +# Start a vacuum of the table and index. +$psql_session->query_until( + qr/starting_bg_psql/, + q(\echo starting_bg_psql + vacuum (index_cleanup on, parallel 0) test_backtrack; + )); + +# The index vacuum should be waiting on our first injection point and is yet to +# process any pages. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); + +# Wake up vacuum so it can process the first index leaf page. +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page');"); + +# The first index leaf page is now vacuumed and vacuum should be waiting again +# on the first injection point. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); + +# Insert data while vacuum is waiting to process the next leaf page. The +# inserted data will force a page split in which some tuples from unprocessed +# leaf pages will be moved to the first already vacuumed leaf page. +$node->safe_psql('postgres', + "insert into test_backtrack select generate_series(1,300);"); + +# Now we want the vacuum to continue. We don't want to wait on our first break +# point again. +# We need to make sure we are waiting before detaching and issuing a wakeup. +# Otherwise there could be a race and the backend may not get woken up. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); +$node->safe_psql('postgres', "SELECT injection_points_detach('nbtree-vacuum-page');"); +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page');"); + +# Wait on our second break point. Vacuum should have been forced to backtrack +# and vacuum the first leaf page again to ensure it removed all dead index +# entries. +$node->wait_for_event('client backend', 'nbtree-vacuum-page-backtrack'); + +# Once we wait on our second break point, we're done. Time to tell the backend +# to detach and wake it up. +$node->safe_psql('postgres', "SELECT injection_points_detach('nbtree-vacuum-page-backtrack');"); +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page-backtrack');"); + +ok($psql_session->quit); + +done_testing();