From fcfa5dbf98114e8a6b2677b74d775d3e0927e29b Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 20 Mar 2025 16:09:11 -0400 Subject: [PATCH 1/2] Use streaming read I/O in btree vacuuming Btree vacuum processes all index pages in physical order. Now it uses the read stream API to get the next buffer instead of explicitly invoking ReadBuffer(). It is possible for concurrent indexes to cause page splits during index vacuuming. This can lead to index entries that have yet to be vacuumed being moved to pages that have already been vacuumed. Btree vacuum code handles this by backtracking to reprocess those pages. So, while normal, sequentially encountered pages are now read through the read stream API, backtracked pages are still read with explicit ReadBuffer() calls. Author: Andrey Borodin Reviewed-by: Melanie Plageman Reviewed-by: Junwang Zhao Reviewed-by: Kirill Reshke Discussion: https://postgr.es/m/flat/CAAKRu_bW1UOyup%3DjdFw%2BkOF9bCaAm%3D9UpiyZtbPMn8n_vnP%2Big%40mail.gmail.com#3b3a84132fc683b3ee5b40bc4c2ea2a5 --- src/backend/access/nbtree/nbtree.c | 91 ++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index c0a8833e0683..353600414a2d 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -86,7 +86,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc; static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid); -static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno); +static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf); static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, @@ -991,8 +991,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; - BlockNumber scanblkno; bool needLock; + BlockRangeReadStreamPrivate p; + ReadStream *stream = NULL; /* * Reset fields that track information about the entire index now. This @@ -1061,9 +1062,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, */ needLock = !RELATION_IS_LOCAL(rel); - scanblkno = BTREE_METAPAGE + 1; + p.current_blocknum = BTREE_METAPAGE + 1; + stream = read_stream_begin_relation(READ_STREAM_FULL, + info->strategy, + rel, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); for (;;) { + Buffer buf; + /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); @@ -1076,18 +1086,44 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, num_pages); /* Quit if we've scanned the whole relation */ - if (scanblkno >= num_pages) + if (p.current_blocknum >= num_pages) break; - /* Iterate over pages, then loop back to recheck length */ - for (; scanblkno < num_pages; scanblkno++) + + + p.last_exclusive = num_pages; + + /* Iterate over pages, then loop back to recheck relation length */ + while (true) { - btvacuumpage(&vstate, scanblkno); + BlockNumber current_block; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(false); + + buf = read_stream_next_buffer(stream, NULL); + + if (!BufferIsValid(buf)) + break; + + current_block = btvacuumpage(&vstate, buf); + if (info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, - scanblkno); + current_block); } + + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + + /* + * We have to reset the read stream to use it again. After returning + * InvalidBuffer, the read stream API won't invoke our callback again + * until the stream has been reset. + */ + read_stream_reset(stream); } + read_stream_end(stream); + /* Set statistics num_pages field to final size of index */ stats->num_pages = num_pages; @@ -1111,14 +1147,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we must - * backtrack to re-examine and VACUUM pages that were the scanblkno during + * backtrack to re-examine and VACUUM pages that were on buf's page during * a previous call here. This is how we handle page splits (that happened * after our cycleid was acquired) whose right half page happened to reuse * a block that we might have processed at some point before it was * recycled (i.e. before the page split). + * + * Returns BlockNumber of a scanned page (not backtracked). */ -static void -btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) +static BlockNumber +btvacuumpage(BTVacState *vstate, Buffer buf) { IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; @@ -1129,7 +1167,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) bool attempt_pagedel; BlockNumber blkno, backtrack_to; - Buffer buf; + BlockNumber scanblkno = BufferGetBlockNumber(buf); Page page; BTPageOpaque opaque; @@ -1140,17 +1178,6 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) attempt_pagedel = false; backtrack_to = P_NONE; - /* call vacuum_delay_point while not holding any buffer lock */ - vacuum_delay_point(false); - - /* - * We can't use _bt_getbuf() here because it always applies - * _bt_checkpage(), which will barf on an all-zero page. We want to - * recycle all-zero pages, not fail. Also, we want to use a nondefault - * buffer access strategy. - */ - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, - info->strategy); _bt_lockbuf(rel, buf, BT_READ); page = BufferGetPage(buf); opaque = NULL; @@ -1186,7 +1213,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"", blkno, scanblkno, RelationGetRelationName(rel)))); _bt_relbuf(rel, buf); - return; + return scanblkno; } /* @@ -1206,7 +1233,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) { /* Done with current scanblkno (and all lower split pages) */ _bt_relbuf(rel, buf); - return; + return scanblkno; } } @@ -1437,8 +1464,22 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) if (backtrack_to != P_NONE) { blkno = backtrack_to; + + /* check for vacuum delay while not holding any buffer lock */ + vacuum_delay_point(false); + + /* + * We can't use _bt_getbuf() here because it always applies + * _bt_checkpage(), which will barf on an all-zero page. We want to + * recycle all-zero pages, not fail. Also, we want to use a + * nondefault buffer access strategy. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); goto backtrack; } + + return scanblkno; } /* From 4e1bb8e4a49b27d6aa76db8fbc03bb51d5e49d1b Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 20 Mar 2025 16:09:41 -0400 Subject: [PATCH 2/2] test backtrack --- src/backend/access/nbtree/nbtree.c | 4 + src/test/modules/test_misc/meson.build | 1 + .../modules/test_misc/t/008_vacuum_btree.pl | 96 +++++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 src/test/modules/test_misc/t/008_vacuum_btree.pl diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 353600414a2d..be29c4e70e28 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -32,6 +32,7 @@ #include "storage/lmgr.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" +#include "utils/injection_point.h" #include "utils/memutils.h" @@ -1178,6 +1179,7 @@ btvacuumpage(BTVacState *vstate, Buffer buf) attempt_pagedel = false; backtrack_to = P_NONE; + INJECTION_POINT("nbtree-vacuum-page"); _bt_lockbuf(rel, buf, BT_READ); page = BufferGetPage(buf); opaque = NULL; @@ -1468,6 +1470,8 @@ btvacuumpage(BTVacState *vstate, Buffer buf) /* check for vacuum delay while not holding any buffer lock */ vacuum_delay_point(false); + INJECTION_POINT("nbtree-vacuum-page-backtrack"); + /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build index 9c50de7efb0f..ed0822f19eee 100644 --- a/src/test/modules/test_misc/meson.build +++ b/src/test/modules/test_misc/meson.build @@ -16,6 +16,7 @@ tests += { 't/005_timeouts.pl', 't/006_signal_autovacuum.pl', 't/007_catcache_inval.pl', + 't/008_vacuum_btree.pl', ], }, } diff --git a/src/test/modules/test_misc/t/008_vacuum_btree.pl b/src/test/modules/test_misc/t/008_vacuum_btree.pl new file mode 100644 index 000000000000..d393e1809ce3 --- /dev/null +++ b/src/test/modules/test_misc/t/008_vacuum_btree.pl @@ -0,0 +1,96 @@ +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test btree vacuum + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use Test::More; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my $node = PostgreSQL::Test::Cluster->new('node'); +$node->init; + +# Turn autovacuum off for the cluster. This could be done at the table level +# instead. However, since this file exercises vacuum, turn off autovacuum +# globally. This also allows use of non-local injection points in vacuum code. +$node->append_conf('postgresql.conf', 'autovacuum = off'); +$node->start; + +# Check if extension injection_points is available +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} + +$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); + +my $psql_session = $node->background_psql('postgres'); + +# Create a table with an index filled with dead index entries +$node->safe_psql('postgres', q[ + create table test_backtrack (col1 int); + insert into test_backtrack select generate_series(1,800); + create index on test_backtrack(col1); + delete from test_backtrack; + ] +); + +# Attach to two injection points. The first one will allow us to stop between +# vacuuming each index page. The second is our validation that we did backtrack. +$psql_session->query_safe( + qq[ + SELECT injection_points_set_local(); + SELECT injection_points_attach('nbtree-vacuum-page', 'wait'); + SELECT injection_points_attach('nbtree-vacuum-page-backtrack', 'wait'); +]); + +# Start a vacuum of the table and index. +$psql_session->query_until( + qr/starting_bg_psql/, + q(\echo starting_bg_psql + vacuum (index_cleanup on, parallel 0) test_backtrack; + )); + +# The index vacuum should be waiting on our first injection point and is yet to +# process any pages. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); + +# Wake up vacuum so it can process the first index leaf page. +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page');"); + +# The first index leaf page is now vacuumed and vacuum should be waiting again +# on the first injection point. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); + +# Insert data while vacuum is waiting to process the next leaf page. The +# inserted data will force a page split in which some tuples from unprocessed +# leaf pages will be moved to the first already vacuumed leaf page. +$node->safe_psql('postgres', + "insert into test_backtrack select generate_series(1,300);"); + +# Now we want the vacuum to continue. We don't want to wait on our first break +# point again. +# We need to make sure we are waiting before detaching and issuing a wakeup. +# Otherwise there could be a race and the backend may not get woken up. +$node->wait_for_event('client backend', 'nbtree-vacuum-page'); +$node->safe_psql('postgres', "SELECT injection_points_detach('nbtree-vacuum-page');"); +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page');"); + +# Wait on our second break point. Vacuum should have been forced to backtrack +# and vacuum the first leaf page again to ensure it removed all dead index +# entries. +$node->wait_for_event('client backend', 'nbtree-vacuum-page-backtrack'); + +# Once we wait on our second break point, we're done. Time to tell the backend +# to detach and wake it up. +$node->safe_psql('postgres', "SELECT injection_points_detach('nbtree-vacuum-page-backtrack');"); +$node->safe_psql('postgres', "SELECT injection_points_wakeup('nbtree-vacuum-page-backtrack');"); + +ok($psql_session->quit); + +done_testing();