From da5ffbcba8c0cdff65360472d0c6f81b4f35f752 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 3 Apr 2025 12:47:19 -0400 Subject: [PATCH 1/4] Fix autoprewarm neglect of tablespaces While prewarming blocks from a dump file, autoprewarm_database_main() mistakenly ignored tablespace when detecting the beginning of the next relation to prewarm. Because RelFileNumbers are only unique within a tablespace, autoprewarm could miss prewarming blocks from a relation with the same RelFileNumber in a different tablespace. Though this situation is likely rare in practice, it's best to make the code correct. Do so by explicitly checking for the RelFileNumber when detecting a new relation. Reported-by: Heikki Linnakangas Discussion: https://postgr.es/m/97c36982-603b-494a-95f4-aaf2a12ac27e%40iki.fi --- contrib/pg_prewarm/autoprewarm.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 73485a2323cf..760b1548eff9 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -472,10 +472,15 @@ autoprewarm_database_main(Datum main_arg) /* * As soon as we encounter a block of a new relation, close the old - * relation. Note that rel will be NULL if try_relation_open failed - * previously; in that case, there is nothing to close. + * relation. RelFileNumbers are only guaranteed to be unique within a + * tablespace, so check that too. + * + * Note that rel will be NULL if try_relation_open failed previously; + * in that case, there is nothing to close. */ - if (old_blk != NULL && old_blk->filenumber != blk->filenumber && + if (old_blk != NULL && + (old_blk->tablespace != blk->tablespace || + old_blk->filenumber != blk->filenumber) && rel != NULL) { relation_close(rel, AccessShareLock); @@ -487,7 +492,9 @@ autoprewarm_database_main(Datum main_arg) * Try to open each new relation, but only once, when we first * encounter it. If it's been dropped, skip the associated blocks. */ - if (old_blk == NULL || old_blk->filenumber != blk->filenumber) + if (old_blk == NULL || + old_blk->tablespace != blk->tablespace || + old_blk->filenumber != blk->filenumber) { Oid reloid; @@ -508,6 +515,7 @@ autoprewarm_database_main(Datum main_arg) /* Once per fork, check for fork existence and size. */ if (old_blk == NULL || + old_blk->tablespace != blk->tablespace || old_blk->filenumber != blk->filenumber || old_blk->forknum != blk->forknum) { From a6c7f219b838e3c6ddc14d9dbcab385c3ff463a9 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 3 Apr 2025 14:54:09 -0400 Subject: [PATCH 2/4] Remove superfluous autoprewarm check autoprewarm_database_main() prewarms blocks from the same database. It is passed an array of sorted BlockInfoRecords and a start and stop index into the array. The range represented should include only blocks belonging to global objects or blocks from a single database. Remove an unnecessary check that the current block is from the same database and add an assert to ensure this invariant remains. --- contrib/pg_prewarm/autoprewarm.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 760b1548eff9..5f6dca57cdda 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -463,12 +463,10 @@ autoprewarm_database_main(Datum main_arg) CHECK_FOR_INTERRUPTS(); /* - * Quit if we've reached records for another database. If previous - * blocks are of some global objects, then continue pre-warming. + * All blocks between prewarm_start_idx and prewarm_stop_idx should + * belong either to global objects or the same database. */ - if (old_blk != NULL && old_blk->database != blk->database && - old_blk->database != 0) - break; + Assert(blk->database == apw_state->database || blk->database == 0); /* * As soon as we encounter a block of a new relation, close the old From 98d3496134d7837a11ae400f1f534ae8c29714b3 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 31 Mar 2025 22:02:25 -0400 Subject: [PATCH 3/4] Refactor autoprewarm_database_main() in preparation for read stream Autoprewarm prewarms blocks from a dump file representing the contents of shared buffers at the time it was dumped. It uses a sorted array of BlockInfoRecords, each representing a block from one of the cluster's databases and tables. autoprewarm_database_main() prewarms all the blocks from a single database. It is optimized to ensure we don't try to open the same relation or fork over and over again if it has been dropped or is invalid. The main loop handled this by carefully setting various local variables to sentinel values when a run of blocks should be skipped. This method won't work with the read stream API. A read stream can only be created for a single relation and fork combination. The callback has to be able to advance the position in the array to allow for reading ahead additional blocks, however the callback cannot try to open another relation or close the current relation. So, the main loop in autoprewarm_database_main() must also advance the position in the array of BlockInfoRecords. To make it compatible with the read stream API, change autoprewarm_database_main() to explicitly fast-forward in the array past the blocks belonging to an invalid relation or fork. This commit only implements the new control flow -- it does not use the read stream API. Co-authored-by: Nazir Bilal Yavuz Co-authored-by: Melanie Plageman Reviewed-by: Heikki Linnakangas Reviewed-by: Daniel Gustafsson Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com --- contrib/pg_prewarm/autoprewarm.c | 172 +++++++++++++++++-------------- 1 file changed, 94 insertions(+), 78 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 5f6dca57cdda..761f6a77926b 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -429,11 +429,9 @@ apw_load_buffers(void) void autoprewarm_database_main(Datum main_arg) { - int pos; BlockInfoRecord *block_info; - Relation rel = NULL; - BlockNumber nblocks = 0; - BlockInfoRecord *old_blk = NULL; + int i; + BlockInfoRecord blk; dsm_segment *seg; /* Establish signal handlers; once that's done, unblock signals. */ @@ -449,16 +447,20 @@ autoprewarm_database_main(Datum main_arg) errmsg("could not map dynamic shared memory segment"))); BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0); block_info = (BlockInfoRecord *) dsm_segment_address(seg); - pos = apw_state->prewarm_start_idx; + + i = apw_state->prewarm_start_idx; + blk = block_info[i]; /* * Loop until we run out of blocks to prewarm or until we run out of free * buffers. */ - while (pos < apw_state->prewarm_stop_idx && have_free_buffer()) + while (i < apw_state->prewarm_stop_idx && have_free_buffer()) { - BlockInfoRecord *blk = &block_info[pos++]; - Buffer buf; + Oid tablespace = blk.tablespace; + RelFileNumber filenumber = blk.filenumber; + Oid reloid; + Relation rel; CHECK_FOR_INTERRUPTS(); @@ -466,97 +468,111 @@ autoprewarm_database_main(Datum main_arg) * All blocks between prewarm_start_idx and prewarm_stop_idx should * belong either to global objects or the same database. */ - Assert(blk->database == apw_state->database || blk->database == 0); + Assert(blk.database == apw_state->database || blk.database == 0); - /* - * As soon as we encounter a block of a new relation, close the old - * relation. RelFileNumbers are only guaranteed to be unique within a - * tablespace, so check that too. - * - * Note that rel will be NULL if try_relation_open failed previously; - * in that case, there is nothing to close. - */ - if (old_blk != NULL && - (old_blk->tablespace != blk->tablespace || - old_blk->filenumber != blk->filenumber) && - rel != NULL) - { - relation_close(rel, AccessShareLock); - rel = NULL; - CommitTransactionCommand(); - } + StartTransactionCommand(); - /* - * Try to open each new relation, but only once, when we first - * encounter it. If it's been dropped, skip the associated blocks. - */ - if (old_blk == NULL || - old_blk->tablespace != blk->tablespace || - old_blk->filenumber != blk->filenumber) + reloid = RelidByRelfilenumber(blk.tablespace, blk.filenumber); + if (!OidIsValid(reloid) || + (rel = try_relation_open(reloid, AccessShareLock)) == NULL) { - Oid reloid; + /* We failed to open the relation, so there is nothing to close. */ + CommitTransactionCommand(); - Assert(rel == NULL); - StartTransactionCommand(); - reloid = RelidByRelfilenumber(blk->tablespace, blk->filenumber); - if (OidIsValid(reloid)) - rel = try_relation_open(reloid, AccessShareLock); + /* + * Fast-forward to the next relation. We want to skip all of the + * other records referencing this relation since we know we can't + * open it. That way, we avoid repeatedly trying and failing to + * open the same relation. + */ + for (; i < apw_state->prewarm_stop_idx; i++) + { + blk = block_info[i]; + if (blk.tablespace != tablespace || + blk.filenumber != filenumber) + break; + } - if (!rel) - CommitTransactionCommand(); - } - if (!rel) - { - old_blk = blk; + /* Time to try and open our newfound relation */ continue; } - /* Once per fork, check for fork existence and size. */ - if (old_blk == NULL || - old_blk->tablespace != blk->tablespace || - old_blk->filenumber != blk->filenumber || - old_blk->forknum != blk->forknum) + /* + * We have a relation; now let's loop until we find a valid fork of + * the relation or we run out of free buffers. Once we've read from + * all valid forks or run out of options, we'll close the relation and + * move on. + */ + while (i < apw_state->prewarm_stop_idx && + blk.tablespace == tablespace && + blk.filenumber == filenumber && + have_free_buffer()) { + ForkNumber forknum = blk.forknum; + BlockNumber nblocks; + Buffer buf; + /* * smgrexists is not safe for illegal forknum, hence check whether * the passed forknum is valid before using it in smgrexists. */ - if (blk->forknum > InvalidForkNumber && - blk->forknum <= MAX_FORKNUM && - smgrexists(RelationGetSmgr(rel), blk->forknum)) - nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum); - else - nblocks = 0; - } + if (blk.forknum <= InvalidForkNumber || + blk.forknum > MAX_FORKNUM || + !smgrexists(RelationGetSmgr(rel), blk.forknum)) + { + /* + * Fast-forward to the next fork. We want to skip all of the + * other records referencing this fork since we already know + * it's not valid. + */ + for (; i < apw_state->prewarm_stop_idx; i++) + { + blk = block_info[i]; + if (blk.tablespace != tablespace || + blk.filenumber != filenumber || + blk.forknum != forknum) + break; + } + + /* Time to check if this newfound fork is valid */ + continue; + } - /* Check whether blocknum is valid and within fork file size. */ - if (blk->blocknum >= nblocks) - { - /* Move to next forknum. */ - old_blk = blk; - continue; - } + nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum); - /* Prewarm buffer. */ - buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL, - NULL); - if (BufferIsValid(buf)) - { - apw_state->prewarmed_blocks++; - ReleaseBuffer(buf); - } + /* Prewarm buffers. */ + while (i < apw_state->prewarm_stop_idx && + blk.tablespace == tablespace && + blk.filenumber == filenumber && + blk.forknum == forknum && + have_free_buffer()) + { + CHECK_FOR_INTERRUPTS(); - old_blk = blk; - } + /* Check whether blocknum is valid and within fork file size. */ + if (blk.blocknum >= nblocks) + { + blk = block_info[++i]; + continue; + } - dsm_detach(seg); + buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL, + NULL); + + blk = block_info[++i]; + if (!BufferIsValid(buf)) + break; + + apw_state->prewarmed_blocks++; + ReleaseBuffer(buf); + } + } - /* Release lock on previous relation. */ - if (rel) - { relation_close(rel, AccessShareLock); CommitTransactionCommand(); } + + dsm_detach(seg); } /* From 15e22a2ac71c42652c3720cc679bf0f9d045ea16 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 1 Apr 2025 18:07:38 -0400 Subject: [PATCH 4/4] Use streaming read I/O in autoprewarm Make a read stream for each valid fork of each valid relation represented in the autoprewarm dump file and prewarm those blocks through the read stream API instead of by directly invoking ReadBuffer(). Co-authored-by: Nazir Bilal Yavuz Co-authored-by: Melanie Plageman Reviewed-by: Heikki Linnakangas Reviewed-by: Daniel Gustafsson Reviewed-by: Andrey M. Borodin (earlier versions) Reviewed-by: Kirill Reshke (earlier versions) Reviewed-by: Matheus Alcantara (earlier versions) Discussion: https://postgr.es/m/flat/CAN55FZ3n8Gd%2BhajbL%3D5UkGzu_aHGRqnn%2BxktXq2fuds%3D1AOR6Q%40mail.gmail.com --- contrib/pg_prewarm/autoprewarm.c | 125 +++++++++++++++++++++++++------ src/tools/pgindent/typedefs.list | 1 + 2 files changed, 103 insertions(+), 23 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 761f6a77926b..195dc4b773b1 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -41,6 +41,7 @@ #include "storage/latch.h" #include "storage/lwlock.h" #include "storage/procsignal.h" +#include "storage/read_stream.h" #include "storage/smgr.h" #include "tcop/tcopprot.h" #include "utils/guc.h" @@ -75,6 +76,28 @@ typedef struct AutoPrewarmSharedState int prewarmed_blocks; } AutoPrewarmSharedState; +/* + * Private data passed through the read stream API for our use in the + * callback. + */ +typedef struct AutoPrewarmReadStreamData +{ + /* The array of records containing the blocks we should prewarm. */ + BlockInfoRecord *block_info; + + /* + * pos is the read stream callback's index into block_info. Because the + * read stream may read ahead, pos is likely to be ahead of the index in + * the main loop in autoprewarm_database_main(). + */ + int pos; + Oid tablespace; + RelFileNumber filenumber; + ForkNumber forknum; + BlockNumber nblocks; +} AutoPrewarmReadStreamData; + + PGDLLEXPORT void autoprewarm_main(Datum main_arg); PGDLLEXPORT void autoprewarm_database_main(Datum main_arg); @@ -422,6 +445,54 @@ apw_load_buffers(void) apw_state->prewarmed_blocks, num_elements))); } +/* + * Return the next block number of a specific relation and fork to read + * according to the array of BlockInfoRecord. + */ +static BlockNumber +apw_read_stream_next_block(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data) +{ + AutoPrewarmReadStreamData *p = callback_private_data; + + CHECK_FOR_INTERRUPTS(); + + while (p->pos < apw_state->prewarm_stop_idx) + { + BlockInfoRecord blk = p->block_info[p->pos]; + + if (!have_free_buffer()) + { + p->pos = apw_state->prewarm_stop_idx; + return InvalidBlockNumber; + } + + if (blk.tablespace != p->tablespace) + return InvalidBlockNumber; + + if (blk.filenumber != p->filenumber) + return InvalidBlockNumber; + + if (blk.forknum != p->forknum) + return InvalidBlockNumber; + + p->pos++; + + /* + * Check whether blocknum is valid and within fork file size. + * Fast-forward through any invalid blocks. We want p->pos to reflect + * the location of the next relation or fork before ending the stream. + */ + if (blk.blocknum >= p->nblocks) + continue; + + return blk.blocknum; + } + + return InvalidBlockNumber; +} + /* * Prewarm all blocks for one database (and possibly also global objects, if * those got grouped with this database). @@ -462,8 +533,6 @@ autoprewarm_database_main(Datum main_arg) Oid reloid; Relation rel; - CHECK_FOR_INTERRUPTS(); - /* * All blocks between prewarm_start_idx and prewarm_stop_idx should * belong either to global objects or the same database. @@ -510,6 +579,8 @@ autoprewarm_database_main(Datum main_arg) { ForkNumber forknum = blk.forknum; BlockNumber nblocks; + struct AutoPrewarmReadStreamData p; + ReadStream *stream; Buffer buf; /* @@ -540,32 +611,40 @@ autoprewarm_database_main(Datum main_arg) nblocks = RelationGetNumberOfBlocksInFork(rel, blk.forknum); - /* Prewarm buffers. */ - while (i < apw_state->prewarm_stop_idx && - blk.tablespace == tablespace && - blk.filenumber == filenumber && - blk.forknum == forknum && - have_free_buffer()) + p = (struct AutoPrewarmReadStreamData) { - CHECK_FOR_INTERRUPTS(); - - /* Check whether blocknum is valid and within fork file size. */ - if (blk.blocknum >= nblocks) - { - blk = block_info[++i]; - continue; - } - - buf = ReadBufferExtended(rel, blk.forknum, blk.blocknum, RBM_NORMAL, - NULL); - - blk = block_info[++i]; - if (!BufferIsValid(buf)) - break; + .block_info = block_info, + .pos = i, + .tablespace = tablespace, + .filenumber = filenumber, + .forknum = forknum, + .nblocks = nblocks, + }; + + stream = read_stream_begin_relation(READ_STREAM_FULL, + NULL, + rel, + p.forknum, + apw_read_stream_next_block, + &p, + 0); + /* + * Loop until we've prewarmed all the blocks from this fork. The + * read stream callback will check that we still have free buffers + * before requesting each block from the read stream API. + */ + while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer) + { apw_state->prewarmed_blocks++; ReleaseBuffer(buf); } + + read_stream_end(stream); + + /* Advance i past all the blocks just prewarmed. */ + i = p.pos; + blk = block_info[i]; } relation_close(rel, AccessShareLock); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 8f28d8ff28eb..5ac290fae789 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -175,6 +175,7 @@ AttributeOpts AuthRequest AuthToken AutoPrewarmSharedState +AutoPrewarmReadStreamData AutoVacOpts AutoVacuumShmemStruct AutoVacuumWorkItem