
Commit c0d88cb

MMeent authored and Commitfest Bot committed
IOS/TableAM: Support AM-specific fast visibility tests
Previously, we assumed VM_ALL_VISIBLE is universal across all AMs. This is probably not the case, so we introduce a new table AM method, "table_index_vischeck_tuples", which lets callers ask the AM whether a tuple is definitely visible to everyone or might be invisible to someone.

The API is intended to replace direct calls to VM_ALL_VISIBLE and as such doesn't include a "definitely dead to everyone" result: the heap AM's VM can't report *definitely dead* from its lookups, so producing such results would be too expensive for the heap AM.

A future commit will use this inside GiST and SP-GiST to fix a race condition between IOS and VACUUM which causes a tuple-visibility bug, and a further patch will add support for this to nbtree.
1 parent f132815 commit c0d88cb
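For orientation before the diff: the sketch below shows, under stated assumptions, how an index AM doing an index-only scan might batch its TIDs through the new interface instead of calling VM_ALL_VISIBLE() directly. The table_index_vischeck_tuples() wrapper name comes from the commit message; the TM_IndexVisibilityCheckOp and TM_VisCheck field names are inferred from the heapam implementation further down, and the helper itself is purely illustrative, not part of this commit.

/*
 * Illustrative sketch, not part of this commit: batch visibility checks
 * for an index-only scan through the table AM instead of direct
 * VM_ALL_VISIBLE() calls.  Struct/field names follow the heapam code in
 * this diff; the wrapper name follows the commit message.
 */
static int
example_count_all_visible(Relation heaprel, ItemPointerData *tids, int ntids)
{
    TM_IndexVisibilityCheckOp checkop;
    TM_VisCheck *checks = palloc(sizeof(TM_VisCheck) * ntids);
    Buffer      vmbuf = InvalidBuffer;
    int         nvisible = 0;

    for (int i = 0; i < ntids; i++)
    {
        checks[i].tidblkno = ItemPointerGetBlockNumber(&tids[i]);
        checks[i].idxoffnum = i;    /* lets the AM restore index order */
        checks[i].vischeckresult = TMVC_Unchecked;
    }

    checkop.checktids = checks;
    checkop.checkntids = ntids;
    checkop.vmbuf = &vmbuf;

    /* For heap relations this dispatches to heap_index_vischeck_tuples(). */
    table_index_vischeck_tuples(heaprel, &checkop);

    for (int i = 0; i < ntids; i++)
    {
        if (checks[i].vischeckresult == TMVC_Visible)
            nvisible++;         /* no heap fetch needed for this TID */
        /* TMVC_MaybeVisible entries would still require a heap fetch */
    }

    if (BufferIsValid(vmbuf))
        ReleaseBuffer(vmbuf);
    pfree(checks);

    return nvisible;
}

Per the commit message, the point of this shape is that the AM can record the check result alongside each queued tuple and release its pins, instead of keeping thousands of pins alive that would conflict with VACUUM.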

11 files changed (+430 −82 lines)

src/backend/access/heap/heapam.c (+177)

@@ -101,11 +101,37 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status
 						uint16 infomask, Relation rel, int *remaining,
 						bool logLockFailure);
 static void index_delete_sort(TM_IndexDeleteOp *delstate);
+static inline int heap_ivc_process_block(Relation rel, Buffer *vmbuf,
+						TM_VisCheck *checks, int nchecks);
+static void heap_ivc_process_all(Relation rel, Buffer *vmbuf,
+						TM_VisCheck *checks, int nchecks);
 static int	bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
 						bool *copy);
 
+/* sort template definitions for index visibility checks */
+#define ST_SORT heap_ivc_sortby_tidheapblk
+#define ST_ELEMENT_TYPE TM_VisCheck
+#define ST_DECLARE
+#define ST_DEFINE
+#define ST_SCOPE static inline
+#define ST_COMPARE(a, b) ( \
+	a->tidblkno < b->tidblkno ? -1 : ( \
+		a->tidblkno > b->tidblkno ? 1 : 0 \
+	) \
+)
+
+#include "lib/sort_template.h"
+
+#define ST_SORT heap_ivc_sortby_idx
+#define ST_ELEMENT_TYPE TM_VisCheck
+#define ST_DECLARE
+#define ST_DEFINE
+#define ST_SCOPE static inline
+#define ST_COMPARE(a, b) (((int) a->idxoffnum) - ((int) b->idxoffnum))
+#include "lib/sort_template.h"
+
 
 /*
  * Each tuple lock mode has a corresponding heavyweight lock, and one or two

@@ -8750,6 +8776,157 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
 	return nblocksfavorable;
 }
 
+/*
+ * heapam implementation of tableam's index_vischeck_tuples interface.
+ *
+ * This helper function is called by index AMs during index-only scans,
+ * to do VM-based visibility checks on individual tuples, so that the AM
+ * can hold the tuple in memory (e.g. for reordering) for extended periods
+ * of time without holding thousands of pins that would conflict with
+ * VACUUM.
+ *
+ * It's possible for this to generate a fair amount of I/O, since we may be
+ * checking hundreds of tuples from a single index block, but that is
+ * preferred over holding thousands of pins.
+ *
+ * We use heuristics to balance the cost of sorting TIDs against the cost
+ * of repeated VM page lookups.
+ */
+void
+heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	Buffer		vmbuf = *checkop->vmbuf;
+	Buffer		storvmbuf = vmbuf;
+	TM_VisCheck *checks = checkop->checktids;
+	int			checkntids = checkop->checkntids;
+	int			upcomingvmbufchanges = 0;
+
+	/*
+	 * The first index scan will have to pin the VM buffer, and that first
+	 * change in the vm buffer shouldn't put us into the expensive VM page &
+	 * sort path; so we special-case this operation.
+	 */
+	if (!BufferIsValid(vmbuf))
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+		checkntids -= processed;
+		checks += processed;
+		storvmbuf = vmbuf;
+		Assert(processed > 0);
+	}
+
+	while (vmbuf == storvmbuf && checkntids > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+
+		Assert(processed <= checkntids);
+
+		checkntids -= processed;
+		checks += processed;
+	}
+
+	*checkop->vmbuf = vmbuf;
+
+	if (checkntids == 0)
+	{
+		return;
+	}
+
+	upcomingvmbufchanges = 0;
+
+	for (int i = 1; i < checkntids; i++)
+	{
+		/*
+		 * Instead of storing the previous iteration's result, we only
+		 * compare the block numbers.
+		 */
+		BlockNumber lastblkno = checks[i - 1].tidblkno;
+		BlockNumber newblkno = checks[i].tidblkno;
+
+		/* divide-by-constant can be faster than BufferGetBlockNumber() */
+		BlockNumber lastvmblkno = HEAPBLK_TO_VMBLOCK(lastblkno);
+		BlockNumber newvmblkno = HEAPBLK_TO_VMBLOCK(newblkno);
+
+		if (lastvmblkno != newvmblkno)
+			upcomingvmbufchanges++;
+	}
+
+	if (upcomingvmbufchanges <= pg_ceil_log2_32(checkntids))
+	{
+		/*
+		 * Not many VM buffer changes ahead, so do all visibility checks
+		 * without sorting.
+		 */
+		heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+		return;
+	}
+
+	/*
+	 * Order the TIDs in heap order, so that we will visit every VM page at
+	 * most once.
+	 */
+	heap_ivc_sortby_tidheapblk(checks, checkntids);
+
+	/* do all visibility checks */
+	heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+	/* put the checks back in index order */
+	heap_ivc_sortby_idx(checks, checkntids);
+}
+
+static inline int
+heap_ivc_process_block(Relation rel, Buffer *vmbuf, TM_VisCheck *checks,
+					   int nchecks)
+{
+	BlockNumber blkno;
+	BlockNumber prevblkno = blkno = checks->tidblkno;
+	TMVC_Result result;
+	int			processed = 0;
+
+	if (VM_ALL_VISIBLE(rel, blkno, vmbuf))
+		result = TMVC_Visible;
+	else
+		result = TMVC_MaybeVisible;
+
+	do
+	{
+		checks->vischeckresult = result;
+
+		nchecks--;
+		processed++;
+		checks++;
+
+		if (nchecks <= 0)
+			return processed;
+
+		blkno = checks->tidblkno;
+	} while (blkno == prevblkno);
+
+	return processed;
+}
+
+static void
+heap_ivc_process_all(Relation rel, Buffer *vmbuf,
+					 TM_VisCheck *checks, int nchecks)
+{
+	while (nchecks > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, vmbuf, checks, nchecks);
+
+		Assert(processed <= nchecks);
+
+		nchecks -= processed;
+		checks += processed;
+	}
+}
+
 /*
  * Perform XLogInsert for a heap-visible operation. 'block' is the block
  * being marked all-visible, and vm_buffer is the buffer containing the
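To put the sorting heuristic above in concrete terms: with 1,000 TIDs to check, pg_ceil_log2_32(1000) is 10, so the batch is processed in index order as long as it would switch visibility-map pages at most ten times; past that threshold, the two sorts (into heap-block order and back into index order) are expected to be cheaper than the extra VM buffer replacements.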

src/backend/access/heap/heapam_handler.c (+1)

@@ -2648,6 +2648,7 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_tid_valid = heapam_tuple_tid_valid,
 	.tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
 	.index_delete_tuples = heap_index_delete_tuples,
+	.index_vischeck_tuples = heap_index_vischeck_tuples,
 
 	.relation_set_new_filelocator = heapam_relation_set_new_filelocator,
 	.relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
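The registration above implies that struct TableAmRoutine gains an index_vischeck_tuples callback next to index_delete_tuples, plus a dispatch wrapper in tableam.h. Neither header change is part of this excerpt; assuming the wrapper mirrors table_index_delete_tuples() and takes the argument type used by heap_index_vischeck_tuples(), it would look roughly like this:

/*
 * Sketch only: presumed tableam.h wrapper (not shown in this excerpt),
 * dispatching the visibility-check batch to the relation's table AM.
 */
static inline void
table_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
{
    rel->rd_tableam->index_vischeck_tuples(rel, checkop);
}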

src/backend/access/heap/visibilitymap.c (+14 −25)

@@ -107,17 +107,6 @@
  */
 #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
 
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK8 (0x55)	/* The lower bit of each bit pair */
 #define FROZEN_MASK8 (0xaa)		/* The upper bit of each bit pair */

@@ -137,9 +126,9 @@ static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks);
 bool
 visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	int			mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	int			mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	int			mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	uint8		mask = flags << mapOffset;
 	char	   *map;
 	bool		cleared = false;

@@ -190,7 +179,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags
 void
 visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
 
 	/* Reuse the old pinned buffer if possible */
 	if (BufferIsValid(*vmbuf))

@@ -214,7 +203,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 bool
 visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
 
 	return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock;
 }

@@ -247,9 +236,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
 				  uint8 flags)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	Page		page;
 	uint8	   *map;
 	uint8		status;

@@ -340,9 +329,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 uint8
 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	char	   *map;
 	uint8		result;
 

@@ -445,9 +434,9 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
 	BlockNumber newnblocks;
 
 	/* last remaining block, byte, and bit */
-	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
-	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
-	uint8		truncOffset = HEAPBLK_TO_OFFSET(nheapblocks);
+	BlockNumber truncBlock = HEAPBLK_TO_VMBLOCK(nheapblocks);
+	uint32		truncByte = HEAPBLK_TO_VMBYTE(nheapblocks);
+	uint8		truncOffset = HEAPBLK_TO_VMOFFSET(nheapblocks);
 
 #ifdef TRACE_VISIBILITYMAP
 	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
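The HEAPBLK_TO_MAPBLOCK/MAPBYTE/OFFSET macros removed above are evidently superseded by HEAPBLK_TO_VMBLOCK/VMBYTE/VMOFFSET, which heapam.c now also uses, so the definitions presumably move to a header that is not part of this excerpt. Assuming the arithmetic is unchanged (and that MAPSIZE and the HEAPBLOCKS_PER_* constants are made available to that header as well), the renamed macros would read roughly:

/* Presumed relocated definitions; the destination header is not shown here. */
#define HEAPBLOCKS_PER_BYTE		(BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
#define HEAPBLOCKS_PER_PAGE		(MAPSIZE * HEAPBLOCKS_PER_BYTE)

#define HEAPBLK_TO_VMBLOCK(x)	((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_VMBYTE(x)	(((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_VMOFFSET(x)	(((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)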

src/backend/access/index/indexam.c (+6)

@@ -628,6 +628,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* XXX: we should assert that a snapshot is pushed or registered */
 	Assert(TransactionIdIsValid(RecentXmin));
 
+	/*
+	 * Reset xs_visrecheck, so we don't confuse the next tuple's visibility
+	 * state with that of the previous tuple.
+	 */
+	scan->xs_visrecheck = TMVC_Unchecked;
+
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
 	 * keys, and puts the TID into scan->xs_heaptid. It should also set
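The xs_visrecheck field gives amgettuple a way to hand a precomputed visibility result to the caller. The index-only scan executor is not part of this excerpt, but a consumer of the field would presumably follow a pattern like the sketch below; the helper name is illustrative, while heapRelation, xs_heaptid, and VM_ALL_VISIBLE are existing names.

/*
 * Sketch only: how an index-only scan could decide whether a heap fetch is
 * needed, preferring the AM-provided xs_visrecheck result and falling back
 * to a direct VM lookup when the AM left it unchecked.
 */
static bool
example_tuple_is_all_visible(IndexScanDesc scandesc, Buffer *vmbuf)
{
    switch (scandesc->xs_visrecheck)
    {
        case TMVC_Visible:
            return true;        /* AM already proved all-visible */
        case TMVC_MaybeVisible:
            return false;       /* AM checked but couldn't prove it */
        case TMVC_Unchecked:
        default:
            /* AM didn't check; do the traditional VM lookup here */
            return VM_ALL_VISIBLE(scandesc->heapRelation,
                                  ItemPointerGetBlockNumber(&scandesc->xs_heaptid),
                                  vmbuf);
    }
}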

src/backend/access/table/tableamapi.c (+1)

@@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler)
 	Assert(routine->tuple_get_latest_tid != NULL);
 	Assert(routine->tuple_satisfies_snapshot != NULL);
 	Assert(routine->index_delete_tuples != NULL);
+	Assert(routine->index_vischeck_tuples != NULL);
 
 	Assert(routine->tuple_insert != NULL);
 