diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 219df1971da6..9184f14e0142 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -57,6 +57,7 @@
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
+#include "utils/injection_point.h"
 
 /* ----------------------------------------------------------------
@@ -741,6 +742,14 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot
 		 * the index.
 		 */
 		Assert(ItemPointerIsValid(&scan->xs_heaptid));
+#ifdef USE_INJECTION_POINTS
+		/* Let tests pause here, between the index read and the heap fetch */
+		if (!IsCatalogRelationOid(scan->indexRelation->rd_id))
+		{
+			INJECTION_POINT("index_getnext_slot_before_fetch");
+		}
+#endif
+
 		if (index_fetch_heap(scan, slot))
 			return true;
 	}
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 53d4a61dc3f1..a9280415633d 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we came
 from.  (Actually, it's even harder than that; see page deletion discussion
 below.)
 
-Page read locks are held only for as long as a scan is examining a page.
+Page read locks are held only for as long as a scan is examining a page
+(except for SnapshotDirty and SnapshotSelf scans; see below).
 To minimize lock/unlock traffic, an index scan always searches a leaf page
 to identify all the matching items at once, copying their heap tuple IDs
 into backend-local storage.  The heap tuple IDs are then processed while
@@ -103,6 +104,16 @@ We also remember the left-link, and follow it when the scan moves backwards
 (though this requires extra handling to account for concurrent splits of
 the left sibling; see detailed move-left algorithm below).
 
+Despite these mechanics, non-MVCC scans (SnapshotDirty and SnapshotSelf) can
+still return inconsistent results.  This can happen when a concurrent
+transaction deletes a tuple and inserts a new version with a new TID on the
+same page.  If the scan has already visited the page and cached its contents
+in backend-local storage, it can skip the old tuple because it was deleted
+and miss the new tuple because the cached page image is never re-read.  To
+prevent this, SnapshotDirty and SnapshotSelf scans retain the read lock on
+the page until we are completely done processing its tuples, preventing
+concurrent modifications that could lead to inconsistent results.
+
 In most cases we release our lock and pin on a page before attempting
 to acquire pin and lock on the page we are moving to.  In a few places
 it is necessary to lock the next page before releasing the current one.
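The scenario the new README paragraph describes is the one exercised by the
dirty_index_scan isolation test added at the end of this patch.  As an
illustrative sketch only (not part of the patch), the two-session sequence
looks roughly like this, using the test spec's table:

    -- Session 1: a speculative insertion probes the index for conflicts using a
    -- dirty snapshot.  Its scan reads the leaf page, caches the matching TIDs in
    -- backend-local storage, and (before this patch) drops the read lock before
    -- fetching the heap tuples.
    INSERT INTO test.tbl VALUES (42, 1)
        ON CONFLICT (i) DO UPDATE SET n = EXCLUDED.n + 1;

    -- Session 2, in that window: the UPDATE marks the old heap tuple dead and
    -- inserts a new version with a new TID, possibly on the same leaf page.
    UPDATE test.tbl SET n = n + 1 WHERE i = 42;

    -- Without the retained lock, session 1 can miss the old version (it fails
    -- the snapshot test) and the new one (its TID is not in the cached page
    -- image), and wrongly concludes that no conflicting row exists.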
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af7..8239076c5182 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -389,6 +389,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
 	}
@@ -445,6 +451,12 @@ btendscan(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
 
@@ -525,6 +537,12 @@ btrestrpos(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 77264ddeecb5..1e1d24b32433 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -61,11 +61,22 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * This will prevent vacuum from stalling in a blocked state trying to read a
  * page when a cursor is sitting on it.
  *
+ * For SnapshotDirty and SnapshotSelf scans, we don't actually unlock the
+ * buffer here; instead we set extra_unlock to indicate that the lock must be
+ * held until we're completely done with this page.  This prevents concurrent
+ * modifications from causing inconsistent results during non-MVCC scans.
+ *
  * See nbtree/README section on making concurrent TID recycling safe.
  */
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
+	if (scan->xs_snapshot->snapshot_type == SNAPSHOT_DIRTY ||
+		scan->xs_snapshot->snapshot_type == SNAPSHOT_SELF)
+	{
+		sp->extra_unlock = true;
+		return;
+	}
 	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
@@ -1527,7 +1538,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
  * _bt_next() -- Get the next item in a scan.
  *
  * On entry, so->currPos describes the current page, which may be pinned
- * but is not locked, and so->currPos.itemIndex identifies which item was
+ * but is not locked (except for SnapshotDirty and SnapshotSelf scans, which
+ * keep the page locked), and so->currPos.itemIndex identifies which item was
  * previously returned.
  *
  * On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2101,10 +2113,11 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on. The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked
+ * (except for SnapshotDirty and SnapshotSelf scans, where the buffer stays
+ * locked until we're done with the page's tuples).  If no pin is held, it's
+ * because _bt_drop_lock_and_maybe_pin dropped the pin eagerly earlier on.
+ * In any case, the scan must have so->currPos.currPage set to a valid block.
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2163,8 +2176,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 
 		/* mark/restore not supported by parallel scans */
 		Assert(!scan->parallel_scan);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF);
 	}
 
+	/*
+	 * For SnapshotDirty/Self scans, we kept the read lock after processing
+	 * the page's tuples (see _bt_drop_lock_and_maybe_pin).  Now that we're
+	 * moving to another page, we must explicitly release that lock.
+	 */
+	if (so->currPos.extra_unlock)
+	{
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+		so->currPos.extra_unlock = false;
+	}
 	BTScanPosUnpinIfPinned(so->currPos);
 
 	/* Walk to the next page with data */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 11802a4c2151..22b9c9d232b6 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3347,13 +3347,17 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
+		/* For SnapshotDirty/Self scans, the buffer is still locked */
+		if (!so->currPos.extra_unlock)
+			_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
 	else
 	{
 		Buffer		buf;
 
+		/* extra_unlock should never be set without a valid buffer pin */
+		Assert(!so->currPos.extra_unlock);
 		droppedpin = true;
 		/* Attempt to re-read the buffer, getting pin and lock. */
@@ -3490,6 +3494,8 @@ _bt_killitems(IndexScanDesc scan)
 	}
 
 	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	/* Reset the extra_unlock flag now that we've released the lock */
+	so->currPos.extra_unlock = false;
 }
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index bdf862b24062..eaf7e8434bbd 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -117,6 +117,7 @@
 #include "utils/multirangetypes.h"
 #include "utils/rangetypes.h"
 #include "utils/snapmgr.h"
+#include "utils/injection_point.h"
 
 /* waitMode argument to check_exclusion_or_unique_constraint() */
 typedef enum
@@ -943,6 +944,9 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
 
 	ExecDropSingleTupleTableSlot(existing_slot);
 
+	/* For tests: report that the check found no conflicting tuple */
+	if (!conflict)
+		INJECTION_POINT("check_exclusion_or_unique_constraint_no_conflict");
 	return !conflict;
 }
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588d3e..2c2485f34bda 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -962,6 +962,7 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 typedef struct BTScanPosData
 {
 	Buffer		buf;			/* currPage buf (invalid means unpinned) */
+	bool		extra_unlock;	/* read lock retained for SnapshotDirty/Self scans */
 
 	/* page details as of the saved position's call to _bt_readpage */
 	BlockNumber currPage;		/* page referenced by items array */
@@ -1009,6 +1010,7 @@ typedef BTScanPosData *BTScanPos;
 )
 #define BTScanPosUnpin(scanpos) \
 	do { \
+		Assert(!(scanpos).extra_unlock); \
 		ReleaseBuffer((scanpos).buf); \
 		(scanpos).buf = InvalidBuffer; \
 	} while (0)
@@ -1028,6 +1030,7 @@ typedef BTScanPosData *BTScanPos;
 	do { \
 		(scanpos).buf = InvalidBuffer; \
 		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).extra_unlock = false; \
 	} while (0)
 
 /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile
index e680991f8d4f..b73f8ac80f2b 100644
--- a/src/test/modules/injection_points/Makefile
+++ b/src/test/modules/injection_points/Makefile
@@ -14,7 +14,7 @@ PGFILEDESC = "injection_points - facility for injection points"
 REGRESS = injection_points hashagg reindex_conc
 REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
-ISOLATION = basic inplace syscache-update-pruned
+ISOLATION = basic inplace syscache-update-pruned dirty_index_scan
 
 TAP_TESTS = 1
diff --git a/src/test/modules/injection_points/expected/dirty_index_scan.out b/src/test/modules/injection_points/expected/dirty_index_scan.out
new file mode 100644
index 000000000000..c286a9fd5b63
--- /dev/null
+++ b/src/test/modules/injection_points/expected/dirty_index_scan.out
@@ -0,0 +1,27 @@
+Parsed test spec with 3 sessions
+
+starting permutation: s1_s1 s2_s1 s3_s1
+injection_points_attach
+-----------------------
+
+(1 row)
+
+step s1_s1: INSERT INTO test.tbl VALUES(42, 1) ON CONFLICT (i) DO UPDATE SET n = EXCLUDED.n + 1;
+step s2_s1: UPDATE test.tbl SET n = n + 1 WHERE i = 42;
+step s3_s1: 
+	SELECT injection_points_detach('index_getnext_slot_before_fetch');
+	SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+
+step s1_s1: <... completed>
+step s2_s1: <... completed>
+step s3_s1: <... completed>
+injection_points_detach
+-----------------------
+
+(1 row)
+
+injection_points_wakeup
+-----------------------
+
+(1 row)
+
diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build
index d61149712fd7..bb3869f9a750 100644
--- a/src/test/modules/injection_points/meson.build
+++ b/src/test/modules/injection_points/meson.build
@@ -47,6 +47,7 @@ tests += {
       'basic',
       'inplace',
       'syscache-update-pruned',
+      'dirty_index_scan',
     ],
     'runningcheck': false, # see syscache-update-pruned
   },
diff --git a/src/test/modules/injection_points/specs/dirty_index_scan.spec b/src/test/modules/injection_points/specs/dirty_index_scan.spec
new file mode 100644
index 000000000000..373bcaf4929f
--- /dev/null
+++ b/src/test/modules/injection_points/specs/dirty_index_scan.spec
@@ -0,0 +1,37 @@
+setup
+{
+	CREATE EXTENSION injection_points;
+	CREATE SCHEMA test;
+	CREATE UNLOGGED TABLE test.tbl(i int PRIMARY KEY, n int);
+	CREATE INDEX tbl_n_idx ON test.tbl(n);
+	INSERT INTO test.tbl VALUES(42, 1);
+}
+
+teardown
+{
+	DROP SCHEMA test CASCADE;
+	DROP EXTENSION injection_points;
+}
+
+session s1
+setup {
+	SELECT injection_points_set_local();
+	SELECT injection_points_attach('check_exclusion_or_unique_constraint_no_conflict', 'error');
+	SELECT injection_points_attach('index_getnext_slot_before_fetch', 'wait');
+}
+
+step s1_s1 { INSERT INTO test.tbl VALUES(42, 1) ON CONFLICT (i) DO UPDATE SET n = EXCLUDED.n + 1; }
+
+session s2
+step s2_s1 { UPDATE test.tbl SET n = n + 1 WHERE i = 42; }
+
+session s3
+step s3_s1 {
+	SELECT injection_points_detach('index_getnext_slot_before_fetch');
+	SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+}
+
+permutation
+	s1_s1
+	s2_s1(*)
+	s3_s1(s1_s1)
\ No newline at end of file
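For manual experimentation outside the isolation tester, the same schedule can
be driven by hand with the functions the spec uses (a sketch only; the three
blocks below run in three separate sessions, and the 'error' action turns a
silently lost conflict into a visible ERROR):

    -- Session A: make the points local to this session, error out if the
    -- constraint check finds no conflict, and pause before the heap fetch.
    SELECT injection_points_set_local();
    SELECT injection_points_attach('check_exclusion_or_unique_constraint_no_conflict', 'error');
    SELECT injection_points_attach('index_getnext_slot_before_fetch', 'wait');
    INSERT INTO test.tbl VALUES (42, 1) ON CONFLICT (i) DO UPDATE SET n = EXCLUDED.n + 1;

    -- Session B: run the concurrent update while session A is waiting.
    UPDATE test.tbl SET n = n + 1 WHERE i = 42;

    -- Session C: let session A continue; with the fix it performs the
    -- conflicting update instead of tripping the error point.
    SELECT injection_points_detach('index_getnext_slot_before_fetch');
    SELECT injection_points_wakeup('index_getnext_slot_before_fetch');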