diff options
Diffstat (limited to 'src/include')
| -rw-r--r-- | src/include/access/nbtree.h | 14 | ||||
| -rw-r--r-- | src/include/access/parallel.h | 4 | ||||
| -rw-r--r-- | src/include/access/relscan.h | 1 | ||||
| -rw-r--r-- | src/include/catalog/index.h | 9 | ||||
| -rw-r--r-- | src/include/miscadmin.h | 1 | ||||
| -rw-r--r-- | src/include/nodes/execnodes.h | 6 | ||||
| -rw-r--r-- | src/include/optimizer/paths.h | 2 | ||||
| -rw-r--r-- | src/include/optimizer/planner.h | 1 | ||||
| -rw-r--r-- | src/include/pgstat.h | 1 | ||||
| -rw-r--r-- | src/include/storage/buffile.h | 2 | ||||
| -rw-r--r-- | src/include/storage/fd.h | 1 | ||||
| -rw-r--r-- | src/include/utils/logtape.h | 39 | ||||
| -rw-r--r-- | src/include/utils/tuplesort.h | 132 |
13 files changed, 184 insertions, 29 deletions
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index d28f413c663..0f6a40168ca 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -21,6 +21,7 @@ #include "catalog/pg_index.h" #include "lib/stringinfo.h" #include "storage/bufmgr.h" +#include "storage/shm_toc.h" /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -430,8 +431,6 @@ typedef BTScanOpaqueData *BTScanOpaque; /* * external entry points for btree, in nbtree.c */ -extern IndexBuildResult *btbuild(Relation heap, Relation index, - struct IndexInfo *indexInfo); extern void btbuildempty(Relation index); extern bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, @@ -547,13 +546,8 @@ extern bool btvalidate(Oid opclassoid); /* * prototypes for functions in nbtsort.c */ -typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */ - -extern BTSpool *_bt_spoolinit(Relation heap, Relation index, - bool isunique, bool isdead); -extern void _bt_spooldestroy(BTSpool *btspool); -extern void _bt_spool(BTSpool *btspool, ItemPointer self, - Datum *values, bool *isnull); -extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); +extern IndexBuildResult *btbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); +extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc); #endif /* NBTREE_H */ diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index d0c218b1854..025691fd82d 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -59,7 +59,9 @@ extern PGDLLIMPORT bool InitializingParallelWorker; #define IsParallelWorker() (ParallelWorkerNumber >= 0) -extern ParallelContext *CreateParallelContext(const char *library_name, const char *function_name, int nworkers); +extern ParallelContext *CreateParallelContext(const char *library_name, + const char *function_name, int nworkers, + bool serializable_okay); extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void LaunchParallelWorkers(ParallelContext *pcxt); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 9c603ca637a..18c7dedd5d3 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -39,6 +39,7 @@ typedef struct ParallelHeapScanDescData BlockNumber phs_startblock; /* starting block number */ pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to * workers so far. */ + bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelHeapScanDescData; diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 235e180299c..a5cd8ddb1eb 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -104,14 +104,16 @@ extern void index_build(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, bool isprimary, - bool isreindex); + bool isreindex, + bool parallel); extern double IndexBuildHeapScan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, bool allow_sync, IndexBuildCallback callback, - void *callback_state); + void *callback_state, + HeapScanDesc scan); extern double IndexBuildHeapRangeScan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, @@ -120,7 +122,8 @@ extern double IndexBuildHeapRangeScan(Relation heapRelation, BlockNumber start_blockno, BlockNumber end_blockno, IndexBuildCallback callback, - void *callback_state); + void *callback_state, + HeapScanDesc scan); extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 54ee2737477..429c0554899 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -241,6 +241,7 @@ extern bool enableFsync; extern PGDLLIMPORT bool allowSystemTableMods; extern PGDLLIMPORT int work_mem; extern PGDLLIMPORT int maintenance_work_mem; +extern PGDLLIMPORT int max_parallel_maintenance_workers; extern int VacuumCostPageHit; extern int VacuumCostPageMiss; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 1bf67455e07..a2a2a9f3d4d 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -132,11 +132,12 @@ typedef struct ExprState * ReadyForInserts is it valid for inserts? * Concurrent are we doing a concurrent index build? * BrokenHotChain did we detect any broken HOT chains? + * ParallelWorkers # of workers requested (excludes leader) * AmCache private cache area for index AM * Context memory context holding this IndexInfo * - * ii_Concurrent and ii_BrokenHotChain are used only during index build; - * they're conventionally set to false otherwise. + * ii_Concurrent, ii_BrokenHotChain, and ii_ParallelWorkers are used only + * during index build; they're conventionally zeroed otherwise. * ---------------- */ typedef struct IndexInfo @@ -158,6 +159,7 @@ typedef struct IndexInfo bool ii_ReadyForInserts; bool ii_Concurrent; bool ii_BrokenHotChain; + int ii_ParallelWorkers; Oid ii_Am; void *ii_AmCache; MemoryContext ii_Context; diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 0072b7aa0d4..b6be259ff73 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -55,7 +55,7 @@ extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed, extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel); extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages, - double index_pages); + double index_pages, int max_workers); extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual); extern void generate_partition_wise_join_paths(PlannerInfo *root, diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 29173d36c49..0d8b88d78be 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -56,6 +56,7 @@ extern Expr *expression_planner(Expr *expr); extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); +extern int plan_create_index_workers(Oid tableOid, Oid indexOid); extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, bool *part_cols_updated); diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 3d3c0b64fc3..be2f59239bf 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -826,6 +826,7 @@ typedef enum WAIT_EVENT_MQ_SEND, WAIT_EVENT_PARALLEL_FINISH, WAIT_EVENT_PARALLEL_BITMAP_SCAN, + WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN, WAIT_EVENT_PROCARRAY_GROUP_UPDATE, WAIT_EVENT_CLOG_GROUP_UPDATE, WAIT_EVENT_REPLICATION_ORIGIN_DROP, diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index a3df056a61b..a6cdeb451c1 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -43,6 +43,8 @@ extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size); extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence); extern void BufFileTell(BufFile *file, int *fileno, off_t *offset); extern int BufFileSeekBlock(BufFile *file, long blknum); +extern off_t BufFileSize(BufFile *file); +extern long BufFileAppend(BufFile *target, BufFile *source); extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name); extern void BufFileExportShared(BufFile *file); diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index db5ca166794..4244e7b1fd8 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -78,6 +78,7 @@ extern char *FilePathName(File file); extern int FileGetRawDesc(File file); extern int FileGetRawFlags(File file); extern mode_t FileGetRawMode(File file); +extern off_t FileGetSize(File file); /* Operations used for sharing named temporary files */ extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure); diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h index 88662c10a43..9bf1d801424 100644 --- a/src/include/utils/logtape.h +++ b/src/include/utils/logtape.h @@ -16,15 +16,49 @@ #ifndef LOGTAPE_H #define LOGTAPE_H +#include "storage/sharedfileset.h" + /* LogicalTapeSet is an opaque type whose details are not known outside logtape.c. */ typedef struct LogicalTapeSet LogicalTapeSet; /* + * The approach tuplesort.c takes to parallel external sorts is that workers, + * whose state is almost the same as independent serial sorts, are made to + * produce a final materialized tape of sorted output in all cases. This is + * frozen, just like any case requiring a final materialized tape. However, + * there is one difference, which is that freezing will also export an + * underlying shared fileset BufFile for sharing. Freezing produces TapeShare + * metadata for the worker when this happens, which is passed along through + * shared memory to leader. + * + * The leader process can then pass an array of TapeShare metadata (one per + * worker participant) to LogicalTapeSetCreate(), alongside a handle to a + * shared fileset, which is sufficient to construct a new logical tapeset that + * consists of each of the tapes materialized by workers. + * + * Note that while logtape.c does create an empty leader tape at the end of the + * tapeset in the leader case, it can never be written to due to a restriction + * in the shared buffile infrastructure. + */ +typedef struct TapeShare +{ + /* + * firstblocknumber is first block that should be read from materialized + * tape. + * + * buffilesize is the size of associated BufFile following freezing. + */ + long firstblocknumber; + off_t buffilesize; +} TapeShare; + +/* * prototypes for functions in logtape.c */ -extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes); +extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes, TapeShare *shared, + SharedFileSet *fileset, int worker); extern void LogicalTapeSetClose(LogicalTapeSet *lts); extern void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts); extern size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum, @@ -34,7 +68,8 @@ extern void LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, extern void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size); extern void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum); -extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum); +extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, + TapeShare *share); extern size_t LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size); extern void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum, diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 5d57c503ab2..d2e6754f043 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -8,7 +8,8 @@ * if necessary). It works efficiently for both small and large amounts * of data. Small amounts are sorted in-memory using qsort(). Large * amounts are sorted using temporary files and a standard external sort - * algorithm. + * algorithm. Parallel sorts use a variant of this external sort + * algorithm, and are typically only used for large amounts of data. * * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -23,13 +24,39 @@ #include "access/itup.h" #include "executor/tuptable.h" #include "fmgr.h" +#include "storage/dsm.h" #include "utils/relcache.h" -/* Tuplesortstate is an opaque type whose details are not known outside - * tuplesort.c. +/* + * Tuplesortstate and Sharedsort are opaque types whose details are not + * known outside tuplesort.c. */ typedef struct Tuplesortstate Tuplesortstate; +typedef struct Sharedsort Sharedsort; + +/* + * Tuplesort parallel coordination state, allocated by each participant in + * local memory. Participant caller initializes everything. See usage notes + * below. + */ +typedef struct SortCoordinateData +{ + /* Worker process? If not, must be leader. */ + bool isWorker; + + /* + * Leader-process-passed number of participants known launched (workers + * set this to -1). Includes state within leader needed for it to + * participate as a worker, if any. + */ + int nParticipants; + + /* Private opaque state (points to shared memory) */ + Sharedsort *sharedsort; +} SortCoordinateData; + +typedef struct SortCoordinateData *SortCoordinate; /* * Data structures for reporting sort statistics. Note that @@ -66,6 +93,8 @@ typedef struct TuplesortInstrumentation * sorting HeapTuples and two more for sorting IndexTuples. Yet another * API supports sorting bare Datums. * + * Serial sort callers should pass NULL for their coordinate argument. + * * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't * preserve the system columns (tuple identity and transaction visibility * info). The sort keys are specified by column numbers within the tuples @@ -84,30 +113,107 @@ typedef struct TuplesortInstrumentation * * The "index_hash" API is similar to index_btree, but the tuples are * actually sorted by their hash codes not the raw data. + * + * Parallel sort callers are required to coordinate multiple tuplesort states + * in a leader process and one or more worker processes. The leader process + * must launch workers, and have each perform an independent "partial" + * tuplesort, typically fed by the parallel heap interface. The leader later + * produces the final output (internally, it merges runs output by workers). + * + * Callers must do the following to perform a sort in parallel using multiple + * worker processes: + * + * 1. Request tuplesort-private shared memory for n workers. Use + * tuplesort_estimate_shared() to get the required size. + * 2. Have leader process initialize allocated shared memory using + * tuplesort_initialize_shared(). Launch workers. + * 3. Initialize a coordinate argument within both the leader process, and + * for each worker process. This has a pointer to the shared + * tuplesort-private structure, as well as some caller-initialized fields. + * Leader's coordinate argument reliably indicates number of workers + * launched (this is unused by workers). + * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine, + * (passing the coordinate argument) within each worker. The workMem + * arguments need not be identical. All other arguments should match + * exactly, though. + * 5. tuplesort_attach_shared() should be called by all workers. Feed tuples + * to each worker, and call tuplesort_performsort() within each when input + * is exhausted. + * 6. Call tuplesort_end() in each worker process. Worker processes can shut + * down once tuplesort_end() returns. + * 7. Begin a tuplesort in the leader using the same tuplesort_begin* + * routine, passing a leader-appropriate coordinate argument (this can + * happen as early as during step 3, actually, since we only need to know + * the number of workers successfully launched). The leader must now wait + * for workers to finish. Caller must use own mechanism for ensuring that + * next step isn't reached until all workers have called and returned from + * tuplesort_performsort(). (Note that it's okay if workers have already + * also called tuplesort_end() by then.) + * 8. Call tuplesort_performsort() in leader. Consume output using the + * appropriate tuplesort_get* routine. Leader can skip this step if + * tuplesort turns out to be unnecessary. + * 9. Call tuplesort_end() in leader. + * + * This division of labor assumes nothing about how input tuples are produced, + * but does require that caller combine the state of multiple tuplesorts for + * any purpose other than producing the final output. For example, callers + * must consider that tuplesort_get_stats() reports on only one worker's role + * in a sort (or the leader's role), and not statistics for the sort as a + * whole. + * + * Note that callers may use the leader process to sort runs as if it was an + * independent worker process (prior to the process performing a leader sort + * to produce the final sorted output). Doing so only requires a second + * "partial" tuplesort within the leader process, initialized like that of a + * worker process. The steps above don't touch on this directly. The only + * difference is that the tuplesort_attach_shared() call is never needed within + * leader process, because the backend as a whole holds the shared fileset + * reference. A worker Tuplesortstate in leader is expected to do exactly the + * same amount of total initial processing work as a worker process + * Tuplesortstate, since the leader process has nothing else to do before + * workers finish. + * + * Note that only a very small amount of memory will be allocated prior to + * the leader state first consuming input, and that workers will free the + * vast majority of their memory upon returning from tuplesort_performsort(). + * Callers can rely on this to arrange for memory to be used in a way that + * respects a workMem-style budget across an entire parallel sort operation. + * + * Callers are responsible for parallel safety in general. However, they + * can at least rely on there being no parallel safety hazards within + * tuplesort, because tuplesort thinks of the sort as several independent + * sorts whose results are combined. Since, in general, the behavior of + * sort operators is immutable, caller need only worry about the parallel + * safety of whatever the process is through which input tuples are + * generated (typically, caller uses a parallel heap scan). */ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, - int workMem, bool randomAccess); + int workMem, SortCoordinate coordinate, + bool randomAccess); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - int workMem, bool randomAccess); + Relation indexRel, int workMem, + SortCoordinate coordinate, bool randomAccess); extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, - int workMem, bool randomAccess); + int workMem, SortCoordinate coordinate, + bool randomAccess); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets, - int workMem, bool randomAccess); + int workMem, SortCoordinate coordinate, + bool randomAccess); extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, - int workMem, bool randomAccess); + int workMem, SortCoordinate coordinate, + bool randomAccess); extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); @@ -141,10 +247,16 @@ extern const char *tuplesort_space_type_name(TuplesortSpaceType t); extern int tuplesort_merge_order(int64 allowedMem); +extern Size tuplesort_estimate_shared(int nworkers); +extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, + dsm_segment *seg); +extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg); + /* * These routines may only be called if randomAccess was specified 'true'. * Likewise, backwards scan in gettuple/getdatum is only allowed if - * randomAccess was specified. + * randomAccess was specified. Note that parallel sorts do not support + * randomAccess. */ extern void tuplesort_rescan(Tuplesortstate *state); |
