Diffstat (limited to 'src/include')
-rw-r--r--  src/include/access/nbtree.h      |  14
-rw-r--r--  src/include/access/parallel.h    |   4
-rw-r--r--  src/include/access/relscan.h     |   1
-rw-r--r--  src/include/catalog/index.h      |   9
-rw-r--r--  src/include/miscadmin.h          |   1
-rw-r--r--  src/include/nodes/execnodes.h    |   6
-rw-r--r--  src/include/optimizer/paths.h    |   2
-rw-r--r--  src/include/optimizer/planner.h  |   1
-rw-r--r--  src/include/pgstat.h             |   1
-rw-r--r--  src/include/storage/buffile.h    |   2
-rw-r--r--  src/include/storage/fd.h         |   1
-rw-r--r--  src/include/utils/logtape.h      |  39
-rw-r--r--  src/include/utils/tuplesort.h    | 132
13 files changed, 184 insertions(+), 29 deletions(-)
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d28f413c663..0f6a40168ca 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -21,6 +21,7 @@
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
+#include "storage/shm_toc.h"
/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
typedef uint16 BTCycleId;
@@ -430,8 +431,6 @@ typedef BTScanOpaqueData *BTScanOpaque;
/*
* external entry points for btree, in nbtree.c
*/
-extern IndexBuildResult *btbuild(Relation heap, Relation index,
- struct IndexInfo *indexInfo);
extern void btbuildempty(Relation index);
extern bool btinsert(Relation rel, Datum *values, bool *isnull,
ItemPointer ht_ctid, Relation heapRel,
@@ -547,13 +546,8 @@ extern bool btvalidate(Oid opclassoid);
/*
* prototypes for functions in nbtsort.c
*/
-typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
-
-extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
- bool isunique, bool isdead);
-extern void _bt_spooldestroy(BTSpool *btspool);
-extern void _bt_spool(BTSpool *btspool, ItemPointer self,
- Datum *values, bool *isnull);
-extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
+extern IndexBuildResult *btbuild(Relation heap, Relation index,
+ struct IndexInfo *indexInfo);
+extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
#endif /* NBTREE_H */
diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h
index d0c218b1854..025691fd82d 100644
--- a/src/include/access/parallel.h
+++ b/src/include/access/parallel.h
@@ -59,7 +59,9 @@ extern PGDLLIMPORT bool InitializingParallelWorker;
#define IsParallelWorker() (ParallelWorkerNumber >= 0)
-extern ParallelContext *CreateParallelContext(const char *library_name, const char *function_name, int nworkers);
+extern ParallelContext *CreateParallelContext(const char *library_name,
+ const char *function_name, int nworkers,
+ bool serializable_okay);
extern void InitializeParallelDSM(ParallelContext *pcxt);
extern void ReinitializeParallelDSM(ParallelContext *pcxt);
extern void LaunchParallelWorkers(ParallelContext *pcxt);
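For orientation, here is a minimal sketch of how a caller might drive the revised CreateParallelContext() signature for a parallel index build. The entry point name comes from the nbtree.h hunk above; passing true for serializable_okay and the surrounding transaction-state calls are assumptions about intended usage, not something this header alone establishes.

#include "access/parallel.h"
#include "access/xact.h"

static void
launch_btbuild_workers(int nworkers)
{
	ParallelContext *pcxt;

	EnterParallelMode();

	/* serializable_okay passed as true: assumed safe for index builds */
	pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
								 nworkers, true);

	/* ... shm_toc_estimate_*() calls would normally go here ... */
	InitializeParallelDSM(pcxt);
	LaunchParallelWorkers(pcxt);

	/* ... coordinate with workers via the toc, then wait ... */
	WaitForParallelWorkersToFinish(pcxt);
	DestroyParallelContext(pcxt);
	ExitParallelMode();
}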
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 9c603ca637a..18c7dedd5d3 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -39,6 +39,7 @@ typedef struct ParallelHeapScanDescData
BlockNumber phs_startblock; /* starting block number */
pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to
* workers so far. */
+ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */
char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER];
} ParallelHeapScanDescData;
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 235e180299c..a5cd8ddb1eb 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -104,14 +104,16 @@ extern void index_build(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool isprimary,
- bool isreindex);
+ bool isreindex,
+ bool parallel);
extern double IndexBuildHeapScan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool allow_sync,
IndexBuildCallback callback,
- void *callback_state);
+ void *callback_state,
+ HeapScanDesc scan);
extern double IndexBuildHeapRangeScan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
@@ -120,7 +122,8 @@ extern double IndexBuildHeapRangeScan(Relation heapRelation,
BlockNumber start_blockno,
BlockNumber end_blockno,
IndexBuildCallback callback,
- void *callback_state);
+ void *callback_state,
+ HeapScanDesc scan);
extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 54ee2737477..429c0554899 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -241,6 +241,7 @@ extern bool enableFsync;
extern PGDLLIMPORT bool allowSystemTableMods;
extern PGDLLIMPORT int work_mem;
extern PGDLLIMPORT int maintenance_work_mem;
+extern PGDLLIMPORT int max_parallel_maintenance_workers;
extern int VacuumCostPageHit;
extern int VacuumCostPageMiss;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 1bf67455e07..a2a2a9f3d4d 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -132,11 +132,12 @@ typedef struct ExprState
* ReadyForInserts is it valid for inserts?
* Concurrent are we doing a concurrent index build?
* BrokenHotChain did we detect any broken HOT chains?
+ * ParallelWorkers # of workers requested (excludes leader)
* AmCache private cache area for index AM
* Context memory context holding this IndexInfo
*
- * ii_Concurrent and ii_BrokenHotChain are used only during index build;
- * they're conventionally set to false otherwise.
+ * ii_Concurrent, ii_BrokenHotChain, and ii_ParallelWorkers are used only
+ * during index build; they're conventionally zeroed otherwise.
* ----------------
*/
typedef struct IndexInfo
@@ -158,6 +159,7 @@ typedef struct IndexInfo
bool ii_ReadyForInserts;
bool ii_Concurrent;
bool ii_BrokenHotChain;
+ int ii_ParallelWorkers;
Oid ii_Am;
void *ii_AmCache;
MemoryContext ii_Context;
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 0072b7aa0d4..b6be259ff73 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -55,7 +55,7 @@ extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed,
extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel);
extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages,
- double index_pages);
+ double index_pages, int max_workers);
extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel,
Path *bitmapqual);
extern void generate_partition_wise_join_paths(PlannerInfo *root,
diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h
index 29173d36c49..0d8b88d78be 100644
--- a/src/include/optimizer/planner.h
+++ b/src/include/optimizer/planner.h
@@ -56,6 +56,7 @@ extern Expr *expression_planner(Expr *expr);
extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr);
extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid);
+extern int plan_create_index_workers(Oid tableOid, Oid indexOid);
extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti,
bool *part_cols_updated);
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 3d3c0b64fc3..be2f59239bf 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -826,6 +826,7 @@ typedef enum
WAIT_EVENT_MQ_SEND,
WAIT_EVENT_PARALLEL_FINISH,
WAIT_EVENT_PARALLEL_BITMAP_SCAN,
+ WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN,
WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
WAIT_EVENT_CLOG_GROUP_UPDATE,
WAIT_EVENT_REPLICATION_ORIGIN_DROP,
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index a3df056a61b..a6cdeb451c1 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -43,6 +43,8 @@ extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
extern int BufFileSeekBlock(BufFile *file, long blknum);
+extern off_t BufFileSize(BufFile *file);
+extern long BufFileAppend(BufFile *target, BufFile *source);
extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
extern void BufFileExportShared(BufFile *file);
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index db5ca166794..4244e7b1fd8 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -78,6 +78,7 @@ extern char *FilePathName(File file);
extern int FileGetRawDesc(File file);
extern int FileGetRawFlags(File file);
extern mode_t FileGetRawMode(File file);
+extern off_t FileGetSize(File file);
/* Operations used for sharing named temporary files */
extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h
index 88662c10a43..9bf1d801424 100644
--- a/src/include/utils/logtape.h
+++ b/src/include/utils/logtape.h
@@ -16,15 +16,49 @@
#ifndef LOGTAPE_H
#define LOGTAPE_H
+#include "storage/sharedfileset.h"
+
/* LogicalTapeSet is an opaque type whose details are not known outside logtape.c. */
typedef struct LogicalTapeSet LogicalTapeSet;
/*
+ * The approach tuplesort.c takes to parallel external sorts is that workers,
+ * whose state is almost the same as that of an independent serial sort, are
+ * made to produce a final materialized tape of sorted output in all cases.
+ * This tape is frozen, just like in any case requiring a final materialized
+ * tape.  However, there is one difference: freezing will also export an
+ * underlying shared-fileset BufFile for sharing.  When this happens, freezing
+ * produces TapeShare metadata for the worker, which is passed along through
+ * shared memory to the leader.
+ *
+ * The leader process can then pass an array of TapeShare metadata (one per
+ * worker participant) to LogicalTapeSetCreate(), alongside a handle to a
+ * shared fileset, which is sufficient to construct a new logical tapeset that
+ * consists of each of the tapes materialized by workers.
+ *
+ * Note that while logtape.c does create an empty leader tape at the end of the
+ * tapeset in the leader case, it can never be written to due to a restriction
+ * in the shared buffile infrastructure.
+ */
+typedef struct TapeShare
+{
+ /*
+ * firstblocknumber is the first block that should be read from the
+ * materialized tape.
+ *
+ * buffilesize is the size of the associated BufFile following freezing.
+ */
+ long firstblocknumber;
+ off_t buffilesize;
+} TapeShare;
+
+/*
* prototypes for functions in logtape.c
*/
-extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes);
+extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes, TapeShare *shared,
+ SharedFileSet *fileset, int worker);
extern void LogicalTapeSetClose(LogicalTapeSet *lts);
extern void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts);
extern size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
@@ -34,7 +68,8 @@ extern void LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
extern void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum,
size_t buffer_size);
extern void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum);
-extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum);
+extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum,
+ TapeShare *share);
extern size_t LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum,
size_t size);
extern void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
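To make the tape handoff described in the comment above concrete, here is a hedged sketch of both sides of the protocol. The function names, the use of tape 0 for worker output, and the -1 leader worker number are illustrative assumptions layered on the declarations in this hunk.

#include "utils/logtape.h"

/*
 * Worker side: freeze the final materialized tape, exporting the underlying
 * shared-fileset BufFile and filling in TapeShare metadata (assumed to point
 * into shared memory) for the leader to read.
 */
static void
worker_export_tape(LogicalTapeSet *lts, TapeShare *share_out)
{
	LogicalTapeFreeze(lts, 0, share_out);
}

/*
 * Leader side: construct a unified tapeset over the workers' frozen tapes.
 * "shares" is the per-worker TapeShare array collected from shared memory;
 * the extra trailing leader tape exists but can never be written to.
 */
static LogicalTapeSet *
leader_unify_tapes(TapeShare *shares, int nworkers, SharedFileSet *fileset)
{
	return LogicalTapeSetCreate(nworkers + 1, shares, fileset, -1);
}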
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 5d57c503ab2..d2e6754f043 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -8,7 +8,8 @@
* if necessary). It works efficiently for both small and large amounts
* of data. Small amounts are sorted in-memory using qsort(). Large
* amounts are sorted using temporary files and a standard external sort
- * algorithm.
+ * algorithm. Parallel sorts use a variant of this external sort
+ * algorithm, and are typically only used for large amounts of data.
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -23,13 +24,39 @@
#include "access/itup.h"
#include "executor/tuptable.h"
#include "fmgr.h"
+#include "storage/dsm.h"
#include "utils/relcache.h"
-/* Tuplesortstate is an opaque type whose details are not known outside
- * tuplesort.c.
+/*
+ * Tuplesortstate and Sharedsort are opaque types whose details are not
+ * known outside tuplesort.c.
*/
typedef struct Tuplesortstate Tuplesortstate;
+typedef struct Sharedsort Sharedsort;
+
+/*
+ * Tuplesort parallel coordination state, allocated by each participant in
+ * local memory.  The participating caller initializes everything.  See the
+ * usage notes below.
+ */
+typedef struct SortCoordinateData
+{
+ /* Worker process? If not, must be leader. */
+ bool isWorker;
+
+ /*
+ * Number of participants the leader knows to have launched, set by the
+ * leader process (workers set this to -1).  The count includes the leader
+ * itself when it participates as a worker.
+ */
+ int nParticipants;
+
+ /* Private opaque state (points to shared memory) */
+ Sharedsort *sharedsort;
+} SortCoordinateData;
+
+typedef struct SortCoordinateData *SortCoordinate;
/*
* Data structures for reporting sort statistics. Note that
@@ -66,6 +93,8 @@ typedef struct TuplesortInstrumentation
* sorting HeapTuples and two more for sorting IndexTuples. Yet another
* API supports sorting bare Datums.
*
+ * Serial sort callers should pass NULL for their coordinate argument.
+ *
* The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
* preserve the system columns (tuple identity and transaction visibility
* info). The sort keys are specified by column numbers within the tuples
@@ -84,30 +113,107 @@ typedef struct TuplesortInstrumentation
*
* The "index_hash" API is similar to index_btree, but the tuples are
* actually sorted by their hash codes not the raw data.
+ *
+ * Parallel sort callers are required to coordinate multiple tuplesort states
+ * in a leader process and one or more worker processes. The leader process
+ * must launch workers, and have each perform an independent "partial"
+ * tuplesort, typically fed by the parallel heap interface. The leader later
+ * produces the final output (internally, it merges runs output by workers).
+ *
+ * Callers must do the following to perform a sort in parallel using multiple
+ * worker processes:
+ *
+ * 1. Request tuplesort-private shared memory for n workers. Use
+ * tuplesort_estimate_shared() to get the required size.
+ * 2. Have leader process initialize allocated shared memory using
+ * tuplesort_initialize_shared(). Launch workers.
+ * 3. Initialize a coordinate argument within both the leader process, and
+ * for each worker process. This has a pointer to the shared
+ * tuplesort-private structure, as well as some caller-initialized fields.
+ *    The leader's coordinate argument reliably indicates the number of
+ *    workers launched (this field is unused by workers).
+ * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine
+ *    (passing the coordinate argument) within each worker.  The workMem
+ * arguments need not be identical. All other arguments should match
+ * exactly, though.
+ * 5. tuplesort_attach_shared() should be called by all workers. Feed tuples
+ * to each worker, and call tuplesort_performsort() within each when input
+ * is exhausted.
+ * 6. Call tuplesort_end() in each worker process. Worker processes can shut
+ * down once tuplesort_end() returns.
+ * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
+ * routine, passing a leader-appropriate coordinate argument (this can
+ * happen as early as during step 3, actually, since we only need to know
+ * the number of workers successfully launched). The leader must now wait
+ *    for workers to finish.  Caller must use its own mechanism for ensuring that the
+ * next step isn't reached until all workers have called and returned from
+ * tuplesort_performsort(). (Note that it's okay if workers have already
+ * also called tuplesort_end() by then.)
+ * 8. Call tuplesort_performsort() in leader. Consume output using the
+ * appropriate tuplesort_get* routine. Leader can skip this step if
+ * tuplesort turns out to be unnecessary.
+ * 9. Call tuplesort_end() in leader.
+ *
+ * This division of labor assumes nothing about how input tuples are produced,
+ * but does require that caller combine the state of multiple tuplesorts for
+ * any purpose other than producing the final output. For example, callers
+ * must consider that tuplesort_get_stats() reports on only one worker's role
+ * in a sort (or the leader's role), and not statistics for the sort as a
+ * whole.
+ *
+ * Note that callers may use the leader process to sort runs as if it were an
+ * independent worker process (prior to the process performing a leader sort
+ * to produce the final sorted output). Doing so only requires a second
+ * "partial" tuplesort within the leader process, initialized like that of a
+ * worker process. The steps above don't touch on this directly. The only
+ * difference is that the tuplesort_attach_shared() call is never needed in
+ * the leader process, because the backend as a whole holds the shared fileset
+ * reference.  A worker Tuplesortstate in the leader is expected to do exactly the
+ * same amount of total initial processing work as a worker process
+ * Tuplesortstate, since the leader process has nothing else to do before
+ * workers finish.
+ *
+ * Note that only a very small amount of memory will be allocated prior to
+ * the leader state first consuming input, and that workers will free the
+ * vast majority of their memory upon returning from tuplesort_performsort().
+ * Callers can rely on this to arrange for memory to be used in a way that
+ * respects a workMem-style budget across an entire parallel sort operation.
+ *
+ * Callers are responsible for parallel safety in general. However, they
+ * can at least rely on there being no parallel safety hazards within
+ * tuplesort, because tuplesort thinks of the sort as several independent
+ * sorts whose results are combined. Since, in general, the behavior of
+ * sort operators is immutable, caller need only worry about the parallel
+ * safety of whatever the process is through which input tuples are
+ * generated (typically, caller uses a parallel heap scan).
*/
extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations,
bool *nullsFirstFlags,
- int workMem, bool randomAccess);
+ int workMem, SortCoordinate coordinate,
+ bool randomAccess);
extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
- Relation indexRel,
- int workMem, bool randomAccess);
+ Relation indexRel, int workMem,
+ SortCoordinate coordinate, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
Relation indexRel,
bool enforceUnique,
- int workMem, bool randomAccess);
+ int workMem, SortCoordinate coordinate,
+ bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
Relation indexRel,
uint32 high_mask,
uint32 low_mask,
uint32 max_buckets,
- int workMem, bool randomAccess);
+ int workMem, SortCoordinate coordinate,
+ bool randomAccess);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
Oid sortOperator, Oid sortCollation,
bool nullsFirstFlag,
- int workMem, bool randomAccess);
+ int workMem, SortCoordinate coordinate,
+ bool randomAccess);
extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
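Pulling the numbered steps above together, here is a hedged end-to-end sketch of the coordination protocol. The DSM plumbing, helper names, and the choice of the index_btree API (with enforceUnique arbitrarily true) are illustrative assumptions; only the tuplesort_* calls and SortCoordinateData fields come from this header.

#include "utils/tuplesort.h"

/* Steps 1-2, in the leader before launching workers: shm_space must have
 * been sized by the caller using tuplesort_estimate_shared(nworkers). */
static Sharedsort *
leader_setup_shared_sort(void *shm_space, int nworkers, dsm_segment *seg)
{
	Sharedsort *shared = (Sharedsort *) shm_space;

	tuplesort_initialize_shared(shared, nworkers, seg);
	return shared;
}

/* Steps 3-6, in each worker: begin, attach, feed, sort, end. */
static void
worker_partial_sort(Sharedsort *shared, dsm_segment *seg,
					Relation heap, Relation index, int sortmem)
{
	SortCoordinateData coordinate;
	Tuplesortstate *state;

	coordinate.isWorker = true;
	coordinate.nParticipants = -1;	/* workers always pass -1 */
	coordinate.sharedsort = shared;

	state = tuplesort_begin_index_btree(heap, index, true, sortmem,
										&coordinate, false);
	tuplesort_attach_shared(shared, seg);
	/* ... feed tuples, e.g. via tuplesort_putindextuplevalues() ... */
	tuplesort_performsort(state);
	tuplesort_end(state);
}

/* Steps 7-9, in the leader: nlaunched is the number of workers actually
 * launched.  All workers must have returned from tuplesort_performsort()
 * before tuplesort_performsort() is called here. */
static void
leader_merge_and_consume(Sharedsort *shared, int nlaunched,
						 Relation heap, Relation index, int sortmem)
{
	SortCoordinateData coordinate;
	Tuplesortstate *state;

	coordinate.isWorker = false;
	coordinate.nParticipants = nlaunched;
	coordinate.sharedsort = shared;

	state = tuplesort_begin_index_btree(heap, index, true, sortmem,
										&coordinate, false);
	/* ... wait for workers (caller-provided synchronization) ... */
	tuplesort_performsort(state);
	/* ... consume output, e.g. via tuplesort_getindextuple() ... */
	tuplesort_end(state);
}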
@@ -141,10 +247,16 @@ extern const char *tuplesort_space_type_name(TuplesortSpaceType t);
extern int tuplesort_merge_order(int64 allowedMem);
+extern Size tuplesort_estimate_shared(int nworkers);
+extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
+ dsm_segment *seg);
+extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
+
/*
* These routines may only be called if randomAccess was specified 'true'.
* Likewise, backwards scan in gettuple/getdatum is only allowed if
- * randomAccess was specified.
+ * randomAccess was specified. Note that parallel sorts do not support
+ * randomAccess.
*/
extern void tuplesort_rescan(Tuplesortstate *state);