diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml
index 0026318485a7..2f1f9a42f904 100644
--- a/doc/src/sgml/ref/initdb.sgml
+++ b/doc/src/sgml/ref/initdb.sgml
@@ -527,6 +527,33 @@ PostgreSQL documentation
+
+
+
+
+ By default, initdb safely writes all database files
+ to disk. This option instructs initdb to skip
+ synchronizing all files in the individual database directories, the
+ database directories themselves, and the tablespace directories, i.e.,
+ everything in the base subdirectory and any other
+ tablespace directories. Other files, such as those in
+ pg_wal and pg_xact, will still be
+ synchronized unless the --no-sync option is also
+ specified.
+
+
+ Note that if --sync-method=syncfs is used in
+ conjunction with --no-sync-data-files, some or all of
+ the aforementioned files and directories will be synchronized because
+ syncfs processes entire file systems.
+
+
+ This option is primarily intended for internal use by tools that
+ separately ensure the skipped files are synchronized to disk.
+
+
+
+
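The calling pattern this option targets can be illustrated with a short, hypothetical sketch (not part of this patch): a wrapper tool that has already fsync'd everything under base/ itself and only needs initdb to sync the remaining files. The data directory path and the use of system() are illustrative assumptions.

/* Hypothetical wrapper: it has already fsync'd the database files it
 * wrote, so it asks initdb --sync-only to skip them. */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char		cmd[4096];

	/* ... the wrapper has already synced everything under base/ ... */

	/* Sync the rest (pg_wal, pg_xact, ...), skipping the data files. */
	snprintf(cmd, sizeof(cmd),
			 "initdb --sync-only --no-sync-data-files \"%s\"",
			 "/path/to/datadir");
	if (system(cmd) != 0)
	{
		fprintf(stderr, "initdb --sync-only failed\n");
		return 1;
	}
	return 0;
}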
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 0ae40f9be58d..63cca18711a4 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -1298,6 +1298,17 @@ PostgreSQL documentation
+
+
+
+
+ Include sequence data in the dump. This is the default behavior except
+ when , , or
+ is specified.
+
+
+
+
diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml
index 5db761d1ff19..da2616190438 100644
--- a/doc/src/sgml/ref/pgupgrade.sgml
+++ b/doc/src/sgml/ref/pgupgrade.sgml
@@ -244,7 +244,8 @@ PostgreSQL documentation
Copy files to the new cluster. This is the default. (See also
- and .)
+ , ,
+ , and .)
@@ -262,6 +263,32 @@ PostgreSQL documentation
+
+
+
+
+ Move the data directories from the old cluster to the new cluster.
+ Then, replace the catalog files with those generated for the new
+ cluster. This mode can outperform ,
+ , , and
+ , especially on clusters with many
+ relations.
+
+
+ However, this mode creates many garbage files in the old cluster, which
+ can prolong the file synchronization step if
+ is used. Therefore, it is
+ recommended to use with
+ .
+
+
+ Additionally, once the file transfer step begins, the old cluster will
+ be destructively modified and therefore will no longer be safe to
+ start. See for details.
+
+
+
+
method
@@ -530,6 +557,10 @@ NET STOP postgresql-&majorversion;
is started. Clone mode also requires that the old and new data
directories be in the same file system. This mode is only available
on certain operating systems and file systems.
+ Swap mode may be the fastest if there are many relations, but you will not
+ be able to access your old cluster once the file transfer step begins.
+ Swap mode also requires that the old and new cluster data directories be
+ in the same file system.
@@ -889,6 +920,32 @@ psql --username=postgres --file=script.sql postgres
+
+
+
+ If the --swap option was used, the old cluster might
+ be destructively modified:
+
+
+
+
+ If pg_upgrade aborts before reporting that the
+ old cluster is no longer safe to start, the old cluster was
+ unmodified; it can be restarted.
+
+
+
+
+
+ If pg_upgrade has reported that the old cluster
+ is no longer safe to start, the old cluster was destructively
+ modified. The old cluster will need to be restored from backup in
+ this case.
+
+
+
+
+
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 21a0fe3ecd97..22b7d31b1654 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -168,6 +168,7 @@ static bool data_checksums = true;
static char *xlog_dir = NULL;
static int wal_segment_size_mb = (DEFAULT_XLOG_SEG_SIZE) / (1024 * 1024);
static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+static bool sync_data_files = true;
/* internal vars */
@@ -2566,6 +2567,7 @@ usage(const char *progname)
printf(_(" -L DIRECTORY where to find the input files\n"));
printf(_(" -n, --no-clean do not clean up after errors\n"));
printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
+ printf(_(" --no-sync-data-files do not sync files within database directories\n"));
printf(_(" --no-instructions do not print instructions for next steps\n"));
printf(_(" -s, --show show internal settings, then exit\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
@@ -3208,6 +3210,7 @@ main(int argc, char *argv[])
{"icu-rules", required_argument, NULL, 18},
{"sync-method", required_argument, NULL, 19},
{"no-data-checksums", no_argument, NULL, 20},
+ {"no-sync-data-files", no_argument, NULL, 21},
{NULL, 0, NULL, 0}
};
@@ -3402,6 +3405,9 @@ main(int argc, char *argv[])
case 20:
data_checksums = false;
break;
+ case 21:
+ sync_data_files = false;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -3453,7 +3459,7 @@ main(int argc, char *argv[])
fputs(_("syncing data to disk ... "), stdout);
fflush(stdout);
- sync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
+ sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files);
check_ok();
return 0;
}
@@ -3516,7 +3522,7 @@ main(int argc, char *argv[])
{
fputs(_("syncing data to disk ... "), stdout);
fflush(stdout);
- sync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
+ sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files);
check_ok();
}
else
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index 01cc4a1602b8..15dd10ce40a3 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -76,6 +76,7 @@
'checksums are enabled in control file');
command_ok([ 'initdb', '--sync-only', $datadir ], 'sync only');
+command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], '--no-sync-data-files');
command_fails([ 'initdb', $datadir ], 'existing data directory');
if ($supports_syncfs)
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index d4b4e3340143..1da4bfc2351e 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -2310,7 +2310,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail,
}
else
{
- (void) sync_pgdata(basedir, serverVersion, sync_method);
+ (void) sync_pgdata(basedir, serverVersion, sync_method, true);
}
}
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 867aeddc601f..f20be82862a2 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -633,7 +633,7 @@ main(int argc, char *argv[])
if (do_sync)
{
pg_log_info("syncing data directory");
- sync_pgdata(DataDir, PG_VERSION_NUM, sync_method);
+ sync_pgdata(DataDir, PG_VERSION_NUM, sync_method, true);
}
pg_log_info("updating control file");
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index d480dc74436e..050260ee832a 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -424,7 +424,7 @@ main(int argc, char *argv[])
else
{
pg_log_debug("recursively fsyncing \"%s\"", opt.output);
- sync_pgdata(opt.output, version * 10000, opt.sync_method);
+ sync_pgdata(opt.output, version * 10000, opt.sync_method, true);
}
}
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 428ed2d60fca..e6253331e273 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -518,6 +518,7 @@ main(int argc, char **argv)
{"sync-method", required_argument, NULL, 15},
{"filter", required_argument, NULL, 16},
{"exclude-extension", required_argument, NULL, 17},
+ {"sequence-data", no_argument, &dopt.sequence_data, 1},
{NULL, 0, NULL, 0}
};
@@ -801,14 +802,6 @@ main(int argc, char **argv)
if (dopt.column_inserts && dopt.dump_inserts == 0)
dopt.dump_inserts = DUMP_DEFAULT_ROWS_PER_INSERT;
- /*
- * Binary upgrade mode implies dumping sequence data even in schema-only
- * mode. This is not exposed as a separate option, but kept separate
- * internally for clarity.
- */
- if (dopt.binary_upgrade)
- dopt.sequence_data = 1;
-
if (data_only && schema_only)
pg_fatal("options -s/--schema-only and -a/--data-only cannot be used together");
if (schema_only && statistics_only)
@@ -1275,6 +1268,7 @@ help(const char *progname)
printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n"));
printf(_(" --rows-per-insert=NROWS number of rows per INSERT; implies --inserts\n"));
printf(_(" --section=SECTION dump named section (pre-data, data, or post-data)\n"));
+ printf(_(" --sequence-data include sequence data in dump\n"));
printf(_(" --serializable-deferrable wait until the dump can run without anomalies\n"));
printf(_(" --snapshot=SNAPSHOT use given snapshot for the dump\n"));
printf(_(" --statistics-only dump only the statistics, not schema or data\n"));
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index d281e27aa677..ed379033da73 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -66,6 +66,7 @@
'--file' => "$tempdir/binary_upgrade.dump",
'--no-password',
'--no-data',
+ '--sequence-data',
'--binary-upgrade',
'--dbname' => 'postgres', # alternative way to specify database
],
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 467845419eda..55659ce201f4 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -296,7 +296,7 @@ sync_target_dir(void)
if (!do_sync || dry_run)
return;
- sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method);
+ sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method, true);
}
diff --git a/src/bin/pg_upgrade/TESTING b/src/bin/pg_upgrade/TESTING
index 00842ac6ec3a..c3d463c9c292 100644
--- a/src/bin/pg_upgrade/TESTING
+++ b/src/bin/pg_upgrade/TESTING
@@ -20,13 +20,13 @@ export oldinstall=...otherversion/ (old version's install base path)
See DETAILS below for more information about creation of the dump.
You can also test the different transfer modes (--copy, --link,
---clone, --copy-file-range) by setting the environment variable
+--clone, --copy-file-range, --swap) by setting the environment variable
PG_TEST_PG_UPGRADE_MODE to the respective command-line option, like
make check PG_TEST_PG_UPGRADE_MODE=--link
-The default is --copy. Note that the other modes are not supported on
-all operating systems.
+The default is --copy. Note that not all modes are supported on all
+operating systems.
DETAILS
-------
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 117f461d46a1..02d9146e5ed7 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -709,7 +709,34 @@ check_new_cluster(void)
check_copy_file_range();
break;
case TRANSFER_MODE_LINK:
- check_hard_link();
+ check_hard_link(TRANSFER_MODE_LINK);
+ break;
+ case TRANSFER_MODE_SWAP:
+
+ /*
+ * We do the hard link check for --swap, too, since it's an easy
+ * way to verify the clusters are in the same file system. This
+ * allows us to take some shortcuts in the file synchronization
+ * step. With some more effort, we could probably support the
+ * separate-file-system use case, but this mode is unlikely to
+ * offer much benefit if we have to copy the files across file
+ * system boundaries.
+ */
+ check_hard_link(TRANSFER_MODE_SWAP);
+
+ /*
+ * There are a few known issues with using --swap to upgrade from
+ * versions older than 10. For example, the sequence tuple format
+ * changed in v10, and the visibility map format changed in 9.6.
+ * While such problems are not insurmountable (and we may have to
+ * deal with similar problems in the future, anyway), it doesn't
+ * seem worth the effort to support swap mode for upgrades from
+ * long-unsupported versions.
+ */
+ if (GET_MAJOR_VERSION(old_cluster.major_version) < 1000)
+ pg_fatal("Swap mode can only upgrade clusters from PostgreSQL version %s and later.",
+ "10");
+
break;
}
diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c
index bd49ea867bfc..47ee27ec8354 100644
--- a/src/bin/pg_upgrade/controldata.c
+++ b/src/bin/pg_upgrade/controldata.c
@@ -751,7 +751,7 @@ check_control_data(ControlData *oldctrl,
void
-disable_old_cluster(void)
+disable_old_cluster(transferMode transfer_mode)
{
char old_path[MAXPGPATH],
new_path[MAXPGPATH];
@@ -766,10 +766,17 @@ disable_old_cluster(void)
old_path, new_path);
check_ok();
- pg_log(PG_REPORT, "\n"
- "If you want to start the old cluster, you will need to remove\n"
- "the \".old\" suffix from %s/global/pg_control.old.\n"
- "Because \"link\" mode was used, the old cluster cannot be safely\n"
- "started once the new cluster has been started.",
- old_cluster.pgdata);
+ if (transfer_mode == TRANSFER_MODE_LINK)
+ pg_log(PG_REPORT, "\n"
+ "If you want to start the old cluster, you will need to remove\n"
+ "the \".old\" suffix from %s/global/pg_control.old.\n"
+ "Because \"link\" mode was used, the old cluster cannot be safely\n"
+ "started once the new cluster has been started.",
+ old_cluster.pgdata);
+ else if (transfer_mode == TRANSFER_MODE_SWAP)
+ pg_log(PG_REPORT, "\n"
+ "Because \"swap\" mode was used, the old cluster can no longer be\n"
+ "safely started.");
+ else
+ pg_fatal("unrecognized transfer mode");
}
diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c
index 23fe7280a161..23cb08e83476 100644
--- a/src/bin/pg_upgrade/dump.c
+++ b/src/bin/pg_upgrade/dump.c
@@ -52,9 +52,11 @@ generate_old_dump(void)
snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid);
parallel_exec_prog(log_file_name, NULL,
- "\"%s/pg_dump\" %s --no-data %s --quote-all-identifiers "
+ "\"%s/pg_dump\" %s --no-data %s %s --quote-all-identifiers "
"--binary-upgrade --format=custom %s --no-sync --file=\"%s/%s\" %s",
new_cluster.bindir, cluster_conn_opts(&old_cluster),
+ (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ?
+ "" : "--sequence-data",
log_opts.verbose ? "--verbose" : "",
user_opts.do_statistics ? "" : "--no-statistics",
log_opts.dumpdir,
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 7fd1991204ae..91ed16acb088 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -434,7 +434,7 @@ check_copy_file_range(void)
}
void
-check_hard_link(void)
+check_hard_link(transferMode transfer_mode)
{
char existing_file[MAXPGPATH];
char new_link_file[MAXPGPATH];
@@ -444,8 +444,16 @@ check_hard_link(void)
unlink(new_link_file); /* might fail */
if (link(existing_file, new_link_file) < 0)
- pg_fatal("could not create hard link between old and new data directories: %m\n"
- "In link mode the old and new data directories must be on the same file system.");
+ {
+ if (transfer_mode == TRANSFER_MODE_LINK)
+ pg_fatal("could not create hard link between old and new data directories: %m\n"
+ "In link mode the old and new data directories must be on the same file system.");
+ else if (transfer_mode == TRANSFER_MODE_SWAP)
+ pg_fatal("could not create hard link between old and new data directories: %m\n"
+ "In swap mode the old and new data directories must be on the same file system.");
+ else
+ pg_fatal("unrecognized transfer mode");
+ }
unlink(new_link_file);
}
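The probe that check_hard_link() performs generalizes to any same-file-system test. A standalone sketch of the idea (illustrative function name, paths, and error handling; not part of this patch):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Try to hard-link a file that lives in one directory to a path in the
 * other; failure (typically EXDEV) means the two directories are not on
 * the same file system. */
static bool
dirs_on_same_filesystem(const char *existing_file, const char *new_link_file)
{
	(void) unlink(new_link_file);	/* might not exist; ignore */
	if (link(existing_file, new_link_file) < 0)
	{
		fprintf(stderr, "link failed: %s\n", strerror(errno));
		return false;
	}
	(void) unlink(new_link_file);
	return true;
}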
diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index ad52de8b607e..4b7a56f5b3be 100644
--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -490,7 +490,7 @@ get_rel_infos_query(void)
" FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n "
" ON c.relnamespace = n.oid "
" WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", "
- CppAsString2(RELKIND_MATVIEW) ") AND "
+ CppAsString2(RELKIND_MATVIEW) "%s) AND "
/* exclude possible orphaned temp tables */
" ((n.nspname !~ '^pg_temp_' AND "
" n.nspname !~ '^pg_toast_temp_' AND "
@@ -499,6 +499,8 @@ get_rel_infos_query(void)
" c.oid >= %u::pg_catalog.oid) OR "
" (n.nspname = 'pg_catalog' AND "
" relname IN ('pg_largeobject') ))), ",
+ (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ?
+ ", " CppAsString2(RELKIND_SEQUENCE) : "",
FirstNormalObjectId);
/*
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index 188dd8d8a8ba..7fd7f1d33fcb 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -62,6 +62,7 @@ parseCommandLine(int argc, char *argv[])
{"sync-method", required_argument, NULL, 4},
{"no-statistics", no_argument, NULL, 5},
{"set-char-signedness", required_argument, NULL, 6},
+ {"swap", no_argument, NULL, 7},
{NULL, 0, NULL, 0}
};
@@ -228,6 +229,11 @@ parseCommandLine(int argc, char *argv[])
else
pg_fatal("invalid argument for option %s", "--set-char-signedness");
break;
+
+ case 7:
+ user_opts.transfer_mode = TRANSFER_MODE_SWAP;
+ break;
+
default:
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
os_info.progname);
@@ -325,6 +331,7 @@ usage(void)
printf(_(" --no-statistics do not import statistics from old cluster\n"));
printf(_(" --set-char-signedness=OPTION set new cluster char signedness to \"signed\" or\n"
" \"unsigned\"\n"));
+ printf(_(" --swap move data directories to new cluster\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\n"
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 174cd920840c..9295e46aed3e 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -170,12 +170,14 @@ main(int argc, char **argv)
/*
* Most failures happen in create_new_objects(), which has completed at
- * this point. We do this here because it is just before linking, which
- * will link the old and new cluster data files, preventing the old
- * cluster from being safely started once the new cluster is started.
+ * this point. We do this here because it is just before file transfer,
+ * which for --link will make it unsafe to start the old cluster once the
+ * new cluster is started, and for --swap will make it unsafe to start the
+ * old cluster at all.
*/
- if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
- disable_old_cluster();
+ if (user_opts.transfer_mode == TRANSFER_MODE_LINK ||
+ user_opts.transfer_mode == TRANSFER_MODE_SWAP)
+ disable_old_cluster(user_opts.transfer_mode);
transfer_all_new_tablespaces(&old_cluster.dbarr, &new_cluster.dbarr,
old_cluster.pgdata, new_cluster.pgdata);
@@ -212,8 +214,10 @@ main(int argc, char **argv)
{
prep_status("Sync data directory to disk");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
- "\"%s/initdb\" --sync-only \"%s\" --sync-method %s",
+ "\"%s/initdb\" --sync-only %s \"%s\" --sync-method %s",
new_cluster.bindir,
+ (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ?
+ "--no-sync-data-files" : "",
new_cluster.pgdata,
user_opts.sync_method);
check_ok();
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 4c9d01721491..69c965bb7d09 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -262,6 +262,7 @@ typedef enum
TRANSFER_MODE_COPY,
TRANSFER_MODE_COPY_FILE_RANGE,
TRANSFER_MODE_LINK,
+ TRANSFER_MODE_SWAP,
} transferMode;
/*
@@ -391,7 +392,7 @@ void create_script_for_old_cluster_deletion(char **deletion_script_file_name);
void get_control_data(ClusterInfo *cluster);
void check_control_data(ControlData *oldctrl, ControlData *newctrl);
-void disable_old_cluster(void);
+void disable_old_cluster(transferMode transfer_mode);
/* dump.c */
@@ -423,7 +424,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
const char *schemaName, const char *relName);
void check_file_clone(void);
void check_copy_file_range(void);
-void check_hard_link(void);
+void check_hard_link(transferMode transfer_mode);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c
index 8c23c583172a..c0affa5565c5 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -11,11 +11,92 @@
#include
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "common/int.h"
+#include "common/logging.h"
#include "pg_upgrade.h"
static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit);
+/*
+ * The following set of sync_queue_* functions are used for --swap to reduce
+ * the amount of time spent synchronizing the swapped catalog files. When a
+ * file is added to the queue, we also alert the file system that we'd like it
+ * to be persisted to disk in the near future (if that operation is supported
+ * by the current platform). Once the queue is full, all of the files are
+ * synchronized to disk. This strategy should generally be much faster than
+ * simply calling fsync() on the files right away.
+ *
+ * The general usage pattern should be something like:
+ *
+ * for (int i = 0; i < num_files; i++)
+ * sync_queue_push(files[i]);
+ *
+ * // be sure to sync any remaining files in the queue
+ * sync_queue_sync_all();
+ * sync_queue_destroy();
+ */
+
+#define SYNC_QUEUE_MAX_LEN (1024)
+
+static char *sync_queue[SYNC_QUEUE_MAX_LEN];
+static bool sync_queue_inited;
+static int sync_queue_len;
+
+static inline void
+sync_queue_init(void)
+{
+ if (sync_queue_inited)
+ return;
+
+ sync_queue_inited = true;
+ for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++)
+ sync_queue[i] = palloc(MAXPGPATH);
+}
+
+static inline void
+sync_queue_sync_all(void)
+{
+ if (!sync_queue_inited)
+ return;
+
+ for (int i = 0; i < sync_queue_len; i++)
+ {
+ if (fsync_fname(sync_queue[i], false) != 0)
+ pg_fatal("could not synchronize file \"%s\": %m", sync_queue[i]);
+ }
+
+ sync_queue_len = 0;
+}
+
+static inline void
+sync_queue_push(const char *fname)
+{
+ sync_queue_init();
+
+ pre_sync_fname(fname, false);
+
+ strncpy(sync_queue[sync_queue_len++], fname, MAXPGPATH);
+ if (sync_queue_len >= SYNC_QUEUE_MAX_LEN)
+ sync_queue_sync_all();
+}
+
+static inline void
+sync_queue_destroy(void)
+{
+ if (!sync_queue_inited)
+ return;
+
+ sync_queue_inited = false;
+ sync_queue_len = 0;
+ for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++)
+ {
+ pfree(sync_queue[i]);
+ sync_queue[i] = NULL;
+ }
+}
/*
* transfer_all_new_tablespaces()
@@ -41,6 +122,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
case TRANSFER_MODE_LINK:
prep_status_progress("Linking user relation files");
break;
+ case TRANSFER_MODE_SWAP:
+ prep_status_progress("Swapping data directories");
+ break;
}
/*
@@ -125,6 +209,278 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
/* We allocate something even for n_maps == 0 */
pg_free(mappings);
}
+
+ /*
+ * Make sure anything pending synchronization in swap mode is fully
+ * persisted to disk. This is a no-op for other transfer modes.
+ */
+ sync_queue_sync_all();
+ sync_queue_destroy();
+}
+
+/*
+ * prepare_for_swap()
+ *
+ * This function moves the database directory from the old cluster to the new
+ * cluster in preparation for moving the pg_restore-generated catalog files
+ * into place. Returns false if the database with the given OID does not have
+ * a directory in the given tablespace, otherwise returns true.
+ *
+ * old_cat (the directory for the old catalog files), new_dat (the database
+ * directory in the new cluster), and moved_dat (the destination for the
+ * pg_restore-generated database directory) should be sized to MAXPGPATH bytes.
+ * This function will return the appropriate paths in those variables.
+ */
+static bool
+prepare_for_swap(const char *old_tablespace, Oid db_oid,
+ char *old_cat, char *new_dat, char *moved_dat)
+{
+ const char *new_tablespace;
+ const char *old_tblspc_suffix;
+ const char *new_tblspc_suffix;
+ char old_tblspc[MAXPGPATH];
+ char new_tblspc[MAXPGPATH];
+ char moved_tblspc[MAXPGPATH];
+ char old_dat[MAXPGPATH];
+ struct stat st;
+
+ if (strcmp(old_tablespace, old_cluster.pgdata) == 0)
+ {
+ new_tablespace = new_cluster.pgdata;
+ new_tblspc_suffix = "/base";
+ old_tblspc_suffix = "/base";
+ }
+ else
+ {
+ /*
+ * XXX: The below line is a hack to deal with the fact that we
+ * presently don't have an easy way to find the corresponding new
+ * tablespace's path. This will need to be fixed if/when we add
+ * pg_upgrade support for in-place tablespaces.
+ */
+ new_tablespace = old_tablespace;
+
+ new_tblspc_suffix = new_cluster.tablespace_suffix;
+ old_tblspc_suffix = old_cluster.tablespace_suffix;
+ }
+
+ /* Old and new cluster paths. */
+ snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", old_tablespace, old_tblspc_suffix);
+ snprintf(new_tblspc, sizeof(new_tblspc), "%s%s", new_tablespace, new_tblspc_suffix);
+ snprintf(old_dat, sizeof(old_dat), "%s/%u", old_tblspc, db_oid);
+ snprintf(new_dat, MAXPGPATH, "%s/%u", new_tblspc, db_oid);
+
+ /*
+ * Paths for "moved aside" stuff. We intentionally put these in the old
+ * cluster so that the delete_old_cluster.{sh,bat} script handles them.
+ */
+ snprintf(moved_tblspc, sizeof(moved_tblspc), "%s/moved_for_upgrade", old_tblspc);
+ snprintf(old_cat, MAXPGPATH, "%s/%u_old_catalogs", moved_tblspc, db_oid);
+ snprintf(moved_dat, MAXPGPATH, "%s/%u", moved_tblspc, db_oid);
+
+ /* Check that the database directory exists in the given tablespace. */
+ if (stat(old_dat, &st) != 0)
+ {
+ if (errno != ENOENT)
+ pg_fatal("could not stat file \"%s\": %m", old_dat);
+ return false;
+ }
+
+ /* Create directory for stuff that is moved aside. */
+ if (pg_mkdir_p(moved_tblspc, pg_dir_create_mode) != 0 && errno != EEXIST)
+ pg_fatal("could not create directory \"%s\"", moved_tblspc);
+
+ /* Create directory for old catalog files. */
+ if (pg_mkdir_p(old_cat, pg_dir_create_mode) != 0)
+ pg_fatal("could not create directory \"%s\"", old_cat);
+
+ /* Move the new cluster's database directory aside. */
+ if (rename(new_dat, moved_dat) != 0)
+ pg_fatal("could not rename \"%s\" to \"%s\"", new_dat, moved_dat);
+
+ /* Move the old cluster's database directory into place. */
+ if (rename(old_dat, new_dat) != 0)
+ pg_fatal("could not rename \"%s\" to \"%s\"", old_dat, new_dat);
+
+ return true;
+}
+
+/*
+ * FileNameMapCmp()
+ *
+ * qsort() comparator for FileNameMap that sorts by RelFileNumber.
+ */
+static int
+FileNameMapCmp(const void *a, const void *b)
+{
+ const FileNameMap *map1 = (const FileNameMap *) a;
+ const FileNameMap *map2 = (const FileNameMap *) b;
+
+ return pg_cmp_u32(map1->relfilenumber, map2->relfilenumber);
+}
+
+/*
+ * parse_relfilenumber()
+ *
+ * Attempt to parse the RelFileNumber of the given file name. If we can't,
+ * return InvalidRelFileNumber. Note that this code snippet is lifted from
+ * parse_filename_for_nontemp_relation().
+ */
+static RelFileNumber
+parse_relfilenumber(const char *filename)
+{
+ char *endp;
+ unsigned long n;
+
+ if (filename[0] < '1' || filename[0] > '9')
+ return InvalidRelFileNumber;
+
+ errno = 0;
+ n = strtoul(filename, &endp, 10);
+ if (errno || filename == endp || n <= 0 || n > PG_UINT32_MAX)
+ return InvalidRelFileNumber;
+
+ return (RelFileNumber) n;
+}
+
+/*
+ * swap_catalog_files()
+ *
+ * Moves the old catalog files aside, and moves the new catalog files into
+ * place. prepare_for_swap() should have already been called (and returned
+ * true) for the tablespace being transferred. old_cat (the directory for the
+ * old catalog files), new_dat (the database directory in the new cluster), and
+ * moved_dat (the location of the moved-aside pg_restore-generated database
+ * directory) should be the variables returned by prepare_for_swap().
+ */
+static void
+swap_catalog_files(FileNameMap *maps, int size, const char *old_cat,
+ const char *new_dat, const char *moved_dat)
+{
+ DIR *dir;
+ struct dirent *de;
+ char path[MAXPGPATH];
+ char dest[MAXPGPATH];
+ RelFileNumber rfn;
+
+ /* Move the old catalog files aside. */
+ dir = opendir(new_dat);
+ if (dir == NULL)
+ pg_fatal("could not open directory \"%s\": %m", new_dat);
+ while (errno = 0, (de = readdir(dir)) != NULL)
+ {
+ snprintf(path, sizeof(path), "%s/%s", new_dat, de->d_name);
+ if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG)
+ continue;
+
+ rfn = parse_relfilenumber(de->d_name);
+ if (RelFileNumberIsValid(rfn))
+ {
+ FileNameMap key = {.relfilenumber = rfn};
+
+ if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp))
+ continue;
+ }
+
+ snprintf(dest, sizeof(dest), "%s/%s", old_cat, de->d_name);
+ if (rename(path, dest) != 0)
+ pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest);
+ }
+ if (errno)
+ pg_fatal("could not read directory \"%s\": %m", new_dat);
+ (void) closedir(dir);
+
+ /* Move the new catalog files into place. */
+ dir = opendir(moved_dat);
+ if (dir == NULL)
+ pg_fatal("could not open directory \"%s\": %m", moved_dat);
+ while (errno = 0, (de = readdir(dir)) != NULL)
+ {
+ snprintf(path, sizeof(path), "%s/%s", moved_dat, de->d_name);
+ if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG)
+ continue;
+
+ rfn = parse_relfilenumber(de->d_name);
+ if (RelFileNumberIsValid(rfn))
+ {
+ FileNameMap key = {.relfilenumber = rfn};
+
+ if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp))
+ continue;
+ }
+
+ snprintf(dest, sizeof(dest), "%s/%s", new_dat, de->d_name);
+ if (rename(path, dest) != 0)
+ pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest);
+
+ /*
+ * We don't fsync() the database files in the file synchronization
+ * stage of pg_upgrade in swap mode, so we need to synchronize them
+ * ourselves. We only do this for the catalog files because they were
+ * created during pg_restore with fsync=off. We assume that the user
+ * data files were properly persisted to disk when the old cluster was
+ * last shut down.
+ */
+ if (user_opts.do_sync)
+ sync_queue_push(dest);
+ }
+ if (errno)
+ pg_fatal("could not read directory \"%s\": %m", moved_dat);
+ (void) closedir(dir);
+
+ /* Ensure the directory entries are persisted to disk. */
+ if (fsync_fname(new_dat, true) != 0)
+ pg_fatal("could not synchronize directory \"%s\": %m", new_dat);
+ if (fsync_parent_path(new_dat) != 0)
+ pg_fatal("could not synchronize parent directory of \"%s\": %m", new_dat);
+}
+
+/*
+ * do_swap()
+ *
+ * Perform the required steps for --swap for a single database. In short this
+ * moves the old cluster's database directory into the new cluster and then
+ * replaces any files for system catalogs with the ones that were generated
+ * during pg_restore.
+ */
+static void
+do_swap(FileNameMap *maps, int size, char *old_tablespace)
+{
+ char old_cat[MAXPGPATH];
+ char new_dat[MAXPGPATH];
+ char moved_dat[MAXPGPATH];
+
+ /*
+ * We perform many lookups on maps by relfilenumber in swap mode, so make
+ * sure it's sorted by relfilenumber. maps should already be sorted by
+ * OID, so in general this shouldn't have much work to do.
+ */
+ qsort(maps, size, sizeof(FileNameMap), FileNameMapCmp);
+
+ /*
+ * If an old tablespace is given, we only need to process that one. If no
+ * old tablespace is specified, we need to process all the tablespaces on
+ * the system.
+ */
+ if (old_tablespace)
+ {
+ if (prepare_for_swap(old_tablespace, maps[0].db_oid,
+ old_cat, new_dat, moved_dat))
+ swap_catalog_files(maps, size, old_cat, new_dat, moved_dat);
+ }
+ else
+ {
+ if (prepare_for_swap(old_cluster.pgdata, maps[0].db_oid,
+ old_cat, new_dat, moved_dat))
+ swap_catalog_files(maps, size, old_cat, new_dat, moved_dat);
+
+ for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++)
+ {
+ if (prepare_for_swap(os_info.old_tablespaces[tblnum], maps[0].db_oid,
+ old_cat, new_dat, moved_dat))
+ swap_catalog_files(maps, size, old_cat, new_dat, moved_dat);
+ }
+ }
}
/*
@@ -145,6 +501,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER)
vm_must_add_frozenbit = true;
+ /* --swap has its own subroutine */
+ if (user_opts.transfer_mode == TRANSFER_MODE_SWAP)
+ {
+ /*
+ * We don't support --swap to upgrade from versions that require
+ * rewriting the visibility map. We should've failed already if
+ * someone tries to do that.
+ */
+ Assert(!vm_must_add_frozenbit);
+
+ do_swap(maps, size, old_tablespace);
+ return;
+ }
+
for (mapnum = 0; mapnum < size; mapnum++)
{
if (old_tablespace == NULL ||
@@ -259,6 +629,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"",
old_file, new_file);
linkFile(old_file, new_file, map->nspname, map->relname);
+ break;
+ case TRANSFER_MODE_SWAP:
+ /* swap mode is handled in its own code path */
+ pg_fatal("should never happen");
+ break;
}
}
}
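The sync_queue_* helpers above implement a "hint writeback early, fsync in batches" pattern. A self-contained sketch of the same idea using only libc (a simplification with no error reporting, a Linux-only writeback hint, and illustrative buffer sizes; not the pg_upgrade code itself):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define QUEUE_MAX 1024

static char queue[QUEUE_MAX][4096];
static int	queue_len;

static void
flush_queue(void)
{
	for (int i = 0; i < queue_len; i++)
	{
		int			fd = open(queue[i], O_RDONLY);

		if (fd < 0)
			continue;			/* a real tool would report this */
		(void) fsync(fd);
		(void) close(fd);
	}
	queue_len = 0;
}

static void
queue_push(const char *path)
{
	int			fd = open(path, O_RDONLY);

	if (fd >= 0)
	{
#ifdef __linux__
		/* start kernel writeback now; failures are only a missed hint */
		(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#endif
		(void) close(fd);
	}

	strncpy(queue[queue_len], path, sizeof(queue[0]) - 1);
	queue[queue_len][sizeof(queue[0]) - 1] = '\0';
	if (++queue_len >= QUEUE_MAX)
		flush_queue();
}

In this sketch a caller would push each freshly moved file and call flush_queue() once after the loop, mirroring the usage pattern described in the comment at the top of the new sync_queue_* code.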
diff --git a/src/bin/pg_upgrade/t/006_transfer_modes.pl b/src/bin/pg_upgrade/t/006_transfer_modes.pl
index 518e09941459..34fddbcdab57 100644
--- a/src/bin/pg_upgrade/t/006_transfer_modes.pl
+++ b/src/bin/pg_upgrade/t/006_transfer_modes.pl
@@ -16,6 +16,15 @@ sub test_mode
my $old = PostgreSQL::Test::Cluster->new('old', install_path => $ENV{oldinstall});
my $new = PostgreSQL::Test::Cluster->new('new');
+ # --swap can't be used to upgrade from versions older than 10, so just skip
+ # the test if the old cluster version is too old.
+ if ($old->pg_version < 10 && $mode eq "--swap")
+ {
+ $old->clean_node();
+ $new->clean_node();
+ return;
+ }
+
if (defined($ENV{oldinstall}))
{
# Checksums are now enabled by default, but weren't before 18, so pass
@@ -97,5 +106,6 @@ sub test_mode
test_mode('--copy');
test_mode('--copy-file-range');
test_mode('--link');
+test_mode('--swap');
done_testing();
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index eaa2e76f43f5..7b62687a2aa7 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -45,12 +45,10 @@
*/
#define MINIMUM_VERSION_FOR_PG_WAL 100000
-#ifdef PG_FLUSH_DATA_WORKS
-static int pre_sync_fname(const char *fname, bool isdir);
-#endif
static void walkdir(const char *path,
int (*action) (const char *fname, bool isdir),
- bool process_symlinks);
+ bool process_symlinks,
+ const char *exclude_dir);
#ifdef HAVE_SYNCFS
@@ -93,11 +91,15 @@ do_syncfs(const char *path)
* syncing, and might not have privileges to write at all.
*
* serverVersion indicates the version of the server to be sync'd.
+ *
+ * If sync_data_files is false, this function skips syncing "base/" and any
+ * other tablespace directories.
*/
void
sync_pgdata(const char *pg_data,
int serverVersion,
- DataDirSyncMethod sync_method)
+ DataDirSyncMethod sync_method,
+ bool sync_data_files)
{
bool xlog_is_symlink;
char pg_wal[MAXPGPATH];
@@ -147,30 +149,33 @@ sync_pgdata(const char *pg_data,
do_syncfs(pg_data);
/* If any tablespaces are configured, sync each of those. */
- dir = opendir(pg_tblspc);
- if (dir == NULL)
- pg_log_error("could not open directory \"%s\": %m",
- pg_tblspc);
- else
+ if (sync_data_files)
{
- while (errno = 0, (de = readdir(dir)) != NULL)
+ dir = opendir(pg_tblspc);
+ if (dir == NULL)
+ pg_log_error("could not open directory \"%s\": %m",
+ pg_tblspc);
+ else
{
- char subpath[MAXPGPATH * 2];
+ while (errno = 0, (de = readdir(dir)) != NULL)
+ {
+ char subpath[MAXPGPATH * 2];
- if (strcmp(de->d_name, ".") == 0 ||
- strcmp(de->d_name, "..") == 0)
- continue;
+ if (strcmp(de->d_name, ".") == 0 ||
+ strcmp(de->d_name, "..") == 0)
+ continue;
- snprintf(subpath, sizeof(subpath), "%s/%s",
- pg_tblspc, de->d_name);
- do_syncfs(subpath);
- }
+ snprintf(subpath, sizeof(subpath), "%s/%s",
+ pg_tblspc, de->d_name);
+ do_syncfs(subpath);
+ }
- if (errno)
- pg_log_error("could not read directory \"%s\": %m",
- pg_tblspc);
+ if (errno)
+ pg_log_error("could not read directory \"%s\": %m",
+ pg_tblspc);
- (void) closedir(dir);
+ (void) closedir(dir);
+ }
}
/* If pg_wal is a symlink, process that too. */
@@ -182,15 +187,21 @@ sync_pgdata(const char *pg_data,
case DATA_DIR_SYNC_METHOD_FSYNC:
{
+ char *exclude_dir = NULL;
+
+ if (!sync_data_files)
+ exclude_dir = psprintf("%s/base", pg_data);
+
/*
* If possible, hint to the kernel that we're soon going to
* fsync the data directory and its contents.
*/
#ifdef PG_FLUSH_DATA_WORKS
- walkdir(pg_data, pre_sync_fname, false);
+ walkdir(pg_data, pre_sync_fname, false, exclude_dir);
if (xlog_is_symlink)
- walkdir(pg_wal, pre_sync_fname, false);
- walkdir(pg_tblspc, pre_sync_fname, true);
+ walkdir(pg_wal, pre_sync_fname, false, NULL);
+ if (sync_data_files)
+ walkdir(pg_tblspc, pre_sync_fname, true, NULL);
#endif
/*
@@ -203,10 +214,14 @@ sync_pgdata(const char *pg_data,
* get fsync'd twice. That's not an expected case so we don't
* worry about optimizing it.
*/
- walkdir(pg_data, fsync_fname, false);
+ walkdir(pg_data, fsync_fname, false, exclude_dir);
if (xlog_is_symlink)
- walkdir(pg_wal, fsync_fname, false);
- walkdir(pg_tblspc, fsync_fname, true);
+ walkdir(pg_wal, fsync_fname, false, NULL);
+ if (sync_data_files)
+ walkdir(pg_tblspc, fsync_fname, true, NULL);
+
+ if (exclude_dir)
+ pfree(exclude_dir);
}
break;
}
@@ -245,10 +260,10 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
* fsync the data directory and its contents.
*/
#ifdef PG_FLUSH_DATA_WORKS
- walkdir(dir, pre_sync_fname, false);
+ walkdir(dir, pre_sync_fname, false, NULL);
#endif
- walkdir(dir, fsync_fname, false);
+ walkdir(dir, fsync_fname, false, NULL);
}
break;
}
@@ -264,6 +279,9 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
* ignored in subdirectories, ie we intentionally don't pass down the
* process_symlinks flag to recursive calls.
*
+ * If exclude_dir is not NULL, it specifies a directory path to skip
+ * processing.
+ *
* Errors are reported but not considered fatal.
*
* See also walkdir in fd.c, which is a backend version of this logic.
@@ -271,11 +289,15 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
static void
walkdir(const char *path,
int (*action) (const char *fname, bool isdir),
- bool process_symlinks)
+ bool process_symlinks,
+ const char *exclude_dir)
{
DIR *dir;
struct dirent *de;
+ if (exclude_dir && strcmp(exclude_dir, path) == 0)
+ return;
+
dir = opendir(path);
if (dir == NULL)
{
@@ -299,7 +321,7 @@ walkdir(const char *path,
(*action) (subpath, false);
break;
case PGFILETYPE_DIR:
- walkdir(subpath, action, false);
+ walkdir(subpath, action, false, exclude_dir);
break;
default:
@@ -327,16 +349,16 @@ walkdir(const char *path,
}
/*
- * Hint to the OS that it should get ready to fsync() this file.
+ * Hint to the OS that it should get ready to fsync() this file, if supported
+ * by the platform.
*
* Ignores errors trying to open unreadable files, and reports other errors
* non-fatally.
*/
-#ifdef PG_FLUSH_DATA_WORKS
-
-static int
+int
pre_sync_fname(const char *fname, bool isdir)
{
+#ifdef PG_FLUSH_DATA_WORKS
int fd;
fd = open(fname, O_RDONLY | PG_BINARY, 0);
@@ -363,11 +385,10 @@ pre_sync_fname(const char *fname, bool isdir)
#endif
(void) close(fd);
+#endif /* PG_FLUSH_DATA_WORKS */
return 0;
}
-#endif /* PG_FLUSH_DATA_WORKS */
-
/*
* fsync_fname -- Try to fsync a file or directory
*
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index a832210adc1d..9fd88953e43b 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -33,9 +33,10 @@ typedef enum DataDirSyncMethod
struct iovec; /* avoid including port/pg_iovec.h here */
#ifdef FRONTEND
+extern int pre_sync_fname(const char *fname, bool isdir);
extern int fsync_fname(const char *fname, bool isdir);
extern void sync_pgdata(const char *pg_data, int serverVersion,
- DataDirSyncMethod sync_method);
+ DataDirSyncMethod sync_method, bool sync_data_files);
extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method);
extern int durable_rename(const char *oldfile, const char *newfile);
extern int fsync_parent_path(const char *fname);
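For frontend callers, the extended sync_pgdata() signature is used as before with the new boolean appended. A minimal sketch of a caller that skips the database files (the helper name and its surroundings are assumptions, not something added by this patch):

/* Hypothetical frontend caller of the extended sync_pgdata() API. */
#include "postgres_fe.h"
#include "common/file_utils.h"

static void
sync_skipping_data_files(const char *datadir)
{
	/*
	 * Sync pg_wal, pg_xact, etc., but skip base/ and the tablespace
	 * directories; the caller is responsible for persisting those files.
	 */
	sync_pgdata(datadir, PG_VERSION_NUM, DATA_DIR_SYNC_METHOD_FSYNC, false);
}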
diff --git a/src/test/modules/test_pg_dump/t/001_base.pl b/src/test/modules/test_pg_dump/t/001_base.pl
index a9bcac4169d7..adcaa419616c 100644
--- a/src/test/modules/test_pg_dump/t/001_base.pl
+++ b/src/test/modules/test_pg_dump/t/001_base.pl
@@ -48,7 +48,7 @@
dump_cmd => [
'pg_dump', '--no-sync',
'--file' => "$tempdir/binary_upgrade.sql",
- '--schema-only', '--binary-upgrade',
+ '--schema-only', '--sequence-data', '--binary-upgrade',
'--dbname' => 'postgres',
],
},