diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 0026318485a7..2f1f9a42f904 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -527,6 +527,33 @@ PostgreSQL documentation + + + + + By default, initdb safely writes all database files + to disk. This option instructs initdb to skip + synchronizing all files in the individual database directories, the + database directories themselves, and the tablespace directories, i.e., + everything in the base subdirectory and any other + tablespace directories. Other files, such as those in + pg_wal and pg_xact, will still be + synchronized unless the --no-sync option is also + specified. + + + Note that if --sync-method=syncfs is used in + conjunction with --no-sync-data-files, some or all of + the aforementioned files and directories will be synchronized because + syncfs processes entire file systems. + + + This option is primarily intended for internal use by tools that + separately ensure the skipped files are synchronized to disk. + + + + diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 0ae40f9be58d..63cca18711a4 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -1298,6 +1298,17 @@ PostgreSQL documentation + + + + + Include sequence data in the dump. This is the default behavior except + when --no-data, --schema-only, or + --statistics-only is specified. + + + + diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 5db761d1ff19..da2616190438 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -244,7 +244,8 @@ PostgreSQL documentation Copy files to the new cluster. This is the default. (See also - and .) + , , + , and .) @@ -262,6 +263,32 @@ PostgreSQL documentation + + + + + Move the data directories from the old cluster to the new cluster. + Then, replace the catalog files with those generated for the new + cluster. This mode can outperform --clone, + --copy, --copy-file-range, and + --link, especially on clusters with many + relations. + + + However, this mode creates many garbage files in the old cluster, which + can prolong the file synchronization step if + --sync-method=syncfs is used. Therefore, it is + recommended to use --sync-method=fsync with + --swap. + + + Additionally, once the file transfer step begins, the old cluster will + be destructively modified and therefore will no longer be safe to + start. See below for details. + + + + method @@ -530,6 +557,10 @@ NET STOP postgresql-&majorversion; is started. Clone mode also requires that the old and new data directories be in the same file system. This mode is only available on certain operating systems and file systems. + Swap mode may be the fastest if there are many relations, but you will not + be able to access your old cluster once the file transfer step begins. + Swap mode also requires that the old and new cluster data directories be + in the same file system. @@ -889,6 +920,32 @@ psql --username=postgres --file=script.sql postgres + + + + If the --swap option was used, the old cluster might + be destructively modified: + + + + + If pg_upgrade aborts before reporting that the + old cluster is no longer safe to start, the old cluster was + unmodified; it can be restarted. + + + + + + If pg_upgrade has reported that the old cluster + is no longer safe to start, the old cluster was destructively + modified. The old cluster will need to be restored from backup in + this case. 
+ + + + + diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 21a0fe3ecd97..22b7d31b1654 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -168,6 +168,7 @@ static bool data_checksums = true; static char *xlog_dir = NULL; static int wal_segment_size_mb = (DEFAULT_XLOG_SEG_SIZE) / (1024 * 1024); static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +static bool sync_data_files = true; /* internal vars */ @@ -2566,6 +2567,7 @@ usage(const char *progname) printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -n, --no-clean do not clean up after errors\n")); printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); + printf(_(" --no-sync-data-files do not sync files within database directories\n")); printf(_(" --no-instructions do not print instructions for next steps\n")); printf(_(" -s, --show show internal settings, then exit\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); @@ -3208,6 +3210,7 @@ main(int argc, char *argv[]) {"icu-rules", required_argument, NULL, 18}, {"sync-method", required_argument, NULL, 19}, {"no-data-checksums", no_argument, NULL, 20}, + {"no-sync-data-files", no_argument, NULL, 21}, {NULL, 0, NULL, 0} }; @@ -3402,6 +3405,9 @@ main(int argc, char *argv[]) case 20: data_checksums = false; break; + case 21: + sync_data_files = false; + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -3453,7 +3459,7 @@ main(int argc, char *argv[]) fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files); check_ok(); return 0; } @@ -3516,7 +3522,7 @@ main(int argc, char *argv[]) { fputs(_("syncing data to disk ... 
"), stdout); fflush(stdout); - sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files); check_ok(); } else diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 01cc4a1602b8..15dd10ce40a3 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -76,6 +76,7 @@ 'checksums are enabled in control file'); command_ok([ 'initdb', '--sync-only', $datadir ], 'sync only'); +command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], '--no-sync-data-files'); command_fails([ 'initdb', $datadir ], 'existing data directory'); if ($supports_syncfs) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index d4b4e3340143..1da4bfc2351e 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -2310,7 +2310,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, } else { - (void) sync_pgdata(basedir, serverVersion, sync_method); + (void) sync_pgdata(basedir, serverVersion, sync_method, true); } } diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 867aeddc601f..f20be82862a2 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -633,7 +633,7 @@ main(int argc, char *argv[]) if (do_sync) { pg_log_info("syncing data directory"); - sync_pgdata(DataDir, PG_VERSION_NUM, sync_method); + sync_pgdata(DataDir, PG_VERSION_NUM, sync_method, true); } pg_log_info("updating control file"); diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index d480dc74436e..050260ee832a 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -424,7 +424,7 @@ main(int argc, char *argv[]) else { pg_log_debug("recursively fsyncing \"%s\"", opt.output); - sync_pgdata(opt.output, version * 10000, opt.sync_method); + sync_pgdata(opt.output, version * 10000, opt.sync_method, true); } } diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 428ed2d60fca..e6253331e273 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -518,6 +518,7 @@ main(int argc, char **argv) {"sync-method", required_argument, NULL, 15}, {"filter", required_argument, NULL, 16}, {"exclude-extension", required_argument, NULL, 17}, + {"sequence-data", no_argument, &dopt.sequence_data, 1}, {NULL, 0, NULL, 0} }; @@ -801,14 +802,6 @@ main(int argc, char **argv) if (dopt.column_inserts && dopt.dump_inserts == 0) dopt.dump_inserts = DUMP_DEFAULT_ROWS_PER_INSERT; - /* - * Binary upgrade mode implies dumping sequence data even in schema-only - * mode. This is not exposed as a separate option, but kept separate - * internally for clarity. 
- */ - if (dopt.binary_upgrade) - dopt.sequence_data = 1; - if (data_only && schema_only) pg_fatal("options -s/--schema-only and -a/--data-only cannot be used together"); if (schema_only && statistics_only) @@ -1275,6 +1268,7 @@ help(const char *progname) printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); printf(_(" --rows-per-insert=NROWS number of rows per INSERT; implies --inserts\n")); printf(_(" --section=SECTION dump named section (pre-data, data, or post-data)\n")); + printf(_(" --sequence-data include sequence data in dump\n")); printf(_(" --serializable-deferrable wait until the dump can run without anomalies\n")); printf(_(" --snapshot=SNAPSHOT use given snapshot for the dump\n")); printf(_(" --statistics-only dump only the statistics, not schema or data\n")); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index d281e27aa677..ed379033da73 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -66,6 +66,7 @@ '--file' => "$tempdir/binary_upgrade.dump", '--no-password', '--no-data', + '--sequence-data', '--binary-upgrade', '--dbname' => 'postgres', # alternative way to specify database ], diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index 467845419eda..55659ce201f4 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -296,7 +296,7 @@ sync_target_dir(void) if (!do_sync || dry_run) return; - sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method); + sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method, true); } diff --git a/src/bin/pg_upgrade/TESTING b/src/bin/pg_upgrade/TESTING index 00842ac6ec3a..c3d463c9c292 100644 --- a/src/bin/pg_upgrade/TESTING +++ b/src/bin/pg_upgrade/TESTING @@ -20,13 +20,13 @@ export oldinstall=...otherversion/ (old version's install base path) See DETAILS below for more information about creation of the dump. You can also test the different transfer modes (--copy, --link, ---clone, --copy-file-range) by setting the environment variable +--clone, --copy-file-range, --swap) by setting the environment variable PG_TEST_PG_UPGRADE_MODE to the respective command-line option, like make check PG_TEST_PG_UPGRADE_MODE=--link -The default is --copy. Note that the other modes are not supported on -all operating systems. +The default is --copy. Note that not all modes are supported on all +operating systems. DETAILS ------- diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 117f461d46a1..02d9146e5ed7 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -709,7 +709,34 @@ check_new_cluster(void) check_copy_file_range(); break; case TRANSFER_MODE_LINK: - check_hard_link(); + check_hard_link(TRANSFER_MODE_LINK); + break; + case TRANSFER_MODE_SWAP: + + /* + * We do the hard link check for --swap, too, since it's an easy + * way to verify the clusters are in the same file system. This + * allows us to take some shortcuts in the file synchronization + * step. With some more effort, we could probably support the + * separate-file-system use case, but this mode is unlikely to + * offer much benefit if we have to copy the files across file + * system boundaries. + */ + check_hard_link(TRANSFER_MODE_SWAP); + + /* + * There are a few known issues with using --swap to upgrade from + * versions older than 10. For example, the sequence tuple format + * changed in v10, and the visibility map format changed in 9.6. 
+ * While such problems are not insurmountable (and we may have to + * deal with similar problems in the future, anyway), it doesn't + * seem worth the effort to support swap mode for upgrades from + * long-unsupported versions. + */ + if (GET_MAJOR_VERSION(old_cluster.major_version) < 1000) + pg_fatal("Swap mode can only upgrade clusters from PostgreSQL version %s and later.", + "10"); + break; } diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index bd49ea867bfc..47ee27ec8354 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -751,7 +751,7 @@ check_control_data(ControlData *oldctrl, void -disable_old_cluster(void) +disable_old_cluster(transferMode transfer_mode) { char old_path[MAXPGPATH], new_path[MAXPGPATH]; @@ -766,10 +766,17 @@ disable_old_cluster(void) old_path, new_path); check_ok(); - pg_log(PG_REPORT, "\n" - "If you want to start the old cluster, you will need to remove\n" - "the \".old\" suffix from %s/global/pg_control.old.\n" - "Because \"link\" mode was used, the old cluster cannot be safely\n" - "started once the new cluster has been started.", - old_cluster.pgdata); + if (transfer_mode == TRANSFER_MODE_LINK) + pg_log(PG_REPORT, "\n" + "If you want to start the old cluster, you will need to remove\n" + "the \".old\" suffix from %s/global/pg_control.old.\n" + "Because \"link\" mode was used, the old cluster cannot be safely\n" + "started once the new cluster has been started.", + old_cluster.pgdata); + else if (transfer_mode == TRANSFER_MODE_SWAP) + pg_log(PG_REPORT, "\n" + "Because \"swap\" mode was used, the old cluster can no longer be\n" + "safely started."); + else + pg_fatal("unrecognized transfer mode"); } diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index 23fe7280a161..23cb08e83476 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -52,9 +52,11 @@ generate_old_dump(void) snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid); parallel_exec_prog(log_file_name, NULL, - "\"%s/pg_dump\" %s --no-data %s --quote-all-identifiers " + "\"%s/pg_dump\" %s --no-data %s %s --quote-all-identifiers " "--binary-upgrade --format=custom %s --no-sync --file=\"%s/%s\" %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? + "" : "--sequence-data", log_opts.verbose ? "--verbose" : "", user_opts.do_statistics ? 
"" : "--no-statistics", log_opts.dumpdir, diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index 7fd1991204ae..91ed16acb088 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -434,7 +434,7 @@ check_copy_file_range(void) } void -check_hard_link(void) +check_hard_link(transferMode transfer_mode) { char existing_file[MAXPGPATH]; char new_link_file[MAXPGPATH]; @@ -444,8 +444,16 @@ check_hard_link(void) unlink(new_link_file); /* might fail */ if (link(existing_file, new_link_file) < 0) - pg_fatal("could not create hard link between old and new data directories: %m\n" - "In link mode the old and new data directories must be on the same file system."); + { + if (transfer_mode == TRANSFER_MODE_LINK) + pg_fatal("could not create hard link between old and new data directories: %m\n" + "In link mode the old and new data directories must be on the same file system."); + else if (transfer_mode == TRANSFER_MODE_SWAP) + pg_fatal("could not create hard link between old and new data directories: %m\n" + "In swap mode the old and new data directories must be on the same file system."); + else + pg_fatal("unrecognized transfer mode"); + } unlink(new_link_file); } diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index ad52de8b607e..4b7a56f5b3be 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -490,7 +490,7 @@ get_rel_infos_query(void) " FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n " " ON c.relnamespace = n.oid " " WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", " - CppAsString2(RELKIND_MATVIEW) ") AND " + CppAsString2(RELKIND_MATVIEW) "%s) AND " /* exclude possible orphaned temp tables */ " ((n.nspname !~ '^pg_temp_' AND " " n.nspname !~ '^pg_toast_temp_' AND " @@ -499,6 +499,8 @@ get_rel_infos_query(void) " c.oid >= %u::pg_catalog.oid) OR " " (n.nspname = 'pg_catalog' AND " " relname IN ('pg_largeobject') ))), ", + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? + ", " CppAsString2(RELKIND_SEQUENCE) : "", FirstNormalObjectId); /* diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 188dd8d8a8ba..7fd7f1d33fcb 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -62,6 +62,7 @@ parseCommandLine(int argc, char *argv[]) {"sync-method", required_argument, NULL, 4}, {"no-statistics", no_argument, NULL, 5}, {"set-char-signedness", required_argument, NULL, 6}, + {"swap", no_argument, NULL, 7}, {NULL, 0, NULL, 0} }; @@ -228,6 +229,11 @@ parseCommandLine(int argc, char *argv[]) else pg_fatal("invalid argument for option %s", "--set-char-signedness"); break; + + case 7: + user_opts.transfer_mode = TRANSFER_MODE_SWAP; + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), os_info.progname); @@ -325,6 +331,7 @@ usage(void) printf(_(" --no-statistics do not import statistics from old cluster\n")); printf(_(" --set-char-signedness=OPTION set new cluster char signedness to \"signed\" or\n" " \"unsigned\"\n")); + printf(_(" --swap move data directories to new cluster\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\n" diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 174cd920840c..9295e46aed3e 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -170,12 +170,14 @@ main(int argc, char **argv) /* * Most failures happen in create_new_objects(), which has completed at - * this point. 
We do this here because it is just before linking, which - * will link the old and new cluster data files, preventing the old - * cluster from being safely started once the new cluster is started. + * this point. We do this here because it is just before file transfer, + * which for --link will make it unsafe to start the old cluster once the + * new cluster is started, and for --swap will make it unsafe to start the + * old cluster at all. */ - if (user_opts.transfer_mode == TRANSFER_MODE_LINK) - disable_old_cluster(); + if (user_opts.transfer_mode == TRANSFER_MODE_LINK || + user_opts.transfer_mode == TRANSFER_MODE_SWAP) + disable_old_cluster(user_opts.transfer_mode); transfer_all_new_tablespaces(&old_cluster.dbarr, &new_cluster.dbarr, old_cluster.pgdata, new_cluster.pgdata); @@ -212,8 +214,10 @@ main(int argc, char **argv) { prep_status("Sync data directory to disk"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/initdb\" --sync-only \"%s\" --sync-method %s", + "\"%s/initdb\" --sync-only %s \"%s\" --sync-method %s", new_cluster.bindir, + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? + "--no-sync-data-files" : "", new_cluster.pgdata, user_opts.sync_method); check_ok(); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 4c9d01721491..69c965bb7d09 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -262,6 +262,7 @@ typedef enum TRANSFER_MODE_COPY, TRANSFER_MODE_COPY_FILE_RANGE, TRANSFER_MODE_LINK, + TRANSFER_MODE_SWAP, } transferMode; /* @@ -391,7 +392,7 @@ void create_script_for_old_cluster_deletion(char **deletion_script_file_name); void get_control_data(ClusterInfo *cluster); void check_control_data(ControlData *oldctrl, ControlData *newctrl); -void disable_old_cluster(void); +void disable_old_cluster(transferMode transfer_mode); /* dump.c */ @@ -423,7 +424,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); void check_file_clone(void); void check_copy_file_range(void); -void check_hard_link(void); +void check_hard_link(transferMode transfer_mode); /* fopen_priv() is no longer different from fopen() */ #define fopen_priv(path, mode) fopen(path, mode) diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 8c23c583172a..c0affa5565c5 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -11,11 +11,92 @@ #include +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/int.h" +#include "common/logging.h" #include "pg_upgrade.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); +/* + * The following set of sync_queue_* functions are used for --swap to reduce + * the amount of time spent synchronizing the swapped catalog files. When a + * file is added to the queue, we also alert the file system that we'd like it + * to be persisted to disk in the near future (if that operation is supported + * by the current platform). Once the queue is full, all of the files are + * synchronized to disk. This strategy should generally be much faster than + * simply calling fsync() on the files right away. 
+ * + * The general usage pattern should be something like: + * + * for (int i = 0; i < num_files; i++) + * sync_queue_push(files[i]); + * + * // be sure to sync any remaining files in the queue + * sync_queue_sync_all(); + * sync_queue_destroy(); + */ + +#define SYNC_QUEUE_MAX_LEN (1024) + +static char *sync_queue[SYNC_QUEUE_MAX_LEN]; +static bool sync_queue_inited; +static int sync_queue_len; + +static inline void +sync_queue_init(void) +{ + if (sync_queue_inited) + return; + + sync_queue_inited = true; + for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++) + sync_queue[i] = palloc(MAXPGPATH); +} + +static inline void +sync_queue_sync_all(void) +{ + if (!sync_queue_inited) + return; + + for (int i = 0; i < sync_queue_len; i++) + { + if (fsync_fname(sync_queue[i], false) != 0) + pg_fatal("could not synchronize file \"%s\": %m", sync_queue[i]); + } + + sync_queue_len = 0; +} + +static inline void +sync_queue_push(const char *fname) +{ + sync_queue_init(); + + pre_sync_fname(fname, false); + + strncpy(sync_queue[sync_queue_len++], fname, MAXPGPATH); + if (sync_queue_len >= SYNC_QUEUE_MAX_LEN) + sync_queue_sync_all(); +} + +static inline void +sync_queue_destroy(void) +{ + if (!sync_queue_inited) + return; + + sync_queue_inited = false; + sync_queue_len = 0; + for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++) + { + pfree(sync_queue[i]); + sync_queue[i] = NULL; + } +} /* * transfer_all_new_tablespaces() @@ -41,6 +122,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, case TRANSFER_MODE_LINK: prep_status_progress("Linking user relation files"); break; + case TRANSFER_MODE_SWAP: + prep_status_progress("Swapping data directories"); + break; } /* @@ -125,6 +209,278 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, /* We allocate something even for n_maps == 0 */ pg_free(mappings); } + + /* + * Make sure anything pending synchronization in swap mode is fully + * persisted to disk. This is a no-op for other transfer modes. + */ + sync_queue_sync_all(); + sync_queue_destroy(); +} + +/* + * prepare_for_swap() + * + * This function moves the database directory from the old cluster to the new + * cluster in preparation for moving the pg_restore-generated catalog files + * into place. Returns false if the database with the given OID does not have + * a directory in the given tablespace, otherwise returns true. + * + * old_cat (the directory for the old catalog files), new_dat (the database + * directory in the new cluster), and moved_dat (the destination for the + * pg_restore-generated database directory) should be sized to MAXPGPATH bytes. + * This function will return the appropriate paths in those variables. + */ +static bool +prepare_for_swap(const char *old_tablespace, Oid db_oid, + char *old_cat, char *new_dat, char *moved_dat) +{ + const char *new_tablespace; + const char *old_tblspc_suffix; + const char *new_tblspc_suffix; + char old_tblspc[MAXPGPATH]; + char new_tblspc[MAXPGPATH]; + char moved_tblspc[MAXPGPATH]; + char old_dat[MAXPGPATH]; + struct stat st; + + if (strcmp(old_tablespace, old_cluster.pgdata) == 0) + { + new_tablespace = new_cluster.pgdata; + new_tblspc_suffix = "/base"; + old_tblspc_suffix = "/base"; + } + else + { + /* + * XXX: The below line is a hack to deal with the fact that we + * presently don't have an easy way to find the corresponding new + * tablespace's path. This will need to be fixed if/when we add + * pg_upgrade support for in-place tablespaces. 
+ */ + new_tablespace = old_tablespace; + + new_tblspc_suffix = new_cluster.tablespace_suffix; + old_tblspc_suffix = old_cluster.tablespace_suffix; + } + + /* Old and new cluster paths. */ + snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", old_tablespace, old_tblspc_suffix); + snprintf(new_tblspc, sizeof(new_tblspc), "%s%s", new_tablespace, new_tblspc_suffix); + snprintf(old_dat, sizeof(old_dat), "%s/%u", old_tblspc, db_oid); + snprintf(new_dat, MAXPGPATH, "%s/%u", new_tblspc, db_oid); + + /* + * Paths for "moved aside" stuff. We intentionally put these in the old + * cluster so that the delete_old_cluster.{sh,bat} script handles them. + */ + snprintf(moved_tblspc, sizeof(moved_tblspc), "%s/moved_for_upgrade", old_tblspc); + snprintf(old_cat, MAXPGPATH, "%s/%u_old_catalogs", moved_tblspc, db_oid); + snprintf(moved_dat, MAXPGPATH, "%s/%u", moved_tblspc, db_oid); + + /* Check that the database directory exists in the given tablespace. */ + if (stat(old_dat, &st) != 0) + { + if (errno != ENOENT) + pg_fatal("could not stat file \"%s\": %m", old_dat); + return false; + } + + /* Create directory for stuff that is moved aside. */ + if (pg_mkdir_p(moved_tblspc, pg_dir_create_mode) != 0 && errno != EEXIST) + pg_fatal("could not create directory \"%s\"", moved_tblspc); + + /* Create directory for old catalog files. */ + if (pg_mkdir_p(old_cat, pg_dir_create_mode) != 0) + pg_fatal("could not create directory \"%s\"", old_cat); + + /* Move the new cluster's database directory aside. */ + if (rename(new_dat, moved_dat) != 0) + pg_fatal("could not rename \"%s\" to \"%s\"", new_dat, moved_dat); + + /* Move the old cluster's database directory into place. */ + if (rename(old_dat, new_dat) != 0) + pg_fatal("could not rename \"%s\" to \"%s\"", old_dat, new_dat); + + return true; +} + +/* + * FileNameMapCmp() + * + * qsort() comparator for FileNameMap that sorts by RelFileNumber. + */ +static int +FileNameMapCmp(const void *a, const void *b) +{ + const FileNameMap *map1 = (const FileNameMap *) a; + const FileNameMap *map2 = (const FileNameMap *) b; + + return pg_cmp_u32(map1->relfilenumber, map2->relfilenumber); +} + +/* + * parse_relfilenumber() + * + * Attempt to parse the RelFileNumber of the given file name. If we can't, + * return InvalidRelFileNumber. Note that this code snippet is lifted from + * parse_filename_for_nontemp_relation(). + */ +static RelFileNumber +parse_relfilenumber(const char *filename) +{ + char *endp; + unsigned long n; + + if (filename[0] < '1' || filename[0] > '9') + return InvalidRelFileNumber; + + errno = 0; + n = strtoul(filename, &endp, 10); + if (errno || filename == endp || n <= 0 || n > PG_UINT32_MAX) + return InvalidRelFileNumber; + + return (RelFileNumber) n; +} + +/* + * swap_catalog_files() + * + * Moves the old catalog files aside, and moves the new catalog files into + * place. prepare_for_swap() should have already been called (and returned + * true) for the tablespace being transferred. old_cat (the directory for the + * old catalog files), new_dat (the database directory in the new cluster), and + * moved_dat (the location of the moved-aside pg_restore-generated database + * directory) should be the variables returned by prepare_for_swap(). + */ +static void +swap_catalog_files(FileNameMap *maps, int size, const char *old_cat, + const char *new_dat, const char *moved_dat) +{ + DIR *dir; + struct dirent *de; + char path[MAXPGPATH]; + char dest[MAXPGPATH]; + RelFileNumber rfn; + + /* Move the old catalog files aside. 
*/ + dir = opendir(new_dat); + if (dir == NULL) + pg_fatal("could not open directory \"%s\": %m", new_dat); + while (errno = 0, (de = readdir(dir)) != NULL) + { + snprintf(path, sizeof(path), "%s/%s", new_dat, de->d_name); + if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG) + continue; + + rfn = parse_relfilenumber(de->d_name); + if (RelFileNumberIsValid(rfn)) + { + FileNameMap key = {.relfilenumber = rfn}; + + if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp)) + continue; + } + + snprintf(dest, sizeof(dest), "%s/%s", old_cat, de->d_name); + if (rename(path, dest) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + } + if (errno) + pg_fatal("could not read directory \"%s\": %m", new_dat); + (void) closedir(dir); + + /* Move the new catalog files into place. */ + dir = opendir(moved_dat); + if (dir == NULL) + pg_fatal("could not open directory \"%s\": %m", moved_dat); + while (errno = 0, (de = readdir(dir)) != NULL) + { + snprintf(path, sizeof(path), "%s/%s", moved_dat, de->d_name); + if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG) + continue; + + rfn = parse_relfilenumber(de->d_name); + if (RelFileNumberIsValid(rfn)) + { + FileNameMap key = {.relfilenumber = rfn}; + + if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp)) + continue; + } + + snprintf(dest, sizeof(dest), "%s/%s", new_dat, de->d_name); + if (rename(path, dest) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + + /* + * We don't fsync() the database files in the file synchronization + * stage of pg_upgrade in swap mode, so we need to synchronize them + * ourselves. We only do this for the catalog files because they were + * created during pg_restore with fsync=off. We assume that the user + * data files were properly persisted to disk when the old cluster was + * last shut down. + */ + if (user_opts.do_sync) + sync_queue_push(dest); + } + if (errno) + pg_fatal("could not read directory \"%s\": %m", moved_dat); + (void) closedir(dir); + + /* Ensure the directory entries are persisted to disk. */ + if (fsync_fname(new_dat, true) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", new_dat); + if (fsync_parent_path(new_dat) != 0) + pg_fatal("could not synchronize parent directory of \"%s\": %m", new_dat); +} + +/* + * do_swap() + * + * Perform the required steps for --swap for a single database. In short this + * moves the old cluster's database directory into the new cluster and then + * replaces any files for system catalogs with the ones that were generated + * during pg_restore. + */ +static void +do_swap(FileNameMap *maps, int size, char *old_tablespace) +{ + char old_cat[MAXPGPATH]; + char new_dat[MAXPGPATH]; + char moved_dat[MAXPGPATH]; + + /* + * We perform many lookups on maps by relfilenumber in swap mode, so make + * sure it's sorted by relfilenumber. maps should already be sorted by + * OID, so in general this shouldn't have much work to do. + */ + qsort(maps, size, sizeof(FileNameMap), FileNameMapCmp); + + /* + * If an old tablespace is given, we only need to process that one. If no + * old tablespace is specified, we need to process all the tablespaces on + * the system. 
+ */ + if (old_tablespace) + { + if (prepare_for_swap(old_tablespace, maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + } + else + { + if (prepare_for_swap(old_cluster.pgdata, maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + + for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + { + if (prepare_for_swap(os_info.old_tablespaces[tblnum], maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + } + } } /* @@ -145,6 +501,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) vm_must_add_frozenbit = true; + /* --swap has its own subroutine */ + if (user_opts.transfer_mode == TRANSFER_MODE_SWAP) + { + /* + * We don't support --swap to upgrade from versions that require + * rewriting the visibility map. We should've failed already if + * someone tries to do that. + */ + Assert(!vm_must_add_frozenbit); + + do_swap(maps, size, old_tablespace); + return; + } + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || @@ -259,6 +629,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file); linkFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_SWAP: + /* swap mode is handled in its own code path */ + pg_fatal("should never happen"); + break; } } } diff --git a/src/bin/pg_upgrade/t/006_transfer_modes.pl b/src/bin/pg_upgrade/t/006_transfer_modes.pl index 518e09941459..34fddbcdab57 100644 --- a/src/bin/pg_upgrade/t/006_transfer_modes.pl +++ b/src/bin/pg_upgrade/t/006_transfer_modes.pl @@ -16,6 +16,15 @@ sub test_mode my $old = PostgreSQL::Test::Cluster->new('old', install_path => $ENV{oldinstall}); my $new = PostgreSQL::Test::Cluster->new('new'); + # --swap can't be used to upgrade from versions older than 10, so just skip + # the test if the old cluster version is too old. + if ($old->pg_version < 10 && $mode eq "--swap") + { + $old->clean_node(); + $new->clean_node(); + return; + } + if (defined($ENV{oldinstall})) { # Checksums are now enabled by default, but weren't before 18, so pass @@ -97,5 +106,6 @@ sub test_mode test_mode('--copy'); test_mode('--copy-file-range'); test_mode('--link'); +test_mode('--swap'); done_testing(); diff --git a/src/common/file_utils.c b/src/common/file_utils.c index eaa2e76f43f5..7b62687a2aa7 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -45,12 +45,10 @@ */ #define MINIMUM_VERSION_FOR_PG_WAL 100000 -#ifdef PG_FLUSH_DATA_WORKS -static int pre_sync_fname(const char *fname, bool isdir); -#endif static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), - bool process_symlinks); + bool process_symlinks, + const char *exclude_dir); #ifdef HAVE_SYNCFS @@ -93,11 +91,15 @@ do_syncfs(const char *path) * syncing, and might not have privileges to write at all. * * serverVersion indicates the version of the server to be sync'd. + * + * If sync_data_files is false, this function skips syncing "base/" and any + * other tablespace directories. 
*/ void sync_pgdata(const char *pg_data, int serverVersion, - DataDirSyncMethod sync_method) + DataDirSyncMethod sync_method, + bool sync_data_files) { bool xlog_is_symlink; char pg_wal[MAXPGPATH]; @@ -147,30 +149,33 @@ sync_pgdata(const char *pg_data, do_syncfs(pg_data); /* If any tablespaces are configured, sync each of those. */ - dir = opendir(pg_tblspc); - if (dir == NULL) - pg_log_error("could not open directory \"%s\": %m", - pg_tblspc); - else + if (sync_data_files) { - while (errno = 0, (de = readdir(dir)) != NULL) + dir = opendir(pg_tblspc); + if (dir == NULL) + pg_log_error("could not open directory \"%s\": %m", + pg_tblspc); + else { - char subpath[MAXPGPATH * 2]; + while (errno = 0, (de = readdir(dir)) != NULL) + { + char subpath[MAXPGPATH * 2]; - if (strcmp(de->d_name, ".") == 0 || - strcmp(de->d_name, "..") == 0) - continue; + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; - snprintf(subpath, sizeof(subpath), "%s/%s", - pg_tblspc, de->d_name); - do_syncfs(subpath); - } + snprintf(subpath, sizeof(subpath), "%s/%s", + pg_tblspc, de->d_name); + do_syncfs(subpath); + } - if (errno) - pg_log_error("could not read directory \"%s\": %m", - pg_tblspc); + if (errno) + pg_log_error("could not read directory \"%s\": %m", + pg_tblspc); - (void) closedir(dir); + (void) closedir(dir); + } } /* If pg_wal is a symlink, process that too. */ @@ -182,15 +187,21 @@ sync_pgdata(const char *pg_data, case DATA_DIR_SYNC_METHOD_FSYNC: { + char *exclude_dir = NULL; + + if (!sync_data_files) + exclude_dir = psprintf("%s/base", pg_data); + /* * If possible, hint to the kernel that we're soon going to * fsync the data directory and its contents. */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(pg_data, pre_sync_fname, false); + walkdir(pg_data, pre_sync_fname, false, exclude_dir); if (xlog_is_symlink) - walkdir(pg_wal, pre_sync_fname, false); - walkdir(pg_tblspc, pre_sync_fname, true); + walkdir(pg_wal, pre_sync_fname, false, NULL); + if (sync_data_files) + walkdir(pg_tblspc, pre_sync_fname, true, NULL); #endif /* @@ -203,10 +214,14 @@ sync_pgdata(const char *pg_data, * get fsync'd twice. That's not an expected case so we don't * worry about optimizing it. */ - walkdir(pg_data, fsync_fname, false); + walkdir(pg_data, fsync_fname, false, exclude_dir); if (xlog_is_symlink) - walkdir(pg_wal, fsync_fname, false); - walkdir(pg_tblspc, fsync_fname, true); + walkdir(pg_wal, fsync_fname, false, NULL); + if (sync_data_files) + walkdir(pg_tblspc, fsync_fname, true, NULL); + + if (exclude_dir) + pfree(exclude_dir); } break; } @@ -245,10 +260,10 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) * fsync the data directory and its contents. */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(dir, pre_sync_fname, false); + walkdir(dir, pre_sync_fname, false, NULL); #endif - walkdir(dir, fsync_fname, false); + walkdir(dir, fsync_fname, false, NULL); } break; } @@ -264,6 +279,9 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) * ignored in subdirectories, ie we intentionally don't pass down the * process_symlinks flag to recursive calls. * + * If exclude_dir is not NULL, it specifies a directory path to skip + * processing. + * * Errors are reported but not considered fatal. * * See also walkdir in fd.c, which is a backend version of this logic. 
@@ -271,11 +289,15 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), - bool process_symlinks) + bool process_symlinks, + const char *exclude_dir) { DIR *dir; struct dirent *de; + if (exclude_dir && strcmp(exclude_dir, path) == 0) + return; + dir = opendir(path); if (dir == NULL) { @@ -299,7 +321,7 @@ walkdir(const char *path, (*action) (subpath, false); break; case PGFILETYPE_DIR: - walkdir(subpath, action, false); + walkdir(subpath, action, false, exclude_dir); break; default: @@ -327,16 +349,16 @@ walkdir(const char *path, } /* - * Hint to the OS that it should get ready to fsync() this file. + * Hint to the OS that it should get ready to fsync() this file, if supported + * by the platform. * * Ignores errors trying to open unreadable files, and reports other errors * non-fatally. */ -#ifdef PG_FLUSH_DATA_WORKS - -static int +int pre_sync_fname(const char *fname, bool isdir) { +#ifdef PG_FLUSH_DATA_WORKS int fd; fd = open(fname, O_RDONLY | PG_BINARY, 0); @@ -363,11 +385,10 @@ pre_sync_fname(const char *fname, bool isdir) #endif (void) close(fd); +#endif /* PG_FLUSH_DATA_WORKS */ return 0; } -#endif /* PG_FLUSH_DATA_WORKS */ - /* * fsync_fname -- Try to fsync a file or directory * diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index a832210adc1d..9fd88953e43b 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -33,9 +33,10 @@ typedef enum DataDirSyncMethod struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND +extern int pre_sync_fname(const char *fname, bool isdir); extern int fsync_fname(const char *fname, bool isdir); extern void sync_pgdata(const char *pg_data, int serverVersion, - DataDirSyncMethod sync_method); + DataDirSyncMethod sync_method, bool sync_data_files); extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); diff --git a/src/test/modules/test_pg_dump/t/001_base.pl b/src/test/modules/test_pg_dump/t/001_base.pl index a9bcac4169d7..adcaa419616c 100644 --- a/src/test/modules/test_pg_dump/t/001_base.pl +++ b/src/test/modules/test_pg_dump/t/001_base.pl @@ -48,7 +48,7 @@ dump_cmd => [ 'pg_dump', '--no-sync', '--file' => "$tempdir/binary_upgrade.sql", - '--schema-only', '--binary-upgrade', + '--schema-only', '--sequence-data', '--binary-upgrade', '--dbname' => 'postgres', ], },
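
Usage sketch (not part of the patch): how the new options are expected to fit together. Paths and version numbers below are illustrative placeholders only.

    # Upgrade with the new swap mode.  Both clusters' data directories must be
    # on the same file system, and the old cluster can no longer be started
    # once the file transfer step begins.
    pg_upgrade \
        --old-bindir /usr/lib/postgresql/17/bin \
        --new-bindir /usr/lib/postgresql/18/bin \
        --old-datadir /srv/pg/17/data \
        --new-datadir /srv/pg/18/data \
        --swap

    # For swap mode, pg_upgrade's final sync step runs roughly the equivalent
    # of the following, skipping user data files that are assumed to have been
    # persisted when the old cluster was last cleanly shut down:
    initdb --sync-only --no-sync-data-files --sync-method fsync /srv/pg/18/data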