diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index 1ef1713c91a4..c7640ec50257 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -158,27 +158,8 @@ RestoreArchivedFile(char *path, const char *xlogfname, (errmsg_internal("executing restore command \"%s\"", xlogRestoreCmd))); - fflush(NULL); - pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND); - - /* - * PreRestoreCommand() informs the SIGTERM handler for the startup process - * that it should proc_exit() right away. This is done for the duration - * of the system() call because there isn't a good way to break out while - * it is executing. Since we might call proc_exit() in a signal handler, - * it is best to put any additional logic before or after the - * PreRestoreCommand()/PostRestoreCommand() section. - */ - PreRestoreCommand(); - - /* - * Copy xlog from archival storage to XLOGDIR - */ - rc = system(xlogRestoreCmd); - - PostRestoreCommand(); - - pgstat_report_wait_end(); + /* Copy xlog from archival storage to XLOGDIR */ + rc = pg_system(xlogRestoreCmd, WAIT_EVENT_RESTORE_COMMAND); pfree(xlogRestoreCmd); if (rc == 0) @@ -325,11 +306,7 @@ ExecuteRecoveryCommand(const char *command, const char *commandName, /* * execute the constructed command */ - fflush(NULL); - pgstat_report_wait_start(wait_event_info); - rc = system(xlogRecoveryCmd); - pgstat_report_wait_end(); - + rc = pg_system(xlogRecoveryCmd, wait_event_info); pfree(xlogRecoveryCmd); if (rc != 0) diff --git a/src/backend/archive/shell_archive.c b/src/backend/archive/shell_archive.c index 828723afe476..64c2c6aa7605 100644 --- a/src/backend/archive/shell_archive.c +++ b/src/backend/archive/shell_archive.c @@ -75,10 +75,7 @@ shell_archive_file(ArchiveModuleState *state, const char *file, (errmsg_internal("executing archive command \"%s\"", xlogarchcmd))); - fflush(NULL); - pgstat_report_wait_start(WAIT_EVENT_ARCHIVE_COMMAND); - rc = system(xlogarchcmd); - pgstat_report_wait_end(); + rc = pg_system(xlogarchcmd, WAIT_EVENT_ARCHIVE_COMMAND); if (rc != 0) { diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 7149a67fcbcd..eb79fda1fd99 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -33,6 +33,7 @@ #include "utils/guc.h" #include "utils/memutils.h" #include "utils/timeout.h" +#include "utils/wait_event_types.h" #ifndef USE_POSTMASTER_DEATH_SIGNAL diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0e8299dd5564..8cee38e6c177 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -158,6 +158,13 @@ int max_files_per_process = 1000; */ int max_safe_fds = FD_MINFREE; /* default if not changed */ +#ifdef HAVE_GETRLIMIT +static bool saved_original_max_open_files; +static struct rlimit original_max_open_files; +static struct rlimit custom_max_open_files; +#endif + + /* Whether it is safe to continue running after fsync() fails. */ bool data_sync_retry = false; @@ -946,6 +953,152 @@ InitTemporaryFileAccess(void) #endif } +/* + * Returns true if the passed in highestfd is the last one that we're allowed + * to open based on our. This should only be called if + */ +static bool +IsOpenFileLimit(int highestfd) +{ +#ifdef HAVE_GETRLIMIT + if (!saved_original_max_open_files) + { + return false; + } + + return highestfd >= custom_max_open_files.rlim_cur - 1; +#else + return false; +#endif +} + +/* + * Increases the open file limit (RLIMIT_NOFILE) by the requested amount. + * Returns true if successful, false otherwise. + */ +static bool +IncreaseOpenFileLimit(int extra_files) +{ +#ifdef HAVE_GETRLIMIT + struct rlimit rlim; + + if (!saved_original_max_open_files) + { + return false; + } + + rlim = custom_max_open_files; + + /* If we're already at the max we reached our limit */ + if (rlim.rlim_cur == original_max_open_files.rlim_max) + return false; + + /* Otherwise try to increase the soft limit to what we need */ + rlim.rlim_cur = Min(rlim.rlim_cur + extra_files, rlim.rlim_max); + + if (setrlimit(RLIMIT_NOFILE, &rlim) != 0) + { + /* We made sure not to exceed the hard limit, so this shouldn't fail */ + ereport(WARNING, (errmsg("setrlimit failed: %m"))); + return false; + } + + custom_max_open_files = rlim; + + elog(LOG, "increased open file limit to %ld", (long) rlim.rlim_cur); + + return true; +#else + return false; +#endif +} + +/* + * Saves the original open file limit (RLIMIT_NOFILE) the first time when this + * is called. If called again it's a no-op. + * + * Returns true if successful, false otherwise. + */ +static void +SaveOriginalOpenFileLimit(void) +{ +#ifdef HAVE_GETRLIMIT + int status; + + if (saved_original_max_open_files) + { + /* Already saved, no need to do it again */ + return; + } + + status = getrlimit(RLIMIT_NOFILE, &original_max_open_files); + if (status != 0) + { + ereport(WARNING, (errmsg("getrlimit failed: %m"))); + return; + } + + custom_max_open_files = original_max_open_files; + saved_original_max_open_files = true; + return; +#endif +} + +/* + * UseOriginalOpenFileLimit --- Makes the process use the original open file + * limit that was present at postmaster start. + * + * This should be called before spawning subprocesses that might use select(2) + * which can only handle file descriptors up to 1024. + */ +static void +UseOriginalOpenFileLimit(void) +{ +#ifdef HAVE_GETRLIMIT + if (!saved_original_max_open_files) + { + return; + } + + if (custom_max_open_files.rlim_cur == original_max_open_files.rlim_cur) + { + /* Not changed, so no need to call setrlimit at all */ + return; + } + + if (setrlimit(RLIMIT_NOFILE, &original_max_open_files) != 0) + { + ereport(WARNING, (errmsg("setrlimit failed: %m"))); + } +#endif +} + +/* + * UseCustomOpenFileLimit --- Makes the process use our custom open file limit + * after that we configured based on the max_files_per_process GUC. + */ +static void +UseCustomOpenFileLimit(void) +{ +#ifdef HAVE_GETRLIMIT + if (!saved_original_max_open_files) + { + return; + } + + if (custom_max_open_files.rlim_cur == original_max_open_files.rlim_cur) + { + /* Not changed, so no need to call setrlimit at all */ + return; + } + + if (setrlimit(RLIMIT_NOFILE, &custom_max_open_files) != 0) + { + ereport(WARNING, (errmsg("setrlimit failed: %m"))); + } +#endif +} + /* * count_usable_fds --- count how many FDs the system will let us open, * and estimate how many are already open. @@ -969,38 +1122,39 @@ count_usable_fds(int max_to_probe, int *usable_fds, int *already_open) int highestfd = 0; int j; -#ifdef HAVE_GETRLIMIT - struct rlimit rlim; - int getrlimit_status; -#endif - size = 1024; fd = (int *) palloc(size * sizeof(int)); -#ifdef HAVE_GETRLIMIT - getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim); - if (getrlimit_status != 0) - ereport(WARNING, (errmsg("getrlimit failed: %m"))); -#endif /* HAVE_GETRLIMIT */ + SaveOriginalOpenFileLimit(); /* dup until failure or probe limit reached */ for (;;) { int thisfd; -#ifdef HAVE_GETRLIMIT - /* * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on * some platforms */ - if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1) - break; -#endif + if (IsOpenFileLimit(highestfd)) + { + if (!IncreaseOpenFileLimit(max_to_probe - used)) + break; + } thisfd = dup(2); if (thisfd < 0) { + /* + * Eventhough we do the pre-check above, it's still possible that + * the call to dup fails with EMFILE. This can happen if the last + * file descriptor was already assigned to an "already open" file. + * One example of this happening, is if we're already at the soft + * limit when we call count_usable_fds. + */ + if (errno == EMFILE && IncreaseOpenFileLimit(max_to_probe - used)) + continue; + /* Expect EMFILE or ENFILE, else it's fishy */ if (errno != EMFILE && errno != ENFILE) elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used); @@ -1045,6 +1199,7 @@ set_max_safe_fds(void) { int usable_fds; int already_open; + char *max_safe_fds_string; /*---------- * We want to set max_safe_fds to @@ -1060,6 +1215,16 @@ set_max_safe_fds(void) max_safe_fds = Min(usable_fds, max_files_per_process); + /* + * Update GUC variable to allow users to see if the result is different + * than what the used value turns out to be different than what they had + * configured. + */ + max_safe_fds_string = psprintf("%d", max_safe_fds); + SetConfigOption("max_files_per_process", max_safe_fds_string, + PGC_POSTMASTER, PGC_S_OVERRIDE); + pfree(max_safe_fds_string); + /* * Take off the FDs reserved for system() etc. */ @@ -2734,6 +2899,50 @@ OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode) return -1; /* failure */ } +/* + * A custom wrapper around the system() function that calls the necessary + * functions pre/post-fork. + * + * If WAIT_EVENT_RESTORE_COMMAND is passed as the wait_event_info, it will also + * call the necessary PreRestoreCommand/PostRerstoreCommand functions. It's + * unfortunate that we have to do couple the behaviour of this function so + * tighlty to the restore command logic, but it's the only way to make sure + * that we don't have additional logic inbetween the PreRestoreCommand and + * PostRestoreCommand calls. + */ +int +pg_system(const char *command, uint32 wait_event_info) +{ + int rc; + + UseOriginalOpenFileLimit(); + fflush(NULL); + pgstat_report_wait_start(wait_event_info); + + if (wait_event_info == WAIT_EVENT_RESTORE_COMMAND) + { + /* + * PreRestoreCommand() informs the SIGTERM handler for the startup + * process that it should proc_exit() right away. This is done for + * the duration of the system() call because there isn't a good way to + * break out while it is executing. Since we might call proc_exit() + * in a signal handler, it is best to put any additional logic before + * or after the PreRestoreCommand()/PostRestoreCommand() section. + */ + PreRestoreCommand(); + } + + rc = system(command); + + if (wait_event_info == WAIT_EVENT_RESTORE_COMMAND) + PostRestoreCommand(); + + pgstat_report_wait_end(); + UseCustomOpenFileLimit(); + return rc; +} + + /* * Routines that want to initiate a pipe stream should use OpenPipeStream * rather than plain popen(). This lets fd.c deal with freeing FDs if @@ -2763,6 +2972,19 @@ OpenPipeStream(const char *command, const char *mode) ReleaseLruFiles(); TryAgain: + + /* + * It would be great if we could call UseOriginalOpenFileLimit here, but + * since popen() also opens a file in the current process (this side of the + * pipe) we cannot do so safely. Because we might already have many more + * files open than the original limit. + * + * The only way to address this would be implementing a custom popen() that + * calls UseOriginalOpenFileLimit only around the actual fork call, but + * that seems too much effort to handle the corner case where this external + * command uses both select() and tries to open more files than select() + * allows for. + */ fflush(NULL); pqsignal(SIGPIPE, SIG_DFL); errno = 0; diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index b77d8e5e30e3..2d445674a1af 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -187,6 +187,7 @@ extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern bool pg_file_exists(const char *name); extern void pg_flush_data(int fd, off_t offset, off_t nbytes); +extern int pg_system(const char *command, uint32 wait_event_info); extern int pg_truncate(const char *path, off_t length); extern void fsync_fname(const char *fname, bool isdir); extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);