summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorThomas Munro2018-11-19 00:31:10 +0000
committerThomas Munro2018-11-19 04:41:26 +0000
commit9ccdd7f66e3324d2b6d3dec282cfa9ff084083f1 (patch)
treeaac67e1a7976dcad55e7dce5574412b7f5048381 /src
parent1556cb2fc5c774c3f7390dd6fb19190ee0c73f8b (diff)
PANIC on fsync() failure.
On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data. Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure. Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files. Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic. Back-patch to all supported releases. There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch. Author: Craig Ringer, with some adjustments by Thomas Munro Reported-by: Craig Ringer Reviewed-by: Robert Haas, Thomas Munro, Andres Freund Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/heap/rewriteheap.c6
-rw-r--r--src/backend/access/transam/slru.c2
-rw-r--r--src/backend/access/transam/timeline.c4
-rw-r--r--src/backend/access/transam/xlog.c2
-rw-r--r--src/backend/replication/logical/snapbuild.c3
-rw-r--r--src/backend/storage/file/fd.c48
-rw-r--r--src/backend/storage/smgr/md.c6
-rw-r--r--src/backend/utils/cache/relmapper.c2
-rw-r--r--src/backend/utils/misc/guc.c9
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample1
-rw-r--r--src/include/storage/fd.h2
11 files changed, 67 insertions, 18 deletions
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index c5db75afa1f..d5bd282f8c7 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -978,7 +978,7 @@ logical_end_heap_rewrite(RewriteState state)
while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
{
if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", src->path)));
FileClose(src->vfd);
@@ -1199,7 +1199,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
*/
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
if (pg_fsync(fd) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", path)));
pgstat_report_wait_end();
@@ -1298,7 +1298,7 @@ CheckPointLogicalRewriteHeap(void)
*/
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
if (pg_fsync(fd) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", path)));
pgstat_report_wait_end();
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 1132eef0384..fad5d363e32 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -928,7 +928,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
path, offset)));
break;
case SLRU_FSYNC_FAILED:
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not access status of transaction %u", xid),
errdetail("Could not fsync file \"%s\": %m.",
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index 61d36050c34..70eec5676eb 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -406,7 +406,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
if (pg_fsync(fd) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
pgstat_report_wait_end();
@@ -485,7 +485,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
if (pg_fsync(fd) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
pgstat_report_wait_end();
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 2875fe023af..80616c5f1e7 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3455,7 +3455,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
if (pg_fsync(fd) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
pgstat_report_wait_end();
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index a6cd6c67d16..363ddf4505e 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1629,6 +1629,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
* fsync the file before renaming so that even if we crash after this we
* have either a fully valid file or nothing.
*
+ * It's safe to just ERROR on fsync() here because we'll retry the whole
+ * operation including the writes.
+ *
* TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
* some noticeable overhead since it's performed synchronously during
* decoding?
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 2d75773ef02..827a1e2620b 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -145,6 +145,8 @@ int max_files_per_process = 1000;
*/
int max_safe_fds = 32; /* default if not changed */
+/* Whether it is safe to continue running after fsync() fails. */
+bool data_sync_retry = false;
/* Debugging.... */
@@ -430,11 +432,9 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
*/
rc = sync_file_range(fd, offset, nbytes,
SYNC_FILE_RANGE_WRITE);
-
- /* don't error out, this is just a performance optimization */
if (rc != 0)
{
- ereport(WARNING,
+ ereport(data_sync_elevel(WARNING),
(errcode_for_file_access(),
errmsg("could not flush dirty data: %m")));
}
@@ -506,7 +506,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
rc = msync(p, (size_t) nbytes, MS_ASYNC);
if (rc != 0)
{
- ereport(WARNING,
+ ereport(data_sync_elevel(WARNING),
(errcode_for_file_access(),
errmsg("could not flush dirty data: %m")));
/* NB: need to fall through to munmap()! */
@@ -562,7 +562,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
void
fsync_fname(const char *fname, bool isdir)
{
- fsync_fname_ext(fname, isdir, false, ERROR);
+ fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
}
/*
@@ -1022,7 +1022,8 @@ LruDelete(File file)
* to leak the FD than to mess up our internal state.
*/
if (close(vfdP->fd))
- elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+ elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+ "could not close file \"%s\": %m", vfdP->fileName);
vfdP->fd = VFD_CLOSED;
--nfile;
@@ -1698,7 +1699,14 @@ FileClose(File file)
{
/* close the file */
if (close(vfdP->fd))
- elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+ {
+ /*
+ * We may need to panic on failure to close non-temporary files;
+ * see LruDelete.
+ */
+ elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+ "could not close file \"%s\": %m", vfdP->fileName);
+ }
--nfile;
vfdP->fd = VFD_CLOSED;
@@ -3091,6 +3099,9 @@ looks_like_temp_rel_name(const char *name)
* harmless cases such as read-only files in the data directory, and that's
* not good either.
*
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
* Note we assume we're chdir'd into PGDATA to begin with.
*/
void
@@ -3413,3 +3424,26 @@ MakePGDirectory(const char *directoryName)
{
return mkdir(directoryName, pg_dir_create_mode);
}
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled. Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL. A later attempt to
+ * fsync again might falsely report success. Therefore we must not allow any
+ * further checkpoints to be attempted. data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
+ */
+int
+data_sync_elevel(int elevel)
+{
+ return data_sync_retry ? elevel : PANIC;
+}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 04c1069a60b..4c6a50509f8 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1012,7 +1012,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
FilePathName(v->mdfd_vfd))));
@@ -1257,7 +1257,7 @@ mdsync(void)
bms_join(new_requests, requests);
errno = save_errno;
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
path)));
@@ -1431,7 +1431,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
(errmsg("could not forward fsync request because request queue is full")));
if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
FilePathName(seg->mdfd_vfd))));
diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c
index 905867dc767..328d4aae7b7 100644
--- a/src/backend/utils/cache/relmapper.c
+++ b/src/backend/utils/cache/relmapper.c
@@ -876,7 +876,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
*/
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
if (pg_fsync(fd) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
mapfilename)));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f9074215a2d..514595699be 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1830,6 +1830,15 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
+ {
+ {"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
+ gettext_noop("Whether to continue running after a failure to sync data files."),
+ },
+ &data_sync_retry,
+ false,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 3fe257c53f1..ab063dae419 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -666,6 +666,7 @@
#exit_on_error = off # terminate session on any error?
#restart_after_crash = on # reinitialize after backend crash?
+#data_sync_retry = off # retry or panic on failure to fsync data?
#------------------------------------------------------------------------------
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 1289589a46b..cb882fb74e5 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -47,6 +47,7 @@ typedef int File;
/* GUC parameter */
extern PGDLLIMPORT int max_files_per_process;
+extern PGDLLIMPORT bool data_sync_retry;
/*
* This is private to fd.c, but exported for save/restore_backend_variables()
@@ -134,6 +135,7 @@ extern int durable_rename(const char *oldfile, const char *newfile, int loglevel
extern int durable_unlink(const char *fname, int loglevel);
extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
extern void SyncDataDirectory(void);
+extern int data_sync_elevel(int elevel);
/* Filename components */
#define PG_TEMP_FILES_DIR "pgsql_tmp"