PANIC on fsync() failure.

author Thomas Munro <tmunro@postgresql.org>

Mon, 19 Nov 2018 00:31:10 +0000 (13:31 +1300)

committer Thomas Munro <tmunro@postgresql.org>

Mon, 19 Nov 2018 04:41:26 +0000 (17:41 +1300)
author Thomas Munro <tmunro@postgresql.org>
Mon, 19 Nov 2018 00:31:10 +0000 (13:31 +1300)
committer Thomas Munro <tmunro@postgresql.org>
Mon, 19 Nov 2018 04:41:26 +0000 (17:41 +1300)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index 0f8f2ef920dd23e165ba58483e6905e041f1d6a3..c4effa034c12e6d8831bbbc88635f612e3fa9282 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8161,6 +8161,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
        </listitem>
       </varlistentry>
  
+     <varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
+      <term><varname>data_sync_retry</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>data_sync_retry</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When set to off, which is the default, <productname>PostgreSQL</productname>
+        will raise a PANIC-level error on failure to flush modified data files
+        to the file system.  This causes the database server to crash.
+       </para>
+       <para>
+        On some operating systems, the status of data in the kernel's page
+        cache is unknown after a write-back failure.  In some cases it might
+        have been entirely forgotten, making it unsafe to retry; the second
+        attempt may be reported as successful, when in fact the data has been
+        lost.  In these circumstances, the only way to avoid data loss is to
+        recover from the WAL after any failure is reported, preferably
+        after investigating the root cause of the failure and replacing any
+        faulty hardware.
+       </para>
+       <para>
+        If set to on, <productname>PostgreSQL</productname> will instead
+        report an error but continue to run so that the data flushing
+        operation can be retried in a later checkpoint.  Only set it to on
+        after investigating the operating system's treatment of buffered data
+        in case of write-back failure.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
  
     </sect1>
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c

index c5db75afa1fe02d75b513596c044c97d339bef7f..d5bd282f8c771f33fa38bd09a5c9eadab4b19458 100644 (file)
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -978,7 +978,7 @@ logical_end_heap_rewrite(RewriteState state)
     while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
     {
         if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0)
-           ereport(ERROR,
+           ereport(data_sync_elevel(ERROR),
                     (errcode_for_file_access(),
                      errmsg("could not fsync file \"%s\": %m", src->path)));
         FileClose(src->vfd);
@@ -1199,7 +1199,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
      */
     pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
     if (pg_fsync(fd) != 0)
-       ereport(ERROR,
+       ereport(data_sync_elevel(ERROR),
                 (errcode_for_file_access(),
                  errmsg("could not fsync file \"%s\": %m", path)));
     pgstat_report_wait_end();
@@ -1298,7 +1298,7 @@ CheckPointLogicalRewriteHeap(void)
              */
             pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
             if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                         (errcode_for_file_access(),
                          errmsg("could not fsync file \"%s\": %m", path)));
             pgstat_report_wait_end();
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c

index 1132eef0384a0767b056cfb9c1339e88bdd72869..fad5d363e32ff9d636371ea74f71b1808d9b8ab3 100644 (file)
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -928,7 +928,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
                                path, offset)));
             break;
         case SLRU_FSYNC_FAILED:
-           ereport(ERROR,
+           ereport(data_sync_elevel(ERROR),
                     (errcode_for_file_access(),
                      errmsg("could not access status of transaction %u", xid),
                      errdetail("Could not fsync file \"%s\": %m.",
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c

index 61d36050c34212d5c289e0d9c05ef05613668c5a..70eec5676eb299977590f58140c20c63025fcd24 100644 (file)
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -406,7 +406,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
  
     pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
     if (pg_fsync(fd) != 0)
-       ereport(ERROR,
+       ereport(data_sync_elevel(ERROR),
                 (errcode_for_file_access(),
                  errmsg("could not fsync file \"%s\": %m", tmppath)));
     pgstat_report_wait_end();
@@ -485,7 +485,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
  
     pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
     if (pg_fsync(fd) != 0)
-       ereport(ERROR,
+       ereport(data_sync_elevel(ERROR),
                 (errcode_for_file_access(),
                  errmsg("could not fsync file \"%s\": %m", tmppath)));
     pgstat_report_wait_end();
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index 2875fe023aff9da68ddd451f2fa5def2a8fb131e..80616c5f1e739adf00264c9fd9df220f0185fe47 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3455,7 +3455,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
  
     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
     if (pg_fsync(fd) != 0)
-       ereport(ERROR,
+       ereport(data_sync_elevel(ERROR),
                 (errcode_for_file_access(),
                  errmsg("could not fsync file \"%s\": %m", tmppath)));
     pgstat_report_wait_end();
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c

index a6cd6c67d16d8ad24bb966fbcc16f944182eed90..363ddf4505ef63ed9466a7eaa2167e2525dc65b3 100644 (file)
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1629,6 +1629,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
      * fsync the file before renaming so that even if we crash after this we
      * have either a fully valid file or nothing.
      *
+    * It's safe to just ERROR (not PANIC) on fsync() failure here because
+    * we'll retry the whole operation, including the writes.
+    *
      * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
      * some noticeable overhead since it's performed synchronously during
      * decoding?
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c

index 2d75773ef02b5e95b10b875d61c7026a90590f06..827a1e2620b4a1b4a5ec0045e784533415ca6fc5 100644 (file)
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -145,6 +145,8 @@ int         max_files_per_process = 1000;
   */
  int            max_safe_fds = 32;  /* default if not changed */
  
+/* Whether it is safe to continue running after fsync() fails. */
+bool       data_sync_retry = false;
  
  /* Debugging.... */
  
@@ -430,11 +432,9 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
          */
         rc = sync_file_range(fd, offset, nbytes,
                              SYNC_FILE_RANGE_WRITE);
-
-       /* don't error out, this is just a performance optimization */
         if (rc != 0)
         {
-           ereport(WARNING,
+           ereport(data_sync_elevel(WARNING),
                     (errcode_for_file_access(),
                      errmsg("could not flush dirty data: %m")));
         }
@@ -506,7 +506,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
             rc = msync(p, (size_t) nbytes, MS_ASYNC);
             if (rc != 0)
             {
-               ereport(WARNING,
+               ereport(data_sync_elevel(WARNING),
                         (errcode_for_file_access(),
                          errmsg("could not flush dirty data: %m")));
                 /* NB: need to fall through to munmap()! */
@@ -562,7 +562,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
  void
  fsync_fname(const char *fname, bool isdir)
  {
-   fsync_fname_ext(fname, isdir, false, ERROR);
+   fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
  }
  
  /*
@@ -1022,7 +1022,8 @@ LruDelete(File file)
      * to leak the FD than to mess up our internal state.
      */
     if (close(vfdP->fd))
-       elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+       elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+            "could not close file \"%s\": %m", vfdP->fileName);
     vfdP->fd = VFD_CLOSED;
     --nfile;
  
@@ -1698,7 +1699,14 @@ FileClose(File file)
     {
         /* close the file */
         if (close(vfdP->fd))
-           elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+       {
+           /*
+            * We may need to panic on failure to close non-temporary files;
+            * see LruDelete.
+            */
+           elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+               "could not close file \"%s\": %m", vfdP->fileName);
+       }
  
         --nfile;
         vfdP->fd = VFD_CLOSED;
@@ -3091,6 +3099,9 @@ looks_like_temp_rel_name(const char *name)
   * harmless cases such as read-only files in the data directory, and that's
   * not good either.
   *
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
   * Note we assume we're chdir'd into PGDATA to begin with.
   */
  void
@@ -3413,3 +3424,26 @@ MakePGDirectory(const char *directoryName)
  {
     return mkdir(directoryName, pg_dir_create_mode);
  }
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled.  Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL.  A later attempt to
+ * fsync again might falsely report success.  Therefore we must not allow any
+ * further checkpoints to be attempted.  data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
+ */
+int
+data_sync_elevel(int elevel)
+{
+   return data_sync_retry ? elevel : PANIC;
+}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index 04c1069a60b994f1851210c975cc3bb731648a36..4c6a50509f8fa7c11c1101449269746585522599 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1012,7 +1012,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
         MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
  
         if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
-           ereport(ERROR,
+           ereport(data_sync_elevel(ERROR),
                     (errcode_for_file_access(),
                      errmsg("could not fsync file \"%s\": %m",
                             FilePathName(v->mdfd_vfd))));
@@ -1257,7 +1257,7 @@ mdsync(void)
                             bms_join(new_requests, requests);
  
                         errno = save_errno;
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                 (errcode_for_file_access(),
                                  errmsg("could not fsync file \"%s\": %m",
                                         path)));
@@ -1431,7 +1431,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
                 (errmsg("could not forward fsync request because request queue is full")));
  
         if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
-           ereport(ERROR,
+           ereport(data_sync_elevel(ERROR),
                     (errcode_for_file_access(),
                      errmsg("could not fsync file \"%s\": %m",
                             FilePathName(seg->mdfd_vfd))));
diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c

index 905867dc767f70f57b173b40cba4e439a400285b..328d4aae7b75faf8901119951091765fc33b56c1 100644 (file)
--- a/src/backend/utils/cache/relmapper.c
+++ b/src/backend/utils/cache/relmapper.c
@@ -876,7 +876,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
      */
     pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
     if (pg_fsync(fd) != 0)
-       ereport(ERROR,
+       ereport(data_sync_elevel(ERROR),
                 (errcode_for_file_access(),
                  errmsg("could not fsync file \"%s\": %m",
                         mapfilename)));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index f9074215a2d6ea7ba528ddb8980cb0cac3b10bbd..514595699beb227a4aab145bbe89c2c379905d6e 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1830,6 +1830,15 @@ static struct config_bool ConfigureNamesBool[] =
         NULL, NULL, NULL
     },
  
+   {
+       {"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
+           gettext_noop("Whether to continue running after a failure to sync data files."),
+       },
+       &data_sync_retry,
+       false,
+       NULL, NULL, NULL
+   },
+
     /* End-of-list marker */
     {
         {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample

index 3fe257c53f1a9341548183b0fe8f4f48ea286e75..ab063dae4193bb7cc62010f986463319c2b9e8a0 100644 (file)
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -666,6 +666,7 @@
  
  #exit_on_error = off           # terminate session on any error?
  #restart_after_crash = on      # reinitialize after backend crash?
+#data_sync_retry = off         # retry or panic on failure to fsync data?
  
  
  #------------------------------------------------------------------------------
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h

index 1289589a46b4a5c776eb421b0ddf86ba0f3f2e79..cb882fb74e57a3df4a8162c247c7534bd066e758 100644 (file)
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -47,6 +47,7 @@ typedef int File;
  
  /* GUC parameter */
  extern PGDLLIMPORT int max_files_per_process;
+extern PGDLLIMPORT bool data_sync_retry;
  
  /*
   * This is private to fd.c, but exported for save/restore_backend_variables()
@@ -134,6 +135,7 @@ extern int  durable_rename(const char *oldfile, const char *newfile, int loglevel
  extern int durable_unlink(const char *fname, int loglevel);
  extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
  extern void SyncDataDirectory(void);
+extern int data_sync_elevel(int elevel);
  
  /* Filename components */
  #define PG_TEMP_FILES_DIR "pgsql_tmp"
author	Thomas Munro <tmunro@postgresql.org>
	Mon, 19 Nov 2018 00:31:10 +0000 (13:31 +1300)
committer	Thomas Munro <tmunro@postgresql.org>
	Mon, 19 Nov 2018 04:41:26 +0000 (17:41 +1300)
doc/src/sgml/config.sgml		patch \| blob \| blame \| history
src/backend/access/heap/rewriteheap.c		patch \| blob \| blame \| history
src/backend/access/transam/slru.c		patch \| blob \| blame \| history
src/backend/access/transam/timeline.c		patch \| blob \| blame \| history
src/backend/access/transam/xlog.c		patch \| blob \| blame \| history
src/backend/replication/logical/snapbuild.c		patch \| blob \| blame \| history
src/backend/storage/file/fd.c		patch \| blob \| blame \| history
src/backend/storage/smgr/md.c		patch \| blob \| blame \| history
src/backend/utils/cache/relmapper.c		patch \| blob \| blame \| history
src/backend/utils/misc/guc.c		patch \| blob \| blame \| history
src/backend/utils/misc/postgresql.conf.sample		patch \| blob \| blame \| history
src/include/storage/fd.h		patch \| blob \| blame \| history