There's a race condition if a file changes in the source system
after we have collected the file list. If the file became larger,
we would only fetch up to its original size, which can easily result
in a truncated file. That's not a problem for relation files, files
in pg_xact, etc., because any actions on them will be replayed from
the WAL. However, configuration files are affected.

This commit mitigates the race condition by fetching small files in
full, even if they have grown. A test is added in which an extra file
included in the copy is concurrently grown with the output of
pg_rewind, guaranteeing that it changes in size during the operation.
This is not a full fix: we still believe the original file size for
files larger than 1 MB. That should be enough for configuration
files, and doing more than that would require big changes to the
chunking logic in libpq_source.c.
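
In code terms, the mitigation amounts to always requesting at least
one full chunk when a whole file is copied (a rough sketch of the
approach taken in libpq_source.c below; MAX_CHUNK_SIZE is the
existing 1 MB chunk size):

    /* Truncate the target, then request at least one full chunk so a
     * small file that grew after the directory scan is still copied in
     * its entirety. Files larger than MAX_CHUNK_SIZE are still fetched
     * only up to their originally-scanned size. */
    open_target_file(path, true);
    libpq_queue_fetch_range(source, path, 0, Max(len, MAX_CHUNK_SIZE));
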
This mitigates the race condition if the file is modified between
the original scan of files and copying the file, but there's still
a race condition if a file is changed while it's being copied.
That's a much smaller window, though, and pg_basebackup has the
same issue.

This race can be seen with pg_auto_failover, which frequently runs
ALTER SYSTEM and thereby updates postgresql.auto.conf. Often,
pg_rewind would fail because postgresql.auto.conf changed
concurrently and a partial version of it was copied to the target.
The partial file would then fail to parse, preventing the server
from starting up.

Author: Heikki Linnakangas
Reviewed-by: Cary Huang
Discussion: https://postgr.es/m/f67feb24-5833-88cb-1020-19a4a2b83ac7%40iki.fi
/* public interface functions */
static void libpq_traverse_files(rewind_source *source,
process_file_callback_t callback);
+static void libpq_queue_fetch_file(rewind_source *source, const char *path, size_t len);
static void libpq_queue_fetch_range(rewind_source *source, const char *path,
off_t off, size_t len);
static void libpq_finish_fetch(rewind_source *source);
src->common.traverse_files = libpq_traverse_files;
src->common.fetch_file = libpq_fetch_file;
+ src->common.queue_fetch_file = libpq_queue_fetch_file;
src->common.queue_fetch_range = libpq_queue_fetch_range;
src->common.finish_fetch = libpq_finish_fetch;
src->common.get_current_wal_insert_lsn = libpq_get_current_wal_insert_lsn;
PQclear(res);
}
+/*
+ * Queue up a request to fetch a file from remote system.
+ */
+static void
+libpq_queue_fetch_file(rewind_source *source, const char *path, size_t len)
+{
+ /*
+ * Truncate the target file immediately, and queue a request to fetch it
+ * from the source. If the file is small, smaller than MAX_CHUNK_SIZE,
+ * request fetching a full-sized chunk anyway, so that if the file has
+ * become larger in the source system, after we scanned the source
+ * directory, we still fetch the whole file. This only works for files up
+ * to MAX_CHUNK_SIZE, but that's good enough for small configuration files
+ * and such that are changed every now and then, but not WAL-logged. For
+ * larger files, we fetch up to the original size.
+ *
+ * Even with that mechanism, there is an inherent race condition if the
+ * file is modified at the same instant that we're copying it, so that we
+ * might copy a torn version of the file with one half from the old
+ * version and another half from the new. But pg_basebackup has the same
+ * problem, and it hasn't been a problem in practice.
+ *
+ * It might seem more natural to truncate the file later, when we receive
+ * it from the source server, but then we'd need to track which
+ * fetch-requests are for a whole file.
+ */
+ open_target_file(path, true);
+ libpq_queue_fetch_range(source, path, 0, Max(len, MAX_CHUNK_SIZE));
+}
+
/*
* Queue up a request to fetch a piece of a file from remote system.
*/
process_file_callback_t callback);
static char *local_fetch_file(rewind_source *source, const char *path,
size_t *filesize);
-static void local_fetch_file_range(rewind_source *source, const char *path,
- off_t off, size_t len);
+static void local_queue_fetch_file(rewind_source *source, const char *path,
+ size_t len);
+static void local_queue_fetch_range(rewind_source *source, const char *path,
+ off_t off, size_t len);
static void local_finish_fetch(rewind_source *source);
static void local_destroy(rewind_source *source);
src->common.traverse_files = local_traverse_files;
src->common.fetch_file = local_fetch_file;
- src->common.queue_fetch_range = local_fetch_file_range;
+ src->common.queue_fetch_file = local_queue_fetch_file;
+ src->common.queue_fetch_range = local_queue_fetch_range;
src->common.finish_fetch = local_finish_fetch;
src->common.get_current_wal_insert_lsn = NULL;
src->common.destroy = local_destroy;
return slurpFile(((local_source *) source)->datadir, path, filesize);
}
+/*
+ * Copy a file from source to target.
+ *
+ * 'len' is the expected length of the file.
+ */
+static void
+local_queue_fetch_file(rewind_source *source, const char *path, size_t len)
+{
+ const char *datadir = ((local_source *) source)->datadir;
+ PGAlignedBlock buf;
+ char srcpath[MAXPGPATH];
+ int srcfd;
+ size_t written_len;
+
+ snprintf(srcpath, sizeof(srcpath), "%s/%s", datadir, path);
+
+ /* Open source file for reading */
+ srcfd = open(srcpath, O_RDONLY | PG_BINARY, 0);
+ if (srcfd < 0)
+ pg_fatal("could not open source file \"%s\": %m",
+ srcpath);
+
+ /* Truncate and open the target file for writing */
+ open_target_file(path, true);
+
+ written_len = 0;
+ for (;;)
+ {
+ ssize_t read_len;
+
+ read_len = read(srcfd, buf.data, sizeof(buf));
+
+ if (read_len < 0)
+ pg_fatal("could not read file \"%s\": %m", srcpath);
+ else if (read_len == 0)
+ break; /* EOF reached */
+
+ write_target_range(buf.data, written_len, read_len);
+ written_len += read_len;
+ }
+
+ /*
+ * A local source is not expected to change while we're rewinding, so
+ * check that the size of the file matches our earlier expectation.
+ */
+ if (written_len != len)
+ pg_fatal("size of source file \"%s\" changed concurrently: " UINT64_FORMAT " bytes expected, " UINT64_FORMAT " copied",
+ srcpath, (uint64) len, (uint64) written_len);
+
+ if (close(srcfd) != 0)
+ pg_fatal("could not close file \"%s\": %m", srcpath);
+}
+
/*
* Copy a file from source to target, starting at 'off', for 'len' bytes.
*/
static void
-local_fetch_file_range(rewind_source *source, const char *path, off_t off,
- size_t len)
+local_queue_fetch_range(rewind_source *source, const char *path, off_t off,
+ size_t len)
{
const char *datadir = ((local_source *) source)->datadir;
PGAlignedBlock buf;
while (end - begin > 0)
{
ssize_t readlen;
- size_t len;
+ size_t thislen;
if (end - begin > sizeof(buf))
- len = sizeof(buf);
+ thislen = sizeof(buf);
else
- len = end - begin;
+ thislen = end - begin;
- readlen = read(srcfd, buf.data, len);
+ readlen = read(srcfd, buf.data, thislen);
if (readlen < 0)
pg_fatal("could not read file \"%s\": %m", srcpath);
local_finish_fetch(rewind_source *source)
{
/*
- * Nothing to do, local_fetch_file_range() copies the ranges immediately.
+ * Nothing to do, local_queue_fetch_range() copies the ranges immediately.
*/
}
break;
case FILE_ACTION_COPY:
- /* Truncate the old file out of the way, if any */
- open_target_file(entry->path, true);
- source->queue_fetch_range(source, entry->path,
- 0, entry->source_size);
+ source->queue_fetch_file(source, entry->path, entry->source_size);
break;
case FILE_ACTION_TRUNCATE:
void (*queue_fetch_range) (struct rewind_source *, const char *path,
off_t offset, size_t len);
+ /*
+ * Like queue_fetch_range(), but requests replacing the whole local file
+ * from the source system. 'len' is the expected length of the file,
+ * although when the source is a live server, the file may change
+ * concurrently. The implementation is not obliged to copy more than 'len'
+ * bytes, even if the file is larger. However, to avoid copying a
+ * truncated version of the file, which can cause trouble if e.g. a
+ * configuration file is modified concurrently, the implementation should
+ * try to copy the whole file, even if it's larger than expected.
+ */
+ void (*queue_fetch_file) (struct rewind_source *, const char *path,
+ size_t len);
+
/*
* Execute all requests queued up with queue_fetch_range().
*/
--- /dev/null
+
+# Copyright (c) 2021-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use RewindTest;
+
+RewindTest::setup_cluster("local");
+RewindTest::start_primary();
+
+# Create a test table and insert a row in primary.
+primary_psql("CREATE TABLE tbl1 (d text)");
+primary_psql("INSERT INTO tbl1 VALUES ('in primary')");
+primary_psql("CHECKPOINT");
+
+RewindTest::create_standby("local");
+
+# Insert additional data on primary that will be replicated to standby
+primary_psql("INSERT INTO tbl1 values ('in primary, before promotion')");
+primary_psql('CHECKPOINT');
+
+RewindTest::promote_standby();
+
+# Insert a row in the old primary. This causes the primary and standby to have
+# "diverged", so it's no longer possible to just apply the standby's logs over
+# the primary directory - you need to rewind. Also insert a new row in the
+# standby, which won't be present in the old primary.
+primary_psql("INSERT INTO tbl1 VALUES ('in primary, after promotion')");
+standby_psql("INSERT INTO tbl1 VALUES ('in standby, after promotion')");
+
+# Stop the nodes before running pg_rewind
+$node_standby->stop;
+$node_primary->stop;
+
+my $primary_pgdata = $node_primary->data_dir;
+my $standby_pgdata = $node_standby->data_dir;
+
+# Add an extra file that we can tamper with without interfering with the data
+# directory data files.
+mkdir "$standby_pgdata/tst_both_dir";
+append_to_file "$standby_pgdata/tst_both_dir/file1", 'a';
+
+# Run pg_rewind and pipe the output from the run into the extra file we want
+# to copy. This ensures that the file is continuously growing during the copy
+# operation, which will result in an error.
+my $ret = run_log(
+ [
+ 'pg_rewind', '--debug',
+ '--source-pgdata', $standby_pgdata,
+ '--target-pgdata', $primary_pgdata,
+ '--no-sync',
+ ],
+ '2>>', "$standby_pgdata/tst_both_dir/file1");
+ok(!$ret, 'Error out on copying growing file');
+
+# Ensure that the files are of different sizes; the final error message should
+# only be in one of them, guaranteeing that they differ.
+my $primary_size = -s "$primary_pgdata/tst_both_dir/file1";
+my $standby_size = -s "$standby_pgdata/tst_both_dir/file1";
+isnt($standby_size, $primary_size, "File sizes should differ");
+
+# Extract the last line from the verbose output as that should have the error
+# message for the unexpected file size
+my $last;
+open my $f, '<', "$standby_pgdata/tst_both_dir/file1"
+  or die "could not open file: $!";
+$last = $_ while (<$f>);
+close $f;
+like($last, qr/fatal: size of source file/, "Check error message");
+
+done_testing();