* XLOG_BLCKSZ, and make sure it's at least 5*Max(BLCKSZ, XLOG_BLCKSZ) to start
* with. (That is enough for all "normal" records, but very large commit or
* abort records might need more space.)
+ *
+ * Note: This routine should *never* be called with a length taken from
+ * xl_tot_len until the header of the record has been fully validated.
*/
static bool
allocate_recordbuf(XLogReaderState *state, uint32 reclength)
newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ));
-#ifndef FRONTEND
-
- /*
- * Note that in much unlucky circumstances, the random data read from a
- * recycled segment can cause this routine to be called with a size
- * causing a hard failure at allocation. For a standby, this would cause
- * the instance to stop suddenly with a hard failure, preventing it to
- * retry fetching WAL from one of its sources which could allow it to move
- * on with replay without a manual restart. If the data comes from a past
- * recycled segment and is still valid, then the allocation may succeed
- * but record checks are going to fail so this would be short-lived. If
- * the allocation fails because of a memory shortage, then this is not a
- * hard failure either per the guarantee given by MCXT_ALLOC_NO_OOM.
- */
- if (!AllocSizeIsValid(newSize))
- return false;
-
-#endif
-
if (state->readRecordBuf)
pfree(state->readRecordBuf);
state->readRecordBuf =
}
else
{
- /* XXX: more validation should be done here */
- if (total_len < SizeOfXLogRecord)
- {
- report_invalid_record(state,
- "invalid record length at %X/%X: expected at least %u, got %u",
- LSN_FORMAT_ARGS(RecPtr),
- (uint32) SizeOfXLogRecord, total_len);
- goto err;
- }
+ /* We'll validate the header once we have the next page. */
gotheader = false;
}
/*
- * Find space to decode this record. Don't allow oversized allocation if
- * the caller requested nonblocking. Otherwise, we *have* to try to
- * decode the record now because the caller has nothing else to do, so
- * allow an oversized record to be palloc'd if that turns out to be
- * necessary.
+ * Try to find space to decode this record, if we can do so without
+ * calling palloc. If we can't, we'll try again below after we've
+ * validated that total_len isn't garbage bytes from a recycled WAL page.
*/
decoded = XLogReadRecordAlloc(state,
total_len,
- !nonblocking /* allow_oversized */ );
- if (decoded == NULL)
+ false /* allow_oversized */ );
+ if (decoded == NULL && nonblocking)
{
/*
- * There is no space in the decode buffer. The caller should help
- * with that problem by consuming some records.
+ * There is no space in the circular decode buffer, and the caller is
+ * only reading ahead. The caller should consume existing records to
+ * make space.
*/
- if (nonblocking)
- return XLREAD_WOULDBLOCK;
-
- /* We failed to allocate memory for an oversized record. */
- report_invalid_record(state,
- "out of memory while trying to decode a record of length %u", total_len);
- goto err;
+ return XLREAD_WOULDBLOCK;
}
len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ;
assembled = true;
/*
- * Enlarge readRecordBuf as needed.
+ * We always have space for a couple of pages, enough to validate a
+ * boundary-spanning record header.
*/
- if (total_len > state->readRecordBufSize &&
- !allocate_recordbuf(state, total_len))
- {
- /* We treat this as a "bogus data" condition */
- report_invalid_record(state, "record length %u at %X/%X too long",
- total_len, LSN_FORMAT_ARGS(RecPtr));
- goto err;
- }
+ Assert(state->readRecordBufSize >= XLOG_BLCKSZ * 2);
+ Assert(state->readRecordBufSize >= len);
/* Copy the first fragment of the record from the first page. */
memcpy(state->readRecordBuf,
goto err;
gotheader = true;
}
- } while (gotlen < total_len);
+ /*
+ * We might need a bigger buffer. We have validated the record
+ * header, in the case that it split over a page boundary. We've
+ * also cross-checked total_len against xlp_rem_len on the second
+ * page, and verified xlp_pageaddr on both.
+ */
+ if (total_len > state->readRecordBufSize)
+ {
+ char save_copy[XLOG_BLCKSZ * 2];
+
+ /*
+ * Save and restore the data we already had. It can't be more
+ * than two pages.
+ */
+ Assert(gotlen <= lengthof(save_copy));
+ Assert(gotlen <= state->readRecordBufSize);
+ memcpy(save_copy, state->readRecordBuf, gotlen);
+ if (!allocate_recordbuf(state, total_len))
+ {
+ /* We treat this as a "bogus data" condition */
+ report_invalid_record(state, "record length %u at %X/%X too long",
+ total_len, LSN_FORMAT_ARGS(RecPtr));
+ goto err;
+ }
+ memcpy(state->readRecordBuf, save_copy, gotlen);
+ buffer = state->readRecordBuf + gotlen;
+ }
+ } while (gotlen < total_len);
Assert(gotheader);
record = (XLogRecord *) state->readRecordBuf;
state->NextRecPtr -= XLogSegmentOffset(state->NextRecPtr, state->segcxt.ws_segsize);
}
+ /*
+ * If we got here without a DecodedXLogRecord, it means we needed to
+ * validate total_len before trusting it, but by now we've done that.
+ */
+ if (decoded == NULL)
+ {
+ Assert(!nonblocking);
+ decoded = XLogReadRecordAlloc(state,
+ total_len,
+ true /* allow_oversized */ );
+ if (decoded == NULL)
+ {
+ /*
+ * We failed to allocate memory for an oversized record. As
+ * above, we currently treat this as a "bogus data" condition.
+ */
+ report_invalid_record(state,
+ "out of memory while trying to decode a record of length %u", total_len);
+ goto err;
+ }
+ }
+
if (DecodeXLogRecord(state, decoded, record, RecPtr, &errormsg))
{
/* Record the location of the next record. */
state->decode_queue_head = decoded;
return XLREAD_SUCCESS;
}
- else
- return XLREAD_FAIL;
err:
if (assembled)
--- /dev/null
+# Copyright (c) 2023, PostgreSQL Global Development Group
+#
+# Test detecting end-of-WAL conditions. This test suite generates
+# fake defective page and record headers to trigger various failure
+# scenarios.
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use Fcntl qw(SEEK_SET);
+
+use integer; # causes / operator to use integer math
+
+# Size of a WAL record header (matches SizeOfXLogRecord in the server code).
+my $RECORD_HEADER_SIZE = 24;
+
+# Constants retrieved from the server header files.
+my @scan_result = scan_server_header('access/xlog_internal.h',
+ '#define\s+XLOG_PAGE_MAGIC\s+(\w+)');
+my $XLP_PAGE_MAGIC = hex($scan_result[0]);
+@scan_result = scan_server_header('access/xlog_internal.h',
+ '#define\s+XLP_FIRST_IS_CONTRECORD\s+(\w+)');
+my $XLP_FIRST_IS_CONTRECORD = hex($scan_result[0]);
+
+# Values queried from the server
+my $WAL_SEGMENT_SIZE;
+my $WAL_BLOCK_SIZE;
+my $TLI;
+
+# Build path of a WAL segment.
+sub wal_segment_path
+{
+ my $node = shift;
+ my $tli = shift;
+ my $segment = shift;
+ my $wal_path =
+ sprintf("%s/pg_wal/%08X%08X%08X", $node->data_dir, $tli, 0, $segment);
+ return $wal_path;
+}
+
+# Calculate the segment number and offset within that segment for a given
+# LSN (in bytes).
+sub lsn_to_segment_and_offset
+{
+ my $lsn = shift;
+ return ($lsn / $WAL_SEGMENT_SIZE, $lsn % $WAL_SEGMENT_SIZE);
+}
+
+# Write some arbitrary data into the WAL segment containing the given LSN.
+# This should be called while the cluster is not running.
+sub write_wal
+{
+ my $node = shift;
+ my $tli = shift;
+ my $lsn = shift;
+ my $data = shift;
+
+ my ($segment, $offset) = lsn_to_segment_and_offset($lsn);
+ my $path = wal_segment_path($node, $tli, $segment);
+
+ open my $fh, "+<:raw", $path or die;
+ seek($fh, $offset, SEEK_SET) or die;
+ print $fh $data;
+ close $fh;
+}
+
+# Emit a WAL record of arbitrary size. Returns the end LSN of the
+# record inserted, in bytes.
+sub emit_message
+{
+ my $node = shift;
+ my $size = shift;
+ return int(
+ $node->safe_psql(
+ 'postgres',
+ "SELECT pg_logical_emit_message(true, '', repeat('a', $size)) - '0/0'"
+ ));
+}
+
+# Get the current insert LSN of a node, in bytes.
+sub get_insert_lsn
+{
+ my $node = shift;
+ return int(
+ $node->safe_psql(
+ 'postgres', "SELECT pg_current_wal_insert_lsn() - '0/0'"));
+}
+
+# Get GUC value, converted to an int.
+sub get_int_setting
+{
+ my $node = shift;
+ my $name = shift;
+ return int(
+ $node->safe_psql(
+ 'postgres',
+ "SELECT setting FROM pg_settings WHERE name = '$name'"));
+}
+
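+# Round an LSN down to the start of the WAL page that contains it.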
+sub start_of_page
+{
+ my $lsn = shift;
+ return $lsn & ~($WAL_BLOCK_SIZE - 1);
+}
+
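+# Return the LSN of the start of the page following the one that contains
+# the given LSN.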
+sub start_of_next_page
+{
+ my $lsn = shift;
+ return start_of_page($lsn) + $WAL_BLOCK_SIZE;
+}
+
+# Build a fake WAL record header from the data given by the caller.
+# The result follows the format of the C structure XLogRecord and is
+# meant to be written to WAL with write_wal().
+sub build_record_header
+{
+ my $xl_tot_len = shift;
+ my $xl_xid = shift || 0;
+ my $xl_prev = shift || 0;
+ my $xl_info = shift || 0;
+ my $xl_rmid = shift || 0;
+ my $xl_crc = shift || 0;
+
+ # This needs to follow the structure XLogRecord:
+ # I for xl_tot_len
+ # I for xl_xid
+ # Q for xl_prev
+ # C for xl_info
+ # C for xl_rmid
+ # BB for two bytes of padding
+ # I for xl_crc
+ return pack("IIQCCBBI",
+ $xl_tot_len, $xl_xid, $xl_prev, $xl_info, $xl_rmid, 0, 0, $xl_crc);
+}
+
+# Build a fake WAL page header from the data given by the caller.
+# The result follows the format of the C structure XLogPageHeaderData and
+# is meant to be written to WAL with write_wal().
+sub build_page_header
+{
+ my $xlp_magic = shift;
+ my $xlp_info = shift || 0;
+ my $xlp_tli = shift || 0;
+ my $xlp_pageaddr = shift || 0;
+ my $xlp_rem_len = shift || 0;
+
+ # This needs to follow the structure XLogPageHeaderData:
+ # S for xlp_magic
+ # S for xlp_info
+ # I for xlp_tli
+ # Q for xlp_pageaddr
+ # I for xlp_rem_len
+ return pack("SSIQI",
+ $xlp_magic, $xlp_info, $xlp_tli, $xlp_pageaddr, $xlp_rem_len);
+}
+
+# Make sure we are far enough away from the end of a page that we could
+# insert a couple of small records.  This emits fixed-size records until the
+# insert position is no longer within the last quarter of the current WAL
+# page.
+sub advance_out_of_record_splitting_zone
+{
+ my $node = shift;
+
+ my $page_threshold = $WAL_BLOCK_SIZE / 4;
+ my $end_lsn = get_insert_lsn($node);
+ my $page_offset = $end_lsn % $WAL_BLOCK_SIZE;
+ while ($page_offset >= $WAL_BLOCK_SIZE - $page_threshold)
+ {
+ emit_message($node, $page_threshold);
+ $end_lsn = get_insert_lsn($node);
+ $page_offset = $end_lsn % $WAL_BLOCK_SIZE;
+ }
+ return $end_lsn;
+}
+
+# Advance so close to the end of a page that a complete record header
+# ($RECORD_HEADER_SIZE bytes) would no longer fit on it.
+sub advance_to_record_splitting_zone
+{
+ my $node = shift;
+
+ my $end_lsn = get_insert_lsn($node);
+ my $page_offset = $end_lsn % $WAL_BLOCK_SIZE;
+
+ # Get fairly close to the end of a page in big steps
+ while ($page_offset <= $WAL_BLOCK_SIZE - 512)
+ {
+ emit_message($node, $WAL_BLOCK_SIZE - $page_offset - 256);
+ $end_lsn = get_insert_lsn($node);
+ $page_offset = $end_lsn % $WAL_BLOCK_SIZE;
+ }
+
+ # Calibrate our message size so that we can creep closer to the end of
+ # the page 8 bytes at a time.
+ my $message_size = $WAL_BLOCK_SIZE - 80;
+ while ($page_offset <= $WAL_BLOCK_SIZE - $RECORD_HEADER_SIZE)
+ {
+ emit_message($node, $message_size);
+ $end_lsn = get_insert_lsn($node);
+
+ my $old_offset = $page_offset;
+ $page_offset = $end_lsn % $WAL_BLOCK_SIZE;
+
+ # Adjust the message size until each message advances the offset by
+ # exactly 8 bytes, fine-grained enough to split a record header.
+ my $delta = $page_offset - $old_offset;
+ if ($delta > 8)
+ {
+ $message_size -= 8;
+ }
+ elsif ($delta <= 0)
+ {
+ $message_size += 8;
+ }
+ }
+ return $end_lsn;
+}
+
+# Set up a new node.  The configuration chosen here minimizes the number
+# of arbitrary records that could get generated in the cluster.  Enlarging
+# checkpoint_timeout avoids noise from checkpoint activity, wal_level set
+# to "minimal" avoids random standby snapshot records, and autovacuum is
+# turned off because it could fire at random times and write WAL of its own.
+my $node = PostgreSQL::Test::Cluster->new("node");
+$node->init;
+$node->append_conf(
+ 'postgresql.conf',
+ q[wal_level = minimal
+ autovacuum = off
+ checkpoint_timeout = '30min'
+]);
+$node->start;
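+# Create a table to put some real WAL records into the cluster.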
+$node->safe_psql('postgres', "CREATE TABLE t AS SELECT 42");
+
+$WAL_SEGMENT_SIZE = get_int_setting($node, 'wal_segment_size');
+$WAL_BLOCK_SIZE = get_int_setting($node, 'wal_block_size');
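+# Current timeline, used to build the names of WAL segment files.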
+$TLI = $node->safe_psql('postgres',
+ "SELECT timeline_id FROM pg_control_checkpoint();");
+
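+# LSNs tracked across the individual test scenarios below.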
+my $end_lsn;
+my $prev_lsn;
+
+###########################################################################
+note "Single-page end-of-WAL detection";
+###########################################################################
+
+# xl_tot_len is 0 (a common case, we hit trailing zeroes).
+emit_message($node, 0);
+$end_lsn = advance_out_of_record_splitting_zone($node);
+$node->stop('immediate');
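+# Remember the current log size, so that the check below only looks at
+# output written after this point.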
+my $log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "invalid record length at .*: expected at least 24, got 0", $log_size
+ ),
+ "xl_tot_len zero");
+
+# xl_tot_len is < 24 (presumably recycled garbage).
+emit_message($node, 0);
+$end_lsn = advance_out_of_record_splitting_zone($node);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn, build_record_header(23));
+$log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "invalid record length at .*: expected at least 24, got 23",
+ $log_size),
+ "xl_tot_len short");
+
+# Need more pages, but xl_prev check fails first.
+emit_message($node, 0);
+$end_lsn = advance_out_of_record_splitting_zone($node);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 0, 0xdeadbeef));
+$log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "record with incorrect prev-link 0/DEADBEEF at .*", $log_size),
+ "xl_prev bad");
+
+# xl_crc check fails.
+emit_message($node, 0);
+advance_out_of_record_splitting_zone($node);
+$end_lsn = emit_message($node, 10);
+$node->stop('immediate');
+# Corrupt a byte in that record, breaking its CRC.
+write_wal($node, $TLI, $end_lsn - 8, '!');
+$log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "incorrect resource manager data checksum in record at .*", $log_size
+ ),
+ "xl_crc bad");
+
+
+###########################################################################
+note "Multi-page end-of-WAL detection, header is not split";
+###########################################################################
+
+# This series of tests requires a valid xl_prev set in the record header
+# written to WAL.
+
+# Good xl_prev, we hit zero page next (zero magic).
+emit_message($node, 0);
+$prev_lsn = advance_out_of_record_splitting_zone($node);
+$end_lsn = emit_message($node, 0);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 0, $prev_lsn));
+$log_size = -s $node->logfile;
+$node->start;
+ok($node->log_contains("invalid magic number 0000 .* LSN .*", $log_size),
+ "xlp_magic zero");
+
+# Good xl_prev, we hit garbage page next (bad magic).
+emit_message($node, 0);
+$prev_lsn = advance_out_of_record_splitting_zone($node);
+$end_lsn = emit_message($node, 0);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 0, $prev_lsn));
+write_wal(
+ $node, $TLI,
+ start_of_next_page($end_lsn),
+ build_page_header(0xcafe, 0, 1, 0));
+$log_size = -s $node->logfile;
+$node->start;
+ok($node->log_contains("invalid magic number CAFE .* LSN .*", $log_size),
+ "xlp_magic bad");
+
+# Good xl_prev, we hit a typical recycled page (good xlp_magic, bad
+# xlp_pageaddr).
+emit_message($node, 0);
+$prev_lsn = advance_out_of_record_splitting_zone($node);
+$end_lsn = emit_message($node, 0);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 0, $prev_lsn));
+write_wal(
+ $node, $TLI,
+ start_of_next_page($end_lsn),
+ build_page_header($XLP_PAGE_MAGIC, 0, 1, 0xbaaaaaad));
+$log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "unexpected pageaddr 0/BAAAAAAD in .*, LSN .*,", $log_size),
+ "xlp_pageaddr bad");
+
+# Good xl_prev, xlp_magic, xlp_pageaddr, but bogus xlp_info.
+emit_message($node, 0);
+$prev_lsn = advance_out_of_record_splitting_zone($node);
+$end_lsn = emit_message($node, 0);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 42, $prev_lsn));
+write_wal(
+ $node, $TLI,
+ start_of_next_page($end_lsn),
+ build_page_header(
+ $XLP_PAGE_MAGIC, 0x1234, 1, start_of_next_page($end_lsn)));
+$log_size = -s $node->logfile;
+$node->start;
+ok($node->log_contains("invalid info bits 1234 in .*, LSN .*,", $log_size),
+ "xlp_info bad");
+
+# Good xl_prev, xlp_magic, xlp_pageaddr, but xlp_info does not have
+# XLP_FIRST_IS_CONTRECORD set.
+emit_message($node, 0);
+$prev_lsn = advance_out_of_record_splitting_zone($node);
+$end_lsn = emit_message($node, 0);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 42, $prev_lsn));
+write_wal(
+ $node, $TLI,
+ start_of_next_page($end_lsn),
+ build_page_header($XLP_PAGE_MAGIC, 0, 1, start_of_next_page($end_lsn)));
+$log_size = -s $node->logfile;
+$node->start;
+ok($node->log_contains("there is no contrecord flag at .*", $log_size),
+ "xlp_info lacks XLP_FIRST_IS_CONTRECORD");
+
+# Good xl_prev, xlp_magic, xlp_pageaddr and xlp_info, but xlp_rem_len doesn't
+# add up.
+emit_message($node, 0);
+$prev_lsn = advance_out_of_record_splitting_zone($node);
+$end_lsn = emit_message($node, 0);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 42, $prev_lsn));
+write_wal(
+ $node, $TLI,
+ start_of_next_page($end_lsn),
+ build_page_header(
+ $XLP_PAGE_MAGIC, $XLP_FIRST_IS_CONTRECORD,
+ 1, start_of_next_page($end_lsn),
+ 123456));
+$log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "invalid contrecord length 123456 .* at .*", $log_size),
+ "xlp_rem_len bad");
+
+
+###########################################################################
+note "Multi-page, but header is split, so page checks are done first";
+###########################################################################
+
+# xl_prev is bad and xl_tot_len is too big, but we'll check xlp_magic first.
+emit_message($node, 0);
+$end_lsn = advance_to_record_splitting_zone($node);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 0, 0xdeadbeef));
+$log_size = -s $node->logfile;
+$node->start;
+ok($node->log_contains("invalid magic number 0000 .* LSN .*", $log_size),
+ "xlp_magic zero (split record header)");
+
+# And we'll also check xlp_pageaddr before any header checks.
+emit_message($node, 0);
+$end_lsn = advance_to_record_splitting_zone($node);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 0, 0xdeadbeef));
+write_wal(
+ $node, $TLI,
+ start_of_next_page($end_lsn),
+ build_page_header(
+ $XLP_PAGE_MAGIC, $XLP_FIRST_IS_CONTRECORD, 1, 0xbaaaaaad));
+$log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "unexpected pageaddr 0/BAAAAAAD in .*, LSN .*,", $log_size),
+ "xlp_pageaddr bad (split record header)");
+
+# We'll also discover that xlp_rem_len doesn't add up before any
+# header checks.
+emit_message($node, 0);
+$end_lsn = advance_to_record_splitting_zone($node);
+$node->stop('immediate');
+write_wal($node, $TLI, $end_lsn,
+ build_record_header(2 * 1024 * 1024 * 1024, 0, 0xdeadbeef));
+write_wal(
+ $node, $TLI,
+ start_of_next_page($end_lsn),
+ build_page_header(
+ $XLP_PAGE_MAGIC, $XLP_FIRST_IS_CONTRECORD,
+ 1, start_of_next_page($end_lsn),
+ 123456));
+$log_size = -s $node->logfile;
+$node->start;
+ok( $node->log_contains(
+ "invalid contrecord length 123456 .* at .*", $log_size),
+ "xlp_rem_len bad (split record header)");
+
+done_testing();