On sparc64+ext4, suppress test failures from known WAL read failure.
author Noah Misch <noah@leadboat.com>
Thu, 27 Jan 2022 02:06:19 +0000 (18:06 -0800)
committer Noah Misch <noah@leadboat.com>
Thu, 27 Jan 2022 02:06:19 +0000 (18:06 -0800)
Buildfarm members kittiwake, tadarida and snapper began to fail
frequently when commits 3cd9c3b921977272e6650a5efbeade4203c4bca2 and
f47ed79cc8a0cfa154dc7f01faaf59822552363f added tests of concurrency, but
the problem was reachable before those commits.  Back-patch to v10 (all
supported versions).

Discussion: https://postgr.es/m/20220116210241.GC756210@rfd.leadboat.com

contrib/amcheck/t/003_cic_2pc.pl
src/test/perl/PostgreSQL/Test/Utils.pm
src/test/recovery/t/027_stream_regress.pl

index dfe7f0ff3b61363e911c6af4f1bc26eb762d5c4b..2f804efb84504a39f0f222ca5c6f990eefb804a3 100644 (file)
@@ -11,6 +11,8 @@ use PostgreSQL::Test::Utils;
 
 use Test::More tests => 5;
 
+local $TODO = 'filesystem bug' if PostgreSQL::Test::Utils::has_wal_read_bug;
+
 my ($node, $result);
 
 #
index 50be10fb5af5313399d09f0cfd13296269fd61c7..3da04b3b6cc3287e1d98107c3d979246dfcacd6b 100644 (file)
@@ -351,6 +351,29 @@ sub perl2host
 
 =pod
 
+=item has_wal_read_bug()
+
+Returns true if $tmp_check is subject to a sparc64+ext4 bug that causes WAL
+readers to see zeros if another process simultaneously wrote the same offsets.
+Consult this in tests that fail frequently on affected configurations.  The
+bug has made streaming standbys fail to advance, reporting corrupt WAL.  It
+has made COMMIT PREPARED fail with "could not read two-phase state from WAL".
+Non-WAL PostgreSQL reads haven't been affected, likely because those readers
+and writers have buffering systems in common.  See
+https://postgr.es/m/20220116210241.GC756210@rfd.leadboat.com for details.
+
+=cut
+
+sub has_wal_read_bug
+{
+   # Check the perl build's OS/arch first so "df" only runs on linux/sparc.
+   my $sparc_linux = $Config{osname} eq 'linux' && $Config{archname} =~ /^sparc/;
+   return $sparc_linux unless $sparc_linux;
+   return !run_log([ 'df', '-x', 'ext4', $tmp_check ], '>', '/dev/null', '2>&1');
+}
+
+=pod
+
 =item system_log(@cmd)
 
 Run (via C<system()>) the command passed as argument; the return
index 8c0a8b6d0acb71d053b15abf7c8c262067f5b824..0596980b1a6daffe5b05c433377deca867bbc81d 100644 (file)
@@ -3,9 +3,20 @@ use strict;
 use warnings;
 use PostgreSQL::Test::Cluster;
 use PostgreSQL::Test::Utils;
-use Test::More tests => 4;
+use Test::More;
 use File::Basename;
 
+if (PostgreSQL::Test::Utils::has_wal_read_bug)
+{
+   # We'd prefer to use "local $TODO", but the bug causes this test file to
+   # die(), not merely to fail.
+   plan skip_all => 'filesystem bug';
+}
+else
+{
+   plan tests => 4;
+}
+
 # Initialize primary node
 my $node_primary = PostgreSQL::Test::Cluster->new('primary');
 $node_primary->init(allows_streaming => 1);