Fix multiple crasher bugs in partitioned-table replication logic.

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 11 Jun 2021 20:12:36 +0000 (16:12 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 11 Jun 2021 20:12:41 +0000 (16:12 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 11 Jun 2021 20:12:36 +0000 (16:12 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 11 Jun 2021 20:12:41 +0000 (16:12 -0400)
diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c

index e861c0ff8029df720c1112815c9da50a390cdf1f..4930f2ca348f9bdf0d0f15108d665b8fad5599bb 100644 (file)
--- a/src/backend/replication/logical/relation.c
+++ b/src/backend/replication/logical/relation.c
@@ -620,7 +620,9 @@ logicalrep_partmap_init(void)
   * logicalrep_partition_open
   *
   * Returned entry reuses most of the values of the root table's entry, save
- * the attribute map, which can be different for the partition.
+ * the attribute map, which can be different for the partition.  However,
+ * we must physically copy all the data, in case the root table's entry
+ * gets freed/rebuilt.
   *
   * Note there's no logicalrep_partition_close, because the caller closes the
   * component relation.
@@ -656,7 +658,7 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
  
     part_entry->partoid = partOid;
  
-   /* Remote relation is used as-is from the root entry. */
+   /* Remote relation is copied as-is from the root entry. */
     entry = &part_entry->relmapentry;
     entry->remoterel.remoteid = remoterel->remoteid;
     entry->remoterel.nspname = pstrdup(remoterel->nspname);
@@ -699,7 +701,12 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
         }
     }
     else
-       entry->attrmap = attrmap;
+   {
+       /* Lacking copy_attmap, do this the hard way. */
+       entry->attrmap = make_attrmap(attrmap->maplen);
+       memcpy(entry->attrmap->attnums, attrmap->attnums,
+              attrmap->maplen * sizeof(AttrNumber));
+   }
  
     entry->updatable = root->updatable;
  
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c

index 98c26002e837501fe8f0504592b34ffdc8582564..689a66cc72ddbe0d5ca6231627347509f8f425ab 100644 (file)
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -1477,12 +1477,13 @@ apply_handle_update_internal(ApplyExecutionData *edata,
     else
     {
         /*
-        * The tuple to be updated could not be found.
+        * The tuple to be updated could not be found.  Do nothing except for
+        * emitting a log message.
          *
-        * TODO what to do here, change the log level to LOG perhaps?
+        * XXX should this be promoted to ereport(LOG) perhaps?
          */
         elog(DEBUG1,
-            "logical replication did not find row for update "
+            "logical replication did not find row to be updated "
              "in replication target relation \"%s\"",
              RelationGetRelationName(localrel));
     }
@@ -1589,9 +1590,14 @@ apply_handle_delete_internal(ApplyExecutionData *edata,
     }
     else
     {
-       /* The tuple to be deleted could not be found. */
+       /*
+        * The tuple to be deleted could not be found.  Do nothing except for
+        * emitting a log message.
+        *
+        * XXX should this be promoted to ereport(LOG) perhaps?
+        */
         elog(DEBUG1,
-            "logical replication did not find row for delete "
+            "logical replication did not find row to be deleted "
              "in replication target relation \"%s\"",
              RelationGetRelationName(localrel));
     }
@@ -1728,30 +1734,30 @@ apply_handle_tuple_routing(ApplyExecutionData *edata,
                 found = FindReplTupleInLocalRel(estate, partrel,
                                                 &part_entry->remoterel,
                                                 remoteslot_part, &localslot);
-
-               oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-               if (found)
-               {
-                   /* Apply the update.  */
-                   slot_modify_data(remoteslot_part, localslot,
-                                    part_entry,
-                                    newtup);
-                   MemoryContextSwitchTo(oldctx);
-               }
-               else
+               if (!found)
                 {
                     /*
-                    * The tuple to be updated could not be found.
+                    * The tuple to be updated could not be found.  Do nothing
+                    * except for emitting a log message.
                      *
-                    * TODO what to do here, change the log level to LOG
-                    * perhaps?
+                    * XXX should this be promoted to ereport(LOG) perhaps?
                      */
                     elog(DEBUG1,
-                        "logical replication did not find row for update "
-                        "in replication target relation \"%s\"",
+                        "logical replication did not find row to be updated "
+                        "in replication target relation's partition \"%s\"",
                          RelationGetRelationName(partrel));
+                   return;
                 }
  
+               /*
+                * Apply the update to the local tuple, putting the result in
+                * remoteslot_part.
+                */
+               oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+               slot_modify_data(remoteslot_part, localslot, part_entry,
+                                newtup);
+               MemoryContextSwitchTo(oldctx);
+
                 /*
                  * Does the updated tuple still satisfy the current
                  * partition's constraint?
diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c

index fe12d08a946daace8434a7ced030120d7e91eba1..63f108f960f754ca73bf2306ce9f186a0a9c30dd 100644 (file)
--- a/src/backend/replication/pgoutput/pgoutput.c
+++ b/src/backend/replication/pgoutput/pgoutput.c
@@ -612,8 +612,11 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
                     /* Convert tuples if needed. */
                     if (relentry->map)
                     {
-                       oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
-                       newtuple = execute_attr_map_tuple(newtuple, relentry->map);
+                       if (oldtuple)
+                           oldtuple = execute_attr_map_tuple(oldtuple,
+                                                             relentry->map);
+                       newtuple = execute_attr_map_tuple(newtuple,
+                                                         relentry->map);
                     }
                 }
  
diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl

index 1f654ee6afc37e46479ccb96647342b958bd6987..b2fdb289dbfa8fd44026bf7732d28068b4dadc00 100644 (file)
--- a/src/test/subscription/t/001_rep_changes.pl
+++ b/src/test/subscription/t/001_rep_changes.pl
@@ -6,7 +6,7 @@ use strict;
  use warnings;
  use PostgresNode;
  use TestLib;
-use Test::More tests => 28;
+use Test::More tests => 31;
  
  # Initialize publisher node
  my $node_publisher = get_new_node('publisher');
@@ -118,7 +118,7 @@ $node_publisher->safe_psql('postgres',
     "INSERT INTO tab_mixed VALUES (2, 'bar', 2.2)");
  
  $node_publisher->safe_psql('postgres',
-   "INSERT INTO tab_full_pk VALUES (1, 'foo')");
+   "INSERT INTO tab_full_pk VALUES (1, 'foo'), (2, 'baz')");
  
  $node_publisher->safe_psql('postgres',
     "INSERT INTO tab_nothing VALUES (generate_series(1,20))");
@@ -297,6 +297,38 @@ is( $result, qq(local|1.1|baz|1
  local|2.2|bar|2),
     'update works with different column order and subscriber local values');
  
+$result = $node_subscriber->safe_psql('postgres',
+   "SELECT * FROM tab_full_pk ORDER BY a");
+is( $result, qq(1|bar
+2|baz),
+   'update works with REPLICA IDENTITY FULL and a primary key');
+
+# Check that subscriber handles cases where update/delete target tuple
+# is missing.  We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber->append_conf('postgresql.conf', "log_min_messages = debug1");
+$node_subscriber->reload;
+
+$node_subscriber->safe_psql('postgres', "DELETE FROM tab_full_pk");
+
+$node_publisher->safe_psql('postgres',
+   "UPDATE tab_full_pk SET b = 'quux' WHERE a = 1");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab_full_pk WHERE a = 2");
+
+$node_publisher->wait_for_catchup('tap_sub');
+
+my $logfile = slurp_file($node_subscriber->logfile());
+ok( $logfile =~
+     qr/logical replication did not find row to be updated in replication target relation "tab_full_pk"/,
+   'update target row is missing');
+ok( $logfile =~
+     qr/logical replication did not find row to be deleted in replication target relation "tab_full_pk"/,
+   'delete target row is missing');
+
+$node_subscriber->append_conf('postgresql.conf',
+   "log_min_messages = warning");
+$node_subscriber->reload;
+
  # check behavior with toasted values
  
  $node_publisher->safe_psql('postgres',
diff --git a/src/test/subscription/t/013_partition.pl b/src/test/subscription/t/013_partition.pl

index 9de01017be5dc1769289b6ac9cc71037a93a5fb0..d0b5a55d186324de2e1d5bc127518af7a4062eb1 100644 (file)
--- a/src/test/subscription/t/013_partition.pl
+++ b/src/test/subscription/t/013_partition.pl
@@ -6,7 +6,7 @@ use strict;
  use warnings;
  use PostgresNode;
  use TestLib;
-use Test::More tests => 56;
+use Test::More tests => 62;
  
  # setup
  
@@ -347,6 +347,46 @@ $result =
    $node_subscriber2->safe_psql('postgres', "SELECT a FROM tab1 ORDER BY 1");
  is($result, qq(), 'truncate of tab1 replicated');
  
+# Check that subscriber handles cases where update/delete target tuple
+# is missing.  We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber1->append_conf('postgresql.conf',
+   "log_min_messages = debug1");
+$node_subscriber1->reload;
+
+$node_publisher->safe_psql('postgres',
+   "INSERT INTO tab1 VALUES (1, 'foo'), (4, 'bar'), (10, 'baz')");
+
+$node_publisher->wait_for_catchup('sub1');
+$node_publisher->wait_for_catchup('sub2');
+
+$node_subscriber1->safe_psql('postgres', "DELETE FROM tab1");
+
+$node_publisher->safe_psql('postgres',
+   "UPDATE tab1 SET b = 'quux' WHERE a = 4");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab1");
+
+$node_publisher->wait_for_catchup('sub1');
+$node_publisher->wait_for_catchup('sub2');
+
+my $logfile = slurp_file($node_subscriber1->logfile());
+ok( $logfile =~
+     qr/logical replication did not find row to be updated in replication target relation's partition "tab1_2_2"/,
+   'update target row is missing in tab1_2_2');
+ok( $logfile =~
+     qr/logical replication did not find row to be deleted in replication target relation "tab1_1"/,
+   'delete target row is missing in tab1_1');
+ok( $logfile =~
+     qr/logical replication did not find row to be deleted in replication target relation "tab1_2_2"/,
+   'delete target row is missing in tab1_2_2');
+ok( $logfile =~
+     qr/logical replication did not find row to be deleted in replication target relation "tab1_def"/,
+   'delete target row is missing in tab1_def');
+
+$node_subscriber1->append_conf('postgresql.conf',
+   "log_min_messages = warning");
+$node_subscriber1->reload;
+
  # Tests for replication using root table identity and schema
  
  # publisher
@@ -650,3 +690,32 @@ is( $result, qq(pub_tab2|1|xxx
  pub_tab2|3|yyy
  pub_tab2|5|zzz
  xxx_c|6|aaa), 'inserts into tab2 replicated');
+
+# Check that subscriber handles cases where update/delete target tuple
+# is missing.  We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber1->append_conf('postgresql.conf',
+   "log_min_messages = debug1");
+$node_subscriber1->reload;
+
+$node_subscriber1->safe_psql('postgres', "DELETE FROM tab2");
+
+$node_publisher->safe_psql('postgres',
+   "UPDATE tab2 SET b = 'quux' WHERE a = 5");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab2 WHERE a = 1");
+
+$node_publisher->wait_for_catchup('sub_viaroot');
+$node_publisher->wait_for_catchup('sub2');
+
+$logfile = slurp_file($node_subscriber1->logfile());
+ok( $logfile =~
+     qr/logical replication did not find row to be updated in replication target relation's partition "tab2_1"/,
+   'update target row is missing in tab2_1');
+ok( $logfile =~
+     qr/logical replication did not find row to be deleted in replication target relation "tab2_1"/,
+   'delete target row is missing in tab2_1');
+
+# No need for this until more tests are added.
+# $node_subscriber1->append_conf('postgresql.conf',
+#  "log_min_messages = warning");
+# $node_subscriber1->reload;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 11 Jun 2021 20:12:36 +0000 (16:12 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 11 Jun 2021 20:12:41 +0000 (16:12 -0400)
src/backend/replication/logical/relation.c		patch \| blob \| blame \| history
src/backend/replication/logical/worker.c		patch \| blob \| blame \| history
src/backend/replication/pgoutput/pgoutput.c		patch \| blob \| blame \| history
src/test/subscription/t/001_rep_changes.pl		patch \| blob \| blame \| history
src/test/subscription/t/013_partition.pl		patch \| blob \| blame \| history