Logical replication support for initial data copy
authorPeter Eisentraut <peter_e@gmx.net>
Thu, 23 Mar 2017 12:36:36 +0000 (08:36 -0400)
committerPeter Eisentraut <peter_e@gmx.net>
Thu, 23 Mar 2017 12:55:37 +0000 (08:55 -0400)
Add functionality for a new subscription to copy the initial data in the
tables and then sync with the ongoing apply process.

For the copying, add a new internal COPY option to have the COPY source
data provided by a callback function.  The initial data copy works on
the subscriber by receiving COPY data from the publisher and then
providing it locally into a COPY that writes to the destination table.

A WAL receiver can now execute full SQL commands.  This is used here to
obtain information about tables and publications.

Several new options were added to CREATE and ALTER SUBSCRIPTION to
control whether and when initial table syncing happens.

Change pg_dump option --no-create-subscription-slots to
--no-subscription-connect and use the new CREATE SUBSCRIPTION
... NOCONNECT option for that.

Author: Petr Jelinek <petr.jelinek@2ndquadrant.com>
Tested-by: Erik Rijkers <er@xs4all.nl>
62 files changed:
contrib/file_fdw/file_fdw.c
doc/src/sgml/catalogs.sgml
doc/src/sgml/config.sgml
doc/src/sgml/logical-replication.sgml
doc/src/sgml/monitoring.sgml
doc/src/sgml/protocol.sgml
doc/src/sgml/ref/alter_subscription.sgml
doc/src/sgml/ref/create_subscription.sgml
doc/src/sgml/ref/pg_dump.sgml
src/backend/catalog/Makefile
src/backend/catalog/heap.c
src/backend/catalog/pg_publication.c
src/backend/catalog/pg_subscription.c
src/backend/catalog/system_views.sql
src/backend/commands/copy.c
src/backend/commands/subscriptioncmds.c
src/backend/parser/gram.y
src/backend/postmaster/pgstat.c
src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
src/backend/replication/logical/Makefile
src/backend/replication/logical/launcher.c
src/backend/replication/logical/relation.c
src/backend/replication/logical/snapbuild.c
src/backend/replication/logical/tablesync.c [new file with mode: 0644]
src/backend/replication/logical/worker.c
src/backend/replication/repl_gram.y
src/backend/replication/repl_scanner.l
src/backend/replication/walsender.c
src/backend/tcop/postgres.c
src/backend/utils/adt/misc.c
src/backend/utils/cache/syscache.c
src/backend/utils/misc/guc.c
src/bin/pg_dump/pg_backup.h
src/bin/pg_dump/pg_dump.c
src/bin/pg_dump/t/002_pg_dump.pl
src/include/catalog/catversion.h
src/include/catalog/indexing.h
src/include/catalog/pg_proc.h
src/include/catalog/pg_subscription_rel.h [new file with mode: 0644]
src/include/commands/copy.h
src/include/nodes/nodes.h
src/include/nodes/parsenodes.h
src/include/nodes/replnodes.h
src/include/parser/kwlist.h
src/include/pgstat.h
src/include/replication/logical.h
src/include/replication/logicallauncher.h
src/include/replication/snapbuild.h
src/include/replication/walreceiver.h
src/include/replication/walsender.h
src/include/replication/worker_internal.h
src/include/utils/syscache.h
src/test/regress/expected/object_address.out
src/test/regress/expected/rules.out
src/test/regress/expected/sanity_check.out
src/test/regress/expected/subscription.out
src/test/regress/sql/object_address.sql
src/test/regress/sql/subscription.sql
src/test/subscription/t/001_rep_changes.pl
src/test/subscription/t/002_types.pl
src/test/subscription/t/003_constraints.pl
src/test/subscription/t/004_sync.pl [new file with mode: 0644]

index 735b79484c8538c75fef792d2f75d2781276f5c8..277639f6e9db8a5ee0da594aacb0fd74b5f95f3c 100644 (file)
@@ -662,6 +662,7 @@ fileBeginForeignScan(ForeignScanState *node, int eflags)
                           node->ss.ss_currentRelation,
                           filename,
                           is_program,
+                          NULL,
                           NIL,
                           options);
 
@@ -737,6 +738,7 @@ fileReScanForeignScan(ForeignScanState *node)
                                    node->ss.ss_currentRelation,
                                    festate->filename,
                                    festate->is_program,
+                                   NULL,
                                    NIL,
                                    festate->options);
 }
@@ -1100,7 +1102,8 @@ file_acquire_sample_rows(Relation onerel, int elevel,
    /*
     * Create CopyState from FDW options.
     */
-   cstate = BeginCopyFrom(NULL, onerel, filename, is_program, NIL, options);
+   cstate = BeginCopyFrom(NULL, onerel, filename, is_program, NULL, NIL,
+                          options);
 
    /*
     * Use per-tuple memory context to prevent leak of memory used to read
index df0435c3f009af74791359f3397d2bdfd1c36cc2..228ec7803189447e40c04355b4cb6117fbe74066 100644 (file)
       <entry>logical replication subscriptions</entry>
      </row>
 
+     <row>
+      <entry><link linkend="catalog-pg-subscription-rel"><structname>pg_subscription_rel</structname></link></entry>
+      <entry>relation state for subscriptions</entry>
+     </row>
+
      <row>
       <entry><link linkend="catalog-pg-tablespace"><structname>pg_tablespace</structname></link></entry>
       <entry>tablespaces within this database cluster</entry>
   </table>
  </sect1>
 
+ <sect1 id="catalog-pg-subscription-rel">
+  <title><structname>pg_subscription_rel</structname></title>
+
+  <indexterm zone="catalog-pg-subscription-rel">
+   <primary>pg_subscription_rel</primary>
+  </indexterm>
+
+  <para>
+   The catalog <structname>pg_subscription_rel</structname> contains the
+   state for each replicated relation in each subscription.  This is a
+   many-to-many mapping.
+  </para>
+
+  <para>
+   This catalog only contains tables known to the subscription after running
+   either <command>CREATE SUBSCRIPTION</command> or
+   <command>ALTER SUBSCRIPTION ... REFRESH</command>.
+  </para>
+
+  <table>
+   <title><structname>pg_subscription_rel</structname> Columns</title>
+
+   <tgroup cols="4">
+    <thead>
+     <row>
+      <entry>Name</entry>
+      <entry>Type</entry>
+      <entry>References</entry>
+      <entry>Description</entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry><structfield>srsubid</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-subscription"><structname>pg_subscription</structname></link>.oid</literal></entry>
+      <entry>Reference to subscription</entry>
+     </row>
+
+     <row>
+      <entry><structfield>srrelid</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
+      <entry>Reference to relation</entry>
+     </row>
+
+     <row>
+      <entry><structfield>srsubstate</structfield></entry>
+      <entry><type>char</type></entry>
+      <entry></entry>
+      <entry>
+       State code:
+       <literal>i</> = initialize,
+       <literal>d</> = data is being copied,
+       <literal>s</> = synchronized,
+       <literal>r</> = ready (normal replication)
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>srsublsn</structfield></entry>
+      <entry><type>pg_lsn</type></entry>
+      <entry></entry>
+      <entry>
+       End LSN for <literal>s</> and <literal>r</> states.
+      </entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+ </sect1>
+
  <sect1 id="catalog-pg-tablespace">
   <title><structname>pg_tablespace</structname></title>
 
index b379b67b30adf20bb854cfce4a960687d4bc65ef..2de3540def75257ca41fcd30edf5a1946decd081 100644 (file)
@@ -3449,6 +3449,31 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-max-sync-workers-per-subscription" xreflabel="max_sync_workers_per_subscription">
+      <term><varname>max_sync_workers_per_subscription</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>max_sync_workers_per_subscription</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Maximum number of synchronization workers per subscription. This
+        parameter controls the amount of parallelism of the initial data copy
+        during the subscription initialization or when new tables are added.
+       </para>
+       <para>
+        Currently, there can be only one synchronization worker per table.
+       </para>
+       <para>
+        The synchronization workers are taken from the pool defined by
+        <varname>max_logical_replication_workers</varname>.
+       </para>
+       <para>
+        The default value is 2.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
     </sect2>
 
index 44cd78563d35a6127ae4573682204ebe42c0ea00..48db9cd08b7cd6fd33759937528e5ab7c31db340 100644 (file)
  </para>
 
  <para>
-  Logical replication sends changes on the publisher to the subscriber as
-  they occur in real-time.  The subscriber applies the data in the same order
-  as the publisher so that transactional consistency is guaranteed for
+  Logical replication of a table typically starts with taking a snapshot
+  of the data on the publisher database and copying that to the subscriber.
+  Once that is done, the changes on the publisher are sent to the subscriber
+  as they occur in real-time.  The subscriber applies the data in the same
+  order as the publisher so that transactional consistency is guaranteed for
   publications within a single subscription.  This method of data replication
   is sometimes referred to as transactional replication.
  </para>
 
   <para>
    Each subscription will receive changes via one replication slot (see
-   <xref linkend="streaming-replication-slots">).
+   <xref linkend="streaming-replication-slots">).  Additional temporary
+   replication slots may be required for the initial data synchronization
+   of pre-existing table data.
   </para>
 
   <para>
    to <literal>replica</literal>, which produces the usual effects on triggers
    and constraints.
   </para>
+
+  <sect2 id="logical-replication-snapshot">
+    <title>Initial Snapshot</title>
+    <para>
+      The initial data in existing subscribed tables are snapshotted and
+      copied in a parallel instance of a special kind of apply process.
+      This process will create its own temporary replication slot and
+      copy the existing data. Once existing data is copied, the worker
+      enters synchronization mode, which ensures that the table is brought
+      up to a synchronized state with the main apply process by streaming
+      any changes that happened during the initial data copy using standard
+      logical replication. Once the synchronization is done, the control
+      of the replication of the table is given back to the main apply
+      process where the replication continues as normal.
+    </para>
+  </sect2>
  </sect1>
 
-<sect1 id="logical-replication-monitoring">
+ <sect1 id="logical-replication-monitoring">
   <title>Monitoring</title>
 
   <para>
   <para>
    Normally, there is a single apply process running for an enabled
    subscription.  A disabled subscription or a crashed subscription will have
-   zero rows in this view.
+   zero rows in this view.  If the initial data synchronization of any
+   table is in progress, there will be additional workers for the tables
+   being synchronized.
   </para>
  </sect1>
 
   <para>
    On the publisher side, <varname>wal_level</varname> must be set to
    <literal>logical</literal>, and <varname>max_replication_slots</varname>
-   must be set to at least the number of subscriptions expected to connect.
-   And <varname>max_wal_senders</varname> should be set to at least the same
-   as <varname>max_replication_slots</varname> plus the number of physical replicas
-   that are connected at the same time.
+   must be set to at least the number of subscriptions expected to connect,
+   plus some reserve for table synchronization.  And
+   <varname>max_wal_senders</varname> should be set to at least the same as
+   <varname>max_replication_slots</varname> plus the number of physical
+   replicas that are connected at the same time.
   </para>
 
   <para>
    to be set.  In this case it should be set to at least the number of
    subscriptions that will be added to the subscriber.
    <varname>max_logical_replication_workers</varname> must be set to at
-   least the number of subscriptions.  Additionally the
-   <varname>max_worker_processes</varname> may need to be adjusted to
-   accommodate for replication workers, at least
+   least the number of subscriptions, again plus some reserve for the table
+   synchronization.  Additionally the <varname>max_worker_processes</varname>
+   may need to be adjusted to accommodate for replication workers, at least
    (<varname>max_logical_replication_workers</varname>
    + <literal>1</literal>).  Note that some extensions and parallel queries
    also take worker slots from <varname>max_worker_processes</varname>.
@@ -393,8 +416,10 @@ CREATE SUBSCRIPTION mysub CONNECTION 'dbname=foo host=bar user=repuser' PUBLICAT
   </para>
 
   <para>
-   The above will start the replication process of changes to
-   <literal>users</literal> and <literal>departments</literal> tables.
+   The above will start the replication process, which synchronizes the
+   initial table contents of the tables <literal>users</literal> and
+   <literal>departments</literal> and then starts replicating
+   incremental changes to those tables.
   </para>
  </sect1>
 </chapter>
index dcb2d3303c1c7ecb607c8e20f761eb52caa238f5..eb6f486677350f14dfa7f609dfaed8a4666e11c5 100644 (file)
@@ -1863,6 +1863,12 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
      <entry><type>integer</></entry>
      <entry>Process ID of the subscription worker process</entry>
     </row>
+    <row>
+     <entry><structfield>relid</></entry>
+     <entry><type>oid</></entry>
+     <entry>OID of the relation that the worker is synchronizing; null for the
+     main apply worker</entry>
+    </row>
     <row>
      <entry><structfield>received_lsn</></entry>
      <entry><type>pg_lsn</></entry>
@@ -1899,7 +1905,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
   <para>
    The <structname>pg_stat_subscription</structname> view will contain one
    row per subscription for main worker (with null PID if the worker is
-   not running).
+   not running), and additional rows for workers handling the initial data
+   copy of the subscribed tables.
   </para>
 
   <table id="pg-stat-ssl-view" xreflabel="pg_stat_ssl">
index 244e381de9ae902a5b54af3d675edcbabb30e376..48ca4140312e2e66524a7863408bc9e53cd06bb1 100644 (file)
@@ -1487,7 +1487,7 @@ The commands accepted in walsender mode are:
   </varlistentry>
 
   <varlistentry id="protocol-replication-create-slot" xreflabel="CREATE_REPLICATION_SLOT">
-   <term><literal>CREATE_REPLICATION_SLOT</literal> <replaceable class="parameter">slot_name</> [ <literal>TEMPORARY</> ] { <literal>PHYSICAL</> [ <literal>RESERVE_WAL</> ] | <literal>LOGICAL</> <replaceable class="parameter">output_plugin</> [ <literal>EXPORT_SNAPSHOT</> | <literal>NOEXPORT_SNAPSHOT</> ] }
+   <term><literal>CREATE_REPLICATION_SLOT</literal> <replaceable class="parameter">slot_name</> [ <literal>TEMPORARY</> ] { <literal>PHYSICAL</> [ <literal>RESERVE_WAL</> ] | <literal>LOGICAL</> <replaceable class="parameter">output_plugin</> [ <literal>EXPORT_SNAPSHOT</> | <literal>NOEXPORT_SNAPSHOT</> | <literal>USE_SNAPSHOT</> ] }
      <indexterm><primary>CREATE_REPLICATION_SLOT</primary></indexterm>
     </term>
     <listitem>
@@ -1542,12 +1542,17 @@ The commands accepted in walsender mode are:
       <varlistentry>
        <term><literal>EXPORT_SNAPSHOT</></term>
        <term><literal>NOEXPORT_SNAPSHOT</></term>
+       <term><literal>USE_SNAPSHOT</></term>
        <listitem>
         <para>
          Decides what to do with the snapshot created during logical slot
          initialization. <literal>EXPORT_SNAPSHOT</>, which is the default,
          will export the snapshot for use in other sessions. This option can't
-         be used inside a transaction.  <literal>NOEXPORT_SNAPSHOT</> will
+         be used inside a transaction.  <literal>USE_SNAPSHOT</> will use the
+         snapshot for the current transaction executing the command. This
+         option must be used in a transaction, and
+         <literal>CREATE_REPLICATION_SLOT</literal> must be the first command
+         run in that transaction.  Finally, <literal>NOEXPORT_SNAPSHOT</> will
          just use the snapshot for logical decoding as normal but won't do
          anything else with it.
         </para>
index 5e18e2ff6c1e4a9eefc5490fe0419a7113ba5871..6f94247b923a7144143c1611231cf44d33d02256 100644 (file)
@@ -21,15 +21,21 @@ PostgreSQL documentation
 
  <refsynopsisdiv>
 <synopsis>
-ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> WITH ( <replaceable class="PARAMETER">option</replaceable> [, ... ] ) ]
+ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> WITH ( <replaceable class="PARAMETER">suboption</replaceable> [, ... ] )
 
-<phrase>where <replaceable class="PARAMETER">option</replaceable> can be:</phrase>
+<phrase>where <replaceable class="PARAMETER">suboption</replaceable> can be:</phrase>
 
-  SLOT NAME = <replaceable class="PARAMETER">slot_name</replaceable>
+    SLOT NAME = <replaceable class="PARAMETER">slot_name</replaceable>
+
+ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> SET PUBLICATION <replaceable class="PARAMETER">publication_name</replaceable> [, ...] { REFRESH WITH ( <replaceable class="PARAMETER">puboption</replaceable> [, ... ] ) | NOREFRESH }
+ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> REFRESH PUBLICATION WITH ( <replaceable class="PARAMETER">puboption</replaceable> [, ... ] )
+
+<phrase>where <replaceable class="PARAMETER">puboption</replaceable> can be:</phrase>
+
+    COPY DATA | NOCOPY DATA
 
 ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> OWNER TO { <replaceable>new_owner</replaceable> | CURRENT_USER | SESSION_USER }
 ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> CONNECTION '<replaceable>conninfo</replaceable>'
-ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> SET PUBLICATION <replaceable>publication_name</replaceable> [, ...]
 ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> ENABLE
 ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> DISABLE
 </synopsis>
@@ -65,7 +71,6 @@ ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> DISABLE
 
    <varlistentry>
     <term><literal>CONNECTION '<replaceable class="parameter">conninfo</replaceable>'</literal></term>
-    <term><literal>SET PUBLICATION <replaceable class="parameter">publication_name</replaceable></literal></term>
     <term><literal>SLOT NAME = <replaceable class="parameter">slot_name</replaceable></literal></term>
     <listitem>
      <para>
@@ -76,6 +81,40 @@ ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> DISABLE
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>SET PUBLICATION <replaceable class="parameter">publication_name</replaceable></literal></term>
+    <listitem>
+     <para>
+      Changes list of subscribed publications. See
+      <xref linkend="SQL-CREATESUBSCRIPTION"> for more information.
+     </para>
+     <para>
+      When <literal>REFRESH</literal> is specified, this command will also
+      act like <literal>REFRESH PUBLICATION</literal>. When
+      <literal>NOREFRESH</literal> is specified, the command will not try to
+      refresh table information.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>REFRESH PUBLICATION</term>
+    <listitem>
+     <para>
+      Fetch missing table information from publisher. This will start
+      replication of tables that were added to the subscribed-to publications
+      since the last invocation of <command>REFRESH PUBLICATION</command> or
+      since <command>CREATE SUBSCRIPTION</command>.
+     </para>
+     <para>
+      The <literal>COPY DATA</literal> and <literal>NOCOPY DATA</literal>
+      options specify if the existing data in the publications that are being
+      subscribed to should be copied. <literal>COPY DATA</literal> is the
+      default.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry>
     <term><literal>ENABLE</literal></term>
     <listitem>
@@ -95,6 +134,7 @@ ALTER SUBSCRIPTION <replaceable class="PARAMETER">name</replaceable> DISABLE
      </para>
     </listitem>
    </varlistentry>
+
   </variablelist>
  </refsect1>
 
index e2000767003700a66d338d73a21badbbbb3b0c95..8f3c30b9b0132a7cb767a0e861498488bfe60e4b 100644 (file)
@@ -31,6 +31,8 @@ CREATE SUBSCRIPTION <replaceable class="PARAMETER">subscription_name</replaceabl
     | ENABLED | DISABLED
     | CREATE SLOT | NOCREATE SLOT
     | SLOT NAME = <replaceable class="PARAMETER">slot_name</replaceable>
+    | COPY DATA | NOCOPY DATA
+    | NOCONNECT
 </synopsis>
  </refsynopsisdiv>
 
@@ -132,6 +134,42 @@ CREATE SUBSCRIPTION <replaceable class="PARAMETER">subscription_name</replaceabl
      </para>
     </listitem>
    </varlistentry>
+
+   <varlistentry>
+    <term>COPY DATA</term>
+    <term>NOCOPY DATA</term>
+    <listitem>
+     <para>
+      Specifies if the existing data in the publications that are being
+      subscribed to should be copied once the replication starts.
+      <literal>COPY DATA</literal> is the default.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>NOCONNECT</term>
+    <listitem>
+     <para>
+      Instructs <command>CREATE SUBSCRIPTION</command> to skip the initial
+      connection to the provider. This will change default values of other
+      options to <literal>DISABLED</literal>,
+      <literal>NOCREATE SLOT</literal>, and <literal>NOCOPY DATA</literal>.
+     </para>
+     <para>
+      It's not allowed to combine <literal>NOCONNECT</literal> and
+      <literal>ENABLED</literal>, <literal>CREATE SLOT</literal>, or
+      <literal>COPY DATA</literal>.
+     </para>
+     <para>
+      Since no connection is made when this option is specified, the tables
+      are not subscribed, so after you enable the subscription nothing will
+      be replicated. It is required to run
+      <literal>ALTER SUBSCRIPTION ... REFRESH PUBLICATION</> in order for
+      tables to be subscribed.
+     </para>
+    </listitem>
+   </varlistentry>
   </variablelist>
  </refsect1>
 
index bb32fb12e0b4c46e582a098d35cabbd4db69eb65..4f19b89232173eff92c7ea95efff2d2a24a33925 100644 (file)
@@ -799,22 +799,23 @@ PostgreSQL documentation
      </varlistentry>
 
      <varlistentry>
-      <term><option>--no-create-subscription-slots</option></term>
+      <term><option>--no-security-labels</option></term>
       <listitem>
        <para>
-        When dumping logical replication subscriptions,
-        generate <command>CREATE SUBSCRIPTION</command> commands that do not
-        create the remote replication slot.  That way, the dump can be
-        restored without requiring network access to the remote servers.
+        Do not dump security labels.
        </para>
       </listitem>
      </varlistentry>
 
      <varlistentry>
-      <term><option>--no-security-labels</option></term>
+      <term><option>--no-subscription-connect</option></term>
       <listitem>
        <para>
-        Do not dump security labels.
+        When dumping logical replication subscriptions,
+        generate <command>CREATE SUBSCRIPTION</command> commands that do not
+        make remote connections for creating replication slot or initial table
+        copy.  That way, the dump can be restored without requiring network
+        access to the remote servers.
        </para>
       </listitem>
      </varlistentry>
index 31368585d212c8472891b41edc8e45f632326b4f..159cab5c18c0552d19526db06727a8d8cc1e21ee 100644 (file)
@@ -44,6 +44,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\
    pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \
    pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \
    pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \
+   pg_subscription_rel.h \
    toasting.h indexing.h \
     )
 
index 41c00565569546c3b3e801e4eedb471b00e6d02f..d49dcdc015d9ca905ef15f6abc04992890e8a4d6 100644 (file)
@@ -52,6 +52,7 @@
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_partitioned_table.h"
 #include "catalog/pg_statistic.h"
+#include "catalog/pg_subscription_rel.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_type_fn.h"
@@ -1831,6 +1832,11 @@ heap_drop_with_catalog(Oid relid)
     */
    relation_close(rel, NoLock);
 
+   /*
+    * Remove any associated relation synchronization states.
+    */
+   RemoveSubscriptionRel(InvalidOid, relid);
+
    /*
     * Forget any ON COMMIT action for the rel
     */
index 0f784690ce4dfd524fc1ec3feda69ec171f97725..9330e2380af5264b6c0817f77aa64c4c3f558766 100644 (file)
@@ -221,8 +221,8 @@ GetPublicationRelations(Oid pubid)
                BTEqualStrategyNumber, F_OIDEQ,
                ObjectIdGetDatum(pubid));
 
-   scan = systable_beginscan(pubrelsrel, PublicationRelMapIndexId, true,
-                             NULL, 1, &scankey);
+   scan = systable_beginscan(pubrelsrel, PublicationRelPrrelidPrpubidIndexId,
+                             true, NULL, 1, &scankey);
 
    result = NIL;
    while (HeapTupleIsValid(tup = systable_getnext(scan)))
index 20fdd6a54f01912d705288c391cd622442e38840..e420ec14d238463fa0f31865ba54513f983de1d3 100644 (file)
 #include "access/genam.h"
 #include "access/heapam.h"
 #include "access/htup_details.h"
+#include "access/xact.h"
 
+#include "catalog/indexing.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_subscription.h"
+#include "catalog/pg_subscription_rel.h"
 
 #include "nodes/makefuncs.h"
 
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
+#include "utils/pg_lsn.h"
+#include "utils/rel.h"
 #include "utils/syscache.h"
 
 
@@ -206,3 +211,280 @@ textarray_to_stringlist(ArrayType *textarray)
 
    return res;
 }
+
+/*
+ * Record the synchronization state of one table within a subscription.
+ *
+ * If pg_subscription_rel has no row for this (relid, subid) pair, a new row
+ * is inserted; otherwise the srsubstate and srsublsn columns of the existing
+ * row are overwritten.  A sublsn of InvalidXLogRecPtr is stored as NULL.
+ *
+ * The catalog is opened with ShareRowExclusiveLock and closed with NoLock,
+ * so the lock is not released here.
+ *
+ * Returns the result of CatalogTupleInsert() for a new row, or
+ * HeapTupleGetOid() of the modified tuple for an existing one.
+ */
+Oid
+SetSubscriptionRelState(Oid subid, Oid relid, char state,
+                          XLogRecPtr sublsn)
+{
+   Relation    catalog;
+   HeapTuple   oldtup;
+   HeapTuple   newtup;
+   Oid         result;
+   bool        nulls[Natts_pg_subscription_rel];
+   Datum       values[Natts_pg_subscription_rel];
+
+   /* Prevent concurrent changes. */
+   catalog = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock);
+
+   /* Look for a row already describing this table/subscription pair. */
+   oldtup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP,
+                                ObjectIdGetDatum(relid),
+                                ObjectIdGetDatum(subid));
+
+   /* The state and LSN columns are filled identically on both paths. */
+   memset(values, 0, sizeof(values));
+   memset(nulls, false, sizeof(nulls));
+   values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state);
+   if (sublsn == InvalidXLogRecPtr)
+       nulls[Anum_pg_subscription_rel_srsublsn - 1] = true;
+   else
+       values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn);
+
+   if (HeapTupleIsValid(oldtup))
+   {
+       bool        replaces[Natts_pg_subscription_rel];
+
+       /* Existing row: replace only the state and LSN columns. */
+       memset(replaces, false, sizeof(replaces));
+       replaces[Anum_pg_subscription_rel_srsubstate - 1] = true;
+       replaces[Anum_pg_subscription_rel_srsublsn - 1] = true;
+
+       newtup = heap_modify_tuple(oldtup, RelationGetDescr(catalog),
+                                  values, nulls, replaces);
+
+       /* Update the catalog. */
+       CatalogTupleUpdate(catalog, &newtup->t_self, newtup);
+
+       result = HeapTupleGetOid(newtup);
+   }
+   else
+   {
+       /* No row yet: supply the key columns and insert a fresh tuple. */
+       values[Anum_pg_subscription_rel_srsubid - 1] = ObjectIdGetDatum(subid);
+       values[Anum_pg_subscription_rel_srrelid - 1] = ObjectIdGetDatum(relid);
+
+       newtup = heap_form_tuple(RelationGetDescr(catalog), values, nulls);
+
+       /* Insert tuple into catalog. */
+       result = CatalogTupleInsert(catalog, newtup);
+
+       heap_freetuple(newtup);
+   }
+
+   /* Cleanup. */
+   heap_close(catalog, NoLock);
+
+   return result;
+}
+
+/*
+ * Look up the synchronization state of one table within a subscription.
+ *
+ * Returns the srsubstate code of the matching pg_subscription_rel row and
+ * stores its srsublsn in *sublsn (InvalidXLogRecPtr if the column is NULL).
+ *
+ * If no row exists and missing_ok is true, returns SUBREL_STATE_UNKNOWN and
+ * sets *sublsn to InvalidXLogRecPtr; if missing_ok is false, an error is
+ * raised instead.
+ */
+char
+GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn,
+                       bool missing_ok)
+{
+   Relation    catalog;
+   HeapTuple   entry;
+   Datum       statedatum;
+   Datum       lsndatum;
+   bool        statenull;
+   bool        lsnnull;
+   char        result;
+
+   catalog = heap_open(SubscriptionRelRelationId, AccessShareLock);
+
+   /* Try finding the mapping. */
+   entry = SearchSysCache2(SUBSCRIPTIONRELMAP,
+                           ObjectIdGetDatum(relid),
+                           ObjectIdGetDatum(subid));
+
+   if (!HeapTupleIsValid(entry))
+   {
+       if (!missing_ok)
+           elog(ERROR, "subscription table %u in subscription %u does not exist",
+                relid, subid);
+
+       /* Caller tolerates a missing row: report "unknown". */
+       heap_close(catalog, AccessShareLock);
+       *sublsn = InvalidXLogRecPtr;
+       return SUBREL_STATE_UNKNOWN;
+   }
+
+   /* Get the state. */
+   statedatum = SysCacheGetAttr(SUBSCRIPTIONRELMAP, entry,
+                                Anum_pg_subscription_rel_srsubstate,
+                                &statenull);
+   Assert(!statenull);
+   result = DatumGetChar(statedatum);
+
+   /* Get the LSN, which is allowed to be NULL. */
+   lsndatum = SysCacheGetAttr(SUBSCRIPTIONRELMAP, entry,
+                              Anum_pg_subscription_rel_srsublsn,
+                              &lsnnull);
+   *sublsn = lsnnull ? InvalidXLogRecPtr : DatumGetLSN(lsndatum);
+
+   /* Cleanup */
+   ReleaseSysCache(entry);
+   heap_close(catalog, AccessShareLock);
+
+   return result;
+}
+
+/*
+ * Drop subscription relation mapping. These can be for a particular
+ * subscription, or for a particular relation, or both.
+ *
+ * subid: subscription to match, or InvalidOid to not filter on subscription.
+ * relid: relation to match, or InvalidOid to not filter on relation.
+ *
+ * Every pg_subscription_rel row matching the supplied keys is deleted.
+ * NOTE(review): if both arguments are InvalidOid the scan is started with
+ * zero keys, which presumably matches every row -- confirm callers never
+ * do that unintentionally.
+ */
+void
+RemoveSubscriptionRel(Oid subid, Oid relid)
+{
+   Relation    rel;
+   HeapScanDesc scan;
+   ScanKeyData skey[2];
+   HeapTuple   tup;
+   int         nkeys = 0;
+
+   /* Prevent concurrent changes (see SetSubscriptionRelState()). */
+   rel = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock);
+
+   /* Build one scan key per valid argument. */
+   if (OidIsValid(subid))
+   {
+       ScanKeyInit(&skey[nkeys++],
+                   Anum_pg_subscription_rel_srsubid,
+                   BTEqualStrategyNumber,
+                   F_OIDEQ,
+                   ObjectIdGetDatum(subid));
+   }
+
+   if (OidIsValid(relid))
+   {
+       ScanKeyInit(&skey[nkeys++],
+                   Anum_pg_subscription_rel_srrelid,
+                   BTEqualStrategyNumber,
+                   F_OIDEQ,
+                   ObjectIdGetDatum(relid));
+   }
+
+   /* Do the search and delete what we found. */
+   scan = heap_beginscan_catalog(rel, nkeys, skey);
+   while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+   {
+       /*
+        * Go through the catalog-tuple API, as SetSubscriptionRelState()
+        * does for inserts and updates, rather than calling the heap
+        * routine directly.
+        */
+       CatalogTupleDelete(rel, &tup->t_self);
+   }
+   heap_endscan(scan);
+
+   heap_close(rel, ShareRowExclusiveLock);
+}
+
+
+/*
+ * Get all relations for subscription.
+ *
+ * Returns a List of SubscriptionRelState pointers, one per
+ * pg_subscription_rel row whose srsubid equals subid.  Each entry carries
+ * the relation OID, the state code, and the LSN (InvalidXLogRecPtr when
+ * srsublsn is NULL).
+ *
+ * Returned list is palloced in current memory context.
+ */
+List *
+GetSubscriptionRelations(Oid subid)
+{
+   List       *res = NIL;
+   Relation    rel;
+   HeapTuple   tup;
+   int         nkeys = 0;
+   ScanKeyData skey[2];
+   SysScanDesc scan;
+
+   rel = heap_open(SubscriptionRelRelationId, AccessShareLock);
+
+   ScanKeyInit(&skey[nkeys++],
+               Anum_pg_subscription_rel_srsubid,
+               BTEqualStrategyNumber, F_OIDEQ,
+               ObjectIdGetDatum(subid));
+
+   scan = systable_beginscan(rel, InvalidOid, false,
+                             NULL, nkeys, skey);
+
+   while (HeapTupleIsValid(tup = systable_getnext(scan)))
+   {
+       Form_pg_subscription_rel    subrel;
+       SubscriptionRelState       *relstate;
+       Datum       lsndatum;
+       bool        lsnnull;
+
+       subrel = (Form_pg_subscription_rel) GETSTRUCT(tup);
+
+       relstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState));
+       relstate->relid = subrel->srrelid;
+       relstate->state = subrel->srsubstate;
+
+       /*
+        * srsublsn is stored as NULL when no LSN is known (see
+        * SetSubscriptionRelState()), so it must not be read through the
+        * struct overlay; fetch it with a null check instead.
+        */
+       lsndatum = heap_getattr(tup, Anum_pg_subscription_rel_srsublsn,
+                               RelationGetDescr(rel), &lsnnull);
+       relstate->lsn = lsnnull ? InvalidXLogRecPtr : DatumGetLSN(lsndatum);
+
+       res = lappend(res, relstate);
+   }
+
+   /* Cleanup */
+   systable_endscan(scan);
+   heap_close(rel, AccessShareLock);
+
+   return res;
+}
+
+/*
+ * Get all relations for subscription that are not in a ready state.
+ *
+ * Returns a List of SubscriptionRelState pointers, one per
+ * pg_subscription_rel row whose srsubid equals subid and whose srsubstate
+ * differs from SUBREL_STATE_READY.  Each entry carries the relation OID,
+ * the state code, and the LSN (InvalidXLogRecPtr when srsublsn is NULL).
+ *
+ * Returned list is palloced in current memory context.
+ */
+List *
+GetSubscriptionNotReadyRelations(Oid subid)
+{
+   List       *res = NIL;
+   Relation    rel;
+   HeapTuple   tup;
+   int         nkeys = 0;
+   ScanKeyData skey[2];
+   SysScanDesc scan;
+
+   rel = heap_open(SubscriptionRelRelationId, AccessShareLock);
+
+   ScanKeyInit(&skey[nkeys++],
+               Anum_pg_subscription_rel_srsubid,
+               BTEqualStrategyNumber, F_OIDEQ,
+               ObjectIdGetDatum(subid));
+
+   /* Exclude rows that have already reached the ready state. */
+   ScanKeyInit(&skey[nkeys++],
+               Anum_pg_subscription_rel_srsubstate,
+               BTEqualStrategyNumber, F_CHARNE,
+               CharGetDatum(SUBREL_STATE_READY));
+
+   scan = systable_beginscan(rel, InvalidOid, false,
+                             NULL, nkeys, skey);
+
+   while (HeapTupleIsValid(tup = systable_getnext(scan)))
+   {
+       Form_pg_subscription_rel    subrel;
+       SubscriptionRelState       *relstate;
+       Datum       lsndatum;
+       bool        lsnnull;
+
+       subrel = (Form_pg_subscription_rel) GETSTRUCT(tup);
+
+       relstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState));
+       relstate->relid = subrel->srrelid;
+       relstate->state = subrel->srsubstate;
+
+       /*
+        * srsublsn is stored as NULL when no LSN is known (see
+        * SetSubscriptionRelState()), so it must not be read through the
+        * struct overlay; fetch it with a null check instead.
+        */
+       lsndatum = heap_getattr(tup, Anum_pg_subscription_rel_srsublsn,
+                               RelationGetDescr(rel), &lsnnull);
+       relstate->lsn = lsnnull ? InvalidXLogRecPtr : DatumGetLSN(lsndatum);
+
+       res = lappend(res, relstate);
+   }
+
+   /* Cleanup */
+   systable_endscan(scan);
+   heap_close(rel, AccessShareLock);
+
+   return res;
+}
index c2b0bedc1d680ecf59f7aa417164e96b61550cba..5723714fb972489717ff62ca74bf501966ab713b 100644 (file)
@@ -733,6 +733,7 @@ CREATE VIEW pg_stat_subscription AS
             su.oid AS subid,
             su.subname,
             st.pid,
+            st.relid,
             st.received_lsn,
             st.last_msg_send_time,
             st.last_msg_receipt_time,
index ba89b292d1e7eca9685b9e3c10d40a5e147826bd..b0fd09f458a95159454b8d58f3291e0414dc6d4f 100644 (file)
@@ -60,7 +60,8 @@ typedef enum CopyDest
 {
    COPY_FILE,                  /* to/from file (or a piped program) */
    COPY_OLD_FE,                /* to/from frontend (2.0 protocol) */
-   COPY_NEW_FE                 /* to/from frontend (3.0 protocol) */
+   COPY_NEW_FE,                /* to/from frontend (3.0 protocol) */
+   COPY_CALLBACK               /* to/from callback function */
 } CopyDest;
 
 /*
@@ -109,6 +110,7 @@ typedef struct CopyStateData
    List       *attnumlist;     /* integer list of attnums to copy */
    char       *filename;       /* filename, or NULL for STDIN/STDOUT */
    bool        is_program;     /* is 'filename' a program to popen? */
+   copy_data_source_cb data_source_cb;     /* function for reading data*/
    bool        binary;         /* binary format? */
    bool        oids;           /* include OIDs? */
    bool        freeze;         /* freeze rows on loading? */
@@ -299,7 +301,6 @@ static uint64 DoCopyTo(CopyState cstate);
 static uint64 CopyTo(CopyState cstate);
 static void CopyOneRowTo(CopyState cstate, Oid tupleOid,
             Datum *values, bool *nulls);
-static uint64 CopyFrom(CopyState cstate);
 static void CopyFromInsertBatch(CopyState cstate, EState *estate,
                    CommandId mycid, int hi_options,
                    ResultRelInfo *resultRelInfo, TupleTableSlot *myslot,
@@ -529,6 +530,9 @@ CopySendEndOfRow(CopyState cstate)
            /* Dump the accumulated row as one CopyData message */
            (void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len);
            break;
+       case COPY_CALLBACK:
+           Assert(false); /* Not yet supported. */
+           break;
    }
 
    resetStringInfo(fe_msgbuf);
@@ -643,6 +647,9 @@ CopyGetData(CopyState cstate, void *databuf, int minread, int maxread)
                bytesread += avail;
            }
            break;
+       case COPY_CALLBACK:
+           bytesread = cstate->data_source_cb(databuf, minread, maxread);
+           break;
    }
 
    return bytesread;
@@ -969,7 +976,7 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt,
        PreventCommandIfParallelMode("COPY FROM");
 
        cstate = BeginCopyFrom(pstate, rel, stmt->filename, stmt->is_program,
-                              stmt->attlist, stmt->options);
+                              NULL, stmt->attlist, stmt->options);
        cstate->range_table = range_table;
        *processed = CopyFrom(cstate);  /* copy from file to database */
        EndCopyFrom(cstate);
@@ -2286,7 +2293,7 @@ limit_printout_length(const char *str)
 /*
  * Copy FROM file to relation.
  */
-static uint64
+uint64
 CopyFrom(CopyState cstate)
 {
    HeapTuple   tuple;
@@ -2878,6 +2885,7 @@ BeginCopyFrom(ParseState *pstate,
              Relation rel,
              const char *filename,
              bool is_program,
+             copy_data_source_cb data_source_cb,
              List *attnamelist,
              List *options)
 {
@@ -2992,7 +3000,12 @@ BeginCopyFrom(ParseState *pstate,
    cstate->num_defaults = num_defaults;
    cstate->is_program = is_program;
 
-   if (pipe)
+   if (data_source_cb)
+   {
+       cstate->copy_dest = COPY_CALLBACK;
+       cstate->data_source_cb = data_source_cb;
+   }
+   else if (pipe)
    {
        Assert(!is_program);    /* the grammar does not allow this */
        if (whereToSendOutput == DestRemote)
index 0198e6d75bad9693c313ac75dc1f1b07c33d1d7d..0784ca79515df41c0f6cb656ec1602a661c1b09d 100644 (file)
 #include "access/htup_details.h"
 #include "access/xact.h"
 
+#include "catalog/dependency.h"
 #include "catalog/indexing.h"
+#include "catalog/namespace.h"
 #include "catalog/objectaccess.h"
 #include "catalog/objectaddress.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_subscription.h"
+#include "catalog/pg_subscription_rel.h"
 
 #include "commands/defrem.h"
 #include "commands/event_trigger.h"
 #include "commands/subscriptioncmds.h"
 
+#include "nodes/makefuncs.h"
+
 #include "replication/logicallauncher.h"
 #include "replication/origin.h"
 #include "replication/walreceiver.h"
+#include "replication/walsender.h"
 #include "replication/worker_internal.h"
 
 #include "storage/lmgr.h"
 
 #include "utils/builtins.h"
+#include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
 
+static List *fetch_table_list(WalReceiverConn *wrconn, List *publications);
+
 /*
  * Common option parsing function for CREATE and ALTER SUBSCRIPTION commands.
  *
  * accommodate that.
  */
 static void
-parse_subscription_options(List *options, char **conninfo,
-                          List **publications, bool *enabled_given,
-                          bool *enabled, bool *create_slot, char **slot_name)
+parse_subscription_options(List *options, bool *connect, bool *enabled_given,
+                          bool *enabled, bool *create_slot, char **slot_name,
+                          bool *copy_data)
 {
    ListCell   *lc;
+   bool        connect_given = false;
    bool        create_slot_given = false;
+   bool        copy_data_given = false;
 
-   if (conninfo)
-       *conninfo = NULL;
-   if (publications)
-       *publications = NIL;
+   if (connect)
+       *connect = true;
    if (enabled)
    {
        *enabled_given = false;
@@ -69,29 +78,23 @@ parse_subscription_options(List *options, char **conninfo,
        *create_slot = true;
    if (slot_name)
        *slot_name = NULL;
+   if (copy_data)
+       *copy_data = true;
 
    /* Parse options */
    foreach (lc, options)
    {
        DefElem    *defel = (DefElem *) lfirst(lc);
 
-       if (strcmp(defel->defname, "conninfo") == 0 && conninfo)
-       {
-           if (*conninfo)
-               ereport(ERROR,
-                       (errcode(ERRCODE_SYNTAX_ERROR),
-                        errmsg("conflicting or redundant options")));
-
-           *conninfo = defGetString(defel);
-       }
-       else if (strcmp(defel->defname, "publication") == 0 && publications)
+       if (strcmp(defel->defname, "noconnect") == 0 && connect)
        {
-           if (*publications)
+           if (connect_given)
                ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("conflicting or redundant options")));
 
-           *publications = defGetStringList(defel);
+           connect_given = true;
+           *connect = !defGetBoolean(defel);
        }
        else if (strcmp(defel->defname, "enabled") == 0 && enabled)
        {
@@ -142,9 +145,57 @@ parse_subscription_options(List *options, char **conninfo,
 
            *slot_name = defGetString(defel);
        }
+       else if (strcmp(defel->defname, "copy data") == 0 && copy_data)
+       {
+           if (copy_data_given)
+               ereport(ERROR,
+                       (errcode(ERRCODE_SYNTAX_ERROR),
+                        errmsg("conflicting or redundant options")));
+
+           copy_data_given = true;
+           *copy_data = defGetBoolean(defel);
+       }
+       else if (strcmp(defel->defname, "nocopy data") == 0 && copy_data)
+       {
+           if (copy_data_given)
+               ereport(ERROR,
+                       (errcode(ERRCODE_SYNTAX_ERROR),
+                        errmsg("conflicting or redundant options")));
+
+           copy_data_given = true;
+           *copy_data = !defGetBoolean(defel);
+       }
        else
            elog(ERROR, "unrecognized option: %s", defel->defname);
    }
+
+   /*
+    * We've been explicitly asked to not connect, that requires some
+    * additional processing.
+    */
+   if (connect && !*connect)
+   {
+       /* Check for incompatible options from the user. */
+       if (*enabled_given && *enabled)
+           ereport(ERROR,
+                   (errcode(ERRCODE_SYNTAX_ERROR),
+                    errmsg("noconnect and enabled are mutually exclusive options")));
+
+       if (create_slot_given && *create_slot)
+           ereport(ERROR,
+                   (errcode(ERRCODE_SYNTAX_ERROR),
+                    errmsg("noconnect and create slot are mutually exclusive options")));
+
+       if (copy_data_given && *copy_data)
+           ereport(ERROR,
+                   (errcode(ERRCODE_SYNTAX_ERROR),
+                    errmsg("noconnect and copy data are mutually exclusive options")));
+
+       /* Change the defaults of other options. */
+       *enabled = false;
+       *create_slot = false;
+       *copy_data = false;
+   }
 }
 
 /*
@@ -214,8 +265,10 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel)
    Datum       values[Natts_pg_subscription];
    Oid         owner = GetUserId();
    HeapTuple   tup;
+   bool        connect;
    bool        enabled_given;
    bool        enabled;
+   bool        copy_data;
    char       *conninfo;
    char       *slotname;
    char        originname[NAMEDATALEN];
@@ -226,9 +279,8 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel)
     * Parse and check options.
     * Connection and publication should not be specified here.
     */
-   parse_subscription_options(stmt->options, NULL, NULL,
-                              &enabled_given, &enabled,
-                              &create_slot, &slotname);
+   parse_subscription_options(stmt->options, &connect, &enabled_given,
+                              &enabled, &create_slot, &slotname, &copy_data);
 
    /*
     * Since creating a replication slot is not transactional, rolling back
@@ -297,14 +349,17 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel)
    replorigin_create(originname);
 
    /*
-    * If requested, create the replication slot on remote side for our
-    * newly created subscription.
+    * Connect to remote side to execute requested commands and fetch table
+    * info.
     */
-   if (create_slot)
+   if (connect)
    {
        XLogRecPtr          lsn;
        char               *err;
        WalReceiverConn    *wrconn;
+       List               *tables;
+       ListCell           *lc;
+       char                table_state;
 
        /* Try to connect to the publisher. */
        wrconn = walrcv_connect(conninfo, true, stmt->subname, &err);
@@ -315,13 +370,43 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel)
        PG_TRY();
        {
            /*
-            * Create permanent slot for the subscription.  We won't use the
-            * initial snapshot for anything, so no need to export it.
+            * If requested, create permanent slot for the subscription.
+            * We won't use the initial snapshot for anything, so no need
+            * to export it.
+            */
+           if (create_slot)
+           {
+               walrcv_create_slot(wrconn, slotname, false,
+                                  CRS_NOEXPORT_SNAPSHOT, &lsn);
+               ereport(NOTICE,
+                       (errmsg("created replication slot \"%s\" on publisher",
+                               slotname)));
+           }
+
+           /*
+            * Set sync state based on if we were asked to do data copy or
+            * not.
             */
-           walrcv_create_slot(wrconn, slotname, false, false, &lsn);
+           table_state = copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY;
+
+           /*
+            * Get the table list from publisher and build local table status
+            * info.
+            */
+           tables = fetch_table_list(wrconn, publications);
+           foreach (lc, tables)
+           {
+               RangeVar   *rv = (RangeVar *) lfirst(lc);
+               Oid         relid;
+
+               relid = RangeVarGetRelid(rv, AccessShareLock, true);
+
+               SetSubscriptionRelState(subid, relid, table_state,
+                                       InvalidXLogRecPtr);
+           }
+
            ereport(NOTICE,
-                   (errmsg("created replication slot \"%s\" on publisher",
-                           slotname)));
+                   (errmsg("synchronized table states")));
        }
        PG_CATCH();
        {
@@ -334,6 +419,11 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel)
        /* And we are done with the remote side. */
        walrcv_disconnect(wrconn);
    }
+   else
+       ereport(WARNING,
+               (errmsg("tables were not subscribed, you will have to run "
+                       "ALTER SUBSCRIPTION ... REFRESH PUBLICATION to "
+                       "subscribe the tables")));
 
    heap_close(rel, RowExclusiveLock);
 
@@ -346,6 +436,108 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel)
    return myself;
 }
 
+/*
+ * Synchronize the subscription's locally stored per-table state with the
+ * set of tables currently published by sub->publications on the publisher.
+ *
+ * Tables that are published but have no local state get a new state entry
+ * (SUBREL_STATE_INIT if copy_data is true, SUBREL_STATE_READY otherwise).
+ * Tables that have local state but are no longer published have that state
+ * removed.  A NOTICE is emitted for every addition and removal.
+ *
+ * Raises an ERROR if the publisher cannot be reached, if the table list
+ * cannot be fetched, or if a published table does not exist locally.
+ */
+static void
+AlterSubscription_refresh(Subscription *sub, bool copy_data)
+{
+   char           *err;
+   WalReceiverConn *wrconn;    /* local on purpose: never touch any
+                                * connection variable owned by other code */
+   List       *pubrel_names;
+   List       *subrel_states;
+   Oid        *subrel_local_oids;
+   Oid        *pubrel_local_oids;
+   ListCell   *lc;
+   int         off;
+
+   /* Load the library providing us libpq calls. */
+   load_file("libpqwalreceiver", false);
+
+   /* Try to connect to the publisher. */
+   wrconn = walrcv_connect(sub->conninfo, true, sub->name, &err);
+   if (!wrconn)
+       ereport(ERROR,
+               (errmsg("could not connect to the publisher: %s", err)));
+
+   /*
+    * Get the table list from publisher.  Make sure the connection is
+    * closed even if the query errors out, otherwise it would stay open
+    * after the error has been reported.
+    */
+   PG_TRY();
+   {
+       pubrel_names = fetch_table_list(wrconn, sub->publications);
+   }
+   PG_CATCH();
+   {
+       walrcv_disconnect(wrconn);
+       PG_RE_THROW();
+   }
+   PG_END_TRY();
+
+   /* We are done with the remote side, close connection. */
+   walrcv_disconnect(wrconn);
+
+   /* Get local table list. */
+   subrel_states = GetSubscriptionRelations(sub->oid);
+
+   /*
+    * Build qsorted array of local table oids for faster lookup.
+    * This can potentially contain all tables in the database so
+    * speed of lookup is important.
+    */
+   subrel_local_oids = palloc(list_length(subrel_states) * sizeof(Oid));
+   off = 0;
+   foreach(lc, subrel_states)
+   {
+       SubscriptionRelState *relstate = (SubscriptionRelState *) lfirst(lc);
+       subrel_local_oids[off++] = relstate->relid;
+   }
+   qsort(subrel_local_oids, list_length(subrel_states),
+         sizeof(Oid), oid_cmp);
+
+   /*
+    * Walk over the remote tables and try to match them to locally
+    * known tables. If the table is not known locally create a new state
+    * for it.
+    *
+    * Also builds array of local oids of remote tables for the next step.
+    */
+   off = 0;
+   pubrel_local_oids = palloc(list_length(pubrel_names) * sizeof(Oid));
+
+   foreach (lc, pubrel_names)
+   {
+       RangeVar   *rv = (RangeVar *) lfirst(lc);
+       Oid         relid;
+
+       /* missing_ok is false: a published table must exist locally. */
+       relid = RangeVarGetRelid(rv, AccessShareLock, false);
+       pubrel_local_oids[off++] = relid;
+
+       if (!bsearch(&relid, subrel_local_oids,
+                    list_length(subrel_states), sizeof(Oid), oid_cmp))
+       {
+           SetSubscriptionRelState(sub->oid, relid,
+                                   copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY,
+                                   InvalidXLogRecPtr);
+           ereport(NOTICE,
+                   (errmsg("added subscription for table %s.%s",
+                           quote_identifier(rv->schemaname),
+                           quote_identifier(rv->relname))));
+       }
+   }
+
+   /*
+    * Next remove state for tables we should not care about anymore using
+    * the data we collected above
+    */
+   qsort(pubrel_local_oids, list_length(pubrel_names),
+         sizeof(Oid), oid_cmp);
+
+   for (off = 0; off < list_length(subrel_states); off++)
+   {
+       Oid relid = subrel_local_oids[off];
+
+       if (!bsearch(&relid, pubrel_local_oids,
+                    list_length(pubrel_names), sizeof(Oid), oid_cmp))
+       {
+           char   *namespace;
+
+           RemoveSubscriptionRel(sub->oid, relid);
+
+           namespace = get_namespace_name(get_rel_namespace(relid));
+           ereport(NOTICE,
+                   (errmsg("removed subscription for table %s.%s",
+                           quote_identifier(namespace),
+                           quote_identifier(get_rel_name(relid)))));
+       }
+   }
+}
+
 /*
  * Alter the existing subscription.
  */
@@ -359,11 +551,7 @@ AlterSubscription(AlterSubscriptionStmt *stmt)
    Datum       values[Natts_pg_subscription];
    HeapTuple   tup;
    Oid         subid;
-   bool        enabled_given;
-   bool        enabled;
-   char       *conninfo;
-   char       *slot_name;
-   List       *publications;
+   bool        update_tuple = false;
 
    rel = heap_open(SubscriptionRelationId, RowExclusiveLock);
 
@@ -384,52 +572,113 @@ AlterSubscription(AlterSubscriptionStmt *stmt)
 
    subid = HeapTupleGetOid(tup);
 
-   /* Parse options. */
-   parse_subscription_options(stmt->options, &conninfo, &publications,
-                              &enabled_given, &enabled,
-                              NULL, &slot_name);
-
    /* Form a new tuple. */
    memset(values, 0, sizeof(values));
    memset(nulls, false, sizeof(nulls));
    memset(replaces, false, sizeof(replaces));
 
-   if (enabled_given)
-   {
-       values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(enabled);
-       replaces[Anum_pg_subscription_subenabled - 1] = true;
-   }
-   if (conninfo)
-   {
-       values[Anum_pg_subscription_subconninfo - 1] =
-           CStringGetTextDatum(conninfo);
-       replaces[Anum_pg_subscription_subconninfo - 1] = true;
-   }
-   if (slot_name)
-   {
-       values[Anum_pg_subscription_subslotname - 1] =
-           DirectFunctionCall1(namein, CStringGetDatum(slot_name));
-       replaces[Anum_pg_subscription_subslotname - 1] = true;
-   }
-   if (publications != NIL)
+   switch (stmt->kind)
    {
-       values[Anum_pg_subscription_subpublications - 1] =
-            publicationListToArray(publications);
-       replaces[Anum_pg_subscription_subpublications - 1] = true;
+       case ALTER_SUBSCRIPTION_OPTIONS:
+           {
+               char *slot_name;
+
+               parse_subscription_options(stmt->options, NULL, NULL, NULL,
+                                          NULL, &slot_name, NULL);
+
+               values[Anum_pg_subscription_subslotname - 1] =
+                   DirectFunctionCall1(namein, CStringGetDatum(slot_name));
+               replaces[Anum_pg_subscription_subslotname - 1] = true;
+
+               update_tuple = true;
+               break;
+           }
+
+       case ALTER_SUBSCRIPTION_ENABLED:
+           {
+               bool enabled,
+                    enabled_given;
+
+               parse_subscription_options(stmt->options, NULL,
+                                          &enabled_given, &enabled, NULL,
+                                          NULL, NULL);
+               Assert(enabled_given);
+
+               values[Anum_pg_subscription_subenabled - 1] =
+                   BoolGetDatum(enabled);
+               replaces[Anum_pg_subscription_subenabled - 1] = true;
+
+               update_tuple = true;
+               break;
+           }
+
+       case ALTER_SUBSCRIPTION_CONNECTION:
+           values[Anum_pg_subscription_subconninfo - 1] =
+               CStringGetTextDatum(stmt->conninfo);
+           replaces[Anum_pg_subscription_subconninfo - 1] = true;
+           update_tuple = true;
+           break;
+
+       case ALTER_SUBSCRIPTION_PUBLICATION:
+       case ALTER_SUBSCRIPTION_PUBLICATION_REFRESH:
+           {
+               bool            copy_data;
+               Subscription   *sub = GetSubscription(subid, false);
+
+               parse_subscription_options(stmt->options, NULL, NULL, NULL,
+                                          NULL, NULL, &copy_data);
+
+               values[Anum_pg_subscription_subpublications - 1] =
+                    publicationListToArray(stmt->publication);
+               replaces[Anum_pg_subscription_subpublications - 1] = true;
+
+               update_tuple = true;
+
+               /* Refresh if user asked us to. */
+               if (stmt->kind == ALTER_SUBSCRIPTION_PUBLICATION_REFRESH)
+               {
+                   /* Make sure refresh sees the new list of publications. */
+                   sub->publications = stmt->publication;
+
+                   AlterSubscription_refresh(sub, copy_data);
+               }
+
+               break;
+           }
+
+       case ALTER_SUBSCRIPTION_REFRESH:
+           {
+               bool            copy_data;
+               Subscription   *sub = GetSubscription(subid, false);
+
+               parse_subscription_options(stmt->options, NULL, NULL, NULL,
+                                          NULL, NULL, &copy_data);
+
+               AlterSubscription_refresh(sub, copy_data);
+
+               break;
+           }
+
+       default:
+           elog(ERROR, "unrecognized ALTER SUBSCRIPTION kind %d",
+                stmt->kind);
    }
 
-   tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls,
-                           replaces);
+   /* Update the catalog if needed. */
+   if (update_tuple)
+   {
+       tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls,
+                               replaces);
 
-   /* Update the catalog. */
-   CatalogTupleUpdate(rel, &tup->t_self, tup);
+       CatalogTupleUpdate(rel, &tup->t_self, tup);
 
-   ObjectAddressSet(myself, SubscriptionRelationId, subid);
+       heap_freetuple(tup);
+   }
 
-   /* Cleanup. */
-   heap_freetuple(tup);
    heap_close(rel, RowExclusiveLock);
 
+   ObjectAddressSet(myself, SubscriptionRelationId, subid);
+
    InvokeObjectPostAlterHook(SubscriptionRelationId, subid, 0);
 
    return myself;
@@ -537,8 +786,11 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel)
    /* Clean up dependencies */
    deleteSharedDependencyRecordsFor(SubscriptionRelationId, subid, 0);
 
+   /* Remove any associated relation synchronization states. */
+   RemoveSubscriptionRel(subid, InvalidOid);
+
    /* Kill the apply worker so that the slot becomes accessible. */
-   logicalrep_worker_stop(subid);
+   logicalrep_worker_stop(subid, InvalidOid);
 
    /* Remove the origin tracking if exists. */
    snprintf(originname, sizeof(originname), "pg_%u", subid);
@@ -571,15 +823,20 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel)
 
    PG_TRY();
    {
-       if (!walrcv_command(wrconn, cmd.data, &err))
+       WalRcvExecResult   *res;
+       res = walrcv_exec(wrconn, cmd.data, 0, NULL);
+
+       if (res->status != WALRCV_OK_COMMAND)
            ereport(ERROR,
                    (errmsg("could not drop the replication slot \"%s\" on publisher",
                            slotname),
-                    errdetail("The error was: %s", err)));
+                    errdetail("The error was: %s", res->err)));
        else
            ereport(NOTICE,
                    (errmsg("dropped replication slot \"%s\" on publisher",
                            slotname)));
+
+       walrcv_clear_result(res);
    }
    PG_CATCH();
    {
@@ -691,3 +948,72 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId)
 
    heap_close(rel, RowExclusiveLock);
 }
+
+/*
+ * Get the list of tables which belong to specified publications on the
+ * publisher connection.
+ *
+ * Runs a query against pg_catalog.pg_publication_tables over 'wrconn' and
+ * returns a List of RangeVar nodes (schema name plus table name), one per
+ * distinct table.  'publications' must be a non-empty list of String
+ * nodes holding publication names.
+ *
+ * Raises an ERROR if the publisher does not answer with a tuple set.
+ */
+static List *
+fetch_table_list(WalReceiverConn *wrconn, List *publications)
+{
+   WalRcvExecResult   *res;
+   StringInfoData      cmd;
+   TupleTableSlot     *slot;
+   Oid                 tableRow[2] = {TEXTOID, TEXTOID};
+   ListCell           *lc;
+   bool                first;
+   List               *tablelist = NIL;
+
+   Assert(list_length(publications) > 0);
+
+   /* Build the query; publication names are quoted as SQL literals. */
+   initStringInfo(&cmd);
+   appendStringInfoString(&cmd, "SELECT DISTINCT t.schemaname, t.tablename\n"
+                                "  FROM pg_catalog.pg_publication_tables t\n"
+                                " WHERE t.pubname IN (");
+   first = true;
+   foreach (lc, publications)
+   {
+       char *pubname = strVal(lfirst(lc));
+
+       if (first)
+           first = false;
+       else
+           appendStringInfoString(&cmd, ", ");
+
+       appendStringInfoString(&cmd, quote_literal_cstr(pubname));
+   }
+   appendStringInfoString(&cmd, ")");
+
+   /* Expect two text columns per row: schema name and table name. */
+   res = walrcv_exec(wrconn, cmd.data, 2, tableRow);
+   pfree(cmd.data);
+
+   if (res->status != WALRCV_OK_TUPLES)
+       ereport(ERROR,
+               (errmsg("could not receive list of replicated tables from the publisher: %s",
+                       res->err)));
+
+   /* Process tables. */
+   slot = MakeSingleTupleTableSlot(res->tupledesc);
+   while (tuplestore_gettupleslot(res->tuplestore, true, false, slot))
+   {
+       Datum       datum;
+       char       *nspname;
+       char       *relname;
+       bool        isnull;
+       RangeVar   *rv;
+
+       /* Check for NULL before converting, not after. */
+       datum = slot_getattr(slot, 1, &isnull);
+       Assert(!isnull);
+       nspname = TextDatumGetCString(datum);
+
+       datum = slot_getattr(slot, 2, &isnull);
+       Assert(!isnull);
+       relname = TextDatumGetCString(datum);
+
+       /*
+        * TextDatumGetCString() already returned freshly allocated copies,
+        * so hand them to the RangeVar directly instead of copying again.
+        */
+       rv = makeRangeVar(nspname, relname, -1);
+       tablelist = lappend(tablelist, rv);
+
+       ExecClearTuple(slot);
+   }
+   ExecDropSingleTupleTableSlot(slot);
+
+   walrcv_clear_result(res);
+
+   return tablelist;
+}
index d0d45a557b425890e711d9b3039d40e942989429..50126baacf6431e379b53d5d27d856d1bcec3333 100644 (file)
@@ -651,7 +651,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
    MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
 
    NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
-   NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
+   NOREFRESH NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
    NULLS_P NUMERIC
 
    OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR
@@ -9095,6 +9095,7 @@ AlterSubscriptionStmt:
                {
                    AlterSubscriptionStmt *n =
                        makeNode(AlterSubscriptionStmt);
+                   n->kind = ALTER_SUBSCRIPTION_OPTIONS;
                    n->subname = $3;
                    n->options = $5;
                    $$ = (Node *)n;
@@ -9103,24 +9104,45 @@ AlterSubscriptionStmt:
                {
                    AlterSubscriptionStmt *n =
                        makeNode(AlterSubscriptionStmt);
+                   n->kind = ALTER_SUBSCRIPTION_CONNECTION;
                    n->subname = $3;
-                   n->options = list_make1(makeDefElem("conninfo",
-                                           (Node *)makeString($5), @1));
+                   n->conninfo = $5;
+                   $$ = (Node *)n;
+               }
+           | ALTER SUBSCRIPTION name REFRESH PUBLICATION opt_definition
+               {
+                   AlterSubscriptionStmt *n =
+                       makeNode(AlterSubscriptionStmt);
+                   n->kind = ALTER_SUBSCRIPTION_REFRESH;
+                   n->subname = $3;
+                   n->options = $6;
+                   $$ = (Node *)n;
+               }
+           | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list REFRESH opt_definition
+               {
+                   AlterSubscriptionStmt *n =
+                       makeNode(AlterSubscriptionStmt);
+                   n->kind = ALTER_SUBSCRIPTION_PUBLICATION_REFRESH;
+                   n->subname = $3;
+                   n->publication = $6;
+                   n->options = $8;
                    $$ = (Node *)n;
                }
-           | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list
+           | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list NOREFRESH
                {
                    AlterSubscriptionStmt *n =
                        makeNode(AlterSubscriptionStmt);
+                   n->kind = ALTER_SUBSCRIPTION_PUBLICATION;
                    n->subname = $3;
-                   n->options = list_make1(makeDefElem("publication",
-                                           (Node *)$6, @1));
+                   n->publication = $6;
+                   n->options = NIL;
                    $$ = (Node *)n;
                }
            | ALTER SUBSCRIPTION name ENABLE_P
                {
                    AlterSubscriptionStmt *n =
                        makeNode(AlterSubscriptionStmt);
+                   n->kind = ALTER_SUBSCRIPTION_ENABLED;
                    n->subname = $3;
                    n->options = list_make1(makeDefElem("enabled",
                                            (Node *)makeInteger(TRUE), @1));
@@ -9130,11 +9152,13 @@ AlterSubscriptionStmt:
                {
                    AlterSubscriptionStmt *n =
                        makeNode(AlterSubscriptionStmt);
+                   n->kind = ALTER_SUBSCRIPTION_ENABLED;
                    n->subname = $3;
                    n->options = list_make1(makeDefElem("enabled",
                                            (Node *)makeInteger(FALSE), @1));
                    $$ = (Node *)n;
-               }       ;
+               }
+       ;
 
 /*****************************************************************************
  *
@@ -14548,6 +14572,7 @@ unreserved_keyword:
            | NEW
            | NEXT
            | NO
+           | NOREFRESH
            | NOTHING
            | NOTIFY
            | NOWAIT
index 3a50488db3286036583d9fed2d18ea956cad4cdd..b704788eb5a1b85f172c6daf88b65fb04f5578d8 100644 (file)
@@ -3415,6 +3415,12 @@ pgstat_get_wait_ipc(WaitEventIPC w)
        case WAIT_EVENT_SYNC_REP:
            event_name = "SyncRep";
            break;
+       case WAIT_EVENT_LOGICAL_SYNC_DATA:
+           event_name = "LogicalSyncData";
+           break;
+       case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE:
+           event_name = "LogicalSyncStateChange";
+           break;
        /* no default case, so that compiler will warn */
    }
 
index 65a9e6c81ce82960f9b6837e18a378623fc898e7..4dd8eef1f92d6cf8bcb611145cd49d241e8010ec 100644 (file)
 #include "libpq-fe.h"
 #include "pqexpbuffer.h"
 #include "access/xlog.h"
+#include "catalog/pg_type.h"
+#include "funcapi.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
-#include "replication/logicalproto.h"
 #include "replication/walreceiver.h"
-#include "storage/proc.h"
 #include "utils/builtins.h"
+#include "utils/memutils.h"
 #include "utils/pg_lsn.h"
+#include "utils/tuplestore.h"
 
 PG_MODULE_MAGIC;
 
@@ -68,10 +70,12 @@ static void libpqrcv_send(WalReceiverConn *conn, const char *buffer,
 static char *libpqrcv_create_slot(WalReceiverConn *conn,
                                  const char *slotname,
                                  bool temporary,
-                                 bool export_snapshot,
+                                 CRSSnapshotAction snapshot_action,
                                  XLogRecPtr *lsn);
-static bool libpqrcv_command(WalReceiverConn *conn,
-                            const char *cmd, char **err);
+static WalRcvExecResult *libpqrcv_exec(WalReceiverConn *conn,
+                                      const char *query,
+                                      const int nRetTypes,
+                                      const Oid *retTypes);
 static void libpqrcv_disconnect(WalReceiverConn *conn);
 
 static WalReceiverFunctionsType PQWalReceiverFunctions = {
@@ -85,7 +89,7 @@ static WalReceiverFunctionsType PQWalReceiverFunctions = {
    libpqrcv_receive,
    libpqrcv_send,
    libpqrcv_create_slot,
-   libpqrcv_command,
+   libpqrcv_exec,
    libpqrcv_disconnect
 };
 
@@ -431,10 +435,8 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
     * next timeline's ID, or just CommandComplete if the server was shut
     * down.
     *
-    * If we had not yet received CopyDone from the backend, PGRES_COPY_IN
-    * would also be possible. However, at the moment this function is only
-    * called after receiving CopyDone from the backend - the walreceiver
-    * never terminates replication on its own initiative.
+    * If we had not yet received CopyDone from the backend, PGRES_COPY_OUT
+    * is also possible in case we aborted the copy in mid-stream.
     */
    res = PQgetResult(conn->streamConn);
    if (PQresultStatus(res) == PGRES_TUPLES_OK)
@@ -531,7 +533,7 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn,
  * Windows.
  *
  * The function is modeled on PQexec() in libpq, but only implements
- * those parts that are in use in the walreceiver.
+ * those parts that are in use in the walreceiver API.
  *
  * Queries are always executed on the connection in streamConn.
  */
@@ -543,8 +545,9 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query)
 
    /*
     * PQexec() silently discards any prior query results on the connection.
-    * This is not required for walreceiver since it's expected that walsender
-    * won't generate any such junk results.
+    * This is not required for this function as it's expected that the
+    * caller (which is this library in all cases) will behave correctly and
+    * we don't have to be backwards compatible with old libpq.
     */
 
    /*
@@ -593,8 +596,7 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query)
 
        /*
         * Emulate the PQexec()'s behavior of returning the last result when
-        * there are many. Since walsender will never generate multiple
-        * results, we skip the concatenation of error messages.
+        * there are many. We are fine with returning just the last error
+        * message.
         */
        result = PQgetResult(streamConn);
        if (result == NULL)
@@ -675,8 +677,19 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
        PGresult   *res;
 
        res = PQgetResult(conn->streamConn);
-       if (PQresultStatus(res) == PGRES_COMMAND_OK ||
-           PQresultStatus(res) == PGRES_COPY_IN)
+       if (PQresultStatus(res) == PGRES_COMMAND_OK)
+       {
+           PQclear(res);
+
+           /* Verify that there are no more results */
+           res = PQgetResult(conn->streamConn);
+           if (res != NULL)
+               ereport(ERROR,
+                       (errmsg("unexpected result after CommandComplete: %s",
+                               PQerrorMessage(conn->streamConn))));
+           return -1;
+       }
+       else if (PQresultStatus(res) == PGRES_COPY_IN)
        {
            PQclear(res);
            return -1;
@@ -721,7 +734,8 @@ libpqrcv_send(WalReceiverConn *conn, const char *buffer, int nbytes)
  */
 static char *
 libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname,
-                    bool temporary, bool export_snapshot, XLogRecPtr *lsn)
+                    bool temporary, CRSSnapshotAction snapshot_action,
+                    XLogRecPtr *lsn)
 {
    PGresult       *res;
    StringInfoData  cmd;
@@ -737,10 +751,18 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname,
    if (conn->logical)
    {
        appendStringInfo(&cmd, " LOGICAL pgoutput");
-       if (export_snapshot)
-           appendStringInfo(&cmd, " EXPORT_SNAPSHOT");
-       else
-           appendStringInfo(&cmd, " NOEXPORT_SNAPSHOT");
+       switch (snapshot_action)
+       {
+           case CRS_EXPORT_SNAPSHOT:
+               appendStringInfo(&cmd, " EXPORT_SNAPSHOT");
+               break;
+           case CRS_NOEXPORT_SNAPSHOT:
+               appendStringInfo(&cmd, " NOEXPORT_SNAPSHOT");
+               break;
+           case CRS_USE_SNAPSHOT:
+               appendStringInfo(&cmd, " USE_SNAPSHOT");
+               break;
+       }
    }
 
    res = libpqrcv_PQexec(conn->streamConn, cmd.data);
@@ -767,28 +789,139 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname,
 }
 
 /*
- * Run command.
+ * Convert tuple query result to tuplestore.
+ */
+static void
+libpqrcv_processTuples(PGresult *pgres, WalRcvExecResult *walres,
+                       const int nRetTypes, const Oid *retTypes)
+{
+   int     tupn;
+   int     coln;
+   int     nfields = PQnfields(pgres);
+   HeapTuple       tuple;
+   AttInMetadata  *attinmeta;
+   MemoryContext   rowcontext;
+   MemoryContext   oldcontext;
+
+   /* No point in doing anything here if there were no tuples returned. */
+   if (PQntuples(pgres) == 0)
+       return;
+
+   /* Make sure we got expected number of fields. */
+   if (nfields != nRetTypes)
+       ereport(ERROR,
+               (errmsg("invalid query responser"),
+                errdetail("Expected %d fields, got %d fields.",
+                          nRetTypes, nfields)));
+
+
+   walres->tuplestore = tuplestore_begin_heap(true, false, work_mem);
+
+   /* Create tuple descriptor corresponding to expected result. */
+   walres->tupledesc = CreateTemplateTupleDesc(nRetTypes, false);
+   for (coln = 0; coln < nRetTypes; coln++)
+       TupleDescInitEntry(walres->tupledesc, (AttrNumber) coln + 1,
+                          PQfname(pgres, coln), retTypes[coln], -1, 0);
+   attinmeta = TupleDescGetAttInMetadata(walres->tupledesc);
+
+   /* Create temporary context for local allocations. */
+   rowcontext = AllocSetContextCreate(CurrentMemoryContext,
+                                      "libpqrcv query result context",
+                                      ALLOCSET_DEFAULT_SIZES);
+
+   /* Process returned rows. */
+   for (tupn = 0; tupn < PQntuples(pgres); tupn++)
+   {
+       char   *cstrs[MaxTupleAttributeNumber];
+
+       CHECK_FOR_INTERRUPTS();
+
+       /* Do the allocations in temporary context. */
+       oldcontext = MemoryContextSwitchTo(rowcontext);
+
+       /*
+        * Fill cstrs with null-terminated strings of column values.
+        */
+       for (coln = 0; coln < nfields; coln++)
+       {
+           if (PQgetisnull(pgres, tupn, coln))
+               cstrs[coln] = NULL;
+           else
+               cstrs[coln] = PQgetvalue(pgres, tupn, coln);
+       }
+
+       /* Convert row to a tuple, and add it to the tuplestore */
+       tuple = BuildTupleFromCStrings(attinmeta, cstrs);
+       tuplestore_puttuple(walres->tuplestore, tuple);
+
+       /* Clean up */
+       MemoryContextSwitchTo(oldcontext);
+       MemoryContextReset(rowcontext);
+   }
+
+   MemoryContextDelete(rowcontext);
+}
+
+/*
+ * Public interface for sending generic queries (and commands).
  *
- * Returns if the command has succeeded and fills the err with palloced
- * error message if not.
+ * This can only be called from a process connected to a database.
+ *
+ * The query is run on conn->streamConn.  For results that return tuples,
+ * the rows are converted into walres->tuplestore using the nRetTypes type
+ * OIDs given in retTypes.  The returned WalRcvExecResult is palloc'd; its
+ * status tells the caller what kind of result was received, and on failure
+ * the status is WALRCV_ERROR and err holds the error message.
  */
-static bool
-libpqrcv_command(WalReceiverConn *conn, const char *cmd, char **err)
+static WalRcvExecResult *
+libpqrcv_exec(WalReceiverConn *conn, const char *query,
+             const int nRetTypes, const Oid *retTypes)
 {
-   PGresult       *res;
+   PGresult   *pgres = NULL;
+   WalRcvExecResult *walres = palloc0(sizeof(WalRcvExecResult));
 
-   res = libpqrcv_PQexec(conn->streamConn, cmd);
+   if (MyDatabaseId == InvalidOid)
+       ereport(ERROR,
+               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                errmsg("the query interface requires a database connection")));
 
-   if (PQresultStatus(res) != PGRES_COMMAND_OK)
+   pgres = libpqrcv_PQexec(conn->streamConn, query);
+
+   switch (PQresultStatus(pgres))
    {
-       PQclear(res);
-       *err = pchomp(PQerrorMessage(conn->streamConn));
-       return false;
+       case PGRES_SINGLE_TUPLE:
+       case PGRES_TUPLES_OK:
+           walres->status = WALRCV_OK_TUPLES;
+           libpqrcv_processTuples(pgres, walres, nRetTypes, retTypes);
+           break;
+
+       case PGRES_COPY_IN:
+           walres->status = WALRCV_OK_COPY_IN;
+           break;
+
+       case PGRES_COPY_OUT:
+           walres->status = WALRCV_OK_COPY_OUT;
+           break;
+
+       case PGRES_COPY_BOTH:
+           walres->status = WALRCV_OK_COPY_BOTH;
+           break;
+
+       case PGRES_COMMAND_OK:
+           walres->status = WALRCV_OK_COMMAND;
+           break;
+
+       /*
+        * Empty query is considered error.  Note that the message here is a
+        * translated string constant, unlike the pchomp() result used for
+        * the other error cases below.
+        */
+       case PGRES_EMPTY_QUERY:
+           walres->status = WALRCV_ERROR;
+           walres->err = _("empty query");
+           break;
+
+       case PGRES_NONFATAL_ERROR:
+       case PGRES_FATAL_ERROR:
+       case PGRES_BAD_RESPONSE:
+           walres->status = WALRCV_ERROR;
+           walres->err = pchomp(PQerrorMessage(conn->streamConn));
+           break;
    }
 
-   PQclear(res);
+   PQclear(pgres);     /* walres is what gets handed back to the caller */
 
-   return true;
+   return walres;
 }
 
 /*
index 259befa4e6c7ed2de775b1e7781e1d47edcd7b37..bb417b042ee52ddec726712cec94951e5d9b7db2 100644 (file)
@@ -15,6 +15,6 @@ include $(top_builddir)/src/Makefile.global
 override CPPFLAGS := -I$(srcdir) $(CPPFLAGS)
 
 OBJS = decode.o launcher.o logical.o logicalfuncs.o message.o origin.o \
-      proto.o relation.o reorderbuffer.o snapbuild.o worker.o
+      proto.o relation.o reorderbuffer.o snapbuild.o tablesync.o worker.o
 
 include $(top_srcdir)/src/backend/common.mk
index 20b43626ddd612478b08b845a9e50406c7397bbc..255b22597b64c5afcf7593817e46ae60d89e4460 100644 (file)
@@ -27,6 +27,7 @@
 #include "access/xact.h"
 
 #include "catalog/pg_subscription.h"
+#include "catalog/pg_subscription_rel.h"
 
 #include "libpq/pqsignal.h"
 
@@ -56,6 +57,8 @@
 #define DEFAULT_NAPTIME_PER_CYCLE 180000L
 
 int    max_logical_replication_workers = 4;
+int max_sync_workers_per_subscription = 2;
+
 LogicalRepWorker *MyLogicalRepWorker = NULL;
 
 typedef struct LogicalRepCtxStruct
@@ -198,20 +201,22 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
 
 /*
  * Walks the workers array and searches for one that matches given
- * subscription id.
+ * subscription id and relid.
  */
 LogicalRepWorker *
-logicalrep_worker_find(Oid subid)
+logicalrep_worker_find(Oid subid, Oid relid, bool only_running)
 {
    int i;
    LogicalRepWorker   *res = NULL;
 
    Assert(LWLockHeldByMe(LogicalRepWorkerLock));
+
    /* Search for attached worker for a given subscription id. */
    for (i = 0; i < max_logical_replication_workers; i++)
    {
        LogicalRepWorker   *w = &LogicalRepCtx->workers[i];
-       if (w->subid == subid && w->proc && IsBackendPid(w->proc->pid))
+       if (w->subid == subid && w->relid == relid &&
+           (!only_running || (w->proc && IsBackendPid(w->proc->pid))))
        {
            res = w;
            break;
@@ -225,7 +230,8 @@ logicalrep_worker_find(Oid subid)
  * Start new apply background worker.
  */
 void
-logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid)
+logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid,
+                        Oid relid)
 {
    BackgroundWorker    bgw;
    BackgroundWorkerHandle *bgw_handle;
@@ -270,10 +276,18 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid)
    }
 
    /* Prepare the worker info. */
-   memset(worker, 0, sizeof(LogicalRepWorker));
+   worker->proc = NULL;
    worker->dbid = dbid;
    worker->userid = userid;
    worker->subid = subid;
+   worker->relid = relid;
+   worker->relstate = SUBREL_STATE_UNKNOWN;
+   worker->relstate_lsn = InvalidXLogRecPtr;
+   worker->last_lsn = InvalidXLogRecPtr;
+   TIMESTAMP_NOBEGIN(worker->last_send_time);
+   TIMESTAMP_NOBEGIN(worker->last_recv_time);
+   worker->reply_lsn = InvalidXLogRecPtr;
+   TIMESTAMP_NOBEGIN(worker->reply_time);
 
    LWLockRelease(LogicalRepWorkerLock);
 
@@ -282,8 +296,12 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid)
        BGWORKER_BACKEND_DATABASE_CONNECTION;
    bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
    bgw.bgw_main = ApplyWorkerMain;
-   snprintf(bgw.bgw_name, BGW_MAXLEN,
-            "logical replication worker for subscription %u", subid);
+   if (OidIsValid(relid))
+       snprintf(bgw.bgw_name, BGW_MAXLEN,
+                "logical replication worker for subscription %u sync %u", subid, relid);
+   else
+       snprintf(bgw.bgw_name, BGW_MAXLEN,
+                "logical replication worker for subscription %u", subid);
 
    bgw.bgw_restart_time = BGW_NEVER_RESTART;
    bgw.bgw_notify_pid = MyProcPid;
@@ -307,13 +325,13 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid)
  * slot.
  */
 void
-logicalrep_worker_stop(Oid subid)
+logicalrep_worker_stop(Oid subid, Oid relid)
 {
    LogicalRepWorker *worker;
 
    LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
 
-   worker = logicalrep_worker_find(subid);
+   worker = logicalrep_worker_find(subid, relid, false);
 
    /* No worker, nothing to do. */
    if (!worker)
@@ -395,6 +413,31 @@ logicalrep_worker_stop(Oid subid)
    }
 }
 
+/*
+ * Wake up (using latch) the logical replication worker.
+ *
+ * Looks up the running worker registered for the given subscription and
+ * relation (relid is InvalidOid for the main apply worker) and sets its
+ * latch.  Does nothing if no such worker is currently running.
+ */
+void
+logicalrep_worker_wakeup(Oid subid, Oid relid)
+{
+   LogicalRepWorker   *worker;
+
+   LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+
+   /*
+    * Keep holding the lock until the latch has been set.  Worker slots are
+    * (re)initialized, including worker->proc, under LogicalRepWorkerLock,
+    * so once we let go of it worker->proc could change under us.
+    */
+   worker = logicalrep_worker_find(subid, relid, true);
+   if (worker)
+       logicalrep_worker_wakeup_ptr(worker);
+
+   LWLockRelease(LogicalRepWorkerLock);
+}
+
+/*
+ * Wake up the given logical replication worker by setting the latch of
+ * the process attached to it.
+ *
+ * worker->proc is dereferenced without a check, so the worker must be
+ * attached to a process.
+ */
+void
+logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker)
+{
+   Latch      *latch = &worker->proc->procLatch;
+
+   SetLatch(latch);
+}
+
 /*
  * Attach to a slot.
  */
@@ -457,6 +500,29 @@ logicalrep_worker_sigterm(SIGNAL_ARGS)
    SetLatch(MyLatch);
 }
 
+/*
+ * Return how many sync workers are registered for the given subscription.
+ *
+ * A slot is counted as soon as it carries the subscription id together
+ * with a valid relid; whether a process is attached to it is not checked.
+ * The caller must hold LogicalRepWorkerLock.
+ */
+int
+logicalrep_sync_worker_count(Oid subid)
+{
+   int         slot;
+   int         nsync = 0;
+
+   Assert(LWLockHeldByMe(LogicalRepWorkerLock));
+
+   /* Scan every worker slot and tally the table sync ones for subid. */
+   for (slot = 0; slot < max_logical_replication_workers; slot++)
+   {
+       LogicalRepWorker   *candidate = &LogicalRepCtx->workers[slot];
+
+       /* Slot belongs to some other subscription. */
+       if (candidate->subid != subid)
+           continue;
+
+       /* No relation attached, so this is not a sync worker. */
+       if (!OidIsValid(candidate->relid))
+           continue;
+
+       nsync++;
+   }
+
+   return nsync;
+}
+
 /*
  * ApplyLauncherShmemSize
  *     Compute space needed for replication launcher shared memory
@@ -512,7 +578,20 @@ ApplyLauncherShmemInit(void)
                        &found);
 
    if (!found)
+   {
+       int slot;
+
        memset(LogicalRepCtx, 0, ApplyLauncherShmemSize());
+
+       /* Initialize memory and spin locks for each worker slot. */
+       for (slot = 0; slot < max_logical_replication_workers; slot++)
+       {
+           LogicalRepWorker *worker = &LogicalRepCtx->workers[slot];
+
+           memset(worker, 0, sizeof(LogicalRepWorker));
+           SpinLockInit(&worker->relmutex);
+       }
+   }
 }
 
 /*
@@ -607,12 +686,13 @@ ApplyLauncherMain(Datum main_arg)
                LogicalRepWorker   *w;
 
                LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
-               w = logicalrep_worker_find(sub->oid);
+               w = logicalrep_worker_find(sub->oid, InvalidOid, false);
                LWLockRelease(LogicalRepWorkerLock);
 
                if (sub->enabled && w == NULL)
                {
-                   logicalrep_worker_launch(sub->dbid, sub->oid, sub->name, sub->owner);
+                   logicalrep_worker_launch(sub->dbid, sub->oid, sub->name,
+                                            sub->owner, InvalidOid);
                    last_start_time = now;
                    wait_time = wal_retrieve_retry_interval;
                    /* Limit to one worker per mainloop cycle. */
@@ -664,7 +744,7 @@ ApplyLauncherMain(Datum main_arg)
 Datum
 pg_stat_get_subscription(PG_FUNCTION_ARGS)
 {
-#define PG_STAT_GET_SUBSCRIPTION_COLS  7
+#define PG_STAT_GET_SUBSCRIPTION_COLS  8
    Oid         subid = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0);
    int         i;
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
@@ -723,27 +803,31 @@ pg_stat_get_subscription(PG_FUNCTION_ARGS)
        MemSet(nulls, 0, sizeof(nulls));
 
        values[0] = ObjectIdGetDatum(worker.subid);
-       values[1] = Int32GetDatum(worker_pid);
+       if (OidIsValid(worker.relid))
+           values[1] = ObjectIdGetDatum(worker.relid);
+       else
+           nulls[1] = true;
+       values[2] = Int32GetDatum(worker_pid);
        if (XLogRecPtrIsInvalid(worker.last_lsn))
-           nulls[2] = true;
+           nulls[3] = true;
        else
-           values[2] = LSNGetDatum(worker.last_lsn);
+           values[3] = LSNGetDatum(worker.last_lsn);
        if (worker.last_send_time == 0)
-           nulls[3] = true;
+           nulls[4] = true;
        else
-           values[3] = TimestampTzGetDatum(worker.last_send_time);
+           values[4] = TimestampTzGetDatum(worker.last_send_time);
        if (worker.last_recv_time == 0)
-           nulls[4] = true;
+           nulls[5] = true;
        else
-           values[4] = TimestampTzGetDatum(worker.last_recv_time);
+           values[5] = TimestampTzGetDatum(worker.last_recv_time);
        if (XLogRecPtrIsInvalid(worker.reply_lsn))
-           nulls[5] = true;
+           nulls[6] = true;
        else
-           values[5] = LSNGetDatum(worker.reply_lsn);
+           values[6] = LSNGetDatum(worker.reply_lsn);
        if (worker.reply_time == 0)
-           nulls[6] = true;
+           nulls[7] = true;
        else
-           values[6] = TimestampTzGetDatum(worker.reply_time);
+           values[7] = TimestampTzGetDatum(worker.reply_time);
 
        tuplestore_putvalues(tupstore, tupdesc, values, nulls);
 
index d8dc0c719402291b5d01a8b7e831952b91a1eaa7..875a08185a6d5c28948c950e87c9263675873fd6 100644 (file)
@@ -19,6 +19,7 @@
 #include "access/heapam.h"
 #include "access/sysattr.h"
 #include "catalog/namespace.h"
+#include "catalog/pg_subscription_rel.h"
 #include "nodes/makefuncs.h"
 #include "replication/logicalrelation.h"
 #include "replication/worker_internal.h"
@@ -357,6 +358,12 @@ logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode)
    else
        entry->localrel = heap_open(entry->localreloid, lockmode);
 
+   if (entry->state != SUBREL_STATE_READY)
+       entry->state = GetSubscriptionRelState(MySubscription->oid,
+                                              entry->localreloid,
+                                              &entry->statelsn,
+                                              true);
+
    return entry;
 }
 
index 3f242a8ed7068f48eeab40e4e73077344ed018d8..a73a7b98f97a1adbdd4b7db99f81e9fb31d15330 100644 (file)
@@ -499,51 +499,32 @@ SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid)
 }
 
 /*
- * Export a snapshot so it can be set in another session with SET TRANSACTION
- * SNAPSHOT.
- *
- * For that we need to start a transaction in the current backend as the
- * importing side checks whether the source transaction is still open to make
- * sure the xmin horizon hasn't advanced since then.
+ * Build the initial slot snapshot and convert it to normal snapshot that
+ * is understood by HeapTupleSatisfiesMVCC.
  *
- * After that we convert a locally built snapshot into the normal variant
- * understood by HeapTupleSatisfiesMVCC et al.
+ * The snapshot will be usable directly in current transaction or exported
+ * for loading in different transaction.
  */
-const char *
-SnapBuildExportSnapshot(SnapBuild *builder)
+Snapshot
+SnapBuildInitalSnapshot(SnapBuild *builder)
 {
    Snapshot    snap;
-   char       *snapname;
    TransactionId xid;
    TransactionId *newxip;
    int         newxcnt = 0;
 
+   /*
+    * The caller is expected to have set up the transaction already; only
+    * verify that here (compare, do not assign).
+    */
+   Assert(!FirstSnapshotSet);
+   Assert(XactIsoLevel == XACT_REPEATABLE_READ);
+
    if (builder->state != SNAPBUILD_CONSISTENT)
-       elog(ERROR, "cannot export a snapshot before reaching a consistent state");
+       elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state");
 
    if (!builder->committed.includes_all_transactions)
-       elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore");
+       elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore");
 
    /* so we don't overwrite the existing value */
    if (TransactionIdIsValid(MyPgXact->xmin))
-       elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid");
-
-   if (IsTransactionOrTransactionBlock())
-       elog(ERROR, "cannot export a snapshot from within a transaction");
-
-   if (SavedResourceOwnerDuringExport)
-       elog(ERROR, "can only export one snapshot at a time");
-
-   SavedResourceOwnerDuringExport = CurrentResourceOwner;
-   ExportInProgress = true;
-
-   StartTransactionCommand();
-
-   Assert(!FirstSnapshotSet);
-
-   /* There doesn't seem to a nice API to set these */
-   XactIsoLevel = XACT_REPEATABLE_READ;
-   XactReadOnly = true;
+       elog(ERROR, "cannot build an initial slot snapshot when MyPgXact->xmin already is valid");
 
    snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId());
 
@@ -578,7 +559,9 @@ SnapBuildExportSnapshot(SnapBuild *builder)
        if (test == NULL)
        {
            if (newxcnt >= GetMaxSnapshotXidCount())
-               elog(ERROR, "snapshot too large");
+               ereport(ERROR,
+                   (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+                    errmsg("initial slot snapshot too large")));
 
            newxip[newxcnt++] = xid;
        }
@@ -589,9 +572,43 @@ SnapBuildExportSnapshot(SnapBuild *builder)
    snap->xcnt = newxcnt;
    snap->xip = newxip;
 
+   return snap;
+}
+
+/*
+ * Export a snapshot so it can be set in another session with SET TRANSACTION
+ * SNAPSHOT.
+ *
+ * For that we need to start a transaction in the current backend as the
+ * importing side checks whether the source transaction is still open to make
+ * sure the xmin horizon hasn't advanced since then.
+ */
+const char *
+SnapBuildExportSnapshot(SnapBuild *builder)
+{
+   Snapshot    snap;
+   char       *snapname;
+
+   if (IsTransactionOrTransactionBlock())
+       elog(ERROR, "cannot export a snapshot from within a transaction");
+
+   if (SavedResourceOwnerDuringExport)
+       elog(ERROR, "can only export one snapshot at a time");
+
+   SavedResourceOwnerDuringExport = CurrentResourceOwner;
+   ExportInProgress = true;
+
+   StartTransactionCommand();
+
+   /* There doesn't seem to be a nice API to set these */
+   XactIsoLevel = XACT_REPEATABLE_READ;
+   XactReadOnly = true;
+
+   snap = SnapBuildInitalSnapshot(builder);
+
    /*
-    * now that we've built a plain snapshot, use the normal mechanisms for
-    * exporting it
+    * now that we've built a plain snapshot, make it active and use the
+    * normal mechanisms for exporting it
     */
    snapname = ExportSnapshot(snap);
 
diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c
new file mode 100644 (file)
index 0000000..3e16b0d
--- /dev/null
@@ -0,0 +1,840 @@
+/*-------------------------------------------------------------------------
+ * tablesync.c
+ *   PostgreSQL logical replication
+ *
+ * Copyright (c) 2012-2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   src/backend/replication/logical/tablesync.c
+ *
+ * NOTES
+ *   This file contains code for initial table data synchronization for
+ *   logical replication.
+ *
+ *   The initial data synchronization is done separately for each table,
+ *   in separate apply worker that only fetches the initial snapshot data
+ *   from the publisher and then synchronizes the position in stream with
+ *   the main apply worker.
+ *
+ *   There are several reasons for doing the synchronization this way:
+ *    - It allows us to parallelize the initial data synchronization
+ *      which lowers the time needed for it to happen.
+ *    - The initial synchronization does not have to hold the xid and LSN
+ *      for the time it takes to copy data of all tables, causing less
+ *      bloat and lower disk consumption compared to doing the
+ *      synchronization in single process for whole database.
+ *    - It allows us to synchronize the tables added after the initial
+ *      synchronization has finished.
+ *
+ *   The stream position synchronization works in multiple steps.
+ *    - Sync finishes copy and sets table state as SYNCWAIT and waits
+ *      for state to change in a loop.
+ *    - Apply periodically checks tables that are synchronizing for SYNCWAIT.
+ *      When the desired state appears it will compare its position in the
+ *      stream with the SYNCWAIT position and change the state according
+ *      to the following rules:
+ *       - if the apply is in front of the sync in the wal stream the new
+ *         state is set to CATCHUP and apply loops until the sync process
+ *         catches up to the same LSN as apply
+ *       - if the sync is in front of the apply in the wal stream the new
+ *         state is set to SYNCDONE
+ *       - if both apply and sync are at the same position in the wal stream
+ *         the state of the table is set to READY
+ *    - If the state was set to CATCHUP sync will read the stream and
+ *      apply changes until it catches up to the specified stream
+ *      position and then sets state to READY and signals apply that it
+ *      can stop waiting and exits, if the state was set to something
+ *      else than CATCHUP the sync process will simply end.
+ *    - If the state was set to SYNCDONE by apply, the apply will
+ *      continue tracking the table until it reaches the SYNCDONE stream
+ *      position at which point it sets state to READY and stops tracking.
+ *
+ *   The catalog pg_subscription_rel is used to keep information about
+ *   subscribed tables and their state and some transient state during
+ *   data synchronization is kept in shared memory.
+ *
+ *   Example flows look like this:
+ *    - Apply is in front:
+ *       sync:8
+ *         -> set SYNCWAIT
+ *       apply:10
+ *         -> set CATCHUP
+ *         -> enter wait-loop
+ *       sync:10
+ *         -> set READY
+ *         -> exit
+ *       apply:10
+ *         -> exit wait-loop
+ *         -> continue rep
+ *    - Sync in front:
+ *       sync:10
+ *         -> set SYNCWAIT
+ *       apply:8
+ *         -> set SYNCDONE
+ *         -> continue per-table filtering
+ *       sync:10
+ *         -> exit
+ *       apply:10
+ *         -> set READY
+ *         -> stop per-table filtering
+ *         -> continue rep
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pgstat.h"
+
+#include "access/xact.h"
+
+#include "catalog/pg_subscription_rel.h"
+#include "catalog/pg_type.h"
+
+#include "commands/copy.h"
+
+#include "replication/logicallauncher.h"
+#include "replication/logicalrelation.h"
+#include "replication/walreceiver.h"
+#include "replication/worker_internal.h"
+
+#include "storage/ipc.h"
+
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+
+static bool table_states_valid = false;
+
+StringInfo copybuf = NULL;
+
+/*
+ * Exit routine for synchronization worker.
+ *
+ * Commits the current transaction if there is one, flushes WAL, wakes up
+ * the main apply worker of our subscription, logs completion, closes the
+ * connection held in wrconn and terminates the process with exit code 0.
+ * This function never returns.
+ */
+static void pg_attribute_noreturn()
+finish_sync_worker(void)
+{
+   /* Commit any outstanding transaction. */
+   if (IsTransactionState())
+       CommitTransactionCommand();
+
+   /* And flush all writes (up to the current WAL write position). */
+   XLogFlush(GetXLogWriteRecPtr());
+
+   /*
+    * Find the main apply worker and signal it.  The apply worker is the
+    * one registered for our subscription with no relation (InvalidOid).
+    */
+   logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid);
+
+   ereport(LOG,
+           (errmsg("logical replication synchronization worker finished processing")));
+
+   /* Stop gracefully: drop the remote connection, then exit the process. */
+   walrcv_disconnect(wrconn);
+   proc_exit(0);
+}
+
+/*
+ * Wait until the synchronization state of the given relation changes
+ * away from origstate.
+ *
+ * The state is read from the worker slot registered for relid under our
+ * subscription.  Returns true once a different state is observed.  Returns
+ * false if no such worker slot exists, if the state has become
+ * SUBREL_STATE_UNKNOWN, or if SIGTERM was received while waiting.
+ */
+static bool
+wait_for_sync_status_change(Oid relid, char origstate)
+{
+   for (;;)
+   {
+       LogicalRepWorker   *syncworker;
+       bool                have_worker;
+       char                curstate = origstate;
+       int                 events;
+
+       /* Give up as soon as we have been asked to terminate. */
+       if (got_SIGTERM)
+           break;
+
+       /* Peek at the worker's state while holding the lock. */
+       LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+       syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid,
+                                           relid, false);
+       have_worker = (syncworker != NULL);
+       if (have_worker)
+           curstate = syncworker->relstate;
+       LWLockRelease(LogicalRepWorkerLock);
+
+       if (!have_worker || curstate == SUBREL_STATE_UNKNOWN)
+           return false;
+
+       if (curstate != origstate)
+           return true;
+
+       /* Nothing changed yet; sleep until the latch is set or we time out. */
+       events = WaitLatch(&MyProc->procLatch,
+                          WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                          10000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
+
+       /* emergency bailout if postmaster has died */
+       if (events & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+
+       ResetLatch(&MyProc->procLatch);
+   }
+
+   return false;
+}
+
+/*
+ * Callback from syscache invalidation.
+ *
+ * Marks the locally cached list of table synchronization states as stale,
+ * so that process_syncing_tables_for_apply() rebuilds it the next time it
+ * runs.  None of the callback arguments are used.
+ */
+void
+invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue)
+{
+   table_states_valid = false;
+}
+
+/*
+ * Table synchronization handshake, as seen from the sync worker.
+ *
+ * Nothing happens unless this worker is in CATCHUP state and current_lsn
+ * has reached the LSN stored alongside that state.  Once it has, the table
+ * is marked READY when we stopped exactly at that LSN, or SYNCDONE when we
+ * went past it (the "sync in front" scenario then takes over).  The new
+ * state is written to the catalog, streaming is ended and the worker exits
+ * through finish_sync_worker().
+ *
+ * Must be called inside a transaction.
+ */
+static void
+process_syncing_tables_for_sync(XLogRecPtr current_lsn)
+{
+   TimeLineID  tli;
+
+   Assert(IsTransactionState());
+
+   SpinLockAcquire(&MyLogicalRepWorker->relmutex);
+
+   /* Not catching up, or the target position is still ahead of us. */
+   if (MyLogicalRepWorker->relstate != SUBREL_STATE_CATCHUP ||
+       current_lsn < MyLogicalRepWorker->relstate_lsn)
+   {
+       SpinLockRelease(&MyLogicalRepWorker->relmutex);
+       return;
+   }
+
+   /* Exactly at the target means READY, beyond it means SYNCDONE. */
+   if (current_lsn == MyLogicalRepWorker->relstate_lsn)
+       MyLogicalRepWorker->relstate = SUBREL_STATE_READY;
+   else
+       MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCDONE;
+   MyLogicalRepWorker->relstate_lsn = current_lsn;
+
+   SpinLockRelease(&MyLogicalRepWorker->relmutex);
+
+   /* Persist the new state in the catalog. */
+   SetSubscriptionRelState(MyLogicalRepWorker->subid,
+                           MyLogicalRepWorker->relid,
+                           MyLogicalRepWorker->relstate,
+                           MyLogicalRepWorker->relstate_lsn);
+
+   /* Our job is done: stop streaming and exit (this does not return). */
+   walrcv_endstreaming(wrconn, &tli);
+   finish_sync_worker();
+}
+
+/*
+ * Handle table synchronization cooperation from the apply worker.
+ *
+ * Walk over all subscription tables that are individually tracked by the
+ * apply process (currently, all that have state other than
+ * SUBREL_STATE_READY) and manage synchronization for them.
+ *
+ * If there are tables that need synchronizing and are not being synchronized
+ * yet, start sync workers for them (if there are free slots for sync
+ * workers).
+ *
+ * For tables that are being synchronized already, check if sync workers
+ * either need action from the apply worker or have finished.
+ *
+ * The usual scenario is that the apply got ahead of the sync while the sync
+ * ran, and then the action needed by apply is to mark a table for CATCHUP and
+ * wait for the catchup to happen.  In the less common case that sync worker
+ * got in front of the apply worker, the table is marked as SYNCDONE but not
+ * ready yet, as it needs to be tracked until apply reaches the same position
+ * to which it was synced.
+ *
+ * If the synchronization position is reached, then the table can be marked as
+ * READY and is no longer tracked.
+ */
+static void
+process_syncing_tables_for_apply(XLogRecPtr current_lsn)
+{
+   static List *table_states = NIL;
+   ListCell   *lc;
+
+   Assert(!IsTransactionState());
+
+   /*
+    * We need up to date sync state info for subscription tables here.
+    * The cached list is only rebuilt while table_states_valid is false;
+    * otherwise the copy kept in the static table_states list is reused.
+    */
+   if (!table_states_valid)
+   {
+       MemoryContext   oldctx;
+       List           *rstates;
+       ListCell       *lc;
+       SubscriptionRelState *rstate;
+
+       /* Clean the old list. */
+       list_free_deep(table_states);
+       table_states = NIL;
+
+       StartTransactionCommand();
+
+       /* Fetch all non-ready tables. */
+       rstates = GetSubscriptionNotReadyRelations(MySubscription->oid);
+
+       /*
+        * Allocate the tracking info in a permanent memory context, so that
+        * the copied entries outlive the transaction started above.
+        */
+       oldctx = MemoryContextSwitchTo(CacheMemoryContext);
+       foreach(lc, rstates)
+       {
+           rstate = palloc(sizeof(SubscriptionRelState));
+           memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState));
+           table_states = lappend(table_states, rstate);
+       }
+       MemoryContextSwitchTo(oldctx);
+
+       CommitTransactionCommand();
+
+       table_states_valid = true;
+   }
+
+   /* Process all tables that are being synchronized. */
+   foreach(lc, table_states)
+   {
+       SubscriptionRelState *rstate = (SubscriptionRelState *)lfirst(lc);
+
+       if (rstate->state == SUBREL_STATE_SYNCDONE)
+       {
+           /*
+            * Apply has caught up to the position where the table sync
+            * has finished.  Time to mark the table as ready so that
+            * apply will just continue to replicate it normally.
+            */
+           if (current_lsn >= rstate->lsn)
+           {
+               rstate->state = SUBREL_STATE_READY;
+               rstate->lsn = current_lsn;
+               StartTransactionCommand();
+               SetSubscriptionRelState(MyLogicalRepWorker->subid,
+                                       rstate->relid, rstate->state,
+                                       rstate->lsn);
+               CommitTransactionCommand();
+           }
+       }
+       else
+       {
+           LogicalRepWorker   *syncworker;
+           int                 nsyncworkers = 0;
+
+           LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+           syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid,
+                                               rstate->relid, false);
+           if (syncworker)
+           {
+               SpinLockAcquire(&syncworker->relmutex);
+               rstate->state = syncworker->relstate;
+               rstate->lsn = syncworker->relstate_lsn;
+               SpinLockRelease(&syncworker->relmutex);
+           }
+           else
+               /*
+                * If there is no sync worker for this table yet, count the
+                * running sync workers for this subscription while we have
+                * the lock; the count is used below to decide whether a new
+                * sync worker may be launched.
+                */
+               nsyncworkers = logicalrep_sync_worker_count(MyLogicalRepWorker->subid);
+           LWLockRelease(LogicalRepWorkerLock);
+
+           /*
+            * There is a worker synchronizing the relation and waiting for
+            * apply to do something.
+            */
+           if (syncworker && rstate->state == SUBREL_STATE_SYNCWAIT)
+           {
+               /*
+                * There are three possible synchronization situations here.
+                *
+                * a) Apply is in front of the table sync: We tell the table
+                *    sync to CATCHUP.
+                *
+                * b) Apply is behind the table sync: We tell the table sync
+                *    to mark the table as SYNCDONE and finish.
+                *
+                * c) Apply and table sync are at the same position: We tell
+                *    table sync to mark the table as READY and finish.
+                *
+                * In any case we'll need to wait for table sync to change
+                * the state in catalog and only then continue ourselves.
+                */
+               if (current_lsn > rstate->lsn)
+               {
+                   rstate->state = SUBREL_STATE_CATCHUP;
+                   rstate->lsn = current_lsn;
+               }
+               else if (current_lsn == rstate->lsn)
+               {
+                   rstate->state = SUBREL_STATE_READY;
+                   rstate->lsn = current_lsn;
+               }
+               else
+                   rstate->state = SUBREL_STATE_SYNCDONE;
+
+               SpinLockAcquire(&syncworker->relmutex);
+               syncworker->relstate = rstate->state;
+               syncworker->relstate_lsn = rstate->lsn;
+               SpinLockRelease(&syncworker->relmutex);
+
+               /* Signal the sync worker, as it may be waiting for us. */
+               logicalrep_worker_wakeup_ptr(syncworker);
+
+               /*
+                * Enter busy loop and wait for synchronization status
+                * change.
+                */
+               wait_for_sync_status_change(rstate->relid, rstate->state);
+           }
+
+           /*
+            * If there is no sync worker registered for the table and
+            * there is some free sync worker slot, start new sync worker
+            * for the table.
+            */
+           else if (!syncworker && nsyncworkers < max_sync_workers_per_subscription)
+           {
+               logicalrep_worker_launch(MyLogicalRepWorker->dbid,
+                                        MySubscription->oid,
+                                        MySubscription->name,
+                                        MyLogicalRepWorker->userid,
+                                        rstate->relid);
+           }
+       }
+   }
+}
+
+/*
+ * Handle possible state change(s) of tables that are being synchronized.
+ *
+ * Dispatches to the handler matching this worker's role: the apply worker
+ * and the table sync workers each have their own processing routine.
+ */
+void
+process_syncing_tables(XLogRecPtr current_lsn)
+{
+   if (!am_tablesync_worker())
+   {
+       process_syncing_tables_for_apply(current_lsn);
+       return;
+   }
+
+   process_syncing_tables_for_sync(current_lsn);
+}
+
+/*
+ * Build the column name list handed to COPY, based on the logical relation
+ * mapping.
+ *
+ * Walks the local tuple descriptor and, for every local column that is not
+ * dropped and has a counterpart in rel->attrmap, appends the corresponding
+ * remote column name to the result list.
+ */
+static List *
+make_copy_attnamelist(LogicalRepRelMapEntry *rel)
+{
+   TupleDesc   tupdesc = RelationGetDescr(rel->localrel);
+   List       *names = NIL;
+   int         attno;
+
+   for (attno = 0; attno < tupdesc->natts; attno++)
+   {
+       int     remoteidx = rel->attrmap[attno];
+
+       /*
+        * Only live local columns that also exist on the remote side take
+        * part in the copy.
+        */
+       if (!tupdesc->attrs[attno]->attisdropped && remoteidx >= 0)
+       {
+           char   *colname = rel->remoterel.attnames[remoteidx];
+
+           names = lappend(names, makeString(colname));
+       }
+   }
+
+   return names;
+}
+
+/*
+ * Data source callback for the COPY FROM, which reads from the remote
+ * connection and passes the data back to our local COPY.
+ *
+ * At most maxread bytes are stored into outbuf.  The function keeps waiting
+ * for remote data until at least minread bytes have been gathered, and
+ * returns the number of bytes actually stored.  A shorter result is returned
+ * when walrcv_receive() reports a negative length (presumably the end of the
+ * remote COPY stream).
+ *
+ * Data received but not consumed stays in copybuf and is handed out first
+ * on the next call.
+ */
+static int
+copy_read_data(void *outbuf, int minread, int maxread)
+{
+   int     bytesread = 0;
+   int     avail;
+
+   /* If there are some leftover data from previous read, use them. */
+   avail = copybuf->len - copybuf->cursor;
+   if (avail)
+   {
+       if (avail > maxread)
+           avail = maxread;
+       memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+
+       /*
+        * Advance the output position too; otherwise the receive loop below
+        * would overwrite the bytes just copied whenever the leftover data
+        * alone does not satisfy minread.
+        */
+       outbuf = (void *) ((char *) outbuf + avail);
+       copybuf->cursor += avail;
+       maxread -= avail;
+       bytesread += avail;
+   }
+
+   while (!got_SIGTERM && maxread > 0 && bytesread < minread)
+   {
+       pgsocket    fd = PGINVALID_SOCKET;
+       int         rc;
+       int         len;
+       char       *buf = NULL;
+
+       for (;;)
+       {
+           /* Try read the data. */
+           len = walrcv_receive(wrconn, &buf, &fd);
+
+           CHECK_FOR_INTERRUPTS();
+
+           if (len == 0)
+               break;          /* nothing available now, go wait */
+           else if (len < 0)
+               return bytesread;
+           else
+           {
+               /* Process the data */
+               copybuf->data = buf;
+               copybuf->len = len;
+               copybuf->cursor = 0;
+
+               avail = copybuf->len - copybuf->cursor;
+               if (avail > maxread)
+                   avail = maxread;
+               memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+               outbuf = (void *) ((char *) outbuf + avail);
+               copybuf->cursor += avail;
+               maxread -= avail;
+               bytesread += avail;
+           }
+
+           if (maxread <= 0 || bytesread >= minread)
+               return bytesread;
+       }
+
+       /*
+        * Wait for more data or latch.
+        */
+       rc = WaitLatchOrSocket(&MyProc->procLatch,
+                              WL_SOCKET_READABLE | WL_LATCH_SET |
+                              WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                              fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA);
+
+       /* Emergency bailout if postmaster has died */
+       if (rc & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+
+       ResetLatch(&MyProc->procLatch);
+   }
+
+   /* Check for exit condition. */
+   if (got_SIGTERM)
+       proc_exit(0);
+
+   return bytesread;
+}
+
+
+/*
+ * Get information about remote relation in similar fashion the RELATION
+ * message provides during replication.
+ *
+ * Runs two queries over wrconn: the first looks up the OID and replica
+ * identity of the table nspname.relname, the second fetches its columns.
+ * The results are stored in *lrel (nspname, relname, remoteid, replident,
+ * attnames, atttyps, attkeys, natts).  nspname and relname are stored by
+ * pointer, not copied.
+ *
+ * Raises ERROR if either query fails or the table is not found.
+ */
+static void
+fetch_remote_table_info(char *nspname, char *relname,
+                       LogicalRepRelation *lrel)
+{
+   WalRcvExecResult   *res;
+   StringInfoData      cmd;
+   TupleTableSlot     *slot;
+   Oid                 tableRow[2] = {OIDOID, CHAROID};
+   Oid                 attrRow[4] = {TEXTOID, OIDOID, INT4OID, BOOLOID};
+   bool                isnull;
+   int                 natt;
+
+   lrel->nspname = nspname;
+   lrel->relname = relname;
+
+   /*
+    * First fetch Oid and replica identity.
+    *
+    * pg_class must be joined to pg_namespace on relnamespace; without the
+    * join condition a table of the same name in any other schema would
+    * match as well.
+    */
+   initStringInfo(&cmd);
+   appendStringInfo(&cmd, "SELECT c.oid, c.relreplident"
+                          "  FROM pg_catalog.pg_class c"
+                          "  INNER JOIN pg_catalog.pg_namespace n"
+                          "        ON (c.relnamespace = n.oid)"
+                          " WHERE n.nspname = %s"
+                          "   AND c.relname = %s"
+                          "   AND c.relkind = 'r'",
+                          quote_literal_cstr(nspname),
+                          quote_literal_cstr(relname));
+   res = walrcv_exec(wrconn, cmd.data, 2, tableRow);
+
+   if (res->status != WALRCV_OK_TUPLES)
+       ereport(ERROR,
+               (errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s",
+                       nspname, relname, res->err)));
+
+   slot = MakeSingleTupleTableSlot(res->tupledesc);
+   if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot))
+       ereport(ERROR,
+               (errmsg("table \"%s.%s\" not found on publisher",
+                       nspname, relname)));
+
+   lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull));
+   Assert(!isnull);
+   lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull));
+   Assert(!isnull);
+
+   ExecDropSingleTupleTableSlot(slot);
+   walrcv_clear_result(res);
+
+   /* Now fetch columns. */
+   resetStringInfo(&cmd);
+   appendStringInfo(&cmd,
+                    "SELECT a.attname,"
+                    "       a.atttypid,"
+                    "       a.atttypmod,"
+                    "       a.attnum = ANY(i.indkey)"
+                    "  FROM pg_catalog.pg_attribute a"
+                    "  LEFT JOIN pg_catalog.pg_index i"
+                    "       ON (i.indexrelid = pg_get_replica_identity_index(%u))"
+                    " WHERE a.attnum > 0::pg_catalog.int2"
+                    "   AND NOT a.attisdropped"
+                    "   AND a.attrelid = %u"
+                    " ORDER BY a.attnum",
+                    lrel->remoteid, lrel->remoteid);
+   res = walrcv_exec(wrconn, cmd.data, 4, attrRow);
+
+   if (res->status != WALRCV_OK_TUPLES)
+       ereport(ERROR,
+               (errmsg("could not fetch table info for table \"%s.%s\": %s",
+                       nspname, relname, res->err)));
+
+   /* We don't know number of rows coming, so allocate enough space. */
+   lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *));
+   lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid));
+   lrel->attkeys = NULL;
+
+   natt = 0;
+   slot = MakeSingleTupleTableSlot(res->tupledesc);
+   while (tuplestore_gettupleslot(res->tuplestore, true, false, slot))
+   {
+       /*
+        * TextDatumGetCString() already returns a freshly palloc'd string,
+        * so no additional pstrdup() is needed.
+        */
+       lrel->attnames[natt] =
+           TextDatumGetCString(slot_getattr(slot, 1, &isnull));
+       Assert(!isnull);
+       lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 2, &isnull));
+       Assert(!isnull);
+
+       /* Column 4 tells whether the attribute is part of the identity index. */
+       if (DatumGetBool(slot_getattr(slot, 4, &isnull)))
+           lrel->attkeys = bms_add_member(lrel->attkeys, natt);
+
+       /* Should never happen. */
+       if (++natt >= MaxTupleAttributeNumber)
+           elog(ERROR, "too many columns in remote table \"%s.%s\"",
+                       nspname, relname);
+
+       ExecClearTuple(slot);
+   }
+   ExecDropSingleTupleTableSlot(slot);
+
+   lrel->natts = natt;
+
+   walrcv_clear_result(res);
+   pfree(cmd.data);
+}
+
+/*
+ * Copy the existing contents of a table from the publisher.
+ *
+ * Looks up the remote definition of the table, registers it in the relation
+ * map, starts COPY ... TO STDOUT on the publisher and feeds the received
+ * data into a local COPY FROM through copy_read_data().
+ *
+ * Caller is responsible for locking the local relation.
+ */
+static void
+copy_table(Relation rel)
+{
+   LogicalRepRelMapEntry *mapentry;
+   LogicalRepRelation  remoterel;
+   WalRcvExecResult   *result;
+   StringInfoData      copycmd;
+   CopyState   copystate;
+   List       *columns;
+
+   /* Get the publisher relation info. */
+   fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)),
+                           RelationGetRelationName(rel),
+                           &remoterel);
+
+   /* Register the remote relation and map it to the local one. */
+   logicalrep_relmap_update(&remoterel);
+   mapentry = logicalrep_rel_open(remoterel.remoteid, NoLock);
+   Assert(rel == mapentry->localrel);
+
+   /* Ask the publisher to start sending the table contents. */
+   initStringInfo(&copycmd);
+   appendStringInfo(&copycmd, "COPY %s TO STDOUT",
+                    quote_qualified_identifier(remoterel.nspname,
+                                               remoterel.relname));
+   result = walrcv_exec(wrconn, copycmd.data, 0, NULL);
+   pfree(copycmd.data);
+
+   if (result->status != WALRCV_OK_COPY_OUT)
+       ereport(ERROR,
+               (errmsg("could not start initial contents copy for table \"%s.%s\": %s",
+                       remoterel.nspname, remoterel.relname, result->err)));
+   walrcv_clear_result(result);
+
+   /* Buffer used by copy_read_data() to hold received data. */
+   copybuf = makeStringInfo();
+
+   /* Set up the local COPY FROM that ingests the publisher's data. */
+   columns = make_copy_attnamelist(mapentry);
+   copystate = BeginCopyFrom(NULL, rel, NULL, false, copy_read_data,
+                             columns, NIL);
+
+   /* Run the copy; the row count it returns is not needed. */
+   (void) CopyFrom(copystate);
+
+   logicalrep_rel_close(mapentry, NoLock);
+}
+
+/*
+ * Start syncing the table in the sync worker.
+ *
+ * Reads the current sync state of the worker's table from the catalog,
+ * connects to the publisher and, for tables in INIT or DATASYNC state,
+ * copies the table contents using the snapshot of a newly created temporary
+ * slot.  *origin_startpos is set by walrcv_create_slot() on that path.
+ * Afterwards the worker waits for the apply worker to decide how to
+ * continue; unless told to CATCHUP, the new state is stored and
+ * finish_sync_worker() is called.
+ *
+ * The returned slot name is palloced in current memory context.
+ */
+char *
+LogicalRepSyncTableStart(XLogRecPtr *origin_startpos)
+{
+   char           *slotname;
+   char           *err;
+   char            relstate;
+   XLogRecPtr      relstate_lsn;
+
+   /*
+    * Check the state of the table synchronization.
+    *
+    * The catalog lookup is done before taking the spinlock: it can take a
+    * long time and can raise an error, neither of which is acceptable
+    * while a spinlock is held.  Only the plain assignments are done under
+    * the lock.
+    */
+   StartTransactionCommand();
+   relstate = GetSubscriptionRelState(MyLogicalRepWorker->subid,
+                                      MyLogicalRepWorker->relid,
+                                      &relstate_lsn,
+                                      false);
+   CommitTransactionCommand();
+
+   SpinLockAcquire(&MyLogicalRepWorker->relmutex);
+   MyLogicalRepWorker->relstate = relstate;
+   MyLogicalRepWorker->relstate_lsn = relstate_lsn;
+   SpinLockRelease(&MyLogicalRepWorker->relmutex);
+
+   /*
+    * To build a slot name for the sync work, we are limited to NAMEDATALEN -
+    * 1 characters.  We cut the original slot name to NAMEDATALEN - 28 chars
+    * and append _%u_sync_%u (1 + 10 + 6 + 10 + '\0').  (It's actually the
+    * NAMEDATALEN on the remote that matters, but this scheme will also work
+    * reasonably if that is different.)
+    */
+   StaticAssertStmt(NAMEDATALEN >= 32, "NAMEDATALEN too small"); /* for sanity */
+   slotname = psprintf("%.*s_%u_sync_%u",
+                       NAMEDATALEN - 28,
+                       MySubscription->slotname,
+                       MySubscription->oid,
+                       MyLogicalRepWorker->relid);
+
+   wrconn = walrcv_connect(MySubscription->conninfo, true, slotname, &err);
+   if (wrconn == NULL)
+       ereport(ERROR,
+               (errmsg("could not connect to the publisher: %s", err)));
+
+   switch (MyLogicalRepWorker->relstate)
+   {
+       case SUBREL_STATE_INIT:
+       case SUBREL_STATE_DATASYNC:
+           {
+               Relation    rel;
+               WalRcvExecResult   *res;
+
+               SpinLockAcquire(&MyLogicalRepWorker->relmutex);
+               MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC;
+               MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr;
+               SpinLockRelease(&MyLogicalRepWorker->relmutex);
+
+               /* Update the state and make it visible to others. */
+               StartTransactionCommand();
+               SetSubscriptionRelState(MyLogicalRepWorker->subid,
+                                       MyLogicalRepWorker->relid,
+                                       MyLogicalRepWorker->relstate,
+                                       MyLogicalRepWorker->relstate_lsn);
+               CommitTransactionCommand();
+
+               /*
+                * We want to do the table data sync in single
+                * transaction.
+                */
+               StartTransactionCommand();
+
+               /*
+                * Use standard write lock here. It might be better to
+                * disallow access to table while it's being synchronized.
+                * But we don't want to block the main apply process from
+                * working and it has to open relation in RowExclusiveLock
+                * when remapping remote relation id to local one.
+                */
+               rel = heap_open(MyLogicalRepWorker->relid, RowExclusiveLock);
+
+               /*
+                * Create temporary slot for the sync process.
+                * We do this inside transaction so that we can use the
+                * snapshot made by the slot to get existing data.
+                */
+               res = walrcv_exec(wrconn,
+                                 "BEGIN READ ONLY ISOLATION LEVEL "
+                                 "REPEATABLE READ", 0, NULL);
+               if (res->status != WALRCV_OK_COMMAND)
+                   ereport(ERROR,
+                           (errmsg("table copy could not start transaction on publisher"),
+                            errdetail("The error was: %s", res->err)));
+               walrcv_clear_result(res);
+
+               /*
+                * Create new temporary logical decoding slot.
+                *
+                * We'll use slot for data copy so make sure the snapshot
+                * is used for the transaction, that way the COPY will get
+                * data that is consistent with the lsn used by the slot
+                * to start decoding.
+                */
+               walrcv_create_slot(wrconn, slotname, true,
+                                  CRS_USE_SNAPSHOT, origin_startpos);
+
+               copy_table(rel);
+
+               res = walrcv_exec(wrconn, "COMMIT", 0, NULL);
+               if (res->status != WALRCV_OK_COMMAND)
+                   ereport(ERROR,
+                           (errmsg("table copy could not finish transaction on publisher"),
+                            errdetail("The error was: %s", res->err)));
+               walrcv_clear_result(res);
+
+               /* Keep the lock until the end of the local transaction. */
+               heap_close(rel, NoLock);
+
+               /* Make the copy visible. */
+               CommandCounterIncrement();
+
+               /*
+                * We are done with the initial data synchronization,
+                * update the state.
+                */
+               SpinLockAcquire(&MyLogicalRepWorker->relmutex);
+               MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT;
+               MyLogicalRepWorker->relstate_lsn = *origin_startpos;
+               SpinLockRelease(&MyLogicalRepWorker->relmutex);
+
+               /*
+                * Wait for main apply worker to either tell us to
+                * catchup or that we are done.
+                */
+               wait_for_sync_status_change(MyLogicalRepWorker->relid,
+                                           MyLogicalRepWorker->relstate);
+               if (MyLogicalRepWorker->relstate != SUBREL_STATE_CATCHUP)
+               {
+                   /* Update the new state. */
+                   SetSubscriptionRelState(MyLogicalRepWorker->subid,
+                                           MyLogicalRepWorker->relid,
+                                           MyLogicalRepWorker->relstate,
+                                           MyLogicalRepWorker->relstate_lsn);
+                   finish_sync_worker();
+               }
+               break;
+           }
+       case SUBREL_STATE_SYNCDONE:
+       case SUBREL_STATE_READY:
+           /* Nothing to do here but finish. */
+           finish_sync_worker();
+           break;
+       default:
+           elog(ERROR, "unknown relation state \"%c\"",
+                MyLogicalRepWorker->relstate);
+   }
+
+   return slotname;
+}
index c3e54af259137e9890ac7ae3fa0eebaa6c0dac4e..bbf3506be04db63cb5150223f4efc0f524bda2eb 100644 (file)
@@ -32,6 +32,7 @@
 
 #include "catalog/namespace.h"
 #include "catalog/pg_subscription.h"
+#include "catalog/pg_subscription_rel.h"
 
 #include "commands/trigger.h"
 
@@ -101,7 +102,7 @@ typedef struct SlotErrCallbackArg
 } SlotErrCallbackArg;
 
 static MemoryContext   ApplyContext = NULL;
-static MemoryContext   ApplyCacheContext = NULL;
+MemoryContext          ApplyCacheContext = NULL;
 
 WalReceiverConn       *wrconn = NULL;
 
@@ -109,6 +110,7 @@ Subscription       *MySubscription = NULL;
 bool               MySubscriptionValid = false;
 
 bool               in_remote_transaction = false;
+static XLogRecPtr  remote_final_lsn = InvalidXLogRecPtr;
 
 static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply);
 
@@ -116,6 +118,30 @@ static void store_flush_position(XLogRecPtr remote_lsn);
 
 static void reread_subscription(void);
 
+/*
+ * Decide whether this worker should apply changes for the given relation.
+ *
+ * This is mainly needed for the initial relation data sync, which runs in a
+ * separate worker process in parallel; the main apply worker needs a way to
+ * skip changes that arrive for a table while it is being synchronized.
+ *
+ * A table sync worker applies changes only for its own table.  The apply
+ * worker applies changes for READY tables, and for SYNCDONE tables once the
+ * recorded state LSN is not beyond the end of the current remote
+ * transaction.
+ *
+ * Note we need to do smaller or equals comparison for SYNCDONE state because
+ * it might hold position of end of initial slot consistent point WAL
+ * record + 1 (ie start of next record) and next record can be COMMIT of
+ * transaction we are now processing (which is what we set remote_final_lsn
+ * to in apply_handle_begin).
+ */
+static bool
+should_apply_changes_for_rel(LogicalRepRelMapEntry *rel)
+{
+   /* A sync worker only cares about the one table it is synchronizing. */
+   if (am_tablesync_worker())
+       return MyLogicalRepWorker->relid == rel->localreloid;
+
+   switch (rel->state)
+   {
+       case SUBREL_STATE_READY:
+           return true;
+
+       case SUBREL_STATE_SYNCDONE:
+           return rel->statelsn <= remote_final_lsn;
+
+       default:
+           return false;
+   }
+}
+
 /*
  * Make sure that we started local transaction.
  *
@@ -398,6 +424,8 @@ apply_handle_begin(StringInfo s)
    replorigin_session_origin_timestamp = begin_data.committime;
    replorigin_session_origin_lsn = begin_data.final_lsn;
 
+   remote_final_lsn = begin_data.final_lsn;
+
    in_remote_transaction = true;
 
    pgstat_report_activity(STATE_RUNNING, NULL);
@@ -418,7 +446,10 @@ apply_handle_commit(StringInfo s)
    Assert(commit_data.commit_lsn == replorigin_session_origin_lsn);
    Assert(commit_data.committime == replorigin_session_origin_timestamp);
 
-   if (IsTransactionState())
+   Assert(commit_data.commit_lsn == remote_final_lsn);
+
+   /* The synchronization worker runs in single transaction. */
+   if (IsTransactionState() && !am_tablesync_worker())
    {
        CommitTransactionCommand();
 
@@ -427,6 +458,9 @@ apply_handle_commit(StringInfo s)
 
    in_remote_transaction = false;
 
+   /* Process any tables that are being synchronized in parallel. */
+   process_syncing_tables(commit_data.end_lsn);
+
    pgstat_report_activity(STATE_IDLE, NULL);
 }
 
@@ -442,7 +476,8 @@ apply_handle_origin(StringInfo s)
     * ORIGIN message can only come inside remote transaction and before
     * any actual writes.
     */
-   if (!in_remote_transaction || IsTransactionState())
+   if (!in_remote_transaction ||
+       (IsTransactionState() && !am_tablesync_worker()))
        ereport(ERROR,
                (errcode(ERRCODE_PROTOCOL_VIOLATION),
                 errmsg("ORIGIN message sent out of order")));
@@ -515,6 +550,15 @@ apply_handle_insert(StringInfo s)
 
    relid = logicalrep_read_insert(s, &newtup);
    rel = logicalrep_rel_open(relid, RowExclusiveLock);
+   if (!should_apply_changes_for_rel(rel))
+   {
+       /*
+        * The relation can't become interesting in the middle of the
+        * transaction so it's safe to unlock it.
+        */
+       logicalrep_rel_close(rel, RowExclusiveLock);
+       return;
+   }
 
    /* Initialize the executor state. */
    estate = create_estate_for_relation(rel);
@@ -607,6 +651,15 @@ apply_handle_update(StringInfo s)
    relid = logicalrep_read_update(s, &has_oldtup, &oldtup,
                                   &newtup);
    rel = logicalrep_rel_open(relid, RowExclusiveLock);
+   if (!should_apply_changes_for_rel(rel))
+   {
+       /*
+        * The relation can't become interesting in the middle of the
+        * transaction so it's safe to unlock it.
+        */
+       logicalrep_rel_close(rel, RowExclusiveLock);
+       return;
+   }
 
    /* Check if we can do the update. */
    check_relation_updatable(rel);
@@ -716,6 +769,15 @@ apply_handle_delete(StringInfo s)
 
    relid = logicalrep_read_delete(s, &oldtup);
    rel = logicalrep_rel_open(relid, RowExclusiveLock);
+   if (!should_apply_changes_for_rel(rel))
+   {
+       /*
+        * The relation can't become interesting in the middle of the
+        * transaction so it's safe to unlock it.
+        */
+       logicalrep_rel_close(rel, RowExclusiveLock);
+       return;
+   }
 
    /* Check if we can do the delete. */
    check_relation_updatable(rel);
@@ -927,10 +989,8 @@ UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply)
  * Apply main loop.
  */
 static void
-ApplyLoop(void)
+LogicalRepApplyLoop(XLogRecPtr last_received)
 {
-   XLogRecPtr  last_received = InvalidXLogRecPtr;
-
    /* Init the ApplyContext which we use for easier cleanup. */
    ApplyContext = AllocSetContextCreate(TopMemoryContext,
                                         "ApplyContext",
@@ -1014,15 +1074,18 @@ ApplyLoop(void)
                    }
                    else if (c == 'k')
                    {
-                       XLogRecPtr  endpos;
+                       XLogRecPtr  end_lsn;
                        TimestampTz timestamp;
                        bool        reply_requested;
 
-                       endpos = pq_getmsgint64(&s);
+                       end_lsn = pq_getmsgint64(&s);
                        timestamp = pq_getmsgint64(&s);
                        reply_requested = pq_getmsgbyte(&s);
 
-                       send_feedback(endpos, reply_requested, false);
+                       if (last_received < end_lsn)
+                           last_received = end_lsn;
+
+                       send_feedback(last_received, reply_requested, false);
                        UpdateWorkerStats(last_received, timestamp, true);
                    }
                    /* other message types are purposefully ignored */
@@ -1030,6 +1093,9 @@ ApplyLoop(void)
 
                len = walrcv_receive(wrconn, &buf, &fd);
            }
+
+           /* confirm all writes at once */
+           send_feedback(last_received, false, false);
        }
 
        if (!in_remote_transaction)
@@ -1038,15 +1104,13 @@ ApplyLoop(void)
             * If we didn't get any transactions for a while there might be
             * unconsumed invalidation messages in the queue, consume them now.
             */
-           StartTransactionCommand();
-           /* Check for subscription change */
+           AcceptInvalidationMessages();
            if (!MySubscriptionValid)
                reread_subscription();
-           CommitTransactionCommand();
-       }
 
-       /* confirm all writes at once */
-       send_feedback(last_received, false, false);
+           /* Process any table synchronization changes. */
+           process_syncing_tables(last_received);
+       }
 
        /* Cleanup the memory. */
        MemoryContextResetAndDeleteChildren(ApplyContext);
@@ -1054,7 +1118,11 @@ ApplyLoop(void)
 
        /* Check if we need to exit the streaming loop. */
        if (endofstream)
+       {
+           TimeLineID  tli;
+           walrcv_endstreaming(wrconn, &tli);
            break;
+       }
 
        /*
         * Wait for more data or latch.
@@ -1222,6 +1290,14 @@ reread_subscription(void)
 {
    MemoryContext   oldctx;
    Subscription   *newsub;
+   bool            started_tx = false;
+
+   /* This function might be called inside or outside of transaction. */
+   if (!IsTransactionState())
+   {
+       StartTransactionCommand();
+       started_tx = true;
+   }
 
    /* Ensure allocations in permanent context. */
    oldctx = MemoryContextSwitchTo(ApplyCacheContext);
@@ -1319,6 +1395,9 @@ reread_subscription(void)
 
    MemoryContextSwitchTo(oldctx);
 
+   if (started_tx)
+       CommitTransactionCommand();
+
    MySubscriptionValid = true;
 }
 
@@ -1339,11 +1418,8 @@ ApplyWorkerMain(Datum main_arg)
    int             worker_slot = DatumGetObjectId(main_arg);
    MemoryContext   oldctx;
    char            originname[NAMEDATALEN];
-   RepOriginId     originid;
    XLogRecPtr      origin_startpos;
-   char           *err;
-   int             server_version;
-   TimeLineID      startpointTLI;
+   char           *myslotname;
    WalRcvStreamOptions options;
 
    /* Attach to slot */
@@ -1402,49 +1478,90 @@ ApplyWorkerMain(Datum main_arg)
                                  subscription_change_cb,
                                  (Datum) 0);
 
-   ereport(LOG,
-           (errmsg("logical replication apply for subscription \"%s\" has started",
-                   MySubscription->name)));
-
-   /* Setup replication origin tracking. */
-   snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid);
-   originid = replorigin_by_name(originname, true);
-   if (!OidIsValid(originid))
-       originid = replorigin_create(originname);
-   replorigin_session_setup(originid);
-   replorigin_session_origin = originid;
-   origin_startpos = replorigin_session_get_progress(false);
+   if (am_tablesync_worker())
+       elog(LOG, "logical replication sync for subscription %s, table %s started",
+            MySubscription->name, get_rel_name(MyLogicalRepWorker->relid));
+   else
+       elog(LOG, "logical replication apply for subscription %s started",
+            MySubscription->name);
 
    CommitTransactionCommand();
 
    /* Connect to the origin and start the replication. */
    elog(DEBUG1, "connecting to publisher using connection string \"%s\"",
         MySubscription->conninfo);
-   wrconn = walrcv_connect(MySubscription->conninfo, true,
-                               MySubscription->name, &err);
-   if (wrconn == NULL)
-       ereport(ERROR,
-               (errmsg("could not connect to the publisher: %s", err)));
+
+   if (am_tablesync_worker())
+   {
+       char *syncslotname;
+
+       /* This is table synchronization worker, call initial sync. */
+       syncslotname = LogicalRepSyncTableStart(&origin_startpos);
+
+       /* The slot name needs to be allocated in permanent memory context. */
+       oldctx = MemoryContextSwitchTo(ApplyCacheContext);
+       myslotname = pstrdup(syncslotname);
+       MemoryContextSwitchTo(oldctx);
+
+       pfree(syncslotname);
+   }
+   else
+   {
+       /* This is main apply worker */
+       RepOriginId     originid;
+       TimeLineID      startpointTLI;
+       char           *err;
+       int             server_version;
+
+       myslotname = MySubscription->slotname;
+
+       /* Setup replication origin tracking. */
+       StartTransactionCommand();
+       snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid);
+       originid = replorigin_by_name(originname, true);
+       if (!OidIsValid(originid))
+           originid = replorigin_create(originname);
+       replorigin_session_setup(originid);
+       replorigin_session_origin = originid;
+       origin_startpos = replorigin_session_get_progress(false);
+       CommitTransactionCommand();
+
+       wrconn = walrcv_connect(MySubscription->conninfo, true, myslotname,
+                               &err);
+       if (wrconn == NULL)
+           ereport(ERROR,
+                   (errmsg("could not connect to the publisher: %s", err)));
+
+       /*
+        * We don't really use the output identify_system for anything
+        * but it does some initializations on the upstream so let's still
+        * call it.
+        */
+       (void) walrcv_identify_system(wrconn, &startpointTLI,
+                                     &server_version);
+
+   }
 
    /*
-    * We don't really use the output identify_system for anything
-    * but it does some initializations on the upstream so let's still
-    * call it.
+    * Setup callback for syscache so that we know when something
+    * changes in the subscription relation state.
     */
-   (void) walrcv_identify_system(wrconn, &startpointTLI, &server_version);
+   CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP,
+                                 invalidate_syncing_table_states,
+                                 (Datum) 0);
 
    /* Build logical replication streaming options. */
    options.logical = true;
    options.startpoint = origin_startpos;
-   options.slotname = MySubscription->slotname;
+   options.slotname = myslotname;
    options.proto.logical.proto_version = LOGICALREP_PROTO_VERSION_NUM;
    options.proto.logical.publication_names = MySubscription->publications;
 
-   /* Start streaming from the slot. */
+   /* Start normal logical streaming replication. */
    walrcv_startstreaming(wrconn, &options);
 
    /* Run the main loop. */
-   ApplyLoop();
+   LogicalRepApplyLoop(origin_startpos);
 
    walrcv_disconnect(wrconn);
 
index f1e43bc9f3d82bdac1ccb65b814a93f4d4bdea39..ec047c827cf0b261dc913b2213273730204e97ac 100644 (file)
@@ -25,6 +25,8 @@
 /* Result of the parsing is returned here */
 Node *replication_parse_result;
 
+static SQLCmd *make_sqlcmd(void);
+
 
 /*
  * Bison doesn't allocate anything that needs to live across parser calls,
@@ -57,6 +59,7 @@ Node *replication_parse_result;
 %token <str> SCONST IDENT
 %token <uintval> UCONST
 %token <recptr> RECPTR
+%token T_WORD
 
 /* Keyword tokens. */
 %token K_BASE_BACKUP
@@ -81,11 +84,12 @@ Node *replication_parse_result;
 %token K_TEMPORARY
 %token K_EXPORT_SNAPSHOT
 %token K_NOEXPORT_SNAPSHOT
+%token K_USE_SNAPSHOT
 
 %type <node>   command
 %type <node>   base_backup start_replication start_logical_replication
                create_replication_slot drop_replication_slot identify_system
-               timeline_history show
+               timeline_history show sql_cmd
 %type <list>   base_backup_opt_list
 %type <defelt> base_backup_opt
 %type <uintval>    opt_timeline
@@ -118,6 +122,7 @@ command:
            | drop_replication_slot
            | timeline_history
            | show
+           | sql_cmd
            ;
 
 /*
@@ -248,6 +253,11 @@ create_slot_opt:
                  $$ = makeDefElem("export_snapshot",
                                   (Node *)makeInteger(FALSE), -1);
                }
+           | K_USE_SNAPSHOT
+               {
+                 $$ = makeDefElem("use_snapshot",
+                                  (Node *)makeInteger(TRUE), -1);
+               }
            | K_RESERVE_WAL
                {
                  $$ = makeDefElem("reserve_wal",
@@ -373,6 +383,26 @@ plugin_opt_arg:
            SCONST                          { $$ = (Node *) makeString($1); }
            | /* EMPTY */                   { $$ = NULL; }
        ;
+
+sql_cmd:
+           IDENT                           { $$ = (Node *) make_sqlcmd(); }
+       ;
 %%
 
+static SQLCmd *
+make_sqlcmd(void)
+{
+   SQLCmd *cmd = makeNode(SQLCmd);
+   int tok;
+
+   /* Just move lexer to the end of command. */
+   for (;;)
+   {
+       tok = yylex();
+       if (tok == ';' || tok == 0)
+           break;
+   }
+   return cmd;
+}
+
 #include "repl_scanner.c"
index f56d41d59c75bde235bc5943797c26c6fa690672..52ae7b343fbfaad6e592aafbd42e4a9e1d8c374e 100644 (file)
@@ -102,6 +102,7 @@ SLOT                { return K_SLOT; }
 TEMPORARY          { return K_TEMPORARY; }
 EXPORT_SNAPSHOT        { return K_EXPORT_SNAPSHOT; }
 NOEXPORT_SNAPSHOT  { return K_NOEXPORT_SNAPSHOT; }
+USE_SNAPSHOT       { return K_USE_SNAPSHOT; }
 
 ","                { return ','; }
 ";"                { return ';'; }
@@ -180,9 +181,7 @@ NOEXPORT_SNAPSHOT   { return K_NOEXPORT_SNAPSHOT; }
                }
 
 .              {
-                   ereport(ERROR,
-                           (errcode(ERRCODE_SYNTAX_ERROR),
-                            errmsg("syntax error: unexpected character \"%s\"", yytext)));
+                   return T_WORD;
                }
 %%
 
index 75617709ecfbfa58cd7a49db1bace68431c25466..c6ba916c49b51382c25f1d29425339aee6dbdbb4 100644 (file)
@@ -753,7 +753,7 @@ logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int req
 static void
 parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd,
                           bool *reserve_wal,
-                          bool *export_snapshot)
+                          CRSSnapshotAction *snapshot_action)
 {
    ListCell   *lc;
    bool        snapshot_action_given = false;
@@ -772,7 +772,18 @@ parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd,
                         errmsg("conflicting or redundant options")));
 
            snapshot_action_given = true;
-           *export_snapshot = defGetBoolean(defel);
+           *snapshot_action = defGetBoolean(defel) ? CRS_EXPORT_SNAPSHOT :
+               CRS_NOEXPORT_SNAPSHOT;
+       }
+       else if (strcmp(defel->defname, "use_snapshot") == 0)
+       {
+           if (snapshot_action_given || cmd->kind != REPLICATION_KIND_LOGICAL)
+               ereport(ERROR,
+                       (errcode(ERRCODE_SYNTAX_ERROR),
+                        errmsg("conflicting or redundant options")));
+
+           snapshot_action_given = true;
+           *snapshot_action = CRS_USE_SNAPSHOT;
        }
        else if (strcmp(defel->defname, "reserve_wal") == 0)
        {
@@ -799,7 +810,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
    char        xpos[MAXFNAMELEN];
    char       *slot_name;
    bool        reserve_wal = false;
-   bool        export_snapshot = true;
+   CRSSnapshotAction snapshot_action = CRS_EXPORT_SNAPSHOT;
    DestReceiver *dest;
    TupOutputState *tstate;
    TupleDesc   tupdesc;
@@ -808,7 +819,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
 
    Assert(!MyReplicationSlot);
 
-   parseCreateReplSlotOptions(cmd, &reserve_wal, &export_snapshot);
+   parseCreateReplSlotOptions(cmd, &reserve_wal, &snapshot_action);
 
    /* setup state for XLogReadPage */
    sendTimeLineIsHistoric = false;
@@ -838,6 +849,40 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
    {
        LogicalDecodingContext *ctx;
 
+       /*
+        * Do the options check early so that we can bail out before calling
+        * DecodingContextFindStartpoint(), which can take a long time.
+        */
+       if (snapshot_action == CRS_EXPORT_SNAPSHOT)
+       {
+           if (IsTransactionBlock())
+               ereport(ERROR,
+                       (errmsg("CREATE_REPLICATION_SLOT ... EXPORT_SNAPSHOT "
+                               "must not be called inside a transaction")));
+       }
+       else if (snapshot_action == CRS_USE_SNAPSHOT)
+       {
+           if (!IsTransactionBlock())
+               ereport(ERROR,
+                       (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT "
+                               "must be called inside a transaction")));
+
+           if (XactIsoLevel != XACT_REPEATABLE_READ)
+               ereport(ERROR,
+                       (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT "
+                               "must be called in REPEATABLE READ isolation mode transaction")));
+
+           if (FirstSnapshotSet)
+               ereport(ERROR,
+                       (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT "
+                               "must be called before any query")));
+
+           if (IsSubTransaction())
+               ereport(ERROR,
+                       (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT "
+                               "must not be called in a subtransaction")));
+       }
+
        ctx = CreateInitDecodingContext(cmd->plugin, NIL,
                                        logical_read_xlog_page,
                                        WalSndPrepareWrite, WalSndWriteData);
@@ -855,13 +900,22 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
        DecodingContextFindStartpoint(ctx);
 
        /*
-        * Export the snapshot if we've been asked to do so.
+        * Export or use the snapshot if we've been asked to do so.
         *
         * NB. We will convert the snapbuild.c kind of snapshot to normal
         * snapshot when doing this.
         */
-       if (export_snapshot)
+       if (snapshot_action == CRS_EXPORT_SNAPSHOT)
+       {
            snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder);
+       }
+       else if (snapshot_action == CRS_USE_SNAPSHOT)
+       {
+           Snapshot    snap;
+
+           snap = SnapBuildInitalSnapshot(ctx->snapshot_builder);
+           RestoreTransactionSnapshot(snap, MyProc);
+       }
 
        /* don't need the decoding context anymore */
        FreeDecodingContext(ctx);
@@ -1277,8 +1331,11 @@ WalSndWaitForWal(XLogRecPtr loc)
 
 /*
  * Execute an incoming replication command.
+ *
+ * Returns true if the cmd_string was recognized as WalSender command, false
+ * if not.
  */
-void
+bool
 exec_replication_command(const char *cmd_string)
 {
    int         parse_rc;
@@ -1317,6 +1374,25 @@ exec_replication_command(const char *cmd_string)
 
    cmd_node = replication_parse_result;
 
+   /*
+    * CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot. If it was
+    * called outside of transaction the snapshot should be cleared here.
+    */
+   if (!IsTransactionBlock())
+       SnapBuildClearExportedSnapshot();
+
+   /*
+    * For aborted transactions, don't allow anything except pure SQL;
+    * exec_simple_query() will handle it correctly.
+    */
+   if (IsAbortedTransactionBlockState() && !IsA(cmd_node, SQLCmd))
+       ereport(ERROR,
+               (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
+                errmsg("current transaction is aborted, "
+                       "commands ignored until end of transaction block")));
+
+   CHECK_FOR_INTERRUPTS();
+
    /*
     * Allocate buffers that will be used for each outgoing and incoming
     * message.  We do this just once per command to reduce palloc overhead.
@@ -1332,6 +1408,7 @@ exec_replication_command(const char *cmd_string)
            break;
 
        case T_BaseBackupCmd:
+           PreventTransactionChain(true, "BASE_BACKUP");
            SendBaseBackup((BaseBackupCmd *) cmd_node);
            break;
 
@@ -1347,6 +1424,8 @@ exec_replication_command(const char *cmd_string)
            {
                StartReplicationCmd *cmd = (StartReplicationCmd *) cmd_node;
 
+               PreventTransactionChain(true, "START_REPLICATION");
+
                if (cmd->kind == REPLICATION_KIND_PHYSICAL)
                    StartReplication(cmd);
                else
@@ -1355,6 +1434,7 @@ exec_replication_command(const char *cmd_string)
            }
 
        case T_TimeLineHistoryCmd:
+           PreventTransactionChain(true, "TIMELINE_HISTORY");
            SendTimeLineHistory((TimeLineHistoryCmd *) cmd_node);
            break;
 
@@ -1367,6 +1447,14 @@ exec_replication_command(const char *cmd_string)
            }
            break;
 
+       case T_SQLCmd:
+           if (MyDatabaseId == InvalidOid)
+               ereport(ERROR,
+                       (errmsg("not connected to database")));
+
+           /* Tell the caller that this wasn't a WalSender command. */
+           return false;
+
        default:
            elog(ERROR, "unrecognized replication command node tag: %u",
                 cmd_node->type);
@@ -1378,6 +1466,8 @@ exec_replication_command(const char *cmd_string)
 
    /* Send CommandComplete message */
    EndCommand("SELECT", DestRemote);
+
+   return true;
 }
 
 /*
index b07d6c6cb9b83fe0297615ed373f36be6c229ce4..ba41f90712689313793a89939b4d19d4be7cfbca 100644 (file)
@@ -4061,7 +4061,10 @@ PostgresMain(int argc, char *argv[],
                    pq_getmsgend(&input_message);
 
                    if (am_walsender)
-                       exec_replication_command(query_string);
+                   {
+                       if (!exec_replication_command(query_string))
+                           exec_simple_query(query_string);
+                   }
                    else
                        exec_simple_query(query_string);
 
index 1ec7f32470a838e4a3d6ce99f92a6f9071781acc..7dcecb2f0f6d912826f77672b33c4d73652a4d83 100644 (file)
@@ -982,3 +982,23 @@ pg_current_logfile_1arg(PG_FUNCTION_ARGS)
 {
    return pg_current_logfile(fcinfo);
 }
+
+/*
+ * SQL wrapper around RelationGetReplicaIndex().
+ */
+Datum
+pg_get_replica_identity_index(PG_FUNCTION_ARGS)
+{
+   Oid         reloid = PG_GETARG_OID(0);
+   Oid         idxoid;
+   Relation    rel;
+
+   rel = heap_open(reloid, AccessShareLock);
+   idxoid = RelationGetReplicaIndex(rel);
+   heap_close(rel, AccessShareLock);
+
+   if (OidIsValid(idxoid))
+       PG_RETURN_OID(idxoid);
+   else
+       PG_RETURN_NULL();
+}
index b1c0b4b1be11d0d915f5e9a9342400542368bc0e..d5a376406fee37810d1817dee0cb87151146d7cf 100644 (file)
@@ -62,6 +62,7 @@
 #include "catalog/pg_replication_origin.h"
 #include "catalog/pg_statistic.h"
 #include "catalog/pg_subscription.h"
+#include "catalog/pg_subscription_rel.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_transform.h"
 #include "catalog/pg_ts_config.h"
@@ -693,7 +694,7 @@ static const struct cachedesc cacheinfo[] = {
        64
    },
    {PublicationRelRelationId,      /* PUBLICATIONRELMAP */
-       PublicationRelMapIndexId,
+       PublicationRelPrrelidPrpubidIndexId,
        2,
        {
            Anum_pg_publication_rel_prrelid,
@@ -758,6 +759,17 @@ static const struct cachedesc cacheinfo[] = {
        },
        4
    },
+   {SubscriptionRelRelationId,     /* SUBSCRIPTIONRELMAP */
+       SubscriptionRelSrrelidSrsubidIndexId,
+       2,
+       {
+           Anum_pg_subscription_rel_srrelid,
+           Anum_pg_subscription_rel_srsubid,
+           0,
+           0
+       },
+       64
+   },
    {TableSpaceRelationId,      /* TABLESPACEOID */
        TablespaceOidIndexId,
        1,
index 4feb26aa7a0a863b6cfc69bc48c15a0d50a10dd7..291bf7631dba7731f11f424e1e7488017e07c51d 100644 (file)
@@ -2497,6 +2497,18 @@ static struct config_int ConfigureNamesInt[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"max_sync_workers_per_subscription",
+           PGC_SIGHUP,
+           RESOURCES_ASYNCHRONOUS,
+           gettext_noop("Maximum number of table synchronization workers per subscription."),
+           NULL,
+       },
+       &max_sync_workers_per_subscription,
+       2, 0, MAX_BACKENDS,
+       NULL, NULL, NULL
+   },
+
    {
        {"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE,
            gettext_noop("Automatic log file rotation will occur after N minutes."),
index 610bed531c4b0544571e02c6f9bbf94a12b79c19..98bc1a586ace3c32c6b3ed3429b55ccf66c81da7 100644 (file)
@@ -155,7 +155,7 @@ typedef struct _dumpOptions
    int         use_setsessauth;
    int         enable_row_security;
    int         include_subscriptions;
-   int         no_create_subscription_slots;
+   int         no_subscription_connect;
 
    /* default, if no "inclusion" switches appear, is to dump everything */
    bool        include_everything;
index 2b5a52656c96ea987030fcdbbe15caf4beb86ca5..a98747d89a461952ee3780620d4e5d6e046ada51 100644 (file)
@@ -351,8 +351,8 @@ main(int argc, char **argv)
        {"snapshot", required_argument, NULL, 6},
        {"strict-names", no_argument, &strict_names, 1},
        {"use-set-session-authorization", no_argument, &dopt.use_setsessauth, 1},
-       {"no-create-subscription-slots", no_argument, &dopt.no_create_subscription_slots, 1},
        {"no-security-labels", no_argument, &dopt.no_security_labels, 1},
+       {"no-subscription-connect", no_argument, &dopt.no_subscription_connect, 1},
        {"no-synchronized-snapshots", no_argument, &dopt.no_synchronized_snapshots, 1},
        {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1},
        {"no-sync", no_argument, NULL, 7},
@@ -951,9 +951,8 @@ help(const char *progname)
    printf(_("  --if-exists                  use IF EXISTS when dropping objects\n"));
    printf(_("  --include-subscriptions      dump logical replication subscriptions\n"));
    printf(_("  --inserts                    dump data as INSERT commands, rather than COPY\n"));
-   printf(_("  --no-create-subscription-slots\n"
-            "                               do not create replication slots for subscriptions\n"));
    printf(_("  --no-security-labels         do not dump security label assignments\n"));
+   printf(_("  --no-subscription-connect    dump subscriptions so they don't connect on restore\n"));
    printf(_("  --no-synchronized-snapshots  do not use synchronized snapshots in parallel jobs\n"));
    printf(_("  --no-tablespaces             do not dump tablespace assignments\n"));
    printf(_("  --no-unlogged-table-data     do not dump unlogged table data\n"));
@@ -3774,8 +3773,8 @@ dumpSubscription(Archive *fout, SubscriptionInfo *subinfo)
    appendPQExpBufferStr(query, ", SLOT NAME = ");
    appendStringLiteralAH(query, subinfo->subslotname, fout);
 
-   if (dopt->no_create_subscription_slots)
-       appendPQExpBufferStr(query, ", NOCREATE SLOT");
+   if (dopt->no_subscription_connect)
+       appendPQExpBufferStr(query, ", NOCONNECT");
 
    appendPQExpBufferStr(query, ");\n");
 
index a46dcdbcd7d94566f0cfe2bb124404df365f7989..021f4bf081a7a43a51824a532783ea4d3d4f3683 100644 (file)
@@ -4224,7 +4224,7 @@ qr/CREATE TRANSFORM FOR integer LANGUAGE sql \(FROM SQL WITH FUNCTION pg_catalog
        create_order => 50,
        create_sql   => 'CREATE SUBSCRIPTION sub1
                         CONNECTION \'dbname=doesnotexist\' PUBLICATION pub1
-                        WITH (DISABLED, NOCREATE SLOT);',
+                        WITH (DISABLED, NOCONNECT);',
        regexp       => qr/^
            \QCREATE SUBSCRIPTION sub1 CONNECTION 'dbname=doesnotexist' PUBLICATION pub1 WITH (DISABLED, SLOT NAME = 'sub1');\E
            /xm,
index 315f155b6453bb928c4394e4ac2214189d352685..d8679f5f591f00f3b374e8977cbfcb37de082c6c 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 201703221
+#define CATALOG_VERSION_NO 201703231
 
 #endif
index 6bce7328a289f1f6b6d949186dde616912ee540f..5d4190c05eba824dde8ee3a06b42612d4374951e 100644 (file)
@@ -340,8 +340,8 @@ DECLARE_UNIQUE_INDEX(pg_publication_pubname_index, 6111, on pg_publication using
 DECLARE_UNIQUE_INDEX(pg_publication_rel_oid_index, 6112, on pg_publication_rel using btree(oid oid_ops));
 #define PublicationRelObjectIndexId 6112
 
-DECLARE_UNIQUE_INDEX(pg_publication_rel_map_index, 6113, on pg_publication_rel using btree(prrelid oid_ops, prpubid oid_ops));
-#define PublicationRelMapIndexId 6113
+DECLARE_UNIQUE_INDEX(pg_publication_rel_prrelid_prpubid_index, 6113, on pg_publication_rel using btree(prrelid oid_ops, prpubid oid_ops));
+#define PublicationRelPrrelidPrpubidIndexId 6113
 
 DECLARE_UNIQUE_INDEX(pg_subscription_oid_index, 6114, on pg_subscription using btree(oid oid_ops));
 #define SubscriptionObjectIndexId 6114
@@ -349,6 +349,9 @@ DECLARE_UNIQUE_INDEX(pg_subscription_oid_index, 6114, on pg_subscription using b
 DECLARE_UNIQUE_INDEX(pg_subscription_subname_index, 6115, on pg_subscription using btree(subdbid oid_ops, subname name_ops));
 #define SubscriptionNameIndexId 6115
 
+DECLARE_UNIQUE_INDEX(pg_subscription_rel_srrelid_srsubid_index, 6117, on pg_subscription_rel using btree(srrelid oid_ops, srsubid oid_ops));
+#define SubscriptionRelSrrelidSrsubidIndexId 6117
+
 /* last step of initialization script: build the indexes declared above */
 BUILD_INDICES
 
index 22635655f5670a30bd762e70095baa4f062fc7c9..78c23e3f5d54ec539f63a6ad1bf1ca33f9270864 100644 (file)
@@ -2021,6 +2021,9 @@ DESCR("is a relation insertable/updatable/deletable");
 DATA(insert OID = 3843 (  pg_column_is_updatable   PGNSP PGUID 12 10 0 0 0 f f f f t f s s 3 0 16 "2205 21 16" _null_ _null_ _null_ _null_ _null_ pg_column_is_updatable _null_ _null_ _null_ ));
 DESCR("is a column updatable");
 
+DATA(insert OID = 6120 (  pg_get_replica_identity_index    PGNSP PGUID 12 10 0 0 0 f f f f t f s s 1 0 2205 "2205" _null_ _null_ _null_ _null_ _null_ pg_get_replica_identity_index _null_ _null_ _null_ ));
+DESCR("oid of replica identity index if any");
+
 /* Deferrable unique constraint trigger */
 DATA(insert OID = 1250 (  unique_key_recheck   PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ unique_key_recheck _null_ _null_ _null_ ));
 DESCR("deferred UNIQUE constraint check");
@@ -2805,7 +2808,7 @@ DATA(insert OID = 3099 (  pg_stat_get_wal_senders PGNSP PGUID 12 1 10 0 0 f f f
 DESCR("statistics: information about currently active replication");
 DATA(insert OID = 3317 (  pg_stat_get_wal_receiver PGNSP PGUID 12 1 0 0 0 f f f f f f s r 0 0 2249 "" "{23,25,3220,23,3220,23,1184,1184,3220,1184,25,25}" "{o,o,o,o,o,o,o,o,o,o,o,o}" "{pid,status,receive_start_lsn,receive_start_tli,received_lsn,received_tli,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time,slot_name,conninfo}" _null_ _null_ pg_stat_get_wal_receiver _null_ _null_ _null_ ));
 DESCR("statistics: information about WAL receiver");
-DATA(insert OID = 6118 (  pg_stat_get_subscription PGNSP PGUID 12 1 0 0 0 f f f f f f s r 1 0 2249 "26" "{26,26,23,3220,1184,1184,3220,1184}" "{i,o,o,o,o,o,o,o}" "{subid,subid,pid,received_lsn,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time}" _null_ _null_ pg_stat_get_subscription _null_ _null_ _null_ ));
+DATA(insert OID = 6118 (  pg_stat_get_subscription PGNSP PGUID 12 1 0 0 0 f f f f f f s r 1 0 2249 "26" "{26,26,26,23,3220,1184,1184,3220,1184}" "{i,o,o,o,o,o,o,o,o}" "{subid,subid,relid,pid,received_lsn,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time}" _null_ _null_ pg_stat_get_subscription _null_ _null_ _null_ ));
 DESCR("statistics: information about subscription");
 DATA(insert OID = 2026 (  pg_backend_pid               PGNSP PGUID 12 1 0 0 0 f f f f t f s r 0 0 23 "" _null_ _null_ _null_ _null_ _null_ pg_backend_pid _null_ _null_ _null_ ));
 DESCR("statistics: current backend PID");
diff --git a/src/include/catalog/pg_subscription_rel.h b/src/include/catalog/pg_subscription_rel.h
new file mode 100644 (file)
index 0000000..129aa99
--- /dev/null
@@ -0,0 +1,78 @@
+/* -------------------------------------------------------------------------
+ *
+ * pg_subscription_rel.h
+ *     Local info about tables that come from the publisher of a
+ *     subscription (pg_subscription_rel).
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * -------------------------------------------------------------------------
+ */
+#ifndef PG_SUBSCRIPTION_REL_H
+#define PG_SUBSCRIPTION_REL_H
+
+#include "catalog/genbki.h"
+
+/* ----------------
+ *     pg_subscription_rel definition. cpp turns this into
+ *     typedef struct FormData_pg_subscription_rel
+ * ----------------
+ */
+#define SubscriptionRelRelationId          6102
+
+/* Workaround for genbki not knowing about XLogRecPtr */
+#define pg_lsn XLogRecPtr
+
+CATALOG(pg_subscription_rel,6102) BKI_WITHOUT_OIDS
+{
+   Oid         srsubid;        /* Oid of subscription */
+   Oid         srrelid;        /* Oid of relation */
+   char        srsubstate;     /* state of the relation in subscription */
+   pg_lsn      srsublsn;       /* remote lsn of the state change
+                                * used for synchronization coordination */
+} FormData_pg_subscription_rel;
+
+typedef FormData_pg_subscription_rel *Form_pg_subscription_rel;
+
+/* ----------------
+ *     compiler constants for pg_subscription_rel
+ * ----------------
+ */
+#define Natts_pg_subscription_rel              4
+#define Anum_pg_subscription_rel_srsubid       1
+#define Anum_pg_subscription_rel_srrelid       2
+#define Anum_pg_subscription_rel_srsubstate        3
+#define Anum_pg_subscription_rel_srsublsn      4
+
+/* ----------------
+ *     substate constants
+ * ----------------
+ */
+#define          SUBREL_STATE_INIT             'i'     /* initializing (sublsn NULL) */
+#define          SUBREL_STATE_DATASYNC         'd'     /* data is being synchronized (sublsn NULL) */
+#define          SUBREL_STATE_SYNCDONE         's'     /* synchronization finished in front of apply (sublsn set) */
+#define          SUBREL_STATE_READY            'r'     /* ready (sublsn set) */
+
+/* These are never stored in the catalog, we only use them for IPC. */
+#define          SUBREL_STATE_UNKNOWN          '\0'    /* unknown state */
+#define          SUBREL_STATE_SYNCWAIT         'w'     /* waiting for sync */
+#define          SUBREL_STATE_CATCHUP          'c'     /* catching up with apply */
+
+typedef struct SubscriptionRelState
+{
+   Oid         relid;
+   XLogRecPtr  lsn;
+   char        state;
+} SubscriptionRelState;
+
+extern Oid SetSubscriptionRelState(Oid subid, Oid relid, char state,
+                                  XLogRecPtr sublsn);
+extern char GetSubscriptionRelState(Oid subid, Oid relid,
+                                   XLogRecPtr *sublsn, bool missing_ok);
+extern void RemoveSubscriptionRel(Oid subid, Oid relid);
+
+extern List *GetSubscriptionRelations(Oid subid);
+extern List *GetSubscriptionNotReadyRelations(Oid subid);
+
+#endif   /* PG_SUBSCRIPTION_REL_H */
index d63ca0f5e9966f4be93e8cbfe2acc707fea5a390..f081f2219f2981ca5883bee88f8d4a63557279b3 100644 (file)
@@ -21,6 +21,7 @@
 
 /* CopyStateData is private in commands/copy.c */
 typedef struct CopyStateData *CopyState;
+typedef int (*copy_data_source_cb) (void *outbuf, int minread, int maxread);
 
 extern void DoCopy(ParseState *state, const CopyStmt *stmt,
       int stmt_location, int stmt_len,
@@ -28,7 +29,7 @@ extern void DoCopy(ParseState *state, const CopyStmt *stmt,
 
 extern void ProcessCopyOptions(ParseState *pstate, CopyState cstate, bool is_from, List *options);
 extern CopyState BeginCopyFrom(ParseState *pstate, Relation rel, const char *filename,
-             bool is_program, List *attnamelist, List *options);
+             bool is_program, copy_data_source_cb data_source_cb, List *attnamelist, List *options);
 extern void EndCopyFrom(CopyState cstate);
 extern bool NextCopyFrom(CopyState cstate, ExprContext *econtext,
             Datum *values, bool *nulls, Oid *tupleOid);
@@ -36,6 +37,8 @@ extern bool NextCopyFromRawFields(CopyState cstate,
                      char ***fields, int *nfields);
 extern void CopyFromErrorCallback(void *arg);
 
+extern uint64 CopyFrom(CopyState cstate);
+
 extern DestReceiver *CreateCopyDestReceiver(void);
 
 #endif   /* COPY_H */
index 2cbd6d77b8d3259b6d560d6fa376be82fdd09765..9a4221a9e7bbca1ce897af3bff0f485d2c11c52a 100644 (file)
@@ -488,6 +488,7 @@ typedef enum NodeTag
    T_DropReplicationSlotCmd,
    T_StartReplicationCmd,
    T_TimeLineHistoryCmd,
+   T_SQLCmd,
 
    /*
     * TAGS FOR RANDOM OTHER STUFF
index a15df229a494696f9663af0b5defccba25d4ce83..582e0e0ebe94a9c1d0ccf97b3342fcb478b1dbcb 100644 (file)
@@ -3319,10 +3319,23 @@ typedef struct CreateSubscriptionStmt
    List       *options;        /* List of DefElem nodes */
 } CreateSubscriptionStmt;
 
+typedef enum AlterSubscriptionType
+{
+   ALTER_SUBSCRIPTION_OPTIONS,
+   ALTER_SUBSCRIPTION_CONNECTION,
+   ALTER_SUBSCRIPTION_PUBLICATION,
+   ALTER_SUBSCRIPTION_PUBLICATION_REFRESH,
+   ALTER_SUBSCRIPTION_REFRESH,
+   ALTER_SUBSCRIPTION_ENABLED
+} AlterSubscriptionType;
+
 typedef struct AlterSubscriptionStmt
 {
    NodeTag     type;
+   AlterSubscriptionType kind; /* ALTER_SUBSCRIPTION_OPTIONS, etc */
    char       *subname;        /* Name of of the subscription */
+   char       *conninfo;       /* Connection string to publisher */
+   List       *publication;    /* One or more publication to subscribe to */
    List       *options;        /* List of DefElem nodes */
 } AlterSubscriptionStmt;
 
index 996da3c02ea01bd2f5774cadef2b87dc40956661..92ada41b6d522fd8ceee32a1acec7be4b76f7274 100644 (file)
@@ -96,4 +96,13 @@ typedef struct TimeLineHistoryCmd
    TimeLineID  timeline;
 } TimeLineHistoryCmd;
 
+/* ----------------------
+ *     SQL commands
+ * ----------------------
+ */
+typedef struct SQLCmd
+{
+   NodeTag     type;
+} SQLCmd;
+
 #endif   /* REPLNODES_H */
index 28c4dab258624f46120d7f0736b4854e83118648..6cd36c7fe3014a63ca479768239fc81c8735679b 100644 (file)
@@ -258,6 +258,7 @@ PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD)
 PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD)
 PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
 PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
+PG_KEYWORD("norefresh", NOREFRESH, UNRESERVED_KEYWORD)
 PG_KEYWORD("not", NOT, RESERVED_KEYWORD)
 PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD)
 PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD)
index f2daf32e1abaa868cd9d7cec18fdfc1f14efe8b7..a67524297119576e1670e84cd48938bdf405820d 100644 (file)
@@ -790,7 +790,9 @@ typedef enum
    WAIT_EVENT_PARALLEL_FINISH,
    WAIT_EVENT_PARALLEL_BITMAP_SCAN,
    WAIT_EVENT_SAFE_SNAPSHOT,
-   WAIT_EVENT_SYNC_REP
+   WAIT_EVENT_SYNC_REP,
+   WAIT_EVENT_LOGICAL_SYNC_DATA,
+   WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE
 } WaitEventIPC;
 
 /* ----------
index fd34964bad3d13eec06374d24bae31d68bee1707..d10dd2c90af3410d00254626969d2272f7411c1f 100644 (file)
@@ -31,9 +31,11 @@ typedef struct LogicalDecodingContext
    /* memory context this is all allocated in */
    MemoryContext context;
 
-   /* infrastructure pieces */
-   XLogReaderState *reader;
+   /* The associated replication slot */
    ReplicationSlot *slot;
+
+   /* infrastructure pieces for decoding */
+   XLogReaderState *reader;
    struct ReorderBuffer *reorder;
    struct SnapBuild *snapshot_builder;
 
@@ -75,6 +77,7 @@ typedef struct LogicalDecodingContext
    TransactionId write_xid;
 } LogicalDecodingContext;
 
+
 extern void CheckLogicalDecodingRequirements(void);
 
 extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin,
@@ -92,6 +95,12 @@ extern void DecodingContextFindStartpoint(LogicalDecodingContext *ctx);
 extern bool DecodingContextReady(LogicalDecodingContext *ctx);
 extern void FreeDecodingContext(LogicalDecodingContext *ctx);
 
+extern LogicalDecodingContext *CreateCopyDecodingContext(
+                     List *output_plugin_options,
+                     LogicalOutputPluginWriterPrepareWrite prepare_write,
+                     LogicalOutputPluginWriterWrite do_write);
+extern List *DecodingContextGetTableList(LogicalDecodingContext *ctx);
+
 extern void LogicalIncreaseXminForSlot(XLogRecPtr lsn, TransactionId xmin);
 extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn,
                                      XLogRecPtr restart_lsn);
index cfe3db10dd0b733de17a2beea0f488f2511aeea9..060946a096412cec92526264ba965963157b6b8d 100644 (file)
@@ -13,6 +13,7 @@
 #define LOGICALLAUNCHER_H
 
 extern int max_logical_replication_workers;
+extern int max_sync_workers_per_subscription;
 
 extern void ApplyLauncherRegister(void);
 extern void ApplyLauncherMain(Datum main_arg);
index 5e824ae6fc8a2920f7f072127ad088804755bad6..091a9f91e36e64ef167969251f5378fdd9db3981 100644 (file)
@@ -59,6 +59,7 @@ extern void FreeSnapshotBuilder(SnapBuild *cache);
 
 extern void SnapBuildSnapDecRefcount(Snapshot snap);
 
+extern Snapshot SnapBuildInitalSnapshot(SnapBuild *builder);
 extern const char *SnapBuildExportSnapshot(SnapBuild *snapstate);
 extern void SnapBuildClearExportedSnapshot(void);
 
index 78e577c89b170527b25ef9d5a4ef9d49ca6332ba..fb55c30fa191b8b38651ffa3e5a2be4037796d15 100644 (file)
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
 #include "fmgr.h"
+#include "replication/logicalproto.h"
+#include "replication/walsender.h"
 #include "storage/latch.h"
 #include "storage/spin.h"
 #include "pgtime.h"
+#include "utils/tuplestore.h"
 
 /* user-settable parameters */
 extern int wal_receiver_status_interval;
@@ -160,6 +163,33 @@ typedef struct
 struct WalReceiverConn;
 typedef struct WalReceiverConn WalReceiverConn;
 
+/*
+ * Status of walreceiver query execution.
+ *
+ * We only define statuses that are currently used.
+ */
+typedef enum
+{
+   WALRCV_ERROR,               /* There was an error when executing the query. */
+   WALRCV_OK_COMMAND,          /* Query executed utility or replication command. */
+   WALRCV_OK_TUPLES,           /* Query returned tuples. */
+   WALRCV_OK_COPY_IN,          /* Query started COPY FROM. */
+   WALRCV_OK_COPY_OUT,         /* Query started COPY TO. */
+   WALRCV_OK_COPY_BOTH,        /* Query started COPY BOTH replication protocol. */
+} WalRcvExecStatus;
+
+/*
+ * Return value for walrcv_exec, returns the status of the execution and
+ * tuples if any.
+ */
+typedef struct WalRcvExecResult
+{
+   WalRcvExecStatus    status;
+   char               *err;
+   Tuplestorestate    *tuplestore;
+   TupleDesc           tupledesc;
+} WalRcvExecResult;
+
 /* libpqwalreceiver hooks */
 typedef WalReceiverConn *(*walrcv_connect_fn) (const char *conninfo, bool logical,
                                               const char *appname,
@@ -183,9 +213,12 @@ typedef void (*walrcv_send_fn) (WalReceiverConn *conn, const char *buffer,
                                int nbytes);
 typedef char *(*walrcv_create_slot_fn) (WalReceiverConn *conn,
                                        const char *slotname, bool temporary,
-                                       bool export_snapshot, XLogRecPtr *lsn);
-typedef bool (*walrcv_command_fn) (WalReceiverConn *conn, const char *cmd,
-                                  char **err);
+                                       CRSSnapshotAction snapshot_action,
+                                       XLogRecPtr *lsn);
+typedef WalRcvExecResult *(*walrcv_exec_fn) (WalReceiverConn *conn,
+                                            const char *query,
+                                            const int nRetTypes,
+                                            const Oid *retTypes);
 typedef void (*walrcv_disconnect_fn) (WalReceiverConn *conn);
 
 typedef struct WalReceiverFunctionsType
@@ -200,7 +233,7 @@ typedef struct WalReceiverFunctionsType
    walrcv_receive_fn                   walrcv_receive;
    walrcv_send_fn                      walrcv_send;
    walrcv_create_slot_fn               walrcv_create_slot;
-   walrcv_command_fn                   walrcv_command;
+   walrcv_exec_fn                      walrcv_exec;
    walrcv_disconnect_fn                walrcv_disconnect;
 } WalReceiverFunctionsType;
 
@@ -224,13 +257,31 @@ extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions;
    WalReceiverFunctions->walrcv_receive(conn, buffer, wait_fd)
 #define walrcv_send(conn, buffer, nbytes) \
    WalReceiverFunctions->walrcv_send(conn, buffer, nbytes)
-#define walrcv_create_slot(conn, slotname, temporary, export_snapshot, lsn) \
-   WalReceiverFunctions->walrcv_create_slot(conn, slotname, temporary, export_snapshot, lsn)
-#define walrcv_command(conn, cmd, err) \
-   WalReceiverFunctions->walrcv_command(conn, cmd, err)
+#define walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn) \
+   WalReceiverFunctions->walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn)
+#define walrcv_exec(conn, exec, nRetTypes, retTypes) \
+   WalReceiverFunctions->walrcv_exec(conn, exec, nRetTypes, retTypes)
 #define walrcv_disconnect(conn) \
    WalReceiverFunctions->walrcv_disconnect(conn)
 
+/*
+ * Release a WalRcvExecResult together with whatever it carries: the error
+ * string, the tuplestore and the tuple descriptor, each only if set.
+ * Passing NULL is allowed and does nothing.
+ */
+static inline void
+walrcv_clear_result(WalRcvExecResult *walres)
+{
+   if (walres != NULL)
+   {
+       /* Drop the optional members first ... */
+       if (walres->err != NULL)
+           pfree(walres->err);
+
+       if (walres->tuplestore != NULL)
+           tuplestore_end(walres->tuplestore);
+
+       if (walres->tupledesc != NULL)
+           FreeTupleDesc(walres->tupledesc);
+
+       /* ... then the container itself. */
+       pfree(walres);
+   }
+}
+
 /* prototypes for functions in walreceiver.c */
 extern void WalReceiverMain(void) pg_attribute_noreturn();
 
index fe23f6619fa536668cdc5137f6ea11e8851779d6..2ca903872e4e89445501a7f734aa10d84d7da1c6 100644 (file)
 
 #include "fmgr.h"
 
+/*
+ * What to do with a snapshot in create replication slot command.
+ *
+ * Passed as the snapshot_action argument of the walreceiver create-slot
+ * hook, where it replaces the former boolean export_snapshot flag.
+ *
+ * NOTE(review): the exact effect of each value is not visible here and is
+ * inferred from the names only -- confirm against the walsender code.
+ */
+typedef enum
+{
+   CRS_EXPORT_SNAPSHOT,
+   CRS_NOEXPORT_SNAPSHOT,
+   CRS_USE_SNAPSHOT
+} CRSSnapshotAction;
+
 /* global state */
 extern bool am_walsender;
 extern bool am_cascading_walsender;
@@ -28,7 +38,7 @@ extern int    wal_sender_timeout;
 extern bool log_replication_commands;
 
 extern void InitWalSender(void);
-extern void exec_replication_command(const char *query_string);
+extern bool exec_replication_command(const char *query_string);
 extern void WalSndErrorCleanup(void);
 extern void WalSndSignals(void);
 extern Size WalSndShmemSize(void);
index 8cbf2687a9ce199188b3f95e082370da52188e38..bf96d340caacfde373904df43703c6162a5eebea 100644 (file)
@@ -33,6 +33,9 @@ typedef struct LogicalRepWorker
 
    /* Used for initial table synchronization. */
    Oid     relid;
+   char    relstate;
+   XLogRecPtr  relstate_lsn;
+   slock_t     relmutex;
 
    /* Stats. */
    XLogRecPtr  last_lsn;
@@ -42,6 +45,9 @@ typedef struct LogicalRepWorker
    TimestampTz reply_time;
 } LogicalRepWorker;
 
+/*
+ * Memory context for cached variables in apply worker.
+ *
+ * This is only a declaration; the variable itself must be defined in
+ * exactly one .c file.  Without "extern" every file including this header
+ * would get its own (tentative) definition.
+ */
+extern MemoryContext   ApplyCacheContext;
+
 /* libpqreceiver connection */
 extern struct WalReceiverConn     *wrconn;
 
@@ -53,12 +59,26 @@ extern bool in_remote_transaction;
 extern bool    got_SIGTERM;
 
 extern void logicalrep_worker_attach(int slot);
-extern LogicalRepWorker *logicalrep_worker_find(Oid subid);
-extern int logicalrep_worker_count(Oid subid);
-extern void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid);
-extern void logicalrep_worker_stop(Oid subid);
-extern void logicalrep_worker_wakeup(Oid subid);
+extern LogicalRepWorker *logicalrep_worker_find(Oid subid, Oid relid,
+                                               bool only_running);
+extern void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname,
+                                    Oid userid, Oid relid);
+extern void logicalrep_worker_stop(Oid subid, Oid relid);
+extern void logicalrep_worker_wakeup(Oid subid, Oid relid);
+extern void logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker);
+
+extern int logicalrep_sync_worker_count(Oid subid);
 
 extern void logicalrep_worker_sigterm(SIGNAL_ARGS);
+extern char *LogicalRepSyncTableStart(XLogRecPtr *origin_startpos);
+extern void process_syncing_tables(XLogRecPtr current_lsn);
+extern void invalidate_syncing_table_states(Datum arg, int cacheid,
+                                           uint32 hashvalue);
+
+/*
+ * Report whether the current worker has a valid relid set in
+ * MyLogicalRepWorker.  That field is the one "used for initial table
+ * synchronization", so a valid value marks a table sync worker.
+ */
+static inline bool
+am_tablesync_worker(void)
+{
+   Oid         syncrelid = MyLogicalRepWorker->relid;
+
+   return OidIsValid(syncrelid);
+}
 
 #endif   /* WORKER_INTERNAL_H */
index 66f60d271e2880f9f8852201a6b6ef5b0dfe1b92..b35faf81b9e42ae9dfed00f30ad7b100d56e0c48 100644 (file)
@@ -89,6 +89,7 @@ enum SysCacheIdentifier
    STATRELATTINH,
    SUBSCRIPTIONOID,
    SUBSCRIPTIONNAME,
+   SUBSCRIPTIONRELMAP,
    TABLESPACEOID,
    TRFOID,
    TRFTYPELANG,
index 90c4ba4608d687820780996f0c14123423be0313..978d9a9a0f89914154ee612eef3aa4d547794c7c 100644 (file)
@@ -37,7 +37,8 @@ CREATE TRANSFORM FOR int LANGUAGE SQL (
    FROM SQL WITH FUNCTION varchar_transform(internal),
    TO SQL WITH FUNCTION int4recv(internal));
 CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable;
-CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT);
+CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCONNECT);
+WARNING:  tables were not subscribed, you will have to run ALTER SUBSCRIPTION ... REFRESH PUBLICATION to subscribe the tables
 -- test some error cases
 SELECT pg_get_object_address('stone', '{}', '{}');
 ERROR:  unrecognized object type "stone"
index bd13ae6010a73b6e05b136674b897297526cc10e..f7c3a637b5d209f58fcc07b589a11c8ecaef3466 100644 (file)
@@ -1847,13 +1847,14 @@ pg_stat_ssl| SELECT s.pid,
 pg_stat_subscription| SELECT su.oid AS subid,
     su.subname,
     st.pid,
+    st.relid,
     st.received_lsn,
     st.last_msg_send_time,
     st.last_msg_receipt_time,
     st.latest_end_lsn,
     st.latest_end_time
    FROM (pg_subscription su
-     LEFT JOIN pg_stat_get_subscription(NULL::oid) st(subid, pid, received_lsn, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time) ON ((st.subid = su.oid)));
+     LEFT JOIN pg_stat_get_subscription(NULL::oid) st(subid, relid, pid, received_lsn, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time) ON ((st.subid = su.oid)));
 pg_stat_sys_indexes| SELECT pg_stat_all_indexes.relid,
     pg_stat_all_indexes.indexrelid,
     pg_stat_all_indexes.schemaname,
index 88b4c973a1650482623f9932d31ed58ead4ab3ef..8e3028edaa265757eda78767de8df504eb2145fe 100644 (file)
@@ -143,6 +143,7 @@ pg_shdescription|t
 pg_shseclabel|t
 pg_statistic|t
 pg_subscription|t
+pg_subscription_rel|t
 pg_tablespace|t
 pg_transform|t
 pg_trigger|t
index 3471d88ca76846b30d6f8d27c55470f0e8d7a8ff..0912bef6576462478fa72b9db71290e2bdb8b410 100644 (file)
@@ -14,7 +14,6 @@ CREATE SUBSCRIPTION testsub PUBLICATION foo;
 ERROR:  syntax error at or near "PUBLICATION"
 LINE 1: CREATE SUBSCRIPTION testsub PUBLICATION foo;
                                     ^
-set client_min_messages to error;
 -- fail - cannot do CREATE SUBSCRIPTION CREATE SLOT inside transaction block
 BEGIN;
 CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub WITH (CREATE SLOT);
@@ -23,8 +22,8 @@ COMMIT;
 CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub;
 ERROR:  invalid connection string syntax: missing "=" after "testconn" in connection info string
 
-CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (DISABLED, NOCREATE SLOT);
-reset client_min_messages;
+CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (NOCONNECT);
+WARNING:  tables were not subscribed, you will have to run ALTER SUBSCRIPTION ... REFRESH PUBLICATION to subscribe the tables
 \dRs+
                                List of subscriptions
   Name   |           Owner           | Enabled | Publication |      Conninfo       
@@ -32,38 +31,30 @@ reset client_min_messages;
  testsub | regress_subscription_user | f       | {testpub}   | dbname=doesnotexist
 (1 row)
 
-ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3;
-\dRs
-                        List of subscriptions
-  Name   |           Owner           | Enabled |     Publication     
----------+---------------------------+---------+---------------------
- testsub | regress_subscription_user | f       | {testpub2,testpub3}
-(1 row)
-
+ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3 NOREFRESH;
 ALTER SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist2';
-ALTER SUBSCRIPTION testsub SET PUBLICATION testpub, testpub1;
 \dRs+
                                    List of subscriptions
-  Name   |           Owner           | Enabled |    Publication     |       Conninfo       
----------+---------------------------+---------+--------------------+----------------------
- testsub | regress_subscription_user | f       | {testpub,testpub1} | dbname=doesnotexist2
+  Name   |           Owner           | Enabled |     Publication     |       Conninfo       
+---------+---------------------------+---------+---------------------+----------------------
+ testsub | regress_subscription_user | f       | {testpub2,testpub3} | dbname=doesnotexist2
 (1 row)
 
 BEGIN;
 ALTER SUBSCRIPTION testsub ENABLE;
 \dRs
-                       List of subscriptions
-  Name   |           Owner           | Enabled |    Publication     
----------+---------------------------+---------+--------------------
- testsub | regress_subscription_user | t       | {testpub,testpub1}
+                        List of subscriptions
+  Name   |           Owner           | Enabled |     Publication     
+---------+---------------------------+---------+---------------------
+ testsub | regress_subscription_user | t       | {testpub2,testpub3}
 (1 row)
 
 ALTER SUBSCRIPTION testsub DISABLE;
 \dRs
-                       List of subscriptions
-  Name   |           Owner           | Enabled |    Publication     
----------+---------------------------+---------+--------------------
- testsub | regress_subscription_user | f       | {testpub,testpub1}
+                        List of subscriptions
+  Name   |           Owner           | Enabled |     Publication     
+---------+---------------------------+---------+---------------------
+ testsub | regress_subscription_user | f       | {testpub2,testpub3}
 (1 row)
 
 COMMIT;
@@ -74,10 +65,10 @@ ERROR:  must be owner of subscription testsub
 RESET ROLE;
 ALTER SUBSCRIPTION testsub RENAME TO testsub_foo;
 \dRs
-                         List of subscriptions
-    Name     |           Owner           | Enabled |    Publication     
--------------+---------------------------+---------+--------------------
- testsub_foo | regress_subscription_user | f       | {testpub,testpub1}
+                          List of subscriptions
+    Name     |           Owner           | Enabled |     Publication     
+-------------+---------------------------+---------+---------------------
+ testsub_foo | regress_subscription_user | f       | {testpub2,testpub3}
 (1 row)
 
 -- rename back to keep the rest simple
index 6b85fe2949738400af865db92b7c1a5cae6a0acd..28476daff18b07c97c61ed906cd4d051cd5bfe62 100644 (file)
@@ -40,7 +40,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL (
    FROM SQL WITH FUNCTION varchar_transform(internal),
    TO SQL WITH FUNCTION int4recv(internal));
 CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable;
-CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT);
+CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCONNECT);
 
 -- test some error cases
 SELECT pg_get_object_address('stone', '{}', '{}');
index 5c05b14f9e298d26d15d59d5431c2bf3c21a6f00..c1199ee6292d02ecdfe55c8c671acfe83b728198 100644 (file)
@@ -12,24 +12,19 @@ CREATE SUBSCRIPTION testsub CONNECTION 'foo';
 -- fail - no connection
 CREATE SUBSCRIPTION testsub PUBLICATION foo;
 
-set client_min_messages to error;
 -- fail - cannot do CREATE SUBSCRIPTION CREATE SLOT inside transaction block
 BEGIN;
 CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub WITH (CREATE SLOT);
 COMMIT;
 
 CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub;
-CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (DISABLED, NOCREATE SLOT);
-reset client_min_messages;
 
-\dRs+
-
-ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3;
+CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (NOCONNECT);
 
-\dRs
+\dRs+
 
+ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3 NOREFRESH;
 ALTER SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist2';
-ALTER SUBSCRIPTION testsub SET PUBLICATION testpub, testpub1;
 
 \dRs+
 
index b81028aed143916ca7af37f9819b4048bd5255fb..d1817f57da45ded8518d9bfc9522694c24b86605 100644 (file)
@@ -3,7 +3,7 @@ use strict;
 use warnings;
 use PostgresNode;
 use TestLib;
-use Test::More tests => 11;
+use Test::More tests => 14;
 
 # Initialize publisher node
 my $node_publisher = get_new_node('publisher');
@@ -19,7 +19,7 @@ $node_subscriber->start;
 $node_publisher->safe_psql('postgres',
    "CREATE TABLE tab_notrep AS SELECT generate_series(1,10) AS a");
 $node_publisher->safe_psql('postgres',
-   "CREATE TABLE tab_ins (a int)");
+   "CREATE TABLE tab_ins AS SELECT generate_series(1,1002) AS a");
 $node_publisher->safe_psql('postgres',
    "CREATE TABLE tab_full AS SELECT generate_series(1,10) AS a");
 $node_publisher->safe_psql('postgres',
@@ -56,10 +56,20 @@ my $caughtup_query =
 $node_publisher->poll_query_until('postgres', $caughtup_query)
   or die "Timed out while waiting for subscriber to catch up";
 
+# Also wait for initial table sync to finish
+my $synced_query =
+"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');";
+$node_subscriber->poll_query_until('postgres', $synced_query)
+  or die "Timed out while waiting for subscriber to synchronize data";
+
 my $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_notrep");
 is($result, qq(0), 'check non-replicated table is empty on subscriber');
 
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_ins");
+is($result, qq(1002), 'check initial data was copied to subscriber');
+
 $node_publisher->safe_psql('postgres',
    "INSERT INTO tab_ins SELECT generate_series(1,50)");
 $node_publisher->safe_psql('postgres',
@@ -79,7 +89,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query)
 
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins");
-is($result, qq(50|1|50), 'check replicated inserts on subscriber');
+is($result, qq(1052|1|1002), 'check replicated inserts on subscriber');
 
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_rep");
@@ -109,7 +119,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query)
 
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_full");
-is($result, qq(10|1|100), 'update works with REPLICA IDENTITY FULL and duplicate tuples');
+is($result, qq(20|1|100), 'update works with REPLICA IDENTITY FULL and duplicate tuples');
 
 # check that change of connection string and/or publication list causes
 # restart of subscription workers. Not all of these are registered as tests
@@ -126,7 +136,7 @@ $node_publisher->poll_query_until('postgres',
 $oldpid = $node_publisher->safe_psql('postgres',
    "SELECT pid FROM pg_stat_replication WHERE application_name = '$appname';");
 $node_subscriber->safe_psql('postgres',
-   "ALTER SUBSCRIPTION tap_sub SET PUBLICATION tap_pub_ins_only");
+   "ALTER SUBSCRIPTION tap_sub SET PUBLICATION tap_pub_ins_only REFRESH WITH (NOCOPY DATA)");
 $node_publisher->poll_query_until('postgres',
    "SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = '$appname';")
   or die "Timed out while waiting for apply to restart";
@@ -141,7 +151,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query)
 
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins");
-is($result, qq(150|1|1100), 'check replicated inserts after subscription publication change');
+is($result, qq(1152|1|1100), 'check replicated inserts after subscription publication change');
 
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_rep");
@@ -154,6 +164,8 @@ $node_publisher->safe_psql('postgres',
    "ALTER PUBLICATION tap_pub_ins_only ADD TABLE tab_full");
 $node_publisher->safe_psql('postgres',
    "DELETE FROM tab_ins WHERE a > 0");
+$node_subscriber->safe_psql('postgres',
+   "ALTER SUBSCRIPTION tap_sub REFRESH PUBLICATION WITH (NOCOPY DATA)");
 $node_publisher->safe_psql('postgres',
    "INSERT INTO tab_full VALUES(0)");
 
@@ -163,11 +175,11 @@ $node_publisher->poll_query_until('postgres', $caughtup_query)
 # note that data are different on provider and subscriber
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins");
-is($result, qq(50|1|50), 'check replicated deletes after alter publication');
+is($result, qq(1052|1|1002), 'check replicated deletes after alter publication');
 
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_full");
-is($result, qq(11|0|100), 'check replicated insert after alter publication');
+is($result, qq(21|0|100), 'check replicated insert after alter publication');
 
 # check restart on rename
 $oldpid = $node_publisher->safe_psql('postgres',
@@ -189,6 +201,14 @@ $result =
   $node_publisher->safe_psql('postgres', "SELECT count(*) FROM pg_replication_slots");
 is($result, qq(0), 'check replication slot was dropped on publisher');
 
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_subscription_rel");
+is($result, qq(0), 'check subscription relation status was dropped on subscriber');
+
+$result =
+  $node_publisher->safe_psql('postgres', "SELECT count(*) FROM pg_replication_slots");
+is($result, qq(0), 'check replication slot was dropped on publisher');
+
 $result =
   $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_replication_origin");
 is($result, qq(0), 'check replication origin was dropped on subscriber');
index f44e1e671d2c7bfb791e1db062f8ad84be5cfc7d..ad15e85c0ca8083ea88245a577673c5cde4b7217 100644 (file)
@@ -111,6 +111,12 @@ my $caughtup_query =
 $node_publisher->poll_query_until('postgres', $caughtup_query)
   or die "Timed out while waiting for subscriber to catch up";
 
+# Wait for initial sync to finish as well
+my $synced_query =
+"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('s', 'r');";
+$node_subscriber->poll_query_until('postgres', $synced_query)
+  or die "Timed out while waiting for subscriber to synchronize data";
+
 # Insert initial test data
 $node_publisher->safe_psql('postgres', qq(
    -- test_tbl_one_array_col
index b785132f5b2a10ef56be8970d871ac8047dbac3f..11b82541551bed4cc551c211121df90c49dbf0db 100644 (file)
@@ -34,7 +34,7 @@ $node_publisher->safe_psql('postgres',
 
 my $appname = 'tap_sub';
 $node_subscriber->safe_psql('postgres',
-   "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub;");
+   "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (NOCOPY DATA)");
 
 # Wait for subscriber to finish initialization
 my $caughtup_query =
diff --git a/src/test/subscription/t/004_sync.pl b/src/test/subscription/t/004_sync.pl
new file mode 100644 (file)
index 0000000..87541a0
--- /dev/null
@@ -0,0 +1,159 @@
+# Tests for logical replication table syncing
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 7;
+
+# Initialize publisher node
+my $node_publisher = get_new_node('publisher');
+$node_publisher->init(allows_streaming => 'logical');
+$node_publisher->start;
+
+# Create subscriber node
+my $node_subscriber = get_new_node('subscriber');
+$node_subscriber->init(allows_streaming => 'logical');
+$node_subscriber->start;
+
+# Create some preexisting content on publisher
+$node_publisher->safe_psql('postgres',
+   "CREATE TABLE tab_rep (a int primary key)");
+$node_publisher->safe_psql('postgres',
+   "INSERT INTO tab_rep SELECT generate_series(1,10)");
+
+# Setup structure on subscriber
+$node_subscriber->safe_psql('postgres',
+   "CREATE TABLE tab_rep (a int primary key)");
+
+# Setup logical replication
+my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres';
+$node_publisher->safe_psql('postgres',
+   "CREATE PUBLICATION tap_pub FOR ALL TABLES");
+
+my $appname = 'tap_sub';
+$node_subscriber->safe_psql('postgres',
+   "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub");
+
+# Wait for subscriber to finish initialization
+my $caughtup_query =
+"SELECT pg_current_wal_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$appname';";
+$node_publisher->poll_query_until('postgres', $caughtup_query)
+  or die "Timed out while waiting for subscriber to catch up";
+
+# Also wait for initial table sync to finish
+my $synced_query =
+"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');";
+$node_subscriber->poll_query_until('postgres', $synced_query)
+  or die "Timed out while waiting for subscriber to synchronize data";
+
+my $result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep");
+is($result, qq(10), 'initial data synced for first sub');
+
+# drop subscription so that there is unreplicated data
+$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub");
+
+$node_publisher->safe_psql('postgres',
+   "INSERT INTO tab_rep SELECT generate_series(11,20)");
+
+# recreate the subscription, it will try to do initial copy
+$node_subscriber->safe_psql('postgres',
+   "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub");
+
+# but it will be stuck on data copy as it will fail on constraint
+my $started_query =
+"SELECT srsubstate = 'd' FROM pg_subscription_rel;";
+$node_subscriber->poll_query_until('postgres', $started_query)
+  or die "Timed out while waiting for subscriber to start sync";
+
+# remove the conflicting data
+$node_subscriber->safe_psql('postgres',
+   "DELETE FROM tab_rep;");
+
+# wait for sync to finish this time
+$node_subscriber->poll_query_until('postgres', $synced_query)
+  or die "Timed out while waiting for subscriber to synchronize data";
+
+# check that all data is synced
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep");
+is($result, qq(20), 'initial data synced for second sub');
+
+# now check another subscription for the same node pair
+$node_subscriber->safe_psql('postgres',
+   "CREATE SUBSCRIPTION tap_sub2 CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (NOCOPY DATA)");
+
+# wait for it to start
+$node_subscriber->poll_query_until('postgres', "SELECT pid IS NOT NULL FROM pg_stat_subscription WHERE subname = 'tap_sub2' AND relid IS NULL")
+  or die "Timed out while waiting for subscriber to start";
+
+# and drop both subscriptions
+$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub");
+$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub2");
+
+# check subscriptions are removed
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_subscription");
+is($result, qq(0), 'second and third sub are dropped');
+
+# remove the conflicting data
+$node_subscriber->safe_psql('postgres',
+   "DELETE FROM tab_rep;");
+
+# recreate the subscription again
+$node_subscriber->safe_psql('postgres',
+   "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub");
+
+# and wait for data sync to finish again
+$node_subscriber->poll_query_until('postgres', $synced_query)
+  or die "Timed out while waiting for subscriber to synchronize data";
+
+# check that all data is synced
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep");
+is($result, qq(20), 'initial data synced for fourth sub');
+
+# add new table on subscriber
+$node_subscriber->safe_psql('postgres',
+   "CREATE TABLE tab_rep_next (a int)");
+
+# setup structure with existing data on publisher
+$node_publisher->safe_psql('postgres',
+   "CREATE TABLE tab_rep_next (a) AS SELECT generate_series(1,10)");
+
+# Wait for subscription to catch up
+$node_publisher->poll_query_until('postgres', $caughtup_query)
+  or die "Timed out while waiting for subscriber to catch up";
+
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next");
+is($result, qq(0), 'no data for table added after subscription initialized');
+
+# ask for data sync
+$node_subscriber->safe_psql('postgres',
+   "ALTER SUBSCRIPTION tap_sub REFRESH PUBLICATION");
+
+# wait for sync to finish
+$node_subscriber->poll_query_until('postgres', $synced_query)
+  or die "Timed out while waiting for subscriber to synchronize data";
+
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next");
+is($result, qq(10), 'data for table added after subscription initialized are now synced');
+
+# Add some data
+$node_publisher->safe_psql('postgres',
+   "INSERT INTO tab_rep_next SELECT generate_series(1,10)");
+
+# Wait for subscription to catch up
+$node_publisher->poll_query_until('postgres', $caughtup_query)
+  or die "Timed out while waiting for subscriber to catch up";
+
+$result =
+  $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next");
+is($result, qq(20), 'changes for table added after subscription initialized replicated');
+
+$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub");
+
+$node_subscriber->stop('fast');
+$node_publisher->stop('fast');