Add further debug info to help debug 019_replslot_limit.pl failures.

author Andres Freund <andres@anarazel.de>

Sat, 26 Feb 2022 00:58:48 +0000 (16:58 -0800)

committer Andres Freund <andres@anarazel.de>

Sat, 26 Feb 2022 01:04:39 +0000 (17:04 -0800)
author Andres Freund <andres@anarazel.de>
Sat, 26 Feb 2022 00:58:48 +0000 (16:58 -0800)
committer Andres Freund <andres@anarazel.de>
Sat, 26 Feb 2022 01:04:39 +0000 (17:04 -0800)
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c

index 3d39fddaaef2b420b53ad8f4ddbf5d03f1c773ea..f238a392ae7eb70fddfb69670bb557a7311aa4d5 100644 (file)
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -569,6 +569,10 @@ restart:
         if (!s->in_use)
             continue;
  
+       /* unlocked read of active_pid is ok for debugging purposes */
+       elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+            i, s->active_pid);
+
         SpinLockAcquire(&s->mutex);
         if (s->active_pid == MyProcPid)
         {
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
     char        path[MAXPGPATH];
     char        tmppath[MAXPGPATH];
  
+   /* temp debugging aid to analyze 019_replslot_limit failures */
+   elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
     /*
      * If some other backend ran this code concurrently with us, we might try
      * to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
                         path, tmppath)));
     }
  
+   elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+        NameStr(slot->data.name));
+
     /*
      * The slot is definitely gone.  Lock out concurrent scans of the array
      * long enough to kill it.  It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
      * a slot while we're still cleaning up the detritus of the old one.
      */
     LWLockRelease(ReplicationSlotAllocationLock);
+
+   elog(DEBUG3, "replication slot drop: %s: done",
+        NameStr(slot->data.name));
  }
  
  /*
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c

index 7b0dea4abece84a5f7d01b538c78d484c653ef9d..8f7f1b2f7c3fa854394fa4fa7b8422d03db30c5f 100644 (file)
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
     }
     return false;
  }
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+   return num_held_lwlocks;
+}
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c

index a29fa0b3e6a2efc9d3178755c923127c895014ea..86d193c89fca70f74f612af922fbbb213b10df3d 100644 (file)
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
      * them explicitly.
      */
     LockReleaseAll(USER_LOCKMETHOD, true);
+
+   /*
+    * temp debugging aid to analyze 019_replslot_limit failures
+    *
+    * If an error were thrown outside of a transaction nothing up to now
+    * would have released lwlocks. We probably will add an
+    * LWLockReleaseAll(). But for now make it easier to understand such cases
+    * by warning if any lwlocks are held.
+    */
+#ifdef USE_ASSERT_CHECKING
+   {
+       int held_lwlocks = LWLockHeldCount();
+       if (held_lwlocks)
+           elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+                held_lwlocks);
+   }
+#endif
  }
  
  
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h

index 124977cf7e3492a2659a302fabe0af55e2c93e75..c3d5889d7b288711ced2e2254e9833a720218204 100644 (file)
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
  extern void LWLockReleaseAll(void);
  extern bool LWLockHeldByMe(LWLock *lock);
  extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int LWLockHeldCount(void);
  
  extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
  extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl

index 0c9da9bf27218746cd7957bd564fef7eda90df09..9bb71b62c06c9644bd75dbdedca831f7d64fe459 100644 (file)
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -335,7 +335,23 @@ $node_standby3->start;
  $node_primary3->wait_for_catchup($node_standby3);
  my $senderpid = $node_primary3->safe_psql('postgres',
     "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+   my ($stdout, $stderr);
+   $node_primary3->psql('postgres',
+                        "\\a\\t\nSELECT * FROM pg_stat_activity",
+                        stdout => \$stdout, stderr => \$stderr);
+   diag $stdout, $stderr;
+   $node_primary3->stop('fast');
+   $node_standby3->stop('fast');
+   die "could not determine walsender pid, can't continue";
+}
+
  my $receiverpid = $node_standby3->safe_psql('postgres',
     "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
  like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
author	Andres Freund <andres@anarazel.de>
	Sat, 26 Feb 2022 00:58:48 +0000 (16:58 -0800)
committer	Andres Freund <andres@anarazel.de>
	Sat, 26 Feb 2022 01:04:39 +0000 (17:04 -0800)
src/backend/replication/slot.c		patch \| blob \| blame \| history
src/backend/storage/lmgr/lwlock.c		patch \| blob \| blame \| history
src/backend/utils/init/postinit.c		patch \| blob \| blame \| history
src/include/storage/lwlock.h		patch \| blob \| blame \| history
src/test/recovery/t/019_replslot_limit.pl		patch \| blob \| blame \| history