Allow Hot Standby to begin from a shutdown checkpoint.
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 13 Apr 2010 14:17:46 +0000 (14:17 +0000)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 13 Apr 2010 14:17:46 +0000 (14:17 +0000)
Patch by Simon Riggs & me

src/backend/access/transam/twophase.c
src/backend/access/transam/xlog.c
src/include/access/twophase.h

index b1bf2c4f26052ade81df863a947700e459783a74..faafc7e5c18c7d717affac2f1ee1e59d9e488063 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *     $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.59 2010/02/26 02:00:34 momjian Exp $
+ *     $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.60 2010/04/13 14:17:46 heikki Exp $
  *
  * NOTES
  *     Each global transaction is associated with a global transaction
@@ -1718,6 +1718,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
    return result;
 }
 
+/*
+ * StandbyRecoverPreparedTransactions
+ *
+ * Scan the pg_twophase directory and setup all the required information to
+ * allow standby queries to treat prepared transactions as still active.
+ * This is never called at the end of recovery - we use
+ * RecoverPreparedTransactions() at that point.
+ *
+ * Currently we simply call SubTransSetParent() for any subxids of prepared
+ * transactions. If overwriteOK is true, it's OK if some XIDs have already
+ * been marked in pg_subtrans.
+ *
+ * Only directory entries whose name is exactly eight upper-case hex digits
+ * are considered.  A state file is removed with a WARNING, and otherwise
+ * ignored, if its XID has already committed or aborted, if
+ * ReadTwoPhaseFile() returns NULL for it, or if the XID stored in its
+ * header does not match the XID encoded in the file name.
+ */
+void
+StandbyRecoverPreparedTransactions(bool overwriteOK)
+{
+   DIR        *cldir;
+   struct dirent *clde;
+
+   cldir = AllocateDir(TWOPHASE_DIR);
+   while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
+   {
+       if (strlen(clde->d_name) == 8 &&
+           strspn(clde->d_name, "0123456789ABCDEF") == 8)
+       {
+           TransactionId xid;
+           char       *buf;
+           TwoPhaseFileHeader *hdr;
+           TransactionId *subxids;
+           int         i;
+
+           /* The file name is the prepared transaction's XID in hex */
+           xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
+
+           /* Already processed? */
+           if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+           {
+               ereport(WARNING,
+                       (errmsg("removing stale two-phase state file \"%s\"",
+                               clde->d_name)));
+               RemoveTwoPhaseFile(xid, true);
+               continue;
+           }
+
+           /* Read and validate file */
+           buf = ReadTwoPhaseFile(xid, true);
+           if (buf == NULL)
+           {
+               ereport(WARNING,
+                     (errmsg("removing corrupt two-phase state file \"%s\"",
+                             clde->d_name)));
+               RemoveTwoPhaseFile(xid, true);
+               continue;
+           }
+
+           /* Deconstruct header */
+           hdr = (TwoPhaseFileHeader *) buf;
+           if (!TransactionIdEquals(hdr->xid, xid))
+           {
+               ereport(WARNING,
+                     (errmsg("removing corrupt two-phase state file \"%s\"",
+                             clde->d_name)));
+               RemoveTwoPhaseFile(xid, true);
+               pfree(buf);
+               continue;
+           }
+
+           /*
+            * Examine subtransaction XIDs ... they should all follow main
+            * XID.  The subxid array starts at the first MAXALIGN'd offset
+            * after the file header.
+            */
+           subxids = (TransactionId *)
+               (buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
+           for (i = 0; i < hdr->nsubxacts; i++)
+           {
+               TransactionId subxid = subxids[i];
+
+               Assert(TransactionIdFollows(subxid, xid));
+               SubTransSetParent(xid, subxid, overwriteOK);
+           }
+
+           /*
+            * hdr and subxids both point into buf and are not used past
+            * this point, so release the file image now rather than
+            * leaking one buffer per state file on every call.
+            */
+           pfree(buf);
+       }
+   }
+   FreeDir(cldir);
+}
+
 /*
  * RecoverPreparedTransactions
  *
index 379c6f11750b26ea4feff7f777d333668b302ca9..5fd4b870bef3d9254afe4cdecf847b94229ce4c2 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.393 2010/04/12 10:40:42 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.394 2010/04/13 14:17:46 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0;
 static XLogRecPtr minRecoveryPoint;        /* local copy of
                                         * ControlFile->minRecoveryPoint */
 static bool updateMinRecoveryPoint = true;
+static bool reachedMinRecoveryPoint = false;
 
 static bool InRedo = false;
 
@@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
+static void CheckRecoveryConsistency(void);
 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 static List *readTimeLineHistory(TimeLineID targetTLI);
@@ -5591,7 +5593,6 @@ StartupXLOG(void)
    uint32      freespace;
    TransactionId oldestActiveXID;
    bool        bgwriterLaunched = false;
-   bool        backendsAllowed = false;
 
    /*
     * Read control file and check XLOG status looks valid.
@@ -5838,6 +5839,8 @@ StartupXLOG(void)
    if (InRecovery)
    {
        int         rmid;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
 
        /*
         * Update pg_control to show that we are recovering and to show the
@@ -5930,6 +5933,33 @@ StartupXLOG(void)
            StartupMultiXact();
 
            ProcArrayInitRecoveryInfo(oldestActiveXID);
+
+           /*
+            * If we're beginning at a shutdown checkpoint, we know that
+            * nothing was running on the master at this point. So fake-up
+            * an empty running-xacts record and use that here and now.
+            * Recover additional standby state for prepared transactions.
+            */
+           if (wasShutdown)
+           {
+               RunningTransactionsData running;
+
+               /*
+                * Construct a RunningTransactions snapshot representing a shut
+                * down server, with only prepared transactions still alive.
+                * We're never overflowed at this point because all subxids
+                * are listed with their parent prepared transactions.
+                */
+               running.xcnt = nxids;
+               running.subxid_overflow = false;
+               running.nextXid = checkPoint.nextXid;
+               running.oldestRunningXid = oldestActiveXID;
+               running.xids = xids;
+
+               ProcArrayApplyRecoveryInfo(&running);
+
+               StandbyRecoverPreparedTransactions(false);
+           }
        }
 
        /* Initialize resource managers */
@@ -5939,6 +5969,46 @@ StartupXLOG(void)
                RmgrTable[rmid].rm_startup();
        }
 
+       /*
+        * Initialize shared replayEndRecPtr and recoveryLastRecPtr.
+        *
+        * This is slightly confusing if we're starting from an online
+        * checkpoint; we've just read and replayed the checkpoint record,
+        * but we're going to start replay from its redo pointer, which
+        * precedes the location of the checkpoint record itself. So even
+        * though the last record we've replayed is indeed ReadRecPtr, we
+        * haven't replayed all the preceding records yet. That's OK for
+        * the current use of these variables.
+        */
+       SpinLockAcquire(&xlogctl->info_lck);
+       xlogctl->replayEndRecPtr = ReadRecPtr;
+       xlogctl->recoveryLastRecPtr = ReadRecPtr;
+       SpinLockRelease(&xlogctl->info_lck);
+
+       /*
+        * Let postmaster know we've started redo now, so that it can
+        * launch bgwriter to perform restartpoints.  We don't bother
+        * during crash recovery as restartpoints can only be performed
+        * during archive recovery.  And we'd like to keep crash recovery
+        * simple, to avoid introducing bugs that could prevent you from
+        * recovering after crash.
+        *
+        * After this point, we can no longer assume that we're the only
+        * process in addition to postmaster!  Also, fsync requests are
+        * subsequently to be handled by the bgwriter, not locally.
+        */
+       if (InArchiveRecovery && IsUnderPostmaster)
+       {
+           SetForwardFsyncRequests();
+           SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+           bgwriterLaunched = true;
+       }
+
+       /*
+        * Allow read-only connections immediately if we're consistent already.
+        */
+       CheckRecoveryConsistency();
+
        /*
         * Find the first record that logically follows the checkpoint --- it
         * might physically precede it, though.
@@ -5958,43 +6028,14 @@ StartupXLOG(void)
        {
            bool        recoveryContinue = true;
            bool        recoveryApply = true;
-           bool        reachedMinRecoveryPoint = false;
            ErrorContextCallback errcontext;
 
-           /* use volatile pointer to prevent code rearrangement */
-           volatile XLogCtlData *xlogctl = XLogCtl;
-
-           /* initialize shared replayEndRecPtr and recoveryLastRecPtr */
-           SpinLockAcquire(&xlogctl->info_lck);
-           xlogctl->replayEndRecPtr = ReadRecPtr;
-           xlogctl->recoveryLastRecPtr = ReadRecPtr;
-           SpinLockRelease(&xlogctl->info_lck);
-
            InRedo = true;
 
            ereport(LOG,
                    (errmsg("redo starts at %X/%X",
                            ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
 
-           /*
-            * Let postmaster know we've started redo now, so that it can
-            * launch bgwriter to perform restartpoints.  We don't bother
-            * during crash recovery as restartpoints can only be performed
-            * during archive recovery.  And we'd like to keep crash recovery
-            * simple, to avoid introducing bugs that could you from
-            * recovering after crash.
-            *
-            * After this point, we can no longer assume that we're the only
-            * process in addition to postmaster!  Also, fsync requests are
-            * subsequently to be handled by the bgwriter, not locally.
-            */
-           if (InArchiveRecovery && IsUnderPostmaster)
-           {
-               SetForwardFsyncRequests();
-               SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-               bgwriterLaunched = true;
-           }
-
            /*
             * main redo apply loop
             */
@@ -6024,32 +6065,8 @@ StartupXLOG(void)
                /* Handle interrupt signals of startup process */
                HandleStartupProcInterrupts();
 
-               /*
-                * Have we passed our safe starting point?
-                */
-               if (!reachedMinRecoveryPoint &&
-                   XLByteLE(minRecoveryPoint, EndRecPtr) &&
-                   XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
-               {
-                   reachedMinRecoveryPoint = true;
-                   ereport(LOG,
-                       (errmsg("consistent recovery state reached at %X/%X",
-                               EndRecPtr.xlogid, EndRecPtr.xrecoff)));
-               }
-
-               /*
-                * Have we got a valid starting snapshot that will allow
-                * queries to be run? If so, we can tell postmaster that the
-                * database is consistent now, enabling connections.
-                */
-               if (standbyState == STANDBY_SNAPSHOT_READY &&
-                   !backendsAllowed &&
-                   reachedMinRecoveryPoint &&
-                   IsUnderPostmaster)
-               {
-                   backendsAllowed = true;
-                   SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
-               }
+               /* Allow read-only connections if we're consistent now */
+               CheckRecoveryConsistency();
 
                /*
                 * Have we reached our recovery target?
@@ -6398,6 +6415,44 @@ StartupXLOG(void)
    }
 }
 
+/*
+ * CheckRecoveryConsistency
+ *
+ * Re-evaluate whether replay has reached a consistent state, and notify
+ * the postmaster once (and only once) when read-only connections can be
+ * allowed.
+ *
+ * Two independent facts are tracked:
+ *  - reachedMinRecoveryPoint (file-level flag): set, and logged, the first
+ *    time EndRecPtr is at or beyond minRecoveryPoint while the control
+ *    file's backupStartPoint is invalid.
+ *  - whether PMSIGNAL_RECOVERY_CONSISTENT has already been sent, remembered
+ *    in a function-local static so repeated calls are cheap no-ops.
+ */
+static void
+CheckRecoveryConsistency(void)
+{
+   static bool     consistentSignalSent = false;
+
+   /*
+    * Step 1: have we passed our safe starting point?  Once the flag is
+    * set it is never re-examined.
+    */
+   if (!reachedMinRecoveryPoint)
+   {
+       if (XLByteLE(minRecoveryPoint, EndRecPtr) &&
+           XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+       {
+           reachedMinRecoveryPoint = true;
+           ereport(LOG,
+                   (errmsg("consistent recovery state reached at %X/%X",
+                           EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+       }
+   }
+
+   /*
+    * Step 2: connections can be enabled only when we also have a valid
+    * starting standby snapshot and a postmaster to notify.  Bail out if
+    * any precondition is missing or the signal has already gone out.
+    */
+   if (standbyState != STANDBY_SNAPSHOT_READY)
+       return;
+   if (consistentSignalSent)
+       return;
+   if (!reachedMinRecoveryPoint)
+       return;
+   if (!IsUnderPostmaster)
+       return;
+
+   /* Tell postmaster the database is consistent now */
+   consistentSignalSent = true;
+   SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+}
+
 /*
  * Is the system still in recovery?
  *
@@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
        if (standbyState != STANDBY_DISABLED)
            CheckRequiredParameterValues(checkPoint);
 
+       /*
+        * If we see a shutdown checkpoint, we know that nothing was
+        * running on the master at this point. So fake-up an empty
+        * running-xacts record and use that here and now. Recover
+        * additional standby state for prepared transactions.
+        */
        if (standbyState >= STANDBY_INITIALIZED)
        {
+           TransactionId *xids;
+           int         nxids;
+           TransactionId oldestActiveXID;
+           RunningTransactionsData running;
+
+           oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+
            /*
-            * Remove stale transactions, if any.
+            * Construct a RunningTransactions snapshot representing a shut
+            * down server, with only prepared transactions still alive.
+            * We're never overflowed at this point because all subxids
+            * are listed with their parent prepared transactions.
             */
-           ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid);
-           StandbyReleaseOldLocks(checkPoint.nextXid);
+           running.xcnt = nxids;
+           running.subxid_overflow = false;
+           running.nextXid = checkPoint.nextXid;
+           running.oldestRunningXid = oldestActiveXID;
+           running.xids = xids;
+
+           ProcArrayApplyRecoveryInfo(&running);
+
+           StandbyRecoverPreparedTransactions(true);
        }
 
        /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
index 61b92244fb972b3cfcdd913b8ce92ebd35f28dcf..ea3c9966c734f6639077c17f0715e48502eba5e6 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.14 2010/01/02 16:58:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.15 2010/04/13 14:17:46 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
 
 extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
                            int *nxids_p);
+extern void StandbyRecoverPreparedTransactions(bool overwriteOK);
 extern void RecoverPreparedTransactions(void);
 
 extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);