* MultiXactId.
*
* "flags" is an output value; it's used to tell caller what to do on return.
- *
- * "mxid_oldest_xid_out" is an output value; it's used to track the oldest
- * extant Xid within any Multixact that will remain after freezing executes.
+ * "pagefrz" is an input/output value, used to manage page level freezing.
*
* Possible values that we can set in "flags":
* FRM_NOOP
* don't do anything -- keep existing Xmax
* FRM_INVALIDATE_XMAX
* mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
* FRM_RETURN_IS_XID
* The Xid return value is a single update Xid to set as xmax.
* FRM_MARK_COMMITTED
* Xmax can be marked as HEAP_XMAX_COMMITTED.
* FRM_RETURN_IS_MULTI
* The return value is a new MultiXactId to set as new Xmax.
* (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
*
- * "mxid_oldest_xid_out" is only set when "flags" contains either FRM_NOOP or
- * FRM_RETURN_IS_MULTI, since we only leave behind a MultiXactId for these.
- *
- * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set in "flags".
+ * Caller delegates control of page freezing to us. In practice we always
+ * force freezing of caller's page unless FRM_NOOP processing is indicated.
+ * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
+ * can never be left behind. We freely choose when and how to process each
+ * Multi, without ever violating the cutoff postconditions for freezing.
+ *
+ * It's useful to remove Multis on a proactive timeline (relative to freezing
+ * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can
+ * also be cheaper for us in the short run, since eager processing lets us
+ * avoid SLRU buffer misses of our own.
+ *
+ * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
+ * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
+ * This can usually be put off, which is often enough to avoid it altogether.
+ * Allocating new multis during VACUUM should be avoided on general principle;
+ * only VACUUM can advance relminmxid, so allocating new Multis here comes with
+ * its own special risks.
+ *
+ * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
+ * using heap_tuple_should_freeze when we haven't forced page-level freezing.
+ *
+ * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
+ * have already forced page-level freezing, since that might incur the same
+ * SLRU buffer misses that we specifically intended to avoid by freezing.
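+ *
+ * For reference, the HeapPageFreeze struct that "pagefrz" points to (defined
+ * in access/heapam.h) has roughly this shape; its fields are the ones used
+ * throughout this patch:
+ *
+ * typedef struct HeapPageFreeze
+ * {
+ * bool freeze_required;
+ * TransactionId FreezePageRelfrozenXid;
+ * MultiXactId FreezePageRelminMxid;
+ * TransactionId NoFreezePageRelfrozenXid;
+ * MultiXactId NoFreezePageRelminMxid;
+ * } HeapPageFreeze;
+ *
+ * The "Freeze" trackers are the values VACUUM uses if caller goes on to
+ * freeze the page; the "NoFreeze" trackers are used if it doesn't.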
*/
static TransactionId
FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
const struct VacuumCutoffs *cutoffs, uint16 *flags,
- TransactionId *mxid_oldest_xid_out)
+ HeapPageFreeze *pagefrz)
{
- TransactionId newxmax = InvalidTransactionId;
+ TransactionId newxmax;
MultiXactMember *members;
int nmembers;
bool need_replace;
bool has_lockers;
TransactionId update_xid;
bool update_committed;
- TransactionId temp_xid_out;
+ TransactionId FreezePageRelfrozenXid;
*flags = 0;
if (!MultiXactIdIsValid(multi) ||
HEAP_LOCKED_UPGRADED(t_infomask))
{
- /* Ensure infomask bits are appropriately set/reset */
*flags |= FRM_INVALIDATE_XMAX;
+ pagefrz->freeze_required = true;
return InvalidTransactionId;
}
else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("found multixact %u from before relminmxid %u",
multi, cutoffs->relminmxid)));
- else if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
+ else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
{
+ TransactionId update_xact;
+
/*
* This old multi cannot possibly have members still running, but
* verify just in case. If it was a locker only, it can be removed
* without any further consideration; but if it contained an update, we
* might need to preserve it.
*/
if (MultiXactIdIsRunning(multi,
HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
- errmsg_internal("multixact %u from before cutoff %u found to be still running",
- multi, cutoffs->MultiXactCutoff)));
+ errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
+ multi, cutoffs->OldestMxact)));
if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
{
*flags |= FRM_INVALIDATE_XMAX;
- newxmax = InvalidTransactionId;
+ pagefrz->freeze_required = true;
+ return InvalidTransactionId;
}
- else
- {
- /* replace multi with single XID for its updater */
- newxmax = MultiXactIdGetUpdateXid(multi, t_infomask);
-
- /* wasn't only a lock, xid needs to be valid */
- Assert(TransactionIdIsValid(newxmax));
-
- if (TransactionIdPrecedes(newxmax, cutoffs->relfrozenxid))
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg_internal("found update xid %u from before relfrozenxid %u",
- newxmax, cutoffs->relfrozenxid)));
+ /* replace multi with single XID for its updater? */
+ update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
+ if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
+ multi, update_xact,
+ cutoffs->relfrozenxid)));
+ else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
+ {
/*
- * If the new xmax xid is older than OldestXmin, it has to have
- * aborted, otherwise the tuple would have been pruned away
+ * Updater XID has to have aborted (otherwise the tuple would have
+ * been pruned away instead, since updater XID is < OldestXmin).
+ * Just remove xmax.
*/
- if (TransactionIdPrecedes(newxmax, cutoffs->OldestXmin))
- {
- if (TransactionIdDidCommit(newxmax))
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg_internal("cannot freeze committed update xid %u", newxmax)));
- *flags |= FRM_INVALIDATE_XMAX;
- newxmax = InvalidTransactionId;
- }
- else
- {
- *flags |= FRM_RETURN_IS_XID;
- }
+ if (TransactionIdDidCommit(update_xact))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
+ multi, update_xact,
+ cutoffs->OldestXmin)));
+ *flags |= FRM_INVALIDATE_XMAX;
+ pagefrz->freeze_required = true;
+ return InvalidTransactionId;
}
- /*
- * Don't push back mxid_oldest_xid_out using FRM_RETURN_IS_XID Xid, or
- * when no Xids will remain
- */
- return newxmax;
+ /* Have to keep updater XID as new xmax */
+ *flags |= FRM_RETURN_IS_XID;
+ pagefrz->freeze_required = true;
+ return update_xact;
}
/*
* Some member(s) of this Multi may be below FreezeLimit xid cutoff, so
* we need to walk the whole members array to figure out what to do, if
* anything.
*/
nmembers = GetMultiXactIdMembers(multi, &members, false,
HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
if (nmembers <= 0)
{
/* Nothing worth keeping */
*flags |= FRM_INVALIDATE_XMAX;
+ pagefrz->freeze_required = true;
return InvalidTransactionId;
}
+ /*
+ * The FRM_NOOP case is the only case where we might need to ratchet back
+ * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
+ * case where our caller might ratchet back its NoFreezePageRelfrozenXid
+ * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
+ * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
+ * trackers managed by VACUUM being ratcheted back by xmax to the degree
+ * required to make it safe to leave xmax undisturbed, independent of
+ * whether or not page freezing is triggered somewhere else.
+ *
+ * Our policy is to force freezing in every case other than FRM_NOOP,
+ * which obviates the need to maintain either set of trackers, anywhere.
+ * Every other case will reliably execute a freeze plan for xmax that
+ * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
+ * removes xmax entirely (setting it to InvalidTransactionId), rendering
+ * xmax fully frozen.
+ * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
+ * OldestXmin/OldestMxact, so later values never need to be tracked here.)
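+ *
+ * As a purely illustrative example: with OldestXmin = 1000 and
+ * FreezeLimit = 900, a multi whose member XIDs are 950 and 980 can get
+ * FRM_NOOP treatment, ratcheting FreezePageRelfrozenXid back from 1000
+ * to 950. A member XID of 890 would instead force the second pass,
+ * since leaving an XID < FreezeLimit behind is never an option.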
+ */
need_replace = false;
- temp_xid_out = *mxid_oldest_xid_out; /* init for FRM_NOOP */
+ FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
for (int i = 0; i < nmembers; i++)
{
TransactionId xid = members[i].xid;
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
{
+ /* Can't violate the FreezeLimit postcondition */
need_replace = true;
break;
}
- if (TransactionIdPrecedes(members[i].xid, temp_xid_out))
- temp_xid_out = members[i].xid;
+ if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
+ FreezePageRelfrozenXid = xid;
}
- /*
- * In the simplest case, there is no member older than FreezeLimit; we can
- * keep the existing MultiXactId as-is, avoiding a more expensive second
- * pass over the multi
- */
+ /* Can't violate the MultiXactCutoff postcondition, either */
+ if (!need_replace)
+ need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff);
+
if (!need_replace)
{
/*
- * When mxid_oldest_xid_out gets pushed back here it's likely that the
- * update Xid was the oldest member, but we don't rely on that
+ * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
+ * both together to make it safe to retain this particular multi after
+ * freezing its page
*/
*flags |= FRM_NOOP;
- *mxid_oldest_xid_out = temp_xid_out;
+ pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
+ if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
+ pagefrz->FreezePageRelminMxid = multi;
pfree(members);
return multi;
}
/*
* Do a more thorough second pass over the multi to figure out which
* member XIDs actually need to be kept. Checking the precise status of
* individual members might even show that we don't need to keep anything.
+ * That is quite possible even though the Multi must be >= OldestMxact,
+ * since our second pass only keeps member XIDs when it's truly necessary;
+ * even member XIDs >= OldestXmin often won't be kept by the second pass.
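+ * (For example, a locker XID >= OldestXmin whose transaction has since
+ * committed or aborted won't be kept, since an ended transaction no
+ * longer holds the lock.)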
*/
nnewmembers = 0;
newmembers = palloc(sizeof(MultiXactMember) * nmembers);
has_lockers = false;
update_xid = InvalidTransactionId;
update_committed = false;
- temp_xid_out = *mxid_oldest_xid_out; /* init for FRM_RETURN_IS_MULTI */
/*
* Determine whether to keep each member xid, or to ignore it instead
if (TransactionIdIsCurrentTransactionId(xid) ||
TransactionIdIsInProgress(xid))
{
+ if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
+ multi, xid,
+ cutoffs->OldestXmin)));
newmembers[nnewmembers++] = members[i];
has_lockers = true;
-
- /*
- * Cannot possibly be older than VACUUM's OldestXmin, so we
- * don't need a NewRelfrozenXid step here
- */
- Assert(TransactionIdPrecedesOrEquals(cutoffs->OldestXmin, xid));
}
continue;
* Updater XID (not locker XID). Should we keep it?
*
* Since the tuple wasn't totally removed when vacuum pruned, the
- * update Xid cannot possibly be older than OldestXmin cutoff. The
- * presence of such a tuple would cause corruption, so be paranoid and
- * check.
+ * update Xid cannot possibly be older than the OldestXmin cutoff unless
+ * the updater XID aborted. If the updater transaction is known aborted
+ * or crashed then it's okay to ignore it, otherwise not.
+ *
+ * In any case the Multi should never contain two updaters, whatever
+ * their individual commit status. Check for that first, in passing.
*/
- if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg_internal("found update xid %u from before removable cutoff %u",
- xid, cutoffs->OldestXmin)));
if (TransactionIdIsValid(update_xid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
update_xid, xid)));
/*
- * If the transaction is known aborted or crashed then it's okay to
- * ignore it, otherwise not.
- *
* As with all tuple visibility routines, it's critical to test
* TransactionIdIsInProgress before TransactionIdDidCommit, because of
* race conditions explained in detail in heapam_visibility.c.
}
/*
- * We determined that this is an Xid corresponding to an update that
- * must be retained -- add it to new members list for later. Also
- * consider pushing back mxid_oldest_xid_out.
+ * We determined that the updater must be kept -- add it to the
+ * pending new members list
*/
+ if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
+ multi, xid, cutoffs->OldestXmin)));
newmembers[nnewmembers++] = members[i];
- if (TransactionIdPrecedes(xid, temp_xid_out))
- temp_xid_out = xid;
}
pfree(members);
/*
* Determine what to do with caller's multi based on information gathered
* during our second pass
*/
if (nnewmembers == 0)
{
- /* nothing worth keeping!? Tell caller to remove the whole thing */
+ /* Nothing worth keeping */
*flags |= FRM_INVALIDATE_XMAX;
newxmax = InvalidTransactionId;
- /* Don't push back mxid_oldest_xid_out -- no Xids will remain */
}
else if (TransactionIdIsValid(update_xid) && !has_lockers)
{
if (update_committed)
*flags |= FRM_MARK_COMMITTED;
newxmax = update_xid;
- /* Don't push back mxid_oldest_xid_out using FRM_RETURN_IS_XID Xid */
}
else
{
/*
* Create a new multixact with the surviving members of the previous
- * one, to set as new Xmax in the tuple. The oldest surviving member
- * might push back mxid_oldest_xid_out.
+ * one, to set as new Xmax in the tuple
*/
newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
*flags |= FRM_RETURN_IS_MULTI;
- *mxid_oldest_xid_out = temp_xid_out;
}
pfree(newmembers);
+ pagefrz->freeze_required = true;
return newxmax;
}
/*
* heap_prepare_freeze_tuple
*
* Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
- * are older than the FreezeLimit and/or MultiXactCutoff freeze cutoffs. If so,
- * setup enough state (in the *frz output argument) to later execute and
- * WAL-log what caller needs to do for the tuple, and return true. Return
+ * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
+ * set up enough state (in the *frz output argument) to enable caller to
+ * process this tuple as part of freezing its page, and return true. Return
* false if nothing can be changed about the tuple right now.
*
* Also sets *totally_frozen to true if the tuple will be totally frozen once
* caller executes returned freeze plan (or if tuple was already totally
* frozen by an earlier VACUUM). This indicates that there are no remaining
* XIDs or MultiXactIds that will need to be processed by a future VACUUM.
*
- * VACUUM caller must assemble HeapTupleFreeze entries for every tuple that we
- * returned true for when called. A later heap_freeze_execute_prepared call
- * will execute freezing for caller's page as a whole.
+ * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
+ * tuple that we returned true for, and call heap_freeze_execute_prepared to
+ * execute freezing. Caller must initialize pagefrz fields for page as a
+ * whole before first call here for each heap page.
+ *
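+ * In rough outline, the expected calling convention looks like this (a
+ * simplified sketch of the lazy_scan_prune caller, not exact code):
+ *
+ * pagefrz.freeze_required = false;
+ * pagefrz.FreezePageRelfrozenXid = NewRelfrozenXid;
+ * pagefrz.FreezePageRelminMxid = NewRelminMxid;
+ * pagefrz.NoFreezePageRelfrozenXid = NewRelfrozenXid;
+ * pagefrz.NoFreezePageRelminMxid = NewRelminMxid;
+ * for (each tuple with storage on the page)
+ * if (heap_prepare_freeze_tuple(tuple, cutoffs, &pagefrz,
+ * &frozen[nfrozen], &totally_frozen))
+ * frozen[nfrozen++].offset = offnum;
+ * ... then freeze the page (mandatory if pagefrz.freeze_required was
+ * set), or discard the accumulated freeze plans ...
+ *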
+ * VACUUM caller decides on whether or not to freeze the page as a whole.
+ * We'll often prepare freeze plans for a page that caller just discards.
+ * However, VACUUM doesn't always get to make a choice; it must freeze when
+ * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
+ * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
+ * that VACUUM always follows that rule.
+ *
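+ * (The cutoffs used here relate to each other as follows: relfrozenxid <=
+ * FreezeLimit <= OldestXmin, and relminmxid <= MultiXactCutoff <=
+ * OldestMxact. XIDs/MXIDs >= OldestXmin/OldestMxact can never be frozen,
+ * while those < FreezeLimit/MultiXactCutoff must always be.)
+ *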
+ * We sometimes force freezing of xmax MultiXactId values long before it is
+ * strictly necessary to do so just to ensure the FreezeLimit postcondition.
+ * It's worth processing MultiXactIds proactively when it is cheap to do so,
+ * and it's convenient to make that happen by piggy-backing it on the "force
+ * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
+ * because it is expensive right now (though only when it's still possible to
+ * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
*
* It is assumed that the caller has checked the tuple with
* HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
* (else we should be removing the tuple, not freezing it).
*
- * The *relfrozenxid_out and *relminmxid_out arguments are the current target
- * relfrozenxid and relminmxid for VACUUM caller's heap rel. Any and all
- * unfrozen XIDs or MXIDs that remain in caller's rel after VACUUM finishes
- * _must_ have values >= the final relfrozenxid/relminmxid values in pg_class.
- * This includes XIDs that remain as MultiXact members from any tuple's xmax.
- * Each call here pushes back *relfrozenxid_out and/or *relminmxid_out as
- * needed to avoid unsafe final values in rel's authoritative pg_class tuple.
- *
* NB: This function has side effects: it might allocate a new MultiXactId.
* It will be set as tuple's new xmax when our *frz output is processed within
* heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
* then caller had better have an exclusive lock on it already.
*/
bool
heap_prepare_freeze_tuple(HeapTupleHeader tuple,
const struct VacuumCutoffs *cutoffs,
- HeapTupleFreeze *frz, bool *totally_frozen,
- TransactionId *relfrozenxid_out,
- MultiXactId *relminmxid_out)
+ HeapPageFreeze *pagefrz,
+ HeapTupleFreeze *frz, bool *totally_frozen)
{
bool xmin_already_frozen = false,
xmax_already_frozen = false;
/*
* Process xmin, while keeping track of whether it's already frozen, or
- * will become frozen when our freeze plan is executed by caller (could be
+ * will become frozen iff our freeze plan is executed by caller (could be
* neither).
*/
xid = HeapTupleHeaderGetXmin(tuple);
errmsg_internal("found xmin %u from before relfrozenxid %u",
xid, cutoffs->relfrozenxid)));
- freeze_xmin = TransactionIdPrecedes(xid, cutoffs->FreezeLimit);
- if (freeze_xmin)
- {
- if (!TransactionIdDidCommit(xid))
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
- xid, cutoffs->FreezeLimit)));
- }
- else
- {
- /* xmin to remain unfrozen. Could push back relfrozenxid_out. */
- if (TransactionIdPrecedes(xid, *relfrozenxid_out))
- *relfrozenxid_out = xid;
- }
+ freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
+ if (freeze_xmin && !TransactionIdDidCommit(xid))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
+ xid, cutoffs->OldestXmin)));
+
+ /* Will set freeze_xmin flags in freeze plan below */
}
/*
* For Xvac, we always freeze proactively. This allows totally_frozen
* tracking to ignore xvac.
*/
- replace_xvac = true;
+ replace_xvac = pagefrz->freeze_required = true;
+
+ /* Will set replace_xvac flags in freeze plan below */
}
- /*
- * Process xmax. To thoroughly examine the current Xmax value we need to
- * resolve a MultiXactId to its member Xids, in case some of them are
- * below the given FreezeLimit. In that case, those values might need
- * freezing, too. Also, if a multi needs freezing, we cannot simply take
- * it out --- if there's a live updater Xid, it needs to be kept.
- *
- * Make sure to keep heap_tuple_would_freeze in sync with this.
- */
+ /* Now process xmax */
xid = HeapTupleHeaderGetRawXmax(tuple);
-
if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
{
/* Raw xmax is a MultiXactId */
TransactionId newxmax;
uint16 flags;
- TransactionId mxid_oldest_xid_out = *relfrozenxid_out;
+ /*
+ * We will either remove xmax completely (in the "freeze_xmax" path),
+ * process xmax by replacing it (in the "replace_xmax" path), or
+ * perform no-op xmax processing. The only constraint is that the
+ * FreezeLimit/MultiXactCutoff postcondition must never be violated.
+ */
newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
- &flags, &mxid_oldest_xid_out);
+ &flags, pagefrz);
- if (flags & FRM_RETURN_IS_XID)
+ if (flags & FRM_NOOP)
+ {
+ /*
+ * xmax is a MultiXactId, and nothing about it changes for now.
+ * This is the only case where 'freeze_required' won't have been
+ * set for us by FreezeMultiXactId, as well as the only case where
+ * neither freeze_xmax nor replace_xmax are set (given a multi).
+ *
+ * This is a no-op, but the call to FreezeMultiXactId might have
+ * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
+ * for us (the "freeze page" variants, specifically). That'll
+ * make it safe for our caller to freeze the page later on, while
+ * leaving this particular xmax undisturbed.
+ *
+ * FreezeMultiXactId is _not_ responsible for the "no freeze"
+ * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
+ * job. A call to heap_tuple_should_freeze for this same tuple
+ * will take place below if 'freeze_required' isn't set already.
+ * (This repeats work from FreezeMultiXactId, but allows "no
+ * freeze" tracker maintenance to happen in only one place.)
+ */
+ Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff));
+ Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
+ }
+ else if (flags & FRM_RETURN_IS_XID)
{
/*
* xmax will become an updater Xid (original MultiXact's updater
* member Xid will be carried forward as a simple Xid in Xmax).
- * Might have to ratchet back relfrozenxid_out here, though never
- * relminmxid_out.
*/
Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin));
- if (TransactionIdPrecedes(newxmax, *relfrozenxid_out))
- *relfrozenxid_out = newxmax;
/*
* NB -- some of these transformations are only valid because we
/*
* xmax is an old MultiXactId that we have to replace with a new
* MultiXactId, to carry forward two or more original member XIDs.
- * Might have to ratchet back relfrozenxid_out here, though never
- * relminmxid_out.
*/
Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact));
- Assert(TransactionIdPrecedesOrEquals(mxid_oldest_xid_out,
- *relfrozenxid_out));
- *relfrozenxid_out = mxid_oldest_xid_out;
/*
* We can't use GetMultiXactIdHintBits directly on the new multi
frz->xmax = newxmax;
replace_xmax = true;
}
- else if (flags & FRM_NOOP)
- {
- /*
- * xmax is a MultiXactId, and nothing about it changes for now.
- * Might have to ratchet back relminmxid_out, relfrozenxid_out, or
- * both together.
- */
- Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
- Assert(TransactionIdPrecedesOrEquals(mxid_oldest_xid_out,
- *relfrozenxid_out));
- if (MultiXactIdPrecedes(xid, *relminmxid_out))
- *relminmxid_out = xid;
- *relfrozenxid_out = mxid_oldest_xid_out;
- }
else
{
/*
Assert(flags & FRM_INVALIDATE_XMAX);
Assert(!TransactionIdIsValid(newxmax));
- /* Will set t_infomask/t_infomask2 flags in freeze plan below */
+ /* Will set freeze_xmax flags in freeze plan below */
freeze_xmax = true;
}
+
+ /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
+ Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
}
else if (TransactionIdIsNormal(xid))
{
errmsg_internal("found xmax %u from before relfrozenxid %u",
xid, cutoffs->relfrozenxid)));
- if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
- {
- /*
- * If we freeze xmax, make absolutely sure that it's not an XID
- * that is important. (Note, a lock-only xmax can be removed
- * independent of committedness, since a committed lock holder has
- * released the lock).
- */
- if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
- TransactionIdDidCommit(xid))
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg_internal("cannot freeze committed xmax %u",
- xid)));
+ if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
freeze_xmax = true;
- /* No need for relfrozenxid_out handling, since we'll freeze xmax */
- }
- else
- {
- if (TransactionIdPrecedes(xid, *relfrozenxid_out))
- *relfrozenxid_out = xid;
- }
+
+ /*
+ * If we freeze xmax, make absolutely sure that it's not an XID that
+ * is important. (Note, a lock-only xmax can be removed independent
+ * of committedness, since a committed lock holder has released the
+ * lock).
+ */
+ if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
+ TransactionIdDidCommit(xid))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("cannot freeze committed xmax %u",
+ xid)));
}
else if (!TransactionIdIsValid(xid))
{
* failed; whereas a non-dead MOVED_IN tuple must mean the xvac
* transaction succeeded.
*/
+ Assert(pagefrz->freeze_required);
if (tuple->t_infomask & HEAP_MOVED_OFF)
frz->frzflags |= XLH_INVALID_XVAC;
else
if (replace_xmax)
{
Assert(!xmax_already_frozen && !freeze_xmax);
+ Assert(pagefrz->freeze_required);
- /* Already set t_infomask/t_infomask2 flags in freeze plan */
+ /* Already set replace_xmax flags in freeze plan earlier */
}
if (freeze_xmax)
{
/*
* Determine if this tuple is already totally frozen, or will become
- * totally frozen
+ * totally frozen (provided caller executes freeze plans for the page)
*/
*totally_frozen = ((freeze_xmin || xmin_already_frozen) &&
(freeze_xmax || xmax_already_frozen));
- /* A "totally_frozen" tuple must not leave anything behind in xmax */
- Assert(!*totally_frozen || !replace_xmax);
+ if (!pagefrz->freeze_required && !(xmin_already_frozen &&
+ xmax_already_frozen))
+ {
+ /*
+ * So far no previous tuple from the page made freezing mandatory.
+ * Does this tuple force caller to freeze the entire page?
+ */
+ pagefrz->freeze_required =
+ heap_tuple_should_freeze(tuple, cutoffs,
+ &pagefrz->NoFreezePageRelfrozenXid,
+ &pagefrz->NoFreezePageRelminMxid);
+ }
/* Tell caller if this tuple has a usable freeze plan set in *frz */
return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax;
*/
void
heap_freeze_execute_prepared(Relation rel, Buffer buffer,
- TransactionId FreezeLimit,
+ TransactionId snapshotConflictHorizon,
HeapTupleFreeze *tuples, int ntuples)
{
Page page = BufferGetPage(buffer);
Assert(ntuples > 0);
- Assert(TransactionIdIsNormal(FreezeLimit));
START_CRIT_SECTION();
int nplans;
xl_heap_freeze_page xlrec;
XLogRecPtr recptr;
- TransactionId snapshotConflictHorizon;
/* Prepare deduplicated representation for use in WAL record */
nplans = heap_xlog_freeze_plan(tuples, ntuples, plans, offsets);
- /*
- * FreezeLimit is (approximately) the first XID not frozen by VACUUM.
- * Back up caller's FreezeLimit to avoid false conflicts when
- * FreezeLimit is precisely equal to VACUUM's OldestXmin cutoff.
- */
- snapshotConflictHorizon = FreezeLimit;
- TransactionIdRetreat(snapshotConflictHorizon);
-
xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
xlrec.nplans = nplans;
bool do_freeze;
bool totally_frozen;
struct VacuumCutoffs cutoffs;
- TransactionId NewRelfrozenXid = FreezeLimit;
- MultiXactId NewRelminMxid = MultiXactCutoff;
+ HeapPageFreeze pagefrz;
cutoffs.relfrozenxid = relfrozenxid;
cutoffs.relminmxid = relminmxid;
cutoffs.FreezeLimit = FreezeLimit;
cutoffs.MultiXactCutoff = MultiXactCutoff;
+ pagefrz.freeze_required = true;
+ pagefrz.FreezePageRelfrozenXid = FreezeLimit;
+ pagefrz.FreezePageRelminMxid = MultiXactCutoff;
+ pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
+ pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
+
do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
- &frz, &totally_frozen,
- &NewRelfrozenXid, &NewRelminMxid);
+ &pagefrz, &frz, &totally_frozen);
/*
* Note that because this is not a WAL-logged operation, we don't need to
}
/*
- * heap_tuple_would_freeze
+ * heap_tuple_should_freeze
*
* Return value indicates if heap_prepare_freeze_tuple sibling function would
- * freeze any of the XID/MXID fields from the tuple, given the same cutoffs.
- * We must also deal with dead tuples here, since (xmin, xmax, xvac) fields
- * could be processed by pruning away the whole tuple instead of freezing.
- *
- * The *relfrozenxid_out and *relminmxid_out input/output arguments work just
- * like the heap_prepare_freeze_tuple arguments that they're based on. We
- * never freeze here, which makes tracking the oldest extant XID/MXID simple.
+ * (or should) force freezing of the heap page that contains caller's tuple.
+ * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
+ * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
+ *
+ * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
+ * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
+ * Our working assumption is that caller won't decide to freeze this tuple.
+ * It's up to caller to only ratchet back its own top-level trackers after the
+ * point that it fully commits to not freezing the tuple/page in question.
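+ *
+ * A sketch of expected usage (mirroring the lazy_scan_noprune caller):
+ *
+ * NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid;
+ * NoFreezePageRelminMxid = vacrel->NewRelminMxid;
+ * for (each tuple with storage on the page)
+ * heap_tuple_should_freeze(tupleheader, cutoffs,
+ * &NoFreezePageRelfrozenXid,
+ * &NoFreezePageRelminMxid);
+ * ... only once the page is definitely not being frozen ...
+ * vacrel->NewRelfrozenXid = NoFreezePageRelfrozenXid;
+ * vacrel->NewRelminMxid = NoFreezePageRelminMxid;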
*/
bool
-heap_tuple_would_freeze(HeapTupleHeader tuple,
- const struct VacuumCutoffs *cutoffs,
- TransactionId *relfrozenxid_out,
- MultiXactId *relminmxid_out)
+heap_tuple_should_freeze(HeapTupleHeader tuple,
+ const struct VacuumCutoffs *cutoffs,
+ TransactionId *NoFreezePageRelfrozenXid,
+ MultiXactId *NoFreezePageRelminMxid)
{
TransactionId xid;
MultiXactId multi;
if (TransactionIdIsNormal(xid))
{
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
- if (TransactionIdPrecedes(xid, *relfrozenxid_out))
- *relfrozenxid_out = xid;
+ if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
+ *NoFreezePageRelfrozenXid = xid;
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
freeze = true;
}
{
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
/* xmax is a non-permanent XID */
- if (TransactionIdPrecedes(xid, *relfrozenxid_out))
- *relfrozenxid_out = xid;
+ if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
+ *NoFreezePageRelfrozenXid = xid;
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
freeze = true;
}
else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
{
/* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
- if (MultiXactIdPrecedes(multi, *relminmxid_out))
- *relminmxid_out = multi;
+ if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
+ *NoFreezePageRelminMxid = multi;
/* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
freeze = true;
}
int nmembers;
Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi));
- if (MultiXactIdPrecedes(multi, *relminmxid_out))
- *relminmxid_out = multi;
+ if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
+ *NoFreezePageRelminMxid = multi;
if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
freeze = true;
{
xid = members[i].xid;
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
- if (TransactionIdPrecedes(xid, *relfrozenxid_out))
- *relfrozenxid_out = xid;
+ if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
+ *NoFreezePageRelfrozenXid = xid;
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
freeze = true;
}
if (TransactionIdIsNormal(xid))
{
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
- if (TransactionIdPrecedes(xid, *relfrozenxid_out))
- *relfrozenxid_out = xid;
- /* heap_prepare_freeze_tuple always freezes xvac */
+ if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
+ *NoFreezePageRelfrozenXid = xid;
+ /* heap_prepare_freeze_tuple forces xvac freezing */
freeze = true;
}
}
live_tuples,
recently_dead_tuples;
int nnewlpdead;
- TransactionId NewRelfrozenXid;
- MultiXactId NewRelminMxid;
+ HeapPageFreeze pagefrz;
+ int64 fpi_before = pgWalUsage.wal_fpi;
OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
retry:
/* Initialize (or reset) page-level state */
- NewRelfrozenXid = vacrel->NewRelfrozenXid;
- NewRelminMxid = vacrel->NewRelminMxid;
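+ /*
+ * Both the "freeze" and "no freeze" trackers start out as copies of
+ * VACUUM's top-level trackers; we commit to exactly one of the two
+ * sets below, once the freeze-or-not decision for this page is made.
+ */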
+ pagefrz.freeze_required = false;
+ pagefrz.FreezePageRelfrozenXid = vacrel->NewRelfrozenXid;
+ pagefrz.FreezePageRelminMxid = vacrel->NewRelminMxid;
+ pagefrz.NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid;
+ pagefrz.NoFreezePageRelminMxid = vacrel->NewRelminMxid;
tuples_deleted = 0;
tuples_frozen = 0;
lpdead_items = 0;
continue;
}
- /*
- * LP_DEAD items are processed outside of the loop.
- *
- * Note that we deliberately don't set hastup=true in the case of an
- * LP_DEAD item here, which is not how count_nondeletable_pages() does
- * it -- it only considers pages empty/truncatable when they have no
- * items at all (except LP_UNUSED items).
- *
- * Our assumption is that any LP_DEAD items we encounter here will
- * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
- * call count_nondeletable_pages(). In any case our opinion of
- * whether or not a page 'hastup' (which is how our caller sets its
- * vacrel->nonempty_pages value) is inherently race-prone. It must be
- * treated as advisory/unreliable, so we might as well be slightly
- * optimistic.
- */
if (ItemIdIsDead(itemid))
{
+ /*
+ * Deliberately don't set hastup for LP_DEAD items. We make the
+ * soft assumption that any LP_DEAD items encountered here will
+ * become LP_UNUSED later on, before count_nondeletable_pages is
+ * reached. If we don't make this assumption then rel truncation
+ * will only happen every other VACUUM, at most. Besides, VACUUM
+ * must treat hastup/nonempty_pages as provisional no matter how
+ * LP_DEAD items are handled (handled here, or handled later on).
+ *
+ * Also deliberately delay unsetting all_visible until just before
+ * we return to lazy_scan_heap caller, as explained in full below.
+ * (This is another case where it's useful to anticipate that any
+ * LP_DEAD items will become LP_UNUSED during the ongoing VACUUM.)
+ */
deadoffsets[lpdead_items++] = offnum;
- prunestate->all_visible = false;
- prunestate->has_lpdead_items = true;
continue;
}
prunestate->hastup = true; /* page makes rel truncation unsafe */
/* Tuple with storage -- consider need to freeze */
- if (heap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs,
- &frozen[tuples_frozen], &totally_frozen,
- &NewRelfrozenXid, &NewRelminMxid))
+ if (heap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs, &pagefrz,
+ &frozen[tuples_frozen], &totally_frozen))
{
/* Save prepared freeze plan for later */
frozen[tuples_frozen++].offset = offnum;
}
/*
- * If tuple is not frozen (and not about to become frozen) then caller
- * had better not go on to set this page's VM bit
+ * If any tuple isn't either totally frozen already or eligible to
+ * become totally frozen (according to its freeze plan), then the page
+ * definitely cannot be set all-frozen in the visibility map later on
*/
if (!totally_frozen)
prunestate->all_frozen = false;
}
- vacrel->offnum = InvalidOffsetNumber;
-
/*
* We have now divided every item on the page into either an LP_DEAD item
* that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
* that remains and needs to be considered for freezing now (LP_UNUSED and
* LP_REDIRECT items also remain, but are of no further interest to us).
*/
- vacrel->NewRelfrozenXid = NewRelfrozenXid;
- vacrel->NewRelminMxid = NewRelminMxid;
+ vacrel->offnum = InvalidOffsetNumber;
/*
- * Consider the need to freeze any items with tuple storage from the page
- * first (arbitrary)
+ * Freeze the page when heap_prepare_freeze_tuple indicates that at least
+ * one XID/MXID from before FreezeLimit/MultiXactCutoff is present. Also
+ * freeze when pruning generated an FPI, if doing so means that we set the
+ * page all-frozen afterwards (might not happen until final heap pass).
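+ *
+ * (fpi_before was sampled from pgWalUsage.wal_fpi before pruning; if it
+ * has changed, pruning wrote a full-page image. Freezing the page at
+ * that point has a much lower marginal WAL cost, and can spare us a
+ * second FPI when the page would otherwise get frozen later anyway.)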
*/
- if (tuples_frozen > 0)
+ if (pagefrz.freeze_required || tuples_frozen == 0 ||
+ (prunestate->all_visible && prunestate->all_frozen &&
+ fpi_before != pgWalUsage.wal_fpi))
{
- Assert(prunestate->hastup);
+ /*
+ * We're freezing the page. Our final NewRelfrozenXid doesn't need to
+ * be affected by the XIDs that are just about to be frozen anyway.
+ */
+ vacrel->NewRelfrozenXid = pagefrz.FreezePageRelfrozenXid;
+ vacrel->NewRelminMxid = pagefrz.FreezePageRelminMxid;
+
+ if (tuples_frozen == 0)
+ {
+ /*
+ * We're freezing all eligible tuples on the page, but have no
+ * freeze plans to execute. Treating the page as nominally frozen
+ * costs nothing, and lets us set the page all-frozen in the VM
+ * whenever its remaining tuples are all already fully frozen. If
+ * this case were handled via "no freeze" processing instead, then
+ * VACUUM would senselessly waste certain opportunities to set
+ * pages all-frozen (not just all-visible) at no added cost.
+ *
+ * We never increment the frozen_pages instrumentation counter
+ * here, since it only counts pages with newly frozen tuples
+ * (don't confuse that with pages newly set all-frozen in VM).
+ */
+ }
+ else
+ {
+ TransactionId snapshotConflictHorizon;
+
+ Assert(prunestate->hastup);
- vacrel->frozen_pages++;
+ vacrel->frozen_pages++;
- /* Execute all freeze plans for page as a single atomic action */
- heap_freeze_execute_prepared(vacrel->rel, buf,
- vacrel->cutoffs.FreezeLimit,
- frozen, tuples_frozen);
+ /*
+ * We can use visibility_cutoff_xid as our cutoff for conflicts
+ * when the whole page is eligible to become all-frozen in the VM
+ * once we're done with it. Otherwise we generate a conservative
+ * cutoff by stepping back from OldestXmin.
+ */
+ if (prunestate->all_visible && prunestate->all_frozen)
+ snapshotConflictHorizon = prunestate->visibility_cutoff_xid;
+ else
+ {
+ /* Avoids false conflicts when hot_standby_feedback in use */
+ snapshotConflictHorizon = vacrel->cutoffs.OldestXmin;
+ TransactionIdRetreat(snapshotConflictHorizon);
+ }
+
+ /* Execute all freeze plans for page as a single atomic action */
+ heap_freeze_execute_prepared(vacrel->rel, buf,
+ snapshotConflictHorizon,
+ frozen, tuples_frozen);
+ }
+ }
+ else
+ {
+ /*
+ * Page requires "no freeze" processing. It might be set all-visible
+ * in the visibility map, but it can never be set all-frozen.
+ */
+ vacrel->NewRelfrozenXid = pagefrz.NoFreezePageRelfrozenXid;
+ vacrel->NewRelminMxid = pagefrz.NoFreezePageRelminMxid;
+ prunestate->all_frozen = false;
+ tuples_frozen = 0; /* avoid miscounts in instrumentation */
}
/*
- * The second pass over the heap can also set visibility map bits, using
- * the same approach. This is important when the table frequently has a
- * few old LP_DEAD items on each page by the time we get to it (typically
- * because past opportunistic pruning operations freed some non-HOT
- * tuples).
- *
* VACUUM will call heap_page_is_all_visible() during the second pass over
* the heap to determine all_visible and all_frozen for the page -- this
* is a specialized version of the logic from this function. Now that
* we've finished pruning and freezing, make sure that we're in total
* agreement with heap_page_is_all_visible() using an assertion.
*/
#ifdef USE_ASSERT_CHECKING
/* Note that all_frozen value does not matter when !all_visible */
- if (prunestate->all_visible)
+ if (prunestate->all_visible && lpdead_items == 0)
{
TransactionId cutoff;
bool all_frozen;
if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
Assert(false);
- Assert(lpdead_items == 0);
- Assert(prunestate->all_frozen == all_frozen);
-
/*
* It's possible that we froze tuples and made the page's XID cutoff
* (for recovery conflict purposes) FrozenTransactionId. This is okay
VacDeadItems *dead_items = vacrel->dead_items;
ItemPointerData tmp;
- Assert(!prunestate->all_visible);
- Assert(prunestate->has_lpdead_items);
-
vacrel->lpdead_item_pages++;
+ prunestate->has_lpdead_items = true;
ItemPointerSetBlockNumber(&tmp, blkno);
Assert(dead_items->num_items <= dead_items->max_items);
pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
dead_items->num_items);
+
+ /*
+ * It was convenient to ignore LP_DEAD items in all_visible earlier on
+ * to make the choice of whether or not to freeze the page unaffected
+ * by the short-term presence of LP_DEAD items. These LP_DEAD items
+ * were effectively assumed to be LP_UNUSED items in the making. It
+ * doesn't matter which heap pass (initial pass or final pass) ends up
+ * setting the page all-frozen, as long as the ongoing VACUUM does it.
+ *
+ * Now that freezing has been finalized, unset all_visible. It needs
+ * to reflect the present state of things, as expected by our caller.
+ */
+ prunestate->all_visible = false;
}
/* Finally, add page-local counts to whole-VACUUM counts */
recently_dead_tuples,
missed_dead_tuples;
HeapTupleHeader tupleheader;
- TransactionId NewRelfrozenXid = vacrel->NewRelfrozenXid;
- MultiXactId NewRelminMxid = vacrel->NewRelminMxid;
+ TransactionId NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid;
+ MultiXactId NoFreezePageRelminMxid = vacrel->NewRelminMxid;
OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
Assert(BufferGetBlockNumber(buf) == blkno);
*hastup = true; /* page prevents rel truncation */
tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
- if (heap_tuple_would_freeze(tupleheader, &vacrel->cutoffs,
- &NewRelfrozenXid, &NewRelminMxid))
+ if (heap_tuple_should_freeze(tupleheader, &vacrel->cutoffs,
+ &NoFreezePageRelfrozenXid,
+ &NoFreezePageRelminMxid))
{
/* Tuple with XID < FreezeLimit (or MXID < MultiXactCutoff) */
if (vacrel->aggressive)
* this particular page until the next VACUUM. Remember its details now.
* (lazy_scan_prune expects a clean slate, so we have to do this last.)
*/
- vacrel->NewRelfrozenXid = NewRelfrozenXid;
- vacrel->NewRelminMxid = NewRelminMxid;
+ vacrel->NewRelfrozenXid = NoFreezePageRelfrozenXid;
+ vacrel->NewRelminMxid = NoFreezePageRelminMxid;
/* Save any LP_DEAD items found on the page in dead_items array */
if (vacrel->nindexes == 0)