WAL-log inplace update before revealing it to other sessions.

author Noah Misch <noah@leadboat.com>

Fri, 25 Oct 2024 13:51:03 +0000 (06:51 -0700)

committer Noah Misch <noah@leadboat.com>

Fri, 25 Oct 2024 13:51:03 +0000 (06:51 -0700)
author Noah Misch <noah@leadboat.com>
Fri, 25 Oct 2024 13:51:03 +0000 (06:51 -0700)
committer Noah Misch <noah@leadboat.com>
Fri, 25 Oct 2024 13:51:03 +0000 (06:51 -0700)
diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock

index 818cd7f980601926ae98293dbcfbb6dd99a14172..31c52ad28f9d6b01ecebaf3ea499c2033ddeda4c 100644 (file)
--- a/src/backend/access/heap/README.tuplock
+++ b/src/backend/access/heap/README.tuplock
@@ -203,6 +203,4 @@ Inplace updates create an exception to the rule that tuple data won't change
  under a reader holding a pin.  A reader of a heap_fetch() result tuple may
  witness a torn read.  Current inplace-updated fields are aligned and are no
  wider than four bytes, and current readers don't need consistency across
-fields.  Hence, they get by with just fetching each field once.  XXX such a
-caller may also read a value that has not reached WAL; see
-systable_inplace_update_finish().
+fields.  Hence, they get by with just fetching each field once.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c

index 3a13671a1ef68fc35dc008fb4d2dddfd31d3f57a..4c8febdc811230734ea13d929022173ea4f83229 100644 (file)
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6326,6 +6326,8 @@ heap_inplace_update_and_unlock(Relation relation,
     HeapTupleHeader htup = oldtup->t_data;
     uint32      oldlen;
     uint32      newlen;
+   char       *dst;
+   char       *src;
     int         nmsgs = 0;
     SharedInvalidationMessage *invalMessages = NULL;
     bool        RelcacheInitFileInval = false;
@@ -6336,6 +6338,9 @@ heap_inplace_update_and_unlock(Relation relation,
     if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
         elog(ERROR, "wrong tuple length");
  
+   dst = (char *) htup + htup->t_hoff;
+   src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
     /*
      * Construct shared cache inval if necessary.  Note that because we only
      * pass the new version of the tuple, this mustn't be used for any
@@ -6359,15 +6364,15 @@ heap_inplace_update_and_unlock(Relation relation,
      */
     PreInplace_Inval();
  
-   /* NO EREPORT(ERROR) from here till changes are logged */
-   START_CRIT_SECTION();
-
-   memcpy((char *) htup + htup->t_hoff,
-          (char *) tuple->t_data + tuple->t_data->t_hoff,
-          newlen);
-
     /*----------
-    * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+    * NO EREPORT(ERROR) from here till changes are complete
+    *
+    * Our buffer lock won't stop a reader having already pinned and checked
+    * visibility for this tuple.  Hence, we write WAL first, then mutate the
+    * buffer.  Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+    * checkpoint delay makes that acceptable.  With the usual order of
+    * changes, a crash after memcpy() and before XLogInsert() could allow
+    * datfrozenxid to overtake relfrozenxid:
      *
      * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
      * ["R" is a VACUUM tbl]
@@ -6377,14 +6382,28 @@ heap_inplace_update_and_unlock(Relation relation,
      * D: raise pg_database.datfrozenxid, XLogInsert(), finish
      * [crash]
      * [recovery restores datfrozenxid w/o relfrozenxid]
+    *
+    * Like in MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(), copy
+    * the buffer to the stack before logging.  Here, that facilitates a FPI
+    * of the post-mutation block before we accept other sessions seeing it.
      */
-
-   MarkBufferDirty(buffer);
+   Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+   START_CRIT_SECTION();
+   MyProc->delayChkptFlags |= DELAY_CHKPT_START;
  
     /* XLOG stuff */
     if (RelationNeedsWAL(relation))
     {
         xl_heap_inplace xlrec;
+       PGAlignedBlock copied_buffer;
+       char       *origdata = (char *) BufferGetBlock(buffer);
+       Page        page = BufferGetPage(buffer);
+       uint16      lower = ((PageHeader) page)->pd_lower;
+       uint16      upper = ((PageHeader) page)->pd_upper;
+       uintptr_t   dst_offset_in_block;
+       RelFileLocator rlocator;
+       ForkNumber  forkno;
+       BlockNumber blkno;
         XLogRecPtr  recptr;
  
         xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@@ -6399,16 +6418,28 @@ heap_inplace_update_and_unlock(Relation relation,
             XLogRegisterData((char *) invalMessages,
                              nmsgs * sizeof(SharedInvalidationMessage));
  
-       XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-       XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+       /* register block matching what buffer will look like after changes */
+       memcpy(copied_buffer.data, origdata, lower);
+       memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+       dst_offset_in_block = dst - origdata;
+       memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+       BufferGetTag(buffer, &rlocator, &forkno, &blkno);
+       Assert(forkno == MAIN_FORKNUM);
+       XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
+                         REGBUF_STANDARD);
+       XLogRegisterBufData(0, src, newlen);
  
         /* inplace updates aren't decoded atm, don't log the origin */
  
         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
  
-       PageSetLSN(BufferGetPage(buffer), recptr);
+       PageSetLSN(page, recptr);
     }
  
+   memcpy(dst, src, newlen);
+
+   MarkBufferDirty(buffer);
+
     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
     /*
@@ -6421,6 +6452,7 @@ heap_inplace_update_and_unlock(Relation relation,
      */
     AtInplace_Inval();
  
+   MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
     END_CRIT_SECTION();
     UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
author	Noah Misch <noah@leadboat.com>
	Fri, 25 Oct 2024 13:51:03 +0000 (06:51 -0700)
committer	Noah Misch <noah@leadboat.com>
	Fri, 25 Oct 2024 13:51:03 +0000 (06:51 -0700)
src/backend/access/heap/README.tuplock		patch \| blob \| blame \| history
src/backend/access/heap/heapam.c		patch \| blob \| blame \| history