* crash/basebackup, even though the state of the data directory would
* require it.
*/
- Assert(!MyProc->delayChkpt);
- MyProc->delayChkpt = true;
+ Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkpt |= DELAY_CHKPT_START;
/* WAL log truncation */
WriteMTruncateXlogRec(newOldestMultiDB,
/* Then offsets */
PerformOffsetsTruncation(oldestMulti, newOldestMulti);
- MyProc->delayChkpt = false;
+ MyProc->delayChkpt &= ~DELAY_CHKPT_START;
END_CRIT_SECTION();
LWLockRelease(MultiXactTruncationLock);
}
proc->xid = xid;
Assert(proc->xmin == InvalidTransactionId);
- proc->delayChkpt = false;
+ proc->delayChkpt = 0;
proc->statusFlags = 0;
proc->pid = 0;
proc->databaseId = databaseid;
START_CRIT_SECTION();
- MyProc->delayChkpt = true;
+ Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkpt |= DELAY_CHKPT_START;
XLogBeginInsert();
for (record = records.head; record != NULL; record = record->next)
* checkpoint starting after this will certainly see the gxact as a
* candidate for fsyncing.
*/
- MyProc->delayChkpt = false;
+ MyProc->delayChkpt &= ~DELAY_CHKPT_START;
/*
* Remember that we have this GlobalTransaction entry locked for us. If
START_CRIT_SECTION();
/* See notes in RecordTransactionCommit */
- MyProc->delayChkpt = true;
+ Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkpt |= DELAY_CHKPT_START;
/*
* Emit the XLOG commit record. Note that we mark 2PC commits as
TransactionIdCommitTree(xid, nchildren, children);
/* Checkpoint can proceed now */
- MyProc->delayChkpt = false;
+ MyProc->delayChkpt &= ~DELAY_CHKPT_START;
END_CRIT_SECTION();
* This makes checkpoint's determination of which xacts are delayChkpt
* a bit fuzzy, but it doesn't matter.
*/
+ Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
START_CRIT_SECTION();
- MyProc->delayChkpt = true;
+ MyProc->delayChkpt |= DELAY_CHKPT_START;
SetCurrentTransactionStopTimestamp();
*/
if (markXidCommitted)
{
- MyProc->delayChkpt = false;
+ MyProc->delayChkpt &= ~DELAY_CHKPT_START;
END_CRIT_SECTION();
}
* and we will correctly flush the update below. So we cannot miss any
* xacts we need to wait for.
*/
- vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+ vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
if (nvxids > 0)
{
do
{
pg_usleep(10000L); /* wait for 10 msec */
- } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+ } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+ DELAY_CHKPT_START));
}
pfree(vxids);
CheckPointGuts(checkPoint.redo, flags);
+ vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
+ if (nvxids > 0)
+ {
+ do
+ {
+ pg_usleep(10000L); /* wait for 10 msec */
+ } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+ DELAY_CHKPT_COMPLETE));
+ }
+ pfree(vxids);
+
/*
* Take a snapshot of running transactions and write this to WAL. This
* allows us to reconstruct the state of running transactions during
/*
* Ensure no checkpoint can change our view of RedoRecPtr.
*/
- Assert(MyProc->delayChkpt);
+ Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
/*
* Update RedoRecPtr so that we can make the right decision
RelationPreTruncate(rel);
+ /*
+ * Make sure that a concurrent checkpoint can't complete while truncation
+ * is in progress.
+ *
+ * The truncation operation might drop buffers that the checkpoint
+ * otherwise would have flushed. If it does, then it's essential that
+ * the files actually get truncated on disk before the checkpoint record
+ * is written. Otherwise, if replay begins from that checkpoint, the
+ * to-be-truncated blocks might still exist on disk but have older
+ * contents than expected, which can cause replay to fail. It's OK for
+ * the blocks to not exist on disk at all, but not for them to have the
+ * wrong contents.
+ */
+ Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
+ MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
+
/*
* We WAL-log the truncation before actually truncating, which means
* trouble if the truncation fails. If we then crash, the WAL replay
XLogFlush(lsn);
}
- /* Do the real work to truncate relation forks */
+ /*
+ * This will first remove any buffers from the buffer pool that should no
+ * longer exist after truncation is complete, and then truncate the
+ * corresponding files on disk.
+ */
smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
+ /* We've done all the critical work, so checkpoints are OK now. */
+ MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
+
/*
* Update upper-level FSM pages to account for the truncation. This is
* important because the just-truncated pages were likely marked as
* all-free, and would be preferentially selected.
+ *
+ * NB: There's no point in delaying checkpoints until this is done.
+ * Because the FSM is not WAL-logged, we have to be prepared for the
+ * possibility of corruption after a crash anyway.
*/
if (need_fsm_vacuum)
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
* essential that CreateCheckPoint waits for virtual transactions
* rather than full transactionids.
*/
- MyProc->delayChkpt = delayChkpt = true;
+ Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkpt |= DELAY_CHKPT_START;
+ delayChkpt = true;
lsn = XLogSaveBufferForHint(buffer, buffer_std);
}
UnlockBufHdr(bufHdr, buf_state);
if (delayChkpt)
- MyProc->delayChkpt = false;
+ MyProc->delayChkpt &= ~DELAY_CHKPT_START;
if (dirtied)
{
proc->lxid = InvalidLocalTransactionId;
proc->xmin = InvalidTransactionId;
- proc->delayChkpt = false; /* be sure this is cleared in abort */
+
+ /* be sure this is cleared in abort */
+ proc->delayChkpt = 0;
+
proc->recoveryConflictPending = false;
/* must be cleared with xid/xmin: */
proc->xid = InvalidTransactionId;
proc->lxid = InvalidLocalTransactionId;
proc->xmin = InvalidTransactionId;
- proc->delayChkpt = false; /* be sure this is cleared in abort */
+
+ /* be sure this is cleared in abort */
+ proc->delayChkpt = 0;
+
proc->recoveryConflictPending = false;
/* must be cleared with xid/xmin: */
* delaying checkpoint because they have critical actions in progress.
*
* Constructs an array of VXIDs of transactions that are currently in commit
- * critical sections, as shown by having delayChkpt set in their PGPROC.
+ * critical sections, as shown by having specified delayChkpt bits set in their
+ * PGPROC.
*
* Returns a palloc'd array that should be freed by the caller.
* *nvxids is the number of valid entries.
* for clearing of delayChkpt to propagate is unimportant for correctness.
*/
VirtualTransactionId *
-GetVirtualXIDsDelayingChkpt(int *nvxids)
+GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
{
VirtualTransactionId *vxids;
ProcArrayStruct *arrayP = procArray;
int count = 0;
int index;
+ Assert(type != 0);
+
/* allocate what's certainly enough result space */
vxids = (VirtualTransactionId *)
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
int pgprocno = arrayP->pgprocnos[index];
PGPROC *proc = &allProcs[pgprocno];
- if (proc->delayChkpt)
+ if ((proc->delayChkpt & type) != 0)
{
VirtualTransactionId vxid;
* those numbers should be small enough for it not to be a problem.
*/
bool
-HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
+HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
{
bool result = false;
ProcArrayStruct *arrayP = procArray;
int index;
+ Assert(type != 0);
+
LWLockAcquire(ProcArrayLock, LW_SHARED);
for (index = 0; index < arrayP->numProcs; index++)
GET_VXID_FROM_PGPROC(vxid, *proc);
- if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
+ if ((proc->delayChkpt & type) != 0 &&
+ VirtualTransactionIdIsValid(vxid))
{
int i;
MyProc->roleId = InvalidOid;
MyProc->tempNamespaceId = InvalidOid;
MyProc->isBackgroundWorker = IsBackgroundWorker;
- MyProc->delayChkpt = false;
+ MyProc->delayChkpt = 0;
MyProc->statusFlags = 0;
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
if (IsAutoVacuumWorkerProcess())
MyProc->roleId = InvalidOid;
MyProc->tempNamespaceId = InvalidOid;
MyProc->isBackgroundWorker = IsBackgroundWorker;
- MyProc->delayChkpt = false;
+ MyProc->delayChkpt = 0;
MyProc->statusFlags = 0;
MyProc->lwWaiting = false;
MyProc->lwWaitMode = 0;
*/
#define INVALID_PGPROCNO PG_INT32_MAX
+/*
+ * Flags for PGPROC.delayChkpt
+ *
+ * These flags can be used to delay the start or completion of a checkpoint
+ * for short periods. A flag is in effect if the corresponding bit is set in
+ * the PGPROC of any backend.
+ *
+ * For our purposes here, a checkpoint has three phases: (1) determine the
+ * location to which the redo pointer will be moved, (2) write all the
+ * data durably to disk, and (3) WAL-log the checkpoint.
+ *
+ * Setting DELAY_CHKPT_START prevents the system from moving from phase 1
+ * to phase 2. This is useful when we are performing a WAL-logged modification
+ * of data that will be flushed to disk in phase 2. By setting this flag
+ * before writing WAL and clearing it after we've both written WAL and
+ * performed the corresponding modification, we ensure that if the WAL record
+ * is inserted prior to the new redo point, the corresponding data changes will
+ * also be flushed to disk before the checkpoint can complete. (In the
+ * extremely common case where the data being modified is in shared buffers
+ * and we acquire an exclusive content lock on the relevant buffers before
+ * writing WAL, this mechanism is not needed, because phase 2 will block
+ * until we release the content lock and then flush the modified data to
+ * disk.)
+ *
+ * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
+ * to phase 3. This is useful if we are performing a WAL-logged operation that
+ * might invalidate buffers, such as relation truncation. In this case, we need
+ * to ensure that any buffers which were invalidated and thus not flushed by
+ * the checkpoint are actually destroyed on disk. Replay can cope with a file
+ * or block that doesn't exist, but not with a block that has the wrong
+ * contents.
+ */
+#define DELAY_CHKPT_START (1<<0)
+#define DELAY_CHKPT_COMPLETE (1<<1)
+
typedef enum
{
PROC_WAIT_STATUS_OK,
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
* started */
- bool delayChkpt; /* true if this proc delays checkpoint start */
+ int delayChkpt; /* for DELAY_CHKPT_* flags */
uint8 statusFlags; /* this backend's status flags, see PROC_*
* above. mirrored in
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
-extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
-extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
+extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
+extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
+ int nvxids, int type);
extern PGPROC *BackendPidGetProc(int pid);
extern PGPROC *BackendPidGetProcWithLock(int pid);