pg_buffercache_pages.o
EXTENSION = pg_buffercache
-DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
- pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql
+DATA = pg_buffercache--1.0--1.1.sql pg_buffercache--1.1--1.2.sql pg_buffercache--1.2.sql \
+ pg_buffercache--1.2--1.3.sql pg_buffercache--1.3--1.4.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
REGRESS = pg_buffercache
--- /dev/null
+/* contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.4'" to load this file. \quit
+
+/*
+ * The view and the function are recreated below with a different result
+ * signature.  First we have to remove them from the extension ...
+ */
+ALTER EXTENSION pg_buffercache DROP VIEW pg_buffercache;
+ALTER EXTENSION pg_buffercache DROP FUNCTION pg_buffercache_pages();
+
+/* ... then we can drop them (view first, since it selects from the function) */
+DROP VIEW pg_buffercache;
+DROP FUNCTION pg_buffercache_pages();
+
+/* Now redefine the function, bound to the C symbol pg_buffercache_pages_v1_4 */
+CREATE FUNCTION pg_buffercache_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pages_v1_4'
+LANGUAGE C PARALLEL SAFE;
+
+-- The function returns SETOF RECORD, so the view supplies the column
+-- definition list; relfilenode is declared as int8 here.
+CREATE VIEW pg_buffercache AS
+ SELECT P.* FROM pg_buffercache_pages() AS P
+ (bufferid integer, relfilenode int8, reltablespace oid, reldatabase oid,
+ relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+ pinning_backends int4);
+
+-- Don't want these to be available to public; the objects were just
+-- recreated, so the privileges are set up again: revoke from PUBLIC and
+-- grant access to pg_monitor.
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache TO pg_monitor;
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.3'
+default_version = '1.4'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_pages_v1_4);
-Datum
-pg_buffercache_pages(PG_FUNCTION_ARGS)
+static Datum
+pg_buffercache_pages_internal(PG_FUNCTION_ARGS, Oid rfn_typid)
{
FuncCallContext *funcctx;
Datum result;
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
- OIDOID, -1, 0);
+ rfn_typid, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
}
else
{
- values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
+ if (rfn_typid == INT8OID)
+ values[1] =
+ Int64GetDatum((int64) fctx->record[i].relfilenumber);
+ else
+ {
+ Assert(rfn_typid == OIDOID);
+
+ if (fctx->record[i].relfilenumber > OID_MAX)
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("relfilenode %llu is too large to be represented as an OID",
+ (unsigned long long) fctx->record[i].relfilenumber),
+ errhint("Upgrade the extension using ALTER EXTENSION pg_buffercache UPDATE"));
+
+ values[1] =
+ ObjectIdGetDatum((Oid) fctx->record[i].relfilenumber);
+ }
+
nulls[1] = false;
values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
nulls[2] = false;
else
SRF_RETURN_DONE(funcctx);
}
+
+/*
+ * Entry point for old extension versions.
+ *
+ * Passes OIDOID as the relfilenode column type, so the shared implementation
+ * returns relfilenode as an OID and raises an error for any value larger
+ * than OID_MAX.
+ */
+Datum
+pg_buffercache_pages(PG_FUNCTION_ARGS)
+{
+ return pg_buffercache_pages_internal(fcinfo, OIDOID);
+}
+
+/*
+ * Entry point referenced by the 1.4 extension script.
+ *
+ * Passes INT8OID as the relfilenode column type, so the shared implementation
+ * returns relfilenode as a 64-bit integer.
+ */
+Datum
+pg_buffercache_pages_v1_4(PG_FUNCTION_ARGS)
+{
+ return pg_buffercache_pages_internal(fcinfo, INT8OID);
+}
{
unsigned forknum;
- if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
+ if (fscanf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n", &blkinfo[i].database,
&blkinfo[i].tablespace, &blkinfo[i].filenumber,
&forknum, &blkinfo[i].blocknum) != 5)
ereport(ERROR,
{
CHECK_FOR_INTERRUPTS();
- ret = fprintf(file, "%u,%u,%u,%u,%u\n",
+ ret = fprintf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n",
block_info_array[i].database,
block_info_array[i].tablespace,
block_info_array[i].filenumber,
-- ===================================================================
-- Test for filtering out WAL records of a particular table
-- ===================================================================
-SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset
+SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2')
- WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap';
+ WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap';
ok
----
t
-- Test for filtering out WAL records of a particular table
-- ===================================================================
-SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset
+SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2')
- WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap';
+ WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap';
-- ===================================================================
-- Test for filtering out WAL records based on resource_manager and
<row>
<entry role="catalog_table_entry"><para role="column_definition">
- <structfield>relfilenode</structfield> <type>oid</type>
+ <structfield>relfilenode</structfield> <type>int8</type>
</para>
<para>
Name of the on-disk file of this relation; zero means this
<entry><type>timestamp with time zone</type></entry>
</row>
+ <row>
+ <entry><structfield>next_relfilenumber</structfield></entry>
+ <entry><type>int8</type></entry>
+ </row>
+
</tbody>
</tgroup>
</table>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
- <structfield>relfilenode</structfield> <type>oid</type>
+ <structfield>relfilenode</structfield> <type>int8</type>
(references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>relfilenode</structfield>)
</para>
<para>
<caution>
<para>
-Note that while a table's filenode often matches its OID, this is
-<emphasis>not</emphasis> necessarily the case; some operations, like
-<command>TRUNCATE</command>, <command>REINDEX</command>, <command>CLUSTER</command> and some forms
-of <command>ALTER TABLE</command>, can change the filenode while preserving the OID.
-Avoid assuming that filenode and table OID are the same.
+Note that a table's filenode will normally be different than the OID. For
+system tables, the initial filenode will be equal to the table OID, but it will
+be different if the table has ever been subjected to a rewriting operation,
+such as <command>TRUNCATE</command>, <command>REINDEX</command>,
+<command>CLUSTER</command> or some forms of <command>ALTER TABLE</command>.
+For user tables, even the initial filenode will be different than the table OID.
Also, for certain system catalogs including <structname>pg_class</structname> itself,
<structname>pg_class</structname>.<structfield>relfilenode</structfield> contains zero. The
actual filenode number of these catalogs is stored in a lower-level data
BlockNumber blknum;
BufferGetTag(buffer, &locator, &forknum, &blknum);
- elog(ERROR, "failed to add item to index page in %u/%u/%u",
+ elog(ERROR, "failed to add item to index page in %u/%u/" UINT64_FORMAT,
locator.spcOid, locator.dbOid, locator.relNumber);
}
}
static void
out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec)
{
- appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; blk %u; latestRemovedXid %u:%u",
xlrec->locator.spcOid, xlrec->locator.dbOid,
xlrec->locator.relNumber, xlrec->block,
EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
{
xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec;
- appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; tid %u/%u",
xlrec->target_locator.spcOid,
xlrec->target_locator.dbOid,
xlrec->target_locator.relNumber,
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
- appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; latestRemovedXid %u:%u",
xlrec->locator.spcOid, xlrec->locator.dbOid,
xlrec->locator.relNumber,
EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
xl_seq_rec *xlrec = (xl_seq_rec *) rec;
if (info == XLOG_SEQ_LOG)
- appendStringInfo(buf, "rel %u/%u/%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT,
xlrec->locator.spcOid, xlrec->locator.dbOid,
xlrec->locator.relNumber);
}
CheckPoint *checkpoint = (CheckPoint *) rec;
appendStringInfo(buf, "redo %X/%X; "
- "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; "
- "oldest xid %u in DB %u; oldest multi %u in DB %u; "
+ "tli %u; prev tli %u; fpw %s; xid %u:%u; relfilenumber " UINT64_FORMAT ";oid %u; "
+ "multi %u; offset %u; oldest xid %u in DB %u; oldest multi %u in DB %u; "
"oldest/newest commit timestamp xid: %u/%u; "
"oldest running xid %u; %s",
LSN_FORMAT_ARGS(checkpoint->redo),
checkpoint->fullPageWrites ? "true" : "false",
EpochFromFullTransactionId(checkpoint->nextXid),
XidFromFullTransactionId(checkpoint->nextXid),
+ checkpoint->nextRelFileNumber,
checkpoint->nextOid,
checkpoint->nextMulti,
checkpoint->nextMultiOffset,
memcpy(&nextOid, rec, sizeof(Oid));
appendStringInfo(buf, "%u", nextOid);
}
+ else if (info == XLOG_NEXT_RELFILENUMBER)
+ {
+ RelFileNumber nextRelFileNumber;
+
+ memcpy(&nextRelFileNumber, rec, sizeof(RelFileNumber));
+ appendStringInfo(buf, UINT64_FORMAT, nextRelFileNumber);
+ }
else if (info == XLOG_RESTORE_POINT)
{
xl_restore_point *xlrec = (xl_restore_point *) rec;
case XLOG_NEXTOID:
id = "NEXTOID";
break;
+ case XLOG_NEXT_RELFILENUMBER:
+ id = "NEXT_RELFILENUMBER";
+ break;
case XLOG_SWITCH:
id = "SWITCH";
break;
appendStringInfoChar(buf, ' ');
appendStringInfo(buf,
- "blkref #%d: rel %u/%u/%u fork %s blk %u",
+ "blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
forkNames[forknum],
if (forknum != MAIN_FORKNUM)
{
appendStringInfo(buf,
- ", blkref #%d: rel %u/%u/%u fork %s blk %u",
+ ", blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
forkNames[forknum],
else
{
appendStringInfo(buf,
- ", blkref #%d: rel %u/%u/%u blk %u",
+ ", blkref #%d: rel %u/%u/" UINT64_FORMAT " blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
blk);
entry in pg_class, but that currently isn't done because of the possibility
of deleting data that is useful for forensic analysis of the crash.
Orphan files are harmless --- at worst they waste a bit of disk space ---
-because we check for on-disk collisions when allocating new relfilenumber
-OIDs. So cleaning up isn't really necessary.
+because the relfilenumber counter is monotonically increasing. The maximum
+value is 2^56-1, and there is no provision for wraparound. Thus, on-disk
+collisions aren't possible.
3. Deleting a table, which requires an unlink() that could fail.
#include "postgres.h"
+#include <unistd.h>
+
#include "access/clog.h"
#include "access/commit_ts.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/xact.h"
#include "access/xlogutils.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_tablespace.h"
#include "commands/dbcommands.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
/* Number of OIDs to prefetch (preallocate) per XLOG write */
#define VAR_OID_PREFETCH 8192
+/* Number of RelFileNumbers to be logged per XLOG write */
+#define VAR_RELNUMBER_PER_XLOG 512
+
+/*
+ * Need to log more if remaining logged RelFileNumbers are less than the
+ * threshold. Valid range could be between 0 to VAR_RELNUMBER_PER_XLOG - 1.
+ */
+#define VAR_RELNUMBER_NEW_XLOG_THRESHOLD 256
+
/* pointer to "variable cache" in shared memory (set up by shmem.c) */
VariableCache ShmemVariableCache = NULL;
* wide, counter wraparound will occur eventually, and therefore it is unwise
* to assume they are unique unless precautions are taken to make them so.
* Hence, this routine should generally not be used directly. The only direct
- * callers should be GetNewOidWithIndex() and GetNewRelFileNumber() in
- * catalog/catalog.c.
+ * caller should be GetNewOidWithIndex() in catalog/catalog.c.
*/
Oid
GetNewObjectId(void)
LWLockRelease(OidGenLock);
}
+/*
+ * GetNewRelFileNumber
+ *
+ * Allocate and return the next value of the shared RelFileNumber counter.
+ * Similar to GetNewObjectId, but generates a relfilenumber instead of an OID.
+ *
+ * reltablespace and relpersistence do not influence the value returned; they
+ * are used only in assert-enabled builds, to construct the path of the
+ * would-be main fork and verify that no such file exists yet.
+ *
+ * Raises an error if called during recovery or during binary upgrade, or if
+ * the counter has gone past MAX_RELFILENUMBER.
+ */
+RelFileNumber
+GetNewRelFileNumber(Oid reltablespace, char relpersistence)
+{
+	RelFileNumber result;
+	RelFileNumber nextRelFileNumber,
+				loggedRelFileNumber,
+				flushedRelFileNumber;
+
+	StaticAssertStmt(VAR_RELNUMBER_NEW_XLOG_THRESHOLD < VAR_RELNUMBER_PER_XLOG,
+					 "VAR_RELNUMBER_NEW_XLOG_THRESHOLD must be smaller than VAR_RELNUMBER_PER_XLOG");
+
+	/* safety check, we should never get this far in a HS standby */
+	if (RecoveryInProgress())
+		elog(ERROR, "cannot assign RelFileNumber during recovery");
+
+	if (IsBinaryUpgrade)
+		elog(ERROR, "cannot assign RelFileNumber during binary upgrade");
+
+	LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+
+	/* work on local copies of the shared counters while holding the lock */
+	nextRelFileNumber = ShmemVariableCache->nextRelFileNumber;
+	loggedRelFileNumber = ShmemVariableCache->loggedRelFileNumber;
+	flushedRelFileNumber = ShmemVariableCache->flushedRelFileNumber;
+
+	Assert(nextRelFileNumber <= flushedRelFileNumber);
+	Assert(flushedRelFileNumber <= loggedRelFileNumber);
+
+	/* check for the wraparound for the relfilenumber counter */
+	if (unlikely(nextRelFileNumber > MAX_RELFILENUMBER))
+		elog(ERROR, "relfilenumber is too large");
+
+	/*
+	 * If the remaining logged relfilenumbers values are less than the
+	 * threshold value then log more.  Ideally, we can wait until all
+	 * relfilenumbers have been consumed before logging more.  Nevertheless, if
+	 * we do that, we must immediately flush the logged wal record because we
+	 * want to ensure that the nextRelFileNumber is always larger than any
+	 * relfilenumber already in use on disk.  And, to maintain that invariant,
+	 * we must make sure that the record we log reaches the disk before any new
+	 * files are created with the newly logged range.
+	 *
+	 * So in order to avoid flushing the wal immediately, we always log before
+	 * consuming all the relfilenumber, and now we only have to flush the newly
+	 * logged relfilenumber wal before consuming the relfilenumber from this
+	 * new range.  By the time we need to flush this wal, hopefully, those have
+	 * already been flushed with some other XLogFlush operation.
+	 */
+	if (loggedRelFileNumber - nextRelFileNumber <=
+		VAR_RELNUMBER_NEW_XLOG_THRESHOLD)
+	{
+		XLogRecPtr	recptr;
+
+		loggedRelFileNumber = loggedRelFileNumber + VAR_RELNUMBER_PER_XLOG;
+		recptr = LogNextRelFileNumber(loggedRelFileNumber);
+		ShmemVariableCache->loggedRelFileNumber = loggedRelFileNumber;
+
+		/* remember for the future flush */
+		ShmemVariableCache->loggedRelFileNumberRecPtr = recptr;
+	}
+
+	/*
+	 * If nextRelFileNumber has reached the end of the already-flushed range,
+	 * flush the WAL record of the most recently logged range before handing
+	 * out a value from it.
+	 */
+	if (nextRelFileNumber >= flushedRelFileNumber)
+	{
+		XLogFlush(ShmemVariableCache->loggedRelFileNumberRecPtr);
+		ShmemVariableCache->flushedRelFileNumber = loggedRelFileNumber;
+	}
+
+	result = ShmemVariableCache->nextRelFileNumber;
+
+	/* we should never be using any relfilenumber outside the flushed range */
+	Assert(result <= ShmemVariableCache->flushedRelFileNumber);
+
+	(ShmemVariableCache->nextRelFileNumber)++;
+
+	LWLockRelease(RelFileNumberGenLock);
+
+	/*
+	 * Because the RelFileNumber counter only ever increases and never wraps
+	 * around, it should be impossible for the newly-allocated RelFileNumber to
+	 * already be in use.  But, if Asserts are enabled, double check that
+	 * there's no main-fork relation file with the new RelFileNumber already on
+	 * disk.
+	 */
+#ifdef USE_ASSERT_CHECKING
+	{
+		RelFileLocatorBackend rlocator;
+		char	   *rpath;
+		BackendId	backend;
+
+		switch (relpersistence)
+		{
+			case RELPERSISTENCE_TEMP:
+				backend = BackendIdForTempRelations();
+				break;
+			case RELPERSISTENCE_UNLOGGED:
+			case RELPERSISTENCE_PERMANENT:
+				backend = InvalidBackendId;
+				break;
+			default:
+				elog(ERROR, "invalid relpersistence: %c", relpersistence);
+		}
+
+		/* this logic should match RelationInitPhysicalAddr */
+		rlocator.locator.spcOid =
+			reltablespace ? reltablespace : MyDatabaseTableSpace;
+		rlocator.locator.dbOid = (reltablespace == GLOBALTABLESPACE_OID) ?
+			InvalidOid : MyDatabaseId;
+		rlocator.locator.relNumber = result;
+
+		/*
+		 * The relpath will vary based on the backend ID, so we must
+		 * initialize that properly here to make sure that any collisions
+		 * based on filename are properly detected.
+		 */
+		rlocator.backend = backend;
+
+		/* check for existing file of same name. */
+		rpath = relpath(rlocator, MAIN_FORKNUM);
+		Assert(access(rpath, F_OK) != 0);
+
+		/* the path string is no longer needed; don't leak it */
+		pfree(rpath);
+	}
+#endif
+
+	return result;
+}
+
+/*
+ * SetNextRelFileNumber
+ *
+ * This may only be called during pg_upgrade; it advances the RelFileNumber
+ * counter to the specified value if the current value is smaller than the
+ * input value.  Otherwise the counter is left unchanged.
+ *
+ * Raises an error if called during recovery or outside binary upgrade.
+ */
+void
+SetNextRelFileNumber(RelFileNumber relnumber)
+{
+ /* safety check, we should never get this far in a HS standby */
+ if (RecoveryInProgress())
+ elog(ERROR, "cannot set RelFileNumber during recovery");
+
+ if (!IsBinaryUpgrade)
+ elog(ERROR, "RelFileNumber can be set only during binary upgrade");
+
+ LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+
+ /*
+ * If the counter is already at or beyond the requested value then there
+ * is nothing to be done. This is possible because during upgrade the
+ * objects are not created in relfilenumber order.
+ */
+ if (relnumber <= ShmemVariableCache->nextRelFileNumber)
+ {
+ LWLockRelease(RelFileNumberGenLock);
+ return;
+ }
+
+ /*
+ * If the new relfilenumber to be set is greater than or equal to already
+ * flushed relfilenumber then log more and flush immediately.
+ *
+ * (This is less efficient than GetNewRelFileNumber, which arranges to
+ * log some new relfilenumbers before the old batch is exhausted in the
+ * hope that a flush will happen in the background before any values are
+ * needed from the new batch. However, since this is only used during
+ * binary upgrade, it shouldn't really matter.)
+ */
+ if (relnumber >= ShmemVariableCache->flushedRelFileNumber)
+ {
+ RelFileNumber newlogrelnum;
+
+ newlogrelnum = relnumber + VAR_RELNUMBER_PER_XLOG;
+ XLogFlush(LogNextRelFileNumber(newlogrelnum));
+
+ /* we have flushed whatever we have logged so no pending flush */
+ ShmemVariableCache->loggedRelFileNumber = newlogrelnum;
+ ShmemVariableCache->flushedRelFileNumber = newlogrelnum;
+ ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr;
+ }
+
+ ShmemVariableCache->nextRelFileNumber = relnumber;
+
+ LWLockRelease(RelFileNumberGenLock);
+}
+
/*
* StopGeneratingPinnedObjectIds
*
checkPoint.nextXid =
FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
checkPoint.nextOid = FirstGenbkiObjectId;
+ checkPoint.nextRelFileNumber = FirstNormalRelFileNumber;
checkPoint.nextMulti = FirstMultiXactId;
checkPoint.nextMultiOffset = 0;
checkPoint.oldestXid = FirstNormalTransactionId;
ShmemVariableCache->nextXid = checkPoint.nextXid;
ShmemVariableCache->nextOid = checkPoint.nextOid;
+ ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber;
ShmemVariableCache->oidCount = 0;
+ ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
/* initialize shared memory variables from the checkpoint record */
ShmemVariableCache->nextXid = checkPoint.nextXid;
ShmemVariableCache->nextOid = checkPoint.nextOid;
+ ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber;
ShmemVariableCache->oidCount = 0;
+ ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
checkPoint.nextOid += ShmemVariableCache->oidCount;
LWLockRelease(OidGenLock);
+ /*
+ * If this is a shutdown checkpoint then we can safely start allocating
+ * relfilenumbers from the nextRelFileNumber value after the restart,
+ * because no one else can use a relfilenumber beyond that number before
+ * the shutdown. OTOH, if it is a normal checkpoint and there is a crash
+ * after this point, we might end up reusing the same relfilenumbers after
+ * the restart, so we need to set the nextRelFileNumber to the already
+ * logged relfilenumber, as no one will use a number beyond this limit
+ * without logging again.
+ */
+ LWLockAcquire(RelFileNumberGenLock, LW_SHARED);
+ if (shutdown)
+ checkPoint.nextRelFileNumber = ShmemVariableCache->nextRelFileNumber;
+ else
+ checkPoint.nextRelFileNumber = ShmemVariableCache->loggedRelFileNumber;
+
+ LWLockRelease(RelFileNumberGenLock);
+
MultiXactGetCheckptMulti(shutdown,
&checkPoint.nextMulti,
&checkPoint.nextMultiOffset,
*/
}
+/*
+ * LogNextRelFileNumber
+ *
+ * Relfilenumber counterpart of XLogPutNextOid: insert a NEXT_RELFILENUMBER
+ * WAL record whose payload is nextrelnumber.  The record is only inserted
+ * here, not flushed; its XLogRecPtr is handed back so that the caller can
+ * flush it at the appropriate time.
+ */
+XLogRecPtr
+LogNextRelFileNumber(RelFileNumber nextrelnumber)
+{
+	XLogBeginInsert();
+	XLogRegisterData((char *) &nextrelnumber, sizeof(nextrelnumber));
+
+	return XLogInsert(RM_XLOG_ID, XLOG_NEXT_RELFILENUMBER);
+}
+
/*
* Write an XLOG SWITCH record.
*
ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
}
+ if (info == XLOG_NEXT_RELFILENUMBER)
+ {
+ RelFileNumber nextRelFileNumber;
+
+ memcpy(&nextRelFileNumber, XLogRecGetData(record), sizeof(RelFileNumber));
+ LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextRelFileNumber = nextRelFileNumber;
+ ShmemVariableCache->loggedRelFileNumber = nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = nextRelFileNumber;
+ LWLockRelease(RelFileNumberGenLock);
+ }
else if (info == XLOG_CHECKPOINT_SHUTDOWN)
{
CheckPoint checkPoint;
ShmemVariableCache->nextOid = checkPoint.nextOid;
ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
+ LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber;
+ LWLockRelease(RelFileNumberGenLock);
MultiXactSetNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation",
+ "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " until %X/%X is replayed, which creates the relation",
xlrec->rlocator.spcOid,
xlrec->rlocator.dbOid,
xlrec->rlocator.relNumber,
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation",
+ "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " from block %u until %X/%X is replayed, which truncates the relation",
xlrec->rlocator.spcOid,
xlrec->rlocator.dbOid,
xlrec->rlocator.relNumber,
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk",
+ "suppressing all prefetch in relation %u/%u/" UINT64_FORMAT " until %X/%X is replayed, because the relation does not exist on disk",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small",
+ "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " from block %u until %X/%X is replayed, because the relation is too small",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
* truncated beneath our feet?
*/
elog(ERROR,
- "could not prefetch relation %u/%u/%u block %u",
+ "could not prefetch relation %u/%u/" UINT64_FORMAT " block %u",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
+ "prefetch of %u/%u/" UINT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
LSN_FORMAT_ARGS(filter->filter_until_replayed),
filter->filter_from_block);
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
+ "prefetch of %u/%u/" UINT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
LSN_FORMAT_ARGS(filter->filter_until_replayed));
#endif
continue;
if (forknum != MAIN_FORKNUM)
- appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/" UINT64_FORMAT ", fork %u, blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid,
rlocator.relNumber,
forknum,
blk);
else
- appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/" UINT64_FORMAT ", blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid,
rlocator.relNumber,
if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
{
elog(FATAL,
- "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+ "inconsistent page found, rel %u/%u/" UINT64_FORMAT ", forknum %u, blkno %u",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
forknum, blkno);
}
rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
/* We don't know the name of the relation; use relfilenumber instead */
- sprintf(RelationGetRelationName(rel), "%u", rlocator.relNumber);
+ sprintf(RelationGetRelationName(rel), UINT64_FORMAT, rlocator.relNumber);
/*
* We set up the lockRelId in case anything tries to lock the dummy
- * relation. Note that this is fairly bogus since relNumber may be
+ * relation. Note that this is fairly bogus since relNumber is completely
* different from the relation's OID. It shouldn't really matter though.
* In recovery, we are running by ourselves and can't have any lock
* conflicts. While syncing, we already hold AccessExclusiveLock.
*/
rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid;
- rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber;
+ rel->rd_lockInfo.lockRelId.relId = (Oid) rlocator.relNumber;
rel->rd_smgr = NULL;
if (relForkNum != INIT_FORKNUM)
{
char initForkFile[MAXPGPATH];
- char relNumber[OIDCHARS + 1];
+ char relNumber[RELNUMBERCHARS + 1];
/*
* If any other type of fork, check if there is an init fork
return newOid;
}
-/*
- * GetNewRelFileNumber
- * Generate a new relfilenumber that is unique within the
- * database of the given tablespace.
- *
- * If the relfilenumber will also be used as the relation's OID, pass the
- * opened pg_class catalog, and this routine will guarantee that the result
- * is also an unused OID within pg_class. If the result is to be used only
- * as a relfilenumber for an existing relation, pass NULL for pg_class.
- *
- * As with GetNewOidWithIndex(), there is some theoretical risk of a race
- * condition, but it doesn't seem worth worrying about.
- *
- * Note: we don't support using this in bootstrap mode. All relations
- * created by bootstrap have preassigned OIDs, so there's no need.
- */
-RelFileNumber
-GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence)
-{
- RelFileLocatorBackend rlocator;
- char *rpath;
- bool collides;
- BackendId backend;
-
- /*
- * If we ever get here during pg_upgrade, there's something wrong; all
- * relfilenumber assignments during a binary-upgrade run should be
- * determined by commands in the dump script.
- */
- Assert(!IsBinaryUpgrade);
-
- switch (relpersistence)
- {
- case RELPERSISTENCE_TEMP:
- backend = BackendIdForTempRelations();
- break;
- case RELPERSISTENCE_UNLOGGED:
- case RELPERSISTENCE_PERMANENT:
- backend = InvalidBackendId;
- break;
- default:
- elog(ERROR, "invalid relpersistence: %c", relpersistence);
- return InvalidRelFileNumber; /* placate compiler */
- }
-
- /* This logic should match RelationInitPhysicalAddr */
- rlocator.locator.spcOid = reltablespace ? reltablespace : MyDatabaseTableSpace;
- rlocator.locator.dbOid =
- (rlocator.locator.spcOid == GLOBALTABLESPACE_OID) ?
- InvalidOid : MyDatabaseId;
-
- /*
- * The relpath will vary based on the backend ID, so we must initialize
- * that properly here to make sure that any collisions based on filename
- * are properly detected.
- */
- rlocator.backend = backend;
-
- do
- {
- CHECK_FOR_INTERRUPTS();
-
- /* Generate the OID */
- if (pg_class)
- rlocator.locator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId,
- Anum_pg_class_oid);
- else
- rlocator.locator.relNumber = GetNewObjectId();
-
- /* Check for existing file of same name */
- rpath = relpath(rlocator, MAIN_FORKNUM);
-
- if (access(rpath, F_OK) == 0)
- {
- /* definite collision */
- collides = true;
- }
- else
- {
- /*
- * Here we have a little bit of a dilemma: if errno is something
- * other than ENOENT, should we declare a collision and loop? In
- * practice it seems best to go ahead regardless of the errno. If
- * there is a colliding file we will get an smgr failure when we
- * attempt to create the new relation file.
- */
- collides = false;
- }
-
- pfree(rpath);
- } while (collides);
-
- return rlocator.locator.relNumber;
-}
-
/*
* SQL callable interface for GetNewOidWithIndex(). Outside of initdb's
* direct insertions into catalog tables, and recovering from corruption, this
else
{
/*
- * If relfilenumber is unspecified by the caller then create storage
- * with oid same as relid.
+ * If relfilenumber is unspecified by the caller then allocate a new
+ * one, except for system tables, for which we make the initial
+ * relfilenumber the same as the table OID. See the comments for
+ * FirstNormalRelFileNumber for an explanation of why we do this.
*/
if (!RelFileNumberIsValid(relfilenumber))
- relfilenumber = relid;
+ {
+ if (relid < FirstNormalObjectId)
+ relfilenumber = relid;
+ else
+ relfilenumber = GetNewRelFileNumber(reltablespace,
+ relpersistence);
+ }
}
/*
values[Anum_pg_class_reloftype - 1] = ObjectIdGetDatum(rd_rel->reloftype);
values[Anum_pg_class_relowner - 1] = ObjectIdGetDatum(rd_rel->relowner);
values[Anum_pg_class_relam - 1] = ObjectIdGetDatum(rd_rel->relam);
- values[Anum_pg_class_relfilenode - 1] = ObjectIdGetDatum(rd_rel->relfilenode);
+ values[Anum_pg_class_relfilenode - 1] = Int64GetDatum(rd_rel->relfilenode);
values[Anum_pg_class_reltablespace - 1] = ObjectIdGetDatum(rd_rel->reltablespace);
values[Anum_pg_class_relpages - 1] = Int32GetDatum(rd_rel->relpages);
values[Anum_pg_class_reltuples - 1] = Float4GetDatum(rd_rel->reltuples);
if (shared_relation && reltablespace != GLOBALTABLESPACE_OID)
elog(ERROR, "shared relations must be placed in pg_global tablespace");
- /*
- * Allocate an OID for the relation, unless we were told what to use.
- *
- * The OID will be the relfilenumber as well, so make sure it doesn't
- * collide with either pg_class OIDs or existing physical files.
- */
+ /* Allocate an OID for the relation, unless we were told what to use. */
if (!OidIsValid(relid))
{
/* Use binary-upgrade override for pg_class.oid and relfilenumber */
}
if (!OidIsValid(relid))
- relid = GetNewRelFileNumber(reltablespace, pg_class_desc,
- relpersistence);
+ relid = GetNewOidWithIndex(pg_class_desc, ClassOidIndexId,
+ Anum_pg_class_oid);
}
/*
collationObjectId,
classObjectId);
- /*
- * Allocate an OID for the index, unless we were told what to use.
- *
- * The OID will be the relfilenumber as well, so make sure it doesn't
- * collide with either pg_class OIDs or existing physical files.
- */
+ /* Allocate an OID for the index, unless we were told what to use. */
if (!OidIsValid(indexRelationId))
{
/* Use binary-upgrade override for pg_class.oid and relfilenumber */
}
else
{
- indexRelationId =
- GetNewRelFileNumber(tableSpaceId, pg_class, relpersistence);
+ indexRelationId = GetNewOidWithIndex(pg_class, ClassOidIndexId,
+ Anum_pg_class_oid);
}
}
xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
SMgrRelation reln;
+ if (xlrec->rlocator.relNumber > ShmemVariableCache->nextRelFileNumber)
+ elog(ERROR, "unexpected relnumber " UINT64_FORMAT " that is bigger than nextRelFileNumber " UINT64_FORMAT,
+ xlrec->rlocator.relNumber, ShmemVariableCache->nextRelFileNumber);
+
reln = smgropen(xlrec->rlocator, InvalidBackendId);
smgrcreate(reln, xlrec->forkNum, true);
}
int nforks = 0;
bool need_fsm_vacuum = false;
+ /*
+ * Sanity check: the relfilenumber carried by the WAL record must not be
+ * greater than the next relfilenumber to be assigned.
+ */
+ if (xlrec->rlocator.relNumber > ShmemVariableCache->nextRelFileNumber)
+ elog(ERROR, "unexpected relnumber " UINT64_FORMAT " that is bigger than nextRelFileNumber " UINT64_FORMAT,
+ xlrec->rlocator.relNumber, ShmemVariableCache->nextRelFileNumber);
+
reln = smgropen(xlrec->rlocator, InvalidBackendId);
/*
}
/*
- * Relfilenumbers are not unique in databases across tablespaces, so we
- * need to allocate a new one in the new tablespace.
- */
- newrelfilenumber = GetNewRelFileNumber(newTableSpace, NULL,
+ * Generate a new relfilenumber. We cannot reuse the old relfilenumber
+ * because of the possibility that that relation will be moved back to the
+ * original tablespace before the next checkpoint. At that point, the
+ * first segment of the main fork won't have been unlinked yet, and an
+ * attempt to create new relation storage with that same relfilenumber
+ * will fail.
+ */
+ newrelfilenumber = GetNewRelFileNumber(newTableSpace,
rel->rd_rel->relpersistence);
/* Open old and new relation */
* parts.
*/
if (strlen(location) + 1 + strlen(TABLESPACE_VERSION_DIRECTORY) + 1 +
- OIDCHARS + 1 + OIDCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH)
+ OIDCHARS + 1 + RELNUMBERCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH)
ereport(ERROR,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("tablespace location \"%s\" is too long",
print $off "\tWRITE_UINT_FIELD($f);\n";
print $rff "\tREAD_UINT_FIELD($f);\n" unless $no_read;
}
- elsif ($t eq 'uint64')
+ elsif ($t eq 'uint64' || $t eq 'RelFileNumber')
{
print $off "\tWRITE_UINT64_FIELD($f);\n";
print $rff "\tREAD_UINT64_FIELD($f);\n" unless $no_read;
}
- elsif ($t eq 'Oid' || $t eq 'RelFileNumber')
+ elsif ($t eq 'Oid')
{
print $off "\tWRITE_OID_FIELD($f);\n";
print $rff "\tREAD_OID_FIELD($f);\n" unless $no_read;
break;
case XLOG_NOOP:
case XLOG_NEXTOID:
+ case XLOG_NEXT_RELFILENUMBER:
case XLOG_SWITCH:
case XLOG_BACKUP_END:
case XLOG_PARAMETER_CHANGE:
hash_seq_init(&hstat, tuplecid_data);
while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
{
- elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
+ elog(DEBUG3, "mapping: node: %u/%u/" UINT64_FORMAT " tid: %u/%u cmin: %u, cmax: %u",
ent->key.rlocator.dbOid,
ent->key.rlocator.spcOid,
ent->key.rlocator.relNumber,
typedef struct
{
- Oid reloid; /* hash key */
+ RelFileNumber relnumber; /* hash key */
} unlogged_relation_entry;
/*
* need to be reset. Otherwise, this cleanup operation would be
* O(n^2).
*/
- ctl.keysize = sizeof(Oid);
+ ctl.keysize = sizeof(RelFileNumber);
ctl.entrysize = sizeof(unlogged_relation_entry);
ctl.hcxt = CurrentMemoryContext;
- hash = hash_create("unlogged relation OIDs", 32, &ctl,
+ hash = hash_create("unlogged relation RelFileNumbers", 32, &ctl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
/* Scan the directory. */
continue;
/*
- * Put the OID portion of the name into the hash table, if it
- * isn't already.
+ * Put the RELFILENUMBER portion of the name into the hash table,
+ * if it isn't already.
*/
- ent.reloid = atooid(de->d_name);
+ ent.relnumber = atorelnumber(de->d_name);
(void) hash_search(hash, &ent, HASH_ENTER, NULL);
}
continue;
/*
- * See whether the OID portion of the name shows up in the hash
- * table. If so, nuke it!
+ * See whether the RELFILENUMBER portion of the name shows up in
+ * the hash table. If so, nuke it!
*/
- ent.reloid = atooid(de->d_name);
+ ent.relnumber = atorelnumber(de->d_name);
if (hash_search(hash, &ent, HASH_FIND, NULL))
{
snprintf(rm_path, sizeof(rm_path), "%s/%s",
{
ForkNumber forkNum;
int relnumchars;
- char relnumbuf[OIDCHARS + 1];
+ char relnumbuf[RELNUMBERCHARS + 1];
char srcpath[MAXPGPATH * 2];
char dstpath[MAXPGPATH];
{
ForkNumber forkNum;
int relnumchars;
- char relnumbuf[OIDCHARS + 1];
+ char relnumbuf[RELNUMBERCHARS + 1];
char mainpath[MAXPGPATH];
/* Skip anything that doesn't look like a relation data file. */
* for a non-temporary relation and false otherwise.
*
* NB: If this function returns true, the caller is entitled to assume that
- * *relnumchars has been set to a value no more than OIDCHARS, and thus
- * that a buffer of OIDCHARS+1 characters is sufficient to hold the
+ * *relnumchars has been set to a value no more than RELNUMBERCHARS, and thus
+ * that a buffer of RELNUMBERCHARS+1 characters is sufficient to hold the
* RelFileNumber portion of the filename. This is critical to protect against
* a possible buffer overrun.
*/
/* Look for a non-empty string of digits (that isn't too long). */
for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
;
- if (pos == 0 || pos > OIDCHARS)
+ if (pos == 0 || pos > RELNUMBERCHARS)
return false;
*relnumchars = pos;
BlockNumber blknum;
BufferGetTag(buf, &rlocator, &forknum, &blknum);
- elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
+ elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/" UINT64_FORMAT,
blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber);
/* make sure we hold an exclusive lock */
# 45 was XactTruncationLock until removal of BackendRandomLock
WrapLimitsVacuumLock 46
NotifyQueueTailLock 47
+RelFileNumberGenLock 48
\ No newline at end of file
* next checkpoint, we prevent reassignment of the relfilenumber until it's
* safe, because relfilenumber assignment skips over any existing file.
*
+ * XXX. Although all of this was true when relfilenumbers were 32 bits wide,
+ * they are now 56 bits wide and do not wrap around, so in the future we can
+ * change the code to immediately unlink the first segment of the relation
+ * along with all the others. We still do reuse relfilenumbers when createdb()
+ * is performed using the file-copy method or during movedb(), but the scenario
+ * described above can only happen when creating a new relation.
+ *
* We do not need to go through this dance for temp relations, though, because
* we never make WAL entries for temp rels, and so a temp rel poses no threat
* to the health of a regular rel that has taken over its relfilenumber.
/* First time through: initialize the hash table */
HASHCTL ctl;
- ctl.keysize = sizeof(RelFileLocatorBackend);
+ ctl.keysize = SizeOfRelFileLocatorBackend;
ctl.entrysize = sizeof(SMgrRelationData);
SMgrRelationHash = hash_create("smgr relation table", 400,
&ctl, HASH_ELEM | HASH_BLOBS);
if (!RelFileNumberIsValid(result))
PG_RETURN_NULL();
- PG_RETURN_OID(result);
+ PG_RETURN_INT64(result);
}
/*
pg_filenode_relation(PG_FUNCTION_ARGS)
{
Oid reltablespace = PG_GETARG_OID(0);
- RelFileNumber relfilenumber = PG_GETARG_OID(1);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(1);
Oid heaprel;
+ /* check whether the relfilenumber is within a valid range */
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
+
/* test needed so RelidByRelfilenumber doesn't misbehave */
if (!RelFileNumberIsValid(relfilenumber))
PG_RETURN_NULL();
#include "catalog/pg_type.h"
#include "commands/extension.h"
#include "miscadmin.h"
+#include "storage/relfilelocator.h"
#include "utils/array.h"
#include "utils/builtins.h"
+/*
+ * Record the relfilenumber (argument 0, passed as int8) in
+ * binary_upgrade_next_heap_pg_class_relfilenumber, after validating that it
+ * lies within the allowed relfilenumber range.
+ */
Datum
binary_upgrade_set_next_heap_relfilenode(PG_FUNCTION_ARGS)
{
- RelFileNumber relfilenumber = PG_GETARG_OID(0);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(0);
CHECK_IS_BINARY_UPGRADE;
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
binary_upgrade_next_heap_pg_class_relfilenumber = relfilenumber;
+ /* presumably keeps later allocations above this value -- TODO confirm */
+ SetNextRelFileNumber(relfilenumber + 1);
PG_RETURN_VOID();
}
+/*
+ * Record the relfilenumber (argument 0, passed as int8) in
+ * binary_upgrade_next_index_pg_class_relfilenumber, after validating that it
+ * lies within the allowed relfilenumber range.
+ */
Datum
binary_upgrade_set_next_index_relfilenode(PG_FUNCTION_ARGS)
{
- RelFileNumber relfilenumber = PG_GETARG_OID(0);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(0);
CHECK_IS_BINARY_UPGRADE;
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
binary_upgrade_next_index_pg_class_relfilenumber = relfilenumber;
+ /* presumably keeps later allocations above this value -- TODO confirm */
+ SetNextRelFileNumber(relfilenumber + 1);
PG_RETURN_VOID();
}
+/*
+ * Record the relfilenumber (argument 0, passed as int8) in
+ * binary_upgrade_next_toast_pg_class_relfilenumber, after validating that it
+ * lies within the allowed relfilenumber range.
+ */
Datum
binary_upgrade_set_next_toast_relfilenode(PG_FUNCTION_ARGS)
{
- RelFileNumber relfilenumber = PG_GETARG_OID(0);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(0);
CHECK_IS_BINARY_UPGRADE;
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
binary_upgrade_next_toast_pg_class_relfilenumber = relfilenumber;
+ /* presumably keeps later allocations above this value -- TODO confirm */
+ SetNextRelFileNumber(relfilenumber + 1);
PG_RETURN_VOID();
}
{
/* Allocate a new relfilenumber */
newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
- NULL, persistence);
+ persistence);
}
else if (relation->rd_rel->relkind == RELKIND_INDEX)
{
/* set scan arguments */
skey[0].sk_argument = ObjectIdGetDatum(reltablespace);
- skey[1].sk_argument = ObjectIdGetDatum(relfilenumber);
+ skey[1].sk_argument = Int64GetDatum((int64) relfilenumber);
scandesc = systable_beginscan(relation,
ClassTblspcRelfilenodeIndexId,
if (found)
elog(ERROR,
- "unexpected duplicate for tablespace %u, relfilenumber %u",
+ "unexpected duplicate for tablespace %u, relfilenumber " UINT64_FORMAT,
reltablespace, relfilenumber);
found = true;
Datum
pg_control_checkpoint(PG_FUNCTION_ARGS)
{
- Datum values[18];
- bool nulls[18];
+ Datum values[19];
+ bool nulls[19];
TupleDesc tupdesc;
HeapTuple htup;
ControlFileData *ControlFile;
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 18, "checkpoint_time",
TIMESTAMPTZOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 19, "next_relfilenumber",
+ INT8OID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
/* Read the control file. */
values[17] = TimestampTzGetDatum(time_t_to_timestamptz(ControlFile->checkPointCopy.time));
nulls[17] = false;
+ values[18] = Int64GetDatum((int64) ControlFile->checkPointCopy.nextRelFileNumber);
+ nulls[18] = false;
+
htup = heap_form_tuple(tupdesc, values, nulls);
PG_RETURN_DATUM(HeapTupleGetDatum(htup));
mode = PG_MODE_ENABLE;
break;
case 'f':
- if (!option_parse_int(optarg, "-f/--filenode", 0,
- INT_MAX,
- NULL))
+ if (!option_parse_relfilenumber(optarg, "-f/--filenode"))
exit(1);
only_filenode = pstrdup(optarg);
break;
printf(_("Latest checkpoint's NextXID: %u:%u\n"),
EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid),
XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid));
+ printf(_("Latest checkpoint's NextRelFileNumber:%llu\n"),
+ (unsigned long long) ControlFile->checkPointCopy.nextRelFileNumber);
printf(_("Latest checkpoint's NextOID: %u\n"),
ControlFile->checkPointCopy.nextOid);
printf(_("Latest checkpoint's NextMultiXactId: %u\n"),
atooid(PQgetvalue(lo_res, i, ii_oid)));
oid = atooid(PQgetvalue(lo_res, i, ii_oid));
- relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode));
+ relfilenumber = atorelnumber(PQgetvalue(lo_res, i, ii_relfilenode));
if (oid == LargeObjectRelationId)
appendPQExpBuffer(loOutQry,
- "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
else if (oid == LargeObjectLOidPNIndexId)
appendPQExpBuffer(loOutQry,
- "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
}
relkind = *PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "relkind"));
- relfilenumber = atooid(PQgetvalue(upgrade_res, 0,
- PQfnumber(upgrade_res, "relfilenode")));
+ relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0,
+ PQfnumber(upgrade_res, "relfilenode")));
toast_oid = atooid(PQgetvalue(upgrade_res, 0,
PQfnumber(upgrade_res, "reltoastrelid")));
- toast_relfilenumber = atooid(PQgetvalue(upgrade_res, 0,
- PQfnumber(upgrade_res, "toast_relfilenode")));
+ toast_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0,
+ PQfnumber(upgrade_res, "toast_relfilenode")));
toast_index_oid = atooid(PQgetvalue(upgrade_res, 0,
PQfnumber(upgrade_res, "indexrelid")));
- toast_index_relfilenumber = atooid(PQgetvalue(upgrade_res, 0,
- PQfnumber(upgrade_res, "toast_index_relfilenode")));
+ toast_index_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0,
+ PQfnumber(upgrade_res, "toast_index_relfilenode")));
appendPQExpBufferStr(upgrade_buffer,
"\n-- For binary upgrade, must preserve pg_class oids and relfilenodes\n");
*/
if (RelFileNumberIsValid(relfilenumber) && relkind != RELKIND_PARTITIONED_TABLE)
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
/*
"SELECT pg_catalog.binary_upgrade_set_next_toast_pg_class_oid('%u'::pg_catalog.oid);\n",
toast_oid);
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
toast_relfilenumber);
/* every toast table has an index */
"SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n",
toast_index_oid);
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
toast_index_relfilenumber);
}
"SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n",
pg_class_oid);
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
}
segNo = 0;
matched = false;
- nmatch = sscanf(path, "global/%u.%u", &rlocator.relNumber, &segNo);
+ nmatch = sscanf(path, "global/" UINT64_FORMAT ".%u", &rlocator.relNumber, &segNo);
if (nmatch == 1 || nmatch == 2)
{
rlocator.spcOid = GLOBALTABLESPACE_OID;
}
else
{
- nmatch = sscanf(path, "base/%u/%u.%u",
+ nmatch = sscanf(path, "base/%u/" UINT64_FORMAT ".%u",
&rlocator.dbOid, &rlocator.relNumber, &segNo);
if (nmatch == 2 || nmatch == 3)
{
}
else
{
- nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u",
+ nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/" UINT64_FORMAT ".%u",
&rlocator.spcOid, &rlocator.dbOid, &rlocator.relNumber,
&segNo);
if (nmatch == 3 || nmatch == 4)
relname = PQgetvalue(res, relnum, i_relname);
curr->relname = pg_strdup(relname);
- curr->relfilenumber = atooid(PQgetvalue(res, relnum, i_relfilenumber));
+ curr->relfilenumber =
+ atorelnumber(PQgetvalue(res, relnum, i_relfilenumber));
curr->tblsp_alloc = false;
/* Is the tablespace oid non-default? */
* oids are the same between old and new clusters. This is important
* because toast oids are stored as toast pointers in user tables.
*
- * While pg_class.oid and pg_class.relfilenode are initially the same in a
- * cluster, they can diverge due to CLUSTER, REINDEX, or VACUUM FULL. We
- * control assignments of pg_class.relfilenode because we want the filenames
- * to match between the old and new cluster.
+ * We control assignments of pg_class.relfilenode because we want the
+ * filenames to match between the old and new cluster.
*
* We control assignment of pg_tablespace.oid because we want the oid to match
* between the old and new cluster.
else
snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno);
- snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s",
+ snprintf(old_file, sizeof(old_file), "%s%s/%u/" UINT64_FORMAT "%s%s",
map->old_tablespace,
map->old_tablespace_suffix,
map->db_oid,
map->relfilenumber,
type_suffix,
extent_suffix);
- snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s",
+ snprintf(new_file, sizeof(new_file), "%s%s/%u/" UINT64_FORMAT "%s%s",
map->new_tablespace,
map->new_tablespace_suffix,
map->db_oid,
}
break;
case 'R':
- if (sscanf(optarg, "%u/%u/%u",
+ if (sscanf(optarg, "%u/%u/" UINT64_FORMAT,
&config.filter_by_relation.spcOid,
&config.filter_by_relation.dbOid,
&config.filter_by_relation.relNumber) != 3 ||
# REINDEX operations. A set of relfilenodes is saved from the catalogs
# and then compared with pg_class.
$node->safe_psql('postgres',
- 'CREATE TABLE index_relfilenodes (parent regclass, indname text, indoid oid, relfilenode oid);'
+ 'CREATE TABLE index_relfilenodes (parent regclass, indname text, indoid oid, relfilenode int8);'
);
# Save the relfilenode of a set of toast indexes, one from the catalog
# pg_constraint and one from the test table.
Assert(dbOid == 0);
Assert(backendId == InvalidBackendId);
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("global/%u_%s",
+ path = psprintf("global/" UINT64_FORMAT "_%s",
relNumber, forkNames[forkNumber]);
else
- path = psprintf("global/%u", relNumber);
+ path = psprintf("global/" UINT64_FORMAT, relNumber);
}
else if (spcOid == DEFAULTTABLESPACE_OID)
{
if (backendId == InvalidBackendId)
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("base/%u/%u_%s",
+ path = psprintf("base/%u/" UINT64_FORMAT "_%s",
dbOid, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("base/%u/%u",
+ path = psprintf("base/%u/" UINT64_FORMAT,
dbOid, relNumber);
}
else
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("base/%u/t%d_%u_%s",
+ path = psprintf("base/%u/t%d_" UINT64_FORMAT "_%s",
dbOid, backendId, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("base/%u/t%d_%u",
+ path = psprintf("base/%u/t%d_" UINT64_FORMAT,
dbOid, backendId, relNumber);
}
}
if (backendId == InvalidBackendId)
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("pg_tblspc/%u/%s/%u/%u_%s",
+ path = psprintf("pg_tblspc/%u/%s/%u/" UINT64_FORMAT "_%s",
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("pg_tblspc/%u/%s/%u/%u",
+ path = psprintf("pg_tblspc/%u/%s/%u/" UINT64_FORMAT,
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, relNumber);
}
else
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s",
+ path = psprintf("pg_tblspc/%u/%s/%u/t%d_" UINT64_FORMAT "_%s",
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, backendId, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u",
+ path = psprintf("pg_tblspc/%u/%s/%u/t%d_" UINT64_FORMAT,
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, backendId, relNumber);
}
#include "postgres_fe.h"
#include "common/logging.h"
+#include "common/relpath.h"
#include "common/string.h"
#include "fe_utils/option_utils.h"
*result = val;
return true;
}
+
+/*
+ * option_parse_relfilenumber
+ *
+ * Parse relfilenumber value for an option. If the parsing is successful,
+ * returns true; if parsing fails, logs an error and returns false.
+ *
+ * optarg is the option's argument text and optname is the option name used
+ * in error messages. A valid value is a non-empty decimal number no greater
+ * than MAX_RELFILENUMBER, optionally followed by whitespace.
+ */
+bool
+option_parse_relfilenumber(const char *optarg, const char *optname)
+{
+ char *endptr;
+ uint64 val;
+
+ errno = 0;
+ val = strtou64(optarg, &endptr, 10);
+
+ /*
+ * If no characters were consumed, the input was empty or contained no
+ * number at all; fail rather than silently treating it as zero.
+ */
+ if (endptr == optarg)
+ {
+ pg_log_error("invalid value \"%s\" for option %s",
+ optarg, optname);
+ return false;
+ }
+
+ /*
+ * Skip any trailing whitespace; if anything but whitespace remains before
+ * the terminating character, fail.
+ */
+ while (*endptr != '\0' && isspace((unsigned char) *endptr))
+ endptr++;
+
+ if (*endptr != '\0')
+ {
+ pg_log_error("invalid value \"%s\" for option %s",
+ optarg, optname);
+ return false;
+ }
+
+ /* An overflowing conversion is out of range, just like a too-large value. */
+ if (errno == ERANGE || val > MAX_RELFILENUMBER)
+ {
+ pg_log_error("%s must be in range " UINT64_FORMAT ".." UINT64_FORMAT,
+ optname, UINT64CONST(0), MAX_RELFILENUMBER);
+ return false;
+ }
+
+ return true;
+}
#define TRANSAM_H
#include "access/xlogdefs.h"
+#include "common/relpath.h"
/* ----------------
#define FirstUnpinnedObjectId 12000
#define FirstNormalObjectId 16384
+/* ----------
+ * RelFileNumbers are normally assigned sequentially beginning with
+ * FirstNormalRelFileNumber, but for system tables the initial RelFileNumber
+ * is equal to the table OID. This scheme allows pg_upgrade to work: we expect
+ * that the new cluster will contain only system tables, and that none of those
+ * will have previously been rewritten, so any RelFileNumber which is in use
+ * in both the old and new clusters will be used for the same relation in both
+ * places.
+ *
+ * This is important because pg_upgrade can't reactively move conflicting
+ * relations out of the way. If it tries to set the RelFileNumber for a
+ * relation to some value that's already in use by a different relation, the
+ * upgrade will just fail. It's OK if the same RelFileNumber is used for the
+ * same relation, though, since then nothing needs to be changed.
+ * ----------
+ */
+#define FirstNormalRelFileNumber ((RelFileNumber) 100000)
+
+/*
+ * Report an ERROR with ERRCODE_INVALID_PARAMETER_VALUE if relfilenumber is
+ * negative or greater than MAX_RELFILENUMBER.
+ *
+ * The argument is evaluated more than once, so it must not have side effects.
+ *
+ * NOTE(review): when the argument has type RelFileNumber (uint64) the "< 0"
+ * test can never be true; a negative int8 input assigned to such a variable
+ * is then rejected by the upper-bound test instead.
+ */
+#define CHECK_RELFILENUMBER_RANGE(relfilenumber) \
+do { \
+ if ((relfilenumber) < 0 || (relfilenumber) > MAX_RELFILENUMBER) \
+ ereport(ERROR, \
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
+ errmsg("relfilenumber %llu is out of range", \
+ (unsigned long long) (relfilenumber))); \
+} while (0)
+
/*
* VariableCache is a data structure in shared memory that is used to track
* OID and XID assignment state. For largely historical reasons, there is
Oid nextOid; /* next OID to assign */
uint32 oidCount; /* OIDs available before must do XLOG work */
+ /*
+ * These fields are protected by RelFileNumberGenLock.
+ */
+ RelFileNumber nextRelFileNumber; /* next relfilenumber to assign */
+ RelFileNumber loggedRelFileNumber; /* last logged relfilenumber */
+ RelFileNumber flushedRelFileNumber; /* last flushed relfilenumber */
+ XLogRecPtr loggedRelFileNumberRecPtr; /* xlog record pointer w.r.t.
+ * loggedRelFileNumber */
+
/*
* These fields are protected by XidGenLock.
*/
extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid);
extern bool ForceTransactionIdLimitUpdate(void);
extern Oid GetNewObjectId(void);
+extern RelFileNumber GetNewRelFileNumber(Oid reltablespace,
+ char relpersistence);
+extern void SetNextRelFileNumber(RelFileNumber relnumber);
extern void StopGeneratingPinnedObjectIds(void);
#ifdef USE_ASSERT_CHECKING
extern bool CreateRestartPoint(int flags);
extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN);
extern void XLogPutNextOid(Oid nextOid);
+extern XLogRecPtr LogNextRelFileNumber(RelFileNumber nextrelnumber);
extern XLogRecPtr XLogRestorePoint(const char *rpName);
extern void UpdateFullPageWrites(void);
extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p);
extern Oid GetNewOidWithIndex(Relation relation, Oid indexId,
AttrNumber oidcolumn);
-extern RelFileNumber GetNewRelFileNumber(Oid reltablespace,
- Relation pg_class,
- char relpersistence);
#endif /* CATALOG_H */
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202209261
+#define CATALOG_VERSION_NO 202209271
#endif
/* oid */
Oid oid;
+ /* access method; 0 if not a table / index */
+ Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am);
+
+ /* identifier of physical storage file */
+ /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
+ int64 relfilenode BKI_DEFAULT(0);
+
/* class name */
NameData relname;
/* class owner */
Oid relowner BKI_DEFAULT(POSTGRES) BKI_LOOKUP(pg_authid);
- /* access method; 0 if not a table / index */
- Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am);
-
- /* identifier of physical storage file */
- /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
- Oid relfilenode BKI_DEFAULT(0);
-
/* identifier of table space for relation (0 means default for database) */
Oid reltablespace BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_tablespace);
DECLARE_UNIQUE_INDEX_PKEY(pg_class_oid_index, 2662, ClassOidIndexId, on pg_class using btree(oid oid_ops));
DECLARE_UNIQUE_INDEX(pg_class_relname_nsp_index, 2663, ClassNameNspIndexId, on pg_class using btree(relname name_ops, relnamespace oid_ops));
-DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode oid_ops));
+DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode int8_ops));
#ifdef EXPOSE_TO_CLIENT_CODE
* timeline (equals ThisTimeLineID otherwise) */
bool fullPageWrites; /* current full_page_writes */
FullTransactionId nextXid; /* next free transaction ID */
+ RelFileNumber nextRelFileNumber; /* next relfilenumber */
Oid nextOid; /* next free OID */
MultiXactId nextMulti; /* next free MultiXactId */
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
#define XLOG_FPI 0xB0
/* 0xC0 is used in Postgres 9.5-11 */
#define XLOG_OVERWRITE_CONTRECORD 0xD0
+#define XLOG_NEXT_RELFILENUMBER 0xE0
/*
proname => 'pg_indexes_size', provolatile => 'v', prorettype => 'int8',
proargtypes => 'regclass', prosrc => 'pg_indexes_size' },
{ oid => '2999', descr => 'filenode identifier of relation',
- proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'oid',
+ proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'int8',
proargtypes => 'regclass', prosrc => 'pg_relation_filenode' },
{ oid => '3454', descr => 'relation OID for filenode and tablespace',
proname => 'pg_filenode_relation', provolatile => 's',
- prorettype => 'regclass', proargtypes => 'oid oid',
+ prorettype => 'regclass', proargtypes => 'oid int8',
prosrc => 'pg_filenode_relation' },
{ oid => '3034', descr => 'file path of relation',
proname => 'pg_relation_filepath', provolatile => 's', prorettype => 'text',
prosrc => 'binary_upgrade_set_missing_value' },
{ oid => '4545', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_heap_relfilenode', provolatile => 'v',
- proparallel => 'u', prorettype => 'void', proargtypes => 'oid',
+ proparallel => 'u', prorettype => 'void', proargtypes => 'int8',
prosrc => 'binary_upgrade_set_next_heap_relfilenode' },
{ oid => '4546', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_index_relfilenode', provolatile => 'v',
- proparallel => 'u', prorettype => 'void', proargtypes => 'oid',
+ proparallel => 'u', prorettype => 'void', proargtypes => 'int8',
prosrc => 'binary_upgrade_set_next_index_relfilenode' },
{ oid => '4547', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_toast_relfilenode', provolatile => 'v',
- proparallel => 'u', prorettype => 'void', proargtypes => 'oid',
+ proparallel => 'u', prorettype => 'void', proargtypes => 'int8',
prosrc => 'binary_upgrade_set_next_toast_relfilenode' },
{ oid => '4548', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_pg_tablespace_oid', provolatile => 'v',
/*
* RelFileNumber data type identifies the specific relation file name.
*/
-typedef Oid RelFileNumber;
-#define InvalidRelFileNumber ((RelFileNumber) InvalidOid)
+typedef uint64 RelFileNumber;
+#define InvalidRelFileNumber ((RelFileNumber) 0)
#define RelFileNumberIsValid(relnumber) \
((bool) ((relnumber) != InvalidRelFileNumber))
+#define atorelnumber(x) ((RelFileNumber) strtou64((x), NULL, 10))
+#define MAX_RELFILENUMBER UINT64CONST(0x00FFFFFFFFFFFFFF)
/*
* Name of major-version-specific tablespace subdirectories
/* Characters to allow for an OID in a relation path */
#define OIDCHARS 10 /* max chars printed by %u */
+#define RELNUMBERCHARS 20 /* max chars printed by UINT64_FORMAT */
/*
* Stuff for fork names.
extern bool option_parse_int(const char *optarg, const char *optname,
int min_range, int max_range,
int *result);
+extern bool option_parse_relfilenumber(const char *optarg,
+ const char *optname);
#endif /* OPTION_UTILS_H */
{
Oid spcOid; /* tablespace oid */
Oid dbOid; /* database oid */
- RelFileNumber relNumber; /* relation file number */
- ForkNumber forkNum; /* fork number */
+
+ /*
+ * relForkDetails[] stores the fork number in the high 8 bits of the first
+ * integer; the remaining 56 bits are used to store the relfilenumber.
+ * Expanding the relfilenumber to a full 64 bits would require widening
+ * the BufferTag, which is undesirable for performance reasons. We use
+ * two 32-bit values here rather than a single 64-bit value to avoid
+ * padding the struct out to a multiple of 8 bytes.
+ */
+ uint32 relForkDetails[2];
BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTag;
+/* High relNumber bits in relForkDetails[0] */
+#define BUFTAG_RELNUM_HIGH_BITS 24
+
+/* Low relNumber bits in relForkDetails[1] */
+#define BUFTAG_RELNUM_LOW_BITS 32
+
+/* Mask to fetch high bits of relNumber from relForkDetails[0] */
+#define BUFTAG_RELNUM_HIGH_MASK ((1U << BUFTAG_RELNUM_HIGH_BITS) - 1)
+
+/* Mask to fetch low bits of relNumber from relForkDetails[1] */
+#define BUFTAG_RELNUM_LOW_MASK 0XFFFFFFFF
+
+/*
+ * Reassemble the relation file number stored in a buffer tag: the low
+ * BUFTAG_RELNUM_HIGH_BITS bits of relForkDetails[0] supply the high part and
+ * relForkDetails[1] supplies the low BUFTAG_RELNUM_LOW_BITS bits.
+ */
static inline RelFileNumber
BufTagGetRelNumber(const BufferTag *tag)
{
- return tag->relNumber;
+ uint64 relnum;
+
+ /* mask off the bits above the relfilenumber's high part */
+ relnum = ((uint64) tag->relForkDetails[0]) & BUFTAG_RELNUM_HIGH_MASK;
+ relnum = (relnum << BUFTAG_RELNUM_LOW_BITS) | tag->relForkDetails[1];
+
+ Assert(relnum <= MAX_RELFILENUMBER);
+ return (RelFileNumber) relnum;
}
+/*
+ * Extract the fork number from a buffer tag; it is stored in the bits of
+ * relForkDetails[0] above the low BUFTAG_RELNUM_HIGH_BITS bits.
+ */
static inline ForkNumber
BufTagGetForkNum(const BufferTag *tag)
{
- return tag->forkNum;
+ ForkNumber ret;
+
+ StaticAssertStmt(MAX_FORKNUM <= INT8_MAX,
+ "MAX_FORKNUM can't be greater than INT8_MAX");
+
+ /*
+ * The shifted value is narrowed through int8 before being widened to
+ * ForkNumber; presumably this lets a negative fork number round-trip --
+ * TODO confirm.
+ */
+ ret = (int8) (tag->relForkDetails[0] >> BUFTAG_RELNUM_HIGH_BITS);
+ return ret;
}
+/*
+ * Store a relation file number and fork number into a buffer tag.
+ *
+ * relForkDetails[0] receives the fork number in the bits above the low
+ * BUFTAG_RELNUM_HIGH_BITS bits, and the high part of relnumber in those low
+ * bits; relForkDetails[1] receives the low BUFTAG_RELNUM_LOW_BITS bits of
+ * relnumber.
+ */
static inline void
BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
ForkNumber forknum)
{
- tag->relNumber = relnumber;
- tag->forkNum = forknum;
+ Assert(relnumber <= MAX_RELFILENUMBER);
+ Assert(forknum <= MAX_FORKNUM);
+
+ tag->relForkDetails[0] = (relnumber >> BUFTAG_RELNUM_LOW_BITS) &
+ BUFTAG_RELNUM_HIGH_MASK;
+
+ /*
+ * Convert to unsigned before shifting: the assertion above does not rule
+ * out a negative fork number, and left-shifting a negative signed value
+ * is undefined behavior in C. The unsigned shift yields the same bit
+ * pattern that BufTagGetForkNum() decodes.
+ */
+ tag->relForkDetails[0] |= ((uint32) forknum) << BUFTAG_RELNUM_HIGH_BITS;
+ tag->relForkDetails[1] = relnumber & BUFTAG_RELNUM_LOW_MASK;
}
static inline RelFileLocator
{
return (tag1->spcOid == tag2->spcOid) &&
(tag1->dbOid == tag2->dbOid) &&
- (tag1->relNumber == tag2->relNumber) &&
- (tag1->blockNum == tag2->blockNum) &&
- (tag1->forkNum == tag2->forkNum);
+ (tag1->relForkDetails[0] == tag2->relForkDetails[0]) &&
+ (tag1->relForkDetails[1] == tag2->relForkDetails[1]) &&
+ (tag1->blockNum == tag2->blockNum);
}
static inline bool
* Nonzero dbOid values correspond to pg_database.oid.
*
* relNumber identifies the specific relation. relNumber corresponds to
- * pg_class.relfilenode (NOT pg_class.oid, because we need to be able
- * to assign new physical files to relations in some situations).
- * Notice that relNumber is only unique within a database in a particular
- * tablespace.
+ * pg_class.relfilenode. Notice that relNumber values are assigned by
+ * GetNewRelFileNumber(), which will only ever assign the same value once
+ * during the lifetime of a cluster. However, since CREATE DATABASE duplicates
+ * the relfilenumbers of the template database, the values are in practice only
+ * unique within a database, not globally.
*
* Note: spcOid must be GLOBALTABLESPACE_OID if and only if dbOid is
* zero. We support shared relations only in the "global" tablespace.
BackendId backend;
} RelFileLocatorBackend;
+#define SizeOfRelFileLocatorBackend \
+ (offsetof(RelFileLocatorBackend, backend) + sizeof(BackendId))
+
#define RelFileLocatorBackendIsTemp(rlocator) \
((rlocator).backend != InvalidBackendId)
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
relname | orig_oid | storage | desc
------------------------------+----------+---------+---------------
at_partitioned | t | none |
- at_partitioned_0 | t | own |
- at_partitioned_0_id_name_key | t | own | child 0 index
- at_partitioned_1 | t | own |
- at_partitioned_1_id_name_key | t | own | child 1 index
+ at_partitioned_0 | t | orig |
+ at_partitioned_0_id_name_key | t | orig | child 0 index
+ at_partitioned_1 | t | orig |
+ at_partitioned_1_id_name_key | t | orig | child 1 index
at_partitioned_id_name_key | t | none | parent index
(6 rows)
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
relname | orig_oid | storage | desc
------------------------------+----------+---------+--------------
at_partitioned | t | none |
- at_partitioned_0 | t | own |
- at_partitioned_0_id_name_key | f | own | parent index
- at_partitioned_1 | t | own |
- at_partitioned_1_id_name_key | f | own | parent index
+ at_partitioned_0 | t | orig |
+ at_partitioned_0_id_name_key | f | new | parent index
+ at_partitioned_1 | t | orig |
+ at_partitioned_1_id_name_key | f | new | parent index
at_partitioned_id_name_key | f | none | parent index
(6 rows)
RETURNS boolean
LANGUAGE plpgsql AS $$
DECLARE
- v_relfilenode oid;
+ v_relfilenode int8;
BEGIN
v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename;
--
SET search_path = fast_default;
CREATE SCHEMA fast_default;
-CREATE TABLE m(id OID);
-INSERT INTO m VALUES (NULL::OID);
+CREATE TABLE m(id BIGINT);
+INSERT INTO m VALUES (NULL::BIGINT);
CREATE FUNCTION set(tabname name) RETURNS VOID
AS $$
BEGIN
NOTICE: checking pg_attribute {attrelid} => pg_class {oid}
NOTICE: checking pg_attribute {atttypid} => pg_type {oid}
NOTICE: checking pg_attribute {attcollation} => pg_collation {oid}
+NOTICE: checking pg_class {relam} => pg_am {oid}
NOTICE: checking pg_class {relnamespace} => pg_namespace {oid}
NOTICE: checking pg_class {reltype} => pg_type {oid}
NOTICE: checking pg_class {reloftype} => pg_type {oid}
NOTICE: checking pg_class {relowner} => pg_authid {oid}
-NOTICE: checking pg_class {relam} => pg_am {oid}
NOTICE: checking pg_class {reltablespace} => pg_tablespace {oid}
NOTICE: checking pg_class {reltoastrelid} => pg_class {oid}
NOTICE: checking pg_class {relrewrite} => pg_class {oid}
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
RETURNS boolean
LANGUAGE plpgsql AS $$
DECLARE
- v_relfilenode oid;
+ v_relfilenode int8;
BEGIN
v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename;
SET search_path = fast_default;
CREATE SCHEMA fast_default;
-CREATE TABLE m(id OID);
-INSERT INTO m VALUES (NULL::OID);
+CREATE TABLE m(id BIGINT);
+INSERT INTO m VALUES (NULL::BIGINT);
CREATE FUNCTION set(tabname name) RETURNS VOID
AS $$