<entry>shared memory allocations</entry>
</row>
+ <row>
+ <entry><link linkend="view-pg-shmem-allocations-numa"><structname>pg_shmem_allocations_numa</structname></link></entry>
+ <entry>NUMA node mappings for shared memory allocations</entry>
+ </row>
+
<row>
<entry><link linkend="view-pg-stats"><structname>pg_stats</structname></link></entry>
<entry>planner statistics</entry>
</para>
</sect1>
+ <sect1 id="view-pg-shmem-allocations-numa">
+ <title><structname>pg_shmem_allocations_numa</structname></title>
+
+ <indexterm zone="view-pg-shmem-allocations-numa">
+ <primary>pg_shmem_allocations_numa</primary>
+ </indexterm>
+
+ <para>
+ The <structname>pg_shmem_allocations_numa</structname> view shows how shared
+ memory allocations in the server's main shared memory segment are distributed
+ across NUMA nodes. This includes both memory allocated by
+ <productname>PostgreSQL</productname> itself and memory allocated
+ by extensions using the mechanisms detailed in
+ <xref linkend="xfunc-shared-addin" />. This view will output multiple rows
+ for each of the shared memory segments provided that they are spread across
+ multiple NUMA nodes. This view should not be queried by monitoring systems
+ as it is very slow and may end up allocating shared memory in case it was not
+ used earlier.
+ A current limitation of this view is that it does not show anonymous shared
+ memory allocations.
+ </para>
+
+ <para>
+ Note that this view does not include memory allocated using the dynamic
+ shared memory infrastructure.
+ </para>
+
+ <warning>
+ <para>
+ When determining the <acronym>NUMA</acronym> node, the view touches
+ all memory pages for the shared memory segment. This will force
+ allocation of the shared memory, if it wasn't allocated already,
+ and the memory may get allocated in a single <acronym>NUMA</acronym>
+ node (depending on system configuration).
+ </para>
+ </warning>
+
+ <table>
+ <title><structname>pg_shmem_allocations_numa</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>name</structfield> <type>text</type>
+ </para>
+ <para>
+ The name of the shared memory allocation.
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>numa_node</structfield> <type>int4</type>
+ </para>
+ <para>
+ ID of <acronym>NUMA</acronym> node
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>size</structfield> <type>int8</type>
+ </para>
+ <para>
+ Size of the allocation on this particular NUMA memory node in bytes
+ </para></entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ <para>
+ By default, the <structname>pg_shmem_allocations_numa</structname> view can be
+ read only by superusers or roles with privileges of the
+ <literal>pg_read_all_stats</literal> role.
+ </para>
+ </sect1>
+
<sect1 id="view-pg-stats">
<title><structname>pg_stats</structname></title>
REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
+-- Distribution of shared memory allocations across NUMA nodes, as reported
+-- by pg_get_shmem_allocations_numa().
+CREATE VIEW pg_shmem_allocations_numa AS
+    SELECT *
+    FROM pg_get_shmem_allocations_numa();
+
+-- Remove PUBLIC access to the view and to its underlying function, then
+-- grant read/execute access to pg_read_all_stats.
+REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC;
+GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats;
+
CREATE VIEW pg_backend_memory_contexts AS
SELECT * FROM pg_get_backend_memory_contexts();
#include "fmgr.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "port/pg_numa.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+/* To get reliable results for NUMA inquiry we need to "touch pages" once */
+static bool firstNumaTouch = true;
/*
* InitShmemAccess() --- set up basic pointers to shared memory.
return (Datum) 0;
}
+
+/*
+ * SQL SRF showing NUMA memory nodes for allocated shared memory
+ *
+ * For every entry in the shmem index, one row is emitted per NUMA node
+ * (0 .. pg_numa_get_max_node()), giving the entry name, the node id and the
+ * number of bytes of the entry found on that node.  Sizes are counted in
+ * whole OS pages, and nodes holding no page of an entry are reported with
+ * size 0.
+ *
+ * Compared to pg_get_shmem_allocations(), this function does not return
+ * information about shared anonymous allocations and unused shared memory.
+ *
+ * Raises an error if pg_numa_init() fails, if the NUMA page inquiry fails,
+ * or if a page reports a node id outside of [0, pg_numa_get_max_node()].
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ HASH_SEQ_STATUS hstat;
+ ShmemIndexEnt *ent;
+ Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+ bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+ Size os_page_size;
+ void **page_ptrs;
+ int *pages_status;
+ uint64 shm_total_page_count,
+ shm_ent_page_count,
+ max_nodes;
+ Size *nodes;
+
+ if (pg_numa_init() == -1)
+ elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ /* nodes[] holds a per-node page counter, indexed by NUMA node id */
+ max_nodes = pg_numa_get_max_node();
+ nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+ /*
+ * Shared memory entries are not necessarily aligned to OS memory pages,
+ * so we need the OS page size to work out which OS pages each entry
+ * covers.
+ *
+ * This information is needed before calling pg_numa_query_pages() for
+ * NUMA memory node inquiry.
+ */
+ os_page_size = pg_numa_get_pagesize();
+
+ /*
+ * Allocate memory for page pointers and status based on total shared
+ * memory size. This simplified approach allocates enough space for all
+ * pages in shared memory rather than calculating the exact requirements
+ * for each segment.
+ *
+ * Add 1, because we don't know how exactly the segments align to OS
+ * pages, so the allocation might use one more memory page. In practice
+ * this is not very likely, and moreover we have more entries, each of
+ * them using only fraction of the total pages.
+ */
+ shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+ page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+ pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+ if (firstNumaTouch)
+ elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ hash_seq_init(&hstat, ShmemIndex);
+
+ /* output all allocated entries */
+ memset(nulls, 0, sizeof(nulls));
+ while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ int i;
+ char *startptr,
+ *endptr;
+ Size total_len;
+
+ /*
+ * Calculate the range of OS pages used by this segment. The segment
+ * may start / end half-way through a page, we want to count these
+ * pages too. So we align the start/end pointers down/up, and then
+ * calculate the number of pages from that.
+ */
+ startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+ endptr = (char *) TYPEALIGN(os_page_size,
+ (char *) ent->location + ent->allocated_size);
+ total_len = (endptr - startptr);
+
+ shm_ent_page_count = total_len / os_page_size;
+
+ /*
+ * Pre-fill the status array with 0xff bytes (-1 as int).  If we ever
+ * see -1 after the kernel inquiry, then we probably have a bug in
+ * mapping this entry to OS pages.
+ */
+ memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+ /*
+ * Setup page_ptrs[] with pointers to all OS pages for this segment,
+ * and get the NUMA status using pg_numa_query_pages.
+ *
+ * In order to get reliable results we also need to touch memory
+ * pages, so that inquiry about NUMA memory node doesn't return -2
+ * (ENOENT, which indicates unmapped/unallocated pages).  The touching
+ * is done only while firstNumaTouch is still set, i.e. until the first
+ * call in this process has completed.
+ */
+ for (i = 0; i < shm_ent_page_count; i++)
+ {
+ volatile uint64 touch pg_attribute_unused();
+
+ page_ptrs[i] = startptr + (i * os_page_size);
+
+ if (firstNumaTouch)
+ pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+ elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+ /* Count the pages of this shared memory entry found on each NUMA node */
+ memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+ for (i = 0; i < shm_ent_page_count; i++)
+ {
+ int s = pages_status[i];
+
+ /* Ensure we are adding only valid index to the array */
+ if (s < 0 || s > max_nodes)
+ {
+ elog(ERROR, "invalid NUMA node id outside of allowed range "
+ "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+ }
+
+ nodes[s]++;
+ }
+
+ /*
+ * Add one entry for each NUMA node, including those without allocated
+ * memory for this segment.
+ */
+ for (i = 0; i <= max_nodes; i++)
+ {
+ values[0] = CStringGetTextDatum(ent->key);
+ /* numa_node is declared int4, so convert explicitly to a Datum */
+ values[1] = Int32GetDatum(i);
+ values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+ }
+
+ LWLockRelease(ShmemIndexLock);
+
+ /* pages were touched above (if required); no need to do it again */
+ firstNumaTouch = false;
+
+ return (Datum) 0;
+}
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202504072
+#define CATALOG_VERSION_NO 202504073
#endif
proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool',
proargtypes => '', prosrc => 'pg_numa_available' },
+# shared memory usage with NUMA info
+{ oid => '4100', descr => 'NUMA mappings for the main shared memory segment',
+ proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't',
+ provolatile => 'v', prorettype => 'record', proargtypes => '',
+ proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+ proargnames => '{name,numa_node,size}',
+ prosrc => 'pg_get_shmem_allocations_numa' },
+
# memory context of local backend
{ oid => '2282',
descr => 'information about all memory contexts of local backend',
--- /dev/null
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
+ ok
+----
+ t
+(1 row)
+
--- /dev/null
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+ERROR: libnuma initialization failed or NUMA is not supported on this platform
+\quit
-- clean up
DROP TABLE lock_table;
DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
-- switch to superuser
\c -
CREATE ROLE regress_readallstats;
f
(1 row)
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
+ has_table_privilege
+---------------------
+ f
+(1 row)
+
GRANT pg_read_all_stats TO regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
has_table_privilege
t
(1 row)
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
+ has_table_privilege
+---------------------
+ t
+(1 row)
+
-- run query to ensure that functions within views can be executed
SET ROLE regress_readallstats;
SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
size,
allocated_size
FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_allocations_numa| SELECT name,
+ numa_node,
+ size
+ FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size);
pg_stat_activity| SELECT s.datid,
d.datname,
s.pid,
# The stats test resets stats, so nothing else needing stats access can be in
# this group.
# ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa
# event_trigger depends on create_am and cannot run concurrently with
# any test that runs DDL
--- /dev/null
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
DROP TABLE lock_table;
DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
-- switch to superuser
\c -
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
GRANT pg_read_all_stats TO regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
-- run query to ensure that functions within views can be executed
SET ROLE regress_readallstats;