Introduce pg_shmem_allocations_numa view
authorTomas Vondra <tomas.vondra@postgresql.org>
Mon, 7 Apr 2025 20:54:49 +0000 (22:54 +0200)
committerTomas Vondra <tomas.vondra@postgresql.org>
Mon, 7 Apr 2025 21:08:17 +0000 (23:08 +0200)
Introduce new pg_shmem_allocations_numa view with information about how
shared memory is distributed across NUMA nodes. For each shared memory
segment, the view returns one row for each NUMA node backing it, with
the total amount of memory allocated from that node.

The view may be relatively expensive, especially when executed for the
first time in a backend, as it has to touch all memory pages to get
reliable information about the NUMA node. This may also force allocation
of the shared memory.

Unlike pg_shmem_allocations, the view does not show anonymous shared
memory allocations. It also does not show memory allocated using the
dynamic shared memory infrastructure.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com

12 files changed:
doc/src/sgml/system-views.sgml
src/backend/catalog/system_views.sql
src/backend/storage/ipc/shmem.c
src/include/catalog/catversion.h
src/include/catalog/pg_proc.dat
src/test/regress/expected/numa.out [new file with mode: 0644]
src/test/regress/expected/numa_1.out [new file with mode: 0644]
src/test/regress/expected/privileges.out
src/test/regress/expected/rules.out
src/test/regress/parallel_schedule
src/test/regress/sql/numa.sql [new file with mode: 0644]
src/test/regress/sql/privileges.sql

index 4f336ee0adfaef86e4044729ac5c7926efc4332a..0eba37268bf5a035cc02412e5d893a5771e273c8 100644 (file)
       <entry>shared memory allocations</entry>
      </row>
 
+     <row>
+      <entry><link linkend="view-pg-shmem-allocations-numa"><structname>pg_shmem_allocations_numa</structname></link></entry>
+      <entry>NUMA node mappings for shared memory allocations</entry>
+     </row>
+
      <row>
       <entry><link linkend="view-pg-stats"><structname>pg_stats</structname></link></entry>
       <entry>planner statistics</entry>
@@ -4051,6 +4056,96 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
   </para>
  </sect1>
 
+ <sect1 id="view-pg-shmem-allocations-numa">
+  <title><structname>pg_shmem_allocations_numa</structname></title>
+
+  <indexterm zone="view-pg-shmem-allocations-numa">
+   <primary>pg_shmem_allocations_numa</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_shmem_allocations_numa</structname> view shows how shared
+   memory allocations in the server's main shared memory segment are distributed
+   across NUMA nodes. This includes both memory allocated by
+   <productname>PostgreSQL</productname> itself and memory allocated
+   by extensions using the mechanisms detailed in
+   <xref linkend="xfunc-shared-addin" />. This view will output multiple rows
+   for each of the shared memory segments provided that they are spread across
+   multiple NUMA nodes. This view should not be queried by monitoring systems
+   as it is very slow and may end up allocating shared memory in case it was not
+   used earlier.
+   A current limitation of this view is that it won't show anonymous shared memory
+   allocations.
+  </para>
+
+  <para>
+   Note that this view does not include memory allocated using the dynamic
+   shared memory infrastructure.
+  </para>
+
+  <warning>
+    <para>
+      When determining the <acronym>NUMA</acronym> node, the view touches
+      all memory pages for the shared memory segment. This will force
+      allocation of the shared memory, if it wasn't allocated already,
+      and the memory may get allocated in a single <acronym>NUMA</acronym>
+      node (depending on system configuration).
+    </para>
+  </warning>
+
+  <table>
+   <title><structname>pg_shmem_allocations_numa</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>name</structfield> <type>text</type>
+      </para>
+      <para>
+       The name of the shared memory allocation.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>numa_node</structfield> <type>int4</type>
+      </para>
+      <para>
+      ID of <acronym>NUMA</acronym> node
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>size</structfield> <type>int8</type>
+      </para>
+      <para>
+       Size of the allocation on this particular NUMA memory node in bytes
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   By default, the <structname>pg_shmem_allocations_numa</structname> view can be
+   read only by superusers or roles with privileges of the
+   <literal>pg_read_all_stats</literal> role.
+  </para>
+ </sect1>
+
  <sect1 id="view-pg-stats">
   <title><structname>pg_stats</structname></title>
 
index 273008db37fc81ef19956c9776a6ec3af8b496ab..08f780a2e638277971ef568cb10277ec35585ed2 100644 (file)
@@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
 REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
 GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
 
+-- Per-NUMA-node breakdown of shared memory allocations; a thin wrapper
+-- around the set-returning function pg_get_shmem_allocations_numa().
+CREATE VIEW pg_shmem_allocations_numa AS
+    SELECT * FROM pg_get_shmem_allocations_numa();
+
+-- Restrict both the view and its underlying function to members of
+-- pg_read_all_stats (nothing is left granted to PUBLIC).
+REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC;
+GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats;
+
 CREATE VIEW pg_backend_memory_contexts AS
     SELECT * FROM pg_get_backend_memory_contexts();
 
index 895a43fb39e54b698a3333c7ddf6926ff3b30174..e10b380e5c7c04c5fcb51b1c5a76c1a4f51c3d20 100644 (file)
@@ -68,6 +68,7 @@
 #include "fmgr.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "port/pg_numa.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t    *ShmemLock;            /* spinlock for shared memory and LWLock
 
 static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
 
+/*
+ * To get reliable results for NUMA inquiry we need to "touch pages" once.
+ * Starts out true; pg_get_shmem_allocations_numa() clears it only after it
+ * has walked all shared memory entries without raising an error.
+ */
+static bool firstNumaTouch = true;
 
 /*
  * InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 
    return (Datum) 0;
 }
+
+/*
+ * SQL SRF showing NUMA memory nodes for allocated shared memory
+ *
+ * For every entry in ShmemIndex, emits one row per NUMA node (0 through
+ * pg_numa_get_max_node()) with the columns (name, numa_node, size), where
+ * size is the number of OS pages of that entry found on the node multiplied
+ * by the OS page size.
+ *
+ * Raises an ERROR if pg_numa_init() fails, if the page status inquiry fails,
+ * or if a page reports a node id outside [0, max_nodes].
+ *
+ * Compared to pg_get_shmem_allocations(), this function does not return
+ * information about shared anonymous allocations and unused shared memory.
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+   HASH_SEQ_STATUS hstat;
+   ShmemIndexEnt *ent;
+   Datum       values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+   bool        nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+   Size        os_page_size;
+   void      **page_ptrs;
+   int        *pages_status;
+   uint64      shm_total_page_count,
+               shm_ent_page_count,
+               max_nodes;
+   Size       *nodes;
+
+   if (pg_numa_init() == -1)
+       elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+   InitMaterializedSRF(fcinfo, 0);
+
+   /* Per-node page counters, indexed by node id 0 .. max_nodes */
+   max_nodes = pg_numa_get_max_node();
+   nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+   /*
+    * Shared memory entries may start and end half-way through an OS memory
+    * page. Determine the OS page size first; it is used below to round each
+    * entry out to whole pages before the NUMA node inquiry.
+    */
+   os_page_size = pg_numa_get_pagesize();
+
+   /*
+    * Allocate memory for page pointers and status based on total shared
+    * memory size. This simplified approach allocates enough space for all
+    * pages in shared memory rather than calculating the exact requirements
+    * for each segment.
+    *
+    * Add 1, because we don't know how exactly the segments align to OS
+    * pages, so the allocation might use one more memory page. In practice
+    * this is not very likely, and moreover we have more entries, each of
+    * them using only fraction of the total pages.
+    */
+   shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+   page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+   pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+   if (firstNumaTouch)
+       elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+   LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+   hash_seq_init(&hstat, ShmemIndex);
+
+   /* output all allocated entries */
+   memset(nulls, 0, sizeof(nulls));
+   while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+   {
+       int         i;
+       char       *startptr,
+                  *endptr;
+       Size        total_len;
+
+       /*
+        * Calculate the range of OS pages used by this segment. The segment
+        * may start / end half-way through a page, we want to count these
+        * pages too. So we align the start/end pointers down/up, and then
+        * calculate the number of pages from that.
+        */
+       startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+       endptr = (char *) TYPEALIGN(os_page_size,
+                                   (char *) ent->location + ent->allocated_size);
+       total_len = (endptr - startptr);
+
+       shm_ent_page_count = total_len / os_page_size;
+
+       /*
+        * Pre-fill the status array with 0xff bytes (-1 per int). If we ever
+        * see that value after the kernel inquiry, then we probably have a
+        * bug in mapping this entry to OS pages; the range check below turns
+        * it into an ERROR.
+        */
+       memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+       /*
+        * Setup page_ptrs[] with pointers to all OS pages for this segment,
+        * and get the NUMA status using pg_numa_query_pages.
+        *
+        * In order to get reliable results we also need to touch memory
+        * pages, so that inquiry about NUMA memory node doesn't return -2
+        * (ENOENT, which indicates unmapped/unallocated pages).
+        */
+       for (i = 0; i < shm_ent_page_count; i++)
+       {
+           volatile uint64 touch pg_attribute_unused();
+
+           page_ptrs[i] = startptr + (i * os_page_size);
+
+           if (firstNumaTouch)
+               pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+           CHECK_FOR_INTERRUPTS();
+       }
+
+       if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+           elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+       /* Count the OS pages of this shared memory entry on each NUMA node */
+       memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+       for (i = 0; i < shm_ent_page_count; i++)
+       {
+           int         s = pages_status[i];
+
+           /* Ensure we are adding only valid index to the array */
+           if (s < 0 || s > max_nodes)
+           {
+               elog(ERROR, "invalid NUMA node id outside of allowed range "
+                    "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+           }
+
+           nodes[s]++;
+       }
+
+       /*
+        * Add one entry for each NUMA node, including those without allocated
+        * memory for this segment.
+        */
+       for (i = 0; i <= max_nodes; i++)
+       {
+           values[0] = CStringGetTextDatum(ent->key);
+           /* Build Datums with the conversion macros, like the size column */
+           values[1] = Int32GetDatum(i);
+           values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+           tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+                                values, nulls);
+       }
+   }
+
+   LWLockRelease(ShmemIndexLock);
+
+   /* All pages have been touched once; later calls can skip that step */
+   firstNumaTouch = false;
+
+   return (Datum) 0;
+}
index 2a3d9dc8a7abf7afaebb3ead0e952787dcce79c3..18a1284cf518dc5ae2a458427227830216870837 100644 (file)
@@ -57,6 +57,6 @@
  */
 
 /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 202504072
+#define CATALOG_VERSION_NO 202504073
 
 #endif
index a9a9afb93c80e827fd5a49de1554747261c5c626..37a484147a8f24f67c47ee74c0dac143028c5c9a 100644 (file)
   proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool',
   proargtypes => '', prosrc => 'pg_numa_available' },
 
+# shared memory usage with NUMA info
+{ oid => '4100', descr => 'NUMA mappings for the main shared memory segment',
+  proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+  proargnames => '{name,numa_node,size}',
+  prosrc => 'pg_get_shmem_allocations_numa' },
+
 # memory context of local backend
 { oid => '2282',
   descr => 'information about all memory contexts of local backend',
diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out
new file mode 100644 (file)
index 0000000..8af5dfe
--- /dev/null
@@ -0,0 +1,13 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
+ ok 
+----
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out
new file mode 100644 (file)
index 0000000..c90042f
--- /dev/null
@@ -0,0 +1,5 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+ERROR:  libnuma initialization failed or NUMA is not supported on this platform
+\quit
index 1fddb13b6aef2b4663daa5e6b7838c10f3ac789b..c25062c288f32351b94db98f5066880d817b9c46 100644 (file)
@@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 -- clean up
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
 -- switch to superuser
 \c -
 CREATE ROLE regress_readallstats;
@@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
  f
 (1 row)
 
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
+ has_table_privilege 
+---------------------
+ f
+(1 row)
+
 GRANT pg_read_all_stats TO regress_readallstats;
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
  has_table_privilege 
@@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
  t
 (1 row)
 
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
+ has_table_privilege 
+---------------------
+ t
+(1 row)
+
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
 SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
index 673c63b8d1b6aae6e7937d2d001d97fa80a7ac85..6cf828ca8d0dc4c82273f81e5a1d812560efe723 100644 (file)
@@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name,
     size,
     allocated_size
    FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_allocations_numa| SELECT name,
+    numa_node,
+    size
+   FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size);
 pg_stat_activity| SELECT s.datid,
     d.datname,
     s.pid,
index 0a35f2f8f6a9354f713fdafbbb462167741ea016..0f38caa0d240a174b3813885f0805de2bfe46170 100644 (file)
@@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr
 # The stats test resets stats, so nothing else needing stats access can be in
 # this group.
 # ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa
 
 # event_trigger depends on create_am and cannot run concurrently with
 # any test that runs DDL
diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql
new file mode 100644 (file)
index 0000000..324481c
--- /dev/null
@@ -0,0 +1,10 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
index 85d7280f35fca1c8a31e3e488b1ff3d2886bb7b2..f337aa67c13f23204d3d97f59f16a7c331bcd6a7 100644 (file)
@@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;
 
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
 
 -- switch to superuser
 \c -
@@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats;
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
 
 GRANT pg_read_all_stats TO regress_readallstats;
 
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
 
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;