Introduce pg_shmem_allocations_numa view

author Tomas Vondra <tomas.vondra@postgresql.org>

Mon, 7 Apr 2025 20:54:49 +0000 (22:54 +0200)

committer Tomas Vondra <tomas.vondra@postgresql.org>

Mon, 7 Apr 2025 21:08:17 +0000 (23:08 +0200)
author Tomas Vondra <tomas.vondra@postgresql.org>
Mon, 7 Apr 2025 20:54:49 +0000 (22:54 +0200)
committer Tomas Vondra <tomas.vondra@postgresql.org>
Mon, 7 Apr 2025 21:08:17 +0000 (23:08 +0200)
diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml

index 4f336ee0adfaef86e4044729ac5c7926efc4332a..0eba37268bf5a035cc02412e5d893a5771e273c8 100644 (file)
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -181,6 +181,11 @@
        <entry>shared memory allocations</entry>
       </row>
  
+     <row>
+      <entry><link linkend="view-pg-shmem-allocations-numa"><structname>pg_shmem_allocations_numa</structname></link></entry>
+      <entry>NUMA node mappings for shared memory allocations</entry>
+     </row>
+
       <row>
        <entry><link linkend="view-pg-stats"><structname>pg_stats</structname></link></entry>
        <entry>planner statistics</entry>
@@ -4051,6 +4056,96 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
    </para>
   </sect1>
  
+ <sect1 id="view-pg-shmem-allocations-numa">
+  <title><structname>pg_shmem_allocations_numa</structname></title>
+
+  <indexterm zone="view-pg-shmem-allocations-numa">
+   <primary>pg_shmem_allocations_numa</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_shmem_allocations_numa</structname> view shows how shared
+   memory allocations in the server's main shared memory segment are distributed
+   across NUMA nodes. This includes both memory allocated by
+   <productname>PostgreSQL</productname> itself and memory allocated
+   by extensions using the mechanisms detailed in
+   <xref linkend="xfunc-shared-addin" />. This view will output multiple rows
+   for each of the shared memory segments provided that they are spread across
+   multiple NUMA nodes. This view should not be queried by monitoring systems
+   as it is very slow and may end up allocating shared memory in case it was not
+   used earlier.
+   A current limitation of this view is that it won't show anonymous shared
+   memory allocations.
+  </para>
+
+  <para>
+   Note that this view does not include memory allocated using the dynamic
+   shared memory infrastructure.
+  </para>
+
+  <warning>
+    <para>
+      When determining the <acronym>NUMA</acronym> node, the view touches
+      all memory pages for the shared memory segment. This will force
+      allocation of the shared memory, if it wasn't allocated already,
+      and the memory may get allocated in a single <acronym>NUMA</acronym>
+      node (depending on system configuration).
+    </para>
+  </warning>
+
+  <table>
+   <title><structname>pg_shmem_allocations_numa</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>name</structfield> <type>text</type>
+      </para>
+      <para>
+       The name of the shared memory allocation.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>numa_node</structfield> <type>int4</type>
+      </para>
+      <para>
+      ID of <acronym>NUMA</acronym> node
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>size</structfield> <type>int8</type>
+      </para>
+      <para>
+       Size of the allocation on this particular NUMA memory node in bytes
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   By default, the <structname>pg_shmem_allocations_numa</structname> view can be
+   read only by superusers or roles with privileges of the
+   <literal>pg_read_all_stats</literal> role.
+  </para>
+ </sect1>
+
   <sect1 id="view-pg-stats">
    <title><structname>pg_stats</structname></title>
  
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql

index 273008db37fc81ef19956c9776a6ec3af8b496ab..08f780a2e638277971ef568cb10277ec35585ed2 100644 (file)
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
  REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
  GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
  
+CREATE VIEW pg_shmem_allocations_numa AS -- thin wrapper over the SRF; columns: name, numa_node, size
+    SELECT * FROM pg_get_shmem_allocations_numa();
+
+REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC; -- drop PUBLIC's privileges on the view
+GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC; -- and on the underlying function, so the view cannot be bypassed
+GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats;
+
  CREATE VIEW pg_backend_memory_contexts AS
      SELECT * FROM pg_get_backend_memory_contexts();
  
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c

index 895a43fb39e54b698a3333c7ddf6926ff3b30174..e10b380e5c7c04c5fcb51b1c5a76c1a4f51c3d20 100644 (file)
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -68,6 +68,7 @@
  #include "fmgr.h"
  #include "funcapi.h"
  #include "miscadmin.h"
+#include "port/pg_numa.h"
  #include "storage/lwlock.h"
  #include "storage/pg_shmem.h"
  #include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t    *ShmemLock;            /* spinlock for shared memory and LWLock
  
  static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
  
+/* To get reliable results for NUMA inquiry we need to "touch pages" once */
+static bool firstNumaTouch = true;
  
  /*
   * InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
  
     return (Datum) 0;
  }
+
+/*
+ * SQL SRF showing NUMA memory nodes for allocated shared memory
+ *
+ * For every entry in the shmem index, one row is emitted per NUMA node
+ * (0 .. pg_numa_get_max_node()), giving the number of bytes of that entry
+ * that reside on the node.  Nodes holding none of the entry's pages are
+ * reported too, with size 0.  Sizes are counted in whole OS pages, so a page
+ * that an entry only partially occupies is counted in full.
+ *
+ * Compared to pg_get_shmem_allocations(), this function does not return
+ * information about shared anonymous allocations and unused shared memory.
+ *
+ * Raises an ERROR if NUMA support cannot be initialized, if the NUMA page
+ * inquiry fails, or if a page reports a node id outside [0, max_nodes].
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+   HASH_SEQ_STATUS hstat;
+   ShmemIndexEnt *ent;
+   Datum       values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+   bool        nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+   Size        os_page_size;
+   void      **page_ptrs;
+   int        *pages_status;
+   uint64      shm_total_page_count,
+               shm_ent_page_count,
+               max_nodes;
+   Size       *nodes;
+
+   if (pg_numa_init() == -1)
+       elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+   InitMaterializedSRF(fcinfo, 0);
+
+   /* per-node page counters, indexed by NUMA node id (0 .. max_nodes) */
+   max_nodes = pg_numa_get_max_node();
+   nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+   /*
+    * Shared memory allocations vary in size and need not align with OS
+    * memory page boundaries, while the NUMA inquiry works on whole OS pages.
+    *
+    * So for each allocation we: 1. determine the OS memory page size,
+    * 2. align the allocation's start/end addresses to page boundaries,
+    * 3. query the NUMA node of every page spanned by the allocation.
+    */
+   os_page_size = pg_numa_get_pagesize();
+
+   /*
+    * Allocate memory for page pointers and status based on total shared
+    * memory size. This simplified approach allocates enough space for all
+    * pages in shared memory rather than calculating the exact requirements
+    * for each segment.
+    *
+    * Add 1, because we don't know how exactly the segments align to OS
+    * pages, so the allocation might use one more memory page. In practice
+    * this is not very likely, and moreover we have more entries, each of
+    * them using only fraction of the total pages.
+    */
+   shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+   page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+   pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+   if (firstNumaTouch)
+       elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+   LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+   hash_seq_init(&hstat, ShmemIndex);
+
+   /* output all allocated entries; no column is ever NULL */
+   memset(nulls, 0, sizeof(nulls));
+   while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+   {
+       int         i;
+       char       *startptr,
+                  *endptr;
+       Size        total_len;
+
+       /*
+        * Calculate the range of OS pages used by this segment. The segment
+        * may start / end half-way through a page, we want to count these
+        * pages too. So we align the start/end pointers down/up, and then
+        * calculate the number of pages from that.
+        */
+       startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+       endptr = (char *) TYPEALIGN(os_page_size,
+                                   (char *) ent->location + ent->allocated_size);
+       total_len = (endptr - startptr);
+
+       shm_ent_page_count = total_len / os_page_size;
+
+       /*
+        * Pre-fill the status array with 0xff bytes (-1 as int). If we ever
+        * see that value after the kernel inquiry, then we probably have a
+        * bug in mapping the allocation to OS pages.
+        */
+       memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+       /*
+        * Setup page_ptrs[] with pointers to all OS pages for this segment,
+        * and get the NUMA status using pg_numa_query_pages.
+        *
+        * In order to get reliable results we also need to touch memory
+        * pages, so that inquiry about NUMA memory node doesn't return -2
+        * (ENOENT, which indicates unmapped/unallocated pages). This is only
+        * done on the first call in this backend (see firstNumaTouch).
+        */
+       for (i = 0; i < shm_ent_page_count; i++)
+       {
+           volatile uint64 touch pg_attribute_unused();
+
+           page_ptrs[i] = startptr + (i * os_page_size);
+
+           if (firstNumaTouch)
+               pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+           CHECK_FOR_INTERRUPTS();
+       }
+
+       if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+           elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+       /* Count the pages of this shared memory entry on each NUMA node */
+       memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+       for (i = 0; i < shm_ent_page_count; i++)
+       {
+           int         s = pages_status[i];
+
+           /* Ensure we are adding only valid index to the array */
+           if (s < 0 || s > max_nodes)
+           {
+               elog(ERROR, "invalid NUMA node id outside of allowed range "
+                    "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+           }
+
+           nodes[s]++;
+       }
+
+       /*
+        * Add one entry for each NUMA node, including those without allocated
+        * memory for this segment.
+        */
+       for (i = 0; i <= max_nodes; i++)
+       {
+           values[0] = CStringGetTextDatum(ent->key);
+           /* numa_node is an int4 column: build the Datum explicitly */
+           values[1] = Int32GetDatum(i);
+           /* page count -> bytes */
+           values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+           tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+                                values, nulls);
+       }
+   }
+
+   LWLockRelease(ShmemIndexLock);
+
+   /* pages have been touched once in this backend; skip it next time */
+   firstNumaTouch = false;
+
+   return (Datum) 0;
+}
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index 2a3d9dc8a7abf7afaebb3ead0e952787dcce79c3..18a1284cf518dc5ae2a458427227830216870837 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
   */
  
  /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 202504072
+#define CATALOG_VERSION_NO 202504073
  
  #endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat

index a9a9afb93c80e827fd5a49de1554747261c5c626..37a484147a8f24f67c47ee74c0dac143028c5c9a 100644 (file)
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8546,6 +8546,14 @@
    proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool',
    proargtypes => '', prosrc => 'pg_numa_available' },
  
+# shared memory usage with NUMA info
+{ oid => '4100', descr => 'NUMA mappings for the main shared memory segment',
+  proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+  proargnames => '{name,numa_node,size}',
+  prosrc => 'pg_get_shmem_allocations_numa' },
+
  # memory context of local backend
  { oid => '2282',
    descr => 'information about all memory contexts of local backend',
diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out

new file mode 100644 (file)

index 0000000..8af5dfe
--- /dev/null
+++ b/src/test/regress/expected/numa.out
@@ -0,0 +1,13 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
+ ok 
+----
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out

new file mode 100644 (file)

index 0000000..c90042f
--- /dev/null
+++ b/src/test/regress/expected/numa_1.out
@@ -0,0 +1,5 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+ERROR:  libnuma initialization failed or NUMA is not supported on this platform
+\quit
diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out

index 1fddb13b6aef2b4663daa5e6b7838c10f3ac789b..c25062c288f32351b94db98f5066880d817b9c46 100644 (file)
--- a/src/test/regress/expected/privileges.out
+++ b/src/test/regress/expected/privileges.out
@@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
  -- clean up
  DROP TABLE lock_table;
  DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
  -- switch to superuser
  \c -
  CREATE ROLE regress_readallstats;
@@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
   f
  (1 row)
  
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
+ has_table_privilege 
+---------------------
+ f
+(1 row)
+
  GRANT pg_read_all_stats TO regress_readallstats;
  SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
   has_table_privilege 
@@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
   t
  (1 row)
  
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
+ has_table_privilege 
+---------------------
+ t
+(1 row)
+
  -- run query to ensure that functions within views can be executed
  SET ROLE regress_readallstats;
  SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out

index 673c63b8d1b6aae6e7937d2d001d97fa80a7ac85..6cf828ca8d0dc4c82273f81e5a1d812560efe723 100644 (file)
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name,
      size,
      allocated_size
     FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_allocations_numa| SELECT name,
+    numa_node,
+    size
+   FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size);
  pg_stat_activity| SELECT s.datid,
      d.datname,
      s.pid,
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule

index 0a35f2f8f6a9354f713fdafbbb462167741ea016..0f38caa0d240a174b3813885f0805de2bfe46170 100644 (file)
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr
  # The stats test resets stats, so nothing else needing stats access can be in
  # this group.
  # ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa
  
  # event_trigger depends on create_am and cannot run concurrently with
  # any test that runs DDL
diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql

new file mode 100644 (file)

index 0000000..324481c
--- /dev/null
+++ b/src/test/regress/sql/numa.sql
@@ -0,0 +1,10 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+SELECT COUNT(*) = 0 AS ok FROM pg_shmem_allocations_numa;
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql

index 85d7280f35fca1c8a31e3e488b1ff3d2886bb7b2..f337aa67c13f23204d3d97f59f16a7c331bcd6a7 100644 (file)
--- a/src/test/regress/sql/privileges.sql
+++ b/src/test/regress/sql/privileges.sql
@@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
  DROP TABLE lock_table;
  DROP USER regress_locktable_user;
  
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
  
  -- switch to superuser
  \c -
@@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats;
  SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
  
  GRANT pg_read_all_stats TO regress_readallstats;
  
  SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
  SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
  
  -- run query to ensure that functions within views can be executed
  SET ROLE regress_readallstats;
author	Tomas Vondra <tomas.vondra@postgresql.org>
	Mon, 7 Apr 2025 20:54:49 +0000 (22:54 +0200)
committer	Tomas Vondra <tomas.vondra@postgresql.org>
	Mon, 7 Apr 2025 21:08:17 +0000 (23:08 +0200)
doc/src/sgml/system-views.sgml		patch \| blob \| blame \| history
src/backend/catalog/system_views.sql		patch \| blob \| blame \| history
src/backend/storage/ipc/shmem.c		patch \| blob \| blame \| history
src/include/catalog/catversion.h		patch \| blob \| blame \| history
src/include/catalog/pg_proc.dat		patch \| blob \| blame \| history
src/test/regress/expected/numa.out	[new file with mode: 0644]	patch \| blob
src/test/regress/expected/numa_1.out	[new file with mode: 0644]	patch \| blob
src/test/regress/expected/privileges.out		patch \| blob \| blame \| history
src/test/regress/expected/rules.out		patch \| blob \| blame \| history
src/test/regress/parallel_schedule		patch \| blob \| blame \| history
src/test/regress/sql/numa.sql	[new file with mode: 0644]	patch \| blob
src/test/regress/sql/privileges.sql		patch \| blob \| blame \| history