Consider BufFiles when adjusting hashjoin parameters

author Tomas Vondra <tomas.vondra@postgresql.org>

Wed, 19 Feb 2025 19:29:26 +0000 (20:29 +0100)

committer Tomas Vondra <tomas.vondra@postgresql.org>

Wed, 19 Feb 2025 20:08:20 +0000 (21:08 +0100)
author Tomas Vondra <tomas.vondra@postgresql.org>
Wed, 19 Feb 2025 19:29:26 +0000 (20:29 +0100)
committer Tomas Vondra <tomas.vondra@postgresql.org>
Wed, 19 Feb 2025 20:08:20 +0000 (21:08 +0100)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c

index 6f8a379e3b94b2faf4874b6519f407e5f49684ce..8d2201ab67fa5767ba0e83f856eb71765b505769 100644 (file)
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -848,6 +848,90 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         nbatch = pg_nextpower2_32(Max(2, minbatch));
     }
  
+   /*
+    * Optimize the total amount of memory consumed by the hash node.
+    *
+    * The nbatch calculation above focuses on the size of the in-memory hash
+    * table, assuming no per-batch overhead. Now adjust the number of batches
+    * and the size of the hash table to minimize total memory consumed by the
+    * hash node.
+    *
+    * Each batch file has a BLCKSZ buffer, and we may need two files per
+    * batch (inner and outer side). So with enough batches this can be
+    * significantly more memory than the hashtable itself.
+    *
+    * The total memory usage may be expressed by this formula:
+    *
+    * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ) <= hash_table_bytes
+    *
+    * where (inner_rel_bytes / nbatch) is the size of the in-memory hash
+    * table and (2 * nbatch * BLCKSZ) is the amount of memory used by file
+    * buffers. But for sufficiently large values of inner_rel_bytes there
+    * may not be an nbatch value that would make both parts fit into
+    * hash_table_bytes.
+    *
+    * In this case we can't enforce the memory limit - we're going to exceed
+    * it. We can however minimize the impact and use as little memory as
+    * possible. (We haven't really enforced it before either, as we simply
+    * ignored the batch files.)
+    *
+    * The formula for total memory usage says that given an inner relation of
+    * size inner_rel_bytes, we may divide it into an arbitrary number of
+    * batches. This determines both the size of the in-memory hash table and
+    * the amount of memory needed for batch files. These two terms work in
+    * opposite ways - when one decreases, the other increases.
+    *
+    * For low nbatch values, the hash table takes most of the memory, but at
+    * some point the batch files start to dominate. If you combine these two
+    * terms, the memory consumption (for a fixed size of the inner relation)
+    * has a u-shape, with a minimum at some nbatch value.
+    *
+    * Our goal is to find this nbatch value, minimizing the memory usage. We
+    * calculate the memory usage with half the batches (i.e. nbatch/2), and
+    * if it's lower than the current memory usage we know it's better to use
+    * fewer batches. We repeat this until reducing the number of batches does
+    * not reduce the memory usage - we found the optimum. We know the optimum
+    * exists, thanks to the u-shape.
+    *
+    * We only want to do this when exceeding the memory limit, not every
+    * time. The goal is not to minimize memory usage in every case, but to
+    * minimize the memory usage when we can't stay within the memory limit.
+    *
+    * For this reason we only consider reducing the number of batches. We
+    * could try the opposite direction too, but that would save memory only
+    * when most of the memory is used by the hash table. And the hash table
+    * was used for the initial sizing, so we shouldn't be exceeding the
+    * memory limit too much. We might save memory by using more batches, but
+    * it would result in spilling more batch files, which does not seem like
+    * a great trade off.
+    *
+    * While growing the hashtable, we also adjust the number of buckets, to
+    * not have more than one tuple per bucket (load factor 1). We can only do
+    * this during the initial sizing - once we start building the hash,
+    * nbuckets is fixed.
+    */
+   while (nbatch > 0)
+   {
+       /* how much memory are we using with current nbatch value */
+       size_t      current_space = hash_table_bytes + (2 * nbatch * BLCKSZ);
+
+       /* how much memory would we use with half the batches */
+       size_t      new_space = hash_table_bytes * 2 + (nbatch * BLCKSZ);
+
+       /* If the memory usage would not decrease, we found the optimum. */
+       if (current_space < new_space)
+           break;
+
+       /*
+        * It's better to use half the batches, so do that and adjust the
+        * nbucket in the opposite direction, and double the allowance.
+        */
+       nbatch /= 2;
+       nbuckets *= 2;
+
+       *space_allowed = (*space_allowed) * 2;
+   }
+
     Assert(nbuckets > 0);
     Assert(nbatch > 0);
  
@@ -890,6 +974,47 @@ ExecHashTableDestroy(HashJoinTable hashtable)
     pfree(hashtable);
  }
  
+/*
+ * Consider adjusting the allowed hash table size, depending on the number
+ * of batches, to minimize the overall memory usage (for both the hashtable
+ * and batch files).
+ *
+ * We're adjusting the size of the hash table, not the (optimal) number of
+ * buckets. We can't change that once we start building the hash, due to how
+ * ExecHashGetBucketAndBatch calculates batchno/bucketno from the hash. This
+ * means the load factor may not be optimal, but we're in damage control so
+ * we accept slower lookups. It's still much better than batch explosion.
+ *
+ * Returns true if we chose to increase the batch size (and thus we don't
+ * need to add batches), and false if we should increase nbatch.
+ */
+static bool
+ExecHashIncreaseBatchSize(HashJoinTable hashtable)
+{
+   /*
+    * How much additional memory would doubling nbatch use? Each batch may
+    * require two buffered files (inner/outer), with a BLCKSZ buffer. The
+    * product is computed in size_t, as narrower int math could overflow.
+    */
+   size_t      batchSpace = ((size_t) hashtable->nbatch * 2 * BLCKSZ);
+
+   /*
+    * Compare the new space needed for doubling nbatch and for enlarging the
+    * in-memory hash table. If doubling the hash table needs less memory,
+    * just do that. Otherwise, continue with doubling the nbatch.
+    *
+    * We're either doubling spaceAllowed or batchSpace, so which of those
+    * increases the memory usage the least is the same as comparing the
+    * values directly.
+    */
+   if (hashtable->spaceAllowed <= batchSpace)
+   {
+       hashtable->spaceAllowed *= 2;
+       return true;
+   }
+   return false;
+}
+
  /*
   * ExecHashIncreaseNumBatches
   *     increase the original number of batches in order to reduce
@@ -913,6 +1038,10 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
     if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2)))
         return;
  
+   /* consider increasing size of the in-memory hash table instead */
+   if (ExecHashIncreaseBatchSize(hashtable))
+       return;
+
     nbatch = oldnbatch * 2;
     Assert(nbatch > 1);
author	Tomas Vondra <tomas.vondra@postgresql.org>
	Wed, 19 Feb 2025 19:29:26 +0000 (20:29 +0100)
committer	Tomas Vondra <tomas.vondra@postgresql.org>
	Wed, 19 Feb 2025 20:08:20 +0000 (21:08 +0100)