Have ExecFindPartition cache the last found partition

author David Rowley <drowley@postgresql.org>

Mon, 1 Aug 2022 21:55:27 +0000 (09:55 +1200)

committer David Rowley <drowley@postgresql.org>

Mon, 1 Aug 2022 21:55:27 +0000 (09:55 +1200)
author David Rowley <drowley@postgresql.org>
Mon, 1 Aug 2022 21:55:27 +0000 (09:55 +1200)
committer David Rowley <drowley@postgresql.org>
Mon, 1 Aug 2022 21:55:27 +0000 (09:55 +1200)
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c

index e03ea27299c8ee929b96e07673e41d6b64cf83ee..eb4910610243bc79f6675370bca792ee96526c24 100644 (file)
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -1332,10 +1332,49 @@ FormPartitionKeyDatum(PartitionDispatch pd,
         elog(ERROR, "wrong number of partition key expressions");
  }
  
+/*
+ * The number of times the same partition must be found in a row before we
+ * switch from a binary search for the given values to just checking if the
+ * values belong to the last found partition.  This must be above 0.
+ */
+#define PARTITION_CACHED_FIND_THRESHOLD            16
+
  /*
   * get_partition_for_tuple
   *     Finds partition of relation which accepts the partition key specified
- *     in values and isnull
+ *     in values and isnull.
+ *
+ * Calling this function can be quite expensive when LIST and RANGE
+ * partitioned tables have many partitions.  This is due to the binary search
+ * that's done to find the correct partition.  Many of the use cases for LIST
+ * and RANGE partitioned tables make it likely that the same partition is
+ * found in subsequent ExecFindPartition() calls.  This is especially true for
+ * cases such as RANGE partitioned tables on a TIMESTAMP column where the
+ * partition key is the current time.  When asked to find a partition for a
+ * RANGE or LIST partitioned table, we record the partition index and datum
+ * offset we've found for the given 'values' in the PartitionDesc (which is
+ * stored in relcache), and if we keep finding the same partition
+ * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching
+ * logic and instead of performing a binary search to find the correct
+ * partition, we'll just double-check that 'values' still belong to the last
+ * found partition, and if so, we'll return that partition index, thus
+ * skipping the need for the binary search.  If we fail to match the last
+ * partition when double checking, then we fall back on doing a binary search.
+ * In this case, unless we find 'values' belong to the DEFAULT partition,
+ * we'll reset the number of times we've hit the same partition so that we
+ * don't attempt to use the cache again until we've found that partition at
+ * least PARTITION_CACHED_FIND_THRESHOLD times in a row.
+ *
+ * For cases where the partition changes on each lookup, the amount of
+ * additional work required just amounts to recording the last found partition
+ * and bound offset then resetting the found counter.  This is cheap and does
+ * not appear to cause any meaningful slowdowns for such cases.
+ *
+ * No caching of partitions is done when the last found partition is the
+ * DEFAULT or NULL partition.  For the case of the DEFAULT partition, there
+ * is no bound offset storing the matching datum, so we cannot confirm the
+ * indexes match.  For the NULL partition, this is just so cheap, there's no
+ * sense in caching.
   *
   * Return value is index of the partition (>= 0 and < partdesc->nparts) if one
   * found or -1 if none found.
@@ -1343,12 +1382,24 @@ FormPartitionKeyDatum(PartitionDispatch pd,
  static int
  get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
  {
-   int         bound_offset;
+   int         bound_offset = -1;
     int         part_index = -1;
     PartitionKey key = pd->key;
     PartitionDesc partdesc = pd->partdesc;
     PartitionBoundInfo boundinfo = partdesc->boundinfo;
  
+   /*
+    * In the switch statement below, when we perform a cached lookup for
+    * RANGE and LIST partitioned tables, if we find that the last found
+    * partition matches the 'values', we return the partition index right
+    * away.  We do this instead of breaking out of the switch as we don't
+    * want to execute the code about the DEFAULT partition or do any updates
+    * for any of the cache-related fields.  That would be a waste of effort
+    * as we already know it's not the DEFAULT partition and have no need to
+    * increment the number of times we found the same partition any higher
+    * than PARTITION_CACHED_FIND_THRESHOLD.
+    */
+
     /* Route as appropriate based on partitioning strategy. */
     switch (key->strategy)
     {
@@ -1356,24 +1407,59 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
             {
                 uint64      rowHash;
  
+               /* hash partitioning is too cheap to bother caching */
                 rowHash = compute_partition_hash_value(key->partnatts,
                                                        key->partsupfunc,
                                                        key->partcollation,
                                                        values, isnull);
  
-               part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
+               /*
+                * HASH partitions can't have a DEFAULT partition and we don't
+                * do any caching work for them, so just return the part index
+                */
+               return boundinfo->indexes[rowHash % boundinfo->nindexes];
             }
-           break;
  
         case PARTITION_STRATEGY_LIST:
             if (isnull[0])
             {
+               /* this is far too cheap to bother doing any caching */
                 if (partition_bound_accepts_nulls(boundinfo))
-                   part_index = boundinfo->null_index;
+               {
+                   /*
+                    * When there is a NULL partition we just return that
+                    * directly.  We don't have a bound_offset so it's not
+                    * valid to drop into the code after the switch which
+                    * checks and updates the cache fields.  We perhaps should
+                    * be invalidating the details of the last cached
+                    * partition but there's no real need to.  Keeping those
+                    * fields set gives a chance at matching to the cached
+                    * partition on the next lookup.
+                    */
+                   return boundinfo->null_index;
+               }
             }
             else
             {
-               bool        equal = false;
+               bool        equal;
+
+               if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
+               {
+                   int         last_datum_offset = partdesc->last_found_datum_index;
+                   Datum       lastDatum = boundinfo->datums[last_datum_offset][0];
+                   int32       cmpval;
+
+                   /* does the last found datum index match this datum? */
+                   cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
+                                                            key->partcollation[0],
+                                                            lastDatum,
+                                                            values[0]));
+
+                   if (cmpval == 0)
+                       return boundinfo->indexes[last_datum_offset];
+
+                   /* fall-through and do a manual lookup */
+               }
  
                 bound_offset = partition_list_bsearch(key->partsupfunc,
                                                       key->partcollation,
@@ -1403,23 +1489,64 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
                     }
                 }
  
-               if (!range_partkey_has_null)
+               /* NULLs belong in the DEFAULT partition */
+               if (range_partkey_has_null)
+                   break;
+
+               if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
                 {
-                   bound_offset = partition_range_datum_bsearch(key->partsupfunc,
-                                                                key->partcollation,
-                                                                boundinfo,
-                                                                key->partnatts,
-                                                                values,
-                                                                &equal);
+                   int         last_datum_offset = partdesc->last_found_datum_index;
+                   Datum      *lastDatums = boundinfo->datums[last_datum_offset];
+                   PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset];
+                   int32       cmpval;
+
+                   /* check if the value is >= to the lower bound */
+                   cmpval = partition_rbound_datum_cmp(key->partsupfunc,
+                                                       key->partcollation,
+                                                       lastDatums,
+                                                       kind,
+                                                       values,
+                                                       key->partnatts);
  
                     /*
-                    * The bound at bound_offset is less than or equal to the
-                    * tuple value, so the bound at offset+1 is the upper
-                    * bound of the partition we're looking for, if there
-                    * actually exists one.
+                    * If it's equal to the lower bound then no need to check
+                    * the upper bound.
                      */
-                   part_index = boundinfo->indexes[bound_offset + 1];
+                   if (cmpval == 0)
+                       return boundinfo->indexes[last_datum_offset + 1];
+
+                   if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums)
+                   {
+                       /* check if the value is below the upper bound */
+                       lastDatums = boundinfo->datums[last_datum_offset + 1];
+                       kind = boundinfo->kind[last_datum_offset + 1];
+                       cmpval = partition_rbound_datum_cmp(key->partsupfunc,
+                                                           key->partcollation,
+                                                           lastDatums,
+                                                           kind,
+                                                           values,
+                                                           key->partnatts);
+
+                       if (cmpval > 0)
+                           return boundinfo->indexes[last_datum_offset + 1];
+                   }
+                   /* fall-through and do a manual lookup */
                 }
+
+               bound_offset = partition_range_datum_bsearch(key->partsupfunc,
+                                                            key->partcollation,
+                                                            boundinfo,
+                                                            key->partnatts,
+                                                            values,
+                                                            &equal);
+
+               /*
+                * The bound at bound_offset is less than or equal to the
+                * tuple value, so the bound at offset+1 is the upper bound of
+                * the partition we're looking for, if there actually exists
+                * one.
+                */
+               part_index = boundinfo->indexes[bound_offset + 1];
             }
             break;
  
@@ -1433,7 +1560,34 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
      * the default partition, if there is one.
      */
     if (part_index < 0)
-       part_index = boundinfo->default_index;
+   {
+       /*
+        * No need to reset the cache fields here.  The next set of values
+        * might end up belonging to the cached partition, so leaving the
+        * cache alone improves the chances of a cache hit on the next lookup.
+        */
+       return boundinfo->default_index;
+   }
+
+   /* we should only make it here when the code above set bound_offset */
+   Assert(bound_offset >= 0);
+
+   /*
+    * Attend to the cache fields.  If the bound_offset matches the last
+    * cached bound offset then we've found the same partition as last time,
+    * so bump the count by one.  If all goes well, we'll eventually reach
+    * PARTITION_CACHED_FIND_THRESHOLD and try the cache path next time
+    * around.  Otherwise, we'll reset the cache count back to 1 to mark that
+    * we've found this partition for the first time.
+    */
+   if (bound_offset == partdesc->last_found_datum_index)
+       partdesc->last_found_count++;
+   else
+   {
+       partdesc->last_found_count = 1;
+       partdesc->last_found_part_index = part_index;
+       partdesc->last_found_datum_index = bound_offset;
+   }
  
     return part_index;
  }
diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c

index 8b6e0bd595333df9032ae71bd915e69af17850ea..737f0edd89b985a0140a501a4059cfcac2ebbc55 100644 (file)
--- a/src/backend/partitioning/partdesc.c
+++ b/src/backend/partitioning/partdesc.c
@@ -290,6 +290,12 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached)
     {
         oldcxt = MemoryContextSwitchTo(new_pdcxt);
         partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
+
+       /* Initialize caching fields for speeding up ExecFindPartition */
+       partdesc->last_found_datum_index = -1;
+       partdesc->last_found_part_index = -1;
+       partdesc->last_found_count = 0;
+
         partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
         partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
  
diff --git a/src/include/partitioning/partdesc.h b/src/include/partitioning/partdesc.h

index ae1afe3d78ed0f12157a70d620766d2e5db014c3..7e979433b6a6fc351df554aafad4ca52b85c87ae 100644 (file)
--- a/src/include/partitioning/partdesc.h
+++ b/src/include/partitioning/partdesc.h
@@ -36,6 +36,31 @@ typedef struct PartitionDescData
                                  * the corresponding 'oids' element belongs to
                                  * a leaf partition or not */
     PartitionBoundInfo boundinfo;   /* collection of partition bounds */
+
+   /* Caching fields to cache lookups in get_partition_for_tuple() */
+
+   /*
+    * Index into the PartitionBoundInfo's datum array for the last found
+    * partition or -1 if none.
+    */
+   int         last_found_datum_index;
+
+   /*
+    * Partition index of the last found partition or -1 if none has been
+    * found yet.
+    */
+   int         last_found_part_index;
+
+   /*
+    * For LIST partitioning, this is the number of times in a row that the
+    * datum we're looking for a partition for matches the datum in the
+    * last_found_datum_index index of the boundinfo->datums array.  For RANGE
+    * partitioning, this is the number of times in a row we've found that the
+    * datum we're looking for a partition for falls into the range of the
+    * partition corresponding to the last_found_datum_index index of the
+    * boundinfo->datums array.
+    */
+   int         last_found_count;
  } PartitionDescData;
author	David Rowley <drowley@postgresql.org>
	Mon, 1 Aug 2022 21:55:27 +0000 (09:55 +1200)
committer	David Rowley <drowley@postgresql.org>
	Mon, 1 Aug 2022 21:55:27 +0000 (09:55 +1200)
src/backend/executor/execPartition.c		patch \| blob \| blame \| history
src/backend/partitioning/partdesc.c		patch \| blob \| blame \| history
src/include/partitioning/partdesc.h		patch \| blob \| blame \| history