Collect and use histograms of lower and upper bounds for range types.

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Mon, 27 Aug 2012 12:48:46 +0000 (15:48 +0300)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Mon, 27 Aug 2012 12:58:46 +0000 (15:58 +0300)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 27 Aug 2012 12:48:46 +0000 (15:48 +0300)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 27 Aug 2012 12:58:46 +0000 (15:58 +0300)
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile

index a692086bdfd1fead182ce269ad8380e40472d366..a929f4ae197c736d1354eb2186c135b6519be539 100644 (file)
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -30,7 +30,8 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
     tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \
     tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \
     tsvector.o tsvector_op.o tsvector_parser.o \
-   txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o
+   txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o \
+   rangetypes_typanalyze.o rangetypes_selfuncs.o
  
  like.o: like.c like_match.c
  
diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c

index fe9e0c4bc6aeacae3ab6555ea534c49f20675dfa..f229a9d7b9136d1bef36bdc625d0e9571e6e5649 100644 (file)
--- a/src/backend/utils/adt/rangetypes.c
+++ b/src/backend/utils/adt/rangetypes.c
@@ -1228,23 +1228,6 @@ hash_range(PG_FUNCTION_ARGS)
     PG_RETURN_INT32(result);
  }
  
-/* ANALYZE support */
-
-/* typanalyze function for range datatypes */
-Datum
-range_typanalyze(PG_FUNCTION_ARGS)
-{
-   /*
-    * For the moment, just punt and don't analyze range columns.  If we get
-    * close to release without having a better answer, we could consider
-    * letting std_typanalyze do what it can ... but those stats are probably
-    * next door to useless for most activity with range columns, so it's not
-    * clear it's worth gathering them.
-    */
-   PG_RETURN_BOOL(false);
-}
-
-
  /*
   *----------------------------------------------------------
   * CANONICAL FUNCTIONS
diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c

new file mode 100644 (file)

index 0000000..ab2c6d6
--- /dev/null
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
@@ -0,0 +1,599 @@
+/*-------------------------------------------------------------------------
+ *
+ * rangetypes_selfuncs.c
+ *   Functions for selectivity estimation of range operators
+ *
+ * Estimates are based on histograms of lower and upper bounds, and the
+ * fraction of empty ranges.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *   src/backend/utils/adt/rangetypes_selfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_operator.h"
+#include "catalog/pg_statistic.h"
+#include "utils/lsyscache.h"
+#include "utils/rangetypes.h"
+#include "utils/selfuncs.h"
+#include "utils/typcache.h"
+
+static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
+             RangeType *constval, Oid operator);
+static double default_range_selectivity(Oid operator);
+static double calc_hist_selectivity(TypeCacheEntry *typcache,
+                     VariableStatData *vardata, RangeType *constval,
+                     Oid operator);
+static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
+                            RangeBound *constbound,
+                            RangeBound *hist, int hist_nvalues,
+                            bool equal);
+static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
+              RangeBound *hist, int hist_length, bool equal);
+static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
+            RangeBound *hist1, RangeBound *hist2);
+
+/*
+ * Returns a default selectivity estimate for given operator, when we don't
+ * have statistics or cannot use them for some reason.
+ */
+static double
+default_range_selectivity(Oid operator)
+{
+   switch (operator)
+   {
+       case OID_RANGE_OVERLAP_OP:
+           return 0.01;
+
+       case OID_RANGE_CONTAINS_OP:
+       case OID_RANGE_CONTAINED_OP:
+           return 0.005;
+
+       case OID_RANGE_CONTAINS_ELEM_OP:
+           /*
+            * "range @> elem" is more or less identical to a scalar
+            * inequality "A >= b AND A <= c".
+            */
+           return DEFAULT_RANGE_INEQ_SEL;
+
+       case OID_RANGE_LESS_OP:
+       case OID_RANGE_LESS_EQUAL_OP:
+       case OID_RANGE_GREATER_OP:
+       case OID_RANGE_GREATER_EQUAL_OP:
+       case OID_RANGE_LEFT_OP:
+       case OID_RANGE_RIGHT_OP:
+           /* these are similar to regular scalar inequalities */
+           return DEFAULT_INEQ_SEL;
+
+       default:
+           /* all range operators should be handled above, but just in case */
+           return 0.01;
+   }
+}
+
+/*
+ * rangesel -- restriction selectivity for range operators
+ */
+Datum
+rangesel(PG_FUNCTION_ARGS)
+{
+   PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+   Oid         operator = PG_GETARG_OID(1);
+   List       *args = (List *) PG_GETARG_POINTER(2);
+   int         varRelid = PG_GETARG_INT32(3);
+   VariableStatData vardata;
+   Node       *other;
+   bool        varonleft;
+   Selectivity selec;
+   TypeCacheEntry *typcache;
+   RangeType  *constrange = NULL;
+
+   /*
+    * If expression is not (variable op something) or (something op
+    * variable), then punt and return a default estimate.
+    */
+   if (!get_restriction_variable(root, args, varRelid,
+                                 &vardata, &other, &varonleft))
+       PG_RETURN_FLOAT8(default_range_selectivity(operator));
+
+   /*
+    * Can't do anything useful if the something is not a constant, either.
+    */
+   if (!IsA(other, Const))
+   {
+       ReleaseVariableStats(vardata);
+       PG_RETURN_FLOAT8(default_range_selectivity(operator));
+   }
+
+   /*
+    * All the range operators are strict, so we can cope with a NULL constant
+    * right away.
+    */
+   if (((Const *) other)->constisnull)
+   {
+       ReleaseVariableStats(vardata);
+       PG_RETURN_FLOAT8(0.0);
+   }
+
+   /*
+    * If var is on the right, commute the operator, so that we can assume the
+    * var is on the left in what follows.
+    */
+   if (!varonleft)
+   {
+       /* we have other Op var, commute to make var Op other */
+       operator = get_commutator(operator);
+       if (!operator)
+       {
+           /* Use default selectivity (should we raise an error instead?) */
+           ReleaseVariableStats(vardata);
+           PG_RETURN_FLOAT8(default_range_selectivity(operator));
+       }
+   }
+
+   typcache = range_get_typcache(fcinfo, vardata.vartype);
+
+   /*
+    * OK, there's a Var and a Const we're dealing with here.  We need the
+    * Const to be of same range type as the column, else we can't do anything
+    * useful. (Such cases will likely fail at runtime, but here we'd rather
+    * just return a default estimate.)
+    *
+    * If the operator is "range @> element", the constant should be of the
+    * element type of the range column. Convert it to a range that includes
+    * only that single point, so that we don't need special handling for
+    * that in what follows.
+    */
+   if (operator == OID_RANGE_CONTAINS_ELEM_OP)
+   {
+       if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
+       {
+           RangeBound lower, upper;
+           lower.inclusive = true;
+           lower.val = ((Const *) other)->constvalue;
+           lower.infinite = false;
+           lower.lower = true;
+           upper.inclusive = true;
+           upper.val = ((Const *) other)->constvalue;
+           upper.infinite = false;
+           upper.lower = false;
+           constrange = range_serialize(typcache, &lower, &upper, false);
+       }
+   }
+   else
+   {
+       if (((Const *) other)->consttype == vardata.vartype)
+           constrange = DatumGetRangeType(((Const *) other)->constvalue);
+   }
+
+   /*
+    * If we got a valid constant on one side of the operator, proceed to
+    * estimate using statistics. Otherwise punt and return a default
+    * constant estimate.
+    */
+   if (constrange)
+       selec = calc_rangesel(typcache, &vardata, constrange, operator);
+   else
+       selec = default_range_selectivity(operator);
+
+   ReleaseVariableStats(vardata);
+
+   CLAMP_PROBABILITY(selec);
+
+   PG_RETURN_FLOAT8((float8) selec);
+}
+
+static double
+calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
+             RangeType *constval, Oid operator)
+{
+   double      hist_selec;
+   double      selec;
+   float4      empty_frac, null_frac;
+
+   /*
+    * First look up the fraction of NULLs and empty ranges from pg_statistic.
+    */
+   if (HeapTupleIsValid(vardata->statsTuple))
+   {
+       Form_pg_statistic stats;
+       float4     *numbers;
+       int         nnumbers;
+
+       stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+       null_frac = stats->stanullfrac;
+
+       /* Try to get fraction of empty ranges */
+       if (get_attstatsslot(vardata->statsTuple,
+                            vardata->atttype, vardata->atttypmod,
+                            STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid,
+                            NULL,
+                            NULL, NULL,
+                            &numbers, &nnumbers))
+       {
+           if (nnumbers != 1)
+               elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
+           empty_frac = numbers[0];
+       }
+       else
+       {
+           /* No empty fraction statistic. Assume no empty ranges. */
+           empty_frac = 0.0;
+       }
+   }
+   else
+   {
+       /*
+        * No stats are available. Follow through the calculations below
+        * anyway, assuming no NULLs and no empty ranges. This still allows
+        * us to give a better-than-nothing estimate based on whether the
+        * constant is an empty range or not.
+        */
+       null_frac = 0.0;
+       empty_frac = 0.0;
+   }
+
+   if (RangeIsEmpty(constval))
+   {
+       /*
+        * An empty range matches all ranges, all empty ranges, or nothing,
+        * depending on the operator
+        */
+       switch (operator)
+       {
+           case OID_RANGE_OVERLAP_OP:
+           case OID_RANGE_OVERLAPS_LEFT_OP:
+           case OID_RANGE_OVERLAPS_RIGHT_OP:
+           case OID_RANGE_LEFT_OP:
+           case OID_RANGE_RIGHT_OP:
+               /* these return false if either argument is empty */
+               selec = 0.0;
+               break;
+
+           case OID_RANGE_CONTAINED_OP:
+           case OID_RANGE_LESS_EQUAL_OP:
+           case OID_RANGE_GREATER_EQUAL_OP:
+               /*
+                * these return true when both args are empty, false if only
+                * one is empty
+                */
+               selec = empty_frac;
+               break;
+
+           case OID_RANGE_CONTAINS_OP:
+               /* everything contains an empty range */
+               selec = 1.0;
+               break;
+
+           case OID_RANGE_CONTAINS_ELEM_OP:
+           default:
+               elog(ERROR, "unexpected operator %u", operator);
+               selec = 0.0; /* keep compiler quiet */
+               break;
+       }
+   }
+   else
+   {
+       /*
+        * Calculate selectivity using bound histograms. If that fails for
+        * some reason, e.g no histogram in pg_statistic, use the default
+        * constant estimate for the fraction of non-empty values. This is
+        * still somewhat better than just returning the default estimate,
+        * because this still takes into account the fraction of empty and
+        * NULL tuples, if we had statistics for them.
+        */
+       hist_selec = calc_hist_selectivity(typcache, vardata, constval,
+                                          operator);
+       if (hist_selec < 0.0)
+           hist_selec = default_range_selectivity(operator);
+
+       /*
+        * Now merge the results for the empty ranges and histogram
+        * calculations, realizing that the histogram covers only the
+        * non-null, non-empty values.
+        */
+       if (operator == OID_RANGE_CONTAINED_OP)
+       {
+           /* empty is contained by anything non-empty */
+           selec = (1.0 - empty_frac) * hist_selec + empty_frac;
+       }
+       else
+       {
+           /* with any other operator, empty Op non-empty matches nothing */
+           selec = (1.0 - empty_frac) * hist_selec;
+       }
+   }
+
+   /* all range operators are strict */
+   selec *= (1.0 - null_frac);
+
+   /* result should be in range, but make sure... */
+   CLAMP_PROBABILITY(selec);
+
+   return selec;
+}
+
+/*
+ * Calculate range operator selectivity using histograms of range bounds.
+ *
+ * This estimate is for the portion of values that are not empty and not
+ * NULL.
+ */
+static double
+calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
+                     RangeType *constval, Oid operator)
+{
+   Datum      *hist_values;
+   int         nhist;
+   RangeBound *hist_lower;
+   RangeBound *hist_upper;
+   int         i;
+   RangeBound  const_lower;
+   RangeBound  const_upper;
+   bool        empty;
+   double      hist_selec;
+
+   /* Try to get histogram of ranges */
+   if (!(HeapTupleIsValid(vardata->statsTuple) &&
+         get_attstatsslot(vardata->statsTuple,
+                          vardata->atttype, vardata->atttypmod,
+                          STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
+                          NULL,
+                          &hist_values, &nhist,
+                          NULL, NULL)))
+       return -1.0;
+
+   /*
+    * Convert histogram of ranges into histograms of its lower and upper
+    * bounds.
+    */
+   hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
+   hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
+   for (i = 0; i < nhist; i++)
+   {
+       range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
+                         &hist_lower[i], &hist_upper[i], &empty);
+       /* The histogram should not contain any empty ranges */
+       if (empty)
+           elog(ERROR, "bounds histogram contains an empty range");
+   }
+
+   /* Extract the bounds of the constant value. */
+   range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
+   Assert (!empty);
+
+   /*
+    * Calculate selectivity comparing the lower or upper bound of the
+    * constant with the histogram of lower or upper bounds.
+    */
+   switch (operator)
+   {
+       case OID_RANGE_LESS_OP:
+           /*
+            * The regular b-tree comparison operators (<, <=, >, >=) compare
+            * the lower bounds first, and the upper bounds for values with
+            * equal lower bounds. Estimate that by comparing the lower bounds
+            * only. This gives a fairly accurate estimate assuming there
+            * aren't many rows with a lower bound equal to the constant's
+            * lower bound.
+            */
+           hist_selec =
+               calc_hist_selectivity_scalar(typcache, &const_lower,
+                                            hist_lower, nhist, false);
+           break;
+
+       case OID_RANGE_LESS_EQUAL_OP:
+           hist_selec =
+               calc_hist_selectivity_scalar(typcache, &const_lower,
+                                            hist_lower, nhist, true);
+           break;
+
+       case OID_RANGE_GREATER_OP:
+           hist_selec =
+               1 - calc_hist_selectivity_scalar(typcache, &const_lower,
+                                                hist_lower, nhist, true);
+           break;
+
+       case OID_RANGE_GREATER_EQUAL_OP:
+           hist_selec =
+               1 - calc_hist_selectivity_scalar(typcache, &const_lower,
+                                                hist_lower, nhist, false);
+           break;
+
+       case OID_RANGE_LEFT_OP:
+           /* var << const when upper(var) < lower(const) */
+           hist_selec =
+               calc_hist_selectivity_scalar(typcache, &const_lower,
+                                            hist_upper, nhist, false);
+           break;
+
+       case OID_RANGE_RIGHT_OP:
+           /* var >> const when lower(var) > upper(const) */
+           hist_selec =
+               1 - calc_hist_selectivity_scalar(typcache, &const_upper,
+                                                hist_lower, nhist, true);
+           break;
+
+       case OID_RANGE_OVERLAPS_RIGHT_OP:
+           /* compare lower bounds */
+           hist_selec =
+               1 - calc_hist_selectivity_scalar(typcache, &const_lower,
+                                                hist_lower, nhist, false);
+           break;
+
+       case OID_RANGE_OVERLAPS_LEFT_OP:
+           /* compare upper bounds */
+           hist_selec =
+               calc_hist_selectivity_scalar(typcache, &const_upper,
+                                            hist_upper, nhist, true);
+           break;
+
+       case OID_RANGE_OVERLAP_OP:
+       case OID_RANGE_CONTAINS_ELEM_OP:
+           /*
+            * A && B <=> NOT (A << B OR A >> B).
+            *
+            * "range @> elem" is equivalent to "range && [elem,elem]". The
+            * caller already constructed the singular range from the element
+            * constant, so just treat it the same as &&.
+            */
+           hist_selec =
+               calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
+                                            nhist, false);
+           hist_selec +=
+               (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
+                                                 nhist, true));
+           hist_selec = 1.0 - hist_selec;
+           break;
+
+       case OID_RANGE_CONTAINS_OP:
+       case OID_RANGE_CONTAINED_OP:
+           /* TODO: not implemented yet */
+           hist_selec = -1.0;
+           break;
+
+       default:
+           elog(ERROR, "unknown range operator %u", operator);
+           hist_selec = -1.0; /* keep compiler quiet */
+           break;
+   }
+
+   return hist_selec;
+}
+
+
+/*
+ * Look up the fraction of values less than (or equal, if 'equal' argument
+ * is true) a given const in a histogram of range bounds.
+ */
+static double
+calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
+                            RangeBound *hist, int hist_nvalues, bool equal)
+{
+   Selectivity selec;
+   int         index;
+
+   /*
+    * Find the histogram bin the given constant falls into. Estimate
+    * selectivity as the number of preceding whole bins.
+    */
+   index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
+   selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
+
+   /* Adjust using linear interpolation within the bin */
+   if (index >= 0 && index < hist_nvalues - 1)
+       selec += get_position(typcache, constbound, &hist[index],
+                       &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
+
+   return selec;
+}
+
+/*
+ * Binary search on an array of range bounds. Returns greatest index of range
+ * bound in array which is less than given range bound. If all range bounds in
+ * array are greater or equal than given range bound, return -1.
+ */
+static int
+rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
+             int hist_length, bool equal)
+{
+   int         lower = -1,
+               upper = hist_length - 1,
+               cmp,
+               middle;
+
+   while (lower < upper)
+   {
+       middle = (lower + upper + 1) / 2;
+       cmp = range_cmp_bounds(typcache, &hist[middle], value);
+
+       if (cmp < 0 || (equal && cmp == 0))
+           lower = middle;
+       else
+           upper = middle - 1;
+   }
+   return lower;
+}
+
+/*
+ * Get relative position of value in histogram bin in [0,1] range.
+ */
+static float8
+get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
+            RangeBound *hist2)
+{
+   bool        has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
+   float8      position;
+
+   if (!hist1->infinite && !hist2->infinite)
+   {
+       float8      bin_width;
+
+       /*
+        * Both bounds are finite. Assuming the subtype's comparison function
+        * works sanely, the value must be finite, too, because it lies
+        * somewhere between the bounds. If it doesn't, just return something.
+        */
+       if (value->infinite)
+           return 0.5;
+
+       /* Can't interpolate without subdiff function */
+       if (!has_subdiff)
+           return 0.5;
+
+       /* Calculate relative position using subdiff function. */
+       bin_width = DatumGetFloat8(FunctionCall2Coll(
+                                               &typcache->rng_subdiff_finfo,
+                                                    typcache->rng_collation,
+                                                    hist2->val,
+                                                    hist1->val));
+       if (bin_width <= 0.0)
+           return 0.5;     /* zero width bin */
+
+       position = DatumGetFloat8(FunctionCall2Coll(
+                                               &typcache->rng_subdiff_finfo,
+                                                   typcache->rng_collation,
+                                                   value->val,
+                                                   hist1->val))
+           / bin_width;
+
+       /* Relative position must be in [0,1] range */
+       position = Max(position, 0.0);
+       position = Min(position, 1.0);
+       return position;
+   }
+   else if (hist1->infinite && !hist2->infinite)
+   {
+       /*
+        * Lower bin boundary is -infinite, upper is finite. If the value is
+        * -infinite, return 0.0 to indicate it's equal to the lower bound.
+        * Otherwise return 1.0 to indicate it's infinitely far from the lower
+        * bound.
+        */
+       return ((value->infinite && value->lower) ? 0.0 : 1.0);
+   }
+   else if (!hist1->infinite && hist2->infinite)
+   {
+       /* same as above, but in reverse */
+       return ((value->infinite && !value->lower) ? 1.0 : 0.0);
+   }
+   else
+   {
+       /*
+        * If both bin boundaries are infinite, they should be equal to each
+        * other, and the value should also be infinite and equal to both
+        * bounds. (But don't Assert that, to avoid crashing if a user creates
+        * a datatype with a broken comparison function).
+        *
+        * Assume the value to lie in the middle of the infinite bounds.
+        */
+       return 0.5;
+   }
+}
+
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c

new file mode 100644 (file)

index 0000000..e276ad8
--- /dev/null
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -0,0 +1,232 @@
+/*-------------------------------------------------------------------------
+ *
+ * ragetypes_typanalyze.c
+ *   Functions for gathering statistics from range columns
+ *
+ * For a range type column, histograms of lower and upper bounds, and
+ * the fraction of NULL and empty ranges are collected.
+ *
+ * Both histograms have the same length, and they are combined into a
+ * single array of ranges. This has the same shape as the histogram that
+ * std_typanalyze would collect, but the values are different. Each range
+ * in the array is a valid range, even though the lower and upper bounds
+ * come from different tuples. In theory, the standard scalar selectivity
+ * functions could be used with the combined histogram.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *   src/backend/utils/adt/rangetypes_typanalyze.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_operator.h"
+#include "commands/vacuum.h"
+#include "utils/builtins.h"
+#include "utils/rangetypes.h"
+
+static void compute_range_stats(VacAttrStats *stats,
+          AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
+
+/*
+ * range_typanalyze -- typanalyze function for range columns
+ */
+Datum
+range_typanalyze(PG_FUNCTION_ARGS)
+{
+   VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
+   TypeCacheEntry *typcache;
+   Form_pg_attribute attr = stats->attr;
+
+   /* Get information about range type */
+   typcache = range_get_typcache(fcinfo, stats->attrtypid);
+
+   if (attr->attstattarget < 0)
+        attr->attstattarget = default_statistics_target;
+
+   stats->compute_stats = compute_range_stats;
+   stats->extra_data = typcache;
+   /* same as in std_typanalyze */
+   stats->minrows = 300 * attr->attstattarget;
+
+   PG_RETURN_BOOL(true);
+}
+
+/*
+ * Comparison function for sorting RangeBounds.
+ */
+static int
+range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
+{
+   RangeBound *b1 = (RangeBound *)a1;
+   RangeBound *b2 = (RangeBound *)a2;
+   TypeCacheEntry *typcache = (TypeCacheEntry *)arg;
+
+   return range_cmp_bounds(typcache, b1, b2);
+}
+
+/*
+ * compute_range_stats() -- compute statistics for a range column
+ */
+static void
+compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
+                   int samplerows, double totalrows)
+{
+   TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
+   int         null_cnt = 0;
+   int         non_null_cnt = 0;
+   int         non_empty_cnt = 0;
+   int         empty_cnt = 0;
+   int         range_no;
+   int         slot_idx;
+   int         num_bins = stats->attr->attstattarget;
+   int         num_hist;
+   RangeBound *lowers, *uppers;
+   double      total_width = 0;
+
+   /* Allocate memory for arrays of range bounds. */
+   lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
+   uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
+
+   /* Loop over the sample ranges. */
+   for (range_no = 0; range_no < samplerows; range_no++)
+   {
+       Datum       value;
+       bool        isnull,
+                   empty;
+       RangeType  *range;
+       RangeBound  lower,
+                   upper;
+
+       vacuum_delay_point();
+
+       value = fetchfunc(stats, range_no, &isnull);
+       if (isnull)
+       {
+           /* range is null, just count that */
+           null_cnt++;
+           continue;
+       }
+
+       /*
+        * XXX: should we ignore wide values, like std_typanalyze does, to
+        * avoid bloating the statistics table?
+        */
+       total_width += VARSIZE_ANY(DatumGetPointer(value));
+
+       /* Get range and deserialize it for further analysis. */
+       range = DatumGetRangeType(value);
+       range_deserialize(typcache, range, &lower, &upper, &empty);
+
+       if (!empty)
+       {
+           /* Fill bound values for further usage in histograms */
+           lowers[non_empty_cnt] = lower;
+           uppers[non_empty_cnt] = upper;
+           non_empty_cnt++;
+       }
+       else
+           empty_cnt++;
+
+       non_null_cnt++;
+   }
+
+   slot_idx = 0;
+
+   /* We can only compute real stats if we found some non-null values. */
+   if (non_null_cnt > 0)
+   {
+       Datum      *bound_hist_values;
+       int         pos,
+                   posfrac,
+                   delta,
+                   deltafrac,
+                   i;
+       MemoryContext old_cxt;
+       float4     *emptyfrac;
+
+       stats->stats_valid = true;
+       /* Do the simple null-frac and width stats */
+       stats->stanullfrac = (double) null_cnt / (double) samplerows;
+       stats->stawidth = total_width / (double) non_null_cnt;
+       stats->stadistinct = -1.0;
+
+       /* Must copy the target values into anl_context */
+       old_cxt = MemoryContextSwitchTo(stats->anl_context);
+
+       if (non_empty_cnt > 0)
+       {
+           /* Sort bound values */
+           qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
+                     range_bound_qsort_cmp, typcache);
+           qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
+                     range_bound_qsort_cmp, typcache);
+
+           num_hist = non_empty_cnt;
+           if (num_hist > num_bins)
+               num_hist = num_bins + 1;
+
+           bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
+
+           /*
+            * The object of this loop is to construct ranges from first and
+            * last entries in lowers[] and uppers[] along with evenly-spaced
+            * values in between. So the i'th value is a range of
+            * lowers[(i * (nvals - 1)) / (num_hist - 1)] and
+            * uppers[(i * (nvals - 1)) / (num_hist - 1)]. But computing that
+            * subscript directly risks integer overflow when the stats target
+            * is more than a couple thousand.  Instead we add
+            * (nvals - 1) / (num_hist - 1) to pos at each step, tracking the
+            * integral and fractional parts of the sum separately.
+            */
+           delta = (non_empty_cnt - 1) / (num_hist - 1);
+           deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
+           pos = posfrac = 0;
+
+           for (i = 0; i < num_hist; i++)
+           {
+               bound_hist_values[i] = PointerGetDatum(range_serialize(
+                               typcache, &lowers[pos], &uppers[pos], false));
+               pos += delta;
+               posfrac += deltafrac;
+               if (posfrac >= (num_hist - 1))
+               {
+                   /* fractional part exceeds 1, carry to integer part */
+                   pos++;
+                   posfrac -= (num_hist - 1);
+               }
+           }
+
+           stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
+           stats->stavalues[slot_idx] = bound_hist_values;
+           stats->numvalues[slot_idx] = num_hist;
+           slot_idx++;
+       }
+
+       /* Store the fraction of empty ranges */
+       emptyfrac = (float4 *) palloc(sizeof(float4));
+       *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
+       stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_EMPTY_FRAC;
+       stats->stanumbers[slot_idx] = emptyfrac;
+       stats->numnumbers[slot_idx] = 1;
+       slot_idx++;
+
+       MemoryContextSwitchTo(old_cxt);
+   }
+   else if (null_cnt > 0)
+   {
+       /* We found only nulls; assume the column is entirely null */
+       stats->stats_valid = true;
+       stats->stanullfrac = 1.0;
+       stats->stawidth = 0;        /* "unknown" */
+       stats->stadistinct = 0.0;   /* "unknown" */
+   }
+   /*
+    * We don't need to bother cleaning up any of our temporary palloc's. The
+    * hashtable should also go away, as it used a child memory context.
+    */
+}
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index da84f625203822211900c45a7b5b675b8bf170e7..3fb0b5b227c43342f27fe64151a46b2fb082386b 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
   */
  
  /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 201208161
+#define CATALOG_VERSION_NO 201208271
  
  #endif
diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h

index 94702541f55642862367cb505ef1b919fd5851e0..abfec5cdb9d4ac1356bfc3eec5c82312e20810e1 100644 (file)
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -1676,32 +1676,45 @@ DATA(insert OID = 3882 (  "="      PGNSP PGUID b t t 3831 3831 16 3882 3883 range_
  DESCR("equal");
  DATA(insert OID = 3883 (  "<>"    PGNSP PGUID b f f 3831 3831 16 3883 3882 range_ne neqsel neqjoinsel ));
  DESCR("not equal");
-DATA(insert OID = 3884 (  "<"     PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt scalarltsel scalarltjoinsel ));
+DATA(insert OID = 3884 (  "<"     PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt rangesel scalarltjoinsel ));
  DESCR("less than");
-DATA(insert OID = 3885 (  "<="    PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le scalarltsel scalarltjoinsel ));
+#define OID_RANGE_LESS_OP 3884
+DATA(insert OID = 3885 (  "<="    PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le rangesel scalarltjoinsel ));
  DESCR("less than or equal");
-DATA(insert OID = 3886 (  ">="    PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge scalargtsel scalargtjoinsel ));
+#define OID_RANGE_LESS_EQUAL_OP 3885
+DATA(insert OID = 3886 (  ">="    PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge rangesel scalargtjoinsel ));
  DESCR("greater than or equal");
-DATA(insert OID = 3887 (  ">"     PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt scalargtsel scalargtjoinsel ));
+#define OID_RANGE_GREATER_OP 3886
+DATA(insert OID = 3887 (  ">"     PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt rangesel scalargtjoinsel ));
  DESCR("greater than");
-DATA(insert OID = 3888 (  "&&"    PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps areasel areajoinsel ));
+#define OID_RANGE_GREATER_EQUAL_OP 3887
+DATA(insert OID = 3888 (  "&&"    PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps rangesel areajoinsel ));
  DESCR("overlaps");
-DATA(insert OID = 3889 (  "@>"    PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem contsel contjoinsel ));
+#define OID_RANGE_OVERLAP_OP 3888
+DATA(insert OID = 3889 (  "@>"    PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem rangesel contjoinsel ));
  DESCR("contains");
-DATA(insert OID = 3890 (  "@>"    PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains contsel contjoinsel ));
+#define OID_RANGE_CONTAINS_ELEM_OP 3889
+DATA(insert OID = 3890 (  "@>"    PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains rangesel contjoinsel ));
  DESCR("contains");
-DATA(insert OID = 3891 (  "<@"    PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range contsel contjoinsel ));
+#define OID_RANGE_CONTAINS_OP 3890
+DATA(insert OID = 3891 (  "<@"    PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range rangesel contjoinsel ));
  DESCR("is contained by");
-DATA(insert OID = 3892 (  "<@"    PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by contsel contjoinsel ));
+#define OID_RANGE_ELEM_CONTAINED_OP 3891
+DATA(insert OID = 3892 (  "<@"    PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by rangesel contjoinsel ));
  DESCR("is contained by");
-DATA(insert OID = 3893 (  "<<"    PGNSP PGUID b f f 3831 3831 16 3894 0 range_before scalarltsel scalarltjoinsel ));
+#define OID_RANGE_CONTAINED_OP 3892
+DATA(insert OID = 3893 (  "<<"    PGNSP PGUID b f f 3831 3831 16 3894 0 range_before rangesel scalarltjoinsel ));
  DESCR("is left of");
-DATA(insert OID = 3894 (  ">>"    PGNSP PGUID b f f 3831 3831 16 3893 0 range_after scalargtsel scalargtjoinsel ));
+#define OID_RANGE_LEFT_OP 3893
+DATA(insert OID = 3894 (  ">>"    PGNSP PGUID b f f 3831 3831 16 3893 0 range_after rangesel scalargtjoinsel ));
  DESCR("is right of");
-DATA(insert OID = 3895 (  "&<"    PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft scalarltsel scalarltjoinsel ));
+#define OID_RANGE_RIGHT_OP 3894
+DATA(insert OID = 3895 (  "&<"    PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft rangesel scalarltjoinsel ));
  DESCR("overlaps or is left of");
-DATA(insert OID = 3896 (  "&>"    PGNSP PGUID b f f 3831 3831 16 0 0 range_overright scalargtsel scalargtjoinsel ));
+#define OID_RANGE_OVERLAPS_LEFT_OP 3895
+DATA(insert OID = 3896 (  "&>"    PGNSP PGUID b f f 3831 3831 16 0 0 range_overright rangesel scalargtjoinsel ));
  DESCR("overlaps or is right of");
+#define OID_RANGE_OVERLAPS_RIGHT_OP 3896
  DATA(insert OID = 3897 (  "-|-"    PGNSP PGUID b f f 3831 3831 16 3897 0 range_adjacent contsel contjoinsel ));
  DESCR("is adjacent to");
  DATA(insert OID = 3898 (  "+"     PGNSP PGUID b f f 3831 3831 3831 3898 0 range_union - - ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h

index 51449a4f1c5a2a2d95aae6059be5e4cae4c18f60..77a3b413ce525c5c5854f1ce60964c0c6812552c 100644 (file)
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4544,6 +4544,8 @@ DATA(insert OID = 3902 (  hash_range          PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0
  DESCR("hash a range");
  DATA(insert OID = 3916 (  range_typanalyze     PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ range_typanalyze _null_ _null_ _null_ ));
  DESCR("range typanalyze");
+DATA(insert OID = 3169 (  rangesel             PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ rangesel _null_ _null_ _null_ ));
+DESCR("restriction selectivity for range operators");
  
  DATA(insert OID = 3914 (  int4range_canonical         PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 3904 "3904" _null_ _null_ _null_ _null_ int4range_canonical _null_ _null_ _null_ ));
  DESCR("convert an int4 range to canonical form");
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h

index 8724cb50fa1d9451e99b2033c66fa1b96b1cd703..5df685e4bcbf1d66d862dbb1c8ded06c8f578a9c 100644 (file)
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -268,4 +268,22 @@ typedef FormData_pg_statistic *Form_pg_statistic;
   */
  #define STATISTIC_KIND_DECHIST 5
  
+/*
+ * An "empty frac" slot describes the fraction of empty ranges in a range-type
+ * column.  stavalues is not used and should be NULL.  stanumbers contains a
+ * single entry, the fraction of empty ranges (0.0 to 1.0).
+ */
+#define STATISTIC_KIND_RANGE_EMPTY_FRAC  6
+
+/*
+ * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for
+ * a range-type column.  stavalues contains M (>=2) range values that divide
+ * the column data values into M-1 bins of approximately equal population.
+ * Unlike a regular scalar histogram, this is actually two histograms combined
+ * into a single array, with the lower bounds of each value forming a
+ * histogram of lower bounds, and the upper bounds a histogram of upper
+ * bounds.  Only non-NULL, non-empty ranges are included.
+ */
+#define STATISTIC_KIND_BOUNDS_HISTOGRAM  7
+
  #endif   /* PG_STATISTIC_H */
diff --git a/src/include/utils/rangetypes.h b/src/include/utils/rangetypes.h

index fb0b23b4be08f97dbb704886488cc87191fecbec..a7eeda05c67e1db91356607fddf126763b5186e1 100644 (file)
--- a/src/include/utils/rangetypes.h
+++ b/src/include/utils/rangetypes.h
@@ -170,6 +170,7 @@ extern Datum hash_range(PG_FUNCTION_ARGS);
  
  /* ANALYZE support */
  extern Datum range_typanalyze(PG_FUNCTION_ARGS);
+extern Datum rangesel(PG_FUNCTION_ARGS);
  
  /* Canonical functions */
  extern Datum int4range_canonical(PG_FUNCTION_ARGS);
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Mon, 27 Aug 2012 12:48:46 +0000 (15:48 +0300)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Mon, 27 Aug 2012 12:58:46 +0000 (15:58 +0300)
src/backend/utils/adt/Makefile		patch \| blob \| blame \| history
src/backend/utils/adt/rangetypes.c		patch \| blob \| blame \| history
src/backend/utils/adt/rangetypes_selfuncs.c	[new file with mode: 0644]	patch \| blob
src/backend/utils/adt/rangetypes_typanalyze.c	[new file with mode: 0644]	patch \| blob
src/include/catalog/catversion.h		patch \| blob \| blame \| history
src/include/catalog/pg_operator.h		patch \| blob \| blame \| history
src/include/catalog/pg_proc.h		patch \| blob \| blame \| history
src/include/catalog/pg_statistic.h		patch \| blob \| blame \| history
src/include/utils/rangetypes.h		patch \| blob \| blame \| history