Allow left join removals and unique joins on partitioned tables
author David Rowley <drowley@postgresql.org>
Mon, 9 Jan 2023 04:15:08 +0000 (17:15 +1300)
committer David Rowley <drowley@postgresql.org>
Mon, 9 Jan 2023 04:15:08 +0000 (17:15 +1300)
This allows left join removals and unique joins to work with partitioned
tables.  The planner just lacked sufficient proofs that a given join
would not cause any row duplication.  Unique indexes currently serve as
that proof, so have get_relation_info() populate the indexlist for
partitioned tables too.

Author: Arne Roland
Reviewed-by: Alvaro Herrera, Zhihong Yu, Amit Langote, David Rowley
Discussion: https://postgr.es/m/c3b2408b7a39433b8230bbcd02e9f302@index.de

src/backend/optimizer/util/plancat.c
src/backend/utils/adt/selfuncs.c
src/include/nodes/pathnodes.h
src/test/regress/expected/join.out
src/test/regress/expected/partition_join.out
src/test/regress/sql/join.sql
src/test/regress/sql/partition_join.sql

index 9f158f2421b0f7be3b6da784b098298d557bc410..d58c4a10782740b8d6dcfe7377fe79bf341ce4f9 100644 (file)
@@ -109,7 +109,9 @@ static void set_baserel_partition_constraint(Relation relation,
  * If inhparent is true, all we need to do is set up the attr arrays:
  * the RelOptInfo actually represents the appendrel formed by an inheritance
  * tree, and so the parent rel's physical size and index information isn't
- * important for it.
+ * important for it.  However, for partitioned tables, we do populate the
+ * indexlist as the planner uses unique indexes as unique proofs for certain
+ * optimizations.
  */
 void
 get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@@ -175,10 +177,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
    /*
     * Make list of indexes.  Ignore indexes on system catalogs if told to.
-    * Don't bother with indexes for an inheritance parent, either.
+    * Don't bother with indexes from traditional inheritance parents.  For
+    * partitioned tables, we need a list of at least unique indexes as these
+    * serve as unique proofs for certain planner optimizations.  However,
+    * let's not discriminate here and just record all partitioned indexes
+    * whether they're unique indexes or not.
     */
-   if (inhparent ||
-       (IgnoreSystemIndexes && IsSystemRelation(relation)))
+   if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+       || (IgnoreSystemIndexes && IsSystemRelation(relation)))
        hasindex = false;
    else
        hasindex = relation->rd_rel->relhasindex;
@@ -231,16 +237,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
                continue;
            }
 
-           /*
-            * Ignore partitioned indexes, since they are not usable for
-            * queries.
-            */
-           if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
-           {
-               index_close(indexRelation, NoLock);
-               continue;
-           }
-
            /*
             * If the index is valid, but cannot yet be used, ignore it; but
             * mark the plan we are generating as transient. See
@@ -285,105 +281,129 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
            info->relam = indexRelation->rd_rel->relam;
 
-           /* We copy just the fields we need, not all of rd_indam */
-           amroutine = indexRelation->rd_indam;
-           info->amcanorderbyop = amroutine->amcanorderbyop;
-           info->amoptionalkey = amroutine->amoptionalkey;
-           info->amsearcharray = amroutine->amsearcharray;
-           info->amsearchnulls = amroutine->amsearchnulls;
-           info->amcanparallel = amroutine->amcanparallel;
-           info->amhasgettuple = (amroutine->amgettuple != NULL);
-           info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
-               relation->rd_tableam->scan_bitmap_next_block != NULL;
-           info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
-                                 amroutine->amrestrpos != NULL);
-           info->amcostestimate = amroutine->amcostestimate;
-           Assert(info->amcostestimate != NULL);
-
-           /* Fetch index opclass options */
-           info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
-
            /*
-            * Fetch the ordering information for the index, if any.
+            * We don't have an AM for partitioned indexes, so we'll just
+            * NULLify the AM related fields for those.
             */
-           if (info->relam == BTREE_AM_OID)
+           if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
            {
+               /* We copy just the fields we need, not all of rd_indam */
+               amroutine = indexRelation->rd_indam;
+               info->amcanorderbyop = amroutine->amcanorderbyop;
+               info->amoptionalkey = amroutine->amoptionalkey;
+               info->amsearcharray = amroutine->amsearcharray;
+               info->amsearchnulls = amroutine->amsearchnulls;
+               info->amcanparallel = amroutine->amcanparallel;
+               info->amhasgettuple = (amroutine->amgettuple != NULL);
+               info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+                   relation->rd_tableam->scan_bitmap_next_block != NULL;
+               info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
+                                     amroutine->amrestrpos != NULL);
+               info->amcostestimate = amroutine->amcostestimate;
+               Assert(info->amcostestimate != NULL);
+
+               /* Fetch index opclass options */
+               info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
+
                /*
-                * If it's a btree index, we can use its opfamily OIDs
-                * directly as the sort ordering opfamily OIDs.
+                * Fetch the ordering information for the index, if any.
                 */
-               Assert(amroutine->amcanorder);
-
-               info->sortopfamily = info->opfamily;
-               info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-               info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
-
-               for (i = 0; i < nkeycolumns; i++)
+               if (info->relam == BTREE_AM_OID)
                {
-                   int16       opt = indexRelation->rd_indoption[i];
+                   /*
+                    * If it's a btree index, we can use its opfamily OIDs
+                    * directly as the sort ordering opfamily OIDs.
+                    */
+                   Assert(amroutine->amcanorder);
 
-                   info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-                   info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-               }
-           }
-           else if (amroutine->amcanorder)
-           {
-               /*
-                * Otherwise, identify the corresponding btree opfamilies by
-                * trying to map this index's "<" operators into btree.  Since
-                * "<" uniquely defines the behavior of a sort order, this is
-                * a sufficient test.
-                *
-                * XXX This method is rather slow and also requires the
-                * undesirable assumption that the other index AM numbers its
-                * strategies the same as btree.  It'd be better to have a way
-                * to explicitly declare the corresponding btree opfamily for
-                * each opfamily of the other index type.  But given the lack
-                * of current or foreseeable amcanorder index types, it's not
-                * worth expending more effort on now.
-                */
-               info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
-               info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-               info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+                   info->sortopfamily = info->opfamily;
+                   info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
+                   info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
 
-               for (i = 0; i < nkeycolumns; i++)
-               {
-                   int16       opt = indexRelation->rd_indoption[i];
-                   Oid         ltopr;
-                   Oid         btopfamily;
-                   Oid         btopcintype;
-                   int16       btstrategy;
-
-                   info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-                   info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-
-                   ltopr = get_opfamily_member(info->opfamily[i],
-                                               info->opcintype[i],
-                                               info->opcintype[i],
-                                               BTLessStrategyNumber);
-                   if (OidIsValid(ltopr) &&
-                       get_ordering_op_properties(ltopr,
-                                                  &btopfamily,
-                                                  &btopcintype,
-                                                  &btstrategy) &&
-                       btopcintype == info->opcintype[i] &&
-                       btstrategy == BTLessStrategyNumber)
+                   for (i = 0; i < nkeycolumns; i++)
                    {
-                       /* Successful mapping */
-                       info->sortopfamily[i] = btopfamily;
+                       int16       opt = indexRelation->rd_indoption[i];
+
+                       info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+                       info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
                    }
-                   else
+               }
+               else if (amroutine->amcanorder)
+               {
+                   /*
+                    * Otherwise, identify the corresponding btree opfamilies
+                    * by trying to map this index's "<" operators into btree.
+                    * Since "<" uniquely defines the behavior of a sort
+                    * order, this is a sufficient test.
+                    *
+                    * XXX This method is rather slow and also requires the
+                    * undesirable assumption that the other index AM numbers
+                    * its strategies the same as btree.  It'd be better to
+                    * have a way to explicitly declare the corresponding
+                    * btree opfamily for each opfamily of the other index
+                    * type.  But given the lack of current or foreseeable
+                    * amcanorder index types, it's not worth expending more
+                    * effort on now.
+                    */
+                   info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
+                   info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
+                   info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+
+                   for (i = 0; i < nkeycolumns; i++)
                    {
-                       /* Fail ... quietly treat index as unordered */
-                       info->sortopfamily = NULL;
-                       info->reverse_sort = NULL;
-                       info->nulls_first = NULL;
-                       break;
+                       int16       opt = indexRelation->rd_indoption[i];
+                       Oid         ltopr;
+                       Oid         btopfamily;
+                       Oid         btopcintype;
+                       int16       btstrategy;
+
+                       info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+                       info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
+
+                       ltopr = get_opfamily_member(info->opfamily[i],
+                                                   info->opcintype[i],
+                                                   info->opcintype[i],
+                                                   BTLessStrategyNumber);
+                       if (OidIsValid(ltopr) &&
+                           get_ordering_op_properties(ltopr,
+                                                      &btopfamily,
+                                                      &btopcintype,
+                                                      &btstrategy) &&
+                           btopcintype == info->opcintype[i] &&
+                           btstrategy == BTLessStrategyNumber)
+                       {
+                           /* Successful mapping */
+                           info->sortopfamily[i] = btopfamily;
+                       }
+                       else
+                       {
+                           /* Fail ... quietly treat index as unordered */
+                           info->sortopfamily = NULL;
+                           info->reverse_sort = NULL;
+                           info->nulls_first = NULL;
+                           break;
+                       }
                    }
                }
+               else
+               {
+                   info->sortopfamily = NULL;
+                   info->reverse_sort = NULL;
+                   info->nulls_first = NULL;
+               }
            }
            else
            {
+               info->amcanorderbyop = false;
+               info->amoptionalkey = false;
+               info->amsearcharray = false;
+               info->amsearchnulls = false;
+               info->amcanparallel = false;
+               info->amhasgettuple = false;
+               info->amhasgetbitmap = false;
+               info->amcanmarkpos = false;
+               info->amcostestimate = NULL;
+
                info->sortopfamily = NULL;
                info->reverse_sort = NULL;
                info->nulls_first = NULL;
@@ -416,31 +436,45 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
             * the number-of-tuples estimate to equal the parent table; if it
             * is partial then we have to use the same methods as we would for
             * a table, except we can be sure that the index is not larger
-            * than the table.
+            * than the table.  We must ignore partitioned indexes here as
+            * they are not physical indexes.
             */
-           if (info->indpred == NIL)
+           if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
            {
-               info->pages = RelationGetNumberOfBlocks(indexRelation);
-               info->tuples = rel->tuples;
-           }
-           else
-           {
-               double      allvisfrac; /* dummy */
-
-               estimate_rel_size(indexRelation, NULL,
-                                 &info->pages, &info->tuples, &allvisfrac);
-               if (info->tuples > rel->tuples)
+               if (info->indpred == NIL)
+               {
+                   info->pages = RelationGetNumberOfBlocks(indexRelation);
                    info->tuples = rel->tuples;
-           }
+               }
+               else
+               {
+                   double      allvisfrac; /* dummy */
 
-           if (info->relam == BTREE_AM_OID)
-           {
-               /* For btrees, get tree height while we have the index open */
-               info->tree_height = _bt_getrootheight(indexRelation);
+                   estimate_rel_size(indexRelation, NULL,
+                                     &info->pages, &info->tuples, &allvisfrac);
+                   if (info->tuples > rel->tuples)
+                       info->tuples = rel->tuples;
+               }
+
+               if (info->relam == BTREE_AM_OID)
+               {
+                   /*
+                    * For btrees, get tree height while we have the index
+                    * open
+                    */
+                   info->tree_height = _bt_getrootheight(indexRelation);
+               }
+               else
+               {
+                   /* For other index types, just set it to "unknown" for now */
+                   info->tree_height = -1;
+               }
            }
            else
            {
-               /* For other index types, just set it to "unknown" for now */
+               /* Zero these out for partitioned indexes */
+               info->pages = 0;
+               info->tuples = 0.0;
                info->tree_height = -1;
            }
 
index f50e58adbd63b264973cd80772f74cdd4ca10513..57de51f0db20b439f4880e523e654ae54c830921 100644 (file)
@@ -5994,6 +5994,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
    rte = root->simple_rte_array[rel->relid];
    Assert(rte->rtekind == RTE_RELATION);
 
+   /* ignore partitioned tables.  Any indexes here are not real indexes */
+   if (rte->relkind == RELKIND_PARTITIONED_TABLE)
+       return false;
+
    /* Search through the indexes to see if any match our problem */
    foreach(lc, rel->indexlist)
    {
index 1827e5064799082c887f66574b0f10d7a9ced3e0..c20b7298a3d879de8a3f0a7ec487495f39ecd348 100644 (file)
@@ -653,7 +653,7 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *     lateral_referencers - relids of rels that reference this one laterally
  *             (includes both direct and indirect lateral references)
  *     indexlist - list of IndexOptInfo nodes for relation's indexes
- *                 (always NIL if it's not a table)
+ *                 (always NIL if it's not a table or partitioned table)
  *     pages - number of disk pages in relation (zero if not a table)
  *     tuples - number of tuples in relation (not considering restrictions)
  *     allvisfrac - fraction of disk pages that are marked all-visible
@@ -1097,11 +1097,11 @@ struct IndexOptInfo
    Oid        *opfamily pg_node_attr(array_size(nkeycolumns));
    /* OIDs of opclass declared input data types */
    Oid        *opcintype pg_node_attr(array_size(nkeycolumns));
-   /* OIDs of btree opfamilies, if orderable */
+   /* OIDs of btree opfamilies, if orderable.  NULL if partitioned index */
    Oid        *sortopfamily pg_node_attr(array_size(nkeycolumns));
-   /* is sort order descending? */
+   /* is sort order descending? or NULL if partitioned index */
    bool       *reverse_sort pg_node_attr(array_size(nkeycolumns));
-   /* do NULLs come first in the sort order? */
+   /* do NULLs come first in the sort order? or NULL if partitioned index */
    bool       *nulls_first pg_node_attr(array_size(nkeycolumns));
    /* opclass-specific options for columns */
    bytea     **opclassoptions pg_node_attr(read_write_ignore);
@@ -1139,7 +1139,7 @@ struct IndexOptInfo
 
    /*
     * Remaining fields are copied from the index AM's API struct
-    * (IndexAmRoutine).
+    * (IndexAmRoutine).  These fields are not set for partitioned indexes.
     */
    bool        amcanorderbyop;
    bool        amoptionalkey;
index 3ddea3b683770432d4b27f99f4760d44c82b1754..c2b85d27950519c4a2c187b5133c627dae0dfdc1 100644 (file)
@@ -4860,6 +4860,16 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
          Filter: (a.id = i)
 (4 rows)
 
+CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
+CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
+-- test join removals on a partitioned table
+explain (costs off)
+select a.* from a left join parted_b pb on a.b_id = pb.id;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
 rollback;
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
index c59caf1cb3d962daea4ee6a66853ca863e5bfb24..c649c4aeaae45cc16e654b07cb747ad05327eb2c 100644 (file)
@@ -4874,7 +4874,7 @@ ANALYZE fract_t;
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
@@ -4891,7 +4891,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
 (11 rows)
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  Limit
index 9fc6ef43768eda46d0bde5334033bc86af219376..027927354c09f69361e04fb2922d960df8536867 100644 (file)
@@ -1709,6 +1709,13 @@ explain (costs off)
 select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
              lateral generate_series(1, q.id) gs(i) where q.id = gs.i;
 
+CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
+CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
+
+-- test join removals on a partitioned table
+explain (costs off)
+select a.* from a left join parted_b pb on a.b_id = pb.id;
+
 rollback;
 
 create temp table parent (k int primary key, pd int);
index 67f506361f8661600c80b4dad65f1fb198a99d0a..9e16f1ca550fae192065d64612b8800577b2b30e 100644 (file)
@@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
 
 -- cleanup
 DROP TABLE fract_t;