Implement "join removal" for cases where the inner side of a left join
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
Implement "join removal" for cases where the inner side of a left join
is unique and is not referenced above the join.  In this case the inner
side doesn't affect the query result and can be thrown away entirely.
Although perhaps nobody would ever write such a thing by hand, it's
a reasonably common case in machine-generated SQL.

The current implementation only recognizes the case where the inner side
is a simple relation with a unique index matching the query conditions.
This is enough for the use-cases that have been shown so far, but we
might want to try to handle other cases later.

Robert Haas, somewhat rewritten by Tom

src/backend/nodes/outfuncs.c
src/backend/optimizer/README
src/backend/optimizer/path/allpaths.c
src/backend/optimizer/path/indxpath.c
src/backend/optimizer/path/joinpath.c
src/backend/optimizer/plan/createplan.c
src/backend/optimizer/util/pathnode.c
src/include/nodes/nodes.h
src/include/nodes/relation.h
src/include/optimizer/pathnode.h
src/include/optimizer/paths.h

index af801a4395bbf8f95ebfdfe6e35cde21162fb39d..ae864ad2b0d1592f93532ebc6fb654627e4a61c4 100644 (file)
@@ -1421,6 +1421,16 @@ _outUniquePath(StringInfo str, UniquePath *node)
        WRITE_FLOAT_FIELD(rows, "%.0f");
 }
 
+static void
+_outNoOpPath(StringInfo str, NoOpPath *node)
+{
+       WRITE_NODE_TYPE("NOOPPATH");    /* label this dump entry as a NoOpPath */
+
+       _outPathInfo(str, (Path *) node);       /* presumably the fields common to every Path; helper not shown here */
+
+       WRITE_NODE_FIELD(subpath);      /* the input path this node stands in for */
+}
+
 static void
 _outNestPath(StringInfo str, NestPath *node)
 {
@@ -2634,6 +2644,9 @@ _outNode(StringInfo str, void *obj)
                        case T_UniquePath:
                                _outUniquePath(str, obj);
                                break;
+                       case T_NoOpPath:
+                               _outNoOpPath(str, obj);
+                               break;
                        case T_NestPath:
                                _outNestPath(str, obj);
                                break;
index 4cc6f3f808a202b237c7d1d51ec5c373a7272afb..26e6dabae1ac73e66c91a76f0b844ef503fa8b80 100644 (file)
@@ -354,6 +354,7 @@ RelOptInfo      - a relation or joined relations
   NestPath      - nested-loop joins
   MergePath     - merge joins
   HashPath      - hash joins
+  NoOpPath      - same as its input path (used when a join is removed)
 
  EquivalenceClass - a data structure representing a set of values known equal
 
index a1aa660fb3e8f26d6d59b8f90521e985b609a851..fbad3919241df232a38320349f742474e8328086 100644 (file)
@@ -1387,6 +1387,10 @@ print_path(PlannerInfo *root, Path *path, int indent)
                        ptype = "Unique";
                        subpath = ((UniquePath *) path)->subpath;
                        break;
+               case T_NoOpPath:
+                       ptype = "NoOp";
+                       subpath = ((NoOpPath *) path)->subpath;
+                       break;
                case T_NestPath:
                        ptype = "NestLoop";
                        join = true;
index 52cadddeba9bb9c9131f7893f6a368e415c9faf2..02748c15cf76050a887bcff522a92787dd8ff15c 100644 (file)
@@ -1918,6 +1918,86 @@ find_clauses_for_join(PlannerInfo *root, RelOptInfo *rel,
        return clause_list;
 }
 
+/*
+ * relation_has_unique_index_for
+ *       Determine whether the relation provably has at most one row satisfying
+ *       a set of equality conditions, because the conditions constrain all
+ *       columns of some unique index.
+ *
+ * The conditions are provided as a list of RestrictInfo nodes, where the
+ * caller has already determined that each condition is a mergejoinable
+ * equality with an expression in this relation on one side, and an
+ * expression not involving this relation on the other.  The transient
+ * outer_is_left flag is used to identify which side we should look at:
+ * left side if outer_is_left is false, right side if it is true.
+ *
+ * Returns true as soon as one usable unique index has every one of its
+ * columns matched by some condition.  Returns false if no index qualifies,
+ * and in particular whenever restrictlist is empty.  Conditions that match
+ * no index column are simply ignored; they do not prevent a positive
+ * answer.  The "root" argument is not consulted by this function.
+ */
+bool
+relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+                                                         List *restrictlist)
+{
+       ListCell   *ic;
+
+       /* Short-circuit the easy case: with no conditions nothing can match */
+       if (restrictlist == NIL)
+               return false;
+
+       /* Examine each index of the relation ... */
+       foreach(ic, rel->indexlist)
+       {
+               IndexOptInfo   *ind = (IndexOptInfo *) lfirst(ic);
+               int                             c;
+
+               /*
+                * If the index is not unique or if it's a partial index that doesn't
+                * match the query, it's useless here.  (A partial index is accepted
+                * only when its predOK flag is set.)
+                */
+               if (!ind->unique || (ind->indpred != NIL && !ind->predOK))
+                       continue;
+
+               /*
+                * Try to find each index column in the list of conditions.  This is
+                * O(n^2) or worse, but we expect all the lists to be short.
+                */
+               for (c = 0; c < ind->ncolumns; c++)
+               {
+                       ListCell   *lc;
+
+                       foreach(lc, restrictlist)
+                       {
+                               RestrictInfo   *rinfo = (RestrictInfo *) lfirst(lc);
+                               Node   *rexpr;
+
+                               /*
+                                * The condition's equality operator must be a member of the
+                                * index opfamily, else it is not asserting the right kind
+                                * of equality behavior for this index.  We check this first
+                                * since it's probably cheaper than match_index_to_operand().
+                                */
+                               if (!list_member_oid(rinfo->mergeopfamilies, ind->opfamily[c]))
+                                       continue;
+
+                               /*
+                                * OK, see if the condition operand matches the index key.
+                                * outer_is_left tells us which operand belongs to this rel.
+                                */
+                               if (rinfo->outer_is_left)
+                                       rexpr = get_rightop(rinfo->clause);
+                               else
+                                       rexpr = get_leftop(rinfo->clause);
+
+                               if (match_index_to_operand(rexpr, c, ind))
+                                       break;          /* found a match; column is unique */
+                       }
+
+                       if (lc == NULL)
+                               break;                  /* list exhausted with no match; this index doesn't help us */
+               }
+
+               /*
+                * Matched all columns of this index?  c reaches ncolumns only when
+                * the column loop above was never cut short by a failed match.
+                */
+               if (c == ind->ncolumns)
+                       return true;
+       }
+
+       return false;
+}
+
 
 /****************************************************************************
  *                             ----  PATH CREATION UTILITIES  ----
index ef13110fb926b5341a3a6405a8b81a66a17a2f97..6de821a6e23e57194047b690ea0544c20b83960e 100644 (file)
 #include "optimizer/paths.h"
 
 
+static bool join_is_removable(PlannerInfo *root, RelOptInfo *joinrel,
+                                 RelOptInfo *outerrel, RelOptInfo *innerrel,
+                                 List *restrictlist, JoinType jointype);
+static void generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
+                                       RelOptInfo *outerrel);
 static void sort_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel,
                                         RelOptInfo *outerrel, RelOptInfo *innerrel,
                                         List *restrictlist, List *mergeclause_list,
@@ -78,11 +83,26 @@ add_paths_to_joinrel(PlannerInfo *root,
 {
        List       *mergeclause_list = NIL;
 
+       /*
+        * 0. Consider join removal.  This is always the most efficient strategy,
+        * so if it works, there's no need to consider anything further.
+        */
+       if (join_is_removable(root, joinrel, outerrel, innerrel,
+                                                 restrictlist, jointype))
+       {
+               generate_outer_only(root, joinrel, outerrel);
+               return;
+       }
+
        /*
         * Find potential mergejoin clauses.  We can skip this if we are not
         * interested in doing a mergejoin.  However, mergejoin is currently our
         * only way of implementing full outer joins, so override mergejoin
         * disable if it's a full join.
+        *
+        * Note: do this after join_is_removable(), because this sets the
+        * outer_is_left flags in the mergejoin clauses, while join_is_removable
+        * uses those flags for its own purposes.
         */
        if (enable_mergejoin || jointype == JOIN_FULL)
                mergeclause_list = select_mergejoin_clauses(root,
@@ -133,6 +153,180 @@ add_paths_to_joinrel(PlannerInfo *root,
                                                         restrictlist, jointype, sjinfo);
 }
 
+/*
+ * join_is_removable
+ *       Determine whether we need not perform the join at all, because
+ *       it will just duplicate its left input.
+ *
+ * This is true for a left join for which the join condition cannot match
+ * more than one inner-side row.  (There are other possibly interesting
+ * cases, but we don't have the infrastructure to prove them.)
+ *
+ * Note: there is no need to consider the symmetrical case of duplicating the
+ * right input, because add_paths_to_joinrel() will be called with each rel
+ * on the outer side.
+ *
+ * In outline, the answer is true only when all of these hold:
+ *       - jointype is JOIN_LEFT;
+ *       - innerrel is not a join rel, is a plain relation (RTE_RELATION),
+ *         and has at least one index;
+ *       - every innerrel attribute's attr_needed set lies within
+ *         joinrel->relids;
+ *       - relation_has_unique_index_for() accepts the usable equality clauses
+ *         collected from restrictlist and innerrel->baserestrictinfo.
+ *
+ * Side effect: the outer_is_left flag is overwritten in every RestrictInfo
+ * that is accepted as a candidate clause, whether or not the join turns out
+ * to be removable.  Those nodes belong to the caller's lists, so any later
+ * code that depends on outer_is_left has to set it again itself.
+ */
+static bool
+join_is_removable(PlannerInfo *root,
+                                 RelOptInfo *joinrel,
+                                 RelOptInfo *outerrel,
+                                 RelOptInfo *innerrel,
+                                 List *restrictlist,
+                                 JoinType jointype)
+{
+       List       *clause_list = NIL;
+       ListCell   *l;
+       int                     attroff;
+
+       /*
+        * Currently, we only know how to remove left joins to a baserel with
+        * unique indexes.  We can check most of these criteria pretty trivially
+        * to avoid doing useless extra work.  But checking whether any of the
+        * indexes are unique would require iterating over the indexlist, so for
+        * now we just make sure there are indexes of some sort or other.  If none
+        * of them are unique, join removal will still fail, just slightly later.
+        */
+       if (jointype != JOIN_LEFT ||
+               innerrel->reloptkind == RELOPT_JOINREL ||
+               innerrel->rtekind != RTE_RELATION ||
+               innerrel->indexlist == NIL)
+               return false;
+
+       /*
+        * We can't remove the join if any inner-rel attributes are used above
+        * the join.  An attribute passes only when everything in its
+        * attr_needed set lies within joinrel->relids.
+        *
+        * As a micro-optimization, it seems better to start with max_attr and
+        * count down rather than starting with min_attr and counting up, on the
+        * theory that the system attributes are somewhat less likely to be wanted
+        * and should be tested last.
+        */
+       for (attroff = innerrel->max_attr - innerrel->min_attr;
+                attroff >= 0;
+                attroff--)
+       {
+               if (!bms_is_subset(innerrel->attr_needed[attroff], joinrel->relids))
+                       return false;
+       }
+
+       /*
+        * Search for mergejoinable clauses that constrain the inner rel against
+        * either the outer rel or a pseudoconstant.  If an operator is
+        * mergejoinable then it behaves like equality for some btree opclass,
+        * so it's what we want.  The mergejoinability test also eliminates
+        * clauses containing volatile functions, which we couldn't depend on.
+        */
+       foreach(l, restrictlist)
+       {
+               RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
+
+               /*
+                * We are always considering an outer join here, so ignore pushed-down
+                * clauses.  Also ignore anything that doesn't have a mergejoinable
+                * operator.
+                */
+               if (restrictinfo->is_pushed_down)
+                       continue;
+
+               if (!restrictinfo->can_join ||
+                       restrictinfo->mergeopfamilies == NIL)
+                       continue;                       /* not mergejoinable */
+
+               /*
+                * Check if clause is usable with these input rels.  All the vars
+                * needed on each side of the clause must be available from one or the
+                * other of the input rels.  Record which side is the inner one in
+                * outer_is_left, for relation_has_unique_index_for() to read.
+                */
+               if (bms_is_subset(restrictinfo->left_relids, outerrel->relids) &&
+                       bms_is_subset(restrictinfo->right_relids, innerrel->relids))
+               {
+                       /* righthand side is inner */
+                       restrictinfo->outer_is_left = true;
+               }
+               else if (bms_is_subset(restrictinfo->left_relids, innerrel->relids) &&
+                                bms_is_subset(restrictinfo->right_relids, outerrel->relids))
+               {
+                       /* lefthand side is inner */
+                       restrictinfo->outer_is_left = false;
+               }
+               else
+                       continue;                       /* no good for these input relations */
+
+               /* OK, add to list */
+               clause_list = lappend(clause_list, restrictinfo);
+       }
+
+       /* Now examine the rel's restriction clauses for var = const clauses */
+       foreach(l, innerrel->baserestrictinfo)
+       {
+               RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
+
+               /*
+                * Note: can_join won't be set for a restriction clause, but
+                * mergeopfamilies will be if it has a mergejoinable operator
+                * and doesn't contain volatile functions.
+                */
+               if (restrictinfo->mergeopfamilies == NIL)
+                       continue;                       /* not mergejoinable */
+
+               /*
+                * The clause certainly doesn't refer to anything but the given
+                * rel.  If either side is pseudoconstant then we can use it.
+                * Here the side with no relids plays the part of the "outer"
+                * operand for the purposes of outer_is_left.
+                */
+               if (bms_is_empty(restrictinfo->left_relids))
+               {
+                       /* righthand side is inner */
+                       restrictinfo->outer_is_left = true;
+               }
+               else if (bms_is_empty(restrictinfo->right_relids))
+               {
+                       /* lefthand side is inner */
+                       restrictinfo->outer_is_left = false;
+               }
+               else
+                       continue;                       /* both sides reference the rel; not usable */
+
+               /* OK, add to list */
+               clause_list = lappend(clause_list, restrictinfo);
+       }
+
+       /* Now examine the indexes to see if we have a matching unique index */
+       if (relation_has_unique_index_for(root, innerrel, clause_list))
+               return true;
+
+       /*
+        * Some day it would be nice to check for other methods of establishing
+        * distinctness.
+        */
+       return false;
+}
+
+/*
+ * generate_outer_only
+ *       Generate "join" paths when we have found the join is removable.
+ *
+ * Each path in outerrel->pathlist is wrapped by create_noop_path() with
+ * joinrel as the parent, and the wrapper is offered to joinrel through
+ * add_path().  The outerrel's own pathlist is only read, never modified.
+ */
+static void
+generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
+                                       RelOptInfo *outerrel)
+{
+       ListCell   *lc;
+
+       /*
+        * For the moment, replicate all of the outerrel's paths as join paths.
+        * Some of them might not really be interesting above the join, if they
+        * have sort orderings that have no real use except to do a mergejoin
+        * for the join we've just found we don't need.  But distinguishing that
+        * case probably isn't worth the extra code it would take.
+        */
+       foreach(lc, outerrel->pathlist)
+       {
+               Path   *outerpath = (Path *) lfirst(lc);
+
+               add_path(joinrel, (Path *)
+                                create_noop_path(root, joinrel, outerpath));
+       }
+}
+
 /*
  * sort_inner_and_outer
  *       Create mergejoin join paths by explicitly sorting both the outer and
index ccea2e8e09adfc0d2907a10aade598822d417516..69c731ef9fb8a020d971d8cfd24fa1c634cd9a70 100644 (file)
@@ -164,6 +164,11 @@ create_plan(PlannerInfo *root, Path *best_path)
                case T_WorkTableScan:
                        plan = create_scan_plan(root, best_path);
                        break;
+               case T_Join:
+                       /* this is only used for no-op joins */
+                       Assert(IsA(best_path, NoOpPath));
+                       plan = create_plan(root, ((NoOpPath *) best_path)->subpath);
+                       break;
                case T_HashJoin:
                case T_MergeJoin:
                case T_NestLoop:
index e80f4cf1743e1de269f707145a1c0059b5bbc750..bbb9817ddb04cb6604ef3be3a86acf3ab83d2606 100644 (file)
@@ -1215,6 +1215,26 @@ distinct_col_search(int colno, List *colnos, List *opids)
        return InvalidOid;
 }
 
+/*
+ * create_noop_path
+ *       Creates a path equivalent to the input subpath, but having a different
+ *       parent rel.  This is used when a join is found to be removable.
+ *
+ * The new node takes its startup cost, total cost and pathkeys straight
+ * from the subpath, so it adds no cost of its own and claims the same sort
+ * ordering.  "rel" becomes the node's parent.  The "root" argument is not
+ * used by this function.
+ */
+NoOpPath *
+create_noop_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
+{
+       NoOpPath   *pathnode = makeNode(NoOpPath);
+
+       pathnode->path.pathtype = T_Join;                       /* by convention */
+       pathnode->path.parent = rel;
+       pathnode->path.startup_cost = subpath->startup_cost;
+       pathnode->path.total_cost = subpath->total_cost;
+       pathnode->path.pathkeys = subpath->pathkeys;
+       pathnode->subpath = subpath;
+
+       return pathnode;
+}
+
 /*
  * create_subqueryscan_path
  *       Creates a path corresponding to a sequential scan of a subquery,
index 31135a3fe1f6cc49bdf81d638ed5828a237ff06d..19e28e71f3a6cde3e61219bc807d980fa945b70e 100644 (file)
@@ -211,6 +211,7 @@ typedef enum NodeTag
        T_ResultPath,
        T_MaterialPath,
        T_UniquePath,
+       T_NoOpPath,
        T_EquivalenceClass,
        T_EquivalenceMember,
        T_PathKey,
index bbce826e0f705898bc8ef176d0f70dd970b1d36d..41fce02ea1c6aee3b70affb3d9d493c3fccb4e8b 100644 (file)
@@ -783,6 +783,22 @@ typedef struct UniquePath
        double          rows;                   /* estimated number of result tuples */
 } UniquePath;
 
+/*
+ * NoOpPath represents exactly the same plan as its subpath.  This is used
+ * when we have determined that a join can be eliminated.  The difference
+ * between the NoOpPath and its subpath is just that the NoOpPath's parent
+ * is the whole join relation while the subpath is for one of the joined
+ * relations (and the other one isn't needed).
+ *
+ * Note: path.pathtype is always T_Join, but this won't actually give rise
+ * to a Join plan node.
+ */
+typedef struct NoOpPath
+{
+       Path            path;                   /* generic Path fields; parent is the join relation */
+       Path       *subpath;            /* path for the one joined relation that is kept */
+} NoOpPath;
+
 /*
  * All join-type paths share these fields.
  */
index 0f4c52ef7c25bffd0fac58c0d052613ffcb2e462..fbed15aefa98dca15cb9cbd1f43605ba5bf6e666 100644 (file)
@@ -51,6 +51,8 @@ extern ResultPath *create_result_path(List *quals);
 extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath);
 extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel,
                                   Path *subpath, SpecialJoinInfo *sjinfo);
+extern NoOpPath *create_noop_path(PlannerInfo *root, RelOptInfo *rel,
+                                                                 Path *subpath);
 extern Path *create_subqueryscan_path(RelOptInfo *rel, List *pathkeys);
 extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel);
 extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel);
index 4f80edc4923651cc70d57af75e104df7e3aaa80d..f574960734b1ac758ebf8f7c50085c81767a95a2 100644 (file)
@@ -57,6 +57,8 @@ extern List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
 extern void best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel,
                                         RelOptInfo *outer_rel, JoinType jointype,
                                         Path **cheapest_startup, Path **cheapest_total);
+extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+                                                         List *restrictlist);
 extern List *group_clauses_by_indexkey(IndexOptInfo *index,
                                                  List *clauses, List *outer_clauses,
                                                  Relids outer_relids,