Implement "join removal" for cases where the inner side of a left join

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c

index af801a4395bbf8f95ebfdfe6e35cde21162fb39d..ae864ad2b0d1592f93532ebc6fb654627e4a61c4 100644 (file)
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -1421,6 +1421,16 @@ _outUniquePath(StringInfo str, UniquePath *node)
         WRITE_FLOAT_FIELD(rows, "%.0f");
  }
  
+static void
+_outNoOpPath(StringInfo str, NoOpPath *node)
+{
+       WRITE_NODE_TYPE("NOOPPATH");
+
+       _outPathInfo(str, (Path *) node);
+
+       WRITE_NODE_FIELD(subpath);
+}
+
  static void
  _outNestPath(StringInfo str, NestPath *node)
  {
@@ -2634,6 +2644,9 @@ _outNode(StringInfo str, void *obj)
                         case T_UniquePath:
                                 _outUniquePath(str, obj);
                                 break;
+                       case T_NoOpPath:
+                               _outNoOpPath(str, obj);
+                               break;
                         case T_NestPath:
                                 _outNestPath(str, obj);
                                 break;
diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README

index 4cc6f3f808a202b237c7d1d51ec5c373a7272afb..26e6dabae1ac73e66c91a76f0b844ef503fa8b80 100644 (file)
--- a/src/backend/optimizer/README
+++ b/src/backend/optimizer/README
@@ -354,6 +354,7 @@ RelOptInfo      - a relation or joined relations
    NestPath      - nested-loop joins
    MergePath     - merge joins
    HashPath      - hash joins
+  NoOpPath      - same as its input path (used when a join is removed)
  
   EquivalenceClass - a data structure representing a set of values known equal
  
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c

index a1aa660fb3e8f26d6d59b8f90521e985b609a851..fbad3919241df232a38320349f742474e8328086 100644 (file)
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -1387,6 +1387,10 @@ print_path(PlannerInfo *root, Path *path, int indent)
                         ptype = "Unique";
                         subpath = ((UniquePath *) path)->subpath;
                         break;
+               case T_NoOpPath:
+                       ptype = "NoOp";
+                       subpath = ((NoOpPath *) path)->subpath;
+                       break;
                 case T_NestPath:
                         ptype = "NestLoop";
                         join = true;
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c

index 52cadddeba9bb9c9131f7893f6a368e415c9faf2..02748c15cf76050a887bcff522a92787dd8ff15c 100644 (file)
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -1918,6 +1918,86 @@ find_clauses_for_join(PlannerInfo *root, RelOptInfo *rel,
         return clause_list;
  }
  
+/*
+ * relation_has_unique_index_for
+ *       Determine whether the relation provably has at most one row satisfying
+ *       a set of equality conditions, because the conditions constrain all
+ *       columns of some unique index.
+ *
+ * The conditions are provided as a list of RestrictInfo nodes, where the
+ * caller has already determined that each condition is a mergejoinable
+ * equality with an expression in this relation on one side, and an
+ * expression not involving this relation on the other.  The transient
+ * outer_is_left flag is used to identify which side we should look at:
+ * left side if outer_is_left is false, right side if it is true.
+ */
+bool
+relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+                                                         List *restrictlist)
+{
+       ListCell   *ic;
+
+       /* Short-circuit the easy case */
+       if (restrictlist == NIL)
+               return false;
+
+       /* Examine each index of the relation ... */
+       foreach(ic, rel->indexlist)
+       {
+               IndexOptInfo   *ind = (IndexOptInfo *) lfirst(ic);
+               int                             c;
+
+               /*
+                * If the index is not unique or if it's a partial index that doesn't
+                * match the query, it's useless here.
+                */
+               if (!ind->unique || (ind->indpred != NIL && !ind->predOK))
+                       continue;
+
+               /*
+                * Try to find each index column in the list of conditions.  This is
+                * O(n^2) or worse, but we expect all the lists to be short.
+                */
+               for (c = 0; c < ind->ncolumns; c++)
+               {
+                       ListCell   *lc;
+
+                       foreach(lc, restrictlist)
+                       {
+                               RestrictInfo   *rinfo = (RestrictInfo *) lfirst(lc);
+                               Node   *rexpr;
+
+                               /*
+                                * The condition's equality operator must be a member of the
+                                * index opfamily, else it is not asserting the right kind
+                                * of equality behavior for this index.  We check this first
+                                * since it's probably cheaper than match_index_to_operand().
+                                */
+                               if (!list_member_oid(rinfo->mergeopfamilies, ind->opfamily[c]))
+                                       continue;
+
+                               /* OK, see if the condition operand matches the index key */
+                               if (rinfo->outer_is_left)
+                                       rexpr = get_rightop(rinfo->clause);
+                               else
+                                       rexpr = get_leftop(rinfo->clause);
+
+                               if (match_index_to_operand(rexpr, c, ind))
+                                       break;          /* found a match; column is unique */
+                       }
+
+                       if (lc == NULL)
+                               break;                  /* no match; this index doesn't help us */
+               }
+
+               /* Matched all columns of this index? */
+               if (c == ind->ncolumns)
+                       return true;
+       }
+
+       return false;
+}
+
  
  /****************************************************************************
   *                             ----  PATH CREATION UTILITIES  ----
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c

index ef13110fb926b5341a3a6405a8b81a66a17a2f97..6de821a6e23e57194047b690ea0544c20b83960e 100644 (file)
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -22,6 +22,11 @@
  #include "optimizer/paths.h"
  
  
+static bool join_is_removable(PlannerInfo *root, RelOptInfo *joinrel,
+                                 RelOptInfo *outerrel, RelOptInfo *innerrel,
+                                 List *restrictlist, JoinType jointype);
+static void generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
+                                       RelOptInfo *outerrel);
  static void sort_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel,
                                          RelOptInfo *outerrel, RelOptInfo *innerrel,
                                          List *restrictlist, List *mergeclause_list,
@@ -78,11 +83,26 @@ add_paths_to_joinrel(PlannerInfo *root,
  {
         List       *mergeclause_list = NIL;
  
+       /*
+        * 0. Consider join removal.  This is always the most efficient strategy,
+        * so if it works, there's no need to consider anything further.
+        */
+       if (join_is_removable(root, joinrel, outerrel, innerrel,
+                                                 restrictlist, jointype))
+       {
+               generate_outer_only(root, joinrel, outerrel);
+               return;
+       }
+
         /*
          * Find potential mergejoin clauses.  We can skip this if we are not
          * interested in doing a mergejoin.  However, mergejoin is currently our
          * only way of implementing full outer joins, so override mergejoin
          * disable if it's a full join.
+        *
+        * Note: do this after join_is_removable(), because this sets the
+        * outer_is_left flags in the mergejoin clauses, while join_is_removable
+        * uses those flags for its own purposes.
          */
         if (enable_mergejoin || jointype == JOIN_FULL)
                 mergeclause_list = select_mergejoin_clauses(root,
@@ -133,6 +153,180 @@ add_paths_to_joinrel(PlannerInfo *root,
                                                          restrictlist, jointype, sjinfo);
  }
  
+/*
+ * join_is_removable
+ *       Determine whether we need not perform the join at all, because
+ *       it will just duplicate its left input.
+ *
+ * This is true for a left join for which the join condition cannot match
+ * more than one inner-side row.  (There are other possibly interesting
+ * cases, but we don't have the infrastructure to prove them.)
+ *
+ * Note: there is no need to consider the symmetrical case of duplicating the
+ * right input, because add_paths_to_joinrel() will be called with each rel
+ * on the outer side.
+ */
+static bool
+join_is_removable(PlannerInfo *root,
+                                 RelOptInfo *joinrel,
+                                 RelOptInfo *outerrel,
+                                 RelOptInfo *innerrel,
+                                 List *restrictlist,
+                                 JoinType jointype)
+{
+       List       *clause_list = NIL;
+       ListCell   *l;
+       int                     attroff;
+
+       /*
+        * Currently, we only know how to remove left joins to a baserel with
+        * unique indexes.  We can check most of these criteria pretty trivially
+        * to avoid doing useless extra work.  But checking whether any of the
+        * indexes are unique would require iterating over the indexlist, so for
+        * now we just make sure there are indexes of some sort or other.  If none
+        * of them are unique, join removal will still fail, just slightly later.
+        */
+       if (jointype != JOIN_LEFT ||
+               innerrel->reloptkind == RELOPT_JOINREL ||
+               innerrel->rtekind != RTE_RELATION ||
+               innerrel->indexlist == NIL)
+               return false;
+
+       /*
+        * We can't remove the join if any inner-rel attributes are used above
+        * the join.
+        *
+        * As a micro-optimization, it seems better to start with max_attr and
+        * count down rather than starting with min_attr and counting up, on the
+        * theory that the system attributes are somewhat less likely to be wanted
+        * and should be tested last.
+        */
+       for (attroff = innerrel->max_attr - innerrel->min_attr;
+                attroff >= 0;
+                attroff--)
+       {
+               if (!bms_is_subset(innerrel->attr_needed[attroff], joinrel->relids))
+                       return false;
+       }
+
+       /*
+        * Search for mergejoinable clauses that constrain the inner rel against
+        * either the outer rel or a pseudoconstant.  If an operator is
+        * mergejoinable then it behaves like equality for some btree opclass,
+        * so it's what we want.  The mergejoinability test also eliminates
+        * clauses containing volatile functions, which we couldn't depend on.
+        */
+       foreach(l, restrictlist)
+       {
+               RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
+
+               /*
+                * We are always considering an outer join here, so ignore pushed-down
+                * clauses.  Also ignore anything that doesn't have a mergejoinable
+                * operator.
+                */
+               if (restrictinfo->is_pushed_down)
+                       continue;
+
+               if (!restrictinfo->can_join ||
+                       restrictinfo->mergeopfamilies == NIL)
+                       continue;                       /* not mergejoinable */
+
+               /*
+                * Check if clause is usable with these input rels.  All the vars
+                * needed on each side of the clause must be available from one or the
+                * other of the input rels.
+                */
+               if (bms_is_subset(restrictinfo->left_relids, outerrel->relids) &&
+                       bms_is_subset(restrictinfo->right_relids, innerrel->relids))
+               {
+                       /* righthand side is inner */
+                       restrictinfo->outer_is_left = true;
+               }
+               else if (bms_is_subset(restrictinfo->left_relids, innerrel->relids) &&
+                                bms_is_subset(restrictinfo->right_relids, outerrel->relids))
+               {
+                       /* lefthand side is inner */
+                       restrictinfo->outer_is_left = false;
+               }
+               else
+                       continue;                       /* no good for these input relations */
+
+               /* OK, add to list */
+               clause_list = lappend(clause_list, restrictinfo);
+       }
+
+       /* Now examine the rel's restriction clauses for var = const clauses */
+       foreach(l, innerrel->baserestrictinfo)
+       {
+               RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
+
+               /*
+                * Note: can_join won't be set for a restriction clause, but
+                * mergeopfamilies will be if it has a mergejoinable operator
+                * and doesn't contain volatile functions.
+                */
+               if (restrictinfo->mergeopfamilies == NIL)
+                       continue;                       /* not mergejoinable */
+
+               /*
+                * The clause certainly doesn't refer to anything but the given
+                * rel.  If either side is pseudoconstant then we can use it.
+                */
+               if (bms_is_empty(restrictinfo->left_relids))
+               {
+                       /* righthand side is inner */
+                       restrictinfo->outer_is_left = true;
+               }
+               else if (bms_is_empty(restrictinfo->right_relids))
+               {
+                       /* lefthand side is inner */
+                       restrictinfo->outer_is_left = false;
+               }
+               else
+                       continue;
+
+               /* OK, add to list */
+               clause_list = lappend(clause_list, restrictinfo);
+       }
+
+       /* Now examine the indexes to see if we have a matching unique index */
+       if (relation_has_unique_index_for(root, innerrel, clause_list))
+               return true;
+
+       /*
+        * Some day it would be nice to check for other methods of establishing
+        * distinctness.
+        */
+       return false;
+}
+
+/*
+ * generate_outer_only
+ *       Generate "join" paths when we have found the join is removable.
+ */
+static void
+generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
+                                       RelOptInfo *outerrel)
+{
+       ListCell   *lc;
+
+       /*
+        * For the moment, replicate all of the outerrel's paths as join paths.
+        * Some of them might not really be interesting above the join, if they
+        * have sort orderings that have no real use except to do a mergejoin
+        * for the join we've just found we don't need.  But distinguishing that
+        * case probably isn't worth the extra code it would take.
+        */
+       foreach(lc, outerrel->pathlist)
+       {
+               Path   *outerpath = (Path *) lfirst(lc);
+
+               add_path(joinrel, (Path *)
+                                create_noop_path(root, joinrel, outerpath));
+       }
+}
+
  /*
   * sort_inner_and_outer
   *       Create mergejoin join paths by explicitly sorting both the outer and
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c

index ccea2e8e09adfc0d2907a10aade598822d417516..69c731ef9fb8a020d971d8cfd24fa1c634cd9a70 100644 (file)
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -164,6 +164,11 @@ create_plan(PlannerInfo *root, Path *best_path)
                 case T_WorkTableScan:
                         plan = create_scan_plan(root, best_path);
                         break;
+               case T_Join:
+                       /* this is only used for no-op joins */
+                       Assert(IsA(best_path, NoOpPath));
+                       plan = create_plan(root, ((NoOpPath *) best_path)->subpath);
+                       break;
                 case T_HashJoin:
                 case T_MergeJoin:
                 case T_NestLoop:
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c

index e80f4cf1743e1de269f707145a1c0059b5bbc750..bbb9817ddb04cb6604ef3be3a86acf3ab83d2606 100644 (file)
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1215,6 +1215,26 @@ distinct_col_search(int colno, List *colnos, List *opids)
         return InvalidOid;
  }
  
+/*
+ * create_noop_path
+ *       Creates a path equivalent to the input subpath, but having a different
+ *       parent rel.  This is used when a join is found to be removable.
+ */
+NoOpPath *
+create_noop_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
+{
+       NoOpPath   *pathnode = makeNode(NoOpPath);
+
+       pathnode->path.pathtype = T_Join;                       /* by convention */
+       pathnode->path.parent = rel;
+       pathnode->path.startup_cost = subpath->startup_cost;
+       pathnode->path.total_cost = subpath->total_cost;
+       pathnode->path.pathkeys = subpath->pathkeys;
+       pathnode->subpath = subpath;
+
+       return pathnode;
+}
+
  /*
   * create_subqueryscan_path
   *       Creates a path corresponding to a sequential scan of a subquery,
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h

index 31135a3fe1f6cc49bdf81d638ed5828a237ff06d..19e28e71f3a6cde3e61219bc807d980fa945b70e 100644 (file)
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -211,6 +211,7 @@ typedef enum NodeTag
         T_ResultPath,
         T_MaterialPath,
         T_UniquePath,
+       T_NoOpPath,
         T_EquivalenceClass,
         T_EquivalenceMember,
         T_PathKey,
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h

index bbce826e0f705898bc8ef176d0f70dd970b1d36d..41fce02ea1c6aee3b70affb3d9d493c3fccb4e8b 100644 (file)
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -783,6 +783,22 @@ typedef struct UniquePath
         double          rows;                   /* estimated number of result tuples */
  } UniquePath;
  
+/*
+ * NoOpPath represents exactly the same plan as its subpath.  This is used
+ * when we have determined that a join can be eliminated.  The difference
+ * between the NoOpPath and its subpath is just that the NoOpPath's parent
+ * is the whole join relation while the subpath is for one of the joined
+ * relations (and the other one isn't needed).
+ *
+ * Note: path.pathtype is always T_Join, but this won't actually give rise
+ * to a Join plan node.
+ */
+typedef struct NoOpPath
+{
+       Path            path;
+       Path       *subpath;
+} NoOpPath;
+
  /*
   * All join-type paths share these fields.
   */
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h

index 0f4c52ef7c25bffd0fac58c0d052613ffcb2e462..fbed15aefa98dca15cb9cbd1f43605ba5bf6e666 100644 (file)
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -51,6 +51,8 @@ extern ResultPath *create_result_path(List *quals);
  extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath);
  extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel,
                                    Path *subpath, SpecialJoinInfo *sjinfo);
+extern NoOpPath *create_noop_path(PlannerInfo *root, RelOptInfo *rel,
+                                                                 Path *subpath);
  extern Path *create_subqueryscan_path(RelOptInfo *rel, List *pathkeys);
  extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel);
  extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel);
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h

index 4f80edc4923651cc70d57af75e104df7e3aaa80d..f574960734b1ac758ebf8f7c50085c81767a95a2 100644 (file)
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -57,6 +57,8 @@ extern List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
  extern void best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel,
                                          RelOptInfo *outer_rel, JoinType jointype,
                                          Path **cheapest_startup, Path **cheapest_total);
+extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
+                                                         List *restrictlist);
  extern List *group_clauses_by_indexkey(IndexOptInfo *index,
                                                   List *clauses, List *outer_clauses,
                                                   Relids outer_relids,
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 17 Sep 2009 20:49:29 +0000 (20:49 +0000)
src/backend/nodes/outfuncs.c		patch \| blob \| blame \| history
src/backend/optimizer/README		patch \| blob \| blame \| history
src/backend/optimizer/path/allpaths.c		patch \| blob \| blame \| history
src/backend/optimizer/path/indxpath.c		patch \| blob \| blame \| history
src/backend/optimizer/path/joinpath.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/createplan.c		patch \| blob \| blame \| history
src/backend/optimizer/util/pathnode.c		patch \| blob \| blame \| history
src/include/nodes/nodes.h		patch \| blob \| blame \| history
src/include/nodes/relation.h		patch \| blob \| blame \| history
src/include/optimizer/pathnode.h		patch \| blob \| blame \| history
src/include/optimizer/paths.h		patch \| blob \| blame \| history