Consider explicit incremental sort for mergejoins
author: Richard Guo <rguo@postgresql.org>
Wed, 9 Oct 2024 08:14:42 +0000 (17:14 +0900)
committer: Richard Guo <rguo@postgresql.org>
Wed, 9 Oct 2024 08:14:42 +0000 (17:14 +0900)
For a mergejoin, if the given outer path or inner path is not already
well enough ordered, we need to do an explicit sort.  Currently, we
only consider explicit full sort and do not account for incremental
sort.

In this patch, for the outer path of a mergejoin, we choose to use
explicit incremental sort if it is enabled and there are presorted
keys.  For the inner path, though, we cannot use incremental sort
because it does not support mark/restore at present.

The rationale is based on the assumption that incremental sort is
always faster than full sort when there are presorted keys, a premise
that has been applied in various parts of the code.  In addition, the
current cost model tends to favor incremental sort as being cheaper
than full sort in the presence of presorted keys, making it reasonable
not to consider full sort in such cases.

One might ask what happens if a mergejoin with an incremental sort as
its outer path is selected as the inner path of another mergejoin.
However, this should not be a problem, because mergejoin itself does
not support mark/restore either, and we will add a Material node on
top of it anyway in this case (see final_cost_mergejoin).

There is one ensuing plan change in the regression tests, and we have
to modify that test case to ensure that it continues to test what it
is intended to.

No backpatch as this could result in plan changes.

Author: Richard Guo
Reviewed-by: David Rowley, Tomas Vondra
Discussion: https://postgr.es/m/CAMbWs49x425QrX7h=Ux05WEnt8GS757H-jOP3_xsX5t1FoUsZw@mail.gmail.com

src/backend/optimizer/path/costsize.c
src/backend/optimizer/plan/createplan.c
src/test/regress/expected/aggregates.out
src/test/regress/expected/incremental_sort.out
src/test/regress/sql/aggregates.sql
src/test/regress/sql/incremental_sort.sql

index e1523d15df1c7450ae6f6eb4b468a8eac698832e..c6e66e46f4a0dfe83aa813a4dc3e359639c5b8f3 100644 (file)
@@ -3532,7 +3532,8 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path,
  * join quals here, except for obtaining the scan selectivity estimate which
  * is really essential (but fortunately, use of caching keeps the cost of
  * getting that down to something reasonable).
- * We also assume that cost_sort is cheap enough to use here.
+ * We also assume that cost_sort/cost_incremental_sort is cheap enough to use
+ * here.
  *
  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
  *     other data to be used by final_cost_mergejoin
@@ -3569,7 +3570,8 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
                outerendsel,
                innerstartsel,
                innerendsel;
-   Path        sort_path;      /* dummy for result of cost_sort */
+   Path        sort_path;      /* dummy for result of
+                                * cost_sort/cost_incremental_sort */
 
    /* Protect some assumptions below that rowcounts aren't zero */
    if (outer_path_rows <= 0)
@@ -3682,16 +3684,54 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 
    if (outersortkeys)          /* do we need to sort outer? */
    {
-       cost_sort(&sort_path,
-                 root,
-                 outersortkeys,
-                 outer_path->disabled_nodes,
-                 outer_path->total_cost,
-                 outer_path_rows,
-                 outer_path->pathtarget->width,
-                 0.0,
-                 work_mem,
-                 -1.0);
+       bool        use_incremental_sort = false;
+       int         presorted_keys;
+
+       /*
+        * We choose to use incremental sort if it is enabled and there are
+        * presorted keys; otherwise we use full sort.
+        */
+       if (enable_incremental_sort)
+       {
+           bool        is_sorted PG_USED_FOR_ASSERTS_ONLY;
+
+           is_sorted = pathkeys_count_contained_in(outersortkeys,
+                                                   outer_path->pathkeys,
+                                                   &presorted_keys);
+           Assert(!is_sorted);
+
+           if (presorted_keys > 0)
+               use_incremental_sort = true;
+       }
+
+       if (!use_incremental_sort)
+       {
+           cost_sort(&sort_path,
+                     root,
+                     outersortkeys,
+                     outer_path->disabled_nodes,
+                     outer_path->total_cost,
+                     outer_path_rows,
+                     outer_path->pathtarget->width,
+                     0.0,
+                     work_mem,
+                     -1.0);
+       }
+       else
+       {
+           cost_incremental_sort(&sort_path,
+                                 root,
+                                 outersortkeys,
+                                 presorted_keys,
+                                 outer_path->disabled_nodes,
+                                 outer_path->startup_cost,
+                                 outer_path->total_cost,
+                                 outer_path_rows,
+                                 outer_path->pathtarget->width,
+                                 0.0,
+                                 work_mem,
+                                 -1.0);
+       }
        disabled_nodes += sort_path.disabled_nodes;
        startup_cost += sort_path.startup_cost;
        startup_cost += (sort_path.total_cost - sort_path.startup_cost)
@@ -3711,6 +3751,11 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 
    if (innersortkeys)          /* do we need to sort inner? */
    {
+       /*
+        * We do not consider incremental sort for inner path, because
+        * incremental sort does not support mark/restore.
+        */
+
        cost_sort(&sort_path,
                  root,
                  innersortkeys,
index bb45ef318fb45d9e2223da48de4433c4ba1d2896..0d195a07ffc0cffd459923cc0ddfc06b4f4b3041 100644 (file)
@@ -179,6 +179,8 @@ static void copy_generic_path_info(Plan *dest, Path *src);
 static void copy_plan_costsize(Plan *dest, Plan *src);
 static void label_sort_with_costsize(PlannerInfo *root, Sort *plan,
                                     double limit_tuples);
+static void label_incrementalsort_with_costsize(PlannerInfo *root, IncrementalSort *plan,
+                                               List *pathkeys, double limit_tuples);
 static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid);
 static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid,
                                   TableSampleClause *tsc);
@@ -4523,12 +4525,51 @@ create_mergejoin_plan(PlannerInfo *root,
    if (best_path->outersortkeys)
    {
        Relids      outer_relids = outer_path->parent->relids;
-       Sort       *sort = make_sort_from_pathkeys(outer_plan,
+       Plan       *sort_plan;
+       bool        use_incremental_sort = false;
+       int         presorted_keys;
+
+       /*
+        * We choose to use incremental sort if it is enabled and there are
+        * presorted keys; otherwise we use full sort.
+        */
+       if (enable_incremental_sort)
+       {
+           bool        is_sorted PG_USED_FOR_ASSERTS_ONLY;
+
+           is_sorted = pathkeys_count_contained_in(best_path->outersortkeys,
+                                                   outer_path->pathkeys,
+                                                   &presorted_keys);
+           Assert(!is_sorted);
+
+           if (presorted_keys > 0)
+               use_incremental_sort = true;
+       }
+
+       if (!use_incremental_sort)
+       {
+           sort_plan = (Plan *)
+               make_sort_from_pathkeys(outer_plan,
+                                       best_path->outersortkeys,
+                                       outer_relids);
+
+           label_sort_with_costsize(root, (Sort *) sort_plan, -1.0);
+       }
+       else
+       {
+           sort_plan = (Plan *)
+               make_incrementalsort_from_pathkeys(outer_plan,
                                                   best_path->outersortkeys,
-                                                  outer_relids);
+                                                  outer_relids,
+                                                  presorted_keys);
 
-       label_sort_with_costsize(root, sort, -1.0);
-       outer_plan = (Plan *) sort;
+           label_incrementalsort_with_costsize(root,
+                                               (IncrementalSort *) sort_plan,
+                                               best_path->outersortkeys,
+                                               -1.0);
+       }
+
+       outer_plan = sort_plan;
        outerpathkeys = best_path->outersortkeys;
    }
    else
@@ -4536,6 +4577,11 @@ create_mergejoin_plan(PlannerInfo *root,
 
    if (best_path->innersortkeys)
    {
+       /*
+        * We do not consider incremental sort for inner path, because
+        * incremental sort does not support mark/restore.
+        */
+
        Relids      inner_relids = inner_path->parent->relids;
        Sort       *sort = make_sort_from_pathkeys(inner_plan,
                                                   best_path->innersortkeys,
@@ -5447,10 +5493,6 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
    Plan       *lefttree = plan->plan.lefttree;
    Path        sort_path;      /* dummy for result of cost_sort */
 
-   /*
-    * This function shouldn't have to deal with IncrementalSort plans because
-    * they are only created from corresponding Path nodes.
-    */
    Assert(IsA(plan, Sort));
 
    cost_sort(&sort_path, root, NIL,
@@ -5469,6 +5511,37 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
    plan->plan.parallel_safe = lefttree->parallel_safe;
 }
 
+/*
+ * Same as label_sort_with_costsize, but labels the IncrementalSort node
+ * instead.
+ */
+static void
+label_incrementalsort_with_costsize(PlannerInfo *root, IncrementalSort *plan,
+                                   List *pathkeys, double limit_tuples)
+{
+   Plan       *lefttree = plan->sort.plan.lefttree;
+   Path        sort_path;      /* dummy for result of cost_incremental_sort */
+
+   Assert(IsA(plan, IncrementalSort));
+
+   cost_incremental_sort(&sort_path, root, pathkeys,
+                         plan->nPresortedCols,
+                         plan->sort.plan.disabled_nodes,
+                         lefttree->startup_cost,
+                         lefttree->total_cost,
+                         lefttree->plan_rows,
+                         lefttree->plan_width,
+                         0.0,
+                         work_mem,
+                         limit_tuples);
+   plan->sort.plan.startup_cost = sort_path.startup_cost;
+   plan->sort.plan.total_cost = sort_path.total_cost;
+   plan->sort.plan.plan_rows = lefttree->plan_rows;
+   plan->sort.plan.plan_width = lefttree->plan_width;
+   plan->sort.plan.parallel_aware = false;
+   plan->sort.plan.parallel_safe = lefttree->parallel_safe;
+}
+
 /*
  * bitmap_subplan_mark_shared
  *  Set isshared flag in bitmap subplan so that it will be created in
index e14e73565675564a799248f265175af0d650c18e..495deb606e25aa8aa296927140676fb180688b6a 100644 (file)
@@ -2858,29 +2858,27 @@ GROUP BY w, x, z, y;
          ->  Index Scan using btg_x_y_idx on btg
 (6 rows)
 
--- Utilize the ordering of merge join to avoid a full Sort operation
+-- Utilize the ordering of merge join to avoid a Sort operation
 SET enable_hashjoin = off;
 SET enable_nestloop = off;
 EXPLAIN (COSTS OFF)
 SELECT count(*)
-  FROM btg t1 JOIN btg t2 ON t1.z = t2.z AND t1.w = t2.w AND t1.x = t2.x
-  GROUP BY t1.x, t1.y, t1.z, t1.w;
-                                  QUERY PLAN                                   
--------------------------------------------------------------------------------
+  FROM btg t1 JOIN btg t2 ON t1.w = t2.w AND t1.x = t2.x AND t1.z = t2.z
+  GROUP BY t1.w, t1.z, t1.x;
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
  GroupAggregate
-   Group Key: t1.z, t1.w, t1.x, t1.y
-   ->  Incremental Sort
-         Sort Key: t1.z, t1.w, t1.x, t1.y
-         Presorted Key: t1.z, t1.w, t1.x
-         ->  Merge Join
-               Merge Cond: ((t1.z = t2.z) AND (t1.w = t2.w) AND (t1.x = t2.x))
-               ->  Sort
-                     Sort Key: t1.z, t1.w, t1.x
-                     ->  Index Scan using btg_x_y_idx on btg t1
-               ->  Sort
-                     Sort Key: t2.z, t2.w, t2.x
-                     ->  Index Scan using btg_x_y_idx on btg t2
-(13 rows)
+   Group Key: t1.x, t1.w, t1.z
+   ->  Merge Join
+         Merge Cond: ((t1.x = t2.x) AND (t1.w = t2.w) AND (t1.z = t2.z))
+         ->  Incremental Sort
+               Sort Key: t1.x, t1.w, t1.z
+               Presorted Key: t1.x
+               ->  Index Scan using btg_x_y_idx on btg t1
+         ->  Sort
+               Sort Key: t2.x, t2.w, t2.z
+               ->  Index Scan using btg_x_y_idx on btg t2
+(11 rows)
 
 RESET enable_nestloop;
 RESET enable_hashjoin;
index 79f0d37a87ef677da4cddbdb5959258cc85ad320..c561b62b2db2a1323ad91aee86b964a851453358 100644 (file)
@@ -1701,3 +1701,24 @@ explain (costs off) select a, b, a <-> point(5, 5) dist from point_table order b
                Order By: (a <-> '(5,5)'::point)
 (6 rows)
 
+-- Ensure we get an incremental sort on the outer side of the mergejoin
+explain (costs off)
+select * from
+  (select * from tenk1 order by four) t1 join tenk1 t2 on t1.four = t2.four and t1.two = t2.two
+order by t1.four, t1.two limit 1;
+                              QUERY PLAN                               
+-----------------------------------------------------------------------
+ Limit
+   ->  Merge Join
+         Merge Cond: ((tenk1.four = t2.four) AND (tenk1.two = t2.two))
+         ->  Incremental Sort
+               Sort Key: tenk1.four, tenk1.two
+               Presorted Key: tenk1.four
+               ->  Sort
+                     Sort Key: tenk1.four
+                     ->  Seq Scan on tenk1
+         ->  Sort
+               Sort Key: t2.four, t2.two
+               ->  Seq Scan on tenk1 t2
+(12 rows)
+
index ddf38bafb4280d8ac8e5b97011d4983a8b5e7fce..4885daffe633e5f364a5347f25134268c4389334 100644 (file)
@@ -1232,13 +1232,13 @@ EXPLAIN (COSTS OFF) SELECT count(*)
 FROM (SELECT * FROM btg ORDER BY x, y, w, z) AS q1
 GROUP BY w, x, z, y;
 
--- Utilize the ordering of merge join to avoid a full Sort operation
+-- Utilize the ordering of merge join to avoid a Sort operation
 SET enable_hashjoin = off;
 SET enable_nestloop = off;
 EXPLAIN (COSTS OFF)
 SELECT count(*)
-  FROM btg t1 JOIN btg t2 ON t1.z = t2.z AND t1.w = t2.w AND t1.x = t2.x
-  GROUP BY t1.x, t1.y, t1.z, t1.w;
+  FROM btg t1 JOIN btg t2 ON t1.w = t2.w AND t1.x = t2.x AND t1.z = t2.z
+  GROUP BY t1.w, t1.z, t1.x;
 RESET enable_nestloop;
 RESET enable_hashjoin;
 
index ab471bdfffc12b3217608d8ae6a3f141626b29b3..98b20e17e180edd84a080142e8a7a361854ab264 100644 (file)
@@ -292,3 +292,9 @@ create index point_table_a_idx on point_table using gist(a);
 -- Ensure we get an incremental sort plan for both of the following queries
 explain (costs off) select a, b, a <-> point(5, 5) dist from point_table order by dist, b limit 1;
 explain (costs off) select a, b, a <-> point(5, 5) dist from point_table order by dist, b desc limit 1;
+
+-- Ensure we get an incremental sort on the outer side of the mergejoin
+explain (costs off)
+select * from
+  (select * from tenk1 order by four) t1 join tenk1 t2 on t1.four = t2.four and t1.two = t2.two
+order by t1.four, t1.two limit 1;