diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java index 2c2f91b520..b52fe85def 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java @@ -35,6 +35,7 @@ import org.apache.calcite.plan.RelOptCluster; import org.apache.calcite.plan.RelOptTable; import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.plan.hep.HepRelVertex; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.RelReferentialConstraint; import org.apache.calcite.rel.core.Aggregate; @@ -54,6 +55,7 @@ import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexFieldAccess; import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; import org.apache.calcite.rex.RexOver; import org.apache.calcite.rex.RexTableInputRef; @@ -66,6 +68,9 @@ import org.apache.calcite.util.ImmutableBitSet; import org.apache.calcite.util.Pair; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; @@ -1047,4 +1052,32 @@ public static String toJsonString(final RelNode rel) { return planWriter.asString(); } + public static boolean produceAtmostOneRow(RelNode rel) { + if(rel instanceof HepRelVertex) { + rel = ((HepRelVertex)rel).getCurrentRel(); + } + if(rel instanceof HiveProject) { + if( ((HiveProject)rel).hasWindowingExpr()) { + return false; + } else { + return produceAtmostOneRow(((HiveProject)rel).getInput()); + } + } else if (rel instanceof HiveAggregate) { + // if there is no group by keys and only aggregate + // TODO: group by keys are constant + return ((HiveAggregate)rel).getGroupCount() == 0 ? true: false; + } else if (rel instanceof HiveSortLimit) { + // if LIMIT is less than equal to 1 + RexNode fetch = ((HiveSortLimit)rel).getFetchExpr(); + if(fetch != null) { + int limit = RexLiteral.intValue(fetch); + if(limit <=1 ) { + return true; + } + } + } else { + return false; + } + return false; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveProject.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveProject.java index 67312a4ee1..dbe8d39b46 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveProject.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveProject.java @@ -31,6 +31,7 @@ import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexOver; import org.apache.calcite.rex.RexUtil; import org.apache.calcite.util.Util; import org.apache.calcite.util.mapping.Mapping; @@ -198,4 +199,13 @@ public boolean isSynthetic() { return shuttle.visit(this); } + public boolean hasWindowingExpr() { + for (RexNode expr : this.getChildExps()) { + if (expr instanceof RexOver) { + // Bail out as it may change cardinality + return true; + } + } + return false; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSortLimitRemoveRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSortLimitRemoveRule.java new file mode 100644 index 0000000000..2c555b8c49 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSortLimitRemoveRule.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptRuleOperand; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.rex.RexLiteral; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit; + +/** + * Planner rule that removes + * a {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit}. + * Note that this is different from HiveSortRemoveRule because this is not based on statistics + */ +public class HiveSortLimitRemoveRule extends RelOptRule { + + //~ Constructors ----------------------------------------------------------- + + public HiveSortLimitRemoveRule() { + this(operand(HiveSortLimit.class, any())); + } + + private HiveSortLimitRemoveRule(RelOptRuleOperand operand) { + super(operand); + } + + //~ Methods ---------------------------------------------------------------- + + @Override + public boolean matches(RelOptRuleCall call) { + final HiveSortLimit sortLimit = call.rel(0); + + return HiveRelOptUtil.produceAtmostOneRow(sortLimit.getInput()); + } + + @Override + public void onMatch(RelOptRuleCall call) { + final HiveSortLimit sortLimit = call.rel(0); + + // We remove the limit operator + call.transformTo(sortLimit.getInput()); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index f5a1c74671..c395d9b3c7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -212,6 +212,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSemiJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortLimitPullUpConstantsRule; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortLimitRemoveRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortMergeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortProjectTransposeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortRemoveRule; @@ -1921,6 +1922,11 @@ public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlu perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Window fixing rule"); } + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER); + calciteOptimizedPlan= hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null, + new HiveSortLimitRemoveRule()); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, + "Calcite: Trying to remove Limit and Order by"); // 10. Apply Druid transformation rules perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER); calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null, diff --git a/ql/src/test/queries/clientpositive/cbo_limit.q b/ql/src/test/queries/clientpositive/cbo_limit.q index 24c1eae228..2f6809b8cf 100644 --- a/ql/src/test/queries/clientpositive/cbo_limit.q +++ b/ql/src/test/queries/clientpositive/cbo_limit.q @@ -17,3 +17,20 @@ select key, c_int from(select key, c_int from (select key, c_int from cbo_t1 ord select cbo_t3.c_int, c, count(*) from (select key as a, c_int+1 as b, sum(c_int) as c from cbo_t1 where (cbo_t1.c_int + 1 >= 0) and (cbo_t1.c_int > 0 or cbo_t1.c_float >= 0) group by c_float, cbo_t1.c_int, key order by a limit 5) cbo_t1 join (select key as p, c_int+1 as q, sum(c_int) as r from cbo_t2 where (cbo_t2.c_int + 1 >= 0) and (cbo_t2.c_int > 0 or cbo_t2.c_float >= 0) group by c_float, cbo_t2.c_int, key order by q/10 desc, r asc limit 5) cbo_t2 on cbo_t1.a=p join cbo_t3 on cbo_t1.a=key where (b + cbo_t2.q >= 0) and (b > 0 or c_int >= 0) group by cbo_t3.c_int, c order by cbo_t3.c_int+c desc, c limit 5; select cbo_t3.c_int, c, count(*) from (select key as a, c_int+1 as b, sum(c_int) as c from cbo_t1 where (cbo_t1.c_int + 1 >= 0) and (cbo_t1.c_int > 0 or cbo_t1.c_float >= 0) group by c_float, cbo_t1.c_int, key having cbo_t1.c_float > 0 and (c_int >=1 or c_float >= 1) and (c_int + c_float) >= 0 order by b % c asc, b desc limit 5) cbo_t1 left outer join (select key as p, c_int+1 as q, sum(c_int) as r from cbo_t2 where (cbo_t2.c_int + 1 >= 0) and (cbo_t2.c_int > 0 or cbo_t2.c_float >= 0) group by c_float, cbo_t2.c_int, key having cbo_t2.c_float > 0 and (c_int >=1 or c_float >= 1) and (c_int + c_float) >= 0 limit 5) cbo_t2 on cbo_t1.a=p left outer join cbo_t3 on cbo_t1.a=key where (b + cbo_t2.q >= 0) and (b > 0 or c_int >= 0) group by cbo_t3.c_int, c having cbo_t3.c_int > 0 and (c_int >=1 or c >= 1) and (c_int + c) >= 0 order by cbo_t3.c_int % c asc, cbo_t3.c_int, c desc limit 5; + +-- order by and limit +explain cbo select count(*) cs from cbo_t1 where c_int > 1 order by cs limit 100; +select count(*) cs from cbo_t1 where c_int > 1 order by cs limit 100; + +-- only order by +explain cbo select count(*) cs from cbo_t1 where c_int > 1 order by cs ; +select count(*) cs from cbo_t1 where c_int > 1 order by cs ; + +-- only LIMIT +explain cbo select count(*) cs from cbo_t1 where c_int > 1 LIMIT 100; +select count(*) cs from cbo_t1 where c_int > 1 LIMIT 100; + +-- windowing +explain cbo select count(*) over (partition by c_int) cs from cbo_t1 where c_float > 1.0 ORDER BY cs LIMIT 100; +select count(*) over (partition by c_int) cs from cbo_t1 where c_float > 1.0 ORDER BY cs LIMIT 100; + diff --git a/ql/src/test/results/clientpositive/llap/cbo_limit.q.out b/ql/src/test/results/clientpositive/llap/cbo_limit.q.out index 87a5770ad2..8cc0d27250 100644 --- a/ql/src/test/results/clientpositive/llap/cbo_limit.q.out +++ b/ql/src/test/results/clientpositive/llap/cbo_limit.q.out @@ -86,3 +86,107 @@ POSTHOOK: Input: default@cbo_t3 #### A masked pattern was here #### 1 12 6 1 2 6 +PREHOOK: query: explain cbo select count(*) cs from cbo_t1 where c_int > 1 order by cs limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) cs from cbo_t1 where c_int > 1 order by cs limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count()]) + HiveFilter(condition=[>($2, 1)]) + HiveTableScan(table=[[default, cbo_t1]], table:alias=[cbo_t1]) + +PREHOOK: query: select count(*) cs from cbo_t1 where c_int > 1 order by cs limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) cs from cbo_t1 where c_int > 1 order by cs limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +0 +PREHOOK: query: explain cbo select count(*) cs from cbo_t1 where c_int > 1 order by cs +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) cs from cbo_t1 where c_int > 1 order by cs +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count()]) + HiveFilter(condition=[>($2, 1)]) + HiveTableScan(table=[[default, cbo_t1]], table:alias=[cbo_t1]) + +PREHOOK: query: select count(*) cs from cbo_t1 where c_int > 1 order by cs +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) cs from cbo_t1 where c_int > 1 order by cs +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +0 +PREHOOK: query: explain cbo select count(*) cs from cbo_t1 where c_int > 1 LIMIT 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) cs from cbo_t1 where c_int > 1 LIMIT 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count()]) + HiveFilter(condition=[>($2, 1)]) + HiveTableScan(table=[[default, cbo_t1]], table:alias=[cbo_t1]) + +PREHOOK: query: select count(*) cs from cbo_t1 where c_int > 1 LIMIT 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) cs from cbo_t1 where c_int > 1 LIMIT 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +0 +PREHOOK: query: explain cbo select count(*) over (partition by c_int) cs from cbo_t1 where c_float > 1.0 ORDER BY cs LIMIT 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) over (partition by c_int) cs from cbo_t1 where c_float > 1.0 ORDER BY cs LIMIT 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +CBO PLAN: +HiveSortLimit(sort0=[$0], dir0=[ASC], fetch=[100]) + HiveProject(count_window_0=[count() OVER (PARTITION BY $2 ORDER BY $2 NULLS FIRST ROWS BETWEEN 2147483647 FOLLOWING AND 2147483647 PRECEDING)]) + HiveFilter(condition=[>($3, 1.0E0)]) + HiveTableScan(table=[[default, cbo_t1]], table:alias=[cbo_t1]) + +PREHOOK: query: select count(*) over (partition by c_int) cs from cbo_t1 where c_float > 1.0 ORDER BY cs LIMIT 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) over (partition by c_int) cs from cbo_t1 where c_float > 1.0 ORDER BY cs LIMIT 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here ####