From e53b2126f519639cc76a4d402150b32d92a1e07c Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Wed, 16 Nov 2016 17:11:03 -0800 Subject: [PATCH] HIVE-15227 : Optimize join + gby into semijoin --- .../hadoop/hive/ql/parse/CalcitePlanner.java | 9 ++- ql/src/test/queries/clientpositive/join_aggr.q | 4 + ql/src/test/results/clientpositive/join_aggr.q.out | 91 ++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/join_aggr.q create mode 100644 ql/src/test/results/clientpositive/join_aggr.q.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 78011c2..4114008 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -81,6 +81,7 @@ import org.apache.calcite.rel.rules.SemiJoinFilterTransposeRule; import org.apache.calcite.rel.rules.SemiJoinJoinTransposeRule; import org.apache.calcite.rel.rules.SemiJoinProjectTransposeRule; +import org.apache.calcite.rel.rules.SemiJoinRule; import org.apache.calcite.rel.rules.UnionMergeRule; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFactory; @@ -1115,6 +1116,10 @@ public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlu perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Aggregate join transpose"); } + // convert Join + GBy to semijoin + // run this rule at later stages, since many calcite rules cant deal with semijoin + calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null, SemiJoinRule.INSTANCE); + // 7. Run rule to fix windowing issue when it is done over // aggregation columns (HIVE-10627) if (profilesCBO.contains(ExtendedCBOProfile.WINDOWING_POSTPROCESSING)) { @@ -1187,7 +1192,7 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv // TODO: Decorelation of subquery should be done before attempting // Partition Pruning; otherwise Expression evaluation may try to execute // corelated sub query. - + PerfLogger perfLogger = SessionState.getPerfLogger(); final int maxCNFNodeCount = conf.getIntVar(HiveConf.ConfVars.HIVE_CBO_CNF_NODES_LIMIT); @@ -3394,7 +3399,7 @@ private RelNode genUDTFPlan(GenericUDTF genericUDTF, String genericUDTFName, Str for (ColumnInfo ci : rs.getSignature()) { argTypeBldr.add(TypeConverter.convert(ci.getType(), dtFactory)); } - + SqlOperator calciteOp = SqlFunctionConverter.getCalciteOperator(genericUDTFName, genericUDTF, argTypeBldr.build(), retType); diff --git a/ql/src/test/queries/clientpositive/join_aggr.q b/ql/src/test/queries/clientpositive/join_aggr.q new file mode 100644 index 0000000..3887433 --- /dev/null +++ b/ql/src/test/queries/clientpositive/join_aggr.q @@ -0,0 +1,4 @@ +create table t1 (a int, b int); +create table t2 (c int, d int); + +explain select t1.b from t1 join (select c from t2 group by c)t3 on t1.a = t3.c; diff --git a/ql/src/test/results/clientpositive/join_aggr.q.out b/ql/src/test/results/clientpositive/join_aggr.q.out new file mode 100644 index 0000000..11a168b --- /dev/null +++ b/ql/src/test/results/clientpositive/join_aggr.q.out @@ -0,0 +1,91 @@ +PREHOOK: query: create table t1 (a int, b int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1 +POSTHOOK: query: create table t1 (a int, b int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1 +PREHOOK: query: create table t2 (c int, d int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t2 +POSTHOOK: query: create table t2 (c int, d int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2 +PREHOOK: query: explain select t1.b from t1 join (select c from t2 group by c)t3 on t1.a = t3.c +PREHOOK: type: QUERY +POSTHOOK: query: explain select t1.b from t1 join (select c from t2 group by c)t3 on t1.a = t3.c +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: int) + TableScan + alias: t2 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: c is not null (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: c (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Group By Operator + keys: _col0 (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Operator Tree: + Join Operator + condition map: + Left Semi Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: _col1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + -- 1.7.12.4 (Apple Git-37)