diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 7f29da2..b3b047d 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1245,6 +1245,14 @@ public void setSparkConfigUpdated(boolean isSparkConfigUpdated) { "we are increasing the number of files possibly by a big margin. So, we merge aggressively."), HIVEOPTCORRELATION("hive.optimize.correlation", false, "exploit intra-query correlations."), + HIVE_OPTIMIZE_LIMIT_JOIN_TRANSPOSE("hive.optimize.limitjointranspose", false, + "Whether to push a limit through left/right outer join. If the value is true and the size of the outer\n" + + "input is reduced enough (as specified in hive.optimize.limitjointranspose.reduction), the limit is pushed\n" + + "to the outer input; to remain semantically correct, the limit is kept on top of the join too."), + HIVE_OPTIMIZE_LIMIT_JOIN_TRANSPOSE_REDUCTION_PERCENTAGE("hive.optimize.limitjointranspose.reduction", 1.0f, + "When hive.optimize.limitjointranspose is true, this variable specifies the minimal reduction of the\n" + + "size of the outer input of the join that we should get in order to apply the rule."), + HIVE_HADOOP_SUPPORTS_SUBDIRECTORIES("hive.mapred.supports.subdirectories", false, "Whether the version of Hadoop which is running supports sub-directories for tables/partitions. \n" + "Many Hive optimizations can be applied if the Hadoop version supports sub-directories for\n" + diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index b47d1b5..c90b5d9 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -341,6 +341,7 @@ minitez.query.files=bucket_map_join_tez1.q,\ tez_insert_overwrite_local_directory_1.q,\ tez_dynpart_hashjoin_1.q,\ tez_dynpart_hashjoin_2.q,\ + tez_dynpart_hashjoin_3.q,\ tez_vector_dynpart_hashjoin_1.q,\ tez_vector_dynpart_hashjoin_2.q,\ tez_join_hash.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSortJoinReduceRule.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSortJoinReduceRule.java new file mode 100644 index 0000000..4c19328 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSortJoinReduceRule.java @@ -0,0 +1,204 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptRuleOperand; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.plan.hep.HepRelVertex; +import org.apache.calcite.rel.RelCollations; +import org.apache.calcite.rel.RelFieldCollation; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.rex.RexLiteral; +import org.apache.calcite.util.mapping.Mappings; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit; + +import com.google.common.collect.ImmutableList; + +/** + * Planner rule that pushes + * a {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort} + * past a {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin}. + */ +public class HiveSortJoinReduceRule extends RelOptRule { + + protected final float reduction; + + //~ Constructors ----------------------------------------------------------- + + public HiveSortJoinReduceRule(float reduction) { + this( + operand( + HiveSortLimit.class, + operand(HiveJoin.class, any())), reduction); + } + + private HiveSortJoinReduceRule(RelOptRuleOperand operand, float reduction) { + super(operand); + this.reduction = reduction; + } + + //~ Methods ---------------------------------------------------------------- + + @Override + public boolean matches(RelOptRuleCall call) { + final HiveSortLimit sortLimit = call.rel(0); + final HiveJoin join = call.rel(1); + + return matches(sortLimit, null, join); + } + + protected boolean matches(HiveSortLimit sortLimit, HiveProject project, HiveJoin join) { + // If sort does not consist only of a limit operation, we bail out + if (sortLimit.getCollation() != RelCollations.EMPTY) { + return false; + } + + // Determine mapping between project input and output fields. If sort + // relies on non-trivial expressions, we can't push. 
+ if (project != null) { // Project is optional + final Mappings.TargetMapping map = + RelOptUtil.permutation( + project.getProjects(), project.getInput().getRowType()); + for (RelFieldCollation fc : sortLimit.getCollation().getFieldCollations()) { + if (map.getTargetOpt(fc.getFieldIndex()) < 0) { + return false; + } + } + } + + // If join is not a left or right outer, we bail out + RelNode reducedInput; + if (join.getJoinType() == JoinRelType.LEFT) { + reducedInput = join.getLeft(); + } else if (join.getJoinType() == JoinRelType.RIGHT) { + reducedInput = join.getRight(); + } else { + return false; + } + + // Finally, if we do not reduce the size input enough, we bail out + if (RexLiteral.intValue(sortLimit.fetch) + > reduction * RelMetadataQuery.getRowCount(reducedInput)) { + return false; + } + + return true; + } + + @Override + public void onMatch(RelOptRuleCall call) { + final HiveSortLimit sortLimit = call.rel(0); + final HiveJoin join = call.rel(1); + + onMatch(call, sortLimit, null, join); + } + + protected void onMatch(RelOptRuleCall call, HiveSortLimit sortLimit, + HiveProject project, HiveJoin join) { + int limit = RexLiteral.intValue(sortLimit.fetch); + // We create a new sort operator on the corresponding input + RelNode inputLeft; + RelNode inputRight; + if (join.getJoinType() == JoinRelType.LEFT) { + if (((HepRelVertex)join.getLeft()).getCurrentRel() instanceof HiveSortLimit) { + HiveSortLimit inputSort = (HiveSortLimit) ((HepRelVertex)join.getLeft()).getCurrentRel(); + // If we have already pushed the limit or it is already lower, we bail out + if (limit >= RexLiteral.intValue(inputSort.fetch)) { + return; + } + // If another limit is present, we update it + inputLeft = inputSort.copy(inputSort.getTraitSet(), inputSort.getInput(), + inputSort.getCollation(), inputSort.offset, sortLimit.fetch); + } else { + inputLeft = sortLimit.copy(sortLimit.getTraitSet(), join.getLeft(), sortLimit.getCollation(), + sortLimit.offset, sortLimit.fetch); + } + inputRight = join.getRight(); + } else { + inputLeft = join.getLeft(); + if (((HepRelVertex)join.getRight()).getCurrentRel() instanceof HiveSortLimit) { + HiveSortLimit inputSort = (HiveSortLimit) ((HepRelVertex)join.getRight()).getCurrentRel(); + // If we have already pushed the limit or it is already lower, we bail out + if (limit >= RexLiteral.intValue(inputSort.fetch)) { + return; + } + // If another limit is present, we update it + inputRight = inputSort.copy(inputSort.getTraitSet(), inputSort.getInput(), + inputSort.getCollation(), inputSort.offset, sortLimit.fetch); + } else { + inputRight = sortLimit.copy(sortLimit.getTraitSet(), join.getRight(), sortLimit.getCollation(), + sortLimit.offset, sortLimit.fetch); + } + } + // We copy the join, the project (optional), and the top sort operator + RelNode result = join.copy(join.getTraitSet(), join.getCondition(), inputLeft, + inputRight, join.getJoinType(), join.isSemiJoinDone()); + if (project != null) { + result = project.copy(project.getTraitSet(), ImmutableList.of(result)); + } + result = sortLimit.copy(sortLimit.getTraitSet(), result, sortLimit.getCollation(), + sortLimit.offset, sortLimit.fetch); + + call.transformTo(result); + } + + /** + * Planner rule that pushes + * a {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort} + * past a {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin} + * even in the presence of a {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject} + */ + public static class 
HiveSortProjectJoinReduceRule extends HiveSortJoinReduceRule { + + //~ Constructors ----------------------------------------------------------- + + public HiveSortProjectJoinReduceRule(float reduction) { + super( + operand(HiveSortLimit.class, + operand(HiveProject.class, + operand(HiveJoin.class, any()))), reduction); + } + + //~ Methods ---------------------------------------------------------------- + + @Override + public boolean matches(RelOptRuleCall call) { + final HiveSortLimit sortLimit = call.rel(0); + final HiveProject project = call.rel(1); + final HiveJoin join = call.rel(2); + + return matches(sortLimit, project, join); + } + + @Override + public void onMatch(RelOptRuleCall call) { + final HiveSortLimit sortLimit = call.rel(0); + final HiveProject project = call.rel(1); + final HiveJoin join = call.rel(2); + + onMatch(call, sortLimit, project, join); + } + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 0a7ce3a..bfe0e43 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -149,6 +149,8 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePreFilteringRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectMergeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelFieldTrimmer; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule.HiveSortProjectJoinReduceRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveWindowingFixRule; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter; @@ -973,16 +975,26 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv basePlan = hepPlan(basePlan, true, mdProvider, HiveExpandDistinctAggregatesRule.INSTANCE); } - // 1. Push Down Semi Joins + // 1. Push down limit through outer join + if (conf.getBoolVar(HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_JOIN_TRANSPOSE)) { + // This should be a cost based decision, but till we enable the extended cost + // model, we will use the given value for the variable + final float reduction = HiveConf.getFloatVar(conf, + HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_JOIN_TRANSPOSE_REDUCTION_PERCENTAGE); + basePlan = hepPlan(basePlan, true, mdProvider, new HiveSortJoinReduceRule(reduction), + new HiveSortProjectJoinReduceRule(reduction), HiveProjectMergeRule.INSTANCE); + } + + // 2. Push Down Semi Joins basePlan = hepPlan(basePlan, true, mdProvider, SemiJoinJoinTransposeRule.INSTANCE, SemiJoinFilterTransposeRule.INSTANCE, SemiJoinProjectTransposeRule.INSTANCE); - // 2. Add not null filters + // 3. Add not null filters if (conf.getBoolVar(HiveConf.ConfVars.HIVE_CBO_RETPATH_HIVEOP)) { basePlan = hepPlan(basePlan, true, mdProvider, HiveJoinAddNotNullRule.INSTANCE); } - // 3. Constant propagation, common filter extraction, and PPD + // 4. Constant propagation, common filter extraction, and PPD basePlan = hepPlan(basePlan, true, mdProvider, ReduceExpressionsRule.PROJECT_INSTANCE, ReduceExpressionsRule.FILTER_INSTANCE, @@ -996,19 +1008,19 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv new FilterAggregateTransposeRule(Filter.class, HiveFilter.DEFAULT_FILTER_FACTORY, Aggregate.class)); - // 4. 
Transitive inference & Partition Pruning + // 5. Transitive inference & Partition Pruning basePlan = hepPlan(basePlan, false, mdProvider, new HiveJoinPushTransitivePredicatesRule( Join.class, HiveFilter.DEFAULT_FILTER_FACTORY), new HivePartitionPruneRule(conf)); - // 5. Projection Pruning + // 6. Projection Pruning HiveRelFieldTrimmer fieldTrimmer = new HiveRelFieldTrimmer(null, HiveProject.DEFAULT_PROJECT_FACTORY, HiveFilter.DEFAULT_FILTER_FACTORY, HiveJoin.HIVE_JOIN_FACTORY, HiveSemiJoin.HIVE_SEMIJOIN_FACTORY, HiveSortLimit.HIVE_SORT_REL_FACTORY, HiveAggregate.HIVE_AGGR_REL_FACTORY, HiveUnion.UNION_REL_FACTORY); basePlan = fieldTrimmer.trim(basePlan); - // 6. Rerun PPD through Project as column pruning would have introduced DT + // 7. Rerun PPD through Project as column pruning would have introduced DT // above scans basePlan = hepPlan(basePlan, true, mdProvider, new FilterProjectTransposeRule(Filter.class, HiveFilter.DEFAULT_FILTER_FACTORY, diff --git ql/src/test/queries/clientpositive/limit_join_transpose.q ql/src/test/queries/clientpositive/limit_join_transpose.q new file mode 100644 index 0000000..a5712f0 --- /dev/null +++ ql/src/test/queries/clientpositive/limit_join_transpose.q @@ -0,0 +1,41 @@ +set hive.optimize.limitjointranspose=false; + +explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1; + +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1; + + +set hive.optimize.limitjointranspose=true; +set hive.optimize.limitjointranspose.reduction=0.0001f; + +explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1; + +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1; + + +set hive.optimize.limitjointranspose.reduction=0.1f; + +explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1; + +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1; diff --git ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_3.q ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_3.q new file mode 100644 index 0000000..f8451ac --- /dev/null +++ ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_3.q @@ -0,0 +1,23 @@ +set hive.optimize.limitjointranspose=true; +set hive.optimize.limitjointranspose.reduction=0.1f; +set hive.explain.user=false; +set hive.auto.convert.join=false; +set hive.optimize.dynamic.partition.hashjoin=false; + +explain +select a.* +from alltypesorc a left outer join src b +on a.cint = cast(b.key as int) and (a.cint < 100) +limit 1; + + +set hive.auto.convert.join=true; +set hive.optimize.dynamic.partition.hashjoin=true; +set hive.auto.convert.join.noconditionaltask.size=200000; +set hive.exec.reducers.bytes.per.reducer=200000; + +explain +select a.* +from alltypesorc a left outer join src b +on a.cint = cast(b.key as int) and (a.cint < 100) +limit 1; diff --git ql/src/test/results/clientpositive/limit_join_transpose.q.out ql/src/test/results/clientpositive/limit_join_transpose.q.out new file mode 100644 index 0000000..73ac200 --- /dev/null +++ ql/src/test/results/clientpositive/limit_join_transpose.q.out @@ -0,0 +1,288 @@ +PREHOOK: query: explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + 
Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + TableScan + alias: src1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 val_0 0 val_0 +PREHOOK: query: explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + TableScan + alias: src1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 val_0 0 val_0 +PREHOOK: query: explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string) + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: string), VALUE._col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + TableScan + alias: src1 + Statistics: Num rows: 500 Data size: 
5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select * +from src src1 left outer join src src2 +on src1.key = src2.key +limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +238 val_238 238 val_238 diff --git ql/src/test/results/clientpositive/tez/tez_dynpart_hashjoin_3.q.out ql/src/test/results/clientpositive/tez/tez_dynpart_hashjoin_3.q.out new file mode 100644 index 0000000..29ffb47 --- /dev/null +++ ql/src/test/results/clientpositive/tez/tez_dynpart_hashjoin_3.q.out @@ -0,0 +1,192 @@ +PREHOOK: query: explain +select a.* +from alltypesorc a left outer join src b +on a.cint = cast(b.key as int) and (a.cint < 100) +limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select a.* +from alltypesorc a left outer join src b +on a.cint = cast(b.key as int) and (a.cint < 100) +limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Map 4 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: tinyint), _col1 (type: 
smallint), _col2 (type: int), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) + Map 4 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: UDFToInteger(_col0) (type: int) + sort order: + + Map-reduce partition columns: UDFToInteger(_col0) (type: int) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), VALUE._col2 (type: int), VALUE._col3 (type: bigint), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: timestamp), VALUE._col9 (type: timestamp), VALUE._col10 (type: boolean), VALUE._col11 (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col2 (type: int) + sort order: + + Map-reduce partition columns: _col2 (type: int) + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) + Reducer 3 + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Outer Join0 to 1 + filter predicates: + 0 {(KEY.reducesinkkey0 < 100)} + 1 + keys: + 0 _col2 (type: int) + 1 UDFToInteger(_col0) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: explain +select a.* +from alltypesorc a left outer join src b +on a.cint = cast(b.key as int) and (a.cint < 100) +limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select a.* +from alltypesorc a left outer join src b +on a.cint = cast(b.key as int) and (a.cint < 100) +limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 12288 Data 
size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col2 (type: int), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) + Map 3 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: UDFToInteger(_col0) (type: int) + sort order: + + Map-reduce partition columns: UDFToInteger(_col0) (type: int) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reducer 2 + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), VALUE._col2 (type: int), VALUE._col3 (type: bigint), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: string), VALUE._col7 (type: string), VALUE._col8 (type: timestamp), VALUE._col9 (type: timestamp), VALUE._col10 (type: boolean), VALUE._col11 (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Left Outer Join0 to 1 + filter predicates: + 0 {(_col2 < 100)} + 1 + keys: + 0 _col2 (type: int) + 1 UDFToInteger(_col0) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + input vertices: + 1 Map 3 + Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + HybridGraceHashJoin: true + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink +
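For reference, the condition that gates the new rule in HiveSortJoinReduceRule.matches() is that the limit must not exceed hive.optimize.limitjointranspose.reduction times the estimated row count of the outer join input; otherwise the rule bails out (RexLiteral.intValue(sortLimit.fetch) > reduction * RelMetadataQuery.getRowCount(reducedInput)). The following standalone sketch restates that check outside the planner; the class and method names are illustrative only and are not part of this patch.

public class LimitJoinTransposeCheck {
  // Mirrors the bail-out test in HiveSortJoinReduceRule.matches(): the limit
  // is pushed to the outer input of the join only when it is small enough
  // relative to that input's estimated row count.
  static boolean shouldPushLimit(int limit, double outerInputRowCount, float reduction) {
    return limit <= reduction * outerInputRowCount;
  }

  public static void main(String[] args) {
    // With the 500-row src table used in limit_join_transpose.q:
    // reduction = 0.0001 -> 1 > 0.05, so the rule does not fire (plan unchanged).
    System.out.println(shouldPushLimit(1, 500d, 0.0001f)); // false
    // reduction = 0.1 -> 1 <= 50, so the rule fires and the limit is also
    // pushed below the join (the extra Limit in Stage-1 of the third EXPLAIN).
    System.out.println(shouldPushLimit(1, 500d, 0.1f));    // true
  }
}

When the rule fires, the limit is pushed onto the left (outer) input of the left outer join while the original Limit is kept on top of the join for correctness, matching the expected outputs above.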