diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index da45f1a..f200e16 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -440,6 +440,19 @@ HIVEJOINEMITINTERVAL("hive.join.emit.interval", 1000), HIVEJOINCACHESIZE("hive.join.cache.size", 25000), + // CBO related + /* + * Flag to control enabling Cost Based Optimizations using Optiq framework. + */ + HIVE_CBO_ENABLED("hive.cbo.enable", false), + /* + * Control queries that will be considered for join reordering, based on + * number of joins in them. Beyond a certain number of joins, the cost of + * considering possible permutations is prohibitive. + */ + HIVE_CBO_MAX_JOINS_SUPPORTED("hive.cbo.max.joins.supported", 10), + HIVE_CBO_PULLPROJECTABOVEJOIN_RULE("hive.cbo.project.pullabovejoin.rule", false), + // hive.mapjoin.bucket.cache.size has been replaced by hive.smbjoin.cache.row, // need to remove by hive .13. Also, do not change default (see SMB operator) HIVEMAPJOINBUCKETCACHESIZE("hive.mapjoin.bucket.cache.size", 100), diff --git a/conf/hive-default.xml.template b/conf/hive-default.xml.template index 0e6aaf9..f02283d 100644 --- a/conf/hive-default.xml.template +++ b/conf/hive-default.xml.template @@ -547,6 +547,23 @@ + hive.cbo.enable + false + + Flag to control enabling Cost Based Optimizations using Optiq framework. + + + + + hive.cbo.max.joins.supported + 10 + + Control queries that will be considered for join reordering, based on number of joins in them. + Beyond a certain number of joins, the cost of considering possible permutations is prohibitive. + + + + hive.mapred.supports.subdirectories false Whether the version of Hadoop which is running supports sub-directories for tables/partitions. diff --git a/pom.xml b/pom.xml index 426dca8..2c6c4af 100644 --- a/pom.xml +++ b/pom.xml @@ -195,6 +195,17 @@ false + + conjars + Optiq Conjars repository + http://conjars.org/repo + default + + true + always + warn + + diff --git a/ql/pom.xml b/ql/pom.xml index afdaa19..53861fc 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -28,6 +28,7 @@ Hive Query Language + 0.5 .. 
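As an aside (not part of the patch), a minimal sketch of how the two configuration knobs added above might be toggled and read back through HiveConf; the class name and local variable names below are illustrative only, while the ConfVars themselves come from this diff:

```java
import org.apache.hadoop.hive.conf.HiveConf;

public class CboConfigSketch {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();

    // Enable the Optiq-based optimizer for this configuration.
    conf.setBoolVar(HiveConf.ConfVars.HIVE_CBO_ENABLED, true);

    // Read the flags back; the defaults (false, 10) mirror the values added above.
    boolean cboOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED);
    int maxJoins = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_CBO_MAX_JOINS_SUPPORTED);
    System.out.println("CBO enabled: " + cboOn + ", join-reordering cutoff: " + maxJoins);
  }
}
```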
@@ -176,6 +177,24 @@ ${datanucleus-core.version} + net.hydromatic + optiq-core + ${optiq.version} + + + + org.hsqldb + hsqldb + + + com.fasterxml.jackson.core + jackson-databind + + + + com.google.guava guava ${guava.version} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java b/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java index 1ba5654..0b1bcfb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/QueryProperties.java @@ -48,12 +48,25 @@ boolean mapJoinRemoved = false; boolean hasMapGroupBy = false; + private int noOfJoins = 0; + private int noOfOuterJoins = 0; + public boolean hasJoin() { - return hasJoin; + return (noOfJoins > 0); + } + + public void incrementJoinCount(boolean noOuterJoin) { + noOfJoins++; + if (!noOuterJoin) + noOfOuterJoins++; + } + + public int getJoinCount() { + return noOfJoins; } - public void setHasJoin(boolean hasJoin) { - this.hasJoin = hasJoin; + public int getOuterJoinCount() { + return noOfOuterJoins; } public boolean hasGroupBy() { @@ -143,4 +156,25 @@ public boolean isHasMapGroupBy() { public void setHasMapGroupBy(boolean hasMapGroupBy) { this.hasMapGroupBy = hasMapGroupBy; } + + public void clear() { + hasJoin = false; + hasGroupBy = false; + hasOrderBy = false; + hasSortBy = false; + hasJoinFollowedByGroupBy = false; + hasPTF = false; + hasWindowing = false; + + // does the query have a using clause + usesScript = false; + + hasDistributeBy = false; + hasClusterBy = false; + mapJoinRemoved = false; + hasMapGroupBy = false; + + noOfJoins = 0; + noOfOuterJoins = 0; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CostBasedOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CostBasedOptimizer.java new file mode 100644 index 0000000..069daa7 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CostBasedOptimizer.java @@ -0,0 +1,219 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
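A small illustrative sketch (again, not part of the patch) of how the new join counters in QueryProperties behave; in Hive itself the semantic analyzer would be the caller that bumps the counter once per join it encounters:

```java
import org.apache.hadoop.hive.ql.QueryProperties;

public class QueryPropertiesJoinCountSketch {
  public static void main(String[] args) {
    QueryProperties qp = new QueryProperties();

    qp.incrementJoinCount(true);   // inner join: noOuterJoin == true
    qp.incrementJoinCount(false);  // outer join: noOuterJoin == false

    System.out.println(qp.hasJoin());            // true
    System.out.println(qp.getJoinCount());       // 2
    System.out.println(qp.getOuterJoinCount());  // 1

    qp.clear();                                  // reset before analyzing the next query
  }
}
```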
+ */ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; + +import net.hydromatic.optiq.SchemaPlus; +import net.hydromatic.optiq.tools.Frameworks; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.QueryProperties; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.optimizer.optiq.HiveDefaultRelMetadataProvider; +import org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveVolcanoPlanner; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.rules.HiveMergeProjectRule; +import org.apache.hadoop.hive.ql.optimizer.optiq.rules.HivePullUpProjectsAboveJoinRule; +import org.apache.hadoop.hive.ql.optimizer.optiq.rules.HivePushJoinThroughJoinRule; +import org.apache.hadoop.hive.ql.optimizer.optiq.rules.HiveSwapJoinRule; +import org.apache.hadoop.hive.ql.optimizer.optiq.translator.ASTConverter; +import org.apache.hadoop.hive.ql.optimizer.optiq.translator.RelNodeConverter; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.eigenbase.rel.RelCollationImpl; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.metadata.CachingRelMetadataProvider; +import org.eigenbase.rel.metadata.ChainedRelMetadataProvider; +import org.eigenbase.rel.metadata.RelMetadataProvider; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.RelOptQuery; +import org.eigenbase.relopt.RelOptSchema; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.rex.RexBuilder; + +/* + * Entry point to Optimizations using Optiq. + */ +public class CostBasedOptimizer implements Frameworks.PlannerAction { + private static final Set m_unsupportedOpTypes = ImmutableSet.of(OperatorType.DEMUX, + OperatorType.FORWARD, + OperatorType.LATERALVIEWFORWARD, + OperatorType.LATERALVIEWJOIN, + OperatorType.MUX, + OperatorType.PTF, + OperatorType.SCRIPT, + OperatorType.UDTF, + OperatorType.UNION); + + @SuppressWarnings("rawtypes") + private final Operator m_sinkOp; + private final SemanticAnalyzer m_semanticAnalyzer; + private final ParseContext m_ParseContext; + + public CostBasedOptimizer(@SuppressWarnings("rawtypes") Operator sinkOp, + SemanticAnalyzer semanticAnalyzer, ParseContext pCtx) { + m_sinkOp = sinkOp; + m_semanticAnalyzer = semanticAnalyzer; + m_ParseContext = pCtx; + } + + /* + * Currently contract is given a Hive Operator Tree, it returns an optimal + * plan as an Hive AST. 
+ */ + public static ASTNode optimize(@SuppressWarnings("rawtypes") Operator sinkOp, + SemanticAnalyzer semanticAnalyzer, ParseContext pCtx, List resultSchema) { + ASTNode optiqOptimizedAST = null; + RelNode optimizedOptiqPlan = Frameworks.withPlanner(new CostBasedOptimizer(sinkOp, + semanticAnalyzer, pCtx)); + optiqOptimizedAST = ASTConverter.convert(optimizedOptiqPlan, resultSchema); + + return optiqOptimizedAST; + } + + @Override + @SuppressWarnings("unchecked") + public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlus schema) { + RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(); + + /* + * recreate cluster, so that it picks up the additional traitDef + */ + final RelOptQuery query = new RelOptQuery(planner); + final RexBuilder rexBuilder = cluster.getRexBuilder(); + cluster = query.createCluster(rexBuilder.getTypeFactory(), rexBuilder); + List list = Lists.newArrayList(); + list.add(HiveDefaultRelMetadataProvider.INSTANCE); + planner.registerMetadataProviders(list); + + RelMetadataProvider chainedProvider = ChainedRelMetadataProvider.of(list); + cluster.setMetadataProvider(new CachingRelMetadataProvider(chainedProvider, planner)); + + RelNode opTreeInOptiq = RelNodeConverter.convert(m_sinkOp, cluster, relOptSchema, + m_semanticAnalyzer, m_ParseContext); + + planner.clearRules(); + planner.addRule(HiveSwapJoinRule.INSTANCE); + planner.addRule(HivePushJoinThroughJoinRule.LEFT); + planner.addRule(HivePushJoinThroughJoinRule.RIGHT); + if (HiveConf.getBoolVar(m_ParseContext.getConf(), + HiveConf.ConfVars.HIVE_CBO_PULLPROJECTABOVEJOIN_RULE)) { + planner.addRule(HivePullUpProjectsAboveJoinRule.BOTH_PROJECT); + planner.addRule(HivePullUpProjectsAboveJoinRule.LEFT_PROJECT); + planner.addRule(HivePullUpProjectsAboveJoinRule.RIGHT_PROJECT); + planner.addRule(HiveMergeProjectRule.INSTANCE); + } + + RelTraitSet desiredTraits = cluster.traitSetOf(HiveRel.CONVENTION, RelCollationImpl.EMPTY); + + RelNode rootRel = opTreeInOptiq; + if (!rootRel.getTraitSet().equals(desiredTraits)) { + rootRel = planner.changeTraits(opTreeInOptiq, desiredTraits); + } + planner.setRoot(rootRel); + + return planner.findBestExp(); + } + + public static boolean canHandleOpTree(@SuppressWarnings("rawtypes") Operator sinkOp, HiveConf conf, + QueryProperties qp) { + boolean runOptiq = false; + + if ((qp.getJoinCount() > 1) && (qp.getJoinCount() < HiveConf.getIntVar(conf, + HiveConf.ConfVars.HIVE_CBO_MAX_JOINS_SUPPORTED)) + && (qp.getOuterJoinCount() == 0) + && !qp.hasClusterBy() && !qp.hasDistributeBy() && !qp.hasSortBy() && !qp.hasWindowing()) { + @SuppressWarnings("rawtypes") + final HashSet start = new HashSet(); + + start.add(sinkOp); + // TODO: use queryproperties instead of walking the tree + if (!CostBasedOptimizer.operatorExists(start, true, m_unsupportedOpTypes)) { + runOptiq = true; + } + } + + return runOptiq; + } + + /* + * TODO: moved this out of OperatorUtils for now HIVE-6403 is going to bring + * in iterateParents: https://reviews.apache.org/r/18137/diff/#index_header + * Will just use/enhance that once it is in. hb 2/15 + */ + /** + * Check if operator tree, in the direction specified forward/backward, + * contains any operator specified in the targetOPTypes. 
+ * + * @param start + * list of operators to start checking from + * @param backward + * direction of DAG traversal; if true implies get parent ops for + * traversal otherwise children will be used + * @param targetOPTypes + * Set of operator types to look for + * + * @return true if any of the operator or its parent/children is of the name + * specified in the targetOPTypes + * + * NOTE: 1. This employs breadth first search 2. By using HashSet for + * "start" we avoid revisiting same operator twice. However it doesn't + * prevent revisiting the same node more than once for some complex + * dags. + */ + @SuppressWarnings("unchecked") + public static boolean operatorExists(@SuppressWarnings("rawtypes") final HashSet start, + final boolean backward, final Set targetOPTypes) { + @SuppressWarnings("rawtypes") + HashSet nextSetOfOperators = new HashSet(); + + for (@SuppressWarnings("rawtypes") + Operator op : start) { + if (targetOPTypes.contains(op.getType())) { + return true; + } + + if (backward) { + if (op.getParentOperators() != null) { + nextSetOfOperators.addAll(op.getParentOperators()); + } + } else { + if (op.getChildOperators() != null) { + nextSetOfOperators.addAll(op.getChildOperators()); + } + } + } + + if (!nextSetOfOperators.isEmpty()) { + return operatorExists(nextSetOfOperators, backward, targetOPTypes); + } + + return false; + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PreCBOOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PreCBOOptimizer.java new file mode 100644 index 0000000..e5b64a6 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PreCBOOptimizer.java @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.optimizer.lineage.Generator; +import org.apache.hadoop.hive.ql.optimizer.pcr.PartitionConditionRemover; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.ppd.PredicatePushDown; +import org.apache.hadoop.hive.ql.ppd.PredicateTransitivePropagate; + +/* + * do PredicatePushdown, PartitionPruning and Column Pruning before CBO + */ +public class PreCBOOptimizer { + private ParseContext pctx; + private List transformations; + + /** + * Create the list of transformations. 
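To tie the CostBasedOptimizer pieces above together, here is a hedged sketch of how a caller might gate and invoke it. The wrapper class, method, and variable names are assumptions; only the CostBasedOptimizer, QueryProperties, and HiveConf entry points come from this diff:

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.QueryProperties;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.optimizer.CostBasedOptimizer;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;

public class CboInvocationSketch {
  /**
   * Returns the Optiq-optimized AST, or null when CBO is disabled or the
   * operator tree is not eligible (outer joins, too many joins, etc.).
   */
  @SuppressWarnings("rawtypes")
  public static ASTNode maybeOptimize(Operator sinkOp, SemanticAnalyzer sa, ParseContext pCtx,
      QueryProperties qp, HiveConf conf) {
    if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED)
        || !CostBasedOptimizer.canHandleOpTree(sinkOp, conf, qp)) {
      return null; // fall back to the existing optimizer path
    }
    List<FieldSchema> resultSchema = new ArrayList<FieldSchema>();
    return CostBasedOptimizer.optimize(sinkOp, sa, pCtx, resultSchema);
  }
}
```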
+ * + * @param hiveConf + */ + public void initialize(HiveConf hiveConf) { + transformations = new ArrayList(); + // Add the transformation that computes the lineage information. + transformations.add(new Generator()); + transformations.add(new PredicateTransitivePropagate()); + transformations.add(new PredicatePushDown()); + transformations.add(new PartitionPruner()); + transformations.add(new PartitionConditionRemover()); + transformations.add(new ColumnPruner()); + transformations.add(new AnnotateWithStatistics()); + } + + /** + * Invoke all the transformations one-by-one, and alter the query plan. + * + * @return ParseContext + * @throws SemanticException + */ + public ParseContext optimize() throws SemanticException { + for (Transform t : transformations) { + pctx = t.transform(pctx); + } + return pctx; + } + + /** + * @return the pctx + */ + public ParseContext getPctx() { + return pctx; + } + + /** + * @param pctx + * the pctx to set + */ + public void setPctx(ParseContext pctx) { + this.pctx = pctx; + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java new file mode 100644 index 0000000..2c08772 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveDefaultRelMetadataProvider.java @@ -0,0 +1,27 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq; + +import com.google.common.collect.ImmutableList; + +import org.apache.hadoop.hive.ql.optimizer.optiq.stats.HiveRelMdDistinctRowCount; +import org.apache.hadoop.hive.ql.optimizer.optiq.stats.HiveRelMdSelectivity; +import org.eigenbase.rel.metadata.ChainedRelMetadataProvider; +import org.eigenbase.rel.metadata.DefaultRelMetadataProvider; +import org.eigenbase.rel.metadata.RelMetadataProvider; + +/** + * Distinct row count and Selectivity is overridden for Hive.
+ *
+ * Distinct Row Count is overridden for:
+ * 1) Join 2) TableScan.
+ * Selectivity is overridden for:
+ * 1) Join 2) TableScan & Filter. + */ +public class HiveDefaultRelMetadataProvider { + private HiveDefaultRelMetadataProvider() { + } + + public static final RelMetadataProvider INSTANCE = ChainedRelMetadataProvider.of(ImmutableList + .of(HiveRelMdDistinctRowCount.SOURCE, + HiveRelMdSelectivity.SOURCE, + new DefaultRelMetadataProvider())); +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveOptiqUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveOptiqUtil.java new file mode 100644 index 0000000..a062e6f --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/HiveOptiqUtil.java @@ -0,0 +1,51 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.List; + +import org.eigenbase.rex.RexInputRef; +import org.eigenbase.rex.RexNode; + +/** + * Generic utility functions needed for Optiq based Hive CBO. + */ + +public class HiveOptiqUtil { + + /** + * Get list of virtual columns from the given list of projections. + *
+ * + * @param exps + * list of rex nodes representing projections + * @return List of Virtual Columns, will not be null. + */ + public static List getVirtualCols(List exps) { + List vCols = new ArrayList(); + + for (int i = 0; i < exps.size(); i++) { + if (!(exps.get(i) instanceof RexInputRef)) { + vCols.add(i); + } + } + + return vCols; + } + + public static List translateBitSetToProjIndx(BitSet projBitSet) { + List projIndxLst = new ArrayList(); + + for (int i = 0; i < projBitSet.length(); i++) { + if (projBitSet.get(i)) { + projIndxLst.add(i); + } + } + + return projIndxLst; + } + + @Deprecated + public static void todo(String s) { + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/JoinUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/JoinUtil.java new file mode 100644 index 0000000..da77d36 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/JoinUtil.java @@ -0,0 +1,295 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel; +import org.eigenbase.relopt.RelOptUtil; +import org.eigenbase.relopt.RelOptUtil.InputReferencedVisitor; +import org.eigenbase.rex.RexNode; +import org.eigenbase.sql.SqlKind; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; + +/** + * Utility for inspecting Join Conditions.
+ *
+ * Main Elements:
+ * 1. JoinPredicateInfo - represents Join Condition.
+ * 2. JoinLeafPredicateInfo - represents leaf predicates within the join condition. + * + * TODO: Move this to Optiq Framework + */ +public class JoinUtil { + + /** + * JoinPredicateInfo represents the Join condition; JoinPredicateInfo uses + * JoinLeafPredicateInfo to represent individual conjunctive elements in the + * predicate.
+ * JoinPredicateInfo = JoinLeafPredicateInfo1 and JoinLeafPredicateInfo2...
+ *
+ * JoinPredicateInfo:
+ * 1. Preserves the order of conjunctive elements for + * equi-join (m_equiJoinPredicateElements)
+ * 2. Stores the set of projection indexes from the left and right child that are part + * of the equi-join keys; the indexes are given in both the child and the Join node schema.
+ * 3. Keeps a map of projection indexes that are part of join keys to list of + * conjuctive elements(JoinLeafPredicateInfo) that uses them. + * + */ + public static class JoinPredicateInfo { + private final ImmutableList m_nonEquiJoinPredicateElements; + private final ImmutableList m_equiJoinPredicateElements; + private final ImmutableSet m_projsFromLeftPartOfJoinKeysInChildSchema; + private final ImmutableSet m_projsFromRightPartOfJoinKeysInChildSchema; + private final ImmutableSet m_projsFromRightPartOfJoinKeysInJoinSchema; + private final ImmutableMap> m_mapOfProjIndxInJoinSchemaToLeafPInfo; + + public JoinPredicateInfo(List nonEquiJoinPredicateElements, + List equiJoinPredicateElements, + Set projsFromLeftPartOfJoinKeysInChildSchema, + Set projsFromRightPartOfJoinKeysInChildSchema, + Set projsFromRightPartOfJoinKeysInJoinSchema, + Map> mapOfProjIndxInJoinSchemaToLeafPInfo) { + m_nonEquiJoinPredicateElements = ImmutableList.copyOf(nonEquiJoinPredicateElements); + m_equiJoinPredicateElements = ImmutableList.copyOf(equiJoinPredicateElements); + m_projsFromLeftPartOfJoinKeysInChildSchema = ImmutableSet + .copyOf(projsFromLeftPartOfJoinKeysInChildSchema); + m_projsFromRightPartOfJoinKeysInChildSchema = ImmutableSet + .copyOf(projsFromRightPartOfJoinKeysInChildSchema); + m_projsFromRightPartOfJoinKeysInJoinSchema = ImmutableSet + .copyOf(projsFromRightPartOfJoinKeysInJoinSchema); + m_mapOfProjIndxInJoinSchemaToLeafPInfo = ImmutableMap + .copyOf(mapOfProjIndxInJoinSchemaToLeafPInfo); + } + + public List getNonEquiJoinPredicateElements() { + return m_nonEquiJoinPredicateElements; + } + + public List getEquiJoinPredicateElements() { + return m_equiJoinPredicateElements; + } + + public Set getProjsFromLeftPartOfJoinKeysInChildSchema() { + return m_projsFromLeftPartOfJoinKeysInChildSchema; + } + + public Set getProjsFromRightPartOfJoinKeysInChildSchema() { + return m_projsFromRightPartOfJoinKeysInChildSchema; + } + + /** + * NOTE: Join Schema = left Schema + (right Schema offset by + * left.fieldcount). Hence its ok to return projections from left in child + * schema. + */ + public Set getProjsFromLeftPartOfJoinKeysInJoinSchema() { + return m_projsFromLeftPartOfJoinKeysInChildSchema; + } + + public Set getProjsFromRightPartOfJoinKeysInJoinSchema() { + return m_projsFromRightPartOfJoinKeysInJoinSchema; + } + + public Map> getMapOfProjIndxToLeafPInfo() { + return m_mapOfProjIndxInJoinSchemaToLeafPInfo; + } + + public static JoinPredicateInfo constructJoinPredicateInfo(HiveJoinRel j) { + return constructJoinPredicateInfo(j, j.getCondition()); + } + + public static JoinPredicateInfo constructJoinPredicateInfo(HiveJoinRel j, RexNode predicate) { + JoinPredicateInfo jpi = null; + JoinLeafPredicateInfo jlpi = null; + List equiLPIList = new ArrayList(); + List nonEquiLPIList = new ArrayList(); + Set projsFromLeftPartOfJoinKeys = new HashSet(); + Set projsFromRightPartOfJoinKeys = new HashSet(); + Set projsFromRightPartOfJoinKeysInJoinSchema = new HashSet(); + Map> tmpMapOfProjIndxInJoinSchemaToLeafPInfo = new HashMap>(); + Map> mapOfProjIndxInJoinSchemaToLeafPInfo = new HashMap>(); + List tmpJLPILst = null; + int rightOffSet = j.getLeft().getRowType().getFieldCount(); + int projIndxInJoin; + List conjuctiveElements; + + todo("Move this to Optiq"); + + // 1. Decompose Join condition to a number of leaf predicates + // (conjuctive elements) + conjuctiveElements = RelOptUtil.conjunctions(predicate); + + // 2. 
Walk through leaf predicates building up JoinLeafPredicateInfo + for (RexNode ce : conjuctiveElements) { + // 2.1 Construct JoinLeafPredicateInfo + jlpi = JoinLeafPredicateInfo.constructJoinLeafPredicateInfo(j, ce); + + // 2.2 Classify leaf predicate as Equi vs Non Equi + if (jlpi.m_comparisonType.equals(SqlKind.EQUALS)) { + equiLPIList.add(jlpi); + } else { + nonEquiLPIList.add(jlpi); + } + + // 2.3 Maintain join keys coming from left vs right (in child & + // Join Schema) + projsFromLeftPartOfJoinKeys.addAll(jlpi.getProjsFromLeftPartOfJoinKeysInChildSchema()); + projsFromRightPartOfJoinKeys.addAll(jlpi.getProjsFromRightPartOfJoinKeysInChildSchema()); + projsFromRightPartOfJoinKeysInJoinSchema.addAll(jlpi + .getProjsFromRightPartOfJoinKeysInJoinSchema()); + + // 2.4 Update Join Key to JoinLeafPredicateInfo map with keys + // from left + for (Integer projIndx : jlpi.getProjsFromLeftPartOfJoinKeysInChildSchema()) { + tmpJLPILst = tmpMapOfProjIndxInJoinSchemaToLeafPInfo.get(projIndx); + if (tmpJLPILst == null) + tmpJLPILst = new ArrayList(); + tmpJLPILst.add(jlpi); + tmpMapOfProjIndxInJoinSchemaToLeafPInfo.put(projIndx, tmpJLPILst); + } + + // 2.5 Update Join Key to JoinLeafPredicateInfo map with keys + // from right + for (Integer projIndx : jlpi.getProjsFromRightPartOfJoinKeysInChildSchema()) { + projIndxInJoin = projIndx + rightOffSet; + tmpJLPILst = tmpMapOfProjIndxInJoinSchemaToLeafPInfo.get(projIndxInJoin); + if (tmpJLPILst == null) + tmpJLPILst = new ArrayList(); + tmpJLPILst.add(jlpi); + tmpMapOfProjIndxInJoinSchemaToLeafPInfo.put(projIndxInJoin, tmpJLPILst); + } + + } + + // 3. Update Update Join Key to List to use + // ImmutableList + for (Entry> e : tmpMapOfProjIndxInJoinSchemaToLeafPInfo + .entrySet()) { + mapOfProjIndxInJoinSchemaToLeafPInfo.put(e.getKey(), ImmutableList.copyOf(e.getValue())); + } + + // 4. Construct JoinPredicateInfo + jpi = new JoinPredicateInfo(nonEquiLPIList, equiLPIList, projsFromLeftPartOfJoinKeys, + projsFromRightPartOfJoinKeys, projsFromRightPartOfJoinKeysInJoinSchema, + mapOfProjIndxInJoinSchemaToLeafPInfo); + return jpi; + } + } + + /** + * JoinLeafPredicateInfo represents leaf predicate in Join condition + * (conjuctive lement).
+ *
+ * JoinLeafPredicateInfo:
+ * 1. Stores the lists of expressions from the left and right child that are part of + * the equi-join keys.
+ * 2. Stores the set of projection indexes from the left and right child that are part + * of the equi-join keys; the indexes are given in both the child and the Join node schema.
+ */ + public static class JoinLeafPredicateInfo { + private final SqlKind m_comparisonType; + private final ImmutableList m_joinKeyExprsFromLeft; + private final ImmutableList m_joinKeyExprsFromRight; + private final ImmutableSet m_projsFromLeftPartOfJoinKeysInChildSchema; + private final ImmutableSet m_projsFromRightPartOfJoinKeysInChildSchema; + private final ImmutableSet m_projsFromRightPartOfJoinKeysInJoinSchema; + + public JoinLeafPredicateInfo(SqlKind comparisonType, List joinKeyExprsFromLeft, + List joinKeyExprsFromRight, Set projsFromLeftPartOfJoinKeysInChildSchema, + Set projsFromRightPartOfJoinKeysInChildSchema, + Set projsFromRightPartOfJoinKeysInJoinSchema) { + m_comparisonType = comparisonType; + m_joinKeyExprsFromLeft = ImmutableList.copyOf(joinKeyExprsFromLeft); + m_joinKeyExprsFromRight = ImmutableList.copyOf(joinKeyExprsFromRight); + m_projsFromLeftPartOfJoinKeysInChildSchema = ImmutableSet + .copyOf(projsFromLeftPartOfJoinKeysInChildSchema); + m_projsFromRightPartOfJoinKeysInChildSchema = ImmutableSet + .copyOf(projsFromRightPartOfJoinKeysInChildSchema); + m_projsFromRightPartOfJoinKeysInJoinSchema = ImmutableSet + .copyOf(projsFromRightPartOfJoinKeysInJoinSchema); + } + + public List getJoinKeyExprsFromLeft() { + return m_joinKeyExprsFromLeft; + } + + public List getJoinKeyExprsFromRight() { + return m_joinKeyExprsFromRight; + } + + public Set getProjsFromLeftPartOfJoinKeysInChildSchema() { + return m_projsFromLeftPartOfJoinKeysInChildSchema; + } + + /** + * NOTE: Join Schema = left Schema + (right Schema offset by + * left.fieldcount). Hence its ok to return projections from left in child + * schema. + */ + public Set getProjsFromLeftPartOfJoinKeysInJoinSchema() { + return m_projsFromLeftPartOfJoinKeysInChildSchema; + } + + public Set getProjsFromRightPartOfJoinKeysInChildSchema() { + return m_projsFromRightPartOfJoinKeysInChildSchema; + } + + public Set getProjsFromRightPartOfJoinKeysInJoinSchema() { + return m_projsFromRightPartOfJoinKeysInJoinSchema; + } + + public static JoinLeafPredicateInfo constructJoinLeafPredicateInfo(HiveJoinRel j, RexNode pe) { + JoinLeafPredicateInfo jlpi = null; + List filterNulls = new ArrayList(); + List joinKeyExprsFromLeft = new ArrayList(); + List joinKeyExprsFromRight = new ArrayList(); + Set projsFromLeftPartOfJoinKeysInChildSchema = new HashSet(); + Set projsFromRightPartOfJoinKeysInChildSchema = new HashSet(); + Set projsFromRightPartOfJoinKeysInJoinSchema = new HashSet(); + int rightOffSet = j.getLeft().getRowType().getFieldCount(); + + todo("Move this to Optiq"); + + // 1. Split leaf join predicate to expressions from left, right + @SuppressWarnings("unused") + RexNode nonEquiPredicate = RelOptUtil.splitJoinCondition(j.getSystemFieldList(), j.getLeft(), + j.getRight(), pe, joinKeyExprsFromLeft, joinKeyExprsFromRight, filterNulls, null); + + // 2. For left expressions, collect child projection indexes used + InputReferencedVisitor irvLeft = new InputReferencedVisitor(); + irvLeft.apply(joinKeyExprsFromLeft); + projsFromLeftPartOfJoinKeysInChildSchema.addAll(irvLeft.inputPosReferenced); + + // 3. For right expressions, collect child projection indexes used + InputReferencedVisitor irvRight = new InputReferencedVisitor(); + irvRight.apply(joinKeyExprsFromRight); + projsFromRightPartOfJoinKeysInChildSchema.addAll(irvRight.inputPosReferenced); + + // 3. Translate projection indexes from right to join schema, by adding + // offset. 
+ for (Integer indx : projsFromRightPartOfJoinKeysInChildSchema) { + projsFromRightPartOfJoinKeysInJoinSchema.add(indx + rightOffSet); + } + + // 4. Construct JoinLeafPredicateInfo + jlpi = new JoinLeafPredicateInfo(pe.getKind(), joinKeyExprsFromLeft, joinKeyExprsFromRight, + projsFromLeftPartOfJoinKeysInChildSchema, projsFromRightPartOfJoinKeysInChildSchema, + projsFromRightPartOfJoinKeysInJoinSchema); + + return jlpi; + } + } + + @Deprecated + public static void todo(String s) { + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/Pair.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/Pair.java new file mode 100644 index 0000000..c923340 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/Pair.java @@ -0,0 +1,19 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq; + +public class Pair { + private final T1 m_first; + private final T2 m_second; + + public Pair(T1 first, T2 second) { + m_first = first; + m_second = second; + } + + public T1 getFirst() { + return m_first; + } + + public T2 getSecond() { + return m_second; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java new file mode 100644 index 0000000..fe42f25 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/RelOptHiveTable.java @@ -0,0 +1,67 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq; + +import java.util.BitSet; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.TableAccessRel; +import org.eigenbase.relopt.RelOptAbstractTable; +import org.eigenbase.relopt.RelOptSchema; +import org.eigenbase.reltype.RelDataType; + +//Fix Me: use table meta data and stats util to get stats +public class RelOptHiveTable extends RelOptAbstractTable { + private final Table m_hiveTblMetadata; + private double m_rowCount = -1; + + final Map m_columnIdxToSizeMap = new HashMap(); + + Map m_bucketingColMap; + Map m_bucketingSortColMap; + + Statistics m_hiveStats; + + // NOTE: name here is the table alias which may or may not be the real name in + // metadata. Use + // m_hiveTblMetadata.getTableName() for table name and + // m_hiveTblMetadata.getDbName() for db name. + public RelOptHiveTable(RelOptSchema schema, String name, RelDataType rowType, + Table hiveTblMetadata, Statistics stats) { + super(schema, name, rowType); + m_hiveTblMetadata = hiveTblMetadata; + m_hiveStats = stats; + + m_rowCount = stats.getNumRows(); + } + + @Override + public boolean isKey(BitSet arg0) { + return false; + } + + @Override + public RelNode toRel(ToRelContext context) { + return new TableAccessRel(context.getCluster(), this); + } + + @Override + public T unwrap(Class arg0) { + return arg0.isInstance(this) ? 
arg0.cast(this) : null; + } + + @Override + public double getRowCount() { + return m_rowCount; + } + + public Table getHiveTableMD() { + return m_hiveTblMetadata; + } + + public Statistics getHiveStats() { + return m_hiveStats; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/TraitsUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/TraitsUtil.java new file mode 100644 index 0000000..6bde1d1 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/TraitsUtil.java @@ -0,0 +1,50 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq; + +import java.util.List; + +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveRel; +import org.eigenbase.rel.AggregateCall; +import org.eigenbase.rel.RelCollation; +import org.eigenbase.rel.RelCollationImpl; +import org.eigenbase.rel.RelNode; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.reltype.RelDataType; +import org.eigenbase.rex.RexNode; + +public class TraitsUtil { + + public static RelTraitSet getSelectTraitSet(RelOptCluster cluster, List exps, + RelNode child) { + return cluster.traitSetOf(HiveRel.CONVENTION, RelCollationImpl.EMPTY); + } + + public static RelTraitSet getSortTraitSet(RelOptCluster cluster, RelTraitSet traitSet, + RelCollation collation) { + return traitSet.plus(collation); + } + + public static RelTraitSet getFilterTraitSet(RelOptCluster cluster, RelTraitSet traitSet, + RelNode child) { + return cluster.traitSetOf(HiveRel.CONVENTION, RelCollationImpl.EMPTY); + } + + public static RelTraitSet getLimitTraitSet(RelOptCluster cluster, RelTraitSet traitSet, + RelNode child) { + return cluster.traitSetOf(HiveRel.CONVENTION, RelCollationImpl.EMPTY); + } + + public static RelTraitSet getAggregateTraitSet(RelOptCluster cluster, RelTraitSet traitSet, + List gbCols, List aggCalls, RelNode child) { + return cluster.traitSetOf(HiveRel.CONVENTION, RelCollationImpl.EMPTY); + } + + public static RelTraitSet getTableScanTraitSet(RelOptCluster cluster, RelTraitSet traitSet, + RelOptHiveTable table, RelDataType rowtype) { + return cluster.traitSetOf(HiveRel.CONVENTION, RelCollationImpl.EMPTY); + } + + public static RelTraitSet getJoinTraitSet(RelOptCluster cluster, RelTraitSet traitSet) { + return cluster.traitSetOf(HiveRel.CONVENTION, RelCollationImpl.EMPTY); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveCost.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveCost.java new file mode 100644 index 0000000..f9f3d0c --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveCost.java @@ -0,0 +1,194 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.cost; + +import org.eigenbase.relopt.RelOptCost; +import org.eigenbase.relopt.RelOptCostFactory; +import org.eigenbase.relopt.RelOptUtil; + +// TODO: This should inherit from VolcanoCost and should just override isLE method. 
+public class HiveCost implements RelOptCost { + // ~ Static fields/initializers --------------------------------------------- + + static final HiveCost INFINITY = new HiveCost(Double.POSITIVE_INFINITY, + Double.POSITIVE_INFINITY, + Double.POSITIVE_INFINITY) { + @Override + public String toString() { + return "{inf}"; + } + }; + + static final HiveCost HUGE = new HiveCost(Double.MAX_VALUE, Double.MAX_VALUE, + Double.MAX_VALUE) { + @Override + public String toString() { + return "{huge}"; + } + }; + + static final HiveCost ZERO = new HiveCost(0.0, 0.0, 0.0) { + @Override + public String toString() { + return "{0}"; + } + }; + + static final HiveCost TINY = new HiveCost(1.0, 1.0, 0.0) { + @Override + public String toString() { + return "{tiny}"; + } + }; + + public static final RelOptCostFactory FACTORY = new Factory(); + + // ~ Instance fields -------------------------------------------------------- + + final double cpu; + final double io; + final double rowCount; + + // ~ Constructors ----------------------------------------------------------- + + HiveCost(double rowCount, double cpu, double io) { + assert rowCount >= 0d; + assert cpu >= 0d; + assert io >= 0d; + this.rowCount = rowCount; + this.cpu = cpu; + this.io = io; + } + + // ~ Methods ---------------------------------------------------------------- + + public double getCpu() { + return cpu; + } + + public boolean isInfinite() { + return (this == INFINITY) || (this.rowCount == Double.POSITIVE_INFINITY) + || (this.cpu == Double.POSITIVE_INFINITY) || (this.io == Double.POSITIVE_INFINITY); + } + + public double getIo() { + return io; + } + + // TODO: If two cost is equal, could we do any better than comparing + // cardinality (may be some other heuristics to break the tie) + public boolean isLe(RelOptCost other) { + return this == other || this.rowCount <= other.getRows(); + /* + * if (((this.dCpu + this.dIo) < (other.getCpu() + other.getIo())) || + * ((this.dCpu + this.dIo) == (other.getCpu() + other.getIo()) && this.dRows + * <= other.getRows())) { return true; } else { return false; } + */ + } + + public boolean isLt(RelOptCost other) { + return this.rowCount < other.getRows(); + /* + * return isLe(other) && !equals(other); + */ + } + + public double getRows() { + return rowCount; + } + + public boolean equals(RelOptCost other) { + return (this == other) || ((this.rowCount) == (other.getRows())); + + /* + * //TODO: should we consider cardinality as well? return (this == other) || + * ((this.dCpu + this.dIo) == (other.getCpu() + other.getIo())); + */ + } + + public boolean isEqWithEpsilon(RelOptCost other) { + return (this == other) || (Math.abs((this.rowCount) - (other.getRows())) < RelOptUtil.EPSILON); + /* + * return (this == other) || (Math.abs((this.dCpu + this.dIo) - + * (other.getCpu() + other.getIo())) < RelOptUtil.EPSILON); + */ + } + + public RelOptCost minus(RelOptCost other) { + if (this == INFINITY) { + return this; + } + + return new HiveCost(this.rowCount - other.getRows(), this.cpu - other.getCpu(), this.io + - other.getIo()); + } + + public RelOptCost multiplyBy(double factor) { + if (this == INFINITY) { + return this; + } + return new HiveCost(rowCount * factor, cpu * factor, io * factor); + } + + public double divideBy(RelOptCost cost) { + // Compute the geometric average of the ratios of all of the factors + // which are non-zero and finite. 
+ double d = 1; + double n = 0; + if ((this.rowCount != 0) && !Double.isInfinite(this.rowCount) && (cost.getRows() != 0) + && !Double.isInfinite(cost.getRows())) { + d *= this.rowCount / cost.getRows(); + ++n; + } + if ((this.cpu != 0) && !Double.isInfinite(this.cpu) && (cost.getCpu() != 0) + && !Double.isInfinite(cost.getCpu())) { + d *= this.cpu / cost.getCpu(); + ++n; + } + if ((this.io != 0) && !Double.isInfinite(this.io) && (cost.getIo() != 0) + && !Double.isInfinite(cost.getIo())) { + d *= this.io / cost.getIo(); + ++n; + } + if (n == 0) { + return 1.0; + } + return Math.pow(d, 1 / n); + } + + public RelOptCost plus(RelOptCost other) { + if ((this == INFINITY) || (other.isInfinite())) { + return INFINITY; + } + return new HiveCost(this.rowCount + other.getRows(), this.cpu + other.getCpu(), this.io + + other.getIo()); + } + + @Override + public String toString() { + return "{" + rowCount + " rows, " + cpu + " cpu, " + io + " io}"; + } + + private static class Factory implements RelOptCostFactory { + private Factory() { + } + + public RelOptCost makeCost(double rowCount, double cpu, double io) { + return new HiveCost(rowCount, cpu, io); + } + + public RelOptCost makeHugeCost() { + return HUGE; + } + + public HiveCost makeInfiniteCost() { + return INFINITY; + } + + public HiveCost makeTinyCost() { + return TINY; + } + + public HiveCost makeZeroCost() { + return ZERO; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveCostUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveCostUtil.java new file mode 100644 index 0000000..926bca5 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveCostUtil.java @@ -0,0 +1,23 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.cost; + +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveTableScanRel; +import org.eigenbase.relopt.RelOptCost; + +public class HiveCostUtil { + private static final double cpuCostInNanoSec = 1.0; + private static final double netCostInNanoSec = 150 * cpuCostInNanoSec; + private static final double localFSWriteCostInNanoSec = 4 * netCostInNanoSec; + private static final double localFSReadCostInNanoSec = 4 * netCostInNanoSec; + private static final double hDFSWriteCostInNanoSec = 10 * localFSWriteCostInNanoSec; + private static final double hDFSReadCostInNanoSec = 1.5 * localFSReadCostInNanoSec; + + public static RelOptCost computCardinalityBasedCost(HiveRel hr) { + return new HiveCost(hr.getRows(), 0, 0); + } + + public static HiveCost computeCost(HiveTableScanRel t) { + double cardinality = t.getRows(); + return new HiveCost(cardinality, 0, hDFSWriteCostInNanoSec * cardinality * 0); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveVolcanoPlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveVolcanoPlanner.java new file mode 100644 index 0000000..74fe6e8 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/cost/HiveVolcanoPlanner.java @@ -0,0 +1,32 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.cost; + +import org.eigenbase.rel.RelCollationTraitDef; +import org.eigenbase.relopt.ConventionTraitDef; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.volcano.VolcanoPlanner; + +/** + * Refinement of {@link org.eigenbase.relopt.volcano.VolcanoPlanner} for Hive. + * + *
+ * It uses {@link org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveCost} as + * its cost model. + */ +public class HiveVolcanoPlanner extends VolcanoPlanner { + private static final boolean ENABLE_COLLATION_TRAIT = true; + + /** Creates a HiveVolcanoPlanner. */ + public HiveVolcanoPlanner() { + super(HiveCost.FACTORY); + } + + public static RelOptPlanner createPlanner() { + final VolcanoPlanner planner = new HiveVolcanoPlanner(); + planner.addRelTraitDef(ConventionTraitDef.INSTANCE); + if (ENABLE_COLLATION_TRAIT) { + planner.addRelTraitDef(RelCollationTraitDef.INSTANCE); + planner.registerAbstractRelationalRules(); + } + return planner; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveAggregateRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveAggregateRel.java new file mode 100644 index 0000000..bd346eb --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveAggregateRel.java @@ -0,0 +1,58 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import java.util.BitSet; +import java.util.List; + +import net.hydromatic.optiq.util.BitSets; + +import org.apache.hadoop.hive.ql.optimizer.optiq.TraitsUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveCost; +import org.eigenbase.rel.AggregateCall; +import org.eigenbase.rel.AggregateRelBase; +import org.eigenbase.rel.InvalidRelException; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.metadata.RelMetadataQuery; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptCost; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.RelTraitSet; + +public class HiveAggregateRel extends AggregateRelBase implements HiveRel { + + public HiveAggregateRel(RelOptCluster cluster, RelTraitSet traitSet, RelNode child, + BitSet groupSet, List aggCalls) throws InvalidRelException { + super(cluster, TraitsUtil.getAggregateTraitSet(cluster, traitSet, BitSets.toList(groupSet), + aggCalls, child), child, groupSet, aggCalls); + + for (AggregateCall aggCall : aggCalls) { + if (aggCall.isDistinct()) { + throw new InvalidRelException("distinct aggregation not supported"); + } + } + } + + public AggregateRelBase copy(RelTraitSet traitSet, RelNode input, BitSet groupSet, + List aggCalls) { + try { + return new HiveAggregateRel(getCluster(), traitSet, input, groupSet, aggCalls); + } catch (InvalidRelException e) { + // Semantic error not possible. Must be a bug. Convert to + // internal error. 
+ throw new AssertionError(e); + } + } + + public void implement(Implementor implementor) { + } + + @Override + public RelOptCost computeSelfCost(RelOptPlanner planner) { + return HiveCost.FACTORY.makeZeroCost(); + } + + @Override + public double getRows() { + return RelMetadataQuery.getDistinctRowCount(this, groupSet, getCluster().getRexBuilder() + .makeLiteral(true)); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveFilterRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveFilterRel.java new file mode 100644 index 0000000..f8c085c --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveFilterRel.java @@ -0,0 +1,35 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import java.util.List; + +import org.apache.hadoop.hive.ql.optimizer.optiq.TraitsUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveCost; +import org.eigenbase.rel.FilterRelBase; +import org.eigenbase.rel.RelNode; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptCost; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.rex.RexNode; + +public class HiveFilterRel extends FilterRelBase implements HiveRel { + + public HiveFilterRel(RelOptCluster cluster, RelTraitSet traits, RelNode child, RexNode condition) { + super(cluster, TraitsUtil.getFilterTraitSet(cluster, traits, child), child, condition); + } + + @Override + public RelNode copy(RelTraitSet traitSet, List inputs) { + assert traitSet.containsIfApplicable(HiveRel.CONVENTION); + return new HiveFilterRel(getCluster(), traitSet, sole(inputs), getCondition()); + } + + @Override + public void implement(Implementor implementor) { + } + + @Override + public RelOptCost computeSelfCost(RelOptPlanner planner) { + return HiveCost.FACTORY.makeZeroCost(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveJoinRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveJoinRel.java new file mode 100644 index 0000000..81319b8 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveJoinRel.java @@ -0,0 +1,100 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.hive.ql.optimizer.optiq.TraitsUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveCostUtil; +import org.eigenbase.rel.InvalidRelException; +import org.eigenbase.rel.JoinRelBase; +import org.eigenbase.rel.JoinRelType; +import org.eigenbase.rel.RelNode; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptCost; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.RelOptUtil; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.rex.RexNode; + +//TODO: Should we convert MultiJoin to be a child of HiveJoinRelBase +public class HiveJoinRel extends JoinRelBase implements HiveRel { + // NOTE: COMMON_JOIN & SMB_JOIN are Sort Merge Join (in case of COMMON_JOIN + // each parallel computation handles multiple splits where as in case of SMB + // each parallel computation handles one bucket). 
MAP_JOIN and BUCKET_JOIN is + // hash joins where MAP_JOIN keeps the whole data set of non streaming tables + // in memory where as BUCKET_JOIN keeps only the b + public enum JoinAlgorithm { + NONE, COMMON_JOIN, MAP_JOIN, BUCKET_JOIN, SMB_JOIN + } + + public enum MapJoinStreamingRelation { + NONE, LEFT_RELATION, RIGHT_RELATION + } + + private final JoinAlgorithm m_joinAlgorithm; + private MapJoinStreamingRelation m_mapJoinStreamingSide = MapJoinStreamingRelation.NONE; + + public static HiveJoinRel getJoin(RelOptCluster cluster, RelNode left, RelNode right, + RexNode condition, JoinRelType joinType) { + try { + Set variablesStopped = Collections.emptySet(); + return new HiveJoinRel(cluster, null, left, right, condition, joinType, variablesStopped, + JoinAlgorithm.NONE, null); + } catch (InvalidRelException e) { + throw new RuntimeException(e); + } + } + + protected HiveJoinRel(RelOptCluster cluster, RelTraitSet traits, RelNode left, RelNode right, + RexNode condition, JoinRelType joinType, Set variablesStopped, + JoinAlgorithm joinAlgo, MapJoinStreamingRelation streamingSideForMapJoin) + throws InvalidRelException { + super(cluster, TraitsUtil.getJoinTraitSet(cluster, traits), left, right, condition, joinType, + variablesStopped); + + final List leftKeys = new ArrayList(); + final List rightKeys = new ArrayList(); + List filterNulls = new LinkedList(); + RexNode remaining = RelOptUtil.splitJoinCondition(getSystemFieldList(), left, right, condition, + leftKeys, rightKeys, filterNulls, null); + + if (!remaining.isAlwaysTrue()) { + throw new InvalidRelException("EnumerableJoinRel only supports equi-join"); + } + this.m_joinAlgorithm = joinAlgo; + } + + @Override + public void implement(Implementor implementor) { + } + + @Override + public final HiveJoinRel copy(RelTraitSet traitSet, RexNode conditionExpr, RelNode left, + RelNode right, JoinRelType joinType) { + return copy(traitSet, conditionExpr, left, right, m_joinAlgorithm, m_mapJoinStreamingSide); + } + + public HiveJoinRel copy(RelTraitSet traitSet, RexNode conditionExpr, RelNode left, RelNode right, + JoinAlgorithm joinalgo, MapJoinStreamingRelation streamingSide) { + try { + return new HiveJoinRel(getCluster(), traitSet, left, right, conditionExpr, joinType, + variablesStopped, joinalgo, streamingSide); + } catch (InvalidRelException e) { + // Semantic error not possible. Must be a bug. Convert to + // internal error. 
+ throw new AssertionError(e); + } + } + + public JoinAlgorithm getJoinAlgorithm() { + return m_joinAlgorithm; + } + + @Override + public RelOptCost computeSelfCost(RelOptPlanner planner) { + return HiveCostUtil.computCardinalityBasedCost(this); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveLimitRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveLimitRel.java new file mode 100644 index 0000000..bf37a7b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveLimitRel.java @@ -0,0 +1,40 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import java.util.List; + +import org.apache.hadoop.hive.ql.optimizer.optiq.TraitsUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveCost; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.SingleRel; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptCost; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.rex.RexNode; + +public class HiveLimitRel extends SingleRel implements HiveRel { + private final RexNode offset; + private final RexNode fetch; + + HiveLimitRel(RelOptCluster cluster, RelTraitSet traitSet, RelNode child, RexNode offset, + RexNode fetch) { + super(cluster, TraitsUtil.getLimitTraitSet(cluster, traitSet, child), child); + this.offset = offset; + this.fetch = fetch; + assert getConvention() instanceof HiveRel; + assert getConvention() == child.getConvention(); + } + + @Override + public HiveLimitRel copy(RelTraitSet traitSet, List newInputs) { + return new HiveLimitRel(getCluster(), traitSet, sole(newInputs), offset, fetch); + } + + public void implement(Implementor implementor) { + } + + @Override + public RelOptCost computeSelfCost(RelOptPlanner planner) { + return HiveCost.FACTORY.makeZeroCost(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveProjectRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveProjectRel.java new file mode 100644 index 0000000..03f58d0 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveProjectRel.java @@ -0,0 +1,108 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import java.util.Collections; +import java.util.List; + +import com.google.common.collect.ImmutableList; + +import org.apache.hadoop.hive.ql.optimizer.optiq.HiveOptiqUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.TraitsUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveCost; +import org.eigenbase.rel.ProjectRelBase; +import org.eigenbase.rel.RelCollation; +import org.eigenbase.rel.RelFactories.ProjectFactory; +import org.eigenbase.rel.RelNode; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptCost; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.RelOptRule; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.reltype.RelDataType; +import org.eigenbase.rex.RexNode; +import org.eigenbase.rex.RexUtil; + +public class HiveProjectRel extends ProjectRelBase implements HiveRel { + + public static final ProjectFactory DEFAULT_PROJECT_FACTORY = new HiveProjectFactoryImpl(); + + private final List m_virtualCols; + + /** + * Creates a HiveProjectRel. 
+ * + * @param cluster + * Cluster this relational expression belongs to + * @param child + * input relational expression + * @param exps + * List of expressions for the input columns + * @param rowType + * output row type + * @param flags + * values as in {@link ProjectRelBase.Flags} + */ + public HiveProjectRel(RelOptCluster cluster, RelTraitSet traitSet, RelNode child, + List exps, RelDataType rowType, int flags) { + super(cluster, traitSet, child, exps, rowType, flags); + m_virtualCols = ImmutableList.copyOf(HiveOptiqUtil.getVirtualCols(exps)); + } + + /** + * Creates a HiveProjectRel with no sort keys. + * + * @param child + * input relational expression + * @param exps + * set of expressions for the input columns + * @param fieldNames + * aliases of the expressions + */ + public static HiveProjectRel create(RelNode child, List exps, List fieldNames) { + RelOptCluster cluster = child.getCluster(); + RelDataType rowType = RexUtil.createStructType(cluster.getTypeFactory(), exps, fieldNames); + return create(cluster, child, exps, rowType, Collections. emptyList()); + } + + /** + * Creates a HiveProjectRel. + */ + public static HiveProjectRel create(RelOptCluster cluster, RelNode child, List exps, + RelDataType rowType, final List collationList) { + RelTraitSet traitSet = TraitsUtil.getSelectTraitSet(cluster, exps, child); + return new HiveProjectRel(cluster, traitSet, child, exps, rowType, Flags.BOXED); + } + + public ProjectRelBase copy(RelTraitSet traitSet, RelNode input, List exps, + RelDataType rowType) { + assert traitSet.containsIfApplicable(HiveRel.CONVENTION); + return new HiveProjectRel(getCluster(), traitSet, input, exps, rowType, getFlags()); + } + + @Override + public RelOptCost computeSelfCost(RelOptPlanner planner) { + return HiveCost.FACTORY.makeZeroCost(); + } + + public void implement(Implementor implementor) { + } + + public List getVirtualCols() { + return m_virtualCols; + } + + /** + * Implementation of {@link ProjectFactory} that returns + * {@link org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveProjectRel} + * . + */ + private static class HiveProjectFactoryImpl implements ProjectFactory { + @Override + public RelNode createProject(RelNode input, List exps, List fieldNames) { + RelNode project = HiveProjectRel.create(input, exps, fieldNames); + + // Make sure extra traits are carried over from the original rel + project = RelOptRule.convert(project, input.getTraitSet()); + return project; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveRel.java new file mode 100644 index 0000000..6f3f1d8 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveRel.java @@ -0,0 +1,19 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import org.eigenbase.rel.RelNode; +import org.eigenbase.relopt.Convention; + +public interface HiveRel extends RelNode { + void implement(Implementor implementor); + + /** Calling convention for relational operations that occur in Hive. 
*/ + final Convention CONVENTION = new Convention.Impl("HIVE", HiveRel.class); + + class Implementor { + + public void visitChild(int ordinal, RelNode input) { + assert ordinal == 0; + ((HiveRel) input).implement(this); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveSortRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveSortRel.java new file mode 100644 index 0000000..41b1be7 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveSortRel.java @@ -0,0 +1,37 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import org.apache.hadoop.hive.ql.optimizer.optiq.TraitsUtil; +import org.eigenbase.rel.RelCollation; +import org.eigenbase.rel.RelCollationImpl; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.SortRel; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.rex.RexNode; + +public class HiveSortRel extends SortRel implements HiveRel { + + public HiveSortRel(RelOptCluster cluster, RelTraitSet traitSet, RelNode child, + RelCollation collation, RexNode offset, RexNode fetch) { + super(cluster, TraitsUtil.getSortTraitSet(cluster, traitSet, collation), child, collation, + offset, fetch); + + assert getConvention() == child.getConvention(); + } + + @Override + public HiveSortRel copy(RelTraitSet traitSet, RelNode newInput, RelCollation newCollation, + RexNode offset, RexNode fetch) { + // TODO: can we blindly copy sort trait? What if inputs changed and we + // are now sorting by different cols + RelCollation canonizedCollation = traitSet.canonize(newCollation); + return new HiveSortRel(getCluster(), traitSet, newInput, canonizedCollation, offset, fetch); + } + + public RexNode getFetchExpr() { + return fetch; + } + + public void implement(Implementor implementor) { + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveTableScanRel.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveTableScanRel.java new file mode 100644 index 0000000..0bc8857 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/reloperators/HiveTableScanRel.java @@ -0,0 +1,94 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.reloperators; + +import java.util.LinkedList; +import java.util.List; + +import org.apache.hadoop.hive.ql.optimizer.optiq.RelOptHiveTable; +import org.apache.hadoop.hive.ql.optimizer.optiq.TraitsUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.cost.HiveCost; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.TableAccessRelBase; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptCost; +import org.eigenbase.relopt.RelOptPlanner; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.reltype.RelDataType; + +import com.google.common.collect.ImmutableList; + +/** + * Relational expression representing a scan of a HiveDB collection. + * + *

+ * Additional operations might be applied, using the "find" or "aggregate" + * methods. + *

+ */ +public class HiveTableScanRel extends TableAccessRelBase implements HiveRel { + private final ImmutableList m_hiveColStat; + + /** + * Creates a HiveTableScan. + * + * @param cluster + * Cluster + * @param traitSet + * Traits + * @param table + * Table + * @param table + * HiveDB table + */ + public HiveTableScanRel(RelOptCluster cluster, RelTraitSet traitSet, RelOptHiveTable table, + RelDataType rowtype) { + super(cluster, TraitsUtil.getTableScanTraitSet(cluster, traitSet, table, rowtype), table); + assert getConvention() == HiveRel.CONVENTION; + + ImmutableList.Builder b = new ImmutableList.Builder(); + for (String fN : rowtype.getFieldNames()) { + ColStatistics cStat = table.getHiveStats().getColumnStatisticsForColumn( + table.getName(), fN); + b.add(cStat); + } + m_hiveColStat = b.build(); + } + + @Override + public RelNode copy(RelTraitSet traitSet, List inputs) { + assert inputs.isEmpty(); + return this; + } + + @Override + public RelOptCost computeSelfCost(RelOptPlanner planner) { + return HiveCost.FACTORY.makeZeroCost(); + } + + @Override + public void register(RelOptPlanner planner) { + + } + + public void implement(Implementor implementor) { + + } + + @Override + public double getRows() { + return ((RelOptHiveTable) table).getRowCount(); + } + + public List getColStat(List projIndxLst) { + if (projIndxLst != null) { + List hiveColStatLst = new LinkedList(); + for (Integer i : projIndxLst) { + hiveColStatLst.add(m_hiveColStat.get(i)); + } + return hiveColStatLst; + } else { + return m_hiveColStat; + } + } + +} \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HiveMergeProjectRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HiveMergeProjectRule.java new file mode 100644 index 0000000..a34b532 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HiveMergeProjectRule.java @@ -0,0 +1,12 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.rules; + +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveProjectRel; +import org.eigenbase.rel.rules.MergeProjectRule; + +public class HiveMergeProjectRule extends MergeProjectRule { + public static final HiveMergeProjectRule INSTANCE = new HiveMergeProjectRule(); + + public HiveMergeProjectRule() { + super(true, HiveProjectRel.DEFAULT_PROJECT_FACTORY); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HivePullUpProjectsAboveJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HivePullUpProjectsAboveJoinRule.java new file mode 100644 index 0000000..a48112e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HivePullUpProjectsAboveJoinRule.java @@ -0,0 +1,44 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.rules; + +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveProjectRel; +import org.eigenbase.rel.ProjectRelBase; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.rules.PullUpProjectsAboveJoinRule; +import org.eigenbase.relopt.RelOptRuleOperand; + +public class HivePullUpProjectsAboveJoinRule extends PullUpProjectsAboveJoinRule { + + public static final HivePullUpProjectsAboveJoinRule BOTH_PROJECT = new HivePullUpProjectsAboveJoinRule( + operand( + HiveJoinRel.class, + operand( + ProjectRelBase.class, + any()), + operand( + ProjectRelBase.class, + any())), + "HivePullUpProjectsAboveJoinRule: with two HiveProjectRel children"); + + public 
static final HivePullUpProjectsAboveJoinRule LEFT_PROJECT = new HivePullUpProjectsAboveJoinRule( + operand( + HiveJoinRel.class, + some(operand( + ProjectRelBase.class, + any()))), + "HivePullUpProjectsAboveJoinRule: with HiveProjectRel on left"); + + public static final HivePullUpProjectsAboveJoinRule RIGHT_PROJECT = new HivePullUpProjectsAboveJoinRule( + operand( + HiveJoinRel.class, + operand(RelNode.class, + any()), + operand( + ProjectRelBase.class, + any())), + "HivePullUpProjectsAboveJoinRule: with HiveProjectRel on right"); + + public HivePullUpProjectsAboveJoinRule(RelOptRuleOperand operand, String description) { + super(operand, description, HiveProjectRel.DEFAULT_PROJECT_FACTORY); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HivePushJoinThroughJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HivePushJoinThroughJoinRule.java new file mode 100644 index 0000000..01a408d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HivePushJoinThroughJoinRule.java @@ -0,0 +1,37 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.rules; + +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel.JoinAlgorithm; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveProjectRel; +import org.eigenbase.rel.JoinRelBase; +import org.eigenbase.rel.rules.PushJoinThroughJoinRule; +import org.eigenbase.relopt.RelOptRule; +import org.eigenbase.relopt.RelOptRuleCall; + +public class HivePushJoinThroughJoinRule extends PushJoinThroughJoinRule { + public static final RelOptRule RIGHT = new HivePushJoinThroughJoinRule( + "Hive PushJoinThroughJoinRule:right", true, + HiveJoinRel.class); + public static final RelOptRule LEFT = new HivePushJoinThroughJoinRule( + "Hive PushJoinThroughJoinRule:left", false, + HiveJoinRel.class); + + private HivePushJoinThroughJoinRule(String description, boolean right, + Class clazz) { + super(description, right, clazz, HiveProjectRel.DEFAULT_PROJECT_FACTORY); + } + + @Override + public boolean matches(RelOptRuleCall call) { + boolean isAMatch = false; + final HiveJoinRel topJoin = call.rel(0); + final HiveJoinRel bottomJoin = call.rel(1); + + if (topJoin.getJoinAlgorithm() == JoinAlgorithm.NONE + && bottomJoin.getJoinAlgorithm() == JoinAlgorithm.NONE) { + isAMatch = true; + } + + return isAMatch; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HiveSwapJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HiveSwapJoinRule.java new file mode 100644 index 0000000..d699de3 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/rules/HiveSwapJoinRule.java @@ -0,0 +1,22 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.rules; + +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel.JoinAlgorithm; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveProjectRel; + +import org.eigenbase.rel.rules.SwapJoinRule; +import org.eigenbase.relopt.RelOptRuleCall; + +public class HiveSwapJoinRule extends SwapJoinRule { + public static final HiveSwapJoinRule INSTANCE = new HiveSwapJoinRule(); + + private HiveSwapJoinRule() { + super(HiveJoinRel.class, HiveProjectRel.DEFAULT_PROJECT_FACTORY); + } + + @Override + public boolean matches(RelOptRuleCall call) { + return super.matches(call) + && call. 
rel(0).getJoinAlgorithm() == JoinAlgorithm.NONE; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/CBOTableStatsValidator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/CBOTableStatsValidator.java new file mode 100644 index 0000000..370feaa --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/CBOTableStatsValidator.java @@ -0,0 +1,90 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.stats; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Map; +import java.util.Stack; + +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.ForwardWalker; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; + +import com.google.common.collect.ImmutableMap; + +public class CBOTableStatsValidator { + private final CBOValidateStatsContext m_ctx = new CBOValidateStatsContext(); + + public boolean validStats(Operator sinkOp, ParseContext pCtx) { + Map rules = ImmutableMap + . builder() + .put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%"), + new TableScanProcessor()).build(); + + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), rules, m_ctx); + GraphWalker fWalker = new ForwardWalker(disp); + + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pCtx.getTopOps().values()); + + try { + fWalker.startWalking(topNodes, null); + } catch (SemanticException e) { + throw new RuntimeException(e); + } + + return (m_ctx.m_tabsWithIncompleteStats.isEmpty()); + } + + public String getIncompleteStatsTabNames() { + StringBuilder sb = new StringBuilder(); + for (String tabName : m_ctx.m_tabsWithIncompleteStats) { + if (sb.length() > 1) + sb.append(", "); + sb.append(tabName); + } + return sb.toString(); + } + + private static NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) { + return null; + // TODO: Shouldn't we throw exception? as this would imply we got an op + // tree with no TS + } + }; + } + + static class TableScanProcessor implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) { + TableScanOperator tableScanOp = (TableScanOperator) nd; + Statistics stats = tableScanOp.getStatistics(); + int noColsWithStats = (stats != null && stats.getColumnStats() != null) ? 
stats + .getColumnStats().size() : 0; + if (noColsWithStats != tableScanOp.getNeededColumns().size()) { + ((CBOValidateStatsContext) procCtx).m_tabsWithIncompleteStats.add(tableScanOp.getConf() + .getAlias()); + } + return null; + } + } + + static class CBOValidateStatsContext implements NodeProcessorCtx { + final private HashSet m_tabsWithIncompleteStats = new HashSet(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/FilterSelectivityEstimator.java new file mode 100644 index 0000000..887f8dd --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/FilterSelectivityEstimator.java @@ -0,0 +1,181 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.stats; + +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.metadata.RelMetadataQuery; +import org.eigenbase.relopt.RelOptUtil.InputReferencedVisitor; +import org.eigenbase.rex.RexCall; +import org.eigenbase.rex.RexInputRef; +import org.eigenbase.rex.RexNode; +import org.eigenbase.rex.RexVisitorImpl; +import org.eigenbase.sql.SqlKind; + +public class FilterSelectivityEstimator extends RexVisitorImpl { + private final RelNode m_childRel; + private final double m_childCardinality; + + protected FilterSelectivityEstimator(RelNode childRel) { + super(true); + m_childRel = childRel; + m_childCardinality = RelMetadataQuery.getRowCount(m_childRel); + } + + public Double estimateSelectivity(RexNode predicate) { + return predicate.accept(this); + } + + public Double visitCall(RexCall call) { + if (!deep) { + return 1.0; + } + + Double selectivity = null; + SqlKind op = call.getKind(); + + switch (op) { + case AND: { + selectivity = computeConjunctionSelectivity(call); + break; + } + + case OR: { + selectivity = computeDisjunctionSelectivity(call); + break; + } + + case NOT_EQUALS: { + selectivity = computeNotEqualitySelectivity(call); + } + + case LESS_THAN_OR_EQUAL: + case GREATER_THAN_OR_EQUAL: + case LESS_THAN: + case GREATER_THAN: { + selectivity = ((double) 1 / (double) 3); + break; + } + + case IN: { + selectivity = ((double) 1 / ((double) call.operands.size())); + break; + } + + default: + selectivity = computeFunctionSelectivity(call); + } + + return selectivity; + } + + /** + * NDV of "f1(x, y, z) != f2(p, q, r)" -> + * "(maxNDV(x,y,z,p,q,r) - 1)/maxNDV(x,y,z,p,q,r)". + *
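+ * Illustration (hypothetical numbers): if the columns referenced on either side of
+ * the inequality have at most 40 distinct values, maxNDV is 40 and the estimated
+ * selectivity is (40 - 1)/40 = 0.975.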

+ * + * @param call + * @return + */ + private Double computeNotEqualitySelectivity(RexCall call) { + double tmpNDV = getMaxNDV(call); + + if (tmpNDV > 1) + return (tmpNDV - (double) 1) / tmpNDV; + else + return 1.0; + } + + /** + * Selectivity of f(X,y,z) -> 1/maxNDV(x,y,z). + *
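+ * Illustration (hypothetical numbers): a predicate whose arguments reference a
+ * column with 200 distinct values gets selectivity 1/200 = 0.005.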

+ * Note that >, >=, <, <=, = ... are considered generic functions and use + * this method to find their selectivity. + * + * @param call + * @return + */ + private Double computeFunctionSelectivity(RexCall call) { + return 1 / getMaxNDV(call); + } + + /** + * Disjunction Selectivity -> (1 - (1-m1/n)(1-m2/n)) where n is the total + * number of tuples from the child and m1 and m2 are the expected number of tuples + * from each part of the disjunction predicate. + *
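+ * Illustration (hypothetical numbers): with n = 1000 child rows and two disjuncts
+ * whose individual selectivities are 0.2 and 0.1 (so m1 = 200, m2 = 100), the
+ * combined selectivity is 1 - (1 - 200/1000)(1 - 100/1000) = 1 - 0.8 * 0.9 = 0.28.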

+ * Note we compute m1. m2.. by applying selectivity of the disjunctive element + * on the cardinality from child. + * + * @param call + * @return + */ + private Double computeDisjunctionSelectivity(RexCall call) { + Double tmpCardinality; + Double tmpSelectivity; + double selectivity = 1; + + for (RexNode dje : call.getOperands()) { + tmpSelectivity = dje.accept(this); + if (tmpSelectivity == null) { + tmpSelectivity = 0.99; + } + tmpCardinality = m_childCardinality * tmpSelectivity; + + if (tmpCardinality > 1) + tmpSelectivity = (1 - tmpCardinality / m_childCardinality); + else + tmpSelectivity = 1.0; + + selectivity *= tmpSelectivity; + } + + if (selectivity > 1) + return (1 - selectivity); + else + return 1.0; + } + + /** + * Selectivity of conjunctive predicate -> (selectivity of conjunctive + * element1) * (selectivity of conjunctive element2)... + * + * @param call + * @return + */ + private Double computeConjunctionSelectivity(RexCall call) { + Double tmpSelectivity; + double selectivity = 1; + + for (RexNode cje : call.getOperands()) { + tmpSelectivity = cje.accept(this); + if (tmpSelectivity != null) { + selectivity *= tmpSelectivity; + } + } + + return selectivity; + } + + private Double getMaxNDV(RexCall call) { + double tmpNDV; + double maxNDV = 1.0; + InputReferencedVisitor irv; + + for (RexNode op : call.getOperands()) { + if (op instanceof RexInputRef) { + tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(m_childRel, + ((RexInputRef) op).getIndex()); + if (tmpNDV > maxNDV) + maxNDV = tmpNDV; + } else { + irv = new InputReferencedVisitor(); + irv.apply(op); + for (Integer childProjIndx : irv.inputPosReferenced) { + tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(m_childRel, childProjIndx); + if (tmpNDV > maxNDV) + maxNDV = tmpNDV; + } + } + } + + return maxNDV; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdDistinctRowCount.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdDistinctRowCount.java new file mode 100644 index 0000000..ccc5385 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdDistinctRowCount.java @@ -0,0 +1,62 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.stats; + +import java.util.BitSet; +import java.util.List; + +import net.hydromatic.optiq.BuiltinMethod; + +import org.apache.hadoop.hive.ql.optimizer.optiq.HiveOptiqUtil; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveTableScanRel; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.eigenbase.rel.JoinRelBase; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.metadata.ReflectiveRelMetadataProvider; +import org.eigenbase.rel.metadata.RelMdDistinctRowCount; +import org.eigenbase.rel.metadata.RelMdUtil; +import org.eigenbase.rel.metadata.RelMetadataProvider; +import org.eigenbase.rel.metadata.RelMetadataQuery; +import org.eigenbase.rex.RexNode; +import org.eigenbase.util14.NumberUtil; + +public class HiveRelMdDistinctRowCount extends RelMdDistinctRowCount { + public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider.reflectiveSource( + BuiltinMethod.DISTINCT_ROW_COUNT.method, + new HiveRelMdDistinctRowCount()); + + private HiveRelMdDistinctRowCount() { + } + + // Catch-all rule when none of the others apply. 
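+ // For a HiveTableScanRel the column-statistics based estimate below is used;
+ // for any other RelNode the distinct row count is approximated as
+ // rowCount(rel) * selectivity(rel, predicate).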
+ @Override + public Double getDistinctRowCount(RelNode rel, BitSet groupKey, RexNode predicate) { + if (rel instanceof HiveTableScanRel) { + return getDistinctRowCount((HiveTableScanRel) rel, groupKey, predicate); + } + + return NumberUtil.multiply(RelMetadataQuery.getRowCount(rel), + RelMetadataQuery.getSelectivity(rel, predicate)); + } + + private Double getDistinctRowCount(HiveTableScanRel htRel, BitSet groupKey, RexNode predicate) { + List projIndxLst = HiveOptiqUtil.translateBitSetToProjIndx(groupKey); + List colStats = htRel.getColStat(projIndxLst); + Double noDistinctRows = 1.0; + for (ColStatistics cStat : colStats) { + noDistinctRows *= cStat.getCountDistint(); + } + + return Math.min(noDistinctRows, htRel.getRows()); + } + + public static Double getDistinctRowCount(RelNode r, int indx) { + BitSet bitSetOfRqdProj = new BitSet(); + bitSetOfRqdProj.set(indx); + return RelMetadataQuery.getDistinctRowCount(r, bitSetOfRqdProj, r.getCluster().getRexBuilder() + .makeLiteral(true)); + } + + @Override + public Double getDistinctRowCount(JoinRelBase rel, BitSet groupKey, RexNode predicate) { + return RelMdUtil.getJoinDistinctRowCount(rel, rel.getJoinType(), groupKey, predicate, true); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java new file mode 100644 index 0000000..8f424fd --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/stats/HiveRelMdSelectivity.java @@ -0,0 +1,178 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.stats; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import net.hydromatic.optiq.BuiltinMethod; + +import org.apache.hadoop.hive.ql.optimizer.optiq.JoinUtil.JoinLeafPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.optiq.JoinUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveTableScanRel; +import org.eigenbase.rel.JoinRelType; +import org.eigenbase.rel.metadata.ReflectiveRelMetadataProvider; +import org.eigenbase.rel.metadata.RelMdSelectivity; +import org.eigenbase.rel.metadata.RelMdUtil; +import org.eigenbase.rel.metadata.RelMetadataProvider; +import org.eigenbase.rel.metadata.RelMetadataQuery; +import org.eigenbase.rex.RexNode; +import org.eigenbase.rex.RexUtil; + +import com.google.common.collect.ImmutableMap; + +public class HiveRelMdSelectivity extends RelMdSelectivity { + public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider.reflectiveSource( + BuiltinMethod.SELECTIVITY.method, + new HiveRelMdSelectivity()); + + protected HiveRelMdSelectivity() { + super(); + } + + public Double getSelectivity(HiveTableScanRel t, RexNode predicate) { + if (predicate != null) { + FilterSelectivityEstimator filterSelEstmator = new FilterSelectivityEstimator(t); + return filterSelEstmator.estimateSelectivity(predicate); + } + + return 1.0; + } + + public Double getSelectivity(HiveJoinRel j, RexNode predicate) { + if (j.getJoinType().equals(JoinRelType.INNER)) { + return computeInnerJoinSelectivity(j, predicate); + } + return 1.0; + } + + private Double computeInnerJoinSelectivity(HiveJoinRel j, RexNode predicate) { + double ndvCrossProduct = 1; + RexNode combinedPredicate = getCombinedPredicateForJoin(j, predicate); + JoinPredicateInfo jpi = JoinPredicateInfo.constructJoinPredicateInfo(j, combinedPredicate); + 
ImmutableMap.Builder colStatMapBuilder = ImmutableMap.builder(); + ImmutableMap colStatMap; + int rightOffSet = j.getLeft().getRowType().getFieldCount(); + + // 1. Update Col Stats Map with col stats for columns from left side of + // Join which are part of join keys + for (Integer ljk : jpi.getProjsFromLeftPartOfJoinKeysInChildSchema()) { + colStatMapBuilder.put(ljk, HiveRelMdDistinctRowCount.getDistinctRowCount(j.getLeft(), ljk)); + } + + // 2. Update Col Stats Map with col stats for columns from right side of + // Join which are part of join keys + for (Integer rjk : jpi.getProjsFromRightPartOfJoinKeysInChildSchema()) { + colStatMapBuilder.put(rjk + rightOffSet, + HiveRelMdDistinctRowCount.getDistinctRowCount(j.getRight(), rjk)); + } + colStatMap = colStatMapBuilder.build(); + + // 3. Walk through the Join Condition Building NDV for selectivity + // NDV of the join can not exceed the cardinality of cross join. + List peLst = jpi.getEquiJoinPredicateElements(); + int noOfPE = peLst.size(); + if (noOfPE > 0) { + // 3.1 Use first conjunctive predicate element's NDV as the seed + ndvCrossProduct = getMaxNDVForJoinSelectivity(peLst.get(0), colStatMap); + + // 3.2 if conjunctive predicate elements are more than one, then walk + // through them one by one. Compute cross product of NDV. Cross product is + // computed by multiplying the largest NDV of all of the conjunctive + // predicate + // elements with degraded NDV of rest of the conjunctive predicate + // elements. NDV is + // degraded using log function.Finally the ndvCrossProduct is fenced at + // the join + // cross product to ensure that NDV can not exceed worst case join + // cardinality.
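+ // Hypothetical illustration of the formula spelled out below: with two equi-join
+ // conjuncts whose max argument NDVs are 1000 and 50, ndvCrossProduct becomes
+ // 1000 * ln(50), roughly 3912, giving an estimated join selectivity of about
+ // 1/3912 (subject to the cross-product fence).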
+ // NDV of a conjunctive predicate element is the max NDV of all arguments + // to lhs, rhs expressions. + // NDV(JoinCondition) = min (left cardinality * right cardinality, + // ndvCrossProduct(JoinCondition)) + // ndvCrossProduct(JoinCondition) = ndv(pex)*log(ndv(pe1))*log(ndv(pe2)) + // where pex is the predicate element of join condition with max ndv. + // ndv(pe) = max(NDV(left.Expr), NDV(right.Expr)) + // NDV(expr) = max(NDV( expr args)) + if (noOfPE > 1) { + double maxNDVSoFar = ndvCrossProduct; + double ndvToBeSmoothed; + double tmpNDV; + + for (int i = 1; i < noOfPE; i++) { + tmpNDV = getMaxNDVForJoinSelectivity(peLst.get(i), colStatMap); + if (tmpNDV > maxNDVSoFar) { + ndvToBeSmoothed = maxNDVSoFar; + maxNDVSoFar = tmpNDV; + ndvCrossProduct = (ndvCrossProduct / ndvToBeSmoothed) * tmpNDV; + } else { + ndvToBeSmoothed = tmpNDV; + } + // TODO: revisit the fence + if (ndvToBeSmoothed > 3) + ndvCrossProduct *= Math.log(ndvToBeSmoothed); + else + ndvCrossProduct *= ndvToBeSmoothed; + } + + ndvCrossProduct = Math.min( + RelMetadataQuery.getRowCount(j.getLeft()) * RelMetadataQuery.getRowCount(j.getRight()), + ndvCrossProduct); + } + } + + // 4. Join Selectivity = 1/NDV + return (1 / ndvCrossProduct); + } + + private RexNode getCombinedPredicateForJoin(HiveJoinRel j, RexNode additionalPredicate) { + RexNode minusPred = RelMdUtil.minusPreds(j.getCluster().getRexBuilder(), additionalPredicate, + j.getCondition()); + + if (minusPred != null) { + List minusList = new ArrayList(); + minusList.add(j.getCondition()); + minusList.add(minusPred); + + return RexUtil.composeConjunction(j.getCluster().getRexBuilder(), minusList, true); + } + + return j.getCondition(); + } + + /** + * Compute Max NDV to determine Join Selectivity. + * + * @param jlpi + * @param colStatMap + * Immutable Map of Projection Index (in Join Schema) to Column Stat + * @param rightProjOffSet + * @return + */ + private static Double getMaxNDVForJoinSelectivity(JoinLeafPredicateInfo jlpi, + ImmutableMap colStatMap) { + Double maxNDVSoFar = 1.0; + + maxNDVSoFar = getMaxNDVFromProjections(colStatMap, + jlpi.getProjsFromLeftPartOfJoinKeysInJoinSchema(), maxNDVSoFar); + maxNDVSoFar = getMaxNDVFromProjections(colStatMap, + jlpi.getProjsFromRightPartOfJoinKeysInJoinSchema(), maxNDVSoFar); + + return maxNDVSoFar; + } + + private static Double getMaxNDVFromProjections(Map colStatMap, + Set projectionSet, Double defaultMaxNDV) { + Double colNDV = null; + Double maxNDVSoFar = defaultMaxNDV; + + for (Integer projIndx : projectionSet) { + colNDV = colStatMap.get(projIndx); + if (colNDV > maxNDVSoFar) + maxNDVSoFar = colNDV; + } + + return maxNDVSoFar; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/ASTBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/ASTBuilder.java new file mode 100644 index 0000000..7ddaceb --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/ASTBuilder.java @@ -0,0 +1,166 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.translator; + +import org.apache.hadoop.hive.ql.optimizer.optiq.RelOptHiveTable; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.ParseDriver; +import org.eigenbase.rel.JoinRelType; +import org.eigenbase.rel.TableAccessRelBase; +import org.eigenbase.rex.RexLiteral; +import org.eigenbase.sql.type.SqlTypeName; + +class ASTBuilder { + + static ASTBuilder construct(int tokenType, String text) { + ASTBuilder b = new 
ASTBuilder(); + b.curr = createAST(tokenType, text); + return b; + } + + static ASTNode createAST(int tokenType, String text) { + return (ASTNode) ParseDriver.adaptor.create(tokenType, text); + } + + static ASTNode destNode() { + return ASTBuilder + .construct(HiveParser.TOK_DESTINATION, "TOK_DESTINATION") + .add( + ASTBuilder.construct(HiveParser.TOK_DIR, "TOK_DIR").add(HiveParser.TOK_TMP_FILE, + "TOK_TMP_FILE")).node(); + } + + static ASTNode table(TableAccessRelBase scan) { + RelOptHiveTable hTbl = (RelOptHiveTable) scan.getTable(); + ASTBuilder b = ASTBuilder + .construct(HiveParser.TOK_TABREF, "TOK_TABREF") + .add( + ASTBuilder.construct(HiveParser.TOK_TABNAME, "TOK_TABNAME") + .add(HiveParser.Identifier, hTbl.getHiveTableMD().getDbName()) + .add(HiveParser.Identifier, hTbl.getHiveTableMD().getTableName())) + .add(HiveParser.Identifier, hTbl.getName()); + return b.node(); + } + + static ASTNode join(ASTNode left, ASTNode right, JoinRelType joinType, ASTNode cond) { + ASTBuilder b = null; + + switch (joinType) { + case INNER: + b = ASTBuilder.construct(HiveParser.TOK_JOIN, "TOK_JOIN"); + break; + case LEFT: + b = ASTBuilder.construct(HiveParser.TOK_LEFTOUTERJOIN, "TOK_LEFTOUTERJOIN"); + break; + case RIGHT: + b = ASTBuilder.construct(HiveParser.TOK_RIGHTOUTERJOIN, "TOK_RIGHTOUTERJOIN"); + break; + case FULL: + b = ASTBuilder.construct(HiveParser.TOK_FULLOUTERJOIN, "TOK_FULLOUTERJOIN"); + break; + } + + b.add(left).add(right).add(cond); + return b.node(); + } + + static ASTNode subQuery(ASTNode qry, String alias) { + return ASTBuilder.construct(HiveParser.TOK_SUBQUERY, "TOK_SUBQUERY").add(qry) + .add(HiveParser.Identifier, alias).node(); + } + + static ASTNode qualifiedName(String tableName, String colName) { + ASTBuilder b = ASTBuilder + .construct(HiveParser.DOT, ".") + .add( + ASTBuilder.construct(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL").add( + HiveParser.Identifier, tableName)).add(HiveParser.Identifier, colName); + return b.node(); + } + + static ASTNode unqualifiedName(String colName) { + ASTBuilder b = ASTBuilder +.construct(HiveParser.TOK_TABLE_OR_COL, + "TOK_TABLE_OR_COL").add(HiveParser.Identifier, colName); + return b.node(); + } + + static ASTNode where(ASTNode cond) { + return ASTBuilder.construct(HiveParser.TOK_WHERE, "TOK_WHERE").add(cond).node(); + } + + static ASTNode having(ASTNode cond) { + return ASTBuilder.construct(HiveParser.TOK_HAVING, "TOK_HAVING").add(cond).node(); + } + + static ASTNode limit(Object value) { + return ASTBuilder.construct(HiveParser.TOK_LIMIT, "TOK_LIMIT") + .add(HiveParser.Number, value.toString()).node(); + } + + static ASTNode selectExpr(ASTNode expr, String alias) { + return ASTBuilder.construct(HiveParser.TOK_SELEXPR, "TOK_SELEXPR").add(expr) + .add(HiveParser.Identifier, alias).node(); + } + + static ASTNode literal(RexLiteral literal) { + Object val = literal.getValue3(); + int type = 0; + SqlTypeName sqlType = literal.getType().getSqlTypeName(); + + switch (sqlType) { + case TINYINT: + type = HiveParser.TinyintLiteral; + break; + case SMALLINT: + type = HiveParser.SmallintLiteral; + break; + case INTEGER: + case BIGINT: + type = HiveParser.BigintLiteral; + break; + case DECIMAL: + case FLOAT: + case DOUBLE: + case REAL: + type = HiveParser.Number; + break; + case VARCHAR: + case CHAR: + type = HiveParser.StringLiteral; + val = "'" + String.valueOf(val) + "'"; + break; + case BOOLEAN: + type = ((Boolean) val).booleanValue() ? 
HiveParser.KW_TRUE + : HiveParser.KW_FALSE; + break; + + default: + throw new RuntimeException("Unsupported Type: " + sqlType); + } + + return (ASTNode) ParseDriver.adaptor.create(type, String.valueOf(val)); + } + + ASTNode curr; + + ASTNode node() { + return curr; + } + + ASTBuilder add(int tokenType, String text) { + ParseDriver.adaptor.addChild(curr, createAST(tokenType, text)); + return this; + } + + ASTBuilder add(ASTBuilder b) { + ParseDriver.adaptor.addChild(curr, b.curr); + return this; + } + + ASTBuilder add(ASTNode n) { + if (n != null) { + ParseDriver.adaptor.addChild(curr, n); + } + return this; + } +} \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/ASTConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/ASTConverter.java new file mode 100644 index 0000000..f70dfd5 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/ASTConverter.java @@ -0,0 +1,420 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.translator; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; + +import net.hydromatic.optiq.util.BitSets; + +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveSortRel; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.ParseDriver; +import org.eigenbase.rel.AggregateCall; +import org.eigenbase.rel.AggregateRelBase; +import org.eigenbase.rel.FilterRelBase; +import org.eigenbase.rel.JoinRelBase; +import org.eigenbase.rel.ProjectRelBase; +import org.eigenbase.rel.RelFieldCollation; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.RelVisitor; +import org.eigenbase.rel.SortRel; +import org.eigenbase.rel.TableAccessRelBase; +import org.eigenbase.reltype.RelDataTypeField; +import org.eigenbase.rex.RexCall; +import org.eigenbase.rex.RexInputRef; +import org.eigenbase.rex.RexLiteral; +import org.eigenbase.rex.RexNode; +import org.eigenbase.rex.RexUtil; +import org.eigenbase.rex.RexVisitorImpl; +import org.eigenbase.sql.SqlKind; +import org.eigenbase.sql.SqlOperator; +import org.eigenbase.sql.type.BasicSqlType; +import org.eigenbase.sql.type.SqlTypeName; + +import com.google.common.collect.Iterables; + +public class ASTConverter { + + RelNode root; + HiveAST hiveAST; + RelNode from; + FilterRelBase where; + AggregateRelBase groupBy; + FilterRelBase having; + ProjectRelBase select; + SortRel order; + + Schema schema; + + ASTConverter(RelNode root) { + this.root = root; + hiveAST = new HiveAST(); + } + + public static ASTNode convert(final RelNode relNode, List resultSchema) { + SortRel sortrel = null; + RelNode root = DerivedTableInjector.convertOpTree(relNode, resultSchema); + + if (root instanceof SortRel) { + sortrel = (SortRel) root; + root = sortrel.getChild(); + if (!(root instanceof ProjectRelBase)) + throw new RuntimeException("Child of root sort node is not a project"); + } + + ASTConverter c = new ASTConverter(root); + return c.convert(sortrel); + } + + public ASTNode convert(SortRel sortrel) { + /* + * 1. Walk RelNode Graph; note from, where, gBy.. nodes. + */ + new QBVisitor().go(root); + + /* + * 2. convert from node. + */ + QueryBlockInfo qb = convertSource(from); + schema = qb.schema; + hiveAST.from = ASTBuilder.construct(HiveParser.TOK_FROM, "TOK_FROM").add(qb.ast).node(); + + /* + * 3. 
convert filterNode + */ + if (where != null) { + ASTNode cond = where.getCondition().accept(new RexVisitor(schema)); + hiveAST.where = ASTBuilder.where(cond); + } + + /* + * 4. GBy + */ + if (groupBy != null) { + ASTBuilder b = ASTBuilder.construct(HiveParser.TOK_GROUPBY, "TOK_GROUPBY"); + for (int i : BitSets.toIter(groupBy.getGroupSet())) { + RexInputRef iRef = new RexInputRef(i, new BasicSqlType(SqlTypeName.ANY)); + b.add(iRef.accept(new RexVisitor(schema))); + } + hiveAST.groupBy = b.node(); + schema = new Schema(schema, groupBy); + } + + /* + * 5. Having + */ + if (having != null) { + ASTNode cond = having.getCondition().accept(new RexVisitor(schema)); + hiveAST.having = ASTBuilder.having(cond); + } + + /* + * 6. Project + */ + int i = 0; + ASTBuilder b = ASTBuilder.construct(HiveParser.TOK_SELECT, "TOK_SELECT"); + + for (RexNode r : select.getChildExps()) { + ASTNode selectExpr = ASTBuilder.selectExpr(r.accept(new RexVisitor(schema)), select + .getRowType().getFieldNames().get(i++)); + b.add(selectExpr); + } + hiveAST.select = b.node(); + + /* + * 7. Order + * Use in Order By from the block above. RelNode has no pointer to parent + * hence we need to go top down; but OB at each block really belong to its + * src/from. Hence the need to pass in sortRel for each block from its parent. + */ + if (sortrel != null) { + HiveSortRel hiveSort = (HiveSortRel) sortrel; + if (!hiveSort.getCollation().getFieldCollations().isEmpty()) { + ASTNode orderAst = ASTBuilder.createAST(HiveParser.TOK_ORDERBY, "TOK_ORDERBY"); + schema = new Schema((HiveSortRel) sortrel); + for (RelFieldCollation c : hiveSort.getCollation().getFieldCollations()) { + ColumnInfo cI = schema.get(c.getFieldIndex()); + /* + * The RowResolver setup for Select drops Table associations. So setup + * ASTNode on unqualified name. + */ + ASTNode astCol = ASTBuilder.unqualifiedName(cI.column); + ASTNode astNode = c.getDirection() == RelFieldCollation.Direction.ASCENDING + ? 
ASTBuilder.createAST(HiveParser.TOK_TABSORTCOLNAMEASC, "TOK_TABSORTCOLNAMEASC") + : ASTBuilder.createAST(HiveParser.TOK_TABSORTCOLNAMEDESC, "TOK_TABSORTCOLNAMEDESC"); + astNode.addChild(astCol); + orderAst.addChild(astNode); + } + hiveAST.order = orderAst; + } + RexNode limitExpr = hiveSort.getFetchExpr(); + if (limitExpr != null) { + Object val = ((RexLiteral) limitExpr).getValue2(); + hiveAST.limit = ASTBuilder.limit(val); + } + + } + + return hiveAST.getAST(); + } + + private Schema getRowSchema(String tblAlias) { + return new Schema(select, tblAlias); + } + + private QueryBlockInfo convertSource(RelNode r) { + Schema s; + ASTNode ast; + + if (r instanceof TableAccessRelBase) { + TableAccessRelBase f = (TableAccessRelBase) r; + s = new Schema(f); + ast = ASTBuilder.table(f); + } else if (r instanceof JoinRelBase) { + JoinRelBase join = (JoinRelBase) r; + QueryBlockInfo left = convertSource(join.getLeft()); + QueryBlockInfo right = convertSource(join.getRight()); + s = new Schema(left.schema, right.schema); + ASTNode cond = join.getCondition().accept(new RexVisitor(s)); + ast = ASTBuilder.join(left.ast, right.ast, join.getJoinType(), cond); + } else { + ASTConverter src = new ASTConverter(r); + ASTNode srcAST = src.convert(order); + String sqAlias = ASTConverter.nextAlias(); + s = src.getRowSchema(sqAlias); + ast = ASTBuilder.subQuery(srcAST, sqAlias); + } + return new QueryBlockInfo(s, ast); + } + + class QBVisitor extends RelVisitor { + + public void handle(FilterRelBase filter) { + RelNode child = filter.getChild(); + if (child instanceof AggregateRelBase) { + ASTConverter.this.having = filter; + } else { + ASTConverter.this.where = filter; + } + } + + public void handle(ProjectRelBase project) { + if (ASTConverter.this.select == null) { + ASTConverter.this.select = project; + } else { + ASTConverter.this.from = project; + } + } + + @Override + public void visit(RelNode node, int ordinal, RelNode parent) { + + if (node instanceof TableAccessRelBase) { + ASTConverter.this.from = node; + } else if (node instanceof FilterRelBase) { + handle((FilterRelBase) node); + } else if (node instanceof ProjectRelBase) { + handle((ProjectRelBase) node); + } else if (node instanceof JoinRelBase) { + ASTConverter.this.from = node; + } else if (node instanceof AggregateRelBase) { + ASTConverter.this.groupBy = (AggregateRelBase) node; + } else if (node instanceof SortRel) { + ASTConverter.this.order = (SortRel) node; + } + /* + * once the source node is reached; stop traversal for this QB + */ + if (ASTConverter.this.from == null) { + node.childrenAccept(this); + } + } + + } + + static class RexVisitor extends RexVisitorImpl { + + private final Schema schema; + + protected RexVisitor(Schema schema) { + super(true); + this.schema = schema; + } + + @Override + public ASTNode visitInputRef(RexInputRef inputRef) { + ColumnInfo cI = schema.get(inputRef.getIndex()); + if (cI.agg != null) { + return (ASTNode) ParseDriver.adaptor.dupTree(cI.agg); + } + return ASTBuilder.qualifiedName(cI.table, cI.column); + } + + @Override + public ASTNode visitLiteral(RexLiteral literal) { + return ASTBuilder.literal(literal); + } + + @Override + public ASTNode visitCall(RexCall call) { + if (!deep) { + return null; + } + + SqlOperator op = call.getOperator(); + List astNodeLst = new LinkedList(); + for (RexNode operand : call.operands) { + astNodeLst.add(operand.accept(this)); + } + if (isFlat(call)) + return SqlFunctionConverter.buildAST(op, astNodeLst, 0); + else + return SqlFunctionConverter.buildAST(op, astNodeLst); + } + 
} + + static class QueryBlockInfo { + Schema schema; + ASTNode ast; + + public QueryBlockInfo(Schema schema, ASTNode ast) { + super(); + this.schema = schema; + this.ast = ast; + } + } + + /* + * represents the schema exposed by a QueryBlock. + */ + static class Schema extends ArrayList { + + private static final long serialVersionUID = 1L; + + Schema(TableAccessRelBase scan) { + String tabName = scan.getTable().getQualifiedName().get(0); + for (RelDataTypeField field : scan.getRowType().getFieldList()) { + add(new ColumnInfo(tabName, field.getName())); + } + } + + Schema(ProjectRelBase select, String alias) { + for (RelDataTypeField field : select.getRowType().getFieldList()) { + add(new ColumnInfo(alias, field.getName())); + } + } + + @SuppressWarnings("unchecked") + Schema(Schema left, Schema right) { + for (ColumnInfo cI : Iterables.concat(left, right)) { + add(cI); + } + } + + Schema(Schema src, AggregateRelBase gBy) { + for (int i : BitSets.toIter(gBy.getGroupSet())) { + ColumnInfo cI = src.get(i); + add(cI); + } + List aggs = gBy.getAggCallList(); + for (AggregateCall agg : aggs) { + int argCount = agg.getArgList().size(); + ASTBuilder b = agg.isDistinct() ? ASTBuilder.construct(HiveParser.TOK_FUNCTIONDI, + "TOK_FUNCTIONDI") : argCount == 0 ? ASTBuilder.construct(HiveParser.TOK_FUNCTIONSTAR, + "TOK_FUNCTIONSTAR") : ASTBuilder.construct(HiveParser.TOK_FUNCTION, "TOK_FUNCTION"); + b.add(HiveParser.Identifier, agg.getAggregation().getName()); + for (int i : agg.getArgList()) { + RexInputRef iRef = new RexInputRef(i, new BasicSqlType(SqlTypeName.ANY)); + b.add(iRef.accept(new RexVisitor(src))); + } + add(new ColumnInfo(null, b.node())); + } + } + + /** + * Assumption:
+ * 1. ProjectRel will always be a child of SortRel.
+ * 2. In Optiq every projection in ProjectRelBase is uniquely named + * (unambiguous) without using the table qualifier (table name).
+ * + * @param order + * Hive Sort Rel Node + * @return Schema + */ + public Schema(HiveSortRel order) { + ProjectRelBase select = (ProjectRelBase) order.getChild(); + for (String projName : select.getRowType().getFieldNames()) { + add(new ColumnInfo(null, projName)); + } + } + } + + /* + * represents Column information exposed by a QueryBlock. + */ + static class ColumnInfo { + String table; + String column; + ASTNode agg; + + ColumnInfo(String table, String column) { + super(); + this.table = table; + this.column = column; + } + + ColumnInfo(String table, ASTNode agg) { + super(); + this.table = table; + this.agg = agg; + } + + ColumnInfo(String alias, ColumnInfo srcCol) { + this.table = alias; + this.column = srcCol.column; + this.agg = srcCol.agg; + } + } + + static String nextAlias() { + return String.format("$hdt$_%d", derivedTableCounter.getAndIncrement()); + } + + private static AtomicLong derivedTableCounter = new AtomicLong(0); + + static class HiveAST { + + ASTNode from; + ASTNode where; + ASTNode groupBy; + ASTNode having; + ASTNode select; + ASTNode order; + ASTNode limit; + + public ASTNode getAST() { + ASTBuilder b = ASTBuilder + .construct(HiveParser.TOK_QUERY, "TOK_QUERY") + .add(from) + .add( + ASTBuilder.construct(HiveParser.TOK_INSERT, "TOK_INSERT").add(ASTBuilder.destNode()) + .add(select).add(where).add(groupBy).add(having).add(order).add(limit)); + return b.node(); + } + } + + private static boolean isFlat(RexCall call) { + boolean flat = false; + if (call.operands != null && call.operands.size() > 2) { + SqlOperator op = call.getOperator(); + if (op.getKind() == SqlKind.AND || op.getKind() == SqlKind.OR) { + flat = true; + } + } + + return flat; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/DerivedTableInjector.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/DerivedTableInjector.java new file mode 100644 index 0000000..72dfbcd --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/DerivedTableInjector.java @@ -0,0 +1,214 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.translator; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveAggregateRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveProjectRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveSortRel; +import org.eigenbase.rel.AggregateRelBase; +import org.eigenbase.rel.EmptyRel; +import org.eigenbase.rel.FilterRelBase; +import org.eigenbase.rel.JoinRelBase; +import org.eigenbase.rel.OneRowRelBase; +import org.eigenbase.rel.ProjectRelBase; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.SetOpRel; +import org.eigenbase.rel.SingleRel; +import org.eigenbase.rel.TableAccessRelBase; +import org.eigenbase.rel.TableFunctionRelBase; +import org.eigenbase.rel.ValuesRelBase; +import org.eigenbase.rel.rules.MultiJoinRel; +import org.eigenbase.relopt.hep.HepRelVertex; +import org.eigenbase.relopt.volcano.RelSubset; +import org.eigenbase.reltype.RelDataTypeField; +import org.eigenbase.rex.RexInputRef; +import org.eigenbase.rex.RexNode; + +import com.google.common.base.Function; +import com.google.common.collect.Lists; + +public class DerivedTableInjector { + + public static RelNode convertOpTree(RelNode rel, List resultSchema) { + // Disable introducing top level select 
since Hive seems to have bugs with + // OB, Limit in sub query. + // RelNode newTopSelect = introduceTopLevelSelectInResultSchema(rel, + // resultSchema); + RelNode newTopSelect = rel; + convertOpTree(newTopSelect, (RelNode) null); + return newTopSelect; + } + + private static void convertOpTree(RelNode rel, RelNode parent) { + + if (rel instanceof EmptyRel) { + // TODO: replace with null scan + } else if (rel instanceof HepRelVertex) { + // TODO: is this relevant? + } else if (rel instanceof HiveJoinRel) { + if (!validJoinParent(rel, parent)) { + introduceDerivedTable(rel, parent); + } + } else if (rel instanceof MultiJoinRel) { + + } else if (rel instanceof OneRowRelBase) { + + } else if (rel instanceof RelSubset) { + + } else if (rel instanceof SetOpRel) { + + } else if (rel instanceof SingleRel) { + if (rel instanceof FilterRelBase) { + if (!validFilterParent(rel, parent)) { + introduceDerivedTable(rel, parent); + } + } else if (rel instanceof HiveSortRel) { + if (!validSortParent(rel, parent)) { + introduceDerivedTable(rel, parent); + } + if (!validSortChild((HiveSortRel) rel)) { + introduceDerivedTable(((HiveSortRel) rel).getChild(), rel); + } + } else if (rel instanceof HiveAggregateRel) { + if (!validGBParent(rel, parent)) { + introduceDerivedTable(rel, parent); + } + } + } else if (rel instanceof TableAccessRelBase) { + + } else if (rel instanceof TableFunctionRelBase) { + + } else if (rel instanceof ValuesRelBase) { + + } + + List childNodes = rel.getInputs(); + if (childNodes != null) { + for (RelNode r : childNodes) { + convertOpTree(r, rel); + } + } + } + + private static HiveProjectRel introduceTopLevelSelectInResultSchema(final RelNode rootRel, + List resultSchema) { + RelNode curNode = rootRel; + HiveProjectRel rootProjRel = null; + while (curNode != null) { + if (curNode instanceof HiveProjectRel) { + rootProjRel = (HiveProjectRel) curNode; + break; + } + curNode = curNode.getInput(0); + } + + //Assumption: tree could only be (limit)?(OB)?(ProjectRelBase).... 
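+ // The loop above walks down from the root to the first HiveProjectRel; the code
+ // below then builds a new top-level project over the root that references each
+ // output column by ordinal and renames it to the corresponding alias in resultSchema.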
+ List rootChildExps = rootProjRel.getChildExps(); + if (resultSchema.size() != rootChildExps.size()) { + throw new RuntimeException("Result Schema didn't match Optiq Optimized Op Tree Schema"); + } + + List newSelExps = new ArrayList(); + List newSelAliases = new ArrayList(); + for (int i = 0; i < rootChildExps.size(); i++) { + newSelExps.add(new RexInputRef(i, rootChildExps.get(i).getType())); + newSelAliases.add(resultSchema.get(i).getName()); + } + + return HiveProjectRel.create(rootRel, newSelExps, newSelAliases); + } + + private static void introduceDerivedTable(final RelNode rel, RelNode parent) { + int i = 0; + int pos = -1; + List childList = parent.getInputs(); + + for (RelNode child : childList) { + if (child == rel) { + pos = i; + break; + } + i++; + } + + if (pos == -1) { + throw new RuntimeException("Couldn't find child node in parent's inputs"); + } + + List projectList = Lists.transform(rel.getRowType().getFieldList(), + new Function() { + public RexNode apply(RelDataTypeField field) { + return rel.getCluster().getRexBuilder().makeInputRef(field.getType(), field.getIndex()); + } + }); + + HiveProjectRel select = HiveProjectRel.create(rel.getCluster(), rel, projectList, + rel.getRowType(), rel.getCollationList()); + parent.replaceInput(pos, select); + + } + + private static boolean validJoinParent(RelNode joinNode, RelNode parent) { + boolean validParent = true; + + if (parent instanceof JoinRelBase) { + if (((JoinRelBase) parent).getRight() == joinNode) { + validParent = false; + } + } else if (parent instanceof SetOpRel) { + validParent = false; + } + + return validParent; + } + + private static boolean validFilterParent(RelNode filterNode, RelNode parent) { + boolean validParent = true; + + // TOODO: Verify GB having is not a seperate filter (if so we shouldn't + // introduce derived table) + if (parent instanceof FilterRelBase || parent instanceof JoinRelBase + || parent instanceof SetOpRel) { + validParent = false; + } + + return validParent; + } + + private static boolean validGBParent(RelNode gbNode, RelNode parent) { + boolean validParent = true; + + // TOODO: Verify GB having is not a seperate filter (if so we shouldn't + // introduce derived table) + if (parent instanceof JoinRelBase || parent instanceof SetOpRel + || parent instanceof AggregateRelBase) { + validParent = false; + } + + return validParent; + } + + private static boolean validSortParent(RelNode sortNode, RelNode parent) { + boolean validParent = true; + + if (parent != null && !(parent instanceof ProjectRelBase)) { + validParent = false; + } + + return validParent; + } + + private static boolean validSortChild(HiveSortRel sortNode) { + boolean validChild = true; + RelNode child = sortNode.getChild(); + + if (!(child instanceof ProjectRelBase)) { + validChild = false; + } + + return validChild; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java new file mode 100644 index 0000000..62750a3 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RelNodeConverter.java @@ -0,0 +1,683 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.translator; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import 
java.util.Stack; + +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.LimitOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.ForwardWalker; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.optiq.RelOptHiveTable; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveAggregateRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveFilterRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveJoinRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveProjectRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveSortRel; +import org.apache.hadoop.hive.ql.optimizer.optiq.reloperators.HiveTableScanRel; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.QBJoinTree; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.JoinCondDesc; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.eigenbase.rel.AggregateCall; +import org.eigenbase.rel.Aggregation; +import org.eigenbase.rel.InvalidRelException; +import org.eigenbase.rel.JoinRelType; +import org.eigenbase.rel.RelCollation; +import org.eigenbase.rel.RelCollationImpl; +import org.eigenbase.rel.RelFieldCollation; +import org.eigenbase.rel.RelNode; +import org.eigenbase.rel.TableAccessRelBase; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.relopt.RelOptSchema; +import org.eigenbase.relopt.RelTraitSet; +import org.eigenbase.reltype.RelDataType; +import org.eigenbase.reltype.RelDataTypeField; +import org.eigenbase.rex.RexCall; +import org.eigenbase.rex.RexInputRef; +import org.eigenbase.rex.RexNode; +import org.eigenbase.rex.RexUtil; +import org.eigenbase.sql.fun.SqlStdOperatorTable; +import org.eigenbase.util.CompositeList; +import org.eigenbase.util.Pair; + +import com.esotericsoftware.minlog.Log; +import com.google.common.base.Function; +import 
com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; + +public class RelNodeConverter { + private static final Map AGG_MAP = ImmutableMap + . builder() + .put("count", (Aggregation) SqlStdOperatorTable.COUNT) + .put("sum", SqlStdOperatorTable.SUM).put("min", SqlStdOperatorTable.MIN) + .put("max", SqlStdOperatorTable.MAX).put("avg", SqlStdOperatorTable.AVG) + .put("stddev_samp", SqlFunctionConverter.hiveAggFunction("stddev_samp")) + .build(); + + public static RelNode convert(Operator sinkOp, RelOptCluster cluster, + RelOptSchema schema, SemanticAnalyzer sA, ParseContext pCtx) { + + Context ctx = new Context(cluster, schema, sA, pCtx); + + Map rules = ImmutableMap + . builder() + .put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%"), + new TableScanProcessor()) + .put(new RuleRegExp("R2", FilterOperator.getOperatorName() + "%"), new FilterProcessor()) + .put(new RuleRegExp("R3", SelectOperator.getOperatorName() + "%"), new SelectProcessor()) + .put(new RuleRegExp("R4", JoinOperator.getOperatorName() + "%"), new JoinProcessor()) + .put(new RuleRegExp("R5", LimitOperator.getOperatorName() + "%"), new LimitProcessor()) + .put(new RuleRegExp("R6", GroupByOperator.getOperatorName() + "%"), new GroupByProcessor()) + .put(new RuleRegExp("R7", ReduceSinkOperator.getOperatorName() + "%"), + new ReduceSinkProcessor()).build(); + + Dispatcher disp = new DefaultRuleDispatcher(new DefaultProcessor(), rules, ctx); + GraphWalker egw = new ForwardWalker(disp); + + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pCtx.getTopOps().values()); + + HashMap outputMap = new HashMap(); + try { + egw.startWalking(topNodes, outputMap); + } catch (SemanticException se) { + // @revisit + throw new RuntimeException(se); + } + return (HiveRel) outputMap.get(sinkOp); + } + + static class Context implements NodeProcessorCtx { + RelOptCluster cluster; + RelOptSchema schema; + SemanticAnalyzer sA; + ParseContext parseCtx; + /* + * A Map from hive column internalNames to Optiq positions. A separate map + * for each Operator. + */ + Map> opPositionMap; + + Map, RelNode> hiveOpToRelNode; + + public Context(RelOptCluster cluster, RelOptSchema schema, SemanticAnalyzer sA, + ParseContext parseCtx) { + super(); + this.cluster = cluster; + this.schema = schema; + this.sA = sA; + this.parseCtx = parseCtx; + opPositionMap = new HashMap>(); + hiveOpToRelNode = new HashMap, RelNode>(); + } + + void buildColumnMap(Operator op, RelNode rNode) { + RowSchema rr = op.getSchema(); + ImmutableMap.Builder b = new ImmutableMap.Builder(); + int i = 0; + for (ColumnInfo ci : rr.getSignature()) { + b.put(ci.getInternalName(), i); + i++; + } + opPositionMap.put(rNode, b.build()); + } + + /* + * Why special handling for TableScan? - the RowResolver coming from hive + * for TScan still has all the columns, whereas the Optiq type we build is + * based on the needed columns in the TScanOp. 
+ */ + void buildColumnMap(TableScanOperator tsOp, RelNode rNode) { + RelDataType oType = rNode.getRowType(); + int i = 0; + ImmutableMap.Builder b = new ImmutableMap.Builder(); + for (String fN : oType.getFieldNames()) { + b.put(fN, i); + i++; + } + opPositionMap.put(rNode, b.build()); + } + + Map reducerMap(Map inpMap, ReduceSinkOperator rsOp) { + ImmutableMap.Builder b = new ImmutableMap.Builder(); + Map colExprMap = rsOp.getColumnExprMap(); + for (Map.Entry e : colExprMap.entrySet()) { + String inpCol = ((ExprNodeColumnDesc) e.getValue()).getColumn(); + b.put(e.getKey(), inpMap.get(inpCol)); + } + return b.build(); + } + + /* + * The Optiq JoinRel datatype is formed by combining the columns from its + * input RelNodes. Whereas the Hive RowResolver of the JoinOp contains only + * the columns needed by childOps. + */ + void buildColumnMap(JoinOperator jOp, HiveJoinRel jRel) throws SemanticException { + RowResolver rr = sA.getRowResolver(jOp); + QBJoinTree hTree = parseCtx.getJoinContext().get(jOp); + Map leftMap = opPositionMap.get(jRel.getLeft()); + Map rightMap = opPositionMap.get(jRel.getRight()); + leftMap = reducerMap(leftMap, (ReduceSinkOperator) jOp.getParentOperators().get(0)); + rightMap = reducerMap(rightMap, (ReduceSinkOperator) jOp.getParentOperators().get(1)); + int leftColCount = jRel.getLeft().getRowType().getFieldCount(); + ImmutableMap.Builder b = new ImmutableMap.Builder(); + for (Map.Entry> tableEntry : rr.getRslvMap() + .entrySet()) { + String table = tableEntry.getKey(); + LinkedHashMap cols = tableEntry.getValue(); + Map posMap = leftMap; + int offset = 0; + if (hTree.getRightAliases() != null) { + for (String rAlias : hTree.getRightAliases()) { + if (table.equals(rAlias)) { + posMap = rightMap; + offset = leftColCount; + break; + } + } + } + for (Map.Entry colEntry : cols.entrySet()) { + ColumnInfo ci = colEntry.getValue(); + ExprNodeDesc e = jOp.getColumnExprMap().get(ci.getInternalName()); + String cName = ((ExprNodeColumnDesc) e).getColumn(); + int pos = posMap.get(cName); + + b.put(ci.getInternalName(), pos + offset); + } + } + opPositionMap.put(jRel, b.build()); + } + + void propagatePosMap(RelNode node, RelNode parent) { + opPositionMap.put(node, opPositionMap.get(parent)); + } + + RexNode convertToOptiqExpr(final ExprNodeDesc expr, final RelNode optiqOP, final boolean flatten) { + return convertToOptiqExpr(expr, optiqOP, 0, flatten); + } + + RexNode convertToOptiqExpr(final ExprNodeDesc expr, final RelNode optiqOP, int offset, final boolean flatten) { + ImmutableMap posMap = opPositionMap.get(optiqOP); + RexNodeConverter c = new RexNodeConverter(cluster, optiqOP.getRowType(), posMap, offset, flatten); + return c.convert(expr); + } + + RelNode getParentNode(Operator hiveOp, int i) { + Operator p = hiveOp.getParentOperators().get(i); + return p == null ? null : hiveOpToRelNode.get(p); + } + + } + + static class JoinProcessor implements NodeProcessor { + @SuppressWarnings("unchecked") + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + Context ctx = (Context) procCtx; + HiveRel left = (HiveRel) ctx.getParentNode((Operator) nd, 0); + HiveRel right = (HiveRel) ctx.getParentNode((Operator) nd, 1); + JoinOperator joinOp = (JoinOperator) nd; + JoinCondDesc[] jConds = joinOp.getConf().getConds(); + assert jConds.length == 1; + HiveJoinRel joinRel = convertJoinOp(ctx, joinOp, jConds[0], left, right); + ctx.buildColumnMap(joinOp, joinRel); + ctx.hiveOpToRelNode.put(joinOp, joinRel); + return joinRel; + } + + /* + * @todo: cleanup, for now just copied from HiveToOptiqRelConvereter + */ + private HiveJoinRel convertJoinOp(Context ctx, JoinOperator op, JoinCondDesc jc, + HiveRel leftRel, HiveRel rightRel) { + HiveJoinRel joinRel; + Operator leftParent = op.getParentOperators().get(jc.getLeft()); + Operator rightParent = op.getParentOperators().get(jc.getRight()); + + if (leftParent instanceof ReduceSinkOperator && rightParent instanceof ReduceSinkOperator) { + List leftCols = ((ReduceSinkDesc) (leftParent.getConf())).getKeyCols(); + List rightCols = ((ReduceSinkDesc) (rightParent.getConf())).getKeyCols(); + RexNode joinPredicate = null; + JoinRelType joinType = JoinRelType.INNER; + int rightColOffSet = leftRel.getRowType().getFieldCount(); + + // TODO: what about semi join + switch (jc.getType()) { + case JoinDesc.INNER_JOIN: + joinType = JoinRelType.INNER; + break; + case JoinDesc.LEFT_OUTER_JOIN: + joinType = JoinRelType.LEFT; + break; + case JoinDesc.RIGHT_OUTER_JOIN: + joinType = JoinRelType.RIGHT; + break; + case JoinDesc.FULL_OUTER_JOIN: + joinType = JoinRelType.FULL; + break; + } + + int i = 0; + for (ExprNodeDesc expr : leftCols) { + List eqExpr = new LinkedList(); + eqExpr.add(ctx.convertToOptiqExpr(expr, leftRel, 0, false)); + eqExpr.add(ctx.convertToOptiqExpr(rightCols.get(i), rightRel, rightColOffSet, false)); + + RexNode eqOp = ctx.cluster.getRexBuilder().makeCall(SqlStdOperatorTable.EQUALS, eqExpr); + i++; + + if (joinPredicate == null) { + joinPredicate = eqOp; + } else { + List conjElements = new LinkedList(); + conjElements.add(joinPredicate); + conjElements.add(eqOp); + joinPredicate = ctx.cluster.getRexBuilder().makeCall(SqlStdOperatorTable.AND, + conjElements); + } + } + + // Translate non-joinkey predicate + Set>> filterExprSet = op.getConf().getFilters().entrySet(); + if (!filterExprSet.isEmpty()) { + RexNode eqExpr; + int colOffSet; + RelNode childRel; + Operator parentHiveOp; + int inputId; + + for (Entry> entry : filterExprSet) { + inputId = entry.getKey().intValue(); + if (inputId == 0) { + colOffSet = 0; + childRel = leftRel; + parentHiveOp = leftParent; + } else if (inputId == 1) { + colOffSet = rightColOffSet; + childRel = rightRel; + parentHiveOp = rightParent; + } else { + throw new RuntimeException("Invalid Join Input"); + } + + for (ExprNodeDesc expr : entry.getValue()) { + eqExpr = ctx.convertToOptiqExpr(expr, childRel, colOffSet, false); + List conjElements = new LinkedList(); + conjElements.add(joinPredicate); + conjElements.add(eqExpr); + joinPredicate = ctx.cluster.getRexBuilder().makeCall(SqlStdOperatorTable.AND, + conjElements); + } + } + } + + joinRel = HiveJoinRel.getJoin(ctx.cluster, leftRel, rightRel, joinPredicate, joinType); + } else { + throw new RuntimeException("Right & Left of Join Condition columns are not equal"); + } + + return joinRel; + } + } + + private static int convertExpr(Context ctx, RelNode input, ExprNodeDesc expr, + List extraExprs) { + final RexNode rex = ctx.convertToOptiqExpr(expr, input, false); + final int index; + if (rex 
instanceof RexInputRef) { + index = ((RexInputRef) rex).getIndex(); + } else { + index = input.getRowType().getFieldCount() + extraExprs.size(); + extraExprs.add(rex); + } + return index; + } + + private static AggregateCall convertAgg(Context ctx, AggregationDesc agg, RelNode input, + ColumnInfo cI, List extraExprs) { + final Aggregation aggregation = AGG_MAP.get(agg.getGenericUDAFName()); + if (aggregation == null) { + throw new AssertionError("agg not found: " + agg.getGenericUDAFName()); + } + + List argList = new ArrayList(); + RelDataType type = TypeConverter.convert(cI.getType(), ctx.cluster.getTypeFactory()); + if (aggregation.equals(SqlStdOperatorTable.AVG)) { + type = type.getField("sum", false).getType(); + } + for (ExprNodeDesc expr : agg.getParameters()) { + int index = convertExpr(ctx, input, expr, extraExprs); + argList.add(index); + } + + /* + * set the type to the first arg, it there is one; because the RTi set on + * Aggregation call assumes this is the output type. + */ + if (argList.size() > 0) { + RexNode rex = ctx.convertToOptiqExpr(agg.getParameters().get(0), input, false); + type = rex.getType(); + } + return new AggregateCall(aggregation, agg.getDistinct(), argList, type, null); + } + + static class FilterProcessor implements NodeProcessor { + @SuppressWarnings("unchecked") + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + Context ctx = (Context) procCtx; + HiveRel input = (HiveRel) ctx.getParentNode((Operator) nd, 0); + FilterOperator filterOp = (FilterOperator) nd; + RexNode convertedFilterExpr = ctx + .convertToOptiqExpr(filterOp.getConf().getPredicate(), input, true); + + // Flatten the condition otherwise Optiq chokes on assertion + // (FilterRelBase) + if (convertedFilterExpr instanceof RexCall) { + RexCall call = (RexCall) convertedFilterExpr; + convertedFilterExpr = ctx.cluster.getRexBuilder().makeFlatCall(call.getOperator(), + call.getOperands()); + } + + HiveRel filtRel = new HiveFilterRel(ctx.cluster, ctx.cluster.traitSetOf(HiveRel.CONVENTION), + input, convertedFilterExpr); + ctx.propagatePosMap(filtRel, input); + ctx.hiveOpToRelNode.put(filterOp, filtRel); + return filtRel; + } + } + + static class SelectProcessor implements NodeProcessor { + @SuppressWarnings("unchecked") + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + Context ctx = (Context) procCtx; + HiveRel inputRelNode = (HiveRel) ctx.getParentNode((Operator) nd, 0); + SelectOperator selectOp = (SelectOperator) nd; + + List colLst = selectOp.getConf().getColList(); + List optiqColLst = new LinkedList(); + + for (ExprNodeDesc colExpr : colLst) { + optiqColLst.add(ctx.convertToOptiqExpr(colExpr, inputRelNode, false)); + } + + /* + * Hive treats names that start with '_c' as internalNames; so change the + * names so we don't run into this issue when converting back to Hive AST. + */ + List oFieldNames = Lists.transform(selectOp.getConf().getOutputColumnNames(), + new Function() { + public String apply(String hName) { + return "_o_" + hName; + } + }); + + HiveRel selRel = HiveProjectRel.create(inputRelNode, optiqColLst, oFieldNames); + ctx.buildColumnMap(selectOp, selRel); + ctx.hiveOpToRelNode.put(selectOp, selRel); + return selRel; + } + } + + static class LimitProcessor implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + Context ctx = (Context) procCtx; + HiveRel input = (HiveRel) ctx.getParentNode((Operator) nd, 0); + LimitOperator limitOp = (LimitOperator) nd; + + // in Optiq, a limit is represented as a sort on 0 columns + final RexNode fetch; + if (limitOp.getConf().getLimit() >= 0) { + fetch = ctx.cluster.getRexBuilder().makeExactLiteral( + BigDecimal.valueOf(limitOp.getConf().getLimit())); + } else { + fetch = null; + } + RelTraitSet traitSet = ctx.cluster.traitSetOf(HiveRel.CONVENTION); + RelCollation canonizedCollation = traitSet.canonize(RelCollationImpl.EMPTY); + HiveRel sortRel = new HiveSortRel(ctx.cluster, traitSet, input, canonizedCollation, null, + fetch); + ctx.propagatePosMap(sortRel, input); + ctx.hiveOpToRelNode.put(limitOp, sortRel); + return sortRel; + } + } + + static class GroupByProcessor implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + Context ctx = (Context) procCtx; + + HiveRel input = (HiveRel) ctx.getParentNode((Operator) nd, 0); + GroupByOperator groupByOp = (GroupByOperator) nd; + RowResolver rr = ctx.sA.getRowResolver(groupByOp); + ArrayList signature = rr.getRowSchema().getSignature(); + + // GroupBy is represented by two operators, one map side and one reduce + // side. We only translate the map-side one. + if (groupByOp.getParentOperators().get(0) instanceof ReduceSinkOperator) { + ctx.hiveOpToRelNode.put(groupByOp, input); + return input; + } + + final List extraExprs = Lists.newArrayList(); + final BitSet groupSet = new BitSet(); + for (ExprNodeDesc key : groupByOp.getConf().getKeys()) { + int index = convertExpr(ctx, input, key, extraExprs); + groupSet.set(index); + } + List aggregateCalls = Lists.newArrayList(); + int i = groupByOp.getConf().getKeys().size(); + for (AggregationDesc agg : groupByOp.getConf().getAggregators()) { + aggregateCalls.add(convertAgg(ctx, agg, input, signature.get(i++), extraExprs)); + } + + if (!extraExprs.isEmpty()) { + // noinspection unchecked + input = HiveProjectRel.create(input, CompositeList.of(Lists.transform(input.getRowType() + .getFieldList(), new Function() { + public RexNode apply(RelDataTypeField input) { + return new RexInputRef(input.getIndex(), input.getType()); + } + }), extraExprs), null); + } + try { + HiveRel aggregateRel = new HiveAggregateRel(ctx.cluster, + ctx.cluster.traitSetOf(HiveRel.CONVENTION), input, groupSet, aggregateCalls); + ctx.buildColumnMap(groupByOp, aggregateRel); + ctx.hiveOpToRelNode.put(groupByOp, aggregateRel); + return aggregateRel; + } catch (InvalidRelException e) { + throw new AssertionError(e); // not possible + } + } + } + + static class ReduceSinkProcessor implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + Context ctx = (Context) procCtx; + HiveRel input = (HiveRel) ctx.getParentNode((Operator) nd, 0); + ReduceSinkOperator sinkOp = (ReduceSinkOperator) nd; + + // It is a sort reducer if and only if the number of reducers is 1. + final ReduceSinkDesc conf = sinkOp.getConf(); + if (conf.getNumReducers() != 1) { + Operator op = (Operator) nd; + ctx.hiveOpToRelNode.put(op, input); + return input; + } + + final String order = conf.getOrder(); // "+-" means "ASC, DESC" + assert order.length() == conf.getKeyCols().size(); + + /* + * numReducers == 1 and order.length = 1 => a RS for CrossJoin. 
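+ * (The check below treats an empty order string, i.e. no key columns to sort on, as such a cross-join RS and forwards the input RelNode unchanged.)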
+ */ + if ( order.length() == 0 ) { + Operator op = (Operator) nd; + ctx.hiveOpToRelNode.put(op, input); + return input; + } + + final List fieldCollations = Lists.newArrayList(); + final List extraExprs = Lists.newArrayList(); + for (Pair pair : Pair.zip(conf.getKeyCols(), + Lists.charactersOf(order))) { + int index = convertExpr(ctx, input, pair.left, extraExprs); + RelFieldCollation.Direction direction = getDirection(pair.right); + fieldCollations.add(new RelFieldCollation(index, direction)); + } + + if (!extraExprs.isEmpty()) { + // noinspection unchecked + input = HiveProjectRel.create(input, CompositeList.of(Lists.transform(input.getRowType() + .getFieldList(), new Function() { + public RexNode apply(RelDataTypeField input) { + return new RexInputRef(input.getIndex(), input.getType()); + } + }), extraExprs), null); + } + + RelTraitSet traitSet = ctx.cluster.traitSetOf(HiveRel.CONVENTION); + RelCollation canonizedCollation = traitSet.canonize(RelCollationImpl.of(fieldCollations)); + HiveRel sortRel = new HiveSortRel(ctx.cluster, traitSet, input, canonizedCollation, null, + null); + ctx.propagatePosMap(sortRel, input); + ctx.hiveOpToRelNode.put(sinkOp, sortRel); + + // REVIEW: Do we need to remove the columns we added due to extraExprs? + + return sortRel; + } + + private RelFieldCollation.Direction getDirection(char c) { + switch (c) { + case '+': + return RelFieldCollation.Direction.ASCENDING; + case '-': + return RelFieldCollation.Direction.DESCENDING; + default: + throw new AssertionError("unexpected direction " + c); + } + } + } + + static class TableScanProcessor implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + Context ctx = (Context) procCtx; + TableScanOperator tableScanOp = (TableScanOperator) nd; + RowResolver rr = ctx.sA.getRowResolver(tableScanOp); + + List neededCols = new ArrayList( + tableScanOp.getNeededColumns()); + Statistics stats = tableScanOp.getStatistics(); + + try { + stats = addPartitionColumns(ctx, tableScanOp, tableScanOp.getConf() + .getAlias(), ctx.sA.getTable(tableScanOp), stats, neededCols); + } catch (CloneNotSupportedException ce) { + throw new SemanticException(ce); + } + + if (stats.getColumnStats().size() != neededCols.size()) { + throw new SemanticException("Incomplete Col stats for table: " + + tableScanOp.getConf().getAlias()); + } + RelDataType rowType = TypeConverter.getType(ctx.cluster, rr, neededCols); + RelOptHiveTable optTable = new RelOptHiveTable(ctx.schema, tableScanOp.getConf().getAlias(), + rowType, ctx.sA.getTable(tableScanOp), stats); + TableAccessRelBase tableRel = new HiveTableScanRel(ctx.cluster, + ctx.cluster.traitSetOf(HiveRel.CONVENTION), optTable, rowType); + ctx.buildColumnMap(tableScanOp, tableRel); + ctx.hiveOpToRelNode.put(tableScanOp, tableRel); + return tableRel; + } + + /* + * Add partition columns to needed columns and fake the COlStats for it. 
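+ * A partition column with no recorded column stats gets a synthetic ColStatistics entry whose distinct-value count is the number of pruned partitions.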
+ */ + private Statistics addPartitionColumns(Context ctx, + TableScanOperator tableScanOp, String tblAlias, Table tbl, + Statistics stats, List neededCols) + throws CloneNotSupportedException { + if (!tbl.isPartitioned()) { + return stats; + } + List pStats = new ArrayList(); + List pCols = tbl.getPartCols(); + for (FieldSchema pC : pCols) { + neededCols.add(pC.getName()); + ColStatistics cStats = stats.getColumnStatisticsForColumn(tblAlias, + pC.getName()); + if (cStats == null) { + PrunedPartitionList partList = ctx.parseCtx.getOpToPartList().get( + tableScanOp); + cStats = new ColStatistics(tblAlias, pC.getName(), pC.getType()); + cStats.setCountDistint(partList.getPartitions().size()); + pStats.add(cStats); + } + } + if (pStats.size() > 0) { + stats = stats.clone(); + stats.addToColumnStats(pStats); + } + + return stats; + } + } + + static class DefaultProcessor implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + @SuppressWarnings("unchecked") + Operator op = (Operator) nd; + Context ctx = (Context) procCtx; + RelNode node = (HiveRel) ctx.getParentNode(op, 0); + ctx.hiveOpToRelNode.put(op, node); + return node; + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RexNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RexNodeConverter.java new file mode 100644 index 0000000..a1fcadf --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/RexNodeConverter.java @@ -0,0 +1,130 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.translator; + +import java.math.BigDecimal; +import java.util.LinkedList; +import java.util.List; + +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.reltype.RelDataType; +import org.eigenbase.reltype.RelDataTypeFactory; +import org.eigenbase.rex.RexBuilder; +import org.eigenbase.rex.RexCall; +import org.eigenbase.rex.RexNode; +import org.eigenbase.sql.SqlOperator; + +import com.google.common.collect.ImmutableMap; + +public class RexNodeConverter { + + private final RelOptCluster m_cluster; + private final RelDataType m_inpDataType; + private final ImmutableMap m_nameToPosMap; + private final int m_offset; + private final boolean m_flattenExpr; + + public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType, + ImmutableMap nameToPosMap, int offset, boolean flattenExpr) { + this.m_cluster = cluster; + this.m_inpDataType = inpDataType; + this.m_nameToPosMap = nameToPosMap; + this.m_offset = offset; + m_flattenExpr = flattenExpr; + } + + public RexNode convert(ExprNodeDesc expr) { + if (expr instanceof ExprNodeGenericFuncDesc) { + return convert((ExprNodeGenericFuncDesc) expr); + } else if (expr instanceof ExprNodeConstantDesc) { + return convert((ExprNodeConstantDesc) expr); + } else if (expr instanceof ExprNodeColumnDesc) { + return convert((ExprNodeColumnDesc) expr); + } else { + throw new RuntimeException("Unsupported Expression"); + } + // TODO: handle a) ExprNodeNullDesc b) ExprNodeFieldDesc c) + // ExprNodeColumnListDesc + } + + private 
RexNode convert(final ExprNodeGenericFuncDesc func) { + SqlOperator optiqOp = SqlFunctionConverter.getOptiqOperator(func.getGenericUDF()); + List childRexNodeLst = new LinkedList(); + + for (ExprNodeDesc childExpr : func.getChildren()) { + childRexNodeLst.add(convert(childExpr)); + } + + RexNode convertedFilterExpr = m_cluster.getRexBuilder().makeCall(optiqOp, childRexNodeLst); + if (m_flattenExpr && convertedFilterExpr instanceof RexCall) { + RexCall call = (RexCall) convertedFilterExpr; + convertedFilterExpr = m_cluster.getRexBuilder().makeFlatCall(call.getOperator(), + call.getOperands()); + } + + return convertedFilterExpr; + } + + protected RexNode convert(ExprNodeColumnDesc col) { + int pos = m_nameToPosMap.get(col.getColumn()); + return m_cluster.getRexBuilder().makeInputRef(m_inpDataType.getFieldList().get(pos).getType(), + pos + m_offset); + } + + protected RexNode convert(ExprNodeConstantDesc literal) { + RexBuilder rexBuilder = m_cluster.getRexBuilder(); + RelDataTypeFactory dtFactory = rexBuilder.getTypeFactory(); + PrimitiveTypeInfo hiveType = (PrimitiveTypeInfo) literal.getTypeInfo(); + RelDataType optiqDataType = TypeConverter.convert(hiveType, dtFactory); + + PrimitiveCategory hiveTypeCategory = hiveType.getPrimitiveCategory(); + RexNode optiqLiteral = null; + Object value = literal.getValue(); + + // TODO: Verify if we need to use ConstantObjectInspector to unwrap data + switch (hiveTypeCategory) { + case BOOLEAN: + optiqLiteral = rexBuilder.makeLiteral(((Boolean) value).booleanValue()); + break; + case BYTE: + optiqLiteral = rexBuilder.makeExactLiteral(new BigDecimal((Short) value)); + break; + case SHORT: + optiqLiteral = rexBuilder.makeExactLiteral(new BigDecimal((Short) value)); + break; + case INT: + optiqLiteral = rexBuilder.makeExactLiteral(new BigDecimal((Integer) value)); + break; + case LONG: + optiqLiteral = rexBuilder.makeBigintLiteral(new BigDecimal((Long) value)); + break; + // TODO: is Decimal an exact numeric or approximate numeric? 
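+ // SQL DECIMAL is an exact numeric type, so an exact literal is built below.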
+ case DECIMAL: + optiqLiteral = rexBuilder.makeExactLiteral((BigDecimal) value); + break; + case FLOAT: + optiqLiteral = rexBuilder.makeApproxLiteral(new BigDecimal((Float) value), optiqDataType); + break; + case DOUBLE: + optiqLiteral = rexBuilder.makeApproxLiteral(new BigDecimal((Double) value), optiqDataType); + break; + case STRING: + optiqLiteral = rexBuilder.makeLiteral((String) value); + break; + case DATE: + case TIMESTAMP: + case BINARY: + case VOID: + case UNKNOWN: + default: + throw new RuntimeException("UnSupported Literal"); + } + + return optiqLiteral; + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/SqlFunctionConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/SqlFunctionConverter.java new file mode 100644 index 0000000..629248e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/SqlFunctionConverter.java @@ -0,0 +1,261 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.translator; + +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.ql.exec.FunctionInfo; +import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.ParseDriver; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.eigenbase.reltype.RelDataType; +import org.eigenbase.reltype.RelDataTypeFactory; +import org.eigenbase.sql.SqlAggFunction; +import org.eigenbase.sql.SqlFunction; +import org.eigenbase.sql.SqlFunctionCategory; +import org.eigenbase.sql.SqlKind; +import org.eigenbase.sql.SqlOperator; +import org.eigenbase.sql.fun.SqlStdOperatorTable; +import org.eigenbase.sql.type.OperandTypes; +import org.eigenbase.sql.type.ReturnTypes; +import org.eigenbase.sql.type.SqlReturnTypeInference; +import org.eigenbase.sql.type.SqlTypeName; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; + +public class SqlFunctionConverter { + static final Map operatorMap; + static final Map hiveToOptiq; + static final Map optiqToHiveToken; + + static { + Builder builder = new Builder(); + operatorMap = ImmutableMap.copyOf(builder.operatorMap); + hiveToOptiq = ImmutableMap.copyOf(builder.hiveToOptiq); + optiqToHiveToken = ImmutableMap.copyOf(builder.optiqToHiveToken); + } + + public static SqlOperator getOptiqOperator(GenericUDF hiveUDF) { + return hiveToOptiq.get(getName(hiveUDF)); + } + + public static ASTNode buildAST(SqlOperator op, List children) { + HiveToken hToken = optiqToHiveToken.get(op); + ASTNode node; + if (hToken != null) { + node = (ASTNode) ParseDriver.adaptor.create(hToken.type, hToken.text); + } else { + node = (ASTNode) ParseDriver.adaptor.create(HiveParser.TOK_FUNCTION, "TOK_FUNCTION"); + node.addChild((ASTNode) ParseDriver.adaptor.create(HiveParser.Identifier, op.getName())); + } + + for (ASTNode c : children) { + ParseDriver.adaptor.addChild(node, c); + } + return node; + } + + /** + * Build AST for flattened Associative expressions ('and', 'or'). Flattened + * expressions is of the form or[x,y,z] which is originally represented as + * "or[x, or[y, z]]". 
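+ * The recursion below re-nests one operand per call, producing a right-deep binary tree.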
+ */ + public static ASTNode buildAST(SqlOperator op, List children, int i) { + if (i + 1 < children.size()) { + HiveToken hToken = optiqToHiveToken.get(op); + ASTNode curNode = ((ASTNode) ParseDriver.adaptor.create(hToken.type, hToken.text)); + ParseDriver.adaptor.addChild(curNode, children.get(i)); + ParseDriver.adaptor.addChild(curNode, buildAST(op, children, i + 1)); + return curNode; + } else { + return children.get(i); + } + + } + + private static String getName(GenericUDF hiveUDF) { + if (hiveUDF instanceof GenericUDFBridge) { + return ((GenericUDFBridge) hiveUDF).getUdfName(); + } else { + return hiveUDF.getClass().getName(); + } + } + + private static class Builder { + final Map operatorMap = Maps.newHashMap(); + final Map hiveToOptiq = Maps.newHashMap(); + final Map optiqToHiveToken = Maps.newHashMap(); + + Builder() { + registerFunction("concat", SqlStdOperatorTable.CONCAT, null); + registerFunction("substr", SqlStdOperatorTable.SUBSTRING, null); + registerFunction("substring", SqlStdOperatorTable.SUBSTRING, null); + stringFunction("space"); + stringFunction("repeat"); + numericFunction("ascii"); + stringFunction("repeat"); + + numericFunction("size"); + + numericFunction("round"); + registerFunction("floor", SqlStdOperatorTable.FLOOR, null); + registerFunction("sqrt", SqlStdOperatorTable.SQRT, null); + registerFunction("ceil", SqlStdOperatorTable.CEIL, null); + registerFunction("ceiling", SqlStdOperatorTable.CEIL, null); + numericFunction("rand"); + operatorMap.put("abs", SqlStdOperatorTable.ABS); + numericFunction("pmod"); + + numericFunction("ln"); + numericFunction("log2"); + numericFunction("sin"); + numericFunction("asin"); + numericFunction("cos"); + numericFunction("acos"); + registerFunction("log10", SqlStdOperatorTable.LOG10, null); + numericFunction("log"); + numericFunction("exp"); + numericFunction("power"); + numericFunction("pow"); + numericFunction("sign"); + numericFunction("pi"); + numericFunction("degrees"); + numericFunction("atan"); + numericFunction("tan"); + numericFunction("e"); + + registerFunction("upper", SqlStdOperatorTable.UPPER, null); + registerFunction("lower", SqlStdOperatorTable.LOWER, null); + registerFunction("ucase", SqlStdOperatorTable.UPPER, null); + registerFunction("lcase", SqlStdOperatorTable.LOWER, null); + registerFunction("trim", SqlStdOperatorTable.TRIM, null); + stringFunction("ltrim"); + stringFunction("rtrim"); + numericFunction("length"); + + stringFunction("like"); + stringFunction("rlike"); + stringFunction("regexp"); + stringFunction("regexp_replace"); + + stringFunction("regexp_extract"); + stringFunction("parse_url"); + + numericFunction("day"); + numericFunction("dayofmonth"); + numericFunction("month"); + numericFunction("year"); + numericFunction("hour"); + numericFunction("minute"); + numericFunction("second"); + + registerFunction("+", SqlStdOperatorTable.PLUS, hToken(HiveParser.PLUS, "+")); + registerFunction("-", SqlStdOperatorTable.MINUS, hToken(HiveParser.MINUS, "-")); + registerFunction("*", SqlStdOperatorTable.MULTIPLY, hToken(HiveParser.STAR, "*")); + registerFunction("/", SqlStdOperatorTable.DIVIDE, hToken(HiveParser.STAR, "/")); + registerFunction("%", SqlStdOperatorTable.MOD, hToken(HiveParser.STAR, "%")); + numericFunction("div"); + + numericFunction("isnull"); + numericFunction("isnotnull"); + + numericFunction("if"); + numericFunction("in"); + registerFunction("and", SqlStdOperatorTable.AND, hToken(HiveParser.KW_AND, "and")); + registerFunction("or", SqlStdOperatorTable.OR, hToken(HiveParser.KW_OR, "or")); 
+ registerFunction("=", SqlStdOperatorTable.EQUALS, hToken(HiveParser.EQUAL, "=")); +// numericFunction("=="); + numericFunction("<=>"); + numericFunction("!="); + + numericFunction("<>"); + registerFunction("<", SqlStdOperatorTable.LESS_THAN, hToken(HiveParser.LESSTHAN, "<")); + registerFunction("<=", SqlStdOperatorTable.LESS_THAN_OR_EQUAL, + hToken(HiveParser.LESSTHANOREQUALTO, "<=")); + registerFunction(">", SqlStdOperatorTable.GREATER_THAN, hToken(HiveParser.GREATERTHAN, ">")); + registerFunction(">=", SqlStdOperatorTable.GREATER_THAN_OR_EQUAL, + hToken(HiveParser.GREATERTHANOREQUALTO, ">=")); + numericFunction("not"); + registerFunction("!", SqlStdOperatorTable.NOT, hToken(HiveParser.KW_NOT, "not")); + numericFunction("between"); + + registerFunction("case", SqlStdOperatorTable.CASE, null); + numericFunction("when"); + + // implicit convert methods + numericFunction(serdeConstants.BOOLEAN_TYPE_NAME); + numericFunction(serdeConstants.TINYINT_TYPE_NAME); + numericFunction(serdeConstants.SMALLINT_TYPE_NAME); + numericFunction(serdeConstants.INT_TYPE_NAME); + numericFunction(serdeConstants.BIGINT_TYPE_NAME); + numericFunction(serdeConstants.FLOAT_TYPE_NAME); + numericFunction(serdeConstants.DOUBLE_TYPE_NAME); + stringFunction(serdeConstants.STRING_TYPE_NAME); + } + + private void stringFunction(String name) { + registerFunction(name, SqlFunctionCategory.STRING, ReturnTypes.explicit(SqlTypeName.VARCHAR)); + } + + private void numericFunction(String name) { + registerFunction(name, SqlFunctionCategory.NUMERIC, ReturnTypes.explicit(SqlTypeName.DECIMAL)); + } + + private void registerFunction(String name, SqlFunctionCategory cat, SqlReturnTypeInference rti) { + SqlOperator optiqFn = new SqlFunction(name.toUpperCase(), SqlKind.OTHER_FUNCTION, rti, null, + null, cat); + registerFunction(name, optiqFn, null); + } + + private void registerFunction(String name, SqlOperator optiqFn, HiveToken hiveToken) { + FunctionInfo hFn = FunctionRegistry.getFunctionInfo(name); + operatorMap.put(name, optiqFn); + + String hFnName = getName(hFn.getGenericUDF()); + hiveToOptiq.put(hFnName, optiqFn); + if (hiveToken != null) { + optiqToHiveToken.put(optiqFn, hiveToken); + } + } + } + + private static HiveToken hToken(int type, String text) { + return new HiveToken(type, text); + } + + static class HiveToken { + int type; + String text; + + HiveToken(int type, String text) { + this.type = type; + this.text = text; + } + } + + static SqlAggFunction hiveAggFunction(String name) { + return new HiveAggFunction(name); + } + + static class HiveAggFunction extends SqlAggFunction { + + public HiveAggFunction(String name) { + super(name, SqlKind.OTHER_FUNCTION, ReturnTypes.BIGINT, null, + OperandTypes.ANY, SqlFunctionCategory.NUMERIC); + } + + public List getParameterTypes(RelDataTypeFactory typeFactory) { + return ImmutableList.of(typeFactory.createSqlType(SqlTypeName.ANY)); + } + + public RelDataType getReturnType(RelDataTypeFactory typeFactory) { + return typeFactory.createSqlType(SqlTypeName.BIGINT); + } + + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/TypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/TypeConverter.java new file mode 100644 index 0000000..0715e70 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/optiq/translator/TypeConverter.java @@ -0,0 +1,152 @@ +package org.apache.hadoop.hive.ql.optimizer.optiq.translator; + +import java.util.LinkedList; +import java.util.List; + +import 
org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; +import org.eigenbase.relopt.RelOptCluster; +import org.eigenbase.reltype.RelDataType; +import org.eigenbase.reltype.RelDataTypeFactory; +import org.eigenbase.rex.RexBuilder; +import org.eigenbase.sql.type.SqlTypeName; + +import com.google.common.base.Function; +import com.google.common.collect.Lists; + +public class TypeConverter { + + public static RelDataType getType(RelOptCluster cluster, RowResolver rr, List neededCols) { + RexBuilder rexBuilder = cluster.getRexBuilder(); + RelDataTypeFactory dtFactory = rexBuilder.getTypeFactory(); + RowSchema rs = rr.getRowSchema(); + List fieldTypes = new LinkedList(); + List fieldNames = new LinkedList(); + + for (ColumnInfo ci : rs.getSignature()) { + if (neededCols == null || neededCols.contains(ci.getInternalName())) { + fieldTypes.add(convert(ci.getType(), dtFactory)); + fieldNames.add(ci.getInternalName()); + } + } + return dtFactory.createStructType(fieldTypes, fieldNames); + } + + public static RelDataType convert(TypeInfo type, RelDataTypeFactory dtFactory) { + RelDataType convertedType = null; + + switch (type.getCategory()) { + case PRIMITIVE: + convertedType = convert((PrimitiveTypeInfo) type, dtFactory); + break; + case LIST: + convertedType = convert((ListTypeInfo) type, dtFactory); + break; + case MAP: + convertedType = convert((MapTypeInfo) type, dtFactory); + break; + case STRUCT: + convertedType = convert((StructTypeInfo) type, dtFactory); + break; + case UNION: + convertedType = convert((UnionTypeInfo) type, dtFactory); + break; + } + return convertedType; + } + + public static RelDataType convert(PrimitiveTypeInfo type, RelDataTypeFactory dtFactory) { + RelDataType convertedType = null; + + switch (type.getPrimitiveCategory()) { + case VOID: + // @todo: followup on VOID type in hive + convertedType = dtFactory.createSqlType(SqlTypeName.OTHER); + break; + case BOOLEAN: + convertedType = dtFactory.createSqlType(SqlTypeName.BOOLEAN); + break; + case BYTE: + convertedType = dtFactory.createSqlType(SqlTypeName.TINYINT); + break; + case SHORT: + convertedType = dtFactory.createSqlType(SqlTypeName.SMALLINT); + break; + case INT: + convertedType = dtFactory.createSqlType(SqlTypeName.INTEGER); + break; + case LONG: + convertedType = dtFactory.createSqlType(SqlTypeName.BIGINT); + break; + case FLOAT: + convertedType = dtFactory.createSqlType(SqlTypeName.FLOAT); + break; + case DOUBLE: + convertedType = dtFactory.createSqlType(SqlTypeName.DOUBLE); + break; + case STRING: + convertedType = dtFactory.createSqlType(SqlTypeName.VARCHAR, 1); + break; + case DATE: + convertedType = dtFactory.createSqlType(SqlTypeName.DATE); + break; + case TIMESTAMP: + convertedType = dtFactory.createSqlType(SqlTypeName.TIMESTAMP); + break; + case BINARY: + convertedType = dtFactory.createSqlType(SqlTypeName.BINARY); + break; + case DECIMAL: + convertedType = dtFactory.createSqlType(SqlTypeName.DECIMAL); + break; + case VARCHAR: + convertedType = dtFactory.createSqlType(SqlTypeName.VARCHAR, + 
((BaseCharTypeInfo) type).getLength()); + break; + case CHAR: + convertedType = dtFactory.createSqlType(SqlTypeName.CHAR, + ((BaseCharTypeInfo) type).getLength()); + break; + case UNKNOWN: + convertedType = dtFactory.createSqlType(SqlTypeName.OTHER); + break; + } + + return convertedType; + } + + public static RelDataType convert(ListTypeInfo lstType, RelDataTypeFactory dtFactory) { + RelDataType elemType = convert(lstType.getListElementTypeInfo(), dtFactory); + return dtFactory.createArrayType(elemType, -1); + } + + public static RelDataType convert(MapTypeInfo mapType, RelDataTypeFactory dtFactory) { + RelDataType keyType = convert(mapType.getMapKeyTypeInfo(), dtFactory); + RelDataType valueType = convert(mapType.getMapValueTypeInfo(), dtFactory); + return dtFactory.createMapType(keyType, valueType); + } + + public static RelDataType convert(StructTypeInfo structType, final RelDataTypeFactory dtFactory) { + List fTypes = Lists.transform(structType.getAllStructFieldTypeInfos(), + new Function() { + public RelDataType apply(TypeInfo tI) { + return convert(tI, dtFactory); + } + }); + return dtFactory.createStructType(fTypes, structType.getAllStructFieldNames()); + } + + public static RelDataType convert(UnionTypeInfo unionType, RelDataTypeFactory dtFactory) { + // @todo what do we about unions? + throw new UnsupportedOperationException(); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java index 5b77e6f..9a9305d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java @@ -118,7 +118,7 @@ public void setPartKeyType(String partKeyType, int index) { } public ColumnStatsSemanticAnalyzer(HiveConf conf) throws SemanticException { - super(conf); + super(conf, false); } private boolean shouldRewrite(ASTNode tree) { @@ -504,7 +504,7 @@ private ASTNode genRewrittenTree(String rewrittenQuery) throws SemanticException } public ColumnStatsSemanticAnalyzer(HiveConf conf, ASTNode tree) throws SemanticException { - super(conf); + super(conf, false); // check if it is no scan. grammar prevents coexit noscan/columns super.processNoScanCommand(tree); // check if it is partial scan. grammar prevents coexit partialscan/columns diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java index 52c39c0..a24cad9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java @@ -131,7 +131,7 @@ public String getErrorMessage(RecognitionException e, String[] tokenNames) { * so that the graph walking algorithms and the rules framework defined in * ql.lib can be used with the AST Nodes. */ - static final TreeAdaptor adaptor = new CommonTreeAdaptor() { + public static final TreeAdaptor adaptor = new CommonTreeAdaptor() { /** * Creates an ASTNode for the given token. The ASTNode is a wrapper around * antlr's CommonTree class that implements the Node interface. 
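The ParseDriver change above, widening adaptor to public, exists so that SqlFunctionConverter can manufacture ASTNodes outside the parser package. The following minimal sketch shows that usage; it is not part of the patch, and the class name, the standalone main method, and the identifier operands are invented for illustration. It simply exercises SqlFunctionConverter.buildAST to re-nest a flattened OR(x, y, z) call back into Hive's binary or(x, or(y, z)) AST.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.optimizer.optiq.translator.SqlFunctionConverter;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.eigenbase.sql.fun.SqlStdOperatorTable;

public class FlattenedOrAstSketch {
  public static void main(String[] args) {
    // Operand ASTs stand in for already-translated child expressions.
    ASTNode x = (ASTNode) ParseDriver.adaptor.create(HiveParser.Identifier, "x");
    ASTNode y = (ASTNode) ParseDriver.adaptor.create(HiveParser.Identifier, "y");
    ASTNode z = (ASTNode) ParseDriver.adaptor.create(HiveParser.Identifier, "z");
    List<ASTNode> children = Arrays.asList(x, y, z);

    // or[x, y, z] becomes (or x (or y z)), one nesting level per recursive call.
    ASTNode nested = SqlFunctionConverter.buildAST(SqlStdOperatorTable.OR, children, 0);
    System.out.println(nested.toStringTree());
  }
}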
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 77305ff..4413504 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -100,7 +100,10 @@ import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.optimizer.CostBasedOptimizer; import org.apache.hadoop.hive.ql.optimizer.Optimizer; +import org.apache.hadoop.hive.ql.optimizer.PreCBOOptimizer; +import org.apache.hadoop.hive.ql.optimizer.optiq.stats.CBOTableStatsValidator; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec.SpecType; import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression; @@ -258,6 +261,9 @@ //flag for partial scan during analyze ... compute statistics protected boolean partialscan = false; + private volatile boolean runCBO = true; + private volatile boolean disableJoinMerge = false; + /* * Capture the CTE definitions in a Query. */ @@ -272,6 +278,11 @@ int nextNum; } + protected SemanticAnalyzer(HiveConf conf, boolean runCBO) throws SemanticException { + this(conf); + this.runCBO = runCBO; + } + public SemanticAnalyzer(HiveConf conf) throws SemanticException { super(conf); @@ -322,7 +333,28 @@ protected void reset() { opParseCtx.clear(); groupOpToInputTables.clear(); prunedPartitions.clear(); + disableJoinMerge = false; aliasToCTEs.clear(); + topToTable.clear(); + opToPartPruner.clear(); + opToPartList.clear(); + opToPartToSkewedPruner.clear(); + opToSamplePruner.clear(); + nameToSplitSample.clear(); + fsopToTable.clear(); + resultSchema = null; + createVwDesc = null; + viewsExpanded = null; + viewSelect = null; + ctesExpanded = null; + globalLimitCtx.disableOpt(); + viewAliasToInput.clear(); + reduceSinkOperatorsAddedByEnforceBucketingSorting.clear(); + topToTableProps.clear(); + listMapJoinOpsNoReducer.clear(); + unparseTranslator.clear(); + queryProperties.clear(); + outputs.clear(); } public void initParseCtx(ParseContext pctx) { @@ -971,7 +1003,6 @@ public boolean doPhase1(ASTNode ast, QB qb, Phase1Ctx ctx_1) frm.getToken().getType() == HiveParser.TOK_LATERAL_VIEW_OUTER) { processLateralView(qb, frm); } else if (isJoinToken(frm)) { - queryProperties.setHasJoin(true); processJoin(qb, frm); qbp.setJoinExpr(frm); }else if(frm.getToken().getType() == HiveParser.TOK_PTBLFUNCTION){ @@ -1188,6 +1219,10 @@ private void getMetaData(QBExpr qbexpr, ReadEntity parentInput) } } + public Table getTable(TableScanOperator ts) { + return topToTable.get(ts); + } + public void getMetaData(QB qb) throws SemanticException { getMetaData(qb, null); } @@ -6582,6 +6617,7 @@ private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, } desc.setNullSafes(nullsafes); } + queryProperties.incrementJoinCount(joinOp.getConf().getNoOuterJoin()); return putOpInsertMap(joinOp, outputRS); } @@ -8944,7 +8980,9 @@ public Operator genPlan(QB qb) throws SemanticException { aliasToOpInfo ); } } - mergeJoinTree(qb); + + if (!disableJoinMerge) + mergeJoinTree(qb); } // if any filters are present in the join tree, push them on top of the @@ -9209,6 +9247,19 @@ public void analyzeInternal(ASTNode ast) throws SemanticException { getMetaData(qb); LOG.info("Completed getting MetaData in Semantic Analysis"); + if 
(runCBO) { + boolean tokenTypeIsQuery = ast.getToken().getType() == HiveParser.TOK_QUERY + || ast.getToken().getType() == HiveParser.TOK_EXPLAIN; + if (!tokenTypeIsQuery || createVwDesc != null + || !HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED)) { + runCBO = false; + } + + if (runCBO) { + disableJoinMerge = true; + } + } + // Save the result schema derived from the sink operator produced // by genPlan. This has the correct column names, which clients // such as JDBC would prefer instead of the c0, c1 we'll end @@ -9221,6 +9272,96 @@ public void analyzeInternal(ASTNode ast) throws SemanticException { resultSchema = convertRowSchemaToResultSetSchema(opParseCtx.get(sinkOp).getRowResolver(), HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES)); + if (runCBO) { + /* + * For CBO: 1. Check if CBO can handle op tree. 2. Run PreCBOOptimizer on + * Plan. This applies: Partition Pruning, Predicate Pushdown, Column + * Pruning and Stats Annotation transformations on the generated plan. 3. + * Validate that all TS has valid stats 4. Hand the Plan to CBO, which + * searches the Plan space and returns the best Plan as an AST 5. We then + * run the Analysis Pipeline on the new AST: Phase 1, Get Metadata, Gen + * Plan. a. During Plan Generation, we disable Join Merging, because we + * don't want the Join order to be changed. Error Handling: On Failure - + * we restart the Analysis from the beginning on the original AST, with + * runCBO set to false. + */ + boolean reAnalyzeAST = false; + + try { + // 1. Can CBO handle OP tree + if (CostBasedOptimizer.canHandleOpTree(sinkOp, conf, queryProperties)) { + ASTNode newAST = null; + + // 2. Set up parse ctx for CBO + ParseContext pCtx = new ParseContext(conf, qb, child, opToPartPruner, opToPartList, + topOps, topSelOps, opParseCtx, joinContext, smbMapJoinContext, topToTable, + topToTableProps, fsopToTable, loadTableWork, loadFileWork, ctx, idToTableNameMap, + destTableId, uCtx, listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions, + opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, + opToPartToSkewedPruner, viewAliasToInput, + reduceSinkOperatorsAddedByEnforceBucketingSorting, queryProperties); + + // 3. Run Pre CBO optimizer + PreCBOOptimizer preCBOOptm = new PreCBOOptimizer(); + preCBOOptm.setPctx(pCtx); + preCBOOptm.initialize(conf); + pCtx = preCBOOptm.optimize(); + + // 4. Validate Table Stats + CBOTableStatsValidator tableStatsValidator = new CBOTableStatsValidator(); + if (tableStatsValidator.validStats(sinkOp, pCtx)) { + + // 5. Optimize the plan with CBO & generate optimized AST + newAST = CostBasedOptimizer.optimize(sinkOp, this, pCtx, resultSchema); + if (LOG.isDebugEnabled()) { + String newAstExpanded = newAST.dump(); + LOG.debug("CBO rewritten query: \n" + newAstExpanded); + } + + // 6. Regen OP plan from optimized AST + init(); + ctx_1 = initPhase1Ctx(); + if (!doPhase1(newAST, qb, ctx_1)) { + throw new RuntimeException("Couldn't do phase1 on CBO optimized query plan"); + } + getMetaData(qb); + + disableJoinMerge = true; + sinkOp = genPlan(qb); + + /* + * Use non CBO Result Set Schema so as to preserve user specified + * names. Hive seems to have bugs with OB/LIMIT in sub queries. + * // 7. Reset result set schema resultSchema = + * convertRowSchemaToResultSetSchema(opParseCtx.get(sinkOp) + * .getRowResolver(), true); + */ + } else { + reAnalyzeAST = true; + LOG.warn("Skipping CBO. 
Incomplete column stats for Tables: " + + tableStatsValidator.getIncompleteStatsTabNames()); + } + } else { + // Need to regen OP tree since join merge was disabled. + // TODO: can we just regen OP tree instead of reanalyzing AST. + if (queryProperties.getJoinCount() > 1) + reAnalyzeAST = true; + LOG.info("Skipping CBO as CBO can not handle OP tree."); + } + } catch (Exception e) { + reAnalyzeAST = true; + LOG.warn("CBO failed, skipping CBO. ", e); + } finally { + runCBO = false; + disableJoinMerge = false; + if (reAnalyzeAST) { + init(); + analyzeInternal(ast); + return; + } + } + } + ParseContext pCtx = new ParseContext(conf, qb, child, opToPartPruner, opToPartList, topOps, topSelOps, opParseCtx, joinContext, smbMapJoinContext, topToTable, topToTableProps, fsopToTable, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/UnparseTranslator.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/UnparseTranslator.java index c69f747..9ad6714 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/UnparseTranslator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/UnparseTranslator.java @@ -262,4 +262,10 @@ public String toString() { ASTNode targetNode; ASTNode sourceNode; } + + public void clear() { + translations.clear(); + copyTranslations.clear(); + enabled = false; + } }
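For reference, the gate that analyzeInternal applies before attempting CBO (shown inline above) reads in isolation roughly as follows. This is a simplified restatement for illustration, not code from the patch; the helper name and its boolean parameter are invented. On any failure or unsupported plan, the analyzer falls back by re-running analyzeInternal on the original AST with CBO disabled.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.HiveParser;

final class CboGateSketch {
  // CBO runs only for plain queries or EXPLAIN, never for view creation,
  // and only when the HIVE_CBO_ENABLED flag is on.
  static boolean shouldRunCbo(HiveConf conf, ASTNode ast, boolean creatingView) {
    int tokenType = ast.getToken().getType();
    boolean tokenTypeIsQuery = tokenType == HiveParser.TOK_QUERY
        || tokenType == HiveParser.TOK_EXPLAIN;
    return tokenTypeIsQuery
        && !creatingView
        && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED);
  }
}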