Index: pom.xml =================================================================== --- pom.xml (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ pom.xml (working copy) @@ -100,7 +100,7 @@ 3.4 1.7.5 0.8.0.RELEASE - 1.1.0-incubating + 1.2.0-incubating-SNAPSHOT 3.2.6 3.2.10 3.2.9 Property changes on: hbase-handler/pom.xml ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /hive/branches/cbo/hbase-handler/pom.xml:r1605012-1627125 Merged /hive/trunk/hbase-handler/pom.xml:r1605012-1660746 Index: metastore/bin/.gitignore =================================================================== --- metastore/bin/.gitignore (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ metastore/bin/.gitignore (working copy) @@ -1 +1 @@ -# Dummy file to make Git recognize this empty directory +/src/ Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java =================================================================== --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy) @@ -702,6 +702,9 @@ // CBO related HIVE_CBO_ENABLED("hive.cbo.enable", true, "Flag to control enabling Cost Based Optimizations using Calcite framework."), + HIVE_CBO_RETPATH_HIVEOP("hive.cbo.returnpath.hiveop", false, "Flag to control calcite plan to hive operator conversion"), + EXTENDED_COST_MODEL("hive.cbo.costmodel.extended", false, "Flag to control enabling the extended cost model based on" + + "CPU, IO and cardinality. Otherwise, the cost model is based on cardinality."), // hive.mapjoin.bucket.cache.size has been replaced by hive.smbjoin.cache.row, // need to remove by hive .13. Also, do not change default (see SMB operator) Index: ql/src/test/queries/clientpositive/cbo_join.q =================================================================== --- ql/src/test/queries/clientpositive/cbo_join.q (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/test/queries/clientpositive/cbo_join.q (working copy) @@ -4,6 +4,7 @@ set hive.stats.fetch.column.stats=true; set hive.auto.convert.join=false; +-- SORT_QUERY_RESULTS -- 4. Test Select + Join + TS select cbo_t1.c_int, cbo_t2.c_int from cbo_t1 join cbo_t2 on cbo_t1.key=cbo_t2.key; select cbo_t1.key from cbo_t1 join cbo_t3; Index: ql/src/test/results/clientpositive/cbo_join.q.out =================================================================== --- ql/src/test/results/clientpositive/cbo_join.q.out (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/test/results/clientpositive/cbo_join.q.out (working copy) @@ -1,4 +1,5 @@ -PREHOOK: query: -- 4. Test Select + Join + TS +PREHOOK: query: -- SORT_QUERY_RESULTS +-- 4. Test Select + Join + TS select cbo_t1.c_int, cbo_t2.c_int from cbo_t1 join cbo_t2 on cbo_t1.key=cbo_t2.key PREHOOK: type: QUERY PREHOOK: Input: default@cbo_t1 @@ -6,7 +7,8 @@ PREHOOK: Input: default@cbo_t2 PREHOOK: Input: default@cbo_t2@dt=2014 #### A masked pattern was here #### -POSTHOOK: query: -- 4. Test Select + Join + TS +POSTHOOK: query: -- SORT_QUERY_RESULTS +-- 4. 
Test Select + Join + TS select cbo_t1.c_int, cbo_t2.c_int from cbo_t1 join cbo_t2 on cbo_t1.key=cbo_t2.key POSTHOOK: type: QUERY POSTHOOK: Input: default@cbo_t1 @@ -122,46 +124,6 @@ POSTHOOK: Input: default@cbo_t1@dt=2014 POSTHOOK: Input: default@cbo_t3 #### A masked pattern was here #### -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL -NULL 1 1 1 @@ -522,6 +484,46 @@ 1 1 1 +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL +NULL PREHOOK: query: select cbo_t1.key from cbo_t1 join cbo_t3 where cbo_t1.key=cbo_t3.key and cbo_t1.key >= 1 PREHOOK: type: QUERY PREHOOK: Input: default@cbo_t1 @@ -632,8 +634,6 @@ POSTHOOK: Input: default@cbo_t2 POSTHOOK: Input: default@cbo_t2@dt=2014 #### A masked pattern was here #### -NULL NULL -NULL NULL 1 1 1 1 1 1 @@ -730,6 +730,8 @@ 1 1 1 1 1 1 +NULL NULL +NULL NULL PREHOOK: query: select cbo_t1.c_int, cbo_t2.c_int from cbo_t1 right outer join cbo_t2 on cbo_t1.key=cbo_t2.key PREHOOK: type: QUERY PREHOOK: Input: default@cbo_t1 @@ -744,8 +746,6 @@ POSTHOOK: Input: default@cbo_t2 POSTHOOK: Input: default@cbo_t2@dt=2014 #### A masked pattern was here #### -NULL NULL -NULL NULL 1 1 1 1 1 1 @@ -847,6 +847,8 @@ NULL 2 NULL 2 NULL 2 +NULL NULL +NULL NULL PREHOOK: query: select cbo_t1.c_int, cbo_t2.c_int from cbo_t1 full outer join cbo_t2 on cbo_t1.key=cbo_t2.key PREHOOK: type: QUERY PREHOOK: Input: default@cbo_t1 @@ -861,10 +863,6 @@ POSTHOOK: Input: default@cbo_t2 POSTHOOK: Input: default@cbo_t2@dt=2014 #### A masked pattern was here #### -NULL NULL -NULL NULL -NULL NULL -NULL NULL 1 1 1 1 1 1 @@ -966,6 +964,10 @@ NULL 2 NULL 2 NULL 2 +NULL NULL +NULL NULL +NULL NULL +NULL NULL PREHOOK: query: select b, cbo_t1.c, cbo_t2.p, q, cbo_t3.c_int from (select key as a, c_int as b, cbo_t1.c_float as c from cbo_t1) cbo_t1 join (select cbo_t2.key as p, cbo_t2.c_int as q, c_float as r from cbo_t2) cbo_t2 on cbo_t1.a=p join cbo_t3 on cbo_t1.a=key PREHOOK: type: QUERY PREHOOK: Input: default@cbo_t1 @@ -5334,8 +5336,6 @@ POSTHOOK: Input: default@cbo_t2@dt=2014 POSTHOOK: Input: default@cbo_t3 #### A masked pattern was here #### -NULL NULL NULL NULL -NULL NULL NULL NULL 1 1 1 1 1 1 1 1 1 1 1 1 @@ -5870,6 +5870,8 @@ NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +NULL NULL NULL NULL +NULL NULL NULL NULL PREHOOK: query: select b, cbo_t1.c, cbo_t2.p, q, cbo_t3.c_int from (select key as a, c_int as b, cbo_t1.c_float as c from cbo_t1) cbo_t1 full outer join (select cbo_t2.key as p, cbo_t2.c_int as q, c_float as r from cbo_t2) cbo_t2 on cbo_t1.a=p join cbo_t3 on cbo_t1.a=key PREHOOK: type: QUERY PREHOOK: Input: default@cbo_t1 @@ -6430,8 +6432,6 @@ POSTHOOK: Input: default@cbo_t2@dt=2014 POSTHOOK: Input: default@cbo_t3 #### A masked pattern was here #### -NULL NULL NULL NULL -NULL NULL NULL NULL 1 1 1 1 1 1 1 1 1 1 1 1 @@ -6966,6 +6966,8 @@ NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +NULL NULL NULL NULL +NULL NULL NULL NULL PREHOOK: query: -- 5. 
Test Select + Join + FIL + TS select cbo_t1.c_int, cbo_t2.c_int from cbo_t1 join cbo_t2 on cbo_t1.key=cbo_t2.key where (cbo_t1.c_int + cbo_t2.c_int == 2) and (cbo_t1.c_int > 0 or cbo_t2.c_float >= 0) PREHOOK: type: QUERY Index: ql/src/java/org/apache/hadoop/hive/ql/parse/UnparseTranslator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/UnparseTranslator.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/UnparseTranslator.java (working copy) @@ -38,7 +38,7 @@ * SemanticAnalyzer.saveViewDefinition() calls TokenRewriteStream.toString(). * */ -class UnparseTranslator { +public class UnparseTranslator { // key is token start index private final NavigableMap translations; private final List copyTranslations; Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -229,9 +229,9 @@ private HashMap opToPartPruner; private HashMap opToPartList; - private HashMap> topOps; - private final HashMap> topSelOps; - private final LinkedHashMap, OpParseContext> opParseCtx; + protected HashMap> topOps; + private HashMap> topSelOps; + protected LinkedHashMap, OpParseContext> opParseCtx; private List loadTableWork; private List loadFileWork; private final Map joinContext; @@ -258,7 +258,7 @@ private CreateViewDesc createVwDesc; private ArrayList viewsExpanded; private ASTNode viewSelect; - private final UnparseTranslator unparseTranslator; + protected final UnparseTranslator unparseTranslator; private final GlobalLimitCtx globalLimitCtx; // prefix for column names auto generated by hive @@ -478,7 +478,7 @@ wExprsInDest.containsKey(wFnSpec.getExpression().toStringTree())) { continue; } - wFnSpec.setAlias("_wcol" + wColIdx); + wFnSpec.setAlias(wFnSpec.getName() + "_window_" + wColIdx); spec.addWindowFunction(wFnSpec); qb.getParseInfo().addWindowingExprToClause(dest, wFnSpec.getExpression()); } @@ -3448,7 +3448,7 @@ return ret; } - private int setBit(int bitmap, int bitIdx) { + public static int setBit(int bitmap, int bitIdx) { return bitmap | (1 << bitIdx); } @@ -3984,10 +3984,10 @@ /** * Class to store GenericUDAF related information. */ - static class GenericUDAFInfo { - ArrayList convertedParameters; - GenericUDAFEvaluator genericUDAFEvaluator; - TypeInfo returnType; + public static class GenericUDAFInfo { + public ArrayList convertedParameters; + public GenericUDAFEvaluator genericUDAFEvaluator; + public TypeInfo returnType; } /** @@ -4028,7 +4028,7 @@ * Returns the GenericUDAFEvaluator for the aggregation. This is called once * for each GroupBy aggregation. */ - static GenericUDAFEvaluator getGenericUDAFEvaluator(String aggName, + public static GenericUDAFEvaluator getGenericUDAFEvaluator(String aggName, ArrayList aggParameters, ASTNode aggTree, boolean isDistinct, boolean isAllColumns) throws SemanticException { @@ -4058,7 +4058,7 @@ * @throws SemanticException * when the UDAF is not found or has problems. 
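   * Reviewer note (hedged): this helper, together with the surrounding SemanticAnalyzer members
   * touched in this hunk (topOps, opParseCtx, unparseTranslator, setBit, GenericUDAFInfo,
   * getGenericUDAFEvaluator, groupByDescModeToUDAFMode, isConstantParameterInAggregationParameters,
   * genFileSinkPlan, getAliasId), is widened to public/protected or made static, presumably so the
   * new CBO return path (CalcitePlanner / HiveOpConverter) can reuse the existing plan-building
   * code instead of duplicating it.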
*/ - static GenericUDAFInfo getGenericUDAFInfo(GenericUDAFEvaluator evaluator, + public static GenericUDAFInfo getGenericUDAFInfo(GenericUDAFEvaluator evaluator, GenericUDAFEvaluator.Mode emode, ArrayList aggParameters) throws SemanticException { @@ -4087,7 +4087,7 @@ return r; } - static GenericUDAFEvaluator.Mode groupByDescModeToUDAFMode( + public static GenericUDAFEvaluator.Mode groupByDescModeToUDAFMode( GroupByDesc.Mode mode, boolean isDistinct) { switch (mode) { case COMPLETE: @@ -4130,7 +4130,7 @@ * @return the ExprNodeDesc of the constant parameter if the given internalName represents * a constant parameter; otherwise, return null */ - private ExprNodeDesc isConstantParameterInAggregationParameters(String internalName, + public static ExprNodeDesc isConstantParameterInAggregationParameters(String internalName, List reduceValues) { // only the pattern of "VALUE._col([0-9]+)" should be handled. @@ -5577,7 +5577,7 @@ return false; } - private void checkExpressionsForGroupingSet(List grpByExprs, + void checkExpressionsForGroupingSet(List grpByExprs, List distinctGrpByExprs, Map aggregationTrees, RowResolver inputRowResolver) throws SemanticException { @@ -6131,7 +6131,7 @@ } @SuppressWarnings("nls") - private Operator genFileSinkPlan(String dest, QB qb, Operator input) + protected Operator genFileSinkPlan(String dest, QB qb, Operator input) throws SemanticException { RowResolver inputRR = opParseCtx.get(input).getRowResolver(); @@ -9234,7 +9234,7 @@ return equalsExpr; } - private String getAliasId(String alias, QB qb) { + protected String getAliasId(String alias, QB qb) { return (qb.getId() == null ? alias : qb.getId() + ":" + alias).toLowerCase(); } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java (working copy) @@ -49,8 +49,8 @@ import org.apache.calcite.plan.hep.HepProgramBuilder; import org.apache.calcite.rel.InvalidRelException; import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelCollationImpl; import org.apache.calcite.rel.RelCollations; -import org.apache.calcite.rel.RelCollationImpl; import org.apache.calcite.rel.RelFieldCollation; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Aggregate; @@ -58,8 +58,10 @@ import org.apache.calcite.rel.core.Filter; import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.RelFactories; import org.apache.calcite.rel.core.SemiJoin; +import org.apache.calcite.rel.core.Sort; import org.apache.calcite.rel.metadata.CachingRelMetadataProvider; import org.apache.calcite.rel.metadata.ChainedRelMetadataProvider; import org.apache.calcite.rel.metadata.RelMetadataProvider; @@ -117,6 +119,7 @@ import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveConfigContext; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveDefaultRelMetadataProvider; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl; import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; @@ -135,8 +138,11 @@ import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterProjectTransposeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterSetOpTransposeRule; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveInsertExchange4JoinRule; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinAddNotNullRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePartitionPruneRule; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.JoinCondTypeCheckProcFactory; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.JoinTypeCheckCtx; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.RexNodeConverter; @@ -175,6 +181,7 @@ import com.google.common.collect.Lists; public class CalcitePlanner extends SemanticAnalyzer { + private final AtomicInteger noColsMissingStats = new AtomicInteger(0); private List topLevelFieldSchema; private SemanticException semanticException; @@ -218,13 +225,16 @@ if (cboCtx.type == PreCboCtx.Type.CTAS) { queryForCbo = cboCtx.nodeOfInterest; // nodeOfInterest is the query } - runCBO = canHandleAstForCbo(queryForCbo, getQB(), cboCtx); + runCBO = canCBOHandleAst(queryForCbo, getQB(), cboCtx); if (runCBO) { disableJoinMerge = true; boolean reAnalyzeAST = false; try { + if (this.conf.getBoolVar(HiveConf.ConfVars.HIVE_CBO_RETPATH_HIVEOP)) { + sinkOp = getOptimizedHiveOPDag(); + } else { // 1. Gen Optimized AST ASTNode newAST = getOptimizedAST(); @@ -252,6 +262,7 @@ LOG.info("CBO Succeeded; optimized logical plan."); this.ctx.setCboInfo("Plan optimized by CBO."); LOG.debug(newAST.dump()); + } } catch (Exception e) { boolean isMissingStats = noColsMissingStats.get() > 0; if (isMissingStats) { @@ -324,7 +335,7 @@ * If top level QB is query then everything below it must also be * Query. */ - boolean canHandleAstForCbo(ASTNode ast, QB qb, PreCboCtx cboCtx) { + boolean canCBOHandleAst(ASTNode ast, QB qb, PreCboCtx cboCtx) { int root = ast.getToken().getType(); boolean needToLogMessage = STATIC_LOG.isInfoEnabled(); boolean isSupportedRoot = root == HiveParser.TOK_QUERY || root == HiveParser.TOK_EXPLAIN @@ -598,6 +609,57 @@ return optiqOptimizedAST; } + /** + * Get Optimized Hive Operator DAG for the given QB tree in the semAnalyzer. 
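   * Reviewer note (hedged): when hive.cbo.returnpath.hiveop is enabled, the planner calls this
   * method instead of getOptimizedAST(); the optimized Calcite plan is converted straight into a
   * Hive operator tree via HiveOpConverter (with a FileSink generated on top through
   * genFileSinkPlan), skipping the unparse-to-AST / re-analyze round trip used by the default path.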
+ * + * @return Optimized Hive operator tree + * @throws SemanticException + */ + Operator getOptimizedHiveOPDag() throws SemanticException { + RelNode optimizedOptiqPlan = null; + CalcitePlannerAction calcitePlannerAction = new CalcitePlannerAction(prunedPartitions); + + try { + optimizedOptiqPlan = Frameworks.withPlanner(calcitePlannerAction, Frameworks + .newConfigBuilder().typeSystem(new HiveTypeSystemImpl()).build()); + } catch (Exception e) { + rethrowCalciteException(e); + throw new AssertionError("rethrowCalciteException didn't throw for " + e.getMessage()); + } + + RelNode modifiedOptimizedOptiqPlan = introduceProjectIfNeeded(optimizedOptiqPlan); + + Operator hiveRoot = new HiveOpConverter(this, conf, unparseTranslator, topOps, + conf.getVar(HiveConf.ConfVars.HIVEMAPREDMODE).equalsIgnoreCase("strict")).convert(modifiedOptimizedOptiqPlan); + RowResolver hiveRootRR = genRowResolver(hiveRoot, getQB()); + opParseCtx.put(hiveRoot, new OpParseContext(hiveRootRR)); + return genFileSinkPlan(getQB().getParseInfo().getClauseNames().iterator().next(), getQB(), hiveRoot); + } + + private RelNode introduceProjectIfNeeded(RelNode optimizedOptiqPlan) + throws CalciteSemanticException { + RelNode parent = null; + RelNode input = optimizedOptiqPlan; + RelNode newRoot = optimizedOptiqPlan; + + while (!(input instanceof Project) && (input instanceof Sort)) { + parent = input; + input = input.getInput(0); + } + + if (!(input instanceof Project)) { + HiveProject hpRel = HiveProject.create(input, + HiveCalciteUtil.getProjsFromBelowAsInputRef(input), input.getRowType().getFieldNames()); + if (input == optimizedOptiqPlan) { + newRoot = hpRel; + } else { + parent.replaceInput(0, hpRel); + } + } + + return newRoot; + } + /*** * Unwraps Calcite Invocation exceptions coming meta data provider chain and * obtains the real cause. @@ -674,6 +736,24 @@ || t instanceof UndeclaredThrowableException; } + private RowResolver genRowResolver(Operator op, QB qb) { + RowResolver rr = new RowResolver(); + String subqAlias = (qb.getAliases().size() == 1 && qb.getSubqAliases().size() == 1) ? qb + .getAliases().get(0) : null; + + for (ColumnInfo ci : op.getSchema().getSignature()) { + try { + rr.putWithCheck((subqAlias != null) ? subqAlias : ci.getTabAlias(), + ci.getAlias() != null ? ci.getAlias() : ci.getInternalName(), ci.getInternalName(), + new ColumnInfo(ci)); + } catch (SemanticException e) { + throw new RuntimeException(e); + } + } + + return rr; + } + /** * Code responsible for Calcite plan generation and optimization. */ @@ -700,7 +780,8 @@ /* * recreate cluster, so that it picks up the additional traitDef */ - RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(); + HiveConfigContext confContext = new HiveConfigContext(conf); + RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(confContext); final RelOptQuery query = new RelOptQuery(planner); final RexBuilder rexBuilder = cluster.getRexBuilder(); cluster = query.createCluster(rexBuilder.getTypeFactory(), rexBuilder); @@ -719,13 +800,16 @@ throw new RuntimeException(e); } + // Create MD provider + HiveDefaultRelMetadataProvider mdProvider = new HiveDefaultRelMetadataProvider(conf); + // 2. Apply Pre Join Order optimizations calcitePreCboPlan = applyPreJoinOrderingTransforms(calciteGenPlan, - HiveDefaultRelMetadataProvider.INSTANCE); + mdProvider.getMetadataProvider()); // 3. 
Appy Join Order Optimizations using Hep Planner (MST Algorithm) List list = Lists.newArrayList(); - list.add(HiveDefaultRelMetadataProvider.INSTANCE); + list.add(mdProvider.getMetadataProvider()); RelTraitSet desiredTraits = cluster .traitSetOf(HiveRelNode.CONVENTION, RelCollations.EMPTY); @@ -758,6 +842,18 @@ calciteOptimizedPlan = hepPlanner.findBestExp(); + if (HiveConf.getBoolVar(conf, ConfVars.HIVE_CBO_RETPATH_HIVEOP)) { + // run rules to aid in translation from Optiq tree -> Hive tree + hepPgm = new HepProgramBuilder().addMatchOrder(HepMatchOrder.BOTTOM_UP) + .addRuleInstance(new HiveInsertExchange4JoinRule()).build(); + hepPlanner = new HepPlanner(hepPgm); + + hepPlanner.registerMetadataProviders(list); + cluster.setMetadataProvider(new CachingRelMetadataProvider(chainedProvider, hepPlanner)); + hepPlanner.setRoot(calciteOptimizedPlan); + calciteOptimizedPlan = hepPlanner.findBestExp(); + } + if (LOG.isDebugEnabled() && !conf.getBoolVar(ConfVars.HIVE_IN_TEST)) { LOG.debug("CBO Planning details:\n"); LOG.debug("Original Plan:\n" + RelOptUtil.toString(calciteGenPlan)); @@ -789,7 +885,12 @@ basePlan = hepPlan(basePlan, true, mdProvider, SemiJoinJoinTransposeRule.INSTANCE, SemiJoinFilterTransposeRule.INSTANCE, SemiJoinProjectTransposeRule.INSTANCE); - // 2. PPD + // 2. Add not null filters + if (conf.getBoolVar(HiveConf.ConfVars.HIVE_CBO_RETPATH_HIVEOP)) { + basePlan = hepPlan(basePlan, true, mdProvider, HiveJoinAddNotNullRule.INSTANCE); + } + + // 3. PPD basePlan = hepPlan(basePlan, true, mdProvider, ReduceExpressionsRule.PROJECT_INSTANCE, ReduceExpressionsRule.FILTER_INSTANCE, @@ -802,19 +903,19 @@ HiveFilterJoinRule.FILTER_ON_JOIN, new FilterAggregateTransposeRule(Filter.class, HiveFilter.DEFAULT_FILTER_FACTORY, Aggregate.class)); - // 3. Transitive inference & Partition Pruning + // 4. Transitive inference & Partition Pruning basePlan = hepPlan(basePlan, false, mdProvider, new JoinPushTransitivePredicatesRule( Join.class, HiveFilter.DEFAULT_FILTER_FACTORY), new HivePartitionPruneRule(conf)); - // 4. Projection Pruning + // 5. Projection Pruning RelFieldTrimmer fieldTrimmer = new RelFieldTrimmer(null, HiveProject.DEFAULT_PROJECT_FACTORY, HiveFilter.DEFAULT_FILTER_FACTORY, HiveJoin.HIVE_JOIN_FACTORY, RelFactories.DEFAULT_SEMI_JOIN_FACTORY, HiveSort.HIVE_SORT_REL_FACTORY, HiveAggregate.HIVE_AGGR_REL_FACTORY, HiveUnion.UNION_REL_FACTORY); basePlan = fieldTrimmer.trim(basePlan); - // 5. Rerun PPD through Project as column pruning would have introduced DT + // 6. Rerun PPD through Project as column pruning would have introduced DT // above scans basePlan = hepPlan(basePlan, true, mdProvider, new FilterProjectTransposeRule(Filter.class, HiveFilter.DEFAULT_FILTER_FACTORY, @@ -1186,7 +1287,7 @@ } // 2. Get Table Metadata - Table tab = qb.getMetaData().getSrcForAlias(tableAlias); + Table tabMetaData = qb.getMetaData().getSrcForAlias(tableAlias); // 3. 
Get Table Logical Schema (Row Type) // NOTE: Table logical schema = Non Partition Cols + Partition Cols + @@ -1194,7 +1295,7 @@ // 3.1 Add Column info for non partion cols (Object Inspector fields) @SuppressWarnings("deprecation") - StructObjectInspector rowObjectInspector = (StructObjectInspector) tab.getDeserializer() + StructObjectInspector rowObjectInspector = (StructObjectInspector) tabMetaData.getDeserializer() .getObjectInspector(); List fields = rowObjectInspector.getAllStructFieldRefs(); ColumnInfo colInfo; @@ -1216,7 +1317,7 @@ ArrayList partitionColumns = new ArrayList(); // 3.2 Add column info corresponding to partition columns - for (FieldSchema part_col : tab.getPartCols()) { + for (FieldSchema part_col : tabMetaData.getPartCols()) { colName = part_col.getName(); colInfo = new ColumnInfo(colName, TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), tableAlias, true); @@ -1226,6 +1327,7 @@ } // 3.3 Add column info corresponding to virtual columns + List virtualCols = new ArrayList(); Iterator vcs = VirtualColumn.getRegistry(conf).iterator(); while (vcs.hasNext()) { VirtualColumn vc = vcs.next(); @@ -1233,24 +1335,26 @@ vc.getIsHidden()); rr.put(tableAlias, vc.getName(), colInfo); cInfoLst.add(colInfo); + virtualCols.add(vc); } // 3.4 Build row type from field RelDataType rowType = TypeConverter.getType(cluster, rr, null); // 4. Build RelOptAbstractTable - String fullyQualifiedTabName = tab.getDbName(); - if (fullyQualifiedTabName != null && !fullyQualifiedTabName.isEmpty()) - fullyQualifiedTabName = fullyQualifiedTabName + "." + tab.getTableName(); - else - fullyQualifiedTabName = tab.getTableName(); + String fullyQualifiedTabName = tabMetaData.getDbName(); + if (fullyQualifiedTabName != null && !fullyQualifiedTabName.isEmpty()) { + fullyQualifiedTabName = fullyQualifiedTabName + "." + tabMetaData.getTableName(); + } + else { + fullyQualifiedTabName = tabMetaData.getTableName(); + } RelOptHiveTable optTable = new RelOptHiveTable(relOptSchema, fullyQualifiedTabName, - tableAlias, rowType, tab, nonPartitionColumns, partitionColumns, conf, partitionCache, - noColsMissingStats); + rowType, tabMetaData, nonPartitionColumns, partitionColumns, virtualCols, conf, + partitionCache, noColsMissingStats, getAliasId(tableAlias, qb)); // 5. Build Hive Table Scan Rel - tableRel = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable, - rowType); + tableRel = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable, null == tableAlias ? tabMetaData.getTableName() : tableAlias); // 6. Add Schema(RR) to RelNode-Schema map ImmutableMap hiveToCalciteColMap = buildHiveToCalciteColumnMap(rr, @@ -1747,10 +1851,40 @@ private RelNode genGBLogicalPlan(QB qb, RelNode srcRel) throws SemanticException { RelNode gbRel = null; QBParseInfo qbp = getQBParseInfo(qb); + // NOTE: Multi Insert is not supported + String detsClauseName = qbp.getClauseNames().iterator().next(); + List grpByAstExprs = SemanticAnalyzer.getGroupByForClause(qbp, detsClauseName); + HashMap aggregationTrees = qbp.getAggregationExprsForClause(detsClauseName); + // NOTE: Multi Insert is not supported + boolean cubeRollupGrpSetPresent = (!qbp.getDestRollups().isEmpty() + || !qbp.getDestGroupingSets().isEmpty() || !qbp.getDestCubes().isEmpty()); + // 0. 
Sanity check + if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) + && qbp.getDistinctFuncExprsForClause(detsClauseName).size() > 1) { + throw new SemanticException(ErrorMsg.UNSUPPORTED_MULTIPLE_DISTINCTS.getMsg()); + } + if (cubeRollupGrpSetPresent) { + if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) { + throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_AGGR_NOMAPAGGR.getMsg()); + } + + if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { + checkExpressionsForGroupingSet(grpByAstExprs, qb.getParseInfo() + .getDistinctFuncExprsForClause(detsClauseName), aggregationTrees, + this.relToHiveRR.get(srcRel)); + + if (qbp.getDestGroupingSets().size() > conf + .getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY)) { + String errorMsg = "The number of rows per input row due to grouping sets is " + + qbp.getDestGroupingSets().size(); + throw new SemanticException( + ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg)); + } + } + } + // 1. Gather GB Expressions (AST) (GB + Aggregations) - // NOTE: Multi Insert is not supported - String detsClauseName = qbp.getClauseNames().iterator().next(); // Check and transform group by *. This will only happen for select distinct *. // Here the "genSelectPlan" is being leveraged. // The main benefits are (1) remove virtual columns that should @@ -1768,8 +1902,6 @@ qbp.setSelExprForClause(detsClauseName, SemanticAnalyzer.genSelectDIAST(rr)); } } - List grpByAstExprs = SemanticAnalyzer.getGroupByForClause(qbp, detsClauseName); - HashMap aggregationTrees = qbp.getAggregationExprsForClause(detsClauseName); boolean hasGrpByAstExprs = (grpByAstExprs != null && !grpByAstExprs.isEmpty()) ? true : false; boolean hasAggregationTrees = (aggregationTrees != null && !aggregationTrees.isEmpty()) ? true : false; @@ -1802,9 +1934,7 @@ // 4. GroupingSets, Cube, Rollup int groupingColsSize = gbExprNDescLst.size(); List groupingSets = null; - if (!qbp.getDestRollups().isEmpty() - || !qbp.getDestGroupingSets().isEmpty() - || !qbp.getDestCubes().isEmpty()) { + if (cubeRollupGrpSetPresent) { if (qbp.getDestRollups().contains(detsClauseName)) { groupingSets = getGroupingSetsForRollup(grpByAstExprs.size()); } else if (qbp.getDestCubes().contains(detsClauseName)) { @@ -2250,15 +2380,27 @@ } } - return genSelectRelNode(projsForWindowSelOp, out_rwsch, srcRel); + return genSelectRelNode(projsForWindowSelOp, out_rwsch, srcRel, windowExpressions); } private RelNode genSelectRelNode(List calciteColLst, RowResolver out_rwsch, RelNode srcRel) throws CalciteSemanticException { + return genSelectRelNode(calciteColLst, out_rwsch, srcRel, null); + } + + private RelNode genSelectRelNode(List calciteColLst, RowResolver out_rwsch, + RelNode srcRel, List windowExpressions) throws CalciteSemanticException { // 1. Build Column Names Set colNamesSet = new HashSet(); List cInfoLst = out_rwsch.getRowSchema().getSignature(); ArrayList columnNames = new ArrayList(); + Map windowToAlias = null; + if (windowExpressions != null ) { + windowToAlias = new HashMap(); + for (WindowExpressionSpec wes : windowExpressions) { + windowToAlias.put(wes.getExpression().toStringTree().toLowerCase(), wes.getAlias()); + } + } String[] qualifiedColNames; String tmpColAlias; for (int i = 0; i < calciteColLst.size(); i++) { @@ -2276,8 +2418,11 @@ * the names so we don't run into this issue when converting back to * Hive AST. 
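       * Reviewer note (hedged): the new windowExpressions parameter threads the window function
       * specs into genSelectRelNode so that a column produced by a window function is renamed to
       * its "<fnName>_window_<n>" alias (set in SemanticAnalyzer above via wFnSpec.setAlias)
       * rather than keeping the expression's internal string as the column name.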
*/ - if (tmpColAlias.startsWith("_c")) + if (tmpColAlias.startsWith("_c")) { tmpColAlias = "_o_" + tmpColAlias; + } else if (windowToAlias != null && windowToAlias.containsKey(tmpColAlias)) { + tmpColAlias = windowToAlias.get(tmpColAlias); + } int suffix = 1; while (colNamesSet.contains(tmpColAlias)) { tmpColAlias = qualifiedColNames[1] + suffix; @@ -2769,4 +2914,5 @@ return tabAliases; } } + } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/RowSchema.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/RowSchema.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/RowSchema.java (working copy) @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Set; /** @@ -102,6 +103,14 @@ return tableNames; } + public List getColumnNames() { + List columnNames = new ArrayList(); + for (ColumnInfo var : this.signature) { + columnNames.add(var.getInternalName()); + } + return columnNames; + } + @Override public boolean equals(Object obj) { if (!(obj instanceof RowSchema) || (obj == null)) { Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java (working copy) @@ -531,14 +531,15 @@ Operator child = op.getChildOperators().get(0); - List childCols; + List childCols = null; if (child instanceof CommonJoinOperator) { - childCols = cppCtx.getJoinPrunedColLists().get(child) + childCols = cppCtx.getJoinPrunedColLists().get(child) == null + ? 
null : cppCtx.getJoinPrunedColLists().get(child) .get((byte) conf.getTag()); } else { childCols = cppCtx.getPrunedColList(child); + } - } List valCols = conf.getValueCols(); List valColNames = conf.getOutputValueColumnNames(); @@ -749,6 +750,7 @@ conf.setOutputColumnNames(newOutputColumnNames); handleChildren(op, cols, cppCtx); } + return null; } @@ -971,12 +973,12 @@ .getChildOperators(); LOG.info("JOIN " + op.getIdentifier() + " oldExprs: " + conf.getExprs()); + List childColLists = cppCtx.genColLists(op); if (childColLists == null) { return; } - Map> prunedColLists = new HashMap>(); for (byte tag : conf.getTagOrder()) { prunedColLists.put(tag, new ArrayList()); @@ -1076,6 +1078,7 @@ } LOG.info("JOIN " + op.getIdentifier() + " newExprs: " + conf.getExprs()); + op.setColumnExprMap(newColExprMap); conf.setOutputColumnNames(outputCols); op.getSchema().setSignature(rs); Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/NonBlockingOpDeDupProc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/NonBlockingOpDeDupProc.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/NonBlockingOpDeDupProc.java (working copy) @@ -242,4 +242,4 @@ return null; } } -} +} \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy) @@ -142,7 +142,9 @@ if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) { transformations.add(new ReduceSinkDeDuplication()); } + if(!HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_CBO_RETPATH_HIVEOP)) { transformations.add(new NonBlockingOpDeDupProc()); + } if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEIDENTITYPROJECTREMOVER)) { transformations.add(new IdentityProjectRemover()); } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelDistribution.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelDistribution.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelDistribution.java (revision 1672450) @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite; + +import java.util.List; + +import org.apache.calcite.plan.RelOptPlanner; +import org.apache.calcite.plan.RelTrait; +import org.apache.calcite.plan.RelTraitDef; +import org.apache.calcite.rel.RelDistribution; +import org.apache.calcite.rel.RelDistributionTraitDef; +import org.apache.calcite.util.mapping.Mappings.TargetMapping; + +public class HiveRelDistribution implements RelDistribution { + + List keys; + RelDistribution.Type type; + + public HiveRelDistribution(Type type, List keys) { + this.type = type; + this.keys = keys; + } + + @Override + public RelTraitDef getTraitDef() { + return RelDistributionTraitDef.INSTANCE; + } + + @Override + public void register(RelOptPlanner planner) { + + } + + @Override + public boolean satisfies(RelTrait trait) { + if (trait == this) { + return true; + } + switch (((RelDistribution)trait).getType()) { + case HASH_DISTRIBUTED : + return this.getKeys().equals(((RelDistribution)trait).getKeys()); + default: + throw new RuntimeException("Other distributions are not used yet."); + } + } + + @Override + public RelDistribution apply(TargetMapping mapping) { + if (keys.isEmpty()) { + return this; + } + return new HiveRelDistribution(type, keys); + } + + @Override + public List getKeys() { + return keys; + } + + @Override + public Type getType() { + return type; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelCollation.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelCollation.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelCollation.java (revision 1672450) @@ -0,0 +1,16 @@ +package org.apache.hadoop.hive.ql.optimizer.calcite; + +import org.apache.calcite.rel.RelCollationImpl; +import org.apache.calcite.rel.RelFieldCollation; + +import com.google.common.collect.ImmutableList; + +public class HiveRelCollation extends RelCollationImpl { + + public HiveRelCollation(ImmutableList fieldCollations) { + super(fieldCollations); + } + +} + + Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveConfigContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveConfigContext.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveConfigContext.java (revision 1672450) @@ -0,0 +1,20 @@ +package org.apache.hadoop.hive.ql.optimizer.calcite; + +import org.apache.calcite.plan.Context; +import org.apache.hadoop.hive.conf.HiveConf; + + +public class HiveConfigContext implements Context { + private HiveConf config; + + public HiveConfigContext(HiveConf config) { + this.config = config; + } + + public T unwrap(Class clazz) { + if (clazz.isInstance(config)) { + return clazz.cast(config); + } + return null; + } +} \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java (working copy) @@ -28,8 +28,10 @@ import org.apache.calcite.plan.RelOptUtil; import 
org.apache.calcite.plan.RelOptUtil.InputReferencedVisitor; import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.RelFactories.ProjectFactory; import org.apache.calcite.rel.core.Sort; +import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexCall; @@ -50,13 +52,18 @@ import org.apache.calcite.util.ImmutableBitSet; import org.apache.calcite.util.Pair; import org.apache.calcite.util.Util; -import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ExprNodeConverter; import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import com.google.common.base.Function; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMap.Builder; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; @@ -319,11 +326,11 @@ return this.mapOfProjIndxInJoinSchemaToLeafPInfo; } - public static JoinPredicateInfo constructJoinPredicateInfo(HiveJoin j) { + public static JoinPredicateInfo constructJoinPredicateInfo(Join j) { return constructJoinPredicateInfo(j, j.getCondition()); } - public static JoinPredicateInfo constructJoinPredicateInfo(HiveJoin j, RexNode predicate) { + public static JoinPredicateInfo constructJoinPredicateInfo(Join j, RexNode predicate) { JoinPredicateInfo jpi = null; JoinLeafPredicateInfo jlpi = null; List equiLPIList = new ArrayList(); @@ -432,6 +439,16 @@ .copyOf(projsFromRightPartOfJoinKeysInJoinSchema); } + public List getJoinKeyExprs(int input) { + if (input == 0) { + return this.joinKeyExprsFromLeft; + } + if (input == 1) { + return this.joinKeyExprsFromRight; + } + return null; + } + public List getJoinKeyExprsFromLeft() { return this.joinKeyExprsFromLeft; } @@ -461,7 +478,7 @@ return this.projsFromRightPartOfJoinKeysInJoinSchema; } - private static JoinLeafPredicateInfo constructJoinLeafPredicateInfo(HiveJoin j, RexNode pe) { + private static JoinLeafPredicateInfo constructJoinLeafPredicateInfo(Join j, RexNode pe) { JoinLeafPredicateInfo jlpi = null; List filterNulls = new ArrayList(); List joinKeyExprsFromLeft = new ArrayList(); @@ -561,6 +578,107 @@ return deterministic; } + public static ImmutableMap getColInfoMap(List hiveCols, + int startIndx) { + Builder bldr = ImmutableMap. builder(); + + int indx = startIndx; + for (T ci : hiveCols) { + bldr.put(indx, ci); + indx++; + } + + return bldr.build(); + } + + public static ImmutableMap shiftVColsMap(Map hiveVCols, + int shift) { + Builder bldr = ImmutableMap. builder(); + + for (Integer pos : hiveVCols.keySet()) { + bldr.put(shift + pos, hiveVCols.get(pos)); + } + + return bldr.build(); + } + + public static ImmutableMap getVColsMap(List hiveVCols, + int startIndx) { + Builder bldr = ImmutableMap. builder(); + + int indx = startIndx; + for (VirtualColumn vc : hiveVCols) { + bldr.put(indx, vc); + indx++; + } + + return bldr.build(); + } + + public static ImmutableMap getColNameIndxMap(List tableFields) { + Builder bldr = ImmutableMap. 
builder(); + + int indx = 0; + for (FieldSchema fs : tableFields) { + bldr.put(fs.getName(), indx); + indx++; + } + + return bldr.build(); + } + + public static ImmutableMap getRowColNameIndxMap(List rowFields) { + Builder bldr = ImmutableMap. builder(); + + int indx = 0; + for (RelDataTypeField rdt : rowFields) { + bldr.put(rdt.getName(), indx); + indx++; + } + + return bldr.build(); + } + + public static ImmutableList getInputRef(List inputRefs, RelNode inputRel) { + ImmutableList.Builder bldr = ImmutableList. builder(); + for (int i : inputRefs) { + bldr.add(new RexInputRef(i, (RelDataType) inputRel.getRowType().getFieldList().get(i).getType())); + } + return bldr.build(); + } + + public static ExprNodeDesc getExprNode(Integer inputRefIndx, RelNode inputRel, + ExprNodeConverter exprConv) { + ExprNodeDesc exprNode = null; + RexNode rexInputRef = new RexInputRef(inputRefIndx, (RelDataType) inputRel.getRowType() + .getFieldList().get(inputRefIndx).getType()); + exprNode = rexInputRef.accept(exprConv); + + return exprNode; + } + + public static List getExprNodes(List inputRefs, RelNode inputRel, + String inputTabAlias) { + List exprNodes = new ArrayList(); + List rexInputRefs = getInputRef(inputRefs, inputRel); + // TODO: Change ExprNodeConverter to be independent of Partition Expr + ExprNodeConverter exprConv = new ExprNodeConverter(inputTabAlias, inputRel.getRowType(), false, inputRel.getCluster().getTypeFactory()); + for (RexNode iRef : rexInputRefs) { + exprNodes.add(iRef.accept(exprConv)); + } + return exprNodes; + } + + public static List getFieldNames(List inputRefs, RelNode inputRel) { + List fieldNames = new ArrayList(); + List schemaNames = inputRel.getRowType().getFieldNames(); + for (Integer iRef : inputRefs) { + fieldNames.add(schemaNames.get(iRef)); + } + + return fieldNames; + } + /** * Walks over an expression and determines whether it is constant. */ Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java (revision 1672450) @@ -0,0 +1,125 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.stats; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMdParallelism; +import org.apache.calcite.rel.metadata.RelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.util.BuiltInMethod; +import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; +import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCostModel.JoinAlgorithm; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin.MapJoinStreamingRelation; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; + +public class HiveRelMdParallelism extends RelMdParallelism { + + private final Double maxSplitSize; + + //~ Constructors ----------------------------------------------------------- + + public HiveRelMdParallelism(Double maxSplitSize) { + this.maxSplitSize = maxSplitSize; + } + + public RelMetadataProvider getMetadataProvider() { + return ReflectiveRelMetadataProvider.reflectiveSource(this, + BuiltInMethod.IS_PHASE_TRANSITION.method, + BuiltInMethod.SPLIT_COUNT.method); + } + + //~ Methods ---------------------------------------------------------------- + + public Boolean isPhaseTransition(HiveJoin join) { + // As Exchange operator is introduced later on, we make a + // common join operator create a new stage for the moment + if (join.getJoinAlgorithm() == JoinAlgorithm.COMMON_JOIN) { + return true; + } + return false; + } + + public Boolean isPhaseTransition(HiveSort sort) { + // As Exchange operator is introduced later on, we make a + // sort operator create a new stage for the moment + return true; + } + + public Integer splitCount(HiveJoin join) { + if (join.getJoinAlgorithm() == JoinAlgorithm.COMMON_JOIN) { + return splitCountRepartition(join); + } + else if (join.getJoinAlgorithm() == JoinAlgorithm.MAP_JOIN || + join.getJoinAlgorithm() == JoinAlgorithm.BUCKET_JOIN || + join.getJoinAlgorithm() == JoinAlgorithm.SMB_JOIN) { + RelNode largeInput; + if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.LEFT_RELATION) { + largeInput = join.getLeft(); + } else if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.RIGHT_RELATION) { + largeInput = join.getRight(); + } else { + return null; + } + return splitCount(largeInput); + } + return null; + } + + public Integer splitCount(HiveTableScan scan) { + RelOptHiveTable table = (RelOptHiveTable) scan.getTable(); + return table.getHiveTableMD().getNumBuckets(); + } + + public Integer splitCount(RelNode rel) { + Boolean newPhase = RelMetadataQuery.isPhaseTransition(rel); + + if (newPhase == null) { + return null; + } + + if (newPhase) { + // We repartition: new number of splits + return splitCountRepartition(rel); + } + + // We do not repartition: take number of splits from children + Integer splitCount = 0; + for (RelNode input : rel.getInputs()) { + splitCount += RelMetadataQuery.splitCount(input); + } + return splitCount; + } + + public Integer splitCountRepartition(RelNode rel) { + // We repartition: new number of splits + final Double averageRowSize = RelMetadataQuery.getAverageRowSize(rel); + final Double rowCount = RelMetadataQuery.getRowCount(rel); + if (averageRowSize == null || rowCount == null) { + return null; + } + final Double totalSize = 
averageRowSize * rowCount; + final Double splitCount = totalSize / maxSplitSize; + return splitCount.intValue(); + } + +} + +// End RelMdParallelism.java \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdMemory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdMemory.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdMemory.java (revision 1672450) @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.stats; + +import org.apache.calcite.rel.RelCollations; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMdMemory; +import org.apache.calcite.rel.metadata.RelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.util.BuiltInMethod; +import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCostModel.JoinAlgorithm; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin.MapJoinStreamingRelation; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveLimit; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion; + +public class HiveRelMdMemory extends RelMdMemory { + + private static final HiveRelMdMemory INSTANCE = new HiveRelMdMemory(); + + public static final RelMetadataProvider SOURCE = + ReflectiveRelMetadataProvider.reflectiveSource(INSTANCE, + BuiltInMethod.MEMORY.method, + BuiltInMethod.CUMULATIVE_MEMORY_WITHIN_PHASE.method, + BuiltInMethod.CUMULATIVE_MEMORY_WITHIN_PHASE_SPLIT.method); + + //~ Constructors ----------------------------------------------------------- + + private HiveRelMdMemory() {} + + //~ Methods ---------------------------------------------------------------- + + public Double memory(HiveTableScan tableScan) { + return 0.0d; + } + + public Double memory(HiveAggregate aggregate) { + final Double avgRowSize = RelMetadataQuery.getAverageRowSize(aggregate.getInput()); + final Double rowCount = RelMetadataQuery.getRowCount(aggregate.getInput()); + if 
(avgRowSize == null || rowCount == null) { + return null; + } + return avgRowSize * rowCount; + } + + public Double memory(HiveFilter filter) { + return 0.0; + } + + public Double memory(HiveJoin join) { + Double memory = 0.0; + if (join.getJoinAlgorithm() == JoinAlgorithm.COMMON_JOIN) { + // Left side + final Double leftAvgRowSize = RelMetadataQuery.getAverageRowSize(join.getLeft()); + final Double leftRowCount = RelMetadataQuery.getRowCount(join.getLeft()); + if (leftAvgRowSize == null || leftRowCount == null) { + return null; + } + memory += leftAvgRowSize * leftRowCount; + // Right side + final Double rightAvgRowSize = RelMetadataQuery.getAverageRowSize(join.getRight()); + final Double rightRowCount = RelMetadataQuery.getRowCount(join.getRight()); + if (rightAvgRowSize == null || rightRowCount == null) { + return null; + } + memory += rightAvgRowSize * rightRowCount; + } else if (join.getJoinAlgorithm() == JoinAlgorithm.MAP_JOIN || + join.getJoinAlgorithm() == JoinAlgorithm.BUCKET_JOIN) { + RelNode inMemoryInput; + if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.LEFT_RELATION) { + inMemoryInput = join.getRight(); + } else if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.RIGHT_RELATION) { + inMemoryInput = join.getLeft(); + } else { + return null; + } + // Result + final Double avgRowSize = RelMetadataQuery.getAverageRowSize(inMemoryInput); + final Double rowCount = RelMetadataQuery.getRowCount(inMemoryInput); + if (avgRowSize == null || rowCount == null) { + return null; + } + memory = avgRowSize * rowCount; + } + return memory; + } + + public Double cumulativeMemoryWithinPhaseSplit(HiveJoin join) { + if (join.getJoinAlgorithm() == JoinAlgorithm.MAP_JOIN || + join.getJoinAlgorithm() == JoinAlgorithm.BUCKET_JOIN) { + // Check streaming side + RelNode inMemoryInput; + if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.LEFT_RELATION) { + inMemoryInput = join.getRight(); + } else if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.RIGHT_RELATION) { + inMemoryInput = join.getLeft(); + } else { + return null; + } + + if (join.getJoinAlgorithm() == JoinAlgorithm.MAP_JOIN) { + // If simple map join, the whole relation goes in memory + return RelMetadataQuery.cumulativeMemoryWithinPhase(inMemoryInput); + } + else if (join.getJoinAlgorithm() == JoinAlgorithm.BUCKET_JOIN) { + // If bucket map join, only a split goes in memory + final Double memoryInput = + RelMetadataQuery.cumulativeMemoryWithinPhase(inMemoryInput); + final Integer splitCount = RelMetadataQuery.splitCount(inMemoryInput); + if (memoryInput == null || splitCount == null) { + return null; + } + return memoryInput / splitCount; + } + } + // Else, we fall back to default + return super.cumulativeMemoryWithinPhaseSplit(join); + } + + public Double memory(HiveLimit limit) { + return 0.0; + } + + public Double memory(HiveProject project) { + return 0.0; + } + + public Double memory(HiveSort sort) { + if (sort.getCollation() != RelCollations.EMPTY) { + // It sorts + final Double avgRowSize = RelMetadataQuery.getAverageRowSize(sort.getInput()); + final Double rowCount = RelMetadataQuery.getRowCount(sort.getInput()); + if (avgRowSize == null || rowCount == null) { + return null; + } + return avgRowSize * rowCount; + } + // It does not sort, memory footprint is zero + return 0.0; + } + + public Double memory(HiveUnion union) { + return 0.0; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSize.java 
=================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSize.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSize.java (revision 1672450) @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.stats; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMdSize; +import org.apache.calcite.rel.metadata.RelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.util.BuiltInMethod; +import org.apache.calcite.util.ImmutableNullableList; +import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; +import org.apache.hadoop.hive.ql.plan.ColStatistics; + +import com.google.common.collect.ImmutableList; + +public class HiveRelMdSize extends RelMdSize { + + private static final HiveRelMdSize INSTANCE = new HiveRelMdSize(); + + public static final RelMetadataProvider SOURCE = + ReflectiveRelMetadataProvider.reflectiveSource(INSTANCE, + BuiltInMethod.AVERAGE_COLUMN_SIZES.method, + BuiltInMethod.AVERAGE_ROW_SIZE.method); + + //~ Constructors ----------------------------------------------------------- + + private HiveRelMdSize() {} + + //~ Methods ---------------------------------------------------------------- + + public List averageColumnSizes(HiveTableScan scan) { + List neededcolsLst = scan.getNeededColIndxsFrmReloptHT(); + Set needColsSet = new HashSet(neededcolsLst); + List columnStatistics = ((RelOptHiveTable) scan.getTable()) + .getColStat(neededcolsLst); + + // Obtain list of col stats, or use default if they are not available + final ImmutableList.Builder list = ImmutableList.builder(); + int indxRqdCol = 0; + int nFields = scan.getRowType().getFieldCount(); + for (int i = 0; i < nFields; i++) { + if (needColsSet.contains(i)) { + ColStatistics columnStatistic = columnStatistics.get(indxRqdCol); + indxRqdCol++; + if (columnStatistic == null) { + RelDataTypeField field = scan.getPrunedRowType().getFieldList().get(i); + list.add(averageTypeValueSize(field.getType())); + } else { + list.add(columnStatistic.getAvgColLen()); + } + } else { + list.add(new Double(0)); + } + } + + return list.build(); + } + + public List 
averageColumnSizes(HiveJoin rel) { + final RelNode left = rel.getLeft(); + final RelNode right = rel.getRight(); + final List lefts = + RelMetadataQuery.getAverageColumnSizes(left); + List rights = null; + if (!rel.isLeftSemiJoin()) { + rights = RelMetadataQuery.getAverageColumnSizes(right); + } + if (lefts == null && rights == null) { + return null; + } + final int fieldCount = rel.getRowType().getFieldCount(); + Double[] sizes = new Double[fieldCount]; + if (lefts != null) { + lefts.toArray(sizes); + } + if (rights != null) { + final int leftCount = left.getRowType().getFieldCount(); + for (int i = 0; i < rights.size(); i++) { + sizes[leftCount + i] = rights.get(i); + } + } + return ImmutableNullableList.copyOf(sizes); + } + + // TODO: remove when averageTypeValueSize method RelMdSize + // supports all types + public Double averageTypeValueSize(RelDataType type) { + switch (type.getSqlTypeName()) { + case BOOLEAN: + case TINYINT: + return 1d; + case SMALLINT: + return 2d; + case INTEGER: + case FLOAT: + case REAL: + case DECIMAL: + case DATE: + case TIME: + return 4d; + case BIGINT: + case DOUBLE: + case TIMESTAMP: + case INTERVAL_DAY_TIME: + case INTERVAL_YEAR_MONTH: + return 8d; + case BINARY: + return (double) type.getPrecision(); + case VARBINARY: + return Math.min((double) type.getPrecision(), 100d); + case CHAR: + return (double) type.getPrecision() * BYTES_PER_CHARACTER; + case VARCHAR: + // Even in large (say VARCHAR(2000)) columns most strings are small + return Math.min((double) type.getPrecision() * BYTES_PER_CHARACTER, 100d); + case ROW: + Double average = 0.0; + for (RelDataTypeField field : type.getFieldList()) { + average += averageTypeValueSize(field.getType()); + } + return average; + default: + return null; + } + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistribution.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistribution.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistribution.java (revision 1672450) @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
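averageTypeValueSize above assigns a fixed byte estimate per SQL type (1 byte for BOOLEAN and TINYINT, 2 for SMALLINT, 4 for INTEGER, FLOAT and DATE, 8 for BIGINT, DOUBLE and TIMESTAMP, precision-based caps for character and binary types). A rough sketch of how such per-column estimates could be folded into an average row size, assuming the row size is simply the sum of the column estimates (class and method names are hypothetical, not from the patch):

import java.util.Arrays;
import java.util.List;

public class RowSizeSketch {
  // Assumed combination rule: average row size = sum of the average column sizes.
  static double averageRowSize(List<Double> averageColumnSizes) {
    double total = 0.0;
    for (Double columnSize : averageColumnSizes) {
      if (columnSize != null) {   // columns with no estimate contribute nothing here
        total += columnSize;
      }
    }
    return total;
  }

  public static void main(String[] args) {
    // e.g. an INTEGER (4), a BIGINT (8) and a short VARCHAR estimate (20)
    System.out.println(averageRowSize(Arrays.asList(4d, 8d, 20d))); // 32.0
  }
}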
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.stats; + +import org.apache.calcite.rel.RelDistribution; +import org.apache.calcite.rel.metadata.ChainedRelMetadataProvider; +import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMdDistribution; +import org.apache.calcite.rel.metadata.RelMetadataProvider; +import org.apache.calcite.util.BuiltInMethod; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelDistribution; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin.MapJoinStreamingRelation; + +import com.google.common.collect.ImmutableList; + +public class HiveRelMdDistribution { + + public static final RelMetadataProvider SOURCE = + ChainedRelMetadataProvider.of( + ImmutableList.of( + ReflectiveRelMetadataProvider.reflectiveSource( + BuiltInMethod.DISTRIBUTION.method, new HiveRelMdDistribution()), + RelMdDistribution.SOURCE)); + + //~ Constructors ----------------------------------------------------------- + + private HiveRelMdDistribution() {} + + //~ Methods ---------------------------------------------------------------- + + public RelDistribution distribution(HiveAggregate aggregate) { + return new HiveRelDistribution(RelDistribution.Type.HASH_DISTRIBUTED, + aggregate.getGroupSet().asList()); + } + + public RelDistribution distribution(HiveJoin join) { + // Compute distribution + ImmutableList.Builder keysListBuilder = + new ImmutableList.Builder(); + ImmutableList.Builder leftKeysListBuilder = + new ImmutableList.Builder(); + ImmutableList.Builder rightKeysListBuilder = + new ImmutableList.Builder(); + JoinPredicateInfo joinPredInfo = + HiveCalciteUtil.JoinPredicateInfo.constructJoinPredicateInfo(join); + for (int i = 0; i < joinPredInfo.getEquiJoinPredicateElements().size(); i++) { + JoinLeafPredicateInfo joinLeafPredInfo = joinPredInfo. 
+ getEquiJoinPredicateElements().get(i); + for (int leftPos : joinLeafPredInfo.getProjsFromLeftPartOfJoinKeysInJoinSchema()) { + keysListBuilder.add(leftPos); + leftKeysListBuilder.add(leftPos); + } + for (int rightPos : joinLeafPredInfo.getProjsFromRightPartOfJoinKeysInJoinSchema()) { + keysListBuilder.add(rightPos); + rightKeysListBuilder.add(rightPos); + } + } + + RelDistribution distribution; + switch (join.getJoinAlgorithm()) { + case SMB_JOIN: + case BUCKET_JOIN: + case COMMON_JOIN: + distribution = new HiveRelDistribution( + RelDistribution.Type.HASH_DISTRIBUTED, keysListBuilder.build()); + break; + case MAP_JOIN: + // Keep buckets from the streaming relation + if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.LEFT_RELATION) { + distribution = new HiveRelDistribution( + RelDistribution.Type.HASH_DISTRIBUTED, leftKeysListBuilder.build()); + } else if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.RIGHT_RELATION) { + distribution = new HiveRelDistribution( + RelDistribution.Type.HASH_DISTRIBUTED, rightKeysListBuilder.build()); + } else { + distribution = null; + } + break; + default: + distribution = null; + } + return distribution; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdCollation.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdCollation.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdCollation.java (revision 1672450) @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
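The distribution metadata above reports a hash distribution on the aggregate's grouping keys or on the join keys, and for a map join it keeps the distribution of the streaming input. As a loose illustration of why that trait matters, the sketch below applies a rule of thumb assumed here, not taken from the patch: if an input is already hash-distributed on a subset of the grouping columns, every group is co-located in one task and the aggregation needs no extra shuffle (all names are hypothetical):

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;

public class DistributionSketch {
  // Assumed rule of thumb: distribution keys that are a non-empty subset of the
  // grouping columns already co-locate each group in a single task.
  static boolean groupsColocated(List<Integer> distributionKeys, List<Integer> groupKeys) {
    return !distributionKeys.isEmpty() && new HashSet<>(groupKeys).containsAll(distributionKeys);
  }

  public static void main(String[] args) {
    System.out.println(groupsColocated(Arrays.asList(0), Arrays.asList(0, 1))); // true
    System.out.println(groupsColocated(Arrays.asList(2), Arrays.asList(0, 1))); // false
  }
}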
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.stats; + +import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelCollationTraitDef; +import org.apache.calcite.rel.RelFieldCollation; +import org.apache.calcite.rel.metadata.ChainedRelMetadataProvider; +import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMdCollation; +import org.apache.calcite.rel.metadata.RelMetadataProvider; +import org.apache.calcite.util.BuiltInMethod; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelCollation; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin.MapJoinStreamingRelation; + +import com.google.common.collect.ImmutableList; + +public class HiveRelMdCollation { + + public static final RelMetadataProvider SOURCE = + ChainedRelMetadataProvider.of( + ImmutableList.of( + ReflectiveRelMetadataProvider.reflectiveSource( + BuiltInMethod.COLLATIONS.method, new HiveRelMdCollation()), + RelMdCollation.SOURCE)); + + //~ Constructors ----------------------------------------------------------- + + private HiveRelMdCollation() {} + + //~ Methods ---------------------------------------------------------------- + + public ImmutableList collations(HiveAggregate aggregate) { + // Compute collations + ImmutableList.Builder collationListBuilder = + new ImmutableList.Builder(); + for (int pos : aggregate.getGroupSet().asList()) { + final RelFieldCollation fieldCollation = new RelFieldCollation(pos); + collationListBuilder.add(fieldCollation); + } + // Return aggregate collations + return ImmutableList.of( + RelCollationTraitDef.INSTANCE.canonize( + new HiveRelCollation(collationListBuilder.build()))); + } + + public ImmutableList collations(HiveJoin join) { + // Compute collations + ImmutableList.Builder collationListBuilder = + new ImmutableList.Builder(); + ImmutableList.Builder leftCollationListBuilder = + new ImmutableList.Builder(); + ImmutableList.Builder rightCollationListBuilder = + new ImmutableList.Builder(); + JoinPredicateInfo joinPredInfo = + HiveCalciteUtil.JoinPredicateInfo.constructJoinPredicateInfo(join); + for (int i = 0; i < joinPredInfo.getEquiJoinPredicateElements().size(); i++) { + JoinLeafPredicateInfo joinLeafPredInfo = joinPredInfo. 
+ getEquiJoinPredicateElements().get(i); + for (int leftPos : joinLeafPredInfo.getProjsFromLeftPartOfJoinKeysInJoinSchema()) { + final RelFieldCollation leftFieldCollation = new RelFieldCollation(leftPos); + collationListBuilder.add(leftFieldCollation); + leftCollationListBuilder.add(leftFieldCollation); + } + for (int rightPos : joinLeafPredInfo.getProjsFromRightPartOfJoinKeysInJoinSchema()) { + final RelFieldCollation rightFieldCollation = new RelFieldCollation(rightPos); + collationListBuilder.add(rightFieldCollation); + rightCollationListBuilder.add(rightFieldCollation); + } + } + + // Return join collations + final ImmutableList collation; + switch (join.getJoinAlgorithm()) { + case SMB_JOIN: + case COMMON_JOIN: + collation = ImmutableList.of( + RelCollationTraitDef.INSTANCE.canonize( + new HiveRelCollation(collationListBuilder.build()))); + break; + case BUCKET_JOIN: + case MAP_JOIN: + // Keep order from the streaming relation + if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.LEFT_RELATION) { + collation = ImmutableList.of( + RelCollationTraitDef.INSTANCE.canonize( + new HiveRelCollation(leftCollationListBuilder.build()))); + } else if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.RIGHT_RELATION) { + collation = ImmutableList.of( + RelCollationTraitDef.INSTANCE.canonize( + new HiveRelCollation(rightCollationListBuilder.build()))); + } else { + collation = null; + } + break; + default: + collation = null; + } + return collation; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java (working copy) @@ -15,7 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.hadoop.hive.ql.optimizer.calcite.stats; import java.util.ArrayList; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java (working copy) @@ -15,7 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.hadoop.hive.ql.optimizer.calcite.stats; import java.util.BitSet; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveDefaultRelMetadataProvider.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveDefaultRelMetadataProvider.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveDefaultRelMetadataProvider.java (working copy) @@ -20,21 +20,64 @@ import org.apache.calcite.rel.metadata.ChainedRelMetadataProvider; import org.apache.calcite.rel.metadata.DefaultRelMetadataProvider; import org.apache.calcite.rel.metadata.RelMetadataProvider; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCostModel; +import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveDefaultCostModel; +import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveRelMdCost; +import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveOnTezCostModel; +import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdCollation; import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdDistinctRowCount; +import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdDistribution; +import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdMemory; +import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdParallelism; import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdRowCount; import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdSelectivity; +import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdSize; import org.apache.hadoop.hive.ql.optimizer.calcite.stats.HiveRelMdUniqueKeys; import com.google.common.collect.ImmutableList; public class HiveDefaultRelMetadataProvider { - private HiveDefaultRelMetadataProvider() { + + private final HiveConf hiveConf; + + + public HiveDefaultRelMetadataProvider(HiveConf hiveConf) { + this.hiveConf = hiveConf; } - public static final RelMetadataProvider INSTANCE = ChainedRelMetadataProvider.of(ImmutableList - .of(HiveRelMdDistinctRowCount.SOURCE, + public RelMetadataProvider getMetadataProvider() { + + // Create cost metadata provider + final HiveCostModel cm; + if (HiveConf.getVar(this.hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") + && HiveConf.getBoolVar(this.hiveConf, HiveConf.ConfVars.EXTENDED_COST_MODEL)) { + final Double maxMemory = (double) HiveConf.getLongVar( + this.hiveConf, + HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD); + cm = new HiveOnTezCostModel(maxMemory); + } else { + cm = new HiveDefaultCostModel(); + } + + // Get max split size for HiveRelMdParallelism + final Double maxSplitSize = (double) HiveConf.getLongVar( + this.hiveConf, + HiveConf.ConfVars.MAPREDMAXSPLITSIZE); + + // Return MD provider + return ChainedRelMetadataProvider.of(ImmutableList + .of(new HiveRelMdCost(cm).getMetadataProvider(), + HiveRelMdDistinctRowCount.SOURCE, HiveRelMdSelectivity.SOURCE, HiveRelMdRowCount.SOURCE, HiveRelMdUniqueKeys.SOURCE, + HiveRelMdSize.SOURCE, + HiveRelMdMemory.SOURCE, + new HiveRelMdParallelism(maxSplitSize).getMetadataProvider(), + HiveRelMdDistribution.SOURCE, + HiveRelMdCollation.SOURCE, new DefaultRelMetadataProvider())); } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveSortExchange.java =================================================================== --- 
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveSortExchange.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveSortExchange.java (revision 1672450) @@ -0,0 +1,49 @@ +package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; + +import org.apache.calcite.plan.Convention; +import org.apache.calcite.plan.RelOptCluster; +import org.apache.calcite.plan.RelTraitSet; +import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelCollationTraitDef; +import org.apache.calcite.rel.RelDistribution; +import org.apache.calcite.rel.RelDistributionTraitDef; +import org.apache.calcite.rel.RelInput; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.SortExchange; + +public class HiveSortExchange extends SortExchange { + + private HiveSortExchange(RelOptCluster cluster, RelTraitSet traitSet, + RelNode input, RelDistribution distribution, RelCollation collation) { + super(cluster, traitSet, input, distribution, collation); + } + + public HiveSortExchange(RelInput input) { + super(input); + } + + /** + * Creates a HiveSortExchange. + * + * @param input Input relational expression + * @param distribution Distribution specification + * @param collation Collation specification + */ + public static HiveSortExchange create(RelNode input, + RelDistribution distribution, RelCollation collation) { + RelOptCluster cluster = input.getCluster(); + distribution = RelDistributionTraitDef.INSTANCE.canonize(distribution); + RelTraitSet traitSet = + input.getTraitSet().replace(Convention.NONE).replace(distribution); + collation = RelCollationTraitDef.INSTANCE.canonize(collation); + return new HiveSortExchange(cluster, traitSet, input, distribution, collation); + } + + @Override + public SortExchange copy(RelTraitSet traitSet, RelNode newInput, RelDistribution newDistribution, + RelCollation newCollation) { + return new HiveSortExchange(getCluster(), traitSet, newInput, + newDistribution, newCollation); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java (working copy) @@ -17,7 +17,9 @@ */ package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Set; import org.apache.calcite.plan.RelOptCluster; @@ -25,7 +27,9 @@ import org.apache.calcite.plan.RelOptPlanner; import org.apache.calcite.plan.RelTraitSet; import org.apache.calcite.rel.InvalidRelException; +import org.apache.calcite.rel.RelCollations; import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.RelWriter; import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.RelFactories.JoinFactory; @@ -33,19 +37,15 @@ import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.calcite.util.ImmutableIntList; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import 
org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil; -import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCost; +import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCostModel.JoinAlgorithm; //TODO: Should we convert MultiJoin to be a child of HiveJoin public class HiveJoin extends Join implements HiveRelNode { - // NOTE: COMMON_JOIN & SMB_JOIN are Sort Merge Join (in case of COMMON_JOIN - // each parallel computation handles multiple splits where as in case of SMB - // each parallel computation handles one bucket). MAP_JOIN and BUCKET_JOIN is - // hash joins where MAP_JOIN keeps the whole data set of non streaming tables - // in memory where as BUCKET_JOIN keeps only the b - public enum JoinAlgorithm { - NONE, COMMON_JOIN, MAP_JOIN, BUCKET_JOIN, SMB_JOIN - } public enum MapJoinStreamingRelation { NONE, LEFT_RELATION, RIGHT_RELATION @@ -54,17 +54,20 @@ public static final JoinFactory HIVE_JOIN_FACTORY = new HiveJoinFactoryImpl(); private final boolean leftSemiJoin; - private final JoinAlgorithm joinAlgorithm; - //This will be used once we do Join Algorithm selection - @SuppressWarnings("unused") - private final MapJoinStreamingRelation mapJoinStreamingSide = MapJoinStreamingRelation.NONE; + private JoinAlgorithm joinAlgorithm; + private MapJoinStreamingRelation mapJoinStreamingSide; + private RelOptCost joinCost; + // Whether inputs are already sorted + private ImmutableBitSet sortedInputs; public static HiveJoin getJoin(RelOptCluster cluster, RelNode left, RelNode right, RexNode condition, JoinRelType joinType, boolean leftSemiJoin) { try { Set variablesStopped = Collections.emptySet(); - return new HiveJoin(cluster, null, left, right, condition, joinType, variablesStopped, - JoinAlgorithm.NONE, null, leftSemiJoin); + HiveJoin join = new HiveJoin(cluster, null, left, right, condition, joinType, variablesStopped, + JoinAlgorithm.NONE, chooseStreamingSide(left,right), null, leftSemiJoin); + join.sortedInputs = checkInputsCorrectOrder(join); + return join; } catch (InvalidRelException e) { throw new RuntimeException(e); } @@ -72,11 +75,13 @@ protected HiveJoin(RelOptCluster cluster, RelTraitSet traits, RelNode left, RelNode right, RexNode condition, JoinRelType joinType, Set variablesStopped, - JoinAlgorithm joinAlgo, MapJoinStreamingRelation streamingSideForMapJoin, boolean leftSemiJoin) - throws InvalidRelException { + JoinAlgorithm joinAlgo, MapJoinStreamingRelation streamingSideForMapJoin, + ImmutableBitSet sortedInputs, boolean leftSemiJoin) throws InvalidRelException { super(cluster, TraitsUtil.getDefaultTraitSet(cluster), left, right, condition, joinType, variablesStopped); this.joinAlgorithm = joinAlgo; + this.mapJoinStreamingSide = streamingSideForMapJoin; + this.sortedInputs = sortedInputs; this.leftSemiJoin = leftSemiJoin; } @@ -90,7 +95,7 @@ try { Set variablesStopped = Collections.emptySet(); return new HiveJoin(getCluster(), traitSet, left, right, conditionExpr, joinType, - variablesStopped, JoinAlgorithm.NONE, null, leftSemiJoin); + variablesStopped, joinAlgorithm, mapJoinStreamingSide, sortedInputs, leftSemiJoin); } catch (InvalidRelException e) { // Semantic error not possible. Must be a bug. Convert to // internal error. 
@@ -102,6 +107,22 @@ return joinAlgorithm; } + public void setJoinAlgorithm(JoinAlgorithm joinAlgorithm) { + this.joinAlgorithm = joinAlgorithm; + } + + public MapJoinStreamingRelation getMapJoinStreamingSide() { + return mapJoinStreamingSide; + } + + public void setJoinCost(RelOptCost joinCost) { + this.joinCost = joinCost; + } + + public ImmutableBitSet getSortedInputs() { + return sortedInputs; + } + public boolean isLeftSemiJoin() { return leftSemiJoin; } @@ -111,11 +132,57 @@ */ @Override public RelOptCost computeSelfCost(RelOptPlanner planner) { - double leftRCount = RelMetadataQuery.getRowCount(getLeft()); - double rightRCount = RelMetadataQuery.getRowCount(getRight()); - return HiveCost.FACTORY.makeCost(leftRCount + rightRCount, 0.0, 0.0); + return RelMetadataQuery.getNonCumulativeCost(this); } + private static MapJoinStreamingRelation chooseStreamingSide(RelNode left, + RelNode right) { + Double leftInputSize = RelMetadataQuery.memory(left); + Double rightInputSize = RelMetadataQuery.memory(right); + if (leftInputSize == null && rightInputSize == null) { + return MapJoinStreamingRelation.NONE; + } else if (leftInputSize != null && + (rightInputSize == null || + (leftInputSize < rightInputSize))) { + return MapJoinStreamingRelation.RIGHT_RELATION; + } else if (rightInputSize != null && + (leftInputSize == null || + (rightInputSize <= leftInputSize))) { + return MapJoinStreamingRelation.LEFT_RELATION; + } + return MapJoinStreamingRelation.NONE; + } + + private static ImmutableBitSet checkInputsCorrectOrder(HiveJoin join) { + ImmutableBitSet.Builder sortedInputs = new ImmutableBitSet.Builder(); + JoinPredicateInfo joinPredInfo = HiveCalciteUtil.JoinPredicateInfo. + constructJoinPredicateInfo(join); + List joinKeysInChildren = new ArrayList(); + joinKeysInChildren.add( + ImmutableIntList.copyOf( + joinPredInfo.getProjsFromLeftPartOfJoinKeysInChildSchema())); + joinKeysInChildren.add( + ImmutableIntList.copyOf( + joinPredInfo.getProjsFromRightPartOfJoinKeysInChildSchema())); + + for (int i=0; i groupSets, List aggCalls) throws InvalidRelException { super(cluster, TraitsUtil.getDefaultTraitSet(cluster), child, indicator, groupSet, groupSets, aggCalls); + this.bucketedInput = checkInputCorrectBucketing(child, groupSet); } @Override @@ -66,15 +69,28 @@ @Override public RelOptCost computeSelfCost(RelOptPlanner planner) { - return HiveCost.FACTORY.makeZeroCost(); + return RelMetadataQuery.getNonCumulativeCost(this); } + private static boolean checkInputCorrectBucketing(RelNode child, ImmutableBitSet groupSet) { + return false; + //TODO: Enable this again + /* + return RelMetadataQuery.distribution(child).getKeys(). 
+ containsAll(groupSet.asList()); + */ + } + @Override public double getRows() { return RelMetadataQuery.getDistinctRowCount(this, groupSet, getCluster().getRexBuilder() .makeLiteral(true)); } + public boolean isBucketedInput() { + return this.bucketedInput; + } + private static class HiveAggRelFactory implements AggregateFactory { @Override Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveProject.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveProject.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveProject.java (working copy) @@ -29,6 +29,7 @@ import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.RelFactories.ProjectFactory; +import org.apache.calcite.rel.metadata.RelMetadataQuery; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexBuilder; @@ -42,7 +43,6 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature; import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCost; - import com.google.common.collect.ImmutableList; public class HiveProject extends Project implements HiveRelNode { @@ -172,7 +172,7 @@ @Override public RelOptCost computeSelfCost(RelOptPlanner planner) { - return HiveCost.FACTORY.makeZeroCost(); + return RelMetadataQuery.getNonCumulativeCost(this); } @Override Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveTableScan.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveTableScan.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveTableScan.java (working copy) @@ -17,21 +17,34 @@ */ package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; +import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; +import java.util.Map; +import java.util.Set; import org.apache.calcite.plan.RelOptCluster; import org.apache.calcite.plan.RelOptCost; import org.apache.calcite.plan.RelOptPlanner; import org.apache.calcite.plan.RelTraitSet; import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.RelFactories; import org.apache.calcite.rel.core.TableScan; +import org.apache.calcite.rel.metadata.RelMetadataQuery; import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil; -import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCost; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableList.Builder; + /** * Relational expression representing a scan of a HiveDB collection. 
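The chooseStreamingSide helper added to HiveJoin above compares the estimated in-memory size of the two inputs and streams the larger one, so the smaller relation becomes the in-memory (build) side of a map join. A condensed sketch of that decision with the metadata lookups replaced by plain parameters (class and enum names are illustrative):

public class StreamingSideSketch {
  enum StreamingSide { NONE, LEFT_RELATION, RIGHT_RELATION }

  // Mirrors the comparison in the patch: the side with the larger (or only unknown)
  // estimate is streamed; ties stream the left relation.
  static StreamingSide chooseStreamingSide(Double leftSize, Double rightSize) {
    if (leftSize == null && rightSize == null) {
      return StreamingSide.NONE;                // no estimate for either input
    }
    if (leftSize != null && (rightSize == null || leftSize < rightSize)) {
      return StreamingSide.RIGHT_RELATION;      // left is smaller: keep it in memory, stream the right
    }
    if (rightSize != null && (leftSize == null || rightSize <= leftSize)) {
      return StreamingSide.LEFT_RELATION;       // right is smaller or equal: stream the left
    }
    return StreamingSide.NONE;
  }

  public static void main(String[] args) {
    System.out.println(chooseStreamingSide(1_000d, 50_000d)); // RIGHT_RELATION
    System.out.println(chooseStreamingSide(null, 50_000d));   // LEFT_RELATION
  }
}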
* @@ -42,6 +55,14 @@ */ public class HiveTableScan extends TableScan implements HiveRelNode { + private final RelDataType hiveTableScanRowType; + private final ImmutableList neededColIndxsFrmReloptHT; + private final String tblAlias; + + public String getTableAlias() { + return tblAlias; + } + /** * Creates a HiveTableScan. * @@ -54,10 +75,17 @@ * @param table * HiveDB table */ - public HiveTableScan(RelOptCluster cluster, RelTraitSet traitSet, RelOptHiveTable table, - RelDataType rowtype) { + public HiveTableScan(RelOptCluster cluster, RelTraitSet traitSet, RelOptHiveTable table, String alias) { + this(cluster, traitSet, table, alias, table.getRowType()); + } + + private HiveTableScan(RelOptCluster cluster, RelTraitSet traitSet, RelOptHiveTable table, String alias, + RelDataType newRowtype) { super(cluster, TraitsUtil.getDefaultTraitSet(cluster), table); assert getConvention() == HiveRelNode.CONVENTION; + this.tblAlias = alias; + this.hiveTableScanRowType = newRowtype; + this.neededColIndxsFrmReloptHT = buildNeededColIndxsFrmReloptHT(table.getRowType(), newRowtype); } @Override @@ -66,9 +94,21 @@ return this; } + /** + * Copy TableScan operator with a new Row Schema. The new Row Schema can only + * be a subset of this TS schema. + * + * @param newRowtype + * @return + */ + public HiveTableScan copy(RelDataType newRowtype) { + return new HiveTableScan(getCluster(), getTraitSet(), ((RelOptHiveTable) table), this.tblAlias, + newRowtype); + } + @Override public RelOptCost computeSelfCost(RelOptPlanner planner) { - return HiveCost.FACTORY.makeZeroCost(); + return RelMetadataQuery.getNonCumulativeCost(this); } @Override @@ -89,4 +129,62 @@ public List getColStat(List projIndxLst) { return ((RelOptHiveTable) table).getColStat(projIndxLst); } -} \ No newline at end of file + + @Override + public RelNode project(ImmutableBitSet fieldsUsed, Set extraFields, + RelFactories.ProjectFactory projectFactory) { + + // 1. If the schema is the same then bail out + final int fieldCount = getRowType().getFieldCount(); + if (fieldsUsed.equals(ImmutableBitSet.range(fieldCount)) && extraFields.isEmpty()) { + return this; + } + + // 2. Make sure there is no dynamic addition of virtual cols + if (extraFields != null && !extraFields.isEmpty()) { + throw new RuntimeException("Hive TS does not support adding virtual columns dynamically"); + } + + // 3. Create new TS schema that is a subset of original + final List fields = getRowType().getFieldList(); + List fieldTypes = new LinkedList(); + List fieldNames = new LinkedList(); + List exprList = new ArrayList(); + RexBuilder rexBuilder = getCluster().getRexBuilder(); + for (int i : fieldsUsed) { + RelDataTypeField field = fields.get(i); + fieldTypes.add(field.getType()); + fieldNames.add(field.getName()); + exprList.add(rexBuilder.makeInputRef(this, i)); + } + + // 4. Build new TS + HiveTableScan newHT = copy(getCluster().getTypeFactory().createStructType(fieldTypes, + fieldNames)); + + // 5. 
Add Proj on top of TS + return projectFactory.createProject(newHT, exprList, new ArrayList(fieldNames)); + } + + public List getNeededColIndxsFrmReloptHT() { + return neededColIndxsFrmReloptHT; + } + + public RelDataType getPrunedRowType() { + return hiveTableScanRowType; + } + + private static ImmutableList buildNeededColIndxsFrmReloptHT(RelDataType htRowtype, + RelDataType scanRowType) { + Builder neededColIndxsFrmReloptHTBldr = new ImmutableList.Builder(); + Map colNameToPosInReloptHT = HiveCalciteUtil.getRowColNameIndxMap(htRowtype + .getFieldList()); + List colNamesInScanRowType = scanRowType.getFieldNames(); + + for (int i = 0; i < colNamesInScanRowType.size(); i++) { + neededColIndxsFrmReloptHTBldr.add(colNameToPosInReloptHT.get(colNamesInScanRowType.get(i))); + } + + return neededColIndxsFrmReloptHTBldr.build(); + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveLimit.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveLimit.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveLimit.java (working copy) @@ -25,9 +25,9 @@ import org.apache.calcite.plan.RelTraitSet; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.SingleRel; +import org.apache.calcite.rel.metadata.RelMetadataQuery; import org.apache.calcite.rex.RexNode; import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil; -import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCost; public class HiveLimit extends SingleRel implements HiveRelNode { private final RexNode offset; @@ -52,6 +52,6 @@ @Override public RelOptCost computeSelfCost(RelOptPlanner planner) { - return HiveCost.FACTORY.makeZeroCost(); + return RelMetadataQuery.getNonCumulativeCost(this); } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java (revision 1672450) @@ -0,0 +1,96 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
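The project method added to HiveTableScan above prunes the scan schema to the used columns and places a Project on top, and buildNeededColIndxsFrmReloptHT maps each kept column back to its position in the full table schema. A small sketch of that name-to-index resolution step, with hypothetical helper and column names:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class NeededColumnsSketch {
  // Assumed helper mirroring the idea of buildNeededColIndxsFrmReloptHT: resolve the
  // columns kept by the pruned scan to their indexes in the original table schema.
  static List<Integer> neededColumnIndexes(List<String> tableColumns, List<String> prunedColumns) {
    Map<String, Integer> positionByName = new HashMap<>();
    for (int i = 0; i < tableColumns.size(); i++) {
      positionByName.put(tableColumns.get(i), i);
    }
    List<Integer> needed = new ArrayList<>();
    for (String column : prunedColumns) {
      needed.add(positionByName.get(column));
    }
    return needed;
  }

  public static void main(String[] args) {
    System.out.println(neededColumnIndexes(
        Arrays.asList("key", "c_int", "c_float", "dt"),
        Arrays.asList("c_int", "dt")));          // [1, 3]
  }
}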
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.cost; + +import java.util.EnumSet; + +import org.apache.calcite.plan.RelOptCost; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; + +/** + * Cost model interface. + */ +public abstract class HiveCostModel { + + private static final Log LOG = LogFactory.getLog(HiveCostModel.class); + + // NOTE: COMMON_JOIN & SMB_JOIN are Sort Merge Join (in case of COMMON_JOIN + // each parallel computation handles multiple splits where as in case of SMB + // each parallel computation handles one bucket). MAP_JOIN and BUCKET_JOIN is + // hash joins where MAP_JOIN keeps the whole data set of non streaming tables + // in memory where as BUCKET_JOIN keeps only the b + public enum JoinAlgorithm { + NONE, COMMON_JOIN, MAP_JOIN, BUCKET_JOIN, SMB_JOIN + } + + public abstract RelOptCost getDefaultCost(); + + public abstract RelOptCost getAggregateCost(HiveAggregate aggregate); + + public RelOptCost getJoinCost(HiveJoin join) { + // Retrieve algorithms + EnumSet possibleAlgorithms = getExecutableJoinAlgorithms(join); + + // Select algorithm with min cost + JoinAlgorithm joinAlgorithm = null; + RelOptCost minJoinCost = null; + if (LOG.isDebugEnabled()) { + LOG.debug("Join algorithm selection for:\n" + RelOptUtil.toString(join)); + } + for (JoinAlgorithm possibleAlgorithm : possibleAlgorithms) { + RelOptCost joinCost = getJoinCost(join, possibleAlgorithm); + if (LOG.isDebugEnabled()) { + LOG.debug(possibleAlgorithm + " cost: " + joinCost); + } + if (minJoinCost == null || joinCost.isLt(minJoinCost) ) { + joinAlgorithm = possibleAlgorithm; + minJoinCost = joinCost; + } + } + join.setJoinAlgorithm(joinAlgorithm); + join.setJoinCost(minJoinCost); + if (LOG.isDebugEnabled()) { + LOG.debug(joinAlgorithm + " selected"); + } + + return minJoinCost; + } + + /** + * Returns the possible algorithms for a given join operator. + * + * @param join the join operator + * @return a set containing all the possible join algorithms that can be + * executed for this join operator + */ + abstract EnumSet getExecutableJoinAlgorithms(HiveJoin join); + + /** + * Returns the cost for a given algorithm and execution engine. + * + * @param join the join operator + * @param algorithm the join algorithm + * @return the cost for the given algorithm, or null if the algorithm is not + * defined for this execution engine + */ + abstract RelOptCost getJoinCost(HiveJoin join, JoinAlgorithm algorithm); +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java (revision 1672450) @@ -0,0 +1,367 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.cost; + +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.calcite.plan.RelOptCost; +import org.apache.calcite.rel.RelDistribution; +import org.apache.calcite.rel.RelDistribution.Type; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.calcite.util.ImmutableIntList; +import org.apache.calcite.util.Pair; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin.MapJoinStreamingRelation; + +import com.google.common.collect.ImmutableList; + +/** + * Cost model for Tez execution engine. + */ +public class HiveOnTezCostModel extends HiveCostModel { + + private final Double maxMemory; + + + public HiveOnTezCostModel(Double maxMemory) { + this.maxMemory = maxMemory; + } + + @Override + public RelOptCost getDefaultCost() { + return HiveCost.FACTORY.makeZeroCost(); + } + + @Override + public RelOptCost getAggregateCost(HiveAggregate aggregate) { + if (aggregate.isBucketedInput()) { + return HiveCost.FACTORY.makeZeroCost(); + } else { + // 1. Sum of input cardinalities + final Double rCount = RelMetadataQuery.getRowCount(aggregate.getInput()); + if (rCount == null) { + return null; + } + // 2. CPU cost = sorting cost + final double cpuCost = HiveCostUtil.computeSortCPUCost(rCount); + // 3. IO cost = cost of writing intermediary results to local FS + + // cost of reading from local FS for transferring to GBy + + // cost of transferring map outputs to GBy operator + final Double rAverageSize = RelMetadataQuery.getAverageRowSize(aggregate.getInput()); + if (rAverageSize == null) { + return null; + } + final double ioCost = HiveCostUtil.computeSortIOCost(new Pair(rCount,rAverageSize)); + // 4. Result + return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost); + } + } + + @Override + protected EnumSet getExecutableJoinAlgorithms(HiveJoin join) { + Set possibleAlgorithms = new HashSet(); + + // Check streaming side + RelNode smallInput; + if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.LEFT_RELATION) { + smallInput = join.getRight(); + } else if (join.getMapJoinStreamingSide() == MapJoinStreamingRelation.RIGHT_RELATION) { + smallInput = join.getLeft(); + } else { + smallInput = null; + } + + if (smallInput != null) { + // Requirements: + // - For SMB, sorted by their keys on both sides and bucketed. + // - For Bucket, bucketed by their keys on both sides. / Fitting in memory + // - For Map, no additional requirement. / Fitting in memory + + // Get key columns + JoinPredicateInfo joinPredInfo = HiveCalciteUtil.JoinPredicateInfo. 
+ constructJoinPredicateInfo(join); + List joinKeysInChildren = new ArrayList(); + joinKeysInChildren.add( + ImmutableIntList.copyOf( + joinPredInfo.getProjsFromLeftPartOfJoinKeysInChildSchema())); + joinKeysInChildren.add( + ImmutableIntList.copyOf( + joinPredInfo.getProjsFromRightPartOfJoinKeysInChildSchema())); + + // Obtain number of buckets + Integer buckets = RelMetadataQuery.splitCount(smallInput); + // Obtain map algorithms for which smallest input fits in memory + boolean bucketFitsMemory = false; + boolean inputFitsMemory = false; + if (buckets != null) { + bucketFitsMemory = isFittingIntoMemory(maxMemory, smallInput, buckets); + } + inputFitsMemory = bucketFitsMemory ? + isFittingIntoMemory(maxMemory, smallInput, 1) : false; + boolean orderedBucketed = true; + boolean bucketed = true; + for (int i=0; i maxSize) { + return false; + } + return true; + } + return false; + } + + @Override + protected RelOptCost getJoinCost(HiveJoin join, JoinAlgorithm algorithm) { + RelOptCost algorithmCost; + switch (algorithm) { + case COMMON_JOIN: + algorithmCost = computeCostCommonJoin(join); + break; + case MAP_JOIN: + algorithmCost = computeCostMapJoin(join); + break; + case BUCKET_JOIN: + algorithmCost = computeCostBucketJoin(join); + break; + case SMB_JOIN: + algorithmCost = computeCostSMBJoin(join); + break; + default: + algorithmCost = null; + } + return algorithmCost; + } + + private static RelOptCost computeCostCommonJoin(HiveJoin join) { + // 1. Sum of input cardinalities + final Double leftRCount = RelMetadataQuery.getRowCount(join.getLeft()); + final Double rightRCount = RelMetadataQuery.getRowCount(join.getRight()); + if (leftRCount == null || rightRCount == null) { + return null; + } + final double rCount = leftRCount + rightRCount; + // 2. CPU cost = sorting cost (for each relation) + + // total merge cost + ImmutableList cardinalities = new ImmutableList.Builder(). + add(leftRCount). + add(rightRCount). + build(); + final double cpuCost = HiveCostUtil.computeSortMergeCPUCost(cardinalities, join.getSortedInputs()); + // 3. IO cost = cost of writing intermediary results to local FS + + // cost of reading from local FS for transferring to join + + // cost of transferring map outputs to Join operator + final Double leftRAverageSize = RelMetadataQuery.getAverageRowSize(join.getLeft()); + final Double rightRAverageSize = RelMetadataQuery.getAverageRowSize(join.getRight()); + if (leftRAverageSize == null || rightRAverageSize == null) { + return null; + } + ImmutableList> relationInfos = new ImmutableList.Builder>(). + add(new Pair(leftRCount,leftRAverageSize)). + add(new Pair(rightRCount,rightRAverageSize)). + build(); + final double ioCost = HiveCostUtil.computeSortMergeIOCost(relationInfos); + // 4. Result + return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost); + } + + private static RelOptCost computeCostMapJoin(HiveJoin join) { + // 1. Sum of input cardinalities + final Double leftRCount = RelMetadataQuery.getRowCount(join.getLeft()); + final Double rightRCount = RelMetadataQuery.getRowCount(join.getRight()); + if (leftRCount == null || rightRCount == null) { + return null; + } + final double rCount = leftRCount + rightRCount; + // 2. CPU cost = HashTable construction cost + + // join cost + ImmutableList cardinalities = new ImmutableList.Builder(). + add(leftRCount). + add(rightRCount). 
+ build(); + ImmutableBitSet.Builder streamingBuilder = new ImmutableBitSet.Builder(); + switch (join.getMapJoinStreamingSide()) { + case LEFT_RELATION: + streamingBuilder.set(0); + break; + case RIGHT_RELATION: + streamingBuilder.set(1); + break; + default: + return null; + } + ImmutableBitSet streaming = streamingBuilder.build(); + final double cpuCost = HiveCostUtil.computeMapJoinCPUCost(cardinalities, streaming); + // 3. IO cost = cost of transferring small tables to join node * + // degree of parallelism + final Double leftRAverageSize = RelMetadataQuery.getAverageRowSize(join.getLeft()); + final Double rightRAverageSize = RelMetadataQuery.getAverageRowSize(join.getRight()); + if (leftRAverageSize == null || rightRAverageSize == null) { + return null; + } + ImmutableList> relationInfos = new ImmutableList.Builder>(). + add(new Pair(leftRCount,leftRAverageSize)). + add(new Pair(rightRCount,rightRAverageSize)). + build(); + final int parallelism = RelMetadataQuery.splitCount(join) == null + ? 1 : RelMetadataQuery.splitCount(join); + final double ioCost = HiveCostUtil.computeMapJoinIOCost(relationInfos, streaming, parallelism); + // 4. Result + return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost); + } + + private static RelOptCost computeCostBucketJoin(HiveJoin join) { + // 1. Sum of input cardinalities + final Double leftRCount = RelMetadataQuery.getRowCount(join.getLeft()); + final Double rightRCount = RelMetadataQuery.getRowCount(join.getRight()); + if (leftRCount == null || rightRCount == null) { + return null; + } + final double rCount = leftRCount + rightRCount; + // 2. CPU cost = HashTable construction cost + + // join cost + ImmutableList cardinalities = new ImmutableList.Builder(). + add(leftRCount). + add(rightRCount). + build(); + ImmutableBitSet.Builder streamingBuilder = new ImmutableBitSet.Builder(); + switch (join.getMapJoinStreamingSide()) { + case LEFT_RELATION: + streamingBuilder.set(0); + break; + case RIGHT_RELATION: + streamingBuilder.set(1); + break; + default: + return null; + } + ImmutableBitSet streaming = streamingBuilder.build(); + final double cpuCost = HiveCostUtil.computeBucketMapJoinCPUCost(cardinalities, streaming); + // 3. IO cost = cost of transferring small tables to join node * + // degree of parallelism + final Double leftRAverageSize = RelMetadataQuery.getAverageRowSize(join.getLeft()); + final Double rightRAverageSize = RelMetadataQuery.getAverageRowSize(join.getRight()); + if (leftRAverageSize == null || rightRAverageSize == null) { + return null; + } + ImmutableList> relationInfos = new ImmutableList.Builder>(). + add(new Pair(leftRCount,leftRAverageSize)). + add(new Pair(rightRCount,rightRAverageSize)). + build(); + final int parallelism = RelMetadataQuery.splitCount(join) == null + ? 1 : RelMetadataQuery.splitCount(join); + final double ioCost = HiveCostUtil.computeBucketMapJoinIOCost(relationInfos, streaming, parallelism); + // 4. Result + return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost); + } + + private static RelOptCost computeCostSMBJoin(HiveJoin join) { + // 1. Sum of input cardinalities + final Double leftRCount = RelMetadataQuery.getRowCount(join.getLeft()); + final Double rightRCount = RelMetadataQuery.getRowCount(join.getRight()); + if (leftRCount == null || rightRCount == null) { + return null; + } + final double rCount = leftRCount + rightRCount; + // 2. CPU cost = HashTable construction cost + + // join cost + ImmutableList cardinalities = new ImmutableList.Builder(). + add(leftRCount). + add(rightRCount). 
+ build(); + ImmutableBitSet.Builder streamingBuilder = new ImmutableBitSet.Builder(); + switch (join.getMapJoinStreamingSide()) { + case LEFT_RELATION: + streamingBuilder.set(0); + break; + case RIGHT_RELATION: + streamingBuilder.set(1); + break; + default: + return null; + } + ImmutableBitSet streaming = streamingBuilder.build(); + final double cpuCost = HiveCostUtil.computeSMBMapJoinCPUCost(cardinalities); + // 3. IO cost = cost of transferring small tables to join node * + // degree of parallelism + final Double leftRAverageSize = RelMetadataQuery.getAverageRowSize(join.getLeft()); + final Double rightRAverageSize = RelMetadataQuery.getAverageRowSize(join.getRight()); + if (leftRAverageSize == null || rightRAverageSize == null) { + return null; + } + ImmutableList> relationInfos = new ImmutableList.Builder>(). + add(new Pair(leftRCount,leftRAverageSize)). + add(new Pair(rightRCount,rightRAverageSize)). + build(); + final int parallelism = RelMetadataQuery.splitCount(join) == null + ? 1 : RelMetadataQuery.splitCount(join); + final double ioCost = HiveCostUtil.computeSMBMapJoinIOCost(relationInfos, streaming, parallelism); + // 4. Result + return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java (revision 1672450) @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.cost; + +import org.apache.calcite.plan.RelOptCost; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.metadata.ChainedRelMetadataProvider; +import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; +import org.apache.calcite.rel.metadata.RelMdPercentageOriginalRows; +import org.apache.calcite.rel.metadata.RelMetadataProvider; +import org.apache.calcite.util.BuiltInMethod; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; + +import com.google.common.collect.ImmutableList; + +/** + * HiveRelMdCost supplies the implementation of cost model. 
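getJoinCost in HiveCostModel, shown earlier, asks the engine-specific model which algorithms are executable for a join, prices each one, keeps the cheapest, and records the chosen algorithm and cost on the HiveJoin. A simplified sketch of that selection loop, using plain double costs instead of RelOptCost (names and figures are illustrative):

import java.util.EnumMap;
import java.util.Map;

public class JoinAlgorithmSelectionSketch {
  enum JoinAlgorithm { COMMON_JOIN, MAP_JOIN, BUCKET_JOIN, SMB_JOIN }

  // Assumed selection rule, mirroring HiveCostModel.getJoinCost: among the
  // executable algorithms, keep the one with the lowest cost.
  static JoinAlgorithm cheapest(Map<JoinAlgorithm, Double> costByAlgorithm) {
    JoinAlgorithm best = null;
    Double bestCost = null;
    for (Map.Entry<JoinAlgorithm, Double> e : costByAlgorithm.entrySet()) {
      if (e.getValue() == null) {
        continue;                        // algorithm not defined for this engine
      }
      if (bestCost == null || e.getValue() < bestCost) {
        best = e.getKey();
        bestCost = e.getValue();
      }
    }
    return best;
  }

  public static void main(String[] args) {
    Map<JoinAlgorithm, Double> costs = new EnumMap<>(JoinAlgorithm.class);
    costs.put(JoinAlgorithm.COMMON_JOIN, 4_000d);  // illustrative cost figures
    costs.put(JoinAlgorithm.MAP_JOIN, 1_500d);
    System.out.println(cheapest(costs));           // MAP_JOIN
  }
}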
+ */ +public class HiveRelMdCost { + + private final HiveCostModel hiveCostModel; + + public HiveRelMdCost(HiveCostModel hiveCostModel) { + this.hiveCostModel = hiveCostModel; + } + + public RelMetadataProvider getMetadataProvider() { + return ChainedRelMetadataProvider.of( + ImmutableList.of( + ReflectiveRelMetadataProvider.reflectiveSource(this, + BuiltInMethod.NON_CUMULATIVE_COST.method), + RelMdPercentageOriginalRows.SOURCE)); + } + + public RelOptCost getNonCumulativeCost(HiveAggregate aggregate) { + return hiveCostModel.getAggregateCost(aggregate); + } + + public RelOptCost getNonCumulativeCost(HiveJoin join) { + return hiveCostModel.getJoinCost(join); + } + + // Default case + public RelOptCost getNonCumulativeCost(RelNode rel) { + return hiveCostModel.getDefaultCost(); + } + +} + +// End HiveRelMdCost.java Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java (revision 1672450) @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.cost; + +import java.util.EnumSet; + +import org.apache.calcite.plan.RelOptCost; +import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; + +/** + * Default implementation of the cost model. + * Currently used by MR and Spark execution engines. 
+ */ +public class HiveDefaultCostModel extends HiveCostModel { + + @Override + public RelOptCost getDefaultCost() { + return HiveCost.FACTORY.makeZeroCost(); + } + + @Override + public RelOptCost getAggregateCost(HiveAggregate aggregate) { + return HiveCost.FACTORY.makeZeroCost(); + } + + @Override + protected EnumSet getExecutableJoinAlgorithms(HiveJoin join) { + return EnumSet.of(JoinAlgorithm.NONE); + } + + @Override + protected RelOptCost getJoinCost(HiveJoin join, JoinAlgorithm algorithm) { + RelOptCost algorithmCost; + switch (algorithm) { + case NONE: + algorithmCost = computeJoinCardinalityCost(join); + break; + default: + algorithmCost = null; + } + return algorithmCost; + } + + private static RelOptCost computeJoinCardinalityCost(HiveJoin join) { + double leftRCount = RelMetadataQuery.getRowCount(join.getLeft()); + double rightRCount = RelMetadataQuery.getRowCount(join.getRight()); + return HiveCost.FACTORY.makeCost(leftRCount + rightRCount, 0.0, 0.0); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostUtil.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostUtil.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostUtil.java (working copy) @@ -18,26 +18,160 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.cost; import org.apache.calcite.plan.RelOptCost; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.calcite.util.Pair; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; +import com.google.common.collect.ImmutableList; + // Use this once we have Join Algorithm selection public class HiveCostUtil { - private static final double cpuCostInNanoSec = 1.0; - private static final double netCostInNanoSec = 150 * cpuCostInNanoSec; - private static final double localFSWriteCostInNanoSec = 4 * netCostInNanoSec; - private static final double localFSReadCostInNanoSec = 4 * netCostInNanoSec; - private static final double hDFSWriteCostInNanoSec = 10 * localFSWriteCostInNanoSec; - @SuppressWarnings("unused") -//Use this once we have Join Algorithm selection - private static final double hDFSReadCostInNanoSec = 1.5 * localFSReadCostInNanoSec; + private static final double CPU_COST = 1.0; + private static final double NET_COST = 150.0 * CPU_COST; + private static final double LOCAL_WRITE_COST = 4.0 * NET_COST; + private static final double LOCAL_READ_COST = 4.0 * NET_COST; + private static final double HDFS_WRITE_COST = 10.0 * LOCAL_WRITE_COST; + private static final double HDFS_READ_COST = 1.5 * LOCAL_READ_COST; + public static RelOptCost computCardinalityBasedCost(HiveRelNode hr) { return new HiveCost(hr.getRows(), 0, 0); } public static HiveCost computeCost(HiveTableScan t) { double cardinality = t.getRows(); - return new HiveCost(cardinality, 0, hDFSWriteCostInNanoSec * cardinality * 0); + return new HiveCost(cardinality, 0, HDFS_WRITE_COST * cardinality * 0); } + + public static double computeSortMergeCPUCost( + ImmutableList cardinalities, + ImmutableBitSet sorted) { + // Sort-merge join + double cpuCost = 0.0; + for (int i=0; i> relationInfos) { + // Sort-merge join + double ioCost = 0.0; + for (Pair relationInfo : relationInfos) { + ioCost += computeSortIOCost(relationInfo); + } + return ioCost; + } + + public static double 
computeSortIOCost(Pair relationInfo) { + // Sort-merge join + double ioCost = 0.0; + double cardinality = relationInfo.left; + double averageTupleSize = relationInfo.right; + // Write cost + ioCost += cardinality * averageTupleSize * LOCAL_WRITE_COST; + // Read cost + ioCost += cardinality * averageTupleSize * LOCAL_READ_COST; + // Net transfer cost + ioCost += cardinality * averageTupleSize * NET_COST; + return ioCost; + } + + public static double computeMapJoinCPUCost( + ImmutableList cardinalities, + ImmutableBitSet streaming) { + // Hash-join + double cpuCost = 0.0; + for (int i=0; i> relationInfos, + ImmutableBitSet streaming, int parallelism) { + // Hash-join + double ioCost = 0.0; + for (int i=0; i cardinalities, + ImmutableBitSet streaming) { + // Hash-join + double cpuCost = 0.0; + for (int i=0; i> relationInfos, + ImmutableBitSet streaming, int parallelism) { + // Hash-join + double ioCost = 0.0; + for (int i=0; i cardinalities) { + // Hash-join + double cpuCost = 0.0; + for (int i=0; i> relationInfos, + ImmutableBitSet streaming, int parallelism) { + // Hash-join + double ioCost = 0.0; + for (int i=0; i hiveNonPartitionCols; + private final ImmutableList hivePartitionCols; private final ImmutableMap hiveNonPartitionColsMap; private final ImmutableMap hivePartitionColsMap; - private final int noOfProjs; + private final ImmutableList hiveVirtualCols; + private final int noOfNonVirtualCols; final HiveConf hiveConf; private double rowCount = -1; @@ -67,37 +77,65 @@ PrunedPartitionList partitionList; Map partitionCache; AtomicInteger noColsMissingStats; + private final String qbID; protected static final Log LOG = LogFactory .getLog(RelOptHiveTable.class .getName()); - public RelOptHiveTable(RelOptSchema calciteSchema, String qualifiedTblName, String tblAlias, RelDataType rowType, - Table hiveTblMetadata, List hiveNonPartitionCols, - List hivePartitionCols, HiveConf hconf, Map partitionCache, AtomicInteger noColsMissingStats) { + public RelOptHiveTable(RelOptSchema calciteSchema, String qualifiedTblName, + RelDataType rowType, Table hiveTblMetadata, List hiveNonPartitionCols, + List hivePartitionCols, List hiveVirtualCols, HiveConf hconf, + Map partitionCache, AtomicInteger noColsMissingStats, + String qbID) { super(calciteSchema, qualifiedTblName, rowType); this.hiveTblMetadata = hiveTblMetadata; - this.tblAlias = tblAlias; this.hiveNonPartitionCols = ImmutableList.copyOf(hiveNonPartitionCols); - this.hiveNonPartitionColsMap = getColInfoMap(hiveNonPartitionCols, 0); - this.hivePartitionColsMap = getColInfoMap(hivePartitionCols, hiveNonPartitionColsMap.size()); - this.noOfProjs = hiveNonPartitionCols.size() + hivePartitionCols.size(); + this.hiveNonPartitionColsMap = HiveCalciteUtil.getColInfoMap(hiveNonPartitionCols, 0); + this.hivePartitionCols = ImmutableList.copyOf(hivePartitionCols); + this.hivePartitionColsMap = HiveCalciteUtil.getColInfoMap(hivePartitionCols, hiveNonPartitionColsMap.size()); + this.noOfNonVirtualCols = hiveNonPartitionCols.size() + hivePartitionCols.size(); + this.hiveVirtualCols = ImmutableList.copyOf(hiveVirtualCols); this.hiveConf = hconf; this.partitionCache = partitionCache; this.noColsMissingStats = noColsMissingStats; + this.qbID = qbID; } - private static ImmutableMap getColInfoMap(List hiveCols, - int startIndx) { - Builder bldr = ImmutableMap. builder(); + public RelOptHiveTable copy(RelDataType newRowType) { + // 1. 
Build map of column name to col index of original schema + // Assumption: Hive Table can not contain duplicate column names + Map nameToColIndxMap = new HashMap(); + for (RelDataTypeField f : this.rowType.getFieldList()) { + nameToColIndxMap.put(f.getName(), f.getIndex()); + } - int indx = startIndx; - for (ColumnInfo ci : hiveCols) { - bldr.put(indx, ci); - indx++; + // 2. Build nonPart/Part/Virtual column info for new RowSchema + List newHiveNonPartitionCols = new ArrayList(); + List newHivePartitionCols = new ArrayList(); + List newHiveVirtualCols = new ArrayList(); + Map virtualColInfoMap = HiveCalciteUtil.getVColsMap(this.hiveVirtualCols, + this.noOfNonVirtualCols); + Integer originalColIndx; + ColumnInfo cInfo; + VirtualColumn vc; + for (RelDataTypeField f : newRowType.getFieldList()) { + originalColIndx = nameToColIndxMap.get(f.getName()); + if ((cInfo = hiveNonPartitionColsMap.get(originalColIndx)) != null) { + newHiveNonPartitionCols.add(new ColumnInfo(cInfo)); + } else if ((cInfo = hivePartitionColsMap.get(originalColIndx)) != null) { + newHivePartitionCols.add(new ColumnInfo(cInfo)); + } else if ((vc = virtualColInfoMap.get(originalColIndx)) != null) { + newHiveVirtualCols.add(vc); + } else { + throw new RuntimeException("Copy encountered a column not seen in original TS"); } + } - return bldr.build(); + // 3. Build new Table + return new RelOptHiveTable(this.schema, this.name, newRowType, + this.hiveTblMetadata, newHiveNonPartitionCols, newHivePartitionCols, newHiveVirtualCols, + this.hiveConf, this.partitionCache, this.noColsMissingStats, qbID); } @Override @@ -116,16 +154,57 @@ } @Override + public List getCollationList() { + ImmutableList.Builder collationList = new ImmutableList.Builder(); + for (Order sortColumn : this.hiveTblMetadata.getSortCols()) { + for (int i=0; i() + .add(RelCollationTraitDef.INSTANCE.canonize( + new HiveRelCollation(collationList.build()))) + .build(); + } + + @Override + public RelDistribution getDistribution() { + ImmutableList.Builder columnPositions = new ImmutableList.Builder(); + for (String bucketColumn : this.hiveTblMetadata.getBucketCols()) { + for (int i=0; i rowCounts = StatsUtils.getBasicStatForPartitions( - hiveTblMetadata, partitionList.getNotDeniedPartns(), - StatsSetupConst.ROW_COUNT); + List rowCounts = StatsUtils.getBasicStatForPartitions(hiveTblMetadata, + partitionList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT); rowCount = StatsUtils.getSumIgnoreNegatives(rowCounts); } else { @@ -143,19 +222,6 @@ return hiveTblMetadata; } - public String getTableAlias() { - // NOTE: Calcite considers tbls to be equal if their names are the same. Hence - // we need to provide Calcite the fully qualified table name (dbname.tblname) - // and not the user provided aliases. - // However in HIVE DB name can not appear in select list; in case of join - // where table names differ only in DB name, Hive would require user - // introducing explicit aliases for tbl. - if (tblAlias == null) - return hiveTblMetadata.getTableName(); - else - return tblAlias; - } - private String getColNamesForLogging(Set colLst) { StringBuffer sb = new StringBuffer(); boolean firstEntry = true; @@ -173,16 +239,21 @@ public void computePartitionList(HiveConf conf, RexNode pruneNode) { try { - if (!hiveTblMetadata.isPartitioned() || pruneNode == null || InputFinder.bits(pruneNode).length() == 0 ) { - // there is no predicate on partitioning column, we need all partitions in this case. 
- partitionList = PartitionPruner.prune(hiveTblMetadata, null, conf, getName(), partitionCache); + if (!hiveTblMetadata.isPartitioned() || pruneNode == null + || InputFinder.bits(pruneNode).length() == 0) { + // there is no predicate on partitioning column, we need all partitions + // in this case. + partitionList = PartitionPruner.prune(hiveTblMetadata, null, conf, getName(), + partitionCache); return; } // We have valid pruning expressions, only retrieve qualifying partitions - ExprNodeDesc pruneExpr = pruneNode.accept(new ExprNodeConverter(getName(), getRowType(), true, getRelOptSchema().getTypeFactory())); + ExprNodeDesc pruneExpr = pruneNode.accept(new ExprNodeConverter(getName(), getRowType(), + true, this.getRelOptSchema().getTypeFactory())); - partitionList = PartitionPruner.prune(hiveTblMetadata, pruneExpr, conf, getName(), partitionCache); + partitionList = PartitionPruner.prune(hiveTblMetadata, pruneExpr, conf, getName(), + partitionCache); } catch (HiveException he) { throw new RuntimeException(he); } @@ -289,10 +360,10 @@ if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) { ColStatistics cStats = null; for (int i = 0; i < partColNamesThatRqrStats.size(); i++) { - cStats = new ColStatistics(hiveTblMetadata.getTableName(), - partColNamesThatRqrStats.get(i), hivePartitionColsMap.get( - partColIndxsThatRqrStats.get(i)).getTypeName()); - cStats.setCountDistint(getDistinctCount(partitionList.getPartitions(),partColNamesThatRqrStats.get(i))); + cStats = new ColStatistics(hiveTblMetadata.getTableName(), partColNamesThatRqrStats.get(i), + hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)).getTypeName()); + cStats.setCountDistint(getDistinctCount(partitionList.getPartitions(), + partColNamesThatRqrStats.get(i))); hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats); } } @@ -325,7 +396,7 @@ } } else { List pILst = new ArrayList(); - for (Integer i = 0; i < noOfProjs; i++) { + for (Integer i = 0; i < noOfNonVirtualCols; i++) { pILst.add(i); } updateColStats(new HashSet(pILst)); @@ -338,10 +409,8 @@ } /* - * use to check if a set of columns are all partition columns. - * true only if: - * - all columns in BitSet are partition - * columns. + * use to check if a set of columns are all partition columns. true only if: - + * all columns in BitSet are partition columns. */ public boolean containsPartitionColumnsOnly(ImmutableBitSet cols) { @@ -352,4 +421,32 @@ } return true; } + + public List getVirtualCols() { + return this.hiveVirtualCols; } + + public List getPartColumns() { + return this.hivePartitionCols; + } + + public List getNonPartColumns() { + return this.hiveNonPartitionCols; + } + + public String getQBID() { + return qbID; + } + + public int getNoOfNonVirtualCols() { + return noOfNonVirtualCols; + } + + public Map getPartColInfoMap() { + return hivePartitionColsMap; + } + + public Map getNonPartColInfoMap() { + return hiveNonPartitionColsMap; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java (revision 1672450) @@ -0,0 +1,891 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.calcite.translator; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelCollations; +import org.apache.calcite.rel.RelDistribution; +import org.apache.calcite.rel.RelDistribution.Type; +import org.apache.calcite.rel.RelFieldCollation; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.SemiJoin; +import org.apache.calcite.rel.logical.LogicalExchange; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexLiteral; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.util.Pair; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.LimitOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.AcidUtils.Operation; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion; +import org.apache.hadoop.hive.ql.parse.JoinCond; +import org.apache.hadoop.hive.ql.parse.JoinType; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression; +import 
org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression; +import org.apache.hadoop.hive.ql.parse.PTFTranslator; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.UnparseTranslator; +import org.apache.hadoop.hive.ql.parse.WindowingComponentizer; +import org.apache.hadoop.hive.ql.parse.WindowingSpec; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.JoinCondDesc; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.LimitDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PTFDesc; +import org.apache.hadoop.hive.ql.plan.PlanUtils; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.plan.UnionDesc; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; + +public class HiveOpConverter { + + private static final Log LOG = LogFactory.getLog(HiveOpConverter.class); + + public static enum HIVEAGGOPMODE { + NO_SKEW_NO_MAP_SIDE_AGG, // Corresponds to SemAnalyzer genGroupByPlan1MR + SKEW_NO_MAP_SIDE_AGG, // Corresponds to SemAnalyzer genGroupByPlan2MR + NO_SKEW_MAP_SIDE_AGG, // Corresponds to SemAnalyzer + // genGroupByPlanMapAggrNoSkew + SKEW_MAP_SIDE_AGG // Corresponds to SemAnalyzer genGroupByPlanMapAggr2MR + }; + + // TODO: remove this after stashing only rqd pieces from opconverter + private final SemanticAnalyzer semanticAnalyzer; + private final HiveConf hiveConf; + private final UnparseTranslator unparseTranslator; + private final Map> topOps; + private final boolean strictMode; + private int reduceSinkTagGenerator; + + public HiveOpConverter(SemanticAnalyzer semanticAnalyzer, HiveConf hiveConf, + UnparseTranslator unparseTranslator, Map> topOps, + boolean strictMode) { + this.semanticAnalyzer = semanticAnalyzer; + this.hiveConf = hiveConf; + this.unparseTranslator = unparseTranslator; + this.topOps = topOps; + this.strictMode = strictMode; + this.reduceSinkTagGenerator = 0; + } + + static class OpAttr { + final String tabAlias; + ImmutableList inputs; + ImmutableMap vcolMap; + + OpAttr(String tabAlias, Map vcolMap, Operator... inputs) { + this.tabAlias = tabAlias; + this.vcolMap = ImmutableMap.copyOf(vcolMap); + this.inputs = ImmutableList.copyOf(inputs); + } + + private OpAttr clone(Operator... 
inputs) { + return new OpAttr(tabAlias, this.vcolMap, inputs); + } + } + + public Operator convert(RelNode root) throws SemanticException { + OpAttr opAf = dispatch(root); + return opAf.inputs.get(0); + } + + OpAttr dispatch(RelNode rn) throws SemanticException { + if (rn instanceof HiveTableScan) { + return visit((HiveTableScan) rn); + } else if (rn instanceof HiveProject) { + return visit((HiveProject) rn); + } else if (rn instanceof HiveJoin) { + return visit((HiveJoin) rn); + } else if (rn instanceof SemiJoin) { + SemiJoin sj = (SemiJoin) rn; + HiveJoin hj = HiveJoin.getJoin(sj.getCluster(), sj.getLeft(), sj.getRight(), + sj.getCondition(), sj.getJoinType(), true); + return visit(hj); + } else if (rn instanceof HiveFilter) { + return visit((HiveFilter) rn); + } else if (rn instanceof HiveSort) { + return visit((HiveSort) rn); + } else if (rn instanceof HiveUnion) { + return visit((HiveUnion) rn); + } else if (rn instanceof LogicalExchange) { + return visit((LogicalExchange) rn); + } else if (rn instanceof HiveAggregate) { + return visit((HiveAggregate) rn); + } + LOG.error(rn.getClass().getCanonicalName() + "operator translation not supported" + + " yet in return path."); + return null; + } + + /** + * TODO: 1. PPD needs to get pushed in to TS + * + * @param scanRel + * @return + */ + OpAttr visit(HiveTableScan scanRel) { + + if (LOG.isDebugEnabled()) { + LOG.debug("Translating operator rel#" + scanRel.getId() + ":" + scanRel.getRelTypeName() + + " with row type: [" + scanRel.getRowType() + "]"); + } + + RelOptHiveTable ht = (RelOptHiveTable) scanRel.getTable(); + + // 1. Setup TableScan Desc + // 1.1 Build col details used by scan + ArrayList colInfos = new ArrayList(); + List virtualCols = new ArrayList(ht.getVirtualCols()); + Map hiveScanVColMap = new HashMap(); + List partColNames = new ArrayList(); + List neededColumnIDs = new ArrayList(); + List neededColumns = new ArrayList(); + + Map posToVColMap = HiveCalciteUtil.getVColsMap(virtualCols, + ht.getNoOfNonVirtualCols()); + Map posToPartColInfo = ht.getPartColInfoMap(); + Map posToNonPartColInfo = ht.getNonPartColInfoMap(); + List neededColIndxsFrmReloptHT = scanRel.getNeededColIndxsFrmReloptHT(); + List scanColNames = scanRel.getRowType().getFieldNames(); + String tableAlias = scanRel.getTableAlias(); + + String colName; + ColumnInfo colInfo; + VirtualColumn vc; + Integer posInRHT; + + for (int i = 0; i < neededColIndxsFrmReloptHT.size(); i++) { + colName = scanColNames.get(i); + posInRHT = neededColIndxsFrmReloptHT.get(i); + if (posToVColMap.containsKey(posInRHT)) { + vc = posToVColMap.get(posInRHT); + virtualCols.add(vc); + colInfo = new ColumnInfo(vc.getName(), vc.getTypeInfo(), tableAlias, true, vc.getIsHidden()); + hiveScanVColMap.put(i, vc); + } else if (posToPartColInfo.containsKey(posInRHT)) { + partColNames.add(colName); + colInfo = posToPartColInfo.get(posInRHT); + } else { + colInfo = posToNonPartColInfo.get(posInRHT); + } + neededColumnIDs.add(posInRHT); + neededColumns.add(colName); + colInfos.add(colInfo); + } + + // 1.2 Create TableScanDesc + TableScanDesc tsd = new TableScanDesc(tableAlias, virtualCols, ht.getHiveTableMD()); + + // 1.3. Set Partition cols in TSDesc + tsd.setPartColumns(partColNames); + + // 1.4. Set needed cols in TSDesc + tsd.setNeededColumnIDs(neededColumnIDs); + tsd.setNeededColumns(neededColumns); + + // 2. 
Setup TableScan + TableScanOperator ts = (TableScanOperator) OperatorFactory.get(tsd, new RowSchema(colInfos)); + + topOps.put(ht.getQBID(), ts); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + ts + " with row schema: [" + ts.getSchema() + "]"); + } + + return new OpAttr(tableAlias, hiveScanVColMap, ts); + } + + OpAttr visit(HiveProject projectRel) throws SemanticException { + OpAttr inputOpAf = dispatch(projectRel.getInput()); + + if (LOG.isDebugEnabled()) { + LOG.debug("Translating operator rel#" + projectRel.getId() + ":" + + projectRel.getRelTypeName() + " with row type: [" + projectRel.getRowType() + "]"); + } + + WindowingSpec windowingSpec = new WindowingSpec(); + List exprCols = new ArrayList(); + for (int pos = 0; pos < projectRel.getChildExps().size(); pos++) { + ExprNodeConverter converter = new ExprNodeConverter(inputOpAf.tabAlias, projectRel + .getRowType().getFieldNames().get(pos), projectRel.getInput().getRowType(), + projectRel.getRowType(), false, projectRel.getCluster().getTypeFactory()); + exprCols.add(projectRel.getChildExps().get(pos).accept(converter)); + if (converter.getWindowFunctionSpec() != null) { + windowingSpec.addWindowFunction(converter.getWindowFunctionSpec()); + } + } + if (windowingSpec.getWindowExpressions() != null + && !windowingSpec.getWindowExpressions().isEmpty()) { + inputOpAf = genPTF(inputOpAf, windowingSpec); + } + // TODO: is this a safe assumption (name collision, external names...) + List exprNames = new ArrayList(projectRel.getRowType().getFieldNames()); + SelectDesc sd = new SelectDesc(exprCols, exprNames); + Pair, Map> colInfoVColPair = createColInfos( + projectRel.getChildExps(), exprCols, exprNames, inputOpAf); + SelectOperator selOp = (SelectOperator) OperatorFactory.getAndMakeChild(sd, new RowSchema( + colInfoVColPair.getKey()), inputOpAf.inputs.get(0)); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + selOp + " with row schema: [" + selOp.getSchema() + "]"); + } + + return new OpAttr(inputOpAf.tabAlias, colInfoVColPair.getValue(), selOp); + } + + OpAttr visit(HiveJoin joinRel) throws SemanticException { + // 1. Convert inputs + OpAttr[] inputs = new OpAttr[joinRel.getInputs().size()]; + List> children = new ArrayList>(joinRel.getInputs().size()); + for (int i = 0; i < inputs.length; i++) { + inputs[i] = dispatch(joinRel.getInput(i)); + children.add(inputs[i].inputs.get(0)); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Translating operator rel#" + joinRel.getId() + ":" + joinRel.getRelTypeName() + + " with row type: [" + joinRel.getRowType() + "]"); + } + + // 2. Convert join condition + JoinPredicateInfo joinPredInfo = JoinPredicateInfo.constructJoinPredicateInfo(joinRel); + + // 3. Extract join keys from condition + ExprNodeDesc[][] joinKeys = extractJoinKeys(joinPredInfo, joinRel.getInputs()); + + // 4. Generate Join operator + JoinOperator joinOp = genJoin(joinRel, joinPredInfo, children, joinKeys); + + // 5. TODO: Extract condition for non-equi join elements (if any) and + // add it + + // 6. Virtual columns + Map vcolMap = new HashMap(); + vcolMap.putAll(inputs[0].vcolMap); + if (extractJoinType(joinRel) != JoinType.LEFTSEMI) { + int shift = inputs[0].inputs.get(0).getSchema().getSignature().size(); + for (int i = 1; i < inputs.length; i++) { + vcolMap.putAll(HiveCalciteUtil.shiftVColsMap(inputs[i].vcolMap, shift)); + shift += inputs[i].inputs.get(0).getSchema().getSignature().size(); + } + } + + // 8. 
Return result + return new OpAttr(null, vcolMap, joinOp); + } + + OpAttr visit(HiveAggregate aggRel) throws SemanticException { + OpAttr inputOpAf = dispatch(aggRel.getInput()); + return HiveGBOpConvUtil.translateGB(inputOpAf, aggRel, hiveConf); + } + + OpAttr visit(HiveSort sortRel) throws SemanticException { + OpAttr inputOpAf = dispatch(sortRel.getInput()); + + if (LOG.isDebugEnabled()) { + LOG.debug("Translating operator rel#" + sortRel.getId() + ":" + sortRel.getRelTypeName() + + " with row type: [" + sortRel.getRowType() + "]"); + if (sortRel.getCollation() == RelCollations.EMPTY) { + LOG.debug("Operator rel#" + sortRel.getId() + ":" + sortRel.getRelTypeName() + + " consists of limit"); + } else if (sortRel.fetch == null) { + LOG.debug("Operator rel#" + sortRel.getId() + ":" + sortRel.getRelTypeName() + + " consists of sort"); + } else { + LOG.debug("Operator rel#" + sortRel.getId() + ":" + sortRel.getRelTypeName() + + " consists of sort+limit"); + } + } + + Operator inputOp = inputOpAf.inputs.get(0); + Operator resultOp = inputOpAf.inputs.get(0); + // 1. If we need to sort tuples based on the value of some + // of their columns + if (sortRel.getCollation() != RelCollations.EMPTY) { + + // In strict mode, in the presence of order by, limit must be + // specified + if (strictMode && sortRel.fetch == null) { + throw new SemanticException(ErrorMsg.NO_LIMIT_WITH_ORDERBY.getMsg()); + } + + // 1.a. Extract order for each column from collation + // Generate sortCols and order + List sortCols = new ArrayList(); + StringBuilder order = new StringBuilder(); + for (RelCollation collation : sortRel.getCollationList()) { + for (RelFieldCollation sortInfo : collation.getFieldCollations()) { + int sortColumnPos = sortInfo.getFieldIndex(); + ColumnInfo columnInfo = new ColumnInfo(inputOp.getSchema().getSignature() + .get(sortColumnPos)); + ExprNodeColumnDesc sortColumn = new ExprNodeColumnDesc(columnInfo.getType(), + columnInfo.getInternalName(), columnInfo.getTabAlias(), columnInfo.getIsVirtualCol()); + sortCols.add(sortColumn); + if (sortInfo.getDirection() == RelFieldCollation.Direction.DESCENDING) { + order.append("-"); + } else { + order.append("+"); + } + } + } + // Use only 1 reducer for order by + int numReducers = 1; + + // 1.b. Generate reduce sink and project operator + resultOp = genReduceSinkAndBacktrackSelect(resultOp, + sortCols.toArray(new ExprNodeDesc[sortCols.size()]), -1, new ArrayList(), + order.toString(), numReducers, Operation.NOT_ACID, strictMode); + } + + // 2. If we need to generate limit + if (sortRel.fetch != null) { + int limit = RexLiteral.intValue(sortRel.fetch); + LimitDesc limitDesc = new LimitDesc(limit); + // TODO: Set 'last limit' global property + ArrayList cinfoLst = createColInfos(inputOp); + resultOp = OperatorFactory.getAndMakeChild(limitDesc, + new RowSchema(cinfoLst), resultOp); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + resultOp + " with row schema: [" + resultOp.getSchema() + "]"); + } + } + + // 3. 
Return result + return inputOpAf.clone(resultOp); + } + + /** + * TODO: 1) isSamplingPred 2) sampleDesc 3) isSortedFilter + */ + OpAttr visit(HiveFilter filterRel) throws SemanticException { + OpAttr inputOpAf = dispatch(filterRel.getInput()); + + if (LOG.isDebugEnabled()) { + LOG.debug("Translating operator rel#" + filterRel.getId() + ":" + filterRel.getRelTypeName() + + " with row type: [" + filterRel.getRowType() + "]"); + } + + ExprNodeDesc filCondExpr = filterRel.getCondition().accept( + new ExprNodeConverter(inputOpAf.tabAlias, filterRel.getInput().getRowType(), false, + filterRel.getCluster().getTypeFactory())); + FilterDesc filDesc = new FilterDesc(filCondExpr, false); + ArrayList cinfoLst = createColInfos(inputOpAf.inputs.get(0)); + FilterOperator filOp = (FilterOperator) OperatorFactory.getAndMakeChild(filDesc, new RowSchema( + cinfoLst), inputOpAf.inputs.get(0)); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + filOp + " with row schema: [" + filOp.getSchema() + "]"); + } + + return inputOpAf.clone(filOp); + } + + OpAttr visit(HiveUnion unionRel) throws SemanticException { + // 1. Convert inputs + OpAttr[] inputs = new OpAttr[unionRel.getInputs().size()]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = dispatch(unionRel.getInput(i)); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Translating operator rel#" + unionRel.getId() + ":" + unionRel.getRelTypeName() + + " with row type: [" + unionRel.getRowType() + "]"); + } + + // 2. Create a new union operator + UnionDesc unionDesc = new UnionDesc(); + unionDesc.setNumInputs(inputs.length); + ArrayList cinfoLst = createColInfos(inputs[0].inputs.get(0)); + Operator[] children = new Operator[inputs.length]; + for (int i = 0; i < children.length; i++) { + children[i] = inputs[i].inputs.get(0); + } + Operator unionOp = OperatorFactory.getAndMakeChild(unionDesc, + new RowSchema(cinfoLst), children); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + unionOp + " with row schema: [" + unionOp.getSchema() + "]"); + } + + // 3. 
Return result + return inputs[0].clone(unionOp); + } + + OpAttr visit(LogicalExchange exchangeRel) throws SemanticException { + OpAttr inputOpAf = dispatch(exchangeRel.getInput()); + + if (LOG.isDebugEnabled()) { + LOG.debug("Translating operator rel#" + exchangeRel.getId() + ":" + + exchangeRel.getRelTypeName() + " with row type: [" + exchangeRel.getRowType() + "]"); + } + + RelDistribution distribution = exchangeRel.getDistribution(); + if (distribution.getType() != Type.HASH_DISTRIBUTED) { + throw new SemanticException("Only hash distribution supported for LogicalExchange"); + } + ExprNodeDesc[] expressions = new ExprNodeDesc[distribution.getKeys().size()]; + for (int i = 0; i < distribution.getKeys().size(); i++) { + int key = distribution.getKeys().get(i); + ColumnInfo colInfo = inputOpAf.inputs.get(0).getSchema().getSignature().get(key); + ExprNodeDesc column = new ExprNodeColumnDesc(colInfo); + expressions[i] = column; + } + + ReduceSinkOperator rsOp = genReduceSink(inputOpAf.inputs.get(0), expressions, + reduceSinkTagGenerator++, -1, Operation.NOT_ACID, strictMode); + + return inputOpAf.clone(rsOp); + } + + private OpAttr genPTF(OpAttr inputOpAf, WindowingSpec wSpec) throws SemanticException { + Operator input = inputOpAf.inputs.get(0); + + wSpec.validateAndMakeEffective(); + WindowingComponentizer groups = new WindowingComponentizer(wSpec); + RowResolver rr = new RowResolver(); + for (ColumnInfo ci : input.getSchema().getSignature()) { + rr.put(ci.getTabAlias(), ci.getInternalName(), ci); + } + + while (groups.hasNext()) { + wSpec = groups.next(hiveConf, semanticAnalyzer, unparseTranslator, rr); + + // 1. Create RS and backtrack Select operator on top + ArrayList keyCols = new ArrayList(); + ArrayList partCols = new ArrayList(); + StringBuilder order = new StringBuilder(); + + for (PartitionExpression partCol : wSpec.getQueryPartitionSpec().getExpressions()) { + ExprNodeDesc partExpr = semanticAnalyzer.genExprNodeDesc(partCol.getExpression(), rr); + if (ExprNodeDescUtils.indexOf(partExpr, partCols) < 0) { + keyCols.add(partExpr); + partCols.add(partExpr); + order.append('+'); + } + } + + if (wSpec.getQueryOrderSpec() != null) { + for (OrderExpression orderCol : wSpec.getQueryOrderSpec().getExpressions()) { + ExprNodeDesc orderExpr = semanticAnalyzer.genExprNodeDesc(orderCol.getExpression(), rr); + char orderChar = orderCol.getOrder() == PTFInvocationSpec.Order.ASC ? '+' : '-'; + int index = ExprNodeDescUtils.indexOf(orderExpr, keyCols); + if (index >= 0) { + order.setCharAt(index, orderChar); + continue; + } + keyCols.add(orderExpr); + order.append(orderChar); + } + } + + SelectOperator selectOp = genReduceSinkAndBacktrackSelect(input, + keyCols.toArray(new ExprNodeDesc[keyCols.size()]), reduceSinkTagGenerator++, partCols, + order.toString(), -1, Operation.NOT_ACID, strictMode); + + // 2. Finally create PTF + PTFTranslator translator = new PTFTranslator(); + PTFDesc ptfDesc = translator.translate(wSpec, semanticAnalyzer, hiveConf, rr, + unparseTranslator); + RowResolver ptfOpRR = ptfDesc.getFuncDef().getOutputShape().getRr(); + + Operator ptfOp = OperatorFactory.getAndMakeChild(ptfDesc, + new RowSchema(ptfOpRR.getColumnInfos()), selectOp); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + ptfOp + " with row schema: [" + ptfOp.getSchema() + "]"); + } + + // 3. 
Prepare for next iteration (if any) + rr = ptfOpRR; + input = ptfOp; + } + + return inputOpAf.clone(input); + } + + private ExprNodeDesc[][] extractJoinKeys(JoinPredicateInfo joinPredInfo, List inputs) { + ExprNodeDesc[][] joinKeys = new ExprNodeDesc[inputs.size()][]; + for (int i = 0; i < inputs.size(); i++) { + joinKeys[i] = new ExprNodeDesc[joinPredInfo.getEquiJoinPredicateElements().size()]; + for (int j = 0; j < joinPredInfo.getEquiJoinPredicateElements().size(); j++) { + JoinLeafPredicateInfo joinLeafPredInfo = joinPredInfo.getEquiJoinPredicateElements().get(j); + RexNode key = joinLeafPredInfo.getJoinKeyExprs(j).get(0); + joinKeys[i][j] = convertToExprNode(key, inputs.get(j), null); + } + } + return joinKeys; + } + + private static SelectOperator genReduceSinkAndBacktrackSelect(Operator input, + ExprNodeDesc[] keys, int tag, ArrayList partitionCols, String order, + int numReducers, Operation acidOperation, boolean strictMode) throws SemanticException { + // 1. Generate RS operator + ReduceSinkOperator rsOp = genReduceSink(input, keys, tag, partitionCols, order, numReducers, + acidOperation, strictMode); + + // 2. Generate backtrack Select operator + Map descriptors = buildBacktrackFromReduceSink(rsOp, + input); + SelectDesc selectDesc = new SelectDesc(new ArrayList(descriptors.values()), + new ArrayList(descriptors.keySet())); + ArrayList cinfoLst = createColInfos(input); + SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(selectDesc, + new RowSchema(cinfoLst), rsOp); + selectOp.setColumnExprMap(descriptors); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + selectOp + " with row schema: [" + selectOp.getSchema() + "]"); + } + + return selectOp; + } + + private static ReduceSinkOperator genReduceSink(Operator input, ExprNodeDesc[] keys, int tag, + int numReducers, Operation acidOperation, boolean strictMode) throws SemanticException { + return genReduceSink(input, keys, tag, new ArrayList(), "", numReducers, + acidOperation, strictMode); + } + + @SuppressWarnings({ "rawtypes", "unchecked" }) + private static ReduceSinkOperator genReduceSink(Operator input, ExprNodeDesc[] keys, int tag, + ArrayList partitionCols, String order, int numReducers, + Operation acidOperation, boolean strictMode) throws SemanticException { + Operator dummy = Operator.createDummy(); // dummy for backtracking + dummy.setParentOperators(Arrays.asList(input)); + + ArrayList reduceKeys = new ArrayList(); + ArrayList reduceKeysBack = new ArrayList(); + + // Compute join keys and store in reduceKeys + for (ExprNodeDesc key : keys) { + reduceKeys.add(key); + reduceKeysBack.add(ExprNodeDescUtils.backtrack(key, dummy, input)); + } + + // Walk over the input schema and copy in the output + ArrayList reduceValues = new ArrayList(); + ArrayList reduceValuesBack = new ArrayList(); + Map colExprMap = new HashMap(); + + List inputColumns = input.getSchema().getSignature(); + ArrayList outputColumns = new ArrayList(); + List outputColumnNames = new ArrayList(); + int[] index = new int[inputColumns.size()]; + for (int i = 0; i < inputColumns.size(); i++) { + ColumnInfo colInfo = inputColumns.get(i); + String outputColName = colInfo.getInternalName(); + ExprNodeDesc expr = new ExprNodeColumnDesc(colInfo); + + // backtrack can be null when input is script operator + ExprNodeDesc exprBack = ExprNodeDescUtils.backtrack(expr, dummy, input); + int kindex = exprBack == null ? 
-1 : ExprNodeDescUtils.indexOf(exprBack, reduceKeysBack); + if (kindex >= 0) { + ColumnInfo newColInfo = new ColumnInfo(colInfo); + newColInfo.setInternalName(Utilities.ReduceField.KEY + ".reducesinkkey" + kindex); + newColInfo.setAlias(outputColName); + newColInfo.setTabAlias(colInfo.getTabAlias()); + outputColumns.add(newColInfo); + index[i] = kindex; + continue; + } + int vindex = exprBack == null ? -1 : ExprNodeDescUtils.indexOf(exprBack, reduceValuesBack); + if (kindex >= 0) { + index[i] = -vindex - 1; + continue; + } + index[i] = -reduceValues.size() - 1; + + reduceValues.add(expr); + reduceValuesBack.add(exprBack); + + ColumnInfo newColInfo = new ColumnInfo(colInfo); + newColInfo.setInternalName(Utilities.ReduceField.VALUE + "." + outputColName); + newColInfo.setAlias(outputColName); + newColInfo.setTabAlias(colInfo.getTabAlias()); + + outputColumns.add(newColInfo); + outputColumnNames.add(outputColName); + } + dummy.setParentOperators(null); + + // Use only 1 reducer if no reduce keys + if (reduceKeys.size() == 0) { + numReducers = 1; + + // Cartesian product is not supported in strict mode + if (strictMode) { + throw new SemanticException(ErrorMsg.NO_CARTESIAN_PRODUCT.getMsg()); + } + } + + ReduceSinkDesc rsDesc; + if (order.isEmpty()) { + rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false, tag, + reduceKeys.size(), numReducers, acidOperation); + } else { + rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false, tag, + partitionCols, order, numReducers, acidOperation); + } + + ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc, + new RowSchema(outputColumns), input); + + List keyColNames = rsDesc.getOutputKeyColumnNames(); + for (int i = 0; i < keyColNames.size(); i++) { + colExprMap.put(Utilities.ReduceField.KEY + "." + keyColNames.get(i), reduceKeys.get(i)); + } + List valColNames = rsDesc.getOutputValueColumnNames(); + for (int i = 0; i < valColNames.size(); i++) { + colExprMap.put(Utilities.ReduceField.VALUE + "." 
+ valColNames.get(i), reduceValues.get(i)); + } + + rsOp.setValueIndex(index); + rsOp.setColumnExprMap(colExprMap); + rsOp.setInputAliases(input.getSchema().getColumnNames() + .toArray(new String[input.getSchema().getColumnNames().size()])); + + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + rsOp + " with row schema: [" + rsOp.getSchema() + "]"); + } + + return rsOp; + } + + private static JoinOperator genJoin(HiveJoin hiveJoin, JoinPredicateInfo joinPredInfo, + List> children, ExprNodeDesc[][] joinKeys) throws SemanticException { + + // Extract join type + JoinType joinType = extractJoinType(hiveJoin); + + // NOTE: Currently binary joins only + JoinCondDesc[] joinCondns = new JoinCondDesc[1]; + joinCondns[0] = new JoinCondDesc(new JoinCond(0, 1, joinType)); + + ArrayList outputColumns = new ArrayList(); + ArrayList outputColumnNames = new ArrayList(hiveJoin.getRowType() + .getFieldNames()); + Operator[] childOps = new Operator[children.size()]; + + Map reversedExprs = new HashMap(); + HashMap> exprMap = new HashMap>(); + Map colExprMap = new HashMap(); + HashMap> posToAliasMap = new HashMap>(); + + int outputPos = 0; + for (int pos = 0; pos < children.size(); pos++) { + ReduceSinkOperator inputRS = (ReduceSinkOperator) children.get(pos); + if (inputRS.getNumParent() != 1) { + throw new SemanticException("RS should have single parent"); + } + Operator parent = inputRS.getParentOperators().get(0); + ReduceSinkDesc rsDesc = inputRS.getConf(); + + int[] index = inputRS.getValueIndex(); + + Byte tag = (byte) rsDesc.getTag(); + + // Semijoin + if (joinType == JoinType.LEFTSEMI && pos != 0) { + exprMap.put(tag, new ArrayList()); + childOps[pos] = inputRS; + continue; + } + + List keyColNames = rsDesc.getOutputKeyColumnNames(); + List valColNames = rsDesc.getOutputValueColumnNames(); + + posToAliasMap.put(pos, new HashSet(inputRS.getSchema().getTableNames())); + + Map descriptors = buildBacktrackFromReduceSink(outputPos, + outputColumnNames, keyColNames, valColNames, index, parent); + + List parentColumns = parent.getSchema().getSignature(); + for (int i = 0; i < index.length; i++) { + ColumnInfo info = new ColumnInfo(parentColumns.get(i)); + info.setInternalName(outputColumnNames.get(outputPos)); + outputColumns.add(info); + reversedExprs.put(outputColumnNames.get(outputPos), tag); + outputPos++; + } + + exprMap.put(tag, new ArrayList(descriptors.values())); + colExprMap.putAll(descriptors); + childOps[pos] = inputRS; + } + + boolean noOuterJoin = joinType != JoinType.FULLOUTER && joinType != JoinType.LEFTOUTER + && joinType != JoinType.RIGHTOUTER; + JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, noOuterJoin, joinCondns, joinKeys); + desc.setReversedExprs(reversedExprs); + + JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(desc, new RowSchema( + outputColumns), childOps); + joinOp.setColumnExprMap(colExprMap); + joinOp.setPosToAliasMap(posToAliasMap); + + // TODO: null safes? 
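+ // Note: JoinDesc above is built without setting null-safe flags, so all equi-join keys are treated as plain (non null-safe) equality comparisons.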
+ + if (LOG.isDebugEnabled()) { + LOG.debug("Generated " + joinOp + " with row schema: [" + joinOp.getSchema() + "]"); + } + + return joinOp; + } + + private static JoinType extractJoinType(HiveJoin join) { + // UNIQUE + if (join.isDistinct()) { + return JoinType.UNIQUE; + } + // SEMIJOIN + if (join.isLeftSemiJoin()) { + return JoinType.LEFTSEMI; + } + // OUTER AND INNER JOINS + JoinType resultJoinType; + switch (join.getJoinType()) { + case FULL: + resultJoinType = JoinType.FULLOUTER; + break; + case LEFT: + resultJoinType = JoinType.LEFTOUTER; + break; + case RIGHT: + resultJoinType = JoinType.RIGHTOUTER; + break; + default: + resultJoinType = JoinType.INNER; + break; + } + return resultJoinType; + } + + private static Map buildBacktrackFromReduceSink(ReduceSinkOperator rsOp, + Operator inputOp) { + return buildBacktrackFromReduceSink(0, inputOp.getSchema().getColumnNames(), rsOp.getConf() + .getOutputKeyColumnNames(), rsOp.getConf().getOutputValueColumnNames(), + rsOp.getValueIndex(), inputOp); + } + + private static Map buildBacktrackFromReduceSink(int initialPos, + List outputColumnNames, List keyColNames, List valueColNames, + int[] index, Operator inputOp) { + Map columnDescriptors = new LinkedHashMap(); + for (int i = 0; i < index.length; i++) { + ColumnInfo info = new ColumnInfo(inputOp.getSchema().getSignature().get(i)); + String field; + if (index[i] >= 0) { + field = Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]); + } else { + field = Utilities.ReduceField.VALUE + "." + valueColNames.get(-index[i] - 1); + } + ExprNodeColumnDesc desc = new ExprNodeColumnDesc(info.getType(), field, info.getTabAlias(), + info.getIsVirtualCol()); + columnDescriptors.put(outputColumnNames.get(initialPos + i), desc); + } + return columnDescriptors; + } + + private static ExprNodeDesc convertToExprNode(RexNode rn, RelNode inputRel, String tabAlias) { + return rn.accept(new ExprNodeConverter(tabAlias, inputRel.getRowType(), false, + inputRel.getCluster().getTypeFactory())); + } + + private static ArrayList createColInfos(Operator input) { + ArrayList cInfoLst = new ArrayList(); + for (ColumnInfo ci : input.getSchema().getSignature()) { + cInfoLst.add(new ColumnInfo(ci)); + } + return cInfoLst; + } + + private static Pair, Map> createColInfos( + List calciteExprs, List hiveExprs, List projNames, + OpAttr inpOpAf) { + if (hiveExprs.size() != projNames.size()) { + throw new RuntimeException("Column expressions list doesn't match Column Names list"); + } + + RexNode rexN; + ExprNodeDesc pe; + ArrayList colInfos = new ArrayList(); + VirtualColumn vc; + Map newVColMap = new HashMap(); + for (int i = 0; i < hiveExprs.size(); i++) { + pe = hiveExprs.get(i); + rexN = calciteExprs.get(i); + vc = null; + if (rexN instanceof RexInputRef) { + vc = inpOpAf.vcolMap.get(((RexInputRef) rexN).getIndex()); + if (vc != null) { + newVColMap.put(i, vc); + } + } + colInfos + .add(new ColumnInfo(projNames.get(i), pe.getTypeInfo(), inpOpAf.tabAlias, vc != null)); + } + + return new Pair, Map>(colInfos, newVColMap); + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveGBOpConvUtil.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveGBOpConvUtil.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveGBOpConvUtil.java (revision 1672450) @@ -0,0 +1,1237 @@ +/** + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.calcite.translator; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.AggregateCall; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.util.ImmutableBitSet; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.PlanUtils; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import com.google.common.collect.ImmutableList; + +/** + * TODO:
+ * 1. Change the output col/ExprNodeColumn names to external names.
+ * 2. Verify if we need to use the "KEY."/"VALUE." in RS cols; switch to + * external names if possible.
+ * 3. In ExprNode & in ColumnInfo the tableAlias/VirtualColumn is specified + * differently for different GB/RS in pipeline. Remove the different treatments. + * 3. VirtualColMap needs to be maintained + * + */ +public class HiveGBOpConvUtil { + private static enum HIVEGBPHYSICALMODE { + MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB, MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB, MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT, MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT, NO_MAP_SIDE_GB_NO_SKEW, NO_MAP_SIDE_GB_SKEW + }; + + private static class UDAFAttrs { + private boolean isDistinctUDAF; + private String udafName; + private GenericUDAFEvaluator udafEvaluator; + private ArrayList udafParams = new ArrayList(); + private List udafParamsIndxInGBInfoDistExprs = new ArrayList(); + }; + + private static class GBInfo { + private List outputColNames = new ArrayList(); + + private List gbKeyColNamesInInput = new ArrayList(); + private List gbKeyTypes = new ArrayList(); + private List gbKeys = new ArrayList(); + + private List grpSets = new ArrayList(); + private boolean grpSetRqrAdditionalMRJob; + private boolean grpIdFunctionNeeded; + + private List distExprNames = new ArrayList(); + private List distExprTypes = new ArrayList(); + private List distExprNodes = new ArrayList(); + private List> distColIndices = new ArrayList>(); + + private List deDupedNonDistIrefs = new ArrayList(); + + private List udafAttrs = new ArrayList(); + private boolean containsDistinctAggr = false; + + float groupByMemoryUsage; + float memoryThreshold; + + private HIVEGBPHYSICALMODE gbPhysicalPipelineMode; + }; + + private static HIVEGBPHYSICALMODE getAggOPMode(HiveConf hc, GBInfo gbInfo) { + HIVEGBPHYSICALMODE gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB; + + if (hc.getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) { + if (!hc.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { + if (!gbInfo.grpSetRqrAdditionalMRJob) { + gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB; + } else { + gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB; + } + } else { + if (gbInfo.containsDistinctAggr || !gbInfo.gbKeys.isEmpty()) { + gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT; + } else { + gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT; + } + } + } else { + if (!hc.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) { + gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW; + } else { + gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_SKEW; + } + } + + return gbPhysicalPipelineMode; + } + + // For each of the GB op in the logical GB this should be called seperately; + // otherwise GBevaluator and expr nodes may get shared among multiple GB ops + private static GBInfo getGBInfo(HiveAggregate aggRel, OpAttr inputOpAf, HiveConf hc) { + GBInfo gbInfo = new GBInfo(); + + // 0. Collect AggRel output col Names + gbInfo.outputColNames.addAll(aggRel.getRowType().getFieldNames()); + + // 1. 
Collect GB Keys + RelNode aggInputRel = aggRel.getInput(); + ExprNodeConverter exprConv = new ExprNodeConverter(inputOpAf.tabAlias, + aggInputRel.getRowType(), false, aggRel.getCluster().getTypeFactory()); + + ExprNodeDesc tmpExprNodeDesc; + for (int i : aggRel.getGroupSet()) { + RexInputRef iRef = new RexInputRef(i, (RelDataType) aggInputRel.getRowType().getFieldList() + .get(i).getType()); + tmpExprNodeDesc = iRef.accept(exprConv); + gbInfo.gbKeys.add(tmpExprNodeDesc); + gbInfo.gbKeyColNamesInInput.add(aggInputRel.getRowType().getFieldNames().get(i)); + gbInfo.gbKeyTypes.add(tmpExprNodeDesc.getTypeInfo()); + } + + // 2. Collect Grouping Set info + if (aggRel.indicator) { + // 2.1 Translate Grouping set col bitset + ImmutableList lstGrpSet = aggRel.getGroupSets(); + int bitmap = 0; + for (ImmutableBitSet grpSet : lstGrpSet) { + bitmap = 0; + for (Integer bitIdx : grpSet.asList()) { + bitmap = SemanticAnalyzer.setBit(bitmap, bitIdx); + } + gbInfo.grpSets.add(bitmap); + } + Collections.sort(gbInfo.grpSets); + + // 2.2 Check if GRpSet require additional MR Job + gbInfo.grpSetRqrAdditionalMRJob = gbInfo.grpSets.size() > hc + .getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY); + + // 2.3 Check if GROUPING_ID needs to be projected out + if (!aggRel.getAggCallList().isEmpty() + && (aggRel.getAggCallList().get(aggRel.getAggCallList().size() - 1).getAggregation() == HiveGroupingID.INSTANCE)) { + gbInfo.grpIdFunctionNeeded = true; + } + } + + // 3. Walk through UDAF & Collect Distinct Info + Set distinctRefs = new HashSet(); + Map distParamInRefsToOutputPos = new HashMap(); + for (AggregateCall aggCall : aggRel.getAggCallList()) { + if ((aggCall.getAggregation() == HiveGroupingID.INSTANCE) || !aggCall.isDistinct()) { + continue; + } + + List argLst = new ArrayList(aggCall.getArgList()); + List argNames = HiveCalciteUtil.getFieldNames(argLst, aggInputRel); + ExprNodeDesc distinctExpr; + for (int i = 0; i < argLst.size(); i++) { + if (!distinctRefs.contains(argLst.get(i))) { + distinctRefs.add(argLst.get(i)); + distParamInRefsToOutputPos.put(argLst.get(i), gbInfo.distExprNodes.size()); + distinctExpr = HiveCalciteUtil.getExprNode(argLst.get(i), aggInputRel, exprConv); + gbInfo.distExprNodes.add(distinctExpr); + gbInfo.distExprNames.add(argNames.get(i)); + gbInfo.distExprTypes.add(distinctExpr.getTypeInfo()); + } + } + } + + // 4. 
Walk through UDAF & Collect UDAF Info + Set deDupedNonDistIrefsSet = new HashSet(); + for (AggregateCall aggCall : aggRel.getAggCallList()) { + if (aggCall.getAggregation() == HiveGroupingID.INSTANCE) { + continue; + } + + UDAFAttrs udafAttrs = new UDAFAttrs(); + udafAttrs.udafParams.addAll(HiveCalciteUtil.getExprNodes(aggCall.getArgList(), aggInputRel, + inputOpAf.tabAlias)); + udafAttrs.udafName = aggCall.getAggregation().getName(); + udafAttrs.isDistinctUDAF = aggCall.isDistinct(); + List argLst = new ArrayList(aggCall.getArgList()); + List distColIndicesOfUDAF = new ArrayList(); + List distUDAFParamsIndxInDistExprs = new ArrayList(); + for (int i = 0; i < argLst.size(); i++) { + // NOTE: distinct expr can not be part of of GB key (we assume plan + // gen would have prevented it) + if (udafAttrs.isDistinctUDAF) { + distColIndicesOfUDAF.add(distParamInRefsToOutputPos.get(argLst.get(i))); + distUDAFParamsIndxInDistExprs.add(distParamInRefsToOutputPos.get(argLst.get(i))); + } else { + // TODO: this seems wrong (following what Hive Regular does) + if (!distParamInRefsToOutputPos.containsKey(argLst.get(i)) + && !deDupedNonDistIrefsSet.contains(argLst.get(i))) { + deDupedNonDistIrefsSet.add(i); + gbInfo.deDupedNonDistIrefs.add(udafAttrs.udafParams.get(i)); + } + } + } + + if (udafAttrs.isDistinctUDAF) { + gbInfo.containsDistinctAggr = true; + + udafAttrs.udafParamsIndxInGBInfoDistExprs = distUDAFParamsIndxInDistExprs; + gbInfo.distColIndices.add(distColIndicesOfUDAF); + } + try { + udafAttrs.udafEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator(udafAttrs.udafName, + new ArrayList(udafAttrs.udafParams), new ASTNode(), + udafAttrs.isDistinctUDAF, false); + } catch (SemanticException e) { + throw new RuntimeException(e); + } + gbInfo.udafAttrs.add(udafAttrs); + } + + // 4. Gather GB Memory threshold + gbInfo.groupByMemoryUsage = HiveConf.getFloatVar(hc, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY); + gbInfo.memoryThreshold = HiveConf.getFloatVar(hc, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD); + + // 5. Gather GB Physical pipeline (based on user config & Grping Sets size) + gbInfo.gbPhysicalPipelineMode = getAggOPMode(hc, gbInfo); + + return gbInfo; + } + + static OpAttr translateGB(OpAttr inputOpAf, HiveAggregate aggRel, HiveConf hc) + throws SemanticException { + OpAttr translatedGBOpAttr = null; + GBInfo gbInfo = getGBInfo(aggRel, inputOpAf, hc); + + switch (gbInfo.gbPhysicalPipelineMode) { + case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB: + translatedGBOpAttr = genMapSideGBNoSkewNoAddMRJob(inputOpAf, aggRel, gbInfo); + break; + case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB: + translatedGBOpAttr = genMapSideGBNoSkewAddMRJob(inputOpAf, aggRel, gbInfo); + break; + case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT: + translatedGBOpAttr = genMapSideGBSkewGBKeysOrDistUDAFPresent(inputOpAf, aggRel, gbInfo); + break; + case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT: + translatedGBOpAttr = genMapSideGBSkewGBKeysAndDistUDAFNotPresent(inputOpAf, aggRel, gbInfo); + break; + case NO_MAP_SIDE_GB_NO_SKEW: + translatedGBOpAttr = genNoMapSideGBNoSkew(inputOpAf, aggRel, gbInfo); + break; + case NO_MAP_SIDE_GB_SKEW: + translatedGBOpAttr = genNoMapSideGBSkew(inputOpAf, aggRel, gbInfo); + break; + } + + return translatedGBOpAttr; + } + + /** + * GB-RS-GB1 + * + * Construct GB-RS-GB Pipe line. User has enabled Map Side GB, specified no + * skew and Grp Set is below the threshold. 
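+ * The generated pipeline is a map-side GroupBy, a ReduceSink, and a reduce-side GroupBy in MERGEPARTIAL mode.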
+ * + * @param inputOpAf + * @param aggRel + * @param gbInfo + * @return + * @throws SemanticException + */ + private static OpAttr genMapSideGBNoSkewNoAddMRJob(OpAttr inputOpAf, HiveAggregate aggRel, + GBInfo gbInfo) throws SemanticException { + OpAttr mapSideGB = null; + OpAttr mapSideRS = null; + OpAttr reduceSideGB = null; + + // 1. Insert MapSide GB + mapSideGB = genMapSideGB(inputOpAf, gbInfo); + + // 2. Insert MapSide RS + mapSideRS = genMapSideGBRS(mapSideGB, gbInfo); + + // 3. Insert ReduceSide GB + reduceSideGB = genReduceSideGB1(mapSideRS, gbInfo, false, false, GroupByDesc.Mode.MERGEPARTIAL); + + return reduceSideGB; + } + + /** + * GB-RS-GB1-RS-GB2 + */ + private static OpAttr genGBRSGBRSGBOpPipeLine(OpAttr inputOpAf, HiveAggregate aggRel, + GBInfo gbInfo) throws SemanticException { + OpAttr mapSideGB = null; + OpAttr mapSideRS = null; + OpAttr reduceSideGB1 = null; + OpAttr reduceSideRS = null; + OpAttr reduceSideGB2 = null; + + // 1. Insert MapSide GB + mapSideGB = genMapSideGB(inputOpAf, gbInfo); + + // 2. Insert MapSide RS + mapSideRS = genMapSideGBRS(mapSideGB, gbInfo); + + // 3. Insert ReduceSide GB1 + boolean computeGrpSet = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT) ? false : true; + reduceSideGB1 = genReduceSideGB1(mapSideRS, gbInfo, computeGrpSet, false, GroupByDesc.Mode.PARTIALS); + + // 4. Insert RS on reduce side with Reduce side GB as input + reduceSideRS = genReduceGBRS(reduceSideGB1, gbInfo); + + // 5. Insert ReduceSide GB2 + reduceSideGB2 = genReduceSideGB2(reduceSideRS, gbInfo); + + return reduceSideGB2; + } + + /** + * GB-RS-GB1-RS-GB2 + * + * @param inputOpAf + * @param aggRel + * @param gbInfo + * @return + * @throws SemanticException + */ + private static OpAttr genMapSideGBNoSkewAddMRJob(OpAttr inputOpAf, HiveAggregate aggRel, + GBInfo gbInfo) throws SemanticException { + // 1. Sanity check + if (gbInfo.containsDistinctAggr) { + String errorMsg = "The number of rows per input row due to grouping sets is " + + gbInfo.grpSets.size(); + throw new SemanticException( + ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_DISTINCTS.getMsg(errorMsg)); + } + + // 2. Gen GB-RS-GB-RS-GB pipeline + return genGBRSGBRSGBOpPipeLine(inputOpAf, aggRel, gbInfo); + } + + /** + * GB-RS-GB1-RS-GB2 + * + * @param inputOpAf + * @param aggRel + * @param gbInfo + * @return + * @throws SemanticException + */ + private static OpAttr genMapSideGBSkewGBKeysOrDistUDAFPresent(OpAttr inputOpAf, + HiveAggregate aggRel, GBInfo gbInfo) throws SemanticException { + // 1. Sanity check + if (gbInfo.grpSetRqrAdditionalMRJob) { + String errorMsg = "The number of rows per input row due to grouping sets is " + + gbInfo.grpSets.size(); + throw new SemanticException( + ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg)); + } + + // 2. Gen GB-RS-GB-RS-GB pipeline + return genGBRSGBRSGBOpPipeLine(inputOpAf, aggRel, gbInfo); + } + + /** + * GB-RS-GB2 + * + * @param inputOpAf + * @param aggRel + * @param gbInfo + * @return + * @throws SemanticException + */ + private static OpAttr genMapSideGBSkewGBKeysAndDistUDAFNotPresent(OpAttr inputOpAf, + HiveAggregate aggRel, GBInfo gbInfo) throws SemanticException { + OpAttr mapSideGB = null; + OpAttr mapSideRS = null; + OpAttr reduceSideGB2 = null; + + // 1. 
Sanity check + if (gbInfo.grpSetRqrAdditionalMRJob) { + String errorMsg = "The number of rows per input row due to grouping sets is " + + gbInfo.grpSets.size(); + throw new SemanticException( + ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg)); + } + + // 1. Insert MapSide GB + mapSideGB = genMapSideGB(inputOpAf, gbInfo); + + // 2. Insert MapSide RS + mapSideRS = genMapSideGBRS(mapSideGB, gbInfo); + + // 3. Insert ReduceSide GB2 + reduceSideGB2 = genReduceSideGB2(mapSideRS, gbInfo); + + return reduceSideGB2; + } + + /** + * RS-Gb1 + * + * @param inputOpAf + * @param aggRel + * @param gbInfo + * @return + * @throws SemanticException + */ + private static OpAttr genNoMapSideGBNoSkew(OpAttr inputOpAf, HiveAggregate aggRel, GBInfo gbInfo) + throws SemanticException { + OpAttr mapSideRS = null; + OpAttr reduceSideGB1NoMapGB = null; + + // 1. Insert MapSide RS + mapSideRS = genMapSideRS(inputOpAf, gbInfo); + + // 2. Insert ReduceSide GB + reduceSideGB1NoMapGB = genReduceSideGB1NoMapGB(mapSideRS, gbInfo, GroupByDesc.Mode.COMPLETE); + + return reduceSideGB1NoMapGB; + } + + /** + * RS-GB1-RS-GB2 + * + * @param inputOpAf + * @param aggRel + * @param gbInfo + * @return + * @throws SemanticException + */ + private static OpAttr genNoMapSideGBSkew(OpAttr inputOpAf, HiveAggregate aggRel, GBInfo gbInfo) + throws SemanticException { + OpAttr mapSideRS = null; + OpAttr reduceSideGB1NoMapGB = null; + OpAttr reduceSideRS = null; + OpAttr reduceSideGB2 = null; + + // 1. Insert MapSide RS + mapSideRS = genMapSideRS(inputOpAf, gbInfo); + + // 2. Insert ReduceSide GB + reduceSideGB1NoMapGB = genReduceSideGB1NoMapGB(mapSideRS, gbInfo, GroupByDesc.Mode.PARTIAL1); + + // 3. Insert RS on reduce side with Reduce side GB as input + reduceSideRS = genReduceGBRS(reduceSideGB1NoMapGB, gbInfo); + + // 4. 
Insert ReduceSide GB2 + reduceSideGB2 = genReduceSideGB2(reduceSideRS, gbInfo); + + return reduceSideGB2; + } + + private static int getParallelismForReduceSideRS(GBInfo gbInfo) { + int degreeOfParallelism = 0; + + switch (gbInfo.gbPhysicalPipelineMode) { + case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB: + case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT: + case NO_MAP_SIDE_GB_SKEW: + if (gbInfo.gbKeys.isEmpty()) { + degreeOfParallelism = 1; + } else { + degreeOfParallelism = -1; + } + break; + default: + throw new RuntimeException( + "Unable to determine Reducer Parallelism - Invalid Physical Mode: " + + gbInfo.gbPhysicalPipelineMode); + } + + return degreeOfParallelism; + } + + private static int getParallelismForMapSideRS(GBInfo gbInfo) { + int degreeOfParallelism = 0; + + switch (gbInfo.gbPhysicalPipelineMode) { + case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB: + case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB: + case NO_MAP_SIDE_GB_NO_SKEW: + if (gbInfo.gbKeys.isEmpty()) { + degreeOfParallelism = 1; + } else { + degreeOfParallelism = -1; + } + break; + case NO_MAP_SIDE_GB_SKEW: + case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT: + degreeOfParallelism = -1; + break; + case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT: + degreeOfParallelism = 1; + break; + default: + throw new RuntimeException( + "Unable to determine Reducer Parallelism - Invalid Physical Mode: " + + gbInfo.gbPhysicalPipelineMode); + } + + return degreeOfParallelism; + } + + private static int getNumPartFieldsForReduceSideRS(GBInfo gbInfo) { + int numPartFields = 0; + + switch (gbInfo.gbPhysicalPipelineMode) { + case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB: + numPartFields = gbInfo.gbKeys.size() + 1; + break; + case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT: + case NO_MAP_SIDE_GB_SKEW: + numPartFields = gbInfo.gbKeys.size(); + break; + default: + throw new RuntimeException( + "Unable to determine Number of Partition Fields - Invalid Physical Mode: " + + gbInfo.gbPhysicalPipelineMode); + } + + return numPartFields; + } + + private static int getNumPartFieldsForMapSideRS(GBInfo gbInfo) { + int numPartFields = 0; + + switch (gbInfo.gbPhysicalPipelineMode) { + case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB: + case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB: + case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT: + case NO_MAP_SIDE_GB_NO_SKEW: + numPartFields += gbInfo.gbKeys.size(); + break; + case NO_MAP_SIDE_GB_SKEW: + case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT: + if (gbInfo.containsDistinctAggr) { + numPartFields = Integer.MAX_VALUE; + } else { + numPartFields = -1; + } + break; + default: + throw new RuntimeException( + "Unable to determine Number of Partition Fields - Invalid Physical Mode: " + + gbInfo.gbPhysicalPipelineMode); + } + + return numPartFields; + } + + private static boolean inclGrpSetInReduceSide(GBInfo gbInfo) { + boolean inclGrpSet = false; + + if (gbInfo.grpSets.size() > 0 + && (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB || gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT)) { + inclGrpSet = true; + } + + return inclGrpSet; + } + + private static boolean inclGrpSetInMapSide(GBInfo gbInfo) { + boolean inclGrpSet = false; + + if (gbInfo.grpSets.size() > 0 + && ((gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB) || + gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT)) { + inclGrpSet = true; + } + + return inclGrpSet; + } + + private static OpAttr genReduceGBRS(OpAttr inputOpAf, 
GBInfo gbInfo) throws SemanticException { + Map colExprMap = new HashMap(); + ArrayList outputColumnNames = new ArrayList(); + ArrayList colInfoLst = new ArrayList(); + GroupByOperator reduceSideGB1 = (GroupByOperator) inputOpAf.inputs.get(0); + List gb1ColInfoLst = reduceSideGB1.getSchema().getSignature(); + + ArrayList reduceKeys = getReduceKeysForRS(reduceSideGB1, 0, + gbInfo.gbKeys.size() - 1, outputColumnNames, false, colInfoLst, colExprMap, true, true); + if (inclGrpSetInReduceSide(gbInfo)) { + addGrpSetCol(false, gb1ColInfoLst.get(reduceKeys.size()).getInternalName(), true, reduceKeys, + outputColumnNames, colInfoLst, colExprMap); + } + + ArrayList reduceValues = getValueKeysForRS(reduceSideGB1, reduceSideGB1.getConf() + .getKeys().size(), outputColumnNames, colInfoLst, colExprMap, true, true); + + ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils + .getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, true, -1, + getNumPartFieldsForReduceSideRS(gbInfo), getParallelismForReduceSideRS(gbInfo), + AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), reduceSideGB1); + + rsOp.setColumnExprMap(colExprMap); + + return new OpAttr("", new HashMap(), rsOp); + } + + private static OpAttr genMapSideGBRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException { + Map colExprMap = new HashMap(); + List outputKeyColumnNames = new ArrayList(); + List outputValueColumnNames = new ArrayList(); + ArrayList colInfoLst = new ArrayList(); + GroupByOperator mapGB = (GroupByOperator) inputOpAf.inputs.get(0); + int distColStartIndx = gbInfo.gbKeys.size() + (gbInfo.grpSets.size() > 0 ? 1 : 0); + + ArrayList reduceKeys = getReduceKeysForRS(mapGB, 0, gbInfo.gbKeys.size() - 1, + outputKeyColumnNames, false, colInfoLst, colExprMap, false, false); + int keyLength = reduceKeys.size(); + + if (inclGrpSetInMapSide(gbInfo)) { + addGrpSetCol(false, SemanticAnalyzer.getColumnInternalName(reduceKeys.size()), true, + reduceKeys, outputKeyColumnNames, colInfoLst, colExprMap); + keyLength++; + } + if (mapGB.getConf().getKeys().size() > reduceKeys.size()) { + // NOTE: All dist cols have single output col name; + reduceKeys.addAll(getReduceKeysForRS(mapGB, reduceKeys.size(), mapGB.getConf().getKeys() + .size() - 1, outputKeyColumnNames, true, colInfoLst, colExprMap, false, false)); + } + + ArrayList reduceValues = getValueKeysForRS(mapGB, mapGB.getConf().getKeys() + .size(), outputValueColumnNames, colInfoLst, colExprMap, false, false); + List> distinctColIndices = getDistColIndices(gbInfo, distColStartIndx); + + ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils + .getReduceSinkDesc(reduceKeys, keyLength, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, + getNumPartFieldsForMapSideRS(gbInfo), getParallelismForMapSideRS(gbInfo), + AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), mapGB); + + rsOp.setColumnExprMap(colExprMap); + + return new OpAttr("", new HashMap(), rsOp); + } + + private static OpAttr genMapSideRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException { + Map colExprMap = new HashMap(); + List outputKeyColumnNames = new ArrayList(); + List outputValueColumnNames = new ArrayList(); + ArrayList colInfoLst = new ArrayList(); + int distColStartIndx = gbInfo.gbKeys.size() + (gbInfo.grpSets.size() > 0 ? 1 : 0); + String outputColName; + + // 1. 
Add GB Keys to reduce keys + ArrayList reduceKeys = getReduceKeysForRS(inputOpAf.inputs.get(0), 0, + gbInfo.gbKeys.size() - 1, outputKeyColumnNames, false, colInfoLst, colExprMap, false, false); + int keyLength = reduceKeys.size(); + + // 2. Add Dist UDAF args to reduce keys + if (gbInfo.containsDistinctAggr) { + // TODO: Why is this needed (doesn't represent any cols) + String udafName = SemanticAnalyzer.getColumnInternalName(reduceKeys.size()); + outputKeyColumnNames.add(udafName); + for (int i = 0; i < gbInfo.distExprNodes.size(); i++) { + reduceKeys.add(gbInfo.distExprNodes.get(i)); + outputColName = SemanticAnalyzer.getColumnInternalName(i); + String field = Utilities.ReduceField.KEY.toString() + "." + udafName + ":" + i + "." + + outputColName; + ColumnInfo colInfo = new ColumnInfo(field, gbInfo.distExprNodes.get(i).getTypeInfo(), null, + false); + colInfoLst.add(colInfo); + colExprMap.put(field, gbInfo.distExprNodes.get(i)); + } + } + + // 3. Add UDAF args deduped to reduce values + ArrayList reduceValues = new ArrayList(); + for (int i = 0; i < gbInfo.deDupedNonDistIrefs.size(); i++) { + reduceValues.add(gbInfo.deDupedNonDistIrefs.get(i)); + outputColName = SemanticAnalyzer.getColumnInternalName(reduceValues.size() - 1); + outputValueColumnNames.add(outputColName); + String field = Utilities.ReduceField.VALUE.toString() + "." + outputColName; + colInfoLst.add(new ColumnInfo(field, reduceValues.get(reduceValues.size() - 1).getTypeInfo(), + null, false)); + colExprMap.put(field, reduceValues.get(reduceValues.size() - 1)); + } + + // 4. Gen RS + ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils + .getReduceSinkDesc(reduceKeys, keyLength, reduceValues, + getDistColIndices(gbInfo, distColStartIndx), outputKeyColumnNames, + outputValueColumnNames, true, -1, getNumPartFieldsForMapSideRS(gbInfo), + getParallelismForMapSideRS(gbInfo), AcidUtils.Operation.NOT_ACID), new RowSchema( + colInfoLst), inputOpAf.inputs.get(0)); + + rsOp.setColumnExprMap(colExprMap); + + return new OpAttr("", new HashMap(), rsOp); + } + + private static OpAttr genReduceSideGB2(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException { + ArrayList outputColNames = new ArrayList(); + ArrayList colInfoLst = new ArrayList(); + Map colExprMap = new HashMap(); + String colOutputName = null; + ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0); + List rsColInfoLst = rs.getSchema().getSignature(); + ColumnInfo ci; + + // 1. 
Build GB Keys, grouping set starting position + // 1.1 First Add original GB Keys + ArrayList gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, + gbInfo.gbKeys.size() - 1, false, false); + for (int i = 0; i < gbInfo.gbKeys.size(); i++) { + ci = rsColInfoLst.get(i); + colOutputName = gbInfo.outputColNames.get(i); + outputColNames.add(colOutputName); + colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false)); + colExprMap.put(colOutputName, gbKeys.get(i)); + } + // 1.2 Add GrpSet Col + int groupingSetsPosition = -1; + if (inclGrpSetInReduceSide(gbInfo) && gbInfo.grpIdFunctionNeeded) { + groupingSetsPosition = gbKeys.size(); + ExprNodeDesc grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, + rsColInfoLst.get(groupingSetsPosition).getInternalName(), null, false); + gbKeys.add(grpSetColExpr); + colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1); + ; + outputColNames.add(colOutputName); + colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true)); + colExprMap.put(colOutputName, grpSetColExpr); + } + + // 2. Add UDAF + UDAFAttrs udafAttr; + ArrayList aggregations = new ArrayList(); + int udafStartPosInGBInfOutputColNames = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size() + : gbInfo.gbKeys.size() * 2; + int udafStartPosInInputRS = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size() : gbInfo.gbKeys.size() + 1; + + for (int i = 0; i < gbInfo.udafAttrs.size(); i++) { + udafAttr = gbInfo.udafAttrs.get(i); + ArrayList aggParameters = new ArrayList(); + aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafStartPosInInputRS + i))); + colOutputName = gbInfo.outputColNames.get(udafStartPosInGBInfOutputColNames + i); + outputColNames.add(colOutputName); + Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.FINAL, + udafAttr.isDistinctUDAF); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, + aggParameters); + aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), + udaf.genericUDAFEvaluator, udaf.convertedParameters, false, udafMode)); + colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false)); + } + + Operator rsGBOp2 = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.FINAL, + outputColNames, gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, + gbInfo.memoryThreshold, null, false, groupingSetsPosition, gbInfo.containsDistinctAggr), + new RowSchema(colInfoLst), rs); + + rsGBOp2.setColumnExprMap(colExprMap); + + // TODO: Shouldn't we propgate vc? is it vc col from tab or all vc + return new OpAttr("", new HashMap(), rsGBOp2); + } + + private static OpAttr genReduceSideGB1(OpAttr inputOpAf, GBInfo gbInfo, boolean computeGrpSet, + boolean propagateConstInDistinctUDAF, GroupByDesc.Mode gbMode) throws SemanticException { + ArrayList outputColNames = new ArrayList(); + ArrayList colInfoLst = new ArrayList(); + Map colExprMap = new HashMap(); + String colOutputName = null; + ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0); + List rsColInfoLst = rs.getSchema().getSignature(); + ColumnInfo ci; + boolean finalGB = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB); + + // 1. 
Build GB Keys, grouping set starting position + // 1.1 First Add original GB Keys + ArrayList gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, + gbInfo.gbKeys.size() - 1, false, false); + for (int i = 0; i < gbInfo.gbKeys.size(); i++) { + ci = rsColInfoLst.get(i); + if (finalGB) { + colOutputName = gbInfo.outputColNames.get(i); + } else { + colOutputName = SemanticAnalyzer.getColumnInternalName(i); + } + outputColNames.add(colOutputName); + colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false)); + colExprMap.put(colOutputName, gbKeys.get(i)); + } + + // 1.2 Add GrpSet Col + int groupingSetsColPosition = -1; + if ((!finalGB && gbInfo.grpSets.size() > 0) || (finalGB && gbInfo.grpIdFunctionNeeded)) { + groupingSetsColPosition = gbInfo.gbKeys.size(); + if (computeGrpSet) { + // GrpSet Col needs to be constructed + gbKeys.add(new ExprNodeConstantDesc("0")); + } else { + // GrpSet Col already part of input RS + // TODO: Can't we just copy the ExprNodeDEsc from input (Do we need to + // explicitly set table alias to null & VC to false + gbKeys.addAll(ExprNodeDescUtils.genExprNodeDesc(rs, groupingSetsColPosition, + groupingSetsColPosition, false, true)); + } + + colOutputName = SemanticAnalyzer.getColumnInternalName(groupingSetsColPosition); + if (finalGB) { + colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1); + } + outputColNames.add(colOutputName); + colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true)); + colExprMap.put(colOutputName, gbKeys.get(groupingSetsColPosition)); + } + + // 2. Walk through UDAF and add them to GB + String lastReduceKeyColName = null; + if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) { + lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames() + .get(rs.getConf().getOutputKeyColumnNames().size() - 1); + } + int numDistinctUDFs = 0; + int distinctStartPosInReduceKeys = gbKeys.size(); + List reduceValues = rs.getConf().getValueCols(); + ArrayList aggregations = new ArrayList(); + int udafColStartPosInOriginalGB = (gbInfo.grpSets.size() > 0) ? gbInfo.gbKeys.size() * 2 + : gbInfo.gbKeys.size(); + int udafColStartPosInRS = rs.getConf().getKeyCols().size(); + for (int i = 0; i < gbInfo.udafAttrs.size(); i++) { + UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i); + ArrayList aggParameters = new ArrayList(); + + if (udafAttr.isDistinctUDAF) { + ColumnInfo rsDistUDAFParamColInfo; + ExprNodeDesc distinctUDAFParam; + ExprNodeDesc constantPropDistinctUDAFParam; + for (int j = 0; j < udafAttr.udafParamsIndxInGBInfoDistExprs.size(); j++) { + rsDistUDAFParamColInfo = rsColInfoLst.get(distinctStartPosInReduceKeys + j); + String rsDistUDAFParamName = rsDistUDAFParamColInfo.getInternalName(); + // TODO: verify if this is needed + if (lastReduceKeyColName != null) { + rsDistUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + + ":" + numDistinctUDFs + "." 
+ SemanticAnalyzer.getColumnInternalName(j); + } + distinctUDAFParam = new ExprNodeColumnDesc(rsDistUDAFParamColInfo.getType(), + rsDistUDAFParamName, rsDistUDAFParamColInfo.getTabAlias(), + rsDistUDAFParamColInfo.getIsVirtualCol()); + if (propagateConstInDistinctUDAF) { + // TODO: Implement propConstDistUDAFParams + constantPropDistinctUDAFParam = SemanticAnalyzer + .isConstantParameterInAggregationParameters( + rsDistUDAFParamColInfo.getInternalName(), reduceValues); + if (constantPropDistinctUDAFParam != null) { + distinctUDAFParam = constantPropDistinctUDAFParam; + } + } + aggParameters.add(distinctUDAFParam); + } + numDistinctUDFs++; + } else { + aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafColStartPosInRS + i))); + } + Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, + aggParameters); + aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), + udaf.genericUDAFEvaluator, udaf.convertedParameters, + (gbMode != GroupByDesc.Mode.FINAL && udafAttr.isDistinctUDAF), udafMode)); + + if (finalGB) { + colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + i); + } else { + colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() + - 1); + } + + colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false)); + outputColNames.add(colOutputName); + } + + // Nothing special needs to be done for grouping sets if + // this is the final group by operator, and multiple rows corresponding to + // the + // grouping sets have been generated upstream. + // However, if an addition MR job has been created to handle grouping sets, + // additional rows corresponding to grouping sets need to be created here. + //TODO: Clean up/refactor assumptions + boolean includeGrpSetInGBDesc = (gbInfo.grpSets.size() > 0) + && !finalGB + && !(gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT); + Operator rsGBOp = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames, + gbKeys, aggregations, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, gbInfo.grpSets, + includeGrpSetInGBDesc, groupingSetsColPosition, + gbInfo.containsDistinctAggr), new RowSchema(colInfoLst), rs); + + rsGBOp.setColumnExprMap(colExprMap); + + return new OpAttr("", new HashMap(), rsGBOp); + } + + /** + * RS-GB0 + * + * @param inputOpAf + * @param gbInfo + * @param gbMode + * @return + * @throws SemanticException + */ + private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo, + GroupByDesc.Mode gbMode) throws SemanticException { + ArrayList outputColNames = new ArrayList(); + ArrayList colInfoLst = new ArrayList(); + Map colExprMap = new HashMap(); + String colOutputName = null; + ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0); + List rsColInfoLst = rs.getSchema().getSignature(); + ColumnInfo ci; + boolean useOriginalGBNames = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW); + + // 1. 
Build GB Keys, grouping set starting position + // 1.1 First Add original GB Keys + ArrayList gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0, + gbInfo.gbKeys.size() - 1, true, false); + for (int i = 0; i < gbInfo.gbKeys.size(); i++) { + ci = rsColInfoLst.get(i); + if (useOriginalGBNames) { + colOutputName = gbInfo.outputColNames.get(i); + } else { + colOutputName = SemanticAnalyzer.getColumnInternalName(i); + } + outputColNames.add(colOutputName); + colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), null, false)); + colExprMap.put(colOutputName, gbKeys.get(i)); + } + + // 2. Walk through UDAF and add them to GB + String lastReduceKeyColName = null; + if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) { + lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames() + .get(rs.getConf().getOutputKeyColumnNames().size() - 1); + } + int numDistinctUDFs = 0; + int distinctStartPosInReduceKeys = gbKeys.size(); + List reduceValues = rs.getConf().getValueCols(); + ArrayList aggregations = new ArrayList(); + int udafColStartPosInOriginalGB = gbInfo.gbKeys.size(); + for (int i = 0; i < gbInfo.udafAttrs.size(); i++) { + UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i); + ArrayList aggParameters = new ArrayList(); + + ColumnInfo rsUDAFParamColInfo; + ExprNodeDesc udafParam; + ExprNodeDesc constantPropDistinctUDAFParam; + for (int j = 0; j < udafAttr.udafParams.size(); j++) { + rsUDAFParamColInfo = rsColInfoLst.get(distinctStartPosInReduceKeys + j); + String rsUDAFParamName = rsUDAFParamColInfo.getInternalName(); + // TODO: verify if this is needed + if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) { + rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + ":" + + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j); + } + udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName, + rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol()); + constantPropDistinctUDAFParam = SemanticAnalyzer + .isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(), + reduceValues); + if (constantPropDistinctUDAFParam != null) { + udafParam = constantPropDistinctUDAFParam; + } + aggParameters.add(udafParam); + } + + if (udafAttr.isDistinctUDAF) { + numDistinctUDFs++; + } + Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode, + aggParameters); + aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), + udaf.genericUDAFEvaluator, udaf.convertedParameters, udafAttr.isDistinctUDAF, udafMode)); + if (useOriginalGBNames) { + colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + i); + } else { + colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() + - 1); + } + + colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false)); + outputColNames.add(colOutputName); + } + + Operator rsGB1 = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames, + gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, null, + false, -1, numDistinctUDFs > 0), new RowSchema(colInfoLst), rs); + rsGB1.setColumnExprMap(colExprMap); + + return new OpAttr("", new HashMap(), rsGB1); + } + + @SuppressWarnings("unchecked") + private static OpAttr genMapSideGB(OpAttr inputOpAf, GBInfo gbAttrs) throws SemanticException { + ArrayList outputColNames = new ArrayList(); + ArrayList 
colInfoLst = new ArrayList(); + Map colExprMap = new HashMap(); + Set gbKeyColsAsNamesFrmIn = new HashSet(); + String colOutputName = null; + + // 1. Build GB Keys, grouping set starting position + // 1.1 First Add original GB Keys + ArrayList gbKeys = new ArrayList(); + for (int i = 0; i < gbAttrs.gbKeys.size(); i++) { + gbKeys.add(gbAttrs.gbKeys.get(i)); + colOutputName = SemanticAnalyzer.getColumnInternalName(i); + colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.gbKeyTypes.get(i), "", false)); + outputColNames.add(colOutputName); + gbKeyColsAsNamesFrmIn.add(gbAttrs.gbKeyColNamesInInput.get(i)); + colExprMap.put(colOutputName, gbKeys.get(i)); + } + // 1.2. Adjust GroupingSet Position, GBKeys for GroupingSet Position if + // needed. NOTE: GroupingID is added to map side GB only if we don't GrpSet + // doesn't require additional MR Jobs + int groupingSetsPosition = -1; + boolean inclGrpID = inclGrpSetInMapSide(gbAttrs); + if (inclGrpID) { + groupingSetsPosition = gbKeys.size(); + addGrpSetCol(true, null, false, gbKeys, outputColNames, colInfoLst, colExprMap); + } + // 1.3. Add all distinct params + // NOTE: distinct expr can not be part of of GB key (we assume plan + // gen would have prevented it) + for (int i = 0; i < gbAttrs.distExprNodes.size(); i++) { + if (!gbKeyColsAsNamesFrmIn.contains(gbAttrs.distExprNames.get(i))) { + gbKeys.add(gbAttrs.distExprNodes.get(i)); + colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() - 1); + colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.distExprTypes.get(i), "", false)); + outputColNames.add(colOutputName); + gbKeyColsAsNamesFrmIn.add(gbAttrs.distExprNames.get(i)); + colExprMap.put(colOutputName, gbKeys.get(gbKeys.size() - 1)); + } + } + + // 2. Build Aggregations + ArrayList aggregations = new ArrayList(); + for (UDAFAttrs udafAttr : gbAttrs.udafAttrs) { + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH, + udafAttr.isDistinctUDAF); + aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udafAttr.udafEvaluator, + udafAttr.udafParams, udafAttr.isDistinctUDAF, amode)); + GenericUDAFInfo udafInfo; + try { + udafInfo = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, amode, + udafAttr.udafParams); + } catch (SemanticException e) { + throw new RuntimeException(e); + } + colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size() + - 1); + colInfoLst.add(new ColumnInfo(colOutputName, udafInfo.returnType, "", false)); + outputColNames.add(colOutputName); + } + + // 3. Create GB + @SuppressWarnings("rawtypes") + Operator gbOp = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.HASH, + outputColNames, gbKeys, aggregations, false, gbAttrs.groupByMemoryUsage, + gbAttrs.memoryThreshold, gbAttrs.grpSets, inclGrpID, groupingSetsPosition, + gbAttrs.containsDistinctAggr), new RowSchema(colInfoLst), inputOpAf.inputs.get(0)); + + // 5. 
Setup Expr Col Map + // NOTE: UDAF is not included in ExprColMap + gbOp.setColumnExprMap(colExprMap); + + return new OpAttr("", new HashMap(), gbOp); + } + + private static void addGrpSetCol(boolean createConstantExpr, String grpSetIDExprName, + boolean addReducePrefixToColInfoName, List exprLst, + List outputColumnNames, List colInfoLst, + Map colExprMap) throws SemanticException { + String outputColName = null; + ExprNodeDesc grpSetColExpr = null; + + if (createConstantExpr) { + grpSetColExpr = new ExprNodeConstantDesc("0"); + } else { + grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, grpSetIDExprName, + null, false); + } + exprLst.add(grpSetColExpr); + + outputColName = SemanticAnalyzer.getColumnInternalName(exprLst.size() - 1); + outputColumnNames.add(outputColName); + String internalColName = outputColName; + if (addReducePrefixToColInfoName) { + internalColName = Utilities.ReduceField.KEY.toString() + "." + outputColName; + } + colInfoLst.add(new ColumnInfo(internalColName, grpSetColExpr.getTypeInfo(), null, true)); + colExprMap.put(internalColName, grpSetColExpr); + } + + /** + * Get Reduce Keys for RS following MapSide GB + * + * @param reduceKeys + * assumed to be deduped list of exprs + * @param outputKeyColumnNames + * @param colExprMap + * @return List of ExprNodeDesc of ReduceKeys + * @throws SemanticException + */ + private static ArrayList getReduceKeysForRS(Operator inOp, int startPos, + int endPos, List outputKeyColumnNames, boolean addOnlyOneKeyColName, + ArrayList colInfoLst, Map colExprMap, + boolean addEmptyTabAlias, boolean setColToNonVirtual) throws SemanticException { + ArrayList reduceKeys = null; + if (endPos < 0) { + reduceKeys = new ArrayList(); + } else { + reduceKeys = ExprNodeDescUtils.genExprNodeDesc(inOp, startPos, endPos, addEmptyTabAlias, + setColToNonVirtual); + int outColNameIndx = startPos; + for (int i = 0; i < reduceKeys.size(); ++i) { + String outputColName = SemanticAnalyzer.getColumnInternalName(outColNameIndx); + outColNameIndx++; + if (!addOnlyOneKeyColName || i == 0) { + outputKeyColumnNames.add(outputColName); + } + + // TODO: Verify if this is needed (Why can't it be always null/empty + String tabAlias = addEmptyTabAlias ? "" : null; + ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.KEY.toString() + "." + + outputColName, reduceKeys.get(i).getTypeInfo(), tabAlias, false); + colInfoLst.add(colInfo); + colExprMap.put(colInfo.getInternalName(), reduceKeys.get(i)); + } + } + + return reduceKeys; + } + + /** + * Get Value Keys for RS following MapSide GB + * + * @param GroupByOperator + * MapSide GB + * @param outputKeyColumnNames + * @param colExprMap + * @return List of ExprNodeDesc of Values + * @throws SemanticException + */ + private static ArrayList getValueKeysForRS(Operator inOp, int aggStartPos, + List outputKeyColumnNames, ArrayList colInfoLst, + Map colExprMap, boolean addEmptyTabAlias, boolean setColToNonVirtual) + throws SemanticException { + List mapGBColInfoLst = inOp.getSchema().getSignature(); + ArrayList valueKeys = null; + if (aggStartPos >= mapGBColInfoLst.size()) { + valueKeys = new ArrayList(); + } else { + valueKeys = ExprNodeDescUtils.genExprNodeDesc(inOp, aggStartPos, mapGBColInfoLst.size() - 1, + true, setColToNonVirtual); + for (int i = 0; i < valueKeys.size(); ++i) { + String outputColName = SemanticAnalyzer.getColumnInternalName(i); + outputKeyColumnNames.add(outputColName); + // TODO: Verify if this is needed (Why can't it be always null/empty + String tabAlias = addEmptyTabAlias ? 
"" : null; + ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.VALUE.toString() + "." + + outputColName, valueKeys.get(i).getTypeInfo(), tabAlias, false); + colInfoLst.add(colInfo); + colExprMap.put(colInfo.getInternalName(), valueKeys.get(i)); + } + } + + return valueKeys; + } + + private static List> getDistColIndices(GBInfo gbAttrs, int distOffSet) + throws SemanticException { + List> distColIndices = new ArrayList>(); + + for (List udafDistCols : gbAttrs.distColIndices) { + List udfAdjustedDistColIndx = new ArrayList(); + for (Integer distIndx : udafDistCols) { + udfAdjustedDistColIndx.add(distIndx + distOffSet); + } + distColIndices.add(udfAdjustedDistColIndx); + } + + return distColIndices; + } + + // TODO: Implement this + private static ExprNodeDesc propConstDistUDAFParams() { + return null; + } +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTConverter.java (working copy) @@ -54,11 +54,13 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.exec.RowSchema; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter.HiveToken; import org.apache.hadoop.hive.ql.parse.ASTNode; import org.apache.hadoop.hive.ql.parse.HiveParser; @@ -69,8 +71,8 @@ public class ASTConverter { private static final Log LOG = LogFactory.getLog(ASTConverter.class); - private RelNode root; - private HiveAST hiveAST; + private final RelNode root; + private final HiveAST hiveAST; private RelNode from; private Filter where; private Aggregate groupBy; @@ -213,7 +215,7 @@ private void convertLimitToASTNode(HiveSort limit) { if (limit != null) { - HiveSort hiveLimit = (HiveSort) limit; + HiveSort hiveLimit = limit; RexNode limitExpr = hiveLimit.getFetchExpr(); if (limitExpr != null) { Object val = ((RexLiteral) limitExpr).getValue2(); @@ -224,12 +226,12 @@ private void convertOBToASTNode(HiveSort order) { if (order != null) { - HiveSort hiveSort = (HiveSort) order; + HiveSort hiveSort = order; if (!hiveSort.getCollation().getFieldCollations().isEmpty()) { // 1 Add order by token ASTNode orderAst = ASTBuilder.createAST(HiveParser.TOK_ORDERBY, "TOK_ORDERBY"); - schema = new Schema((HiveSort) hiveSort); + schema = new Schema(hiveSort); Map obRefToCallMap = hiveSort.getInputRefToCallMap(); RexNode obExpr; ASTNode astCol; @@ -370,7 +372,7 @@ static class RexVisitor extends RexVisitorImpl { private final Schema schema; - private boolean useTypeQualInLiteral; + private final boolean useTypeQualInLiteral; protected RexVisitor(Schema schema) { this(schema, false); @@ -567,7 +569,7 @@ private static final long serialVersionUID = 1L; Schema(TableScan scan) { - String tabName = ((RelOptHiveTable) 
scan.getTable()).getTableAlias(); + String tabName = ((HiveTableScan) scan).getTableAlias(); for (RelDataTypeField field : scan.getRowType().getFieldList()) { add(new ColumnInfo(tabName, field.getName())); } @@ -641,7 +643,13 @@ add(new ColumnInfo(null, projName)); } } + + public Schema(String tabAlias, List fieldList) { + for (RelDataTypeField field : fieldList) { + add(new ColumnInfo(tabAlias, field.getName())); } + } + } /* * represents Column information exposed by a QueryBlock. Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java (working copy) @@ -24,58 +24,89 @@ import java.util.LinkedList; import java.util.List; -import org.apache.hadoop.hive.common.type.HiveChar; -import org.apache.hadoop.hive.common.type.HiveVarchar; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.calcite.rel.RelFieldCollation; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFactory; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexFieldCollation; import org.apache.calcite.rex.RexInputRef; import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexOver; import org.apache.calcite.rex.RexVisitorImpl; +import org.apache.calcite.rex.RexWindow; +import org.apache.calcite.rex.RexWindowBound; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.type.SqlTypeUtil; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.Schema; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.Order; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitioningSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.CurrentRowSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.Direction; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.RangeBoundarySpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.ValueBoundarySpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFrameSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowSpec; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import 
org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; /* * convert a RexNode to an ExprNodeDesc */ public class ExprNodeConverter extends RexVisitorImpl { - RelDataType rType; String tabAlias; + String columnAlias; + RelDataType inputRowType; + RelDataType outputRowType; boolean partitioningExpr; + WindowFunctionSpec wfs; private final RelDataTypeFactory dTFactory; - public ExprNodeConverter(String tabAlias, RelDataType rType, boolean partitioningExpr, RelDataTypeFactory dTFactory) { + public ExprNodeConverter(String tabAlias, RelDataType inputRowType, + boolean partitioningExpr, RelDataTypeFactory dTFactory) { + this(tabAlias, null, inputRowType, null, partitioningExpr, dTFactory); + } + + public ExprNodeConverter(String tabAlias, String columnAlias, RelDataType inputRowType, + RelDataType outputRowType, boolean partitioningExpr, RelDataTypeFactory dTFactory) { super(true); - /* - * hb: 6/25/14 for now we only support expressions that only contain - * partition cols. there is no use case for supporting generic expressions. - * for supporting generic exprs., we need to give the converter information - * on whether a column is a partition column or not, whether a column is a - * virtual column or not. - */ - assert partitioningExpr == true; this.tabAlias = tabAlias; - this.rType = rType; + this.columnAlias = columnAlias; + this.inputRowType = inputRowType; + this.outputRowType = outputRowType; this.partitioningExpr = partitioningExpr; this.dTFactory = dTFactory; } + public WindowFunctionSpec getWindowFunctionSpec() { + return this.wfs; + } + @Override public ExprNodeDesc visitInputRef(RexInputRef inputRef) { - RelDataTypeField f = rType.getFieldList().get(inputRef.getIndex()); + RelDataTypeField f = inputRowType.getFieldList().get(inputRef.getIndex()); return new ExprNodeColumnDesc(TypeConverter.convert(f.getType()), f.getName(), tabAlias, partitioningExpr); } + /** + * TODO: Handle 1) cast 2) Field Access 3) Windowing Over() 4, Windowing Agg Call + */ @Override public ExprNodeDesc visitCall(RexCall call) { ExprNodeGenericFuncDesc gfDesc = null; @@ -123,6 +154,9 @@ return gfDesc; } + /** + * TODO: 1. 
Handle NULL + */ @Override public ExprNodeDesc visitLiteral(RexLiteral literal) { RelDataType lType = literal.getType(); @@ -176,4 +210,138 @@ } } + @Override + public ExprNodeDesc visitOver(RexOver over) { + if (!deep) { + return null; } + + final RexWindow window = over.getWindow(); + + final WindowSpec windowSpec = new WindowSpec(); + final PartitioningSpec partitioningSpec = getPSpec(window); + windowSpec.setPartitioning(partitioningSpec); + final WindowFrameSpec windowFrameSpec = getWindowRange(window); + windowSpec.setWindowFrame(windowFrameSpec); + + wfs = new WindowFunctionSpec(); + wfs.setWindowSpec(windowSpec); + final Schema schema = new Schema(tabAlias, inputRowType.getFieldList()); + final ASTNode wUDAFAst = new ASTConverter.RexVisitor(schema).visitOver(over); + wfs.setExpression(wUDAFAst); + ASTNode nameNode = (ASTNode) wUDAFAst.getChild(0); + wfs.setName(nameNode.getText()); + for(int i=1; i < wUDAFAst.getChildCount()-1; i++) { + ASTNode child = (ASTNode) wUDAFAst.getChild(i); + wfs.addArg(child); + } + wfs.setAlias(columnAlias); + + RelDataTypeField f = outputRowType.getField(columnAlias, false, false); + return new ExprNodeColumnDesc(TypeConverter.convert(f.getType()), columnAlias, tabAlias, + partitioningExpr); + } + + private PartitioningSpec getPSpec(RexWindow window) { + PartitioningSpec partitioning = new PartitioningSpec(); + + if (window.partitionKeys != null && !window.partitionKeys.isEmpty()) { + PartitionSpec pSpec = new PartitionSpec(); + for (RexNode pk : window.partitionKeys) { + PartitionExpression exprSpec = new PartitionExpression(); + RexInputRef inputRef = (RexInputRef) pk; + RelDataTypeField f = inputRowType.getFieldList().get(inputRef.getIndex()); + ASTNode astCol; + if (tabAlias == null || tabAlias.isEmpty()) { + astCol = ASTBuilder.unqualifiedName(f.getName()); + } else { + astCol = ASTBuilder.qualifiedName(tabAlias, f.getName()); + } + exprSpec.setExpression(astCol); + pSpec.addExpression(exprSpec); + } + partitioning.setPartSpec(pSpec); + } + + if (window.orderKeys != null && !window.orderKeys.isEmpty()) { + OrderSpec oSpec = new OrderSpec(); + for (RexFieldCollation ok : window.orderKeys) { + OrderExpression exprSpec = new OrderExpression(); + Order order = ok.getDirection() == RelFieldCollation.Direction.ASCENDING ? 
+ Order.ASC : Order.DESC; + exprSpec.setOrder(order); + RexInputRef inputRef = (RexInputRef) ok.left; + RelDataTypeField f = inputRowType.getFieldList().get(inputRef.getIndex()); + ASTNode astCol; + if (tabAlias == null || tabAlias.isEmpty()) { + astCol = ASTBuilder.unqualifiedName(f.getName()); + } else { + astCol = ASTBuilder.qualifiedName(tabAlias, f.getName()); + } + exprSpec.setExpression(astCol); + oSpec.addExpression(exprSpec); + } + partitioning.setOrderSpec(oSpec); + } + + return partitioning; + } + + private WindowFrameSpec getWindowRange(RexWindow window) { + // NOTE: in Hive AST Rows->Range(Physical) & Range -> Values (logical) + + WindowFrameSpec windowFrame = new WindowFrameSpec(); + + BoundarySpec start = null; + RexWindowBound ub = window.getUpperBound(); + if (ub != null) { + start = getWindowBound(ub, window.isRows()); + } + + BoundarySpec end = null; + RexWindowBound lb = window.getLowerBound(); + if (lb != null) { + end = getWindowBound(lb, window.isRows()); + } + + if (start != null || end != null) { + if (start != null) { + windowFrame.setStart(start); + } + if (end != null) { + windowFrame.setEnd(end); + } + } + + return windowFrame; + } + + private BoundarySpec getWindowBound(RexWindowBound wb, boolean isRows) { + BoundarySpec boundarySpec; + + if (wb.isCurrentRow()) { + boundarySpec = new CurrentRowSpec(); + } else { + final Direction direction; + final int amt; + if (wb.isPreceding()) { + direction = Direction.PRECEDING; + } else { + direction = Direction.FOLLOWING; + } + if (wb.isUnbounded()) { + amt = BoundarySpec.UNBOUNDED_AMOUNT; + } else { + amt = RexLiteral.intValue(wb.getOffset()); + } + if (isRows) { + boundarySpec = new RangeBoundarySpec(direction, amt); + } else { + boundarySpec = new ValueBoundarySpec(direction, amt); + } + } + + return boundarySpec; + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ASTBuilder.java (working copy) @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth; import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.parse.ASTNode; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer; import org.apache.hadoop.hive.ql.parse.HiveParser; @@ -68,7 +69,7 @@ // However in HIVE DB name can not appear in select list; in case of join // where table names differ only in DB name, Hive would require user // introducing explicit aliases for tbl. 
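+ // Same substitution as in ASTConverter.Schema above: the alias is taken from the HiveTableScan operator instead of RelOptHiveTable.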
- b.add(HiveParser.Identifier, hTbl.getTableAlias()); + b.add(HiveParser.Identifier, ((HiveTableScan)scan).getTableAlias()); return b.node(); } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java (working copy) @@ -50,9 +50,11 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSort; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import com.google.common.collect.ImmutableList; @@ -95,6 +97,23 @@ return newTopNode; } + private static String getTblAlias(RelNode rel) { + + if (null == rel) { + return null; + } + if (rel instanceof HiveTableScan) { + return ((HiveTableScan)rel).getTableAlias(); + } + if (rel instanceof Project) { + return null; + } + if (rel.getInputs().size() == 1) { + return getTblAlias(rel.getInput(0)); + } + return null; + } + private static void convertOpTree(RelNode rel, RelNode parent) { if (rel instanceof HepRelVertex) { @@ -103,6 +122,12 @@ if (!validJoinParent(rel, parent)) { introduceDerivedTable(rel, parent); } + String leftChild = getTblAlias(((Join)rel).getLeft()); + if (null != leftChild && leftChild.equalsIgnoreCase(getTblAlias(((Join)rel).getRight()))) { + // introduce derived table above one child, if this is self-join + // since user provided aliases are lost at this point. + introduceDerivedTable(((Join)rel).getLeft(), rel); + } } else if (rel instanceof MultiJoin) { throw new RuntimeException("Found MultiJoin"); } else if (rel instanceof RelSubset) { Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveInsertExchange4JoinRule.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveInsertExchange4JoinRule.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveInsertExchange4JoinRule.java (revision 1672450) @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.rel.RelDistribution; +import org.apache.calcite.rel.RelFieldCollation; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Exchange; +import org.apache.calcite.rel.core.Join; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelCollation; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelDistribution; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortExchange; + +import com.google.common.collect.ImmutableList; + +/** Not an optimization rule. + * Rule to aid in translation from Calcite tree -> Hive tree. + * Transforms : + * Left Right Left Right + * \ / -> \ / + * Join HashExchange HashExchange + * \ / + * Join + */ +public class HiveInsertExchange4JoinRule extends RelOptRule { + + protected static transient final Log LOG = LogFactory + .getLog(HiveInsertExchange4JoinRule.class); + + public HiveInsertExchange4JoinRule() { + // match join with exactly 2 inputs + super(RelOptRule.operand(Join.class, + operand(RelNode.class, any()), + operand(RelNode.class, any()))); + } + + @Override + public void onMatch(RelOptRuleCall call) { + Join join = call.rel(0); + + if (call.rel(1) instanceof Exchange && + call.rel(2) instanceof Exchange) { + return; + } + + JoinPredicateInfo joinPredInfo = + HiveCalciteUtil.JoinPredicateInfo.constructJoinPredicateInfo(join); + + // get key columns from inputs. Those are the columns on which we will distribute on. + // It is also the columns we will sort on. + List joinLeftKeyPositions = new ArrayList(); + List joinRightKeyPositions = new ArrayList(); + ImmutableList.Builder leftCollationListBuilder = + new ImmutableList.Builder(); + ImmutableList.Builder rightCollationListBuilder = + new ImmutableList.Builder(); + for (int i = 0; i < joinPredInfo.getEquiJoinPredicateElements().size(); i++) { + JoinLeafPredicateInfo joinLeafPredInfo = joinPredInfo. 
+ getEquiJoinPredicateElements().get(i); + joinLeftKeyPositions.addAll(joinLeafPredInfo.getProjsFromLeftPartOfJoinKeysInChildSchema()); + for (int leftPos : joinLeafPredInfo.getProjsFromLeftPartOfJoinKeysInChildSchema()) { + leftCollationListBuilder.add(new RelFieldCollation(leftPos)); + } + joinRightKeyPositions.addAll(joinLeafPredInfo.getProjsFromRightPartOfJoinKeysInChildSchema()); + for (int rightPos : joinLeafPredInfo.getProjsFromRightPartOfJoinKeysInChildSchema()) { + rightCollationListBuilder.add(new RelFieldCollation(rightPos)); + } + } + + HiveSortExchange left = HiveSortExchange.create(join.getLeft(), + new HiveRelDistribution(RelDistribution.Type.HASH_DISTRIBUTED, joinLeftKeyPositions), + new HiveRelCollation(leftCollationListBuilder.build())); + HiveSortExchange right = HiveSortExchange.create(join.getRight(), + new HiveRelDistribution(RelDistribution.Type.HASH_DISTRIBUTED, joinRightKeyPositions), + new HiveRelCollation(rightCollationListBuilder.build())); + + Join newJoin = join.copy(join.getTraitSet(), join.getCondition(), + left, right, join.getJoinType(), join.isSemiJoinDone()); + + call.getPlanner().onCopy(join, newJoin); + + call.transformTo(newJoin); + } + +} Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java (revision 1672450) @@ -0,0 +1,197 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.calcite.plan.RelOptCluster; +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.RelFactories.FilterFactory; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexUtil; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.SqlOperator; +import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +import com.google.common.collect.ImmutableList; + +public final class HiveJoinAddNotNullRule extends RelOptRule { + + private static final String NOT_NULL_FUNC_NAME = "isnotnull"; + + /** The singleton. */ + public static final HiveJoinAddNotNullRule INSTANCE = + new HiveJoinAddNotNullRule(HiveFilter.DEFAULT_FILTER_FACTORY); + + private final FilterFactory filterFactory; + + //~ Constructors ----------------------------------------------------------- + + /** + * Creates an HiveJoinAddNotNullRule. + */ + public HiveJoinAddNotNullRule(FilterFactory filterFactory) { + super(operand(Join.class, + operand(RelNode.class, any()), + operand(RelNode.class, any()))); + this.filterFactory = filterFactory; + } + + //~ Methods ---------------------------------------------------------------- + + public void onMatch(RelOptRuleCall call) { + final Join join = call.rel(0); + RelNode leftInput = call.rel(1); + RelNode rightInput = call.rel(2); + + if (join.getJoinType() != JoinRelType.INNER) { + return; + } + + if (join.getCondition().isAlwaysTrue()) { + return; + } + + JoinPredicateInfo joinPredInfo = + HiveCalciteUtil.JoinPredicateInfo.constructJoinPredicateInfo(join); + + Set joinLeftKeyPositions = new HashSet(); + Set joinRightKeyPositions = new HashSet(); + for (int i = 0; i < joinPredInfo.getEquiJoinPredicateElements().size(); i++) { + JoinLeafPredicateInfo joinLeafPredInfo = joinPredInfo. 
+ getEquiJoinPredicateElements().get(i); + joinLeftKeyPositions.addAll(joinLeafPredInfo.getProjsFromLeftPartOfJoinKeysInChildSchema()); + joinRightKeyPositions.addAll(joinLeafPredInfo.getProjsFromRightPartOfJoinKeysInChildSchema()); + } + + // Build not null conditions + final RelOptCluster cluster = join.getCluster(); + final RexBuilder rexBuilder = join.getCluster().getRexBuilder(); + + final Map newLeftConditions = getNotNullConditions(cluster, + rexBuilder, leftInput, joinLeftKeyPositions); + final Map newRightConditions = getNotNullConditions(cluster, + rexBuilder, rightInput, joinRightKeyPositions); + + // Nothing will be added to the expression + if (newLeftConditions == null && newRightConditions == null) { + return; + } + + if (newLeftConditions != null) { + if (leftInput instanceof HiveFilter) { + leftInput = leftInput.getInput(0); + } + leftInput = createHiveFilterConjunctiveCondition(filterFactory, rexBuilder, + leftInput, newLeftConditions.values()); + } + if (newRightConditions != null) { + if (rightInput instanceof HiveFilter) { + rightInput = rightInput.getInput(0); + } + rightInput = createHiveFilterConjunctiveCondition(filterFactory, rexBuilder, + rightInput, newRightConditions.values()); + } + + Join newJoin = join.copy(join.getTraitSet(), join.getCondition(), + leftInput, rightInput, join.getJoinType(), join.isSemiJoinDone()); + + call.getPlanner().onCopy(join, newJoin); + + call.transformTo(newJoin); + } + + private static Map getNotNullConditions(RelOptCluster cluster, + RexBuilder rexBuilder, RelNode input, Set inputKeyPositions) { + + boolean added = false; + + final RelDataType returnType = cluster.getTypeFactory(). + createSqlType(SqlTypeName.BOOLEAN); + + final Map newConditions; + if (input instanceof HiveFilter) { + newConditions = splitCondition(((HiveFilter) input).getCondition()); + } + else { + newConditions = new HashMap(); + } + for (int pos : inputKeyPositions) { + try { + RelDataType keyType = input.getRowType().getFieldList().get(pos).getType(); + // Nothing to do if key cannot be null + if (!keyType.isNullable()) { + continue; + } + SqlOperator funcCall = SqlFunctionConverter.getCalciteOperator(NOT_NULL_FUNC_NAME, + FunctionRegistry.getFunctionInfo(NOT_NULL_FUNC_NAME).getGenericUDF(), + ImmutableList.of(keyType), returnType); + RexNode cond = rexBuilder.makeCall(funcCall, rexBuilder.makeInputRef(input, pos)); + String digest = cond.toString(); + if (!newConditions.containsKey(digest)) { + newConditions.put(digest,cond); + added = true; + } + } catch (SemanticException e) { + throw new AssertionError(e.getMessage()); + } + } + // Nothing will be added to the expression + if (!added) { + return null; + } + return newConditions; + } + + private static Map splitCondition(RexNode condition) { + Map newConditions = new HashMap(); + if (condition.getKind() == SqlKind.AND) { + for (RexNode node : ((RexCall) condition).getOperands()) { + newConditions.put(node.toString(), node); + } + } + else { + newConditions.put(condition.toString(), condition); + } + return newConditions; + } + + private static RelNode createHiveFilterConjunctiveCondition(FilterFactory filterFactory, + RexBuilder rexBuilder, RelNode input, Collection conditions) { + final RexNode newCondition = RexUtil.composeConjunction(rexBuilder, conditions, false); + return filterFactory.createFilter(input, newCondition); + } +} \ No newline at end of file Index: ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java =================================================================== --- 
ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java (working copy) @@ -110,6 +110,13 @@ public JoinDesc(final Map> exprs, List outputColumnNames, final boolean noOuterJoin, + final JoinCondDesc[] conds, ExprNodeDesc[][] joinKeys) { + this (exprs, outputColumnNames, noOuterJoin, conds, + new HashMap>(), joinKeys); + } + + public JoinDesc(final Map> exprs, + List outputColumnNames, final boolean noOuterJoin, final JoinCondDesc[] conds, final Map> filters, ExprNodeDesc[][] joinKeys) { this.exprs = exprs; Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java (working copy) @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; @@ -444,4 +445,42 @@ // If the child is also decimal, no cast is needed (we hope - can target type be narrower?). return HiveDecimalUtils.getDecimalTypeForPrimitiveCategory(childTi); } + + /** + * Build ExprNodeColumnDesc for the projections in the input operator from + * sartpos to endpos(both included). Operator must have an associated + * colExprMap. + * + * @param inputOp + * Input Hive Operator + * @param startPos + * starting position in the input operator schema; must be >=0 and <= + * endPos + * @param endPos + * end position in the input operator schema; must be >=0. + * @return List of ExprNodeDesc + */ + public static ArrayList genExprNodeDesc(Operator inputOp, int startPos, int endPos, + boolean addEmptyTabAlias, boolean setColToNonVirtual) { + ArrayList exprColLst = new ArrayList(); + List colInfoLst = inputOp.getSchema().getSignature(); + + String tabAlias; + boolean vc; + ColumnInfo ci; + for (int i = startPos; i <= endPos; i++) { + ci = colInfoLst.get(i); + tabAlias = ci.getTabAlias(); + if (addEmptyTabAlias) { + tabAlias = ""; } + vc = ci.getIsVirtualCol(); + if (setColToNonVirtual) { + vc = false; + } + exprColLst.add(new ExprNodeColumnDesc(ci.getType(), ci.getInternalName(), tabAlias, vc)); + } + + return exprColLst; + } +} Index: ql/.gitignore =================================================================== --- ql/.gitignore (.../https://svn.apache.org/repos/asf/hive/trunk) (revision 1672451) +++ ql/.gitignore (working copy) @@ -1 +1,2 @@ dependency-reduced-pom.xml +/bin/ Property changes on: . ___________________________________________________________________ Modified: svn:mergeinfo Reverse-merged /hive/branches/cbo:r1605012-1627125 Merged /hive/trunk:r1605012-1672448
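HiveJoinAddNotNullRule, added by this patch, places an isnotnull filter on every nullable equi-join key below each input of an inner join. Two details carry the logic: conditions are keyed by their digest, so a predicate already present in an existing filter (or added for another key) is never duplicated, and keys whose type is not nullable are skipped; if nothing new would be added, the input is left untouched. Below is a minimal standalone sketch of that bookkeeping only, with plain strings standing in for Calcite RexNode conditions and a nullability map standing in for the input row type; the class and method names are illustrative and not part of the patch.

import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class NotNullFilterSketch {

  /**
   * Returns the conjunctive filter to place below one join input, or null if
   * nothing needs to be added (mirrors getNotNullConditions returning null).
   */
  static String notNullFilter(Map<String, Boolean> keyIsNullable, Set<String> joinKeys,
      List<String> existingConjuncts) {
    // Seed with what an existing filter already checks (splitCondition analogue).
    Map<String, String> conditions = new LinkedHashMap<String, String>();
    for (String c : existingConjuncts) {
      conditions.put(c, c);
    }

    boolean added = false;
    for (String key : joinKeys) {
      // Nothing to do if the key cannot be null.
      if (Boolean.FALSE.equals(keyIsNullable.get(key))) {
        continue;
      }
      String cond = "isnotnull(" + key + ")";
      // Digest-based dedup: only add a predicate that is not already present.
      if (!conditions.containsKey(cond)) {
        conditions.put(cond, cond);
        added = true;
      }
    }
    if (!added) {
      return null;
    }
    // Compose the conjunction (RexUtil.composeConjunction analogue).
    return String.join(" AND ", conditions.values());
  }

  public static void main(String[] args) {
    Map<String, Boolean> nullability = new HashMap<String, Boolean>();
    nullability.put("key", Boolean.TRUE);   // nullable join key -> gets a not-null check
    nullability.put("id", Boolean.FALSE);   // NOT NULL join key -> skipped

    Set<String> joinKeys = new LinkedHashSet<String>(Arrays.asList("key", "id"));
    List<String> existing = Arrays.asList("(value > 0)");

    // Prints: (value > 0) AND isnotnull(key)
    System.out.println(notNullFilter(nullability, joinKeys, existing));
  }
}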