diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 7dedd23591..ee0442811f 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1562,6 +1562,8 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal "Whether to transform OR clauses in Filter operators into IN clauses"), HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 31, "Minimum number of OR clauses needed to transform into IN clauses"), + HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true, + "Whether to transform count distinct into two stages"), HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true, "Extract partition columns from IN clauses"), // Constant propagation optimizer diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index e23ef6317f..6dde189496 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -59,7 +59,8 @@ minitez.query.files=explainuser_3.q,\ tez_union_with_udf.q -minillap.shared.query.files=insert_into1.q,\ +minillap.shared.query.files=count_dist_rewrite.q,\ + insert_into1.q,\ insert_into2.q,\ insert_values_orig_table.,\ llapdecider.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java index 8b04cd44fa..f8085fea0b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java @@ -1167,8 +1167,16 @@ public boolean supportSkewJoinOptimization() { @SuppressWarnings("unchecked") T descClone = (T)conf.clone(); // also clone the colExprMap by default + // we need a deep copy + ArrayList<ColumnInfo> colInfos = new ArrayList<>(); + colInfos.addAll(getSchema().getSignature()); + Map<String, ExprNodeDesc> map = null; + if (getColumnExprMap() != null) { + map = new HashMap<>(); + map.putAll(getColumnExprMap()); + } Operator<? extends OperatorDesc> ret = OperatorFactory.getAndMakeChild( - cContext, descClone, getSchema(), getColumnExprMap(), parentClones); + cContext, descClone, new RowSchema(colInfos), map, parentClones); return ret; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java new file mode 100644 index 0000000000..c21550e9fa --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java @@ -0,0 +1,413 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PlanUtils; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; + +/** + * This transformation does count distinct optimization. + */ +public class CountDistinctRewriteProc extends Transform { + + private static final Logger LOG = LoggerFactory.getLogger(CountDistinctRewriteProc.class + .getName()); + + public CountDistinctRewriteProc() { + } + + @Override + public ParseContext transform(ParseContext pctx) throws SemanticException { + + Map opRules = new LinkedHashMap(); + // process group-by pattern + opRules + .put( + new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + GroupByOperator.getOperatorName() + + "%"), getCountDistinctProc(pctx)); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topop nodes + List topNodes = new ArrayList(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + + return pctx; + } + + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + return null; + } + }; + } + + private NodeProcessor getCountDistinctProc(ParseContext pctx) { + return new CountDistinctProcessor(pctx); + } + + /** + * CountDistinctProcessor. 
+ * + */ + public class CountDistinctProcessor implements NodeProcessor { + + protected ParseContext pGraphContext; + + public CountDistinctProcessor(ParseContext pGraphContext) { + this.pGraphContext = pGraphContext; + } + + ExprNodeColumnDesc exprNodeColumnDesc = null; + int indexOfDist = -1; + + // Check if we can process it or not + protected boolean checkCountDistinct(GroupByOperator mGby, GroupByOperator rGby) { + int cntDist = 0; + ArrayList keys = mGby.getConf().getKeys(); + if (!(keys.size() == 1 && rGby.getConf().getKeys().size() == 0 && mGby.getConf() + .getOutputColumnNames().size() == mGby.getConf().getAggregators().size() + 1)) { + return false; + } + for (int pos = 0; pos < mGby.getConf().getAggregators().size(); pos++) { + AggregationDesc aggr = mGby.getConf().getAggregators().get(pos); + if (aggr.getDistinct()) { + if (cntDist != 0 || !aggr.getGenericUDAFName().equalsIgnoreCase("count")) { + // there are 2 or more distincts, or distinct is not on count + // TODO: may be the same count(distinct key), count(distinct key) + // TODO: deal with duplicate count distinct key + return false; + } + indexOfDist = pos; + if (!(aggr.getParameters().size() == 1 + && aggr.getParameters().get(0) instanceof ExprNodeColumnDesc && mGby.getConf() + .getKeys().get(0) instanceof ExprNodeColumnDesc)) { + return false; + } else { + ExprNodeColumnDesc agg = (ExprNodeColumnDesc) aggr.getParameters().get(0); + ExprNodeColumnDesc key = (ExprNodeColumnDesc) mGby.getConf().getKeys().get(0); + if (!agg.isSame(key)) { + return false; + } + } + cntDist++; + } + } + if (cntDist != 1) { + return false; + } + return true; + } + + /* + * We will transform GB-RS-GBY to mGby1-rs1-mGby2-rs2-rGby1 + */ + @SuppressWarnings("unchecked") + protected void processGroupBy(GroupByOperator mGby, ReduceSinkOperator rs, GroupByOperator rGby) + throws SemanticException, CloneNotSupportedException { + // remove count(distinct) in map-side gby + List> parents = mGby.getParentOperators(); + List> children = rGby.getChildOperators(); + mGby.removeParents(); + rs.removeParents(); + rGby.removeParents(); + + GroupByOperator mGby1 = genMapGroupby1(mGby, indexOfDist); + ReduceSinkOperator rs1 = genReducesink1(mGby1, rs, indexOfDist); + GroupByOperator mGby2 = genMapGroupby2(rs1, mGby); + ReduceSinkOperator rs2 = genReducesink2(mGby2, rs); + GroupByOperator rGby1 = genReduceGroupby(rs2, rGby, indexOfDist); + for (Operator parent : parents) { + OperatorFactory.makeChild(parent, mGby1); + } + OperatorFactory.makeChild(mGby1, rs1); + OperatorFactory.makeChild(rs1, mGby2); + OperatorFactory.makeChild(mGby2, rs2); + OperatorFactory.makeChild(rs2, rGby1); + for (Operator child : children) { + child.removeParents(); + OperatorFactory.makeChild(rGby1, child); + } + } + + // mGby1 ---already contains group by key, we need to remove distinct column + private GroupByOperator genMapGroupby1(Operator mGby, int indexOfDist) + throws CloneNotSupportedException { + GroupByOperator mGby1 = (GroupByOperator) mGby.clone(); + String fieldString = mGby1.getConf().getOutputColumnNames().get(indexOfDist + 1); + mGby1.getColumnExprMap().remove(fieldString); + mGby1.getConf().getOutputColumnNames().remove(indexOfDist + 1); + mGby1.getConf().getAggregators().remove(indexOfDist); + mGby1.getConf().setDistinct(false); + mGby1.getSchema().getColumnNames().remove(indexOfDist + 1); + mGby1.getSchema().getSignature().remove(indexOfDist + 1); + return mGby1; + } + + // rs1 --- remove distinctColIndices, set #reducer as -1, reset keys, + // values, colexpmap and 
rowschema + private ReduceSinkOperator genReducesink1(GroupByOperator mGby1, + Operator rs, int indexOfDist) throws CloneNotSupportedException, + SemanticException { + ReduceSinkOperator rs1 = (ReduceSinkOperator) rs.clone(); + Map colExprMap = new HashMap(); + ArrayList outputKeyColumnNames = new ArrayList(); + ArrayList outputValueColumnNames = new ArrayList(); + ArrayList reduceKeys = new ArrayList(); + ArrayList reduceValues = new ArrayList(); + List internalNames = new ArrayList<>(); + for (int index = 0; index < mGby1.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby1.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + // index==0 means this is key + if (index == 0) { + reduceKeys.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index); + outputKeyColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.KEY.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } else { + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index - 1); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } + } + List> distinctColIndices = new ArrayList<>(); + rs1.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 1, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, 1, -1, + AcidUtils.Operation.NOT_ACID)); + rs1.setColumnExprMap(colExprMap); + + rs1.getSchema().getColumnNames().remove(indexOfDist + 1); + rs1.getSchema().getSignature().remove(indexOfDist + 1); + // KEY._col0:0._col0 => KEY._col0 + + for (int i = 0; i < rs1.getSchema().getSignature().size(); i++) { + rs1.getSchema().getSignature().get(i).setInternalName(internalNames.get(i)); + rs1.getSchema().getColumnNames().set(i, internalNames.get(i)); + } + return rs1; + } + + // mGby2 ---already contains key, remove distinct and change all the others + private GroupByOperator genMapGroupby2(ReduceSinkOperator rs1, + Operator mGby) throws CloneNotSupportedException, SemanticException { + GroupByOperator mGby2 = (GroupByOperator) mGby.clone(); + ArrayList rowSchema = new ArrayList<>(); + ArrayList groupByKeys = new ArrayList(); + ArrayList outputColumnNames = new ArrayList(); + Map colExprMap = new HashMap(); + + ColumnInfo exprInfo = rs1.getSchema().getSignature().get(0); + ExprNodeDesc key = new ExprNodeColumnDesc(exprInfo); + groupByKeys.add(key); + String field = SemanticAnalyzer.getColumnInternalName(0); + outputColumnNames.add(field); + ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false); + colExprMap.put(field, key); + rowSchema.add(oColInfo); + + ArrayList aggregations = new ArrayList(); + for (int index = 0; index < mGby2.getConf().getAggregators().size(); index++) { + ArrayList aggParameters = new ArrayList(); + if (index != indexOfDist) { + AggregationDesc desc = mGby2.getConf().getAggregators().get(index); + ColumnInfo paraExprInfo = null; + // for example, original it is max 0, dist 1, min 2 + // rs1's schema is key 0, max 1, min 2 + if (index < indexOfDist) { + paraExprInfo = 
rs1.getSchema().getSignature().get(index + 1); + } else { + paraExprInfo = rs1.getSchema().getSignature().get(index); + } + + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + + // for all the other aggregations, we set the mode to PARTIAL2 + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false); + GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator(); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + aggregations.add(new AggregationDesc(desc.getGenericUDAFName(), + udaf.genericUDAFEvaluator, udaf.convertedParameters, false, amode)); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } else { + // add count(KEY._col0) to replace distinct + ColumnInfo paraExprInfo = rs1.getSchema().getSignature().get(0); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH, false); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator( + "count", aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + aggregations.add(newDesc); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + } + mGby2.getConf().setMode(GroupByDesc.Mode.PARTIAL2); + mGby2.getConf().setOutputColumnNames(outputColumnNames); + mGby2.getConf().getKeys().clear(); + mGby2.getConf().getKeys().addAll(groupByKeys); + mGby2.getConf().getAggregators().clear(); + mGby2.getConf().getAggregators().addAll(aggregations); + mGby2.getConf().setDistinct(false); + mGby2.setSchema(new RowSchema(rowSchema)); + mGby2.setColumnExprMap(colExprMap); + return mGby2; + } + + // #reducer is already 1 + private ReduceSinkOperator genReducesink2(GroupByOperator mGby2, + Operator rs) throws SemanticException, CloneNotSupportedException { + ReduceSinkOperator rs2 = (ReduceSinkOperator) rs.clone(); + Map colExprMap = new HashMap<>(); + + ArrayList outputKeyColumnNames = new ArrayList(); + ArrayList outputValueColumnNames = new ArrayList(); + ArrayList reduceValues = new ArrayList(); + for (int index = 1; index < mGby2.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index - 1); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() 
+ "." + outputColName; + colExprMap.put(internalName, exprDesc); + } + List> distinctColIndices = new ArrayList<>(); + ArrayList reduceKeys = new ArrayList<>(); + rs2.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 0, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, 0, 1, + AcidUtils.Operation.NOT_ACID)); + rs2.setColumnExprMap(colExprMap); + rs2.getSchema().getSignature().remove(0); + return rs2; + } + + // replace the distinct with the count aggregation + private GroupByOperator genReduceGroupby(ReduceSinkOperator rs2, + Operator rGby, int indexOfDist) throws SemanticException, + CloneNotSupportedException { + GroupByOperator rGby1 = (GroupByOperator) rGby.clone(); + ColumnInfo paraExprInfo = rs2.getSchema().getSignature().get(indexOfDist); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ArrayList aggParameters = new ArrayList(); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo + .getTabAlias(), paraExprInfo.getIsVirtualCol())); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator("count", + aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.MERGEPARTIAL, false); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + rGby1.getConf().getAggregators().set(indexOfDist, newDesc); + rGby1.getConf().setDistinct(false); + return rGby1; + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator mGby = (GroupByOperator) stack.get(stack.size() - 3); + ReduceSinkOperator rs = (ReduceSinkOperator) stack.get(stack.size() - 2); + GroupByOperator rGby = (GroupByOperator) stack.get(stack.size() - 1); + if (checkCountDistinct(mGby, rGby)) { + LOG.info("trigger count distinct rewrite"); + try { + processGroupBy(mGby, rs, rGby); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e.getMessage()); + } + } + return null; + } + + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java index 3233157d8d..32d1e05da4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java @@ -535,6 +535,14 @@ public boolean removeChildren(Operator currOp, int depth } currOp = currOp.getChildOperators().get(0); } + // GB-RS-GB. the two GB's schema does not match. It may due to count + // distinct rewrite. In this case, we just fail this optimization. + // There is an assertion inside genOutputSelectForGroupBy. + if (inputOp.getSchema().getSignature().size() != currOp.getSchema().getSignature().size()) { + LOG.info("The signature of " + inputOp.getOperatorId() + " does not match that of " + + currOp.getOperatorId()); + return false; + } // add selectOp to match the schema // after that, inputOp is the parent of selOp. 
@@ -556,7 +564,6 @@ public boolean removeChildren(Operator currOp, int depth private Operator genOutputSelectForGroupBy( Operator parentOp, Operator currOp) { - assert (parentOp.getSchema().getSignature().size() == currOp.getSchema().getSignature().size()); Iterator pIter = parentOp.getSchema().getSignature().iterator(); Iterator cIter = currOp.getSchema().getSignature().iterator(); List columns = new ArrayList(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index 7dace9076f..6af8f1e504 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -72,6 +72,10 @@ public void initialize(HiveConf hiveConf) { // we are translating Calcite operators into Hive operators. transformations.add(new HiveOpConverterPostProc()); + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVECOUNTDISTINCTOPTIMIZER)) { + transformations.add(new CountDistinctRewriteProc()); + } + // Add the transformation that computes the lineage information. Set postExecHooks = Sets.newHashSet( Splitter.on(",").trimResults().omitEmptyStrings().split( diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java index 38a9ef2af1..fe91ee7025 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java @@ -367,4 +367,19 @@ public GroupByOperatorExplainVectorization getGroupByVectorization() { } return new GroupByOperatorExplainVectorization(this, vectorDesc); } + + @Override + public Object clone() { + ArrayList outputColumnNames = new ArrayList<>(); + outputColumnNames.addAll(this.outputColumnNames); + ArrayList keys = new ArrayList<>(); + keys.addAll(this.keys); + ArrayList aggregators = new ArrayList<>(); + aggregators.addAll(this.aggregators); + List listGroupingSets = new ArrayList<>(); + listGroupingSets.addAll(this.listGroupingSets); + return new GroupByDesc(this.mode, outputColumnNames, keys, aggregators, + this.groupByMemoryUsage, this.memoryThreshold, listGroupingSets, this.groupingSetsPresent, + this.groupingSetPosition, this.isDistinct); + } } diff --git a/ql/src/test/queries/clientpositive/count_dist_rewrite.q b/ql/src/test/queries/clientpositive/count_dist_rewrite.q new file mode 100644 index 0000000000..182b140f0f --- /dev/null +++ b/ql/src/test/queries/clientpositive/count_dist_rewrite.q @@ -0,0 +1,57 @@ +explain select max(key), count(distinct key) B1_CNTD from src; + +select max(key), count(distinct key) B1_CNTD from src; + +explain select max(key), count(distinct key), min(key) from src; + +select max(key), count(distinct key), min(key) from src; + +explain select max(key), count(distinct key), min(key), avg(key) from src; + +select max(key), count(distinct key), min(key), avg(key) from src; + +explain select count(1), count(distinct key) from src; + +select count(1), count(distinct key) from src; + +explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src; +select count(1), count(distinct key), 
cast(STDDEV(key) as int) from src; +select count(distinct key), count(1), cast(STDDEV(key) as int) from src; + +explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; + +SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; diff --git a/ql/src/test/results/clientpositive/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out new file mode 100644 index 0000000000..ac5cb7b4f8 --- /dev/null +++ b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out @@ -0,0 +1,658 @@ +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from 
src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 908 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 908 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column 
stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE 
Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(KEY._col0), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: string), _col5 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: 
_col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col3 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: bigint), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142 +PREHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142 +PREHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root 
stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: substr(value, 5) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col0), avg(_col0), max(_col0), min(_col0), std(_col0), stddev_samp(_col0), variance(_col0), var_samp(_col0) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: double), _col2 (type: struct), _col4 (type: string), _col5 (type: string), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), count(KEY._col0), max(VALUE._col2), min(VALUE._col3), std(VALUE._col4), stddev_samp(VALUE._col5), variance(VALUE._col6), var_samp(VALUE._col7) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: double), _col2 (type: struct), _col3 (type: bigint), _col4 (type: string), _col5 (type: string), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), max(VALUE._col3), min(VALUE._col4), std(VALUE._col5), stddev_samp(VALUE._col6), variance(VALUE._col7), var_samp(VALUE._col8) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1060 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: double), _col1 (type: double), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), UDFToInteger(_col5) (type: int), UDFToInteger(_col6) (type: int), UDFToInteger(_col7) (type: int), UDFToInteger(_col8) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1060 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1060 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: 
Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +130091.0 260.182 309 98 0 142 143 20428 20469 diff --git a/ql/src/test/results/clientpositive/groupby_sort_11.q.out b/ql/src/test/results/clientpositive/groupby_sort_11.q.out index 2b3bf4a07a..f3927dd605 100644 --- a/ql/src/test/results/clientpositive/groupby_sort_11.q.out +++ b/ql/src/test/results/clientpositive/groupby_sort_11.q.out @@ -26,7 +26,8 @@ POSTHOOK: query: EXPLAIN select count(distinct key) from T1 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -40,17 +41,37 @@ STAGE PLANS: outputColumnNames: key Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT key) - bucketGroup: true keys: key (type: string) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) sort order: + + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) @@ -292,7 +313,8 @@ POSTHOOK: query: EXPLAIN select count(distinct key+key) from T1 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -306,18 +328,40 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col0) keys: _col0 (type: double) mode: 
hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: double) sort order: + + Map-reduce partition columns: _col0 (type: double) Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + aggregations: count(KEY._col0) + keys: KEY._col0 (type: double) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/groupby_sort_8.q.out b/ql/src/test/results/clientpositive/groupby_sort_8.q.out index 4faa0757cc..820d9afdf3 100644 --- a/ql/src/test/results/clientpositive/groupby_sort_8.q.out +++ b/ql/src/test/results/clientpositive/groupby_sort_8.q.out @@ -37,7 +37,8 @@ select count(distinct key) from T1 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -51,17 +52,37 @@ STAGE PLANS: outputColumnNames: key Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT key) - bucketGroup: true keys: key (type: string) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) sort order: + + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) diff --git a/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out new file mode 100644 index 0000000000..ddc717217c --- /dev/null +++ 
b/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out @@ -0,0 +1,665 @@ +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 55555 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 55555 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 205 Data size: 57195 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 57195 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col2 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic 
stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 205 Data size: 93275 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 93275 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col3 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 205 Data size: 94915 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 94915 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 205 Data size: 145755 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition 
columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 145755 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 205 Data size: 147395 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 147395 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 19475 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 19475 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 205 Data size: 21115 Basic 
stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 21115 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 205 Data size: 96555 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 96555 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(KEY._col0), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 205 Data size: 98195 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 98195 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: 
bigint), _col4 (type: string), _col5 (type: string) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 205 Data size: 35875 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 35875 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col3 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 205 Data size: 37515 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 37515 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + 
outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: bigint), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142 +PREHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142 +PREHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: substr(value, 5) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: sum(_col0), avg(_col0), max(_col0), min(_col0), std(_col0), stddev_samp(_col0), variance(_col0), var_samp(_col0) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 214 Data size: 243104 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 214 Data size: 243104 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 
(type: double), _col2 (type: struct), _col4 (type: string), _col5 (type: string), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), count(KEY._col0), max(VALUE._col2), min(VALUE._col3), std(VALUE._col4), stddev_samp(VALUE._col5), variance(VALUE._col6), var_samp(VALUE._col7) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 214 Data size: 244816 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 214 Data size: 244816 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double), _col2 (type: struct), _col3 (type: bigint), _col4 (type: string), _col5 (type: string), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), max(VALUE._col3), min(VALUE._col4), std(VALUE._col5), stddev_samp(VALUE._col6), variance(VALUE._col7), var_samp(VALUE._col8) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: double), _col1 (type: double), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), UDFToInteger(_col5) (type: int), UDFToInteger(_col6) (type: int), UDFToInteger(_col7) (type: int), UDFToInteger(_col8) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 408 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 408 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +130091.0 260.182 309 98 0 142 143 20428 20469 diff --git a/ql/src/test/results/clientpositive/llap/metadataonly1.q.out b/ql/src/test/results/clientpositive/llap/metadataonly1.q.out index 27218cf599..f3e779e461 100644 --- 
a/ql/src/test/results/clientpositive/llap/metadataonly1.q.out +++ b/ql/src/test/results/clientpositive/llap/metadataonly1.q.out @@ -254,6 +254,7 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -267,26 +268,27 @@ STAGE PLANS: outputColumnNames: ds Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator - aggregations: count(DISTINCT ds) keys: ds (type: string) mode: hash - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) null sort order: a sort order: + - Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 - auto parallelism: false + auto parallelism: true Execution mode: llap LLAP IO: no inputs Path -> Alias: - nullscan://null/default.test1/part_ds=1_ [test1] +#### A masked pattern was here #### Path -> Partition: - nullscan://null/default.test1/part_ds=1_ +#### A masked pattern was here #### Partition - input format: org.apache.hadoop.hive.ql.io.ZeroRowsInputFormat + base file name: ds=1 + input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 1 @@ -306,10 +308,10 @@ STAGE PLANS: rawDataSize 0 serialization.ddl struct test1 { i32 a, double b} serialization.format 1 - serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 0 #### A masked pattern was here #### - serde: org.apache.hadoop.hive.serde2.NullStructSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -331,13 +333,30 @@ STAGE PLANS: name: default.test1 name: default.test1 Truncated Path -> Alias: - nullscan://null/default.test1/part_ds=1_ [test1] + /test1/ds=1 [test1] Reducer 2 Execution mode: llap Needs Tagging: false Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + aggregations: count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + value expressions: _col1 (type: bigint) + auto parallelism: false + Reducer 3 + Execution mode: llap + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/nullgroup4.q.out b/ql/src/test/results/clientpositive/nullgroup4.q.out index e5a8eeee14..bd27e8ca2c 100644 --- a/ql/src/test/results/clientpositive/nullgroup4.q.out +++ b/ql/src/test/results/clientpositive/nullgroup4.q.out @@ -7,7 +7,8 @@ POSTHOOK: type: QUERY STAGE DEPENDENCIES: 
Stage-1 is a root stage Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 STAGE PLANS: Stage: Stage-1 @@ -24,10 +25,10 @@ STAGE PLANS: outputColumnNames: _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(1), count(DISTINCT _col1) + aggregations: count(1) keys: _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) @@ -37,7 +38,29 @@ STAGE PLANS: value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(DISTINCT KEY._col0:0._col0) + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) mode: partials outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE @@ -48,7 +71,7 @@ STAGE PLANS: output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - Stage: Stage-2 + Stage: Stage-3 Map Reduce Map Operator Tree: TableScan @@ -93,7 +116,8 @@ select count(1), count(distinct x.value) from src x where x.key = 9999 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -110,19 +134,42 @@ STAGE PLANS: outputColumnNames: _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(1), count(DISTINCT _col1) + aggregations: count(1) keys: _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) sort order: + + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(DISTINCT KEY._col0:0._col0) + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/perf/query16.q.out b/ql/src/test/results/clientpositive/perf/query16.q.out index cf90c0c162..4f88fbf9ff 100644 --- a/ql/src/test/results/clientpositive/perf/query16.q.out +++ b/ql/src/test/results/clientpositive/perf/query16.q.out @@ -1,4 +1,4 @@ -Warning: Shuffle Join MERGEJOIN[106][tables = [$hdt$_2, $hdt$_3, $hdt$_1, $hdt$_4]] in Stage 'Reducer 17' is a cross product +Warning: Shuffle Join MERGEJOIN[111][tables = [$hdt$_2, $hdt$_3, $hdt$_1, $hdt$_4]] in Stage 'Reducer 18' is a cross product PREHOOK: query: explain select count(distinct cs_order_number) as `order count` ,sum(cs_ext_ship_cost) as `total shipping cost` @@ -60,172 +60,178 @@ POSTHOOK: type: QUERY Plan optimized by CBO. Vertex dependency in root stage -Reducer 13 <- Map 12 (SIMPLE_EDGE) -Reducer 15 <- Map 14 (SIMPLE_EDGE), Reducer 18 (SIMPLE_EDGE) -Reducer 16 <- Reducer 15 (SIMPLE_EDGE) -Reducer 17 <- Map 14 (CUSTOM_SIMPLE_EDGE), Map 19 (CUSTOM_SIMPLE_EDGE), Map 20 (CUSTOM_SIMPLE_EDGE), Map 21 (CUSTOM_SIMPLE_EDGE) -Reducer 18 <- Reducer 17 (SIMPLE_EDGE) -Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 9 (SIMPLE_EDGE) -Reducer 3 <- Map 10 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) -Reducer 4 <- Map 11 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) -Reducer 5 <- Reducer 13 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) -Reducer 6 <- Reducer 16 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) +Reducer 14 <- Map 13 (SIMPLE_EDGE) +Reducer 16 <- Map 15 (SIMPLE_EDGE), Reducer 19 (SIMPLE_EDGE) +Reducer 17 <- Reducer 16 (SIMPLE_EDGE) +Reducer 18 <- Map 15 (CUSTOM_SIMPLE_EDGE), Map 20 (CUSTOM_SIMPLE_EDGE), Map 21 (CUSTOM_SIMPLE_EDGE), Map 22 (CUSTOM_SIMPLE_EDGE) +Reducer 19 <- Reducer 18 (SIMPLE_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 10 (SIMPLE_EDGE) +Reducer 3 <- Map 11 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Map 12 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) +Reducer 5 <- Reducer 14 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) +Reducer 6 <- Reducer 17 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) Reducer 7 <- Reducer 6 (SIMPLE_EDGE) -Reducer 8 <- Reducer 7 (SIMPLE_EDGE) +Reducer 8 <- Reducer 7 (CUSTOM_SIMPLE_EDGE) +Reducer 9 <- Reducer 8 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:100 Stage-1 - Reducer 8 + Reducer 9 File Output Operator [FS_73] Limit [LIM_72] (rows=1 width=344) Number of rows:100 Select Operator [SEL_71] (rows=1 width=344) Output:["_col0","_col1","_col2"] - <-Reducer 7 [SIMPLE_EDGE] + <-Reducer 8 [SIMPLE_EDGE] SHUFFLE [RS_70] - Group By Operator [GBY_68] (rows=1 width=344) - Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col0:0._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] - <-Reducer 6 [SIMPLE_EDGE] - SHUFFLE [RS_67] - Group By Operator [GBY_66] (rows=1395035081047425024 width=1) - Output:["_col0","_col1","_col2","_col3"],aggregations:["count(DISTINCT _col4)","sum(_col5)","sum(_col6)"],keys:_col4 - Select Operator [SEL_65] (rows=1395035081047425024 width=1) - Output:["_col4","_col5","_col6"] 
- Filter Operator [FIL_64] (rows=1395035081047425024 width=1) - predicate:_col16 is null - Select Operator [SEL_63] (rows=2790070162094850048 width=1) - Output:["_col4","_col5","_col6","_col16"] - Merge Join Operator [MERGEJOIN_112] (rows=2790070162094850048 width=1) - Conds:RS_60._col3, _col4=RS_61._col0, _col1(Inner),Output:["_col4","_col5","_col6","_col14"] - <-Reducer 16 [SIMPLE_EDGE] - SHUFFLE [RS_61] - PartitionCols:_col0, _col1 - Group By Operator [GBY_46] (rows=2536427365110644736 width=1) - Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 - <-Reducer 15 [SIMPLE_EDGE] - SHUFFLE [RS_45] + Group By Operator [GBY_78] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] + <-Reducer 7 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_77] + Group By Operator [GBY_76] (rows=1395035081047425024 width=1) + Output:["_col0","_col1","_col2","_col3"],aggregations:["count(KEY._col0)","sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0 + <-Reducer 6 [SIMPLE_EDGE] + SHUFFLE [RS_75] + PartitionCols:_col0 + Group By Operator [GBY_74] (rows=1395035081047425024 width=1) + Output:["_col0","_col2","_col3"],aggregations:["sum(_col5)","sum(_col6)"],keys:_col4 + Select Operator [SEL_65] (rows=1395035081047425024 width=1) + Output:["_col4","_col5","_col6"] + Filter Operator [FIL_64] (rows=1395035081047425024 width=1) + predicate:_col16 is null + Select Operator [SEL_63] (rows=2790070162094850048 width=1) + Output:["_col4","_col5","_col6","_col16"] + Merge Join Operator [MERGEJOIN_117] (rows=2790070162094850048 width=1) + Conds:RS_60._col3, _col4=RS_61._col0, _col1(Inner),Output:["_col4","_col5","_col6","_col14"] + <-Reducer 17 [SIMPLE_EDGE] + SHUFFLE [RS_61] PartitionCols:_col0, _col1 - Group By Operator [GBY_44] (rows=5072854730221289472 width=1) - Output:["_col0","_col1"],keys:_col2, _col3 - Select Operator [SEL_43] (rows=5072854730221289472 width=1) - Output:["_col2","_col3"] - Filter Operator [FIL_42] (rows=5072854730221289472 width=1) - predicate:(_col2 <> _col0) - Merge Join Operator [MERGEJOIN_110] (rows=5072854730221289472 width=1) - Conds:RS_39._col1=RS_40._col1(Inner),Output:["_col0","_col2","_col3"] - <-Map 14 [SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_39] - PartitionCols:_col1 - Select Operator [SEL_20] (rows=287989836 width=135) - Output:["_col0","_col1"] - TableScan [TS_19] (rows=287989836 width=135) - default@catalog_sales,cs2,Tbl:COMPLETE,Col:NONE,Output:["cs_warehouse_sk","cs_order_number"] - <-Reducer 18 [SIMPLE_EDGE] - SHUFFLE [RS_40] - PartitionCols:_col1 - Select Operator [SEL_38] (rows=4611686018427387903 width=1) - Output:["_col0","_col1"] - Group By Operator [GBY_37] (rows=4611686018427387903 width=1) - Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 - <-Reducer 17 [SIMPLE_EDGE] - SHUFFLE [RS_36] - PartitionCols:_col0, _col1 - Group By Operator [GBY_35] (rows=9223372036854775807 width=1) - Output:["_col0","_col1"],keys:_col4, _col3 - Merge Join Operator [MERGEJOIN_106] (rows=9223372036854775807 width=1) - Conds:(Inner),(Inner),(Inner),Output:["_col3","_col4"] - <-Map 14 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_32] - Select Operator [SEL_28] (rows=287989836 width=135) - Output:["_col0","_col1"] - Please refer to the previous TableScan [TS_19] - <-Map 19 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_29] - Select Operator [SEL_22] (rows=73049 width=4) - TableScan [TS_21] (rows=73049 width=1119) - default@date_dim,date_dim,Tbl:COMPLETE,Col:COMPLETE - <-Map 20 [CUSTOM_SIMPLE_EDGE] - 
PARTITION_ONLY_SHUFFLE [RS_30] - Select Operator [SEL_24] (rows=60 width=4) - TableScan [TS_23] (rows=60 width=2045) - default@call_center,call_center,Tbl:COMPLETE,Col:COMPLETE - <-Map 21 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_31] - Select Operator [SEL_26] (rows=40000000 width=4) - TableScan [TS_25] (rows=40000000 width=1014) - default@customer_address,customer_address,Tbl:COMPLETE,Col:COMPLETE - <-Reducer 5 [SIMPLE_EDGE] - SHUFFLE [RS_60] - PartitionCols:_col3, _col4 - Merge Join Operator [MERGEJOIN_111] (rows=421645953 width=135) - Conds:RS_57._col4=RS_58._col0(Left Outer),Output:["_col3","_col4","_col5","_col6","_col14"] - <-Reducer 13 [SIMPLE_EDGE] - SHUFFLE [RS_58] - PartitionCols:_col0 - Select Operator [SEL_18] (rows=14399440 width=106) - Output:["_col0","_col1"] - Group By Operator [GBY_17] (rows=14399440 width=106) - Output:["_col0"],keys:KEY._col0 - <-Map 12 [SIMPLE_EDGE] - SHUFFLE [RS_16] - PartitionCols:_col0 - Group By Operator [GBY_15] (rows=28798881 width=106) - Output:["_col0"],keys:cr_order_number - Filter Operator [FIL_103] (rows=28798881 width=106) - predicate:cr_order_number is not null - TableScan [TS_12] (rows=28798881 width=106) - default@catalog_returns,cr1,Tbl:COMPLETE,Col:NONE,Output:["cr_order_number"] - <-Reducer 4 [SIMPLE_EDGE] - SHUFFLE [RS_57] - PartitionCols:_col4 - Merge Join Operator [MERGEJOIN_109] (rows=383314495 width=135) - Conds:RS_54._col2=RS_55._col0(Inner),Output:["_col3","_col4","_col5","_col6"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_55] + Group By Operator [GBY_46] (rows=2536427365110644736 width=1) + Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 + <-Reducer 16 [SIMPLE_EDGE] + SHUFFLE [RS_45] + PartitionCols:_col0, _col1 + Group By Operator [GBY_44] (rows=5072854730221289472 width=1) + Output:["_col0","_col1"],keys:_col2, _col3 + Select Operator [SEL_43] (rows=5072854730221289472 width=1) + Output:["_col2","_col3"] + Filter Operator [FIL_42] (rows=5072854730221289472 width=1) + predicate:(_col2 <> _col0) + Merge Join Operator [MERGEJOIN_115] (rows=5072854730221289472 width=1) + Conds:RS_39._col1=RS_40._col1(Inner),Output:["_col0","_col2","_col3"] + <-Map 15 [SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_39] + PartitionCols:_col1 + Select Operator [SEL_20] (rows=287989836 width=135) + Output:["_col0","_col1"] + TableScan [TS_19] (rows=287989836 width=135) + default@catalog_sales,cs2,Tbl:COMPLETE,Col:NONE,Output:["cs_warehouse_sk","cs_order_number"] + <-Reducer 19 [SIMPLE_EDGE] + SHUFFLE [RS_40] + PartitionCols:_col1 + Select Operator [SEL_38] (rows=4611686018427387903 width=1) + Output:["_col0","_col1"] + Group By Operator [GBY_37] (rows=4611686018427387903 width=1) + Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 + <-Reducer 18 [SIMPLE_EDGE] + SHUFFLE [RS_36] + PartitionCols:_col0, _col1 + Group By Operator [GBY_35] (rows=9223372036854775807 width=1) + Output:["_col0","_col1"],keys:_col4, _col3 + Merge Join Operator [MERGEJOIN_111] (rows=9223372036854775807 width=1) + Conds:(Inner),(Inner),(Inner),Output:["_col3","_col4"] + <-Map 15 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_32] + Select Operator [SEL_28] (rows=287989836 width=135) + Output:["_col0","_col1"] + Please refer to the previous TableScan [TS_19] + <-Map 20 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_29] + Select Operator [SEL_22] (rows=73049 width=4) + TableScan [TS_21] (rows=73049 width=1119) + default@date_dim,date_dim,Tbl:COMPLETE,Col:COMPLETE + <-Map 21 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_30] + Select Operator [SEL_24] (rows=60 width=4) + 
TableScan [TS_23] (rows=60 width=2045) + default@call_center,call_center,Tbl:COMPLETE,Col:COMPLETE + <-Map 22 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_31] + Select Operator [SEL_26] (rows=40000000 width=4) + TableScan [TS_25] (rows=40000000 width=1014) + default@customer_address,customer_address,Tbl:COMPLETE,Col:COMPLETE + <-Reducer 5 [SIMPLE_EDGE] + SHUFFLE [RS_60] + PartitionCols:_col3, _col4 + Merge Join Operator [MERGEJOIN_116] (rows=421645953 width=135) + Conds:RS_57._col4=RS_58._col0(Left Outer),Output:["_col3","_col4","_col5","_col6","_col14"] + <-Reducer 14 [SIMPLE_EDGE] + SHUFFLE [RS_58] PartitionCols:_col0 - Select Operator [SEL_11] (rows=30 width=2045) - Output:["_col0"] - Filter Operator [FIL_102] (rows=30 width=2045) - predicate:((cc_county) IN ('Ziebach County', 'Levy County', 'Huron County', 'Franklin Parish', 'Daviess County') and cc_call_center_sk is not null) - TableScan [TS_9] (rows=60 width=2045) - default@call_center,call_center,Tbl:COMPLETE,Col:NONE,Output:["cc_call_center_sk","cc_county"] - <-Reducer 3 [SIMPLE_EDGE] - SHUFFLE [RS_54] - PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_108] (rows=348467716 width=135) - Conds:RS_51._col1=RS_52._col0(Inner),Output:["_col2","_col3","_col4","_col5","_col6"] - <-Map 10 [SIMPLE_EDGE] - SHUFFLE [RS_52] + Select Operator [SEL_18] (rows=14399440 width=106) + Output:["_col0","_col1"] + Group By Operator [GBY_17] (rows=14399440 width=106) + Output:["_col0"],keys:KEY._col0 + <-Map 13 [SIMPLE_EDGE] + SHUFFLE [RS_16] + PartitionCols:_col0 + Group By Operator [GBY_15] (rows=28798881 width=106) + Output:["_col0"],keys:cr_order_number + Filter Operator [FIL_108] (rows=28798881 width=106) + predicate:cr_order_number is not null + TableScan [TS_12] (rows=28798881 width=106) + default@catalog_returns,cr1,Tbl:COMPLETE,Col:NONE,Output:["cr_order_number"] + <-Reducer 4 [SIMPLE_EDGE] + SHUFFLE [RS_57] + PartitionCols:_col4 + Merge Join Operator [MERGEJOIN_114] (rows=383314495 width=135) + Conds:RS_54._col2=RS_55._col0(Inner),Output:["_col3","_col4","_col5","_col6"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_55] PartitionCols:_col0 - Select Operator [SEL_8] (rows=20000000 width=1014) + Select Operator [SEL_11] (rows=30 width=2045) Output:["_col0"] - Filter Operator [FIL_101] (rows=20000000 width=1014) - predicate:((ca_state = 'NY') and ca_address_sk is not null) - TableScan [TS_6] (rows=40000000 width=1014) - default@customer_address,customer_address,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_51] - PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_107] (rows=316788826 width=135) - Conds:RS_48._col0=RS_49._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5","_col6"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_48] + Filter Operator [FIL_107] (rows=30 width=2045) + predicate:((cc_county) IN ('Ziebach County', 'Levy County', 'Huron County', 'Franklin Parish', 'Daviess County') and cc_call_center_sk is not null) + TableScan [TS_9] (rows=60 width=2045) + default@call_center,call_center,Tbl:COMPLETE,Col:NONE,Output:["cc_call_center_sk","cc_county"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_54] + PartitionCols:_col2 + Merge Join Operator [MERGEJOIN_113] (rows=348467716 width=135) + Conds:RS_51._col1=RS_52._col0(Inner),Output:["_col2","_col3","_col4","_col5","_col6"] + <-Map 11 [SIMPLE_EDGE] + SHUFFLE [RS_52] PartitionCols:_col0 - Select Operator [SEL_2] (rows=287989836 width=135) - Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"] - Filter Operator [FIL_99] 
(rows=287989836 width=135) - predicate:(cs_ship_date_sk is not null and cs_ship_addr_sk is not null and cs_call_center_sk is not null) - TableScan [TS_0] (rows=287989836 width=135) - default@catalog_sales,cs1,Tbl:COMPLETE,Col:NONE,Output:["cs_ship_date_sk","cs_ship_addr_sk","cs_call_center_sk","cs_warehouse_sk","cs_order_number","cs_ext_ship_cost","cs_net_profit"] - <-Map 9 [SIMPLE_EDGE] - SHUFFLE [RS_49] - PartitionCols:_col0 - Select Operator [SEL_5] (rows=8116 width=1119) + Select Operator [SEL_8] (rows=20000000 width=1014) Output:["_col0"] - Filter Operator [FIL_100] (rows=8116 width=1119) - predicate:(CAST( d_date AS TIMESTAMP) BETWEEN 2001-04-01 00:00:00.0 AND 2001-05-31 01:00:00.0 and d_date_sk is not null) - TableScan [TS_3] (rows=73049 width=1119) - default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] + Filter Operator [FIL_106] (rows=20000000 width=1014) + predicate:((ca_state = 'NY') and ca_address_sk is not null) + TableScan [TS_6] (rows=40000000 width=1014) + default@customer_address,customer_address,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_51] + PartitionCols:_col1 + Merge Join Operator [MERGEJOIN_112] (rows=316788826 width=135) + Conds:RS_48._col0=RS_49._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5","_col6"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_48] + PartitionCols:_col0 + Select Operator [SEL_2] (rows=287989836 width=135) + Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"] + Filter Operator [FIL_104] (rows=287989836 width=135) + predicate:(cs_ship_date_sk is not null and cs_ship_addr_sk is not null and cs_call_center_sk is not null) + TableScan [TS_0] (rows=287989836 width=135) + default@catalog_sales,cs1,Tbl:COMPLETE,Col:NONE,Output:["cs_ship_date_sk","cs_ship_addr_sk","cs_call_center_sk","cs_warehouse_sk","cs_order_number","cs_ext_ship_cost","cs_net_profit"] + <-Map 10 [SIMPLE_EDGE] + SHUFFLE [RS_49] + PartitionCols:_col0 + Select Operator [SEL_5] (rows=8116 width=1119) + Output:["_col0"] + Filter Operator [FIL_105] (rows=8116 width=1119) + predicate:(CAST( d_date AS TIMESTAMP) BETWEEN 2001-04-01 00:00:00.0 AND 2001-05-31 01:00:00.0 and d_date_sk is not null) + TableScan [TS_3] (rows=73049 width=1119) + default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] diff --git a/ql/src/test/results/clientpositive/perf/query28.q.out b/ql/src/test/results/clientpositive/perf/query28.q.out index 78129cf68b..8448712bf6 100644 --- a/ql/src/test/results/clientpositive/perf/query28.q.out +++ b/ql/src/test/results/clientpositive/perf/query28.q.out @@ -1,4 +1,4 @@ -Warning: Shuffle Join MERGEJOIN[58][tables = [$hdt$_0, $hdt$_1, $hdt$_2, $hdt$_3, $hdt$_4, $hdt$_5]] in Stage 'Reducer 3' is a cross product +Warning: Shuffle Join MERGEJOIN[88][tables = [$hdt$_0, $hdt$_1, $hdt$_2, $hdt$_3, $hdt$_4, $hdt$_5]] in Stage 'Reducer 4' is a cross product PREHOOK: query: explain select * from (select avg(ss_list_price) B1_LP ,count(ss_list_price) B1_CNT @@ -104,103 +104,139 @@ POSTHOOK: type: QUERY Plan optimized by CBO. 
Vertex dependency in root stage +Reducer 10 <- Reducer 9 (CUSTOM_SIMPLE_EDGE) +Reducer 11 <- Map 1 (SIMPLE_EDGE) +Reducer 12 <- Reducer 11 (CUSTOM_SIMPLE_EDGE) +Reducer 13 <- Map 1 (SIMPLE_EDGE) +Reducer 14 <- Reducer 13 (CUSTOM_SIMPLE_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE) -Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 4 (CUSTOM_SIMPLE_EDGE), Reducer 5 (CUSTOM_SIMPLE_EDGE), Reducer 6 (CUSTOM_SIMPLE_EDGE), Reducer 7 (CUSTOM_SIMPLE_EDGE), Reducer 8 (CUSTOM_SIMPLE_EDGE) -Reducer 4 <- Map 1 (SIMPLE_EDGE) +Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +Reducer 4 <- Reducer 10 (CUSTOM_SIMPLE_EDGE), Reducer 12 (CUSTOM_SIMPLE_EDGE), Reducer 14 (CUSTOM_SIMPLE_EDGE), Reducer 3 (CUSTOM_SIMPLE_EDGE), Reducer 6 (CUSTOM_SIMPLE_EDGE), Reducer 8 (CUSTOM_SIMPLE_EDGE) Reducer 5 <- Map 1 (SIMPLE_EDGE) -Reducer 6 <- Map 1 (SIMPLE_EDGE) +Reducer 6 <- Reducer 5 (CUSTOM_SIMPLE_EDGE) Reducer 7 <- Map 1 (SIMPLE_EDGE) -Reducer 8 <- Map 1 (SIMPLE_EDGE) +Reducer 8 <- Reducer 7 (CUSTOM_SIMPLE_EDGE) +Reducer 9 <- Map 1 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:100 Stage-1 - Reducer 3 + Reducer 4 File Output Operator [FS_51] Limit [LIM_50] (rows=1 width=2497) Number of rows:100 Select Operator [SEL_49] (rows=1 width=2497) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17"] - Merge Join Operator [MERGEJOIN_58] (rows=1 width=2497) + Merge Join Operator [MERGEJOIN_88] (rows=1 width=2497) Conds:(Inner),(Inner),(Inner),(Inner),(Inner),Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17"] - <-Reducer 2 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_42] - Group By Operator [GBY_5] (rows=1 width=416) - Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_4] - Group By Operator [GBY_3] (rows=21333171 width=88) - Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(ss_list_price)","count(ss_list_price)","count(DISTINCT ss_list_price)"],keys:ss_list_price - Select Operator [SEL_2] (rows=21333171 width=88) - Output:["ss_list_price"] - Filter Operator [FIL_52] (rows=21333171 width=88) - predicate:(ss_quantity BETWEEN 0 AND 5 and (ss_list_price BETWEEN 11 AND 21 or ss_coupon_amt BETWEEN 460 AND 1460 or ss_wholesale_cost BETWEEN 14 AND 34)) - TableScan [TS_0] (rows=575995635 width=88) - default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_quantity","ss_wholesale_cost","ss_list_price","ss_coupon_amt"] - <-Reducer 4 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_43] - Group By Operator [GBY_12] (rows=1 width=416) - Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_11] - Group By Operator [GBY_10] (rows=21333171 width=88) - Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(ss_list_price)","count(ss_list_price)","count(DISTINCT ss_list_price)"],keys:ss_list_price - Select Operator [SEL_9] (rows=21333171 width=88) - Output:["ss_list_price"] - Filter Operator [FIL_53] (rows=21333171 width=88) - predicate:(ss_quantity BETWEEN 26 AND 30 and (ss_list_price BETWEEN 28 AND 38 or ss_coupon_amt BETWEEN 2513 AND 3513 or ss_wholesale_cost BETWEEN 42 AND 62)) - Please refer to the previous TableScan [TS_0] - <-Reducer 5 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_44] - 
Group By Operator [GBY_19] (rows=1 width=416) - Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_18] - Group By Operator [GBY_17] (rows=21333171 width=88) - Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(ss_list_price)","count(ss_list_price)","count(DISTINCT ss_list_price)"],keys:ss_list_price - Select Operator [SEL_16] (rows=21333171 width=88) - Output:["ss_list_price"] - Filter Operator [FIL_54] (rows=21333171 width=88) - predicate:(ss_quantity BETWEEN 21 AND 25 and (ss_list_price BETWEEN 135 AND 145 or ss_coupon_amt BETWEEN 14180 AND 15180 or ss_wholesale_cost BETWEEN 38 AND 58)) - Please refer to the previous TableScan [TS_0] - <-Reducer 6 [CUSTOM_SIMPLE_EDGE] + <-Reducer 10 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_45] - Group By Operator [GBY_26] (rows=1 width=416) - Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_25] - Group By Operator [GBY_24] (rows=21333171 width=88) - Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(ss_list_price)","count(ss_list_price)","count(DISTINCT ss_list_price)"],keys:ss_list_price - Select Operator [SEL_23] (rows=21333171 width=88) - Output:["ss_list_price"] - Filter Operator [FIL_55] (rows=21333171 width=88) - predicate:(ss_quantity BETWEEN 16 AND 20 and (ss_list_price BETWEEN 142 AND 152 or ss_coupon_amt BETWEEN 3054 AND 4054 or ss_wholesale_cost BETWEEN 80 AND 100)) - Please refer to the previous TableScan [TS_0] - <-Reducer 7 [CUSTOM_SIMPLE_EDGE] + Group By Operator [GBY_71] (rows=1 width=416) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"] + <-Reducer 9 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_70] + Group By Operator [GBY_69] (rows=21333171 width=88) + Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(KEY._col0)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_68] + PartitionCols:_col0 + Group By Operator [GBY_67] (rows=21333171 width=88) + Output:["_col0","_col1","_col2"],aggregations:["avg(ss_list_price)","count(ss_list_price)"],keys:ss_list_price + Select Operator [SEL_23] (rows=21333171 width=88) + Output:["ss_list_price"] + Filter Operator [FIL_85] (rows=21333171 width=88) + predicate:(ss_quantity BETWEEN 16 AND 20 and (ss_list_price BETWEEN 142 AND 152 or ss_coupon_amt BETWEEN 3054 AND 4054 or ss_wholesale_cost BETWEEN 80 AND 100)) + TableScan [TS_0] (rows=575995635 width=88) + default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_quantity","ss_wholesale_cost","ss_list_price","ss_coupon_amt"] + <-Reducer 12 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_46] - Group By Operator [GBY_33] (rows=1 width=416) - Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_32] - Group By Operator [GBY_31] (rows=21333171 width=88) - Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(ss_list_price)","count(ss_list_price)","count(DISTINCT ss_list_price)"],keys:ss_list_price - Select Operator [SEL_30] (rows=21333171 width=88) - Output:["ss_list_price"] - Filter Operator [FIL_56] (rows=21333171 width=88) - predicate:(ss_quantity BETWEEN 11 AND 15 and (ss_list_price BETWEEN 66 AND 76 or ss_coupon_amt BETWEEN 920 AND 1920 or ss_wholesale_cost BETWEEN 4 AND 
24)) - Please refer to the previous TableScan [TS_0] - <-Reducer 8 [CUSTOM_SIMPLE_EDGE] + Group By Operator [GBY_76] (rows=1 width=416) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"] + <-Reducer 11 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_75] + Group By Operator [GBY_74] (rows=21333171 width=88) + Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(KEY._col0)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_73] + PartitionCols:_col0 + Group By Operator [GBY_72] (rows=21333171 width=88) + Output:["_col0","_col1","_col2"],aggregations:["avg(ss_list_price)","count(ss_list_price)"],keys:ss_list_price + Select Operator [SEL_30] (rows=21333171 width=88) + Output:["ss_list_price"] + Filter Operator [FIL_86] (rows=21333171 width=88) + predicate:(ss_quantity BETWEEN 11 AND 15 and (ss_list_price BETWEEN 66 AND 76 or ss_coupon_amt BETWEEN 920 AND 1920 or ss_wholesale_cost BETWEEN 4 AND 24)) + Please refer to the previous TableScan [TS_0] + <-Reducer 14 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_47] - Group By Operator [GBY_40] (rows=1 width=416) - Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_39] - Group By Operator [GBY_38] (rows=21333171 width=88) - Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(ss_list_price)","count(ss_list_price)","count(DISTINCT ss_list_price)"],keys:ss_list_price - Select Operator [SEL_37] (rows=21333171 width=88) - Output:["ss_list_price"] - Filter Operator [FIL_57] (rows=21333171 width=88) - predicate:(ss_quantity BETWEEN 6 AND 10 and (ss_list_price BETWEEN 91 AND 101 or ss_coupon_amt BETWEEN 1430 AND 2430 or ss_wholesale_cost BETWEEN 32 AND 52)) - Please refer to the previous TableScan [TS_0] + Group By Operator [GBY_81] (rows=1 width=416) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"] + <-Reducer 13 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_80] + Group By Operator [GBY_79] (rows=21333171 width=88) + Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(KEY._col0)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_78] + PartitionCols:_col0 + Group By Operator [GBY_77] (rows=21333171 width=88) + Output:["_col0","_col1","_col2"],aggregations:["avg(ss_list_price)","count(ss_list_price)"],keys:ss_list_price + Select Operator [SEL_37] (rows=21333171 width=88) + Output:["ss_list_price"] + Filter Operator [FIL_87] (rows=21333171 width=88) + predicate:(ss_quantity BETWEEN 6 AND 10 and (ss_list_price BETWEEN 91 AND 101 or ss_coupon_amt BETWEEN 1430 AND 2430 or ss_wholesale_cost BETWEEN 32 AND 52)) + Please refer to the previous TableScan [TS_0] + <-Reducer 3 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_42] + Group By Operator [GBY_56] (rows=1 width=416) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"] + <-Reducer 2 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_55] + Group By Operator [GBY_54] (rows=21333171 width=88) + Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(KEY._col0)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_53] + PartitionCols:_col0 + Group By Operator [GBY_52] (rows=21333171 width=88) + 
Output:["_col0","_col1","_col2"],aggregations:["avg(ss_list_price)","count(ss_list_price)"],keys:ss_list_price + Select Operator [SEL_2] (rows=21333171 width=88) + Output:["ss_list_price"] + Filter Operator [FIL_82] (rows=21333171 width=88) + predicate:(ss_quantity BETWEEN 0 AND 5 and (ss_list_price BETWEEN 11 AND 21 or ss_coupon_amt BETWEEN 460 AND 1460 or ss_wholesale_cost BETWEEN 14 AND 34)) + Please refer to the previous TableScan [TS_0] + <-Reducer 6 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_43] + Group By Operator [GBY_61] (rows=1 width=416) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"] + <-Reducer 5 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_60] + Group By Operator [GBY_59] (rows=21333171 width=88) + Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(KEY._col0)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_58] + PartitionCols:_col0 + Group By Operator [GBY_57] (rows=21333171 width=88) + Output:["_col0","_col1","_col2"],aggregations:["avg(ss_list_price)","count(ss_list_price)"],keys:ss_list_price + Select Operator [SEL_9] (rows=21333171 width=88) + Output:["ss_list_price"] + Filter Operator [FIL_83] (rows=21333171 width=88) + predicate:(ss_quantity BETWEEN 26 AND 30 and (ss_list_price BETWEEN 28 AND 38 or ss_coupon_amt BETWEEN 2513 AND 3513 or ss_wholesale_cost BETWEEN 42 AND 62)) + Please refer to the previous TableScan [TS_0] + <-Reducer 8 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_44] + Group By Operator [GBY_66] (rows=1 width=416) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"] + <-Reducer 7 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_65] + Group By Operator [GBY_64] (rows=21333171 width=88) + Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(KEY._col0)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_63] + PartitionCols:_col0 + Group By Operator [GBY_62] (rows=21333171 width=88) + Output:["_col0","_col1","_col2"],aggregations:["avg(ss_list_price)","count(ss_list_price)"],keys:ss_list_price + Select Operator [SEL_16] (rows=21333171 width=88) + Output:["ss_list_price"] + Filter Operator [FIL_84] (rows=21333171 width=88) + predicate:(ss_quantity BETWEEN 21 AND 25 and (ss_list_price BETWEEN 135 AND 145 or ss_coupon_amt BETWEEN 14180 AND 15180 or ss_wholesale_cost BETWEEN 38 AND 58)) + Please refer to the previous TableScan [TS_0] diff --git a/ql/src/test/results/clientpositive/perf/query94.q.out b/ql/src/test/results/clientpositive/perf/query94.q.out index 836b16bf9f..ed833c0924 100644 --- a/ql/src/test/results/clientpositive/perf/query94.q.out +++ b/ql/src/test/results/clientpositive/perf/query94.q.out @@ -5,126 +5,132 @@ POSTHOOK: type: QUERY Plan optimized by CBO. 
Vertex dependency in root stage -Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 9 (SIMPLE_EDGE) -Reducer 3 <- Map 11 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) -Reducer 4 <- Map 12 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) -Reducer 5 <- Map 13 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) -Reducer 6 <- Map 14 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) +Reducer 10 <- Map 11 (SIMPLE_EDGE), Map 9 (SIMPLE_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 10 (SIMPLE_EDGE) +Reducer 3 <- Map 12 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Map 13 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) +Reducer 5 <- Map 14 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) +Reducer 6 <- Map 15 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) Reducer 7 <- Reducer 6 (SIMPLE_EDGE) -Reducer 9 <- Map 10 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) +Reducer 8 <- Reducer 7 (CUSTOM_SIMPLE_EDGE) Stage-0 Fetch Operator limit:100 Stage-1 - Reducer 7 + Reducer 8 File Output Operator [FS_51] Limit [LIM_50] (rows=1 width=344) Number of rows:100 - Group By Operator [GBY_48] (rows=1 width=344) - Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col0:0._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] - <-Reducer 6 [SIMPLE_EDGE] - SHUFFLE [RS_47] - Group By Operator [GBY_46] (rows=127554770 width=135) - Output:["_col0","_col1","_col2","_col3"],aggregations:["count(DISTINCT _col3)","sum(_col4)","sum(_col5)"],keys:_col3 - Select Operator [SEL_45] (rows=127554770 width=135) - Output:["_col3","_col4","_col5"] - Filter Operator [FIL_44] (rows=127554770 width=135) - predicate:_col12 is null - Merge Join Operator [MERGEJOIN_85] (rows=255109540 width=135) - Conds:RS_40._col3=RS_41._col0(Left Outer),Output:["_col3","_col4","_col5","_col12"] - <-Map 14 [SIMPLE_EDGE] - SHUFFLE [RS_41] - PartitionCols:_col0 - Select Operator [SEL_25] (rows=14398467 width=92) - Output:["_col0"] - Filter Operator [FIL_79] (rows=14398467 width=92) - predicate:wr_order_number is not null - TableScan [TS_23] (rows=14398467 width=92) - default@web_returns,wr1,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] - <-Reducer 5 [SIMPLE_EDGE] - SHUFFLE [RS_40] - PartitionCols:_col3 - Merge Join Operator [MERGEJOIN_84] (rows=231917759 width=135) - Conds:RS_37._col2=RS_38._col0(Inner),Output:["_col3","_col4","_col5"] - <-Map 13 [SIMPLE_EDGE] - SHUFFLE [RS_38] + Group By Operator [GBY_56] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] + <-Reducer 7 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_55] + Group By Operator [GBY_54] (rows=127554770 width=135) + Output:["_col0","_col1","_col2","_col3"],aggregations:["count(KEY._col0)","sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0 + <-Reducer 6 [SIMPLE_EDGE] + SHUFFLE [RS_53] + PartitionCols:_col0 + Group By Operator [GBY_52] (rows=127554770 width=135) + Output:["_col0","_col2","_col3"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col3 + Select Operator [SEL_45] (rows=127554770 width=135) + Output:["_col3","_col4","_col5"] + Filter Operator [FIL_44] (rows=127554770 width=135) + predicate:_col12 is null + Merge Join Operator [MERGEJOIN_90] (rows=255109540 width=135) + Conds:RS_40._col3=RS_41._col0(Left Outer),Output:["_col3","_col4","_col5","_col12"] + <-Map 15 [SIMPLE_EDGE] + SHUFFLE [RS_41] PartitionCols:_col0 - Select Operator [SEL_22] (rows=42 width=1850) + Select Operator [SEL_25] (rows=14398467 width=92) Output:["_col0"] - Filter Operator [FIL_78] (rows=42 width=1850) - predicate:((web_company_name = 'pri') and web_site_sk is not null) - TableScan [TS_20] (rows=84 
width=1850) - default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] - <-Reducer 4 [SIMPLE_EDGE] - SHUFFLE [RS_37] - PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_83] (rows=210834322 width=135) - Conds:RS_34._col1=RS_35._col0(Inner),Output:["_col2","_col3","_col4","_col5"] - <-Map 12 [SIMPLE_EDGE] - SHUFFLE [RS_35] + Filter Operator [FIL_84] (rows=14398467 width=92) + predicate:wr_order_number is not null + TableScan [TS_23] (rows=14398467 width=92) + default@web_returns,wr1,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] + <-Reducer 5 [SIMPLE_EDGE] + SHUFFLE [RS_40] + PartitionCols:_col3 + Merge Join Operator [MERGEJOIN_89] (rows=231917759 width=135) + Conds:RS_37._col2=RS_38._col0(Inner),Output:["_col3","_col4","_col5"] + <-Map 14 [SIMPLE_EDGE] + SHUFFLE [RS_38] PartitionCols:_col0 - Select Operator [SEL_19] (rows=20000000 width=1014) + Select Operator [SEL_22] (rows=42 width=1850) Output:["_col0"] - Filter Operator [FIL_77] (rows=20000000 width=1014) - predicate:((ca_state = 'TX') and ca_address_sk is not null) - TableScan [TS_17] (rows=40000000 width=1014) - default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] - <-Reducer 3 [SIMPLE_EDGE] - SHUFFLE [RS_34] - PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_82] (rows=191667562 width=135) - Conds:RS_31._col0=RS_32._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_32] + Filter Operator [FIL_83] (rows=42 width=1850) + predicate:((web_company_name = 'pri') and web_site_sk is not null) + TableScan [TS_20] (rows=84 width=1850) + default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] + <-Reducer 4 [SIMPLE_EDGE] + SHUFFLE [RS_37] + PartitionCols:_col2 + Merge Join Operator [MERGEJOIN_88] (rows=210834322 width=135) + Conds:RS_34._col1=RS_35._col0(Inner),Output:["_col2","_col3","_col4","_col5"] + <-Map 13 [SIMPLE_EDGE] + SHUFFLE [RS_35] PartitionCols:_col0 - Select Operator [SEL_16] (rows=8116 width=1119) + Select Operator [SEL_19] (rows=20000000 width=1014) Output:["_col0"] - Filter Operator [FIL_76] (rows=8116 width=1119) - predicate:(d_date BETWEEN '1999-05-01' AND '1999-07-01' and d_date_sk is not null) - TableScan [TS_14] (rows=73049 width=1119) - default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_31] - PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_81] (rows=174243235 width=135) - Conds:RS_28._col3=RS_29._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_28] - PartitionCols:_col3 - Select Operator [SEL_2] (rows=144002668 width=135) - Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - Filter Operator [FIL_73] (rows=144002668 width=135) - predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) - TableScan [TS_0] (rows=144002668 width=135) - default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] - <-Reducer 9 [SIMPLE_EDGE] - SHUFFLE [RS_29] + Filter Operator [FIL_82] (rows=20000000 width=1014) + predicate:((ca_state = 'TX') and ca_address_sk is not null) + TableScan [TS_17] (rows=40000000 width=1014) + default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_34] + PartitionCols:_col1 + Merge Join Operator 
[MERGEJOIN_87] (rows=191667562 width=135) + Conds:RS_31._col0=RS_32._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_32] + PartitionCols:_col0 + Select Operator [SEL_16] (rows=8116 width=1119) + Output:["_col0"] + Filter Operator [FIL_81] (rows=8116 width=1119) + predicate:(d_date BETWEEN '1999-05-01' AND '1999-07-01' and d_date_sk is not null) + TableScan [TS_14] (rows=73049 width=1119) + default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_31] PartitionCols:_col0 - Group By Operator [GBY_27] (rows=158402938 width=135) - Output:["_col0"],keys:_col0 - Select Operator [SEL_13] (rows=158402938 width=135) - Output:["_col0"] - Filter Operator [FIL_12] (rows=158402938 width=135) - predicate:(_col0 <> _col2) - Merge Join Operator [MERGEJOIN_80] (rows=158402938 width=135) - Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] - <-Map 10 [SIMPLE_EDGE] - SHUFFLE [RS_10] - PartitionCols:_col1 - Select Operator [SEL_8] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_75] (rows=144002668 width=135) - predicate:ws_order_number is not null - TableScan [TS_6] (rows=144002668 width=135) - default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] - <-Map 8 [SIMPLE_EDGE] - SHUFFLE [RS_9] - PartitionCols:_col1 - Select Operator [SEL_5] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_74] (rows=144002668 width=135) - predicate:ws_order_number is not null - TableScan [TS_3] (rows=144002668 width=135) - default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + Merge Join Operator [MERGEJOIN_86] (rows=174243235 width=135) + Conds:RS_28._col3=RS_29._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_28] + PartitionCols:_col3 + Select Operator [SEL_2] (rows=144002668 width=135) + Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + Filter Operator [FIL_78] (rows=144002668 width=135) + predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) + TableScan [TS_0] (rows=144002668 width=135) + default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] + <-Reducer 10 [SIMPLE_EDGE] + SHUFFLE [RS_29] + PartitionCols:_col0 + Group By Operator [GBY_27] (rows=158402938 width=135) + Output:["_col0"],keys:_col0 + Select Operator [SEL_13] (rows=158402938 width=135) + Output:["_col0"] + Filter Operator [FIL_12] (rows=158402938 width=135) + predicate:(_col0 <> _col2) + Merge Join Operator [MERGEJOIN_85] (rows=158402938 width=135) + Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 11 [SIMPLE_EDGE] + SHUFFLE [RS_10] + PartitionCols:_col1 + Select Operator [SEL_8] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_80] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_6] (rows=144002668 width=135) + default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + <-Map 9 [SIMPLE_EDGE] + SHUFFLE [RS_9] + PartitionCols:_col1 + Select Operator [SEL_5] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_79] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_3] (rows=144002668 width=135) + 
default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] diff --git a/ql/src/test/results/clientpositive/perf/query95.q.out b/ql/src/test/results/clientpositive/perf/query95.q.out index fa94d0842b..77e067846e 100644 --- a/ql/src/test/results/clientpositive/perf/query95.q.out +++ b/ql/src/test/results/clientpositive/perf/query95.q.out @@ -5,150 +5,156 @@ POSTHOOK: type: QUERY Plan optimized by CBO. Vertex dependency in root stage -Reducer 10 <- Map 12 (SIMPLE_EDGE), Reducer 9 (SIMPLE_EDGE) -Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 10 (SIMPLE_EDGE), Reducer 8 (SIMPLE_EDGE) -Reducer 3 <- Map 13 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) -Reducer 4 <- Map 14 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) -Reducer 5 <- Map 15 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) +Reducer 10 <- Map 12 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) +Reducer 11 <- Map 13 (SIMPLE_EDGE), Reducer 10 (SIMPLE_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 11 (SIMPLE_EDGE), Reducer 9 (SIMPLE_EDGE) +Reducer 3 <- Map 14 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Map 15 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) +Reducer 5 <- Map 16 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) Reducer 6 <- Reducer 5 (SIMPLE_EDGE) -Reducer 8 <- Map 11 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE) -Reducer 9 <- Map 11 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE) +Reducer 7 <- Reducer 6 (CUSTOM_SIMPLE_EDGE) +Reducer 9 <- Map 12 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:-1 Stage-1 - Reducer 6 + Reducer 7 File Output Operator [FS_63] - Group By Operator [GBY_61] (rows=1 width=344) - Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col0:0._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] - <-Reducer 5 [SIMPLE_EDGE] - SHUFFLE [RS_60] - Group By Operator [GBY_59] (rows=510219083 width=135) - Output:["_col0","_col1","_col2","_col3"],aggregations:["count(DISTINCT _col3)","sum(_col4)","sum(_col5)"],keys:_col3 - Merge Join Operator [MERGEJOIN_122] (rows=510219083 width=135) - Conds:RS_55._col2=RS_56._col0(Inner),Output:["_col3","_col4","_col5"] - <-Map 15 [SIMPLE_EDGE] - SHUFFLE [RS_56] - PartitionCols:_col0 - Select Operator [SEL_40] (rows=42 width=1850) - Output:["_col0"] - Filter Operator [FIL_115] (rows=42 width=1850) - predicate:((web_company_name = 'pri') and web_site_sk is not null) - TableScan [TS_38] (rows=84 width=1850) - default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] - <-Reducer 4 [SIMPLE_EDGE] - SHUFFLE [RS_55] - PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_121] (rows=463835520 width=135) - Conds:RS_52._col1=RS_53._col0(Inner),Output:["_col2","_col3","_col4","_col5"] - <-Map 14 [SIMPLE_EDGE] - SHUFFLE [RS_53] + Group By Operator [GBY_68] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] + <-Reducer 6 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_67] + Group By Operator [GBY_66] (rows=510219083 width=135) + Output:["_col0","_col1","_col2","_col3"],aggregations:["count(KEY._col0)","sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0 + <-Reducer 5 [SIMPLE_EDGE] + SHUFFLE [RS_65] + PartitionCols:_col0 + Group By Operator [GBY_64] (rows=510219083 width=135) + Output:["_col0","_col2","_col3"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col3 + Merge Join Operator [MERGEJOIN_127] (rows=510219083 width=135) + Conds:RS_55._col2=RS_56._col0(Inner),Output:["_col3","_col4","_col5"] + <-Map 16 [SIMPLE_EDGE] + SHUFFLE [RS_56] PartitionCols:_col0 - Select Operator [SEL_37] (rows=20000000 
width=1014) + Select Operator [SEL_40] (rows=42 width=1850) Output:["_col0"] - Filter Operator [FIL_114] (rows=20000000 width=1014) - predicate:((ca_state = 'GA') and ca_address_sk is not null) - TableScan [TS_35] (rows=40000000 width=1014) - default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] - <-Reducer 3 [SIMPLE_EDGE] - SHUFFLE [RS_52] - PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_120] (rows=421668646 width=135) - Conds:RS_49._col0=RS_50._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] - <-Map 13 [SIMPLE_EDGE] - SHUFFLE [RS_50] + Filter Operator [FIL_120] (rows=42 width=1850) + predicate:((web_company_name = 'pri') and web_site_sk is not null) + TableScan [TS_38] (rows=84 width=1850) + default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] + <-Reducer 4 [SIMPLE_EDGE] + SHUFFLE [RS_55] + PartitionCols:_col2 + Merge Join Operator [MERGEJOIN_126] (rows=463835520 width=135) + Conds:RS_52._col1=RS_53._col0(Inner),Output:["_col2","_col3","_col4","_col5"] + <-Map 15 [SIMPLE_EDGE] + SHUFFLE [RS_53] PartitionCols:_col0 - Select Operator [SEL_34] (rows=8116 width=1119) + Select Operator [SEL_37] (rows=20000000 width=1014) Output:["_col0"] - Filter Operator [FIL_113] (rows=8116 width=1119) - predicate:(d_date BETWEEN '2002-05-01' AND '2002-06-30' and d_date_sk is not null) - TableScan [TS_32] (rows=73049 width=1119) - default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_49] - PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_119] (rows=383335125 width=135) - Conds:RS_45._col3=RS_46._col0(Left Semi),RS_45._col3=RS_47._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_45] - PartitionCols:_col3 - Select Operator [SEL_2] (rows=144002668 width=135) - Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - Filter Operator [FIL_107] (rows=144002668 width=135) - predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) - TableScan [TS_0] (rows=144002668 width=135) - default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] - <-Reducer 10 [SIMPLE_EDGE] - SHUFFLE [RS_47] + Filter Operator [FIL_119] (rows=20000000 width=1014) + predicate:((ca_state = 'GA') and ca_address_sk is not null) + TableScan [TS_35] (rows=40000000 width=1014) + default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_52] + PartitionCols:_col1 + Merge Join Operator [MERGEJOIN_125] (rows=421668646 width=135) + Conds:RS_49._col0=RS_50._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] + <-Map 14 [SIMPLE_EDGE] + SHUFFLE [RS_50] PartitionCols:_col0 - Group By Operator [GBY_44] (rows=174243235 width=135) - Output:["_col0"],keys:_col0 - Select Operator [SEL_31] (rows=174243235 width=135) - Output:["_col0"] - Merge Join Operator [MERGEJOIN_118] (rows=174243235 width=135) - Conds:RS_28._col0=RS_29._col0(Inner),Output:["_col1"] - <-Map 12 [SIMPLE_EDGE] - SHUFFLE [RS_29] - PartitionCols:_col0 - Select Operator [SEL_27] (rows=14398467 width=92) - Output:["_col0"] - Filter Operator [FIL_112] (rows=14398467 width=92) - predicate:wr_order_number is not null - TableScan [TS_25] (rows=14398467 width=92) - default@web_returns,wr,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] - <-Reducer 9 
[SIMPLE_EDGE] - SHUFFLE [RS_28] - PartitionCols:_col0 - Select Operator [SEL_24] (rows=158402938 width=135) - Output:["_col0"] - Filter Operator [FIL_23] (rows=158402938 width=135) - predicate:(_col0 <> _col2) - Merge Join Operator [MERGEJOIN_117] (rows=158402938 width=135) - Conds:RS_20._col1=RS_21._col1(Inner),Output:["_col0","_col1","_col2"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_21] - PartitionCols:_col1 - Select Operator [SEL_19] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_111] (rows=144002668 width=135) - predicate:ws_order_number is not null - TableScan [TS_6] (rows=144002668 width=135) - default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] - <-Map 7 [SIMPLE_EDGE] - SHUFFLE [RS_20] - PartitionCols:_col1 - Select Operator [SEL_16] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_110] (rows=144002668 width=135) - predicate:ws_order_number is not null - TableScan [TS_3] (rows=144002668 width=135) - default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] - <-Reducer 8 [SIMPLE_EDGE] - SHUFFLE [RS_46] + Select Operator [SEL_34] (rows=8116 width=1119) + Output:["_col0"] + Filter Operator [FIL_118] (rows=8116 width=1119) + predicate:(d_date BETWEEN '2002-05-01' AND '2002-06-30' and d_date_sk is not null) + TableScan [TS_32] (rows=73049 width=1119) + default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_49] PartitionCols:_col0 - Group By Operator [GBY_42] (rows=158402938 width=135) - Output:["_col0"],keys:_col0 - Select Operator [SEL_13] (rows=158402938 width=135) - Output:["_col0"] - Filter Operator [FIL_12] (rows=158402938 width=135) - predicate:(_col0 <> _col2) - Merge Join Operator [MERGEJOIN_116] (rows=158402938 width=135) - Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_10] - PartitionCols:_col1 - Select Operator [SEL_8] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_109] (rows=144002668 width=135) - predicate:ws_order_number is not null - Please refer to the previous TableScan [TS_6] - <-Map 7 [SIMPLE_EDGE] - SHUFFLE [RS_9] - PartitionCols:_col1 - Select Operator [SEL_5] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_108] (rows=144002668 width=135) - predicate:ws_order_number is not null - Please refer to the previous TableScan [TS_3] + Merge Join Operator [MERGEJOIN_124] (rows=383335125 width=135) + Conds:RS_45._col3=RS_46._col0(Left Semi),RS_45._col3=RS_47._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_45] + PartitionCols:_col3 + Select Operator [SEL_2] (rows=144002668 width=135) + Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + Filter Operator [FIL_112] (rows=144002668 width=135) + predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) + TableScan [TS_0] (rows=144002668 width=135) + default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] + <-Reducer 11 [SIMPLE_EDGE] + SHUFFLE [RS_47] + PartitionCols:_col0 + Group By Operator [GBY_44] (rows=174243235 width=135) + Output:["_col0"],keys:_col0 + Select Operator [SEL_31] (rows=174243235 width=135) + Output:["_col0"] + Merge Join Operator [MERGEJOIN_123] (rows=174243235 
width=135) + Conds:RS_28._col0=RS_29._col0(Inner),Output:["_col1"] + <-Map 13 [SIMPLE_EDGE] + SHUFFLE [RS_29] + PartitionCols:_col0 + Select Operator [SEL_27] (rows=14398467 width=92) + Output:["_col0"] + Filter Operator [FIL_117] (rows=14398467 width=92) + predicate:wr_order_number is not null + TableScan [TS_25] (rows=14398467 width=92) + default@web_returns,wr,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] + <-Reducer 10 [SIMPLE_EDGE] + SHUFFLE [RS_28] + PartitionCols:_col0 + Select Operator [SEL_24] (rows=158402938 width=135) + Output:["_col0"] + Filter Operator [FIL_23] (rows=158402938 width=135) + predicate:(_col0 <> _col2) + Merge Join Operator [MERGEJOIN_122] (rows=158402938 width=135) + Conds:RS_20._col1=RS_21._col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_21] + PartitionCols:_col1 + Select Operator [SEL_19] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_116] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_6] (rows=144002668 width=135) + default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + <-Map 8 [SIMPLE_EDGE] + SHUFFLE [RS_20] + PartitionCols:_col1 + Select Operator [SEL_16] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_115] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_3] (rows=144002668 width=135) + default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + <-Reducer 9 [SIMPLE_EDGE] + SHUFFLE [RS_46] + PartitionCols:_col0 + Group By Operator [GBY_42] (rows=158402938 width=135) + Output:["_col0"],keys:_col0 + Select Operator [SEL_13] (rows=158402938 width=135) + Output:["_col0"] + Filter Operator [FIL_12] (rows=158402938 width=135) + predicate:(_col0 <> _col2) + Merge Join Operator [MERGEJOIN_121] (rows=158402938 width=135) + Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_10] + PartitionCols:_col1 + Select Operator [SEL_8] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_114] (rows=144002668 width=135) + predicate:ws_order_number is not null + Please refer to the previous TableScan [TS_6] + <-Map 8 [SIMPLE_EDGE] + SHUFFLE [RS_9] + PartitionCols:_col1 + Select Operator [SEL_5] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_113] (rows=144002668 width=135) + predicate:ws_order_number is not null + Please refer to the previous TableScan [TS_3] diff --git a/ql/src/test/results/clientpositive/spark/nullgroup4.q.out b/ql/src/test/results/clientpositive/spark/nullgroup4.q.out index 24f0291dec..8367d07f3f 100644 --- a/ql/src/test/results/clientpositive/spark/nullgroup4.q.out +++ b/ql/src/test/results/clientpositive/spark/nullgroup4.q.out @@ -12,8 +12,9 @@ STAGE PLANS: Stage: Stage-1 Spark Edges: - Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 2) + Reducer 2 <- Map 1 (GROUP, 2) Reducer 3 <- Reducer 2 (GROUP, 1) + Reducer 4 <- Reducer 3 (GROUP, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -29,10 +30,10 @@ STAGE PLANS: outputColumnNames: _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(1), count(DISTINCT _col1) + aggregations: count(1) keys: _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Reduce Output 
Operator key expressions: _col0 (type: string) @@ -43,7 +44,19 @@ STAGE PLANS: Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(DISTINCT KEY._col0:0._col0) + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reducer 3 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) mode: partials outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE @@ -51,7 +64,7 @@ STAGE PLANS: sort order: Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint), _col1 (type: bigint) - Reducer 3 + Reducer 4 Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0), count(VALUE._col1) @@ -95,7 +108,8 @@ STAGE PLANS: Stage: Stage-1 Spark Edges: - Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1) + Reducer 2 <- Map 1 (GROUP, 2) + Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -111,20 +125,33 @@ STAGE PLANS: outputColumnNames: _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(1), count(DISTINCT _col1) + aggregations: count(1) keys: _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) sort order: + + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: bigint) Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(DISTINCT KEY._col0:0._col0) + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reducer 3 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/udf_count.q.out b/ql/src/test/results/clientpositive/udf_count.q.out index f60ad0485e..967c79c2ff 100644 --- a/ql/src/test/results/clientpositive/udf_count.q.out +++ b/ql/src/test/results/clientpositive/udf_count.q.out @@ -43,7 +43,8 @@ POSTHOOK: query: EXPLAIN SELECT count(DISTINCT key) FROM src POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -57,18 +58,40 @@ STAGE PLANS: outputColumnNames: key Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Group By Operator 
- aggregations: count(DISTINCT key) keys: key (type: string) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) sort order: + + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + aggregations: count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE diff --git a/ql/src/test/results/clientpositive/vector_empty_where.q.out b/ql/src/test/results/clientpositive/vector_empty_where.q.out index b2dec6d7f6..5578666778 100644 --- a/ql/src/test/results/clientpositive/vector_empty_where.q.out +++ b/ql/src/test/results/clientpositive/vector_empty_where.q.out @@ -10,7 +10,8 @@ PLAN VECTORIZATION: STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -38,26 +39,25 @@ STAGE PLANS: projectedOutputColumns: [2] Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT cint) Group By Vectorization: - aggregators: VectorUDAFCount(col 2) -> bigint className: VectorGroupByOperator vectorOutput: true keyExpressions: col 2 native: false - projectedOutputColumns: [0] + projectedOutputColumns: [] keys: cint (type: int) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: className: VectorReduceSinkOperator native: false - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false, No DISTINCT columns IS false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Map Vectorization: @@ -74,7 +74,54 @@ STAGE PLANS: enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Reduce Operator 
Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + aggregations: count(KEY._col0) + Group By Vectorization: + vectorOutput: false + native: false + projectedOutputColumns: null + keys: KEY._col0 (type: int) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + TableScan Vectorization: + native: true + projectedOutputColumns: [0, 1] + Reduce Output Operator + sort order: + Reduce Sink Vectorization: + className: VectorReduceSinkOperator + native: false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) Group By Vectorization: vectorOutput: false native: false @@ -117,7 +164,8 @@ PLAN VECTORIZATION: STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -137,26 +185,25 @@ STAGE PLANS: predicate: cint (type: int) Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT cint) Group By Vectorization: - aggregators: VectorUDAFCount(col 2) -> bigint className: VectorGroupByOperator vectorOutput: true keyExpressions: col 2 native: false - projectedOutputColumns: [0] + projectedOutputColumns: [] keys: cint (type: int) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: className: VectorReduceSinkOperator native: false - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false, No DISTINCT columns IS false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Statistics: Num rows: 6144 
Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Map Vectorization: @@ -173,7 +220,54 @@ STAGE PLANS: enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + aggregations: count(KEY._col0) + Group By Vectorization: + vectorOutput: false + native: false + projectedOutputColumns: null + keys: KEY._col0 (type: int) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + TableScan Vectorization: + native: true + projectedOutputColumns: [0, 1] + Reduce Output Operator + sort order: + Reduce Sink Vectorization: + className: VectorReduceSinkOperator + native: false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) Group By Vectorization: vectorOutput: false native: false @@ -216,7 +310,8 @@ PLAN VECTORIZATION: STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -244,26 +339,25 @@ STAGE PLANS: projectedOutputColumns: [2] Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT cint) Group By Vectorization: - aggregators: VectorUDAFCount(col 2) -> bigint className: VectorGroupByOperator vectorOutput: true keyExpressions: col 2 native: false - projectedOutputColumns: [0] + projectedOutputColumns: [] keys: cint (type: int) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: className: VectorReduceSinkOperator native: false - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false, No DISTINCT columns IS false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS 
true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Map Vectorization: @@ -280,7 +374,54 @@ STAGE PLANS: enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + aggregations: count(KEY._col0) + Group By Vectorization: + vectorOutput: false + native: false + projectedOutputColumns: null + keys: KEY._col0 (type: int) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + TableScan Vectorization: + native: true + projectedOutputColumns: [0, 1] + Reduce Output Operator + sort order: + Reduce Sink Vectorization: + className: VectorReduceSinkOperator + native: false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) Group By Vectorization: vectorOutput: false native: false @@ -323,7 +464,8 @@ PLAN VECTORIZATION: STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -351,26 +493,25 @@ STAGE PLANS: projectedOutputColumns: [2] Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT cint) Group By Vectorization: - aggregators: VectorUDAFCount(col 2) -> bigint className: VectorGroupByOperator vectorOutput: true keyExpressions: col 2 native: false - projectedOutputColumns: [0] + projectedOutputColumns: [] keys: cint (type: int) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + + Map-reduce partition columns: _col0 (type: int) Reduce Sink Vectorization: className: VectorReduceSinkOperator native: false - nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, BinarySortableSerDe for 
keys IS true, LazyBinarySerDe for values IS true - nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false, No DISTINCT columns IS false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Map Vectorization: @@ -387,7 +528,54 @@ STAGE PLANS: enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + aggregations: count(KEY._col0) + Group By Vectorization: + vectorOutput: false + native: false + projectedOutputColumns: null + keys: KEY._col0 (type: int) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + TableScan Vectorization: + native: true + projectedOutputColumns: [0, 1] + Reduce Output Operator + sort order: + Reduce Sink Vectorization: + className: VectorReduceSinkOperator + native: false + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + nativeConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.mapred.SequenceFileInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + Reduce Vectorization: + enabled: false + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true + enableConditionsNotMet: hive.execution.engine mr IN [tez, spark] IS false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) Group By Vectorization: vectorOutput: false native: false