commit 3c4053ed1b7a1c550791b53adf1e60d2b48936f4
Author: Pengcheng Xiong
Date:   Fri May 26 19:15:00 2017 -0700

    pa

diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 2dfc8b6f89..af73472a4d 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1567,6 +1567,8 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
         "Whether to transform OR clauses in Filter operators into IN clauses"),
     HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 31,
         "Minimum number of OR clauses needed to transform into IN clauses"),
+    HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true,
+        "Whether to transform count distinct into two stages"),
     HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true,
         "Extract partition columns from IN clauses"),
     // Constant propagation optimizer
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 47a13c93b9..590e056992 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -130,6 +130,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\
   constprog_semijoin.q,\
   correlationoptimizer1.q,\
   count.q,\
+  count_dist_rewrite.q,\
   create_merge_compressed.q,\
   cross_join.q,\
   cross_product_check_1.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
index 8b04cd44fa..f8085fea0b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
@@ -1167,8 +1167,16 @@ public boolean supportSkewJoinOptimization() {
     @SuppressWarnings("unchecked")
     T descClone = (T)conf.clone();
     // also clone the colExprMap by default
+    // we need a deep copy
+    ArrayList<ColumnInfo> colInfos = new ArrayList<>();
+    colInfos.addAll(getSchema().getSignature());
+    Map<String, ExprNodeDesc> map = null;
+    if (getColumnExprMap() != null) {
+      map = new HashMap<>();
+      map.putAll(getColumnExprMap());
+    }
     Operator<? extends OperatorDesc> ret = OperatorFactory.getAndMakeChild(
-        cContext, descClone, getSchema(), getColumnExprMap(), parentClones);
+        cContext, descClone, new RowSchema(colInfos), map, parentClones);
     return ret;
   }

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
new file mode 100644
index 0000000000..6450cb3821
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
@@ -0,0 +1,504 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+
+/**
+ * Queries of the form "select max(c), count(distinct c) from T" generate a plan
+ * of the form TS->mGBy->RS->rGBy->FS. This plan suffers from the problem that the
+ * vertex containing rGBy->FS necessarily has a single task. This limitation
+ * results in slow execution because that one task receives all the data. When this
+ * optimization applies, it rewrites the above plan to
+ * mGby1-rs1-mGby2-mGby3-rs2-rGby1, which introduces the extra vertex
+ * mGby2-mGby3-rs2. Note that this vertex can have multiple tasks, and since it
+ * performs aggregation its output is necessarily smaller than its input; much
+ * less data therefore flows into the original rGby->FS vertex, which still has a
+ * single task. Also note that on the Calcite tree we have the
+ * HiveExpandDistinctAggregatesRule rule, which does a similar plan transformation
+ * but requires different conditions to be satisfied. Additionally, we do no
+ * costing here, so this transformation may slow a query down slightly: if the
+ * data is small enough to fit in a single task of the last reducer, injecting an
+ * additional vertex into the pipeline can make the query slower. If this happens,
+ * users can turn the rewrite off via the configuration hive.optimize.countdistinct.
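+ *
+ * Illustrative sketch (an assumed walk-through, not taken verbatim from a plan):
+ * for "select max(key), count(distinct key) from src", mGby1 groups by key on the
+ * map side and computes a partial max(key); rs1 shuffles on key with an
+ * unrestricted number of reducers; mGby2 merges the per-key partials; mGby3 drops
+ * the key and computes count(key) (one row per distinct key survives mGby2) next
+ * to the other partials; rs2 sends everything to a single reducer; and rGby1
+ * produces the final max/count values. See count_dist_rewrite.q.out below for the
+ * resulting plans.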
+ */
+public class CountDistinctRewriteProc extends Transform {
+
+  private static final Logger LOG = LoggerFactory.getLogger(CountDistinctRewriteProc.class
+      .getName());
+
+  public CountDistinctRewriteProc() {
+  }
+
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    // process group-by pattern
+    opRules.put(
+        new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%"
+            + ReduceSinkOperator.getOperatorName() + "%" + GroupByOperator.getOperatorName()
+            + "%"), getCountDistinctProc(pctx));
+
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
+    GraphWalker ogw = new DefaultGraphWalker(disp);
+
+    // Create a list of topop nodes
+    List<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(pctx.getTopOps().values());
+    ogw.startWalking(topNodes, null);
+
+    return pctx;
+  }
+
+  private NodeProcessor getDefaultProc() {
+    return new NodeProcessor() {
+      @Override
+      public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+          Object... nodeOutputs) throws SemanticException {
+        return null;
+      }
+    };
+  }
+
+  private NodeProcessor getCountDistinctProc(ParseContext pctx) {
+    return new CountDistinctProcessor(pctx);
+  }
+
+  /**
+   * CountDistinctProcessor.
+   *
+   */
+  public class CountDistinctProcessor implements NodeProcessor {
+
+    protected ParseContext pGraphContext;
+
+    public CountDistinctProcessor(ParseContext pGraphContext) {
+      this.pGraphContext = pGraphContext;
+    }
+
+    // Position of the distinct column in the aggregator list of the map-side GBy before the rewrite.
+    int indexOfDist = -1;
+
+    // Check if we can process it or not
+    protected boolean checkCountDistinct(GroupByOperator mGby, ReduceSinkOperator rs,
+        GroupByOperator rGby) {
+      ArrayList<ExprNodeDesc> keys = mGby.getConf().getKeys();
+      if (!(mGby.getConf().getMode() == GroupByDesc.Mode.HASH
+          && !mGby.getConf().isGroupingSetsPresent() && rs.getConf().getKeyCols().size() == 1
+          && rs.getConf().getPartitionCols().size() == 0
+          && rs.getConf().getDistinctColumnIndices().size() == 1
+          && rGby.getConf().getMode() == GroupByDesc.Mode.MERGEPARTIAL && keys.size() == 1
+          && rGby.getConf().getKeys().size() == 0 && mGby.getConf().getOutputColumnNames().size() == mGby
+          .getConf().getAggregators().size() + 1)) {
+        return false;
+      }
+      for (int pos = 0; pos < mGby.getConf().getAggregators().size(); pos++) {
+        AggregationDesc aggr = mGby.getConf().getAggregators().get(pos);
+        if (aggr.getDistinct()) {
+          if (indexOfDist != -1 || !aggr.getGenericUDAFName().equalsIgnoreCase("count")) {
+            // there are 2 or more distincts, or distinct is not on count
+            // TODO: may be the same count(distinct key), count(distinct key)
+            // TODO: deal with duplicate count distinct key
+            return false;
+          }
+          indexOfDist = pos;
+          if (!(aggr.getParameters().size() == 1
+              && aggr.getParameters().get(0) instanceof ExprNodeColumnDesc && mGby.getConf()
+              .getKeys().get(0) instanceof ExprNodeColumnDesc)) {
+            return false;
+          } else {
+            ExprNodeColumnDesc agg = (ExprNodeColumnDesc) aggr.getParameters().get(0);
+            ExprNodeColumnDesc key = (ExprNodeColumnDesc) mGby.getConf().getKeys().get(0);
+            if (!agg.isSame(key)) {
+              return false;
+            }
+          }
+        }
+      }
+      if (indexOfDist == -1) {
+        return false;
+      }
+      // check if it is potential to trigger nullscan
+      if (pGraphContext.getConf().getBoolVar(HiveConf.ConfVars.HIVEMETADATAONLYQUERIES)) {
+        for (TableScanOperator tsOp : pGraphContext.getTopOps().values()) {
+          List<Integer> colIDs = tsOp.getNeededColumnIDs();
+          TableScanDesc desc = tsOp.getConf();
+          boolean noColNeeded = (colIDs == null) || (colIDs.isEmpty());
+          // VC is still here and it will be pruned by column pruner
+          // boolean noVCneeded = (desc == null) || (desc.getVirtualCols() == null)
+          //     || (desc.getVirtualCols().isEmpty());
+          boolean isSkipHF = desc.isNeedSkipHeaderFooters();
+          if (noColNeeded && !isSkipHF) {
+            // it is possible that nullscan can fire, we skip this rule.
+            return false;
+          }
+        }
+      }
+      return true;
+    }
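+
+    // Illustrative examples of the guard above (assumed, for exposition only):
+    // "select max(key), count(distinct key) from src" qualifies, while
+    // "select max(key), count(distinct key) from src group by value" does not
+    // (the map-side GBy then carries two keys, value and key), and a plan with two
+    // distinct aggregates is rejected as well. The last test in
+    // count_dist_rewrite.q shows the group-by case keeping its original plan.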
+
+    /*
+     * We will transform GBy-RS-GBy to mGby1-rs1-mGby2-mGby3-rs2-rGby1
+     */
+    @SuppressWarnings("unchecked")
+    protected void processGroupBy(GroupByOperator mGby, ReduceSinkOperator rs, GroupByOperator rGby)
+        throws SemanticException, CloneNotSupportedException {
+      // remove count(distinct) in map-side gby
+      List<Operator<? extends OperatorDesc>> parents = mGby.getParentOperators();
+      List<Operator<? extends OperatorDesc>> children = rGby.getChildOperators();
+      mGby.removeParents();
+      rs.removeParents();
+      rGby.removeParents();
+
+      GroupByOperator mGby1 = genMapGroupby1(mGby, indexOfDist);
+      ReduceSinkOperator rs1 = genReducesink1(mGby1, rs, indexOfDist);
+      GroupByOperator mGby2 = genMapGroupby2(rs1, mGby);
+      GroupByOperator mGby3 = genMapGroupby3(mGby2, mGby);
+      ReduceSinkOperator rs2 = genReducesink2(mGby3, rs);
+      GroupByOperator rGby1 = genReduceGroupby(rs2, rGby, indexOfDist);
+      for (Operator<? extends OperatorDesc> parent : parents) {
+        OperatorFactory.makeChild(parent, mGby1);
+      }
+      OperatorFactory.makeChild(mGby1, rs1);
+      OperatorFactory.makeChild(rs1, mGby2);
+      OperatorFactory.makeChild(mGby2, mGby3);
+      OperatorFactory.makeChild(mGby3, rs2);
+      OperatorFactory.makeChild(rs2, rGby1);
+      for (Operator<? extends OperatorDesc> child : children) {
+        child.removeParents();
+        OperatorFactory.makeChild(rGby1, child);
+      }
+    }
+
+    // mGby1 --- already contains the group-by key; we need to remove the distinct column
+    private GroupByOperator genMapGroupby1(Operator<? extends OperatorDesc> mGby, int indexOfDist)
+        throws CloneNotSupportedException {
+      GroupByOperator mGby1 = (GroupByOperator) mGby.clone();
+      // the distinct aggregate's output column sits at indexOfDist + 1 (right after the key)
+      String fieldString = mGby1.getConf().getOutputColumnNames().get(indexOfDist + 1);
+      mGby1.getColumnExprMap().remove(fieldString);
+      mGby1.getConf().getOutputColumnNames().remove(indexOfDist + 1);
+      mGby1.getConf().getAggregators().remove(indexOfDist);
+      mGby1.getConf().setDistinct(false);
+      mGby1.getSchema().getColumnNames().remove(indexOfDist + 1);
+      mGby1.getSchema().getSignature().remove(indexOfDist + 1);
+      return mGby1;
+    }
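+
+    // For example (assumed layout, matching count_dist_rewrite.q.out): for
+    // "select max(key), count(distinct key), min(key) from src" the original
+    // map-side GBy emits _col0 = key, _col1 = max, _col2 = distinct, _col3 = min;
+    // with indexOfDist == 1, mGby1 drops _col2 and keeps _col0, _col1, _col3.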
+
+    // rs1 --- remove distinctColIndices, set #reducers to -1, reset keys,
+    // values, colExprMap and rowSchema
+    private ReduceSinkOperator genReducesink1(GroupByOperator mGby1,
+        Operator<? extends OperatorDesc> rs, int indexOfDist) throws CloneNotSupportedException,
+        SemanticException {
+      ReduceSinkOperator rs1 = (ReduceSinkOperator) rs.clone();
+      Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
+      ArrayList<String> outputKeyColumnNames = new ArrayList<String>();
+      ArrayList<String> outputValueColumnNames = new ArrayList<String>();
+      ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
+      ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
+      List<String> internalNames = new ArrayList<>();
+      for (int index = 0; index < mGby1.getSchema().getSignature().size(); index++) {
+        ColumnInfo paraExprInfo = mGby1.getSchema().getSignature().get(index);
+        String paraExpression = paraExprInfo.getInternalName();
+        assert (paraExpression != null);
+        ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(),
+            paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
+        // index == 0 means this is the key
+        if (index == 0) {
+          reduceKeys.add(exprDesc);
+          String outputColName = SemanticAnalyzer.getColumnInternalName(index);
+          outputKeyColumnNames.add(outputColName);
+          String internalName = Utilities.ReduceField.KEY.toString() + "." + outputColName;
+          colExprMap.put(internalName, exprDesc);
+          internalNames.add(internalName);
+        } else {
+          reduceValues.add(exprDesc);
+          String outputColName = SemanticAnalyzer.getColumnInternalName(index - 1);
+          outputValueColumnNames.add(outputColName);
+          String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName;
+          colExprMap.put(internalName, exprDesc);
+          internalNames.add(internalName);
+        }
+      }
+      List<List<Integer>> distinctColIndices = new ArrayList<>();
+      rs1.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 1, reduceValues, distinctColIndices,
+          outputKeyColumnNames, outputValueColumnNames, true, -1, 1, -1,
+          AcidUtils.Operation.NOT_ACID));
+      rs1.setColumnExprMap(colExprMap);
+
+      rs1.getSchema().getColumnNames().remove(indexOfDist + 1);
+      rs1.getSchema().getSignature().remove(indexOfDist + 1);
+      // KEY._col0:0._col0 => KEY._col0
+      for (int i = 0; i < rs1.getSchema().getSignature().size(); i++) {
+        rs1.getSchema().getSignature().get(i).setInternalName(internalNames.get(i));
+        rs1.getSchema().getColumnNames().set(i, internalNames.get(i));
+      }
+      return rs1;
+    }
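+
+    // The renaming loop above rewrites the cloned schema's internal names from the
+    // distinct form to the plain shuffle form, e.g. (assumed names)
+    // KEY._col0:0._col0 becomes KEY._col0 and the remaining aggregates become
+    // VALUE._col0, VALUE._col1, ... Since numReducers is passed as -1, this shuffle
+    // is no longer forced onto a single reducer.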
+
+    // mGby2 --- already contains the key; remove distinct and adjust all the others
+    private GroupByOperator genMapGroupby2(ReduceSinkOperator rs1,
+        Operator<? extends OperatorDesc> mGby) throws CloneNotSupportedException, SemanticException {
+      GroupByOperator mGby2 = (GroupByOperator) mGby.clone();
+      ArrayList<ColumnInfo> rowSchema = new ArrayList<>();
+      ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
+      ArrayList<String> outputColumnNames = new ArrayList<String>();
+      Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
+
+      ColumnInfo exprInfo = rs1.getSchema().getSignature().get(0);
+      ExprNodeDesc key = new ExprNodeColumnDesc(exprInfo);
+      groupByKeys.add(key);
+      String field = SemanticAnalyzer.getColumnInternalName(0);
+      outputColumnNames.add(field);
+      ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
+      colExprMap.put(field, key);
+      rowSchema.add(oColInfo);
+
+      ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
+      for (int index = 0; index < mGby2.getConf().getAggregators().size(); index++) {
+        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+        if (index != indexOfDist) {
+          AggregationDesc desc = mGby2.getConf().getAggregators().get(index);
+          ColumnInfo paraExprInfo = null;
+          // for example, originally it is max 0, dist 1, min 2;
+          // rs1's schema is key 0, max 1, min 2
+          if (index < indexOfDist) {
+            paraExprInfo = rs1.getSchema().getSignature().get(index + 1);
+          } else {
+            paraExprInfo = rs1.getSchema().getSignature().get(index);
+          }
+
+          String paraExpression = paraExprInfo.getInternalName();
+          assert (paraExpression != null);
+          aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression,
+              paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
+
+          // for all the other aggregations, we set the mode to PARTIAL2
+          Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false);
+          GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator();
+          GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode,
+              aggParameters);
+          aggregations.add(new AggregationDesc(desc.getGenericUDAFName(),
+              udaf.genericUDAFEvaluator, udaf.convertedParameters, false, amode));
+          String f = SemanticAnalyzer.getColumnInternalName(aggregations.size());
+          outputColumnNames.add(f);
+          rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false));
+        }
+      }
+      mGby2.getConf().setMode(GroupByDesc.Mode.PARTIAL2);
+      mGby2.getConf().setOutputColumnNames(outputColumnNames);
+      mGby2.getConf().getKeys().clear();
+      mGby2.getConf().getKeys().addAll(groupByKeys);
+      mGby2.getConf().getAggregators().clear();
+      mGby2.getConf().getAggregators().addAll(aggregations);
+      mGby2.getConf().setDistinct(false);
+      mGby2.setSchema(new RowSchema(rowSchema));
+      mGby2.setColumnExprMap(colExprMap);
+      return mGby2;
+    }
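+
+    // In PARTIAL2 mode mGby2 merges the partial aggregates arriving per distinct
+    // key and emits partials again, e.g. (assumed layout) one row per key of the
+    // shape (key, max-partial, min-partial).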
+
+    // mGby3 is a follow-up of mGby2; here we start to count(key)
+    private GroupByOperator genMapGroupby3(GroupByOperator mGby2,
+        Operator<? extends OperatorDesc> mGby) throws CloneNotSupportedException, SemanticException {
+      GroupByOperator mGby3 = (GroupByOperator) mGby.clone();
+      ArrayList<ColumnInfo> rowSchema = new ArrayList<>();
+      ArrayList<String> outputColumnNames = new ArrayList<String>();
+      Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
+
+      // exprInfo is the key
+      ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
+      for (int index = 0; index <= mGby2.getConf().getAggregators().size(); index++) {
+        if (index == indexOfDist) {
+          ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+          // add count(KEY._col0) to replace distinct
+          ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(0);
+          String paraExpression = paraExprInfo.getInternalName();
+          assert (paraExpression != null);
+          aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression,
+              paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
+          Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH, false);
+          GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator(
+              "count", aggParameters, null, false, false);
+          assert (genericUDAFEvaluator != null);
+          GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode,
+              aggParameters);
+          AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator,
+              udaf.convertedParameters, false, amode);
+          String f = SemanticAnalyzer.getColumnInternalName(aggregations.size());
+          aggregations.add(newDesc);
+          outputColumnNames.add(f);
+          rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false));
+        }
+        if (index == mGby2.getConf().getAggregators().size()) {
+          break;
+        }
+        ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+        AggregationDesc desc = mGby2.getConf().getAggregators().get(index);
+        ColumnInfo paraExprInfo = null;
+        // for example, originally it is max 0, dist 1, min 2;
+        // mGby2's schema is key 0, max 1, min 2
+        paraExprInfo = mGby2.getSchema().getSignature().get(index + 1);
+        String paraExpression = paraExprInfo.getInternalName();
+        assert (paraExpression != null);
+        aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression,
+            paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
+
+        // for all the other aggregations, we set the mode to PARTIAL2
+        Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false);
+        GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator();
+        GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode,
+            aggParameters);
+        String f = SemanticAnalyzer.getColumnInternalName(aggregations.size());
+        aggregations.add(new AggregationDesc(desc.getGenericUDAFName(), udaf.genericUDAFEvaluator,
+            udaf.convertedParameters, false, amode));
+        outputColumnNames.add(f);
+        rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false));
+      }
+      mGby3.getConf().setMode(GroupByDesc.Mode.PARTIAL2);
+      mGby3.getConf().setOutputColumnNames(outputColumnNames);
+      mGby3.getConf().getKeys().clear();
+      mGby3.getConf().getAggregators().clear();
+      mGby3.getConf().getAggregators().addAll(aggregations);
+      mGby3.getConf().setDistinct(false);
+      mGby3.setSchema(new RowSchema(rowSchema));
+      mGby3.setColumnExprMap(colExprMap);
+      return mGby3;
+    }
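+
+    // For the running max/distinct/min example (indexOfDist == 1), this yields the
+    // keyless aggregation list max(_col1), count(_col0), min(_col2), matching the
+    // "aggregations: max(_col1), count(_col0), min(_col2)" step in
+    // count_dist_rewrite.q.out.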
+
+    // #reducers is already 1
+    private ReduceSinkOperator genReducesink2(GroupByOperator mGby2,
+        Operator<? extends OperatorDesc> rs) throws SemanticException, CloneNotSupportedException {
+      ReduceSinkOperator rs2 = (ReduceSinkOperator) rs.clone();
+      Map<String, ExprNodeDesc> colExprMap = new HashMap<>();
+
+      ArrayList<String> outputKeyColumnNames = new ArrayList<String>();
+      ArrayList<String> outputValueColumnNames = new ArrayList<String>();
+      ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
+      for (int index = 0; index < mGby2.getSchema().getSignature().size(); index++) {
+        ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(index);
+        String paraExpression = paraExprInfo.getInternalName();
+        assert (paraExpression != null);
+        ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(),
+            paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
+        reduceValues.add(exprDesc);
+        String outputColName = SemanticAnalyzer.getColumnInternalName(index);
+        outputValueColumnNames.add(outputColName);
+        String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName;
+        colExprMap.put(internalName, exprDesc);
+      }
+      List<List<Integer>> distinctColIndices = new ArrayList<>();
+      ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<>();
+      rs2.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 0, reduceValues, distinctColIndices,
+          outputKeyColumnNames, outputValueColumnNames, false, -1, 0, 1,
+          AcidUtils.Operation.NOT_ACID));
+      rs2.setColumnExprMap(colExprMap);
+      rs2.getSchema().getSignature().remove(0);
+      return rs2;
+    }
+
+    // replace the distinct with the count aggregation
+    private GroupByOperator genReduceGroupby(ReduceSinkOperator rs2,
+        Operator<? extends OperatorDesc> rGby, int indexOfDist) throws SemanticException,
+        CloneNotSupportedException {
+      GroupByOperator rGby1 = (GroupByOperator) rGby.clone();
+      ColumnInfo paraExprInfo = rs2.getSchema().getSignature().get(indexOfDist);
+      String paraExpression = paraExprInfo.getInternalName();
+      assert (paraExpression != null);
+      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+      aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo
+          .getTabAlias(), paraExprInfo.getIsVirtualCol()));
+      GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator("count",
+          aggParameters, null, false, false);
+      assert (genericUDAFEvaluator != null);
+      Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.MERGEPARTIAL, false);
+      GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode,
+          aggParameters);
+      AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator,
+          udaf.convertedParameters, false, amode);
+      rGby1.getConf().getAggregators().set(indexOfDist, newDesc);
+      rGby1.getConf().setDistinct(false);
+      return rGby1;
+    }
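+
+    // rs2 carries no keys, no partition columns and exactly one reducer, so rGby1
+    // runs as the single final task; the former count(DISTINCT ...) slot now holds
+    // a plain count in MERGEPARTIAL mode, e.g. count(VALUE._col1) in the rewritten
+    // plans of count_dist_rewrite.q.out.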
+
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+      GroupByOperator mGby = (GroupByOperator) stack.get(stack.size() - 3);
+      ReduceSinkOperator rs = (ReduceSinkOperator) stack.get(stack.size() - 2);
+      GroupByOperator rGby = (GroupByOperator) stack.get(stack.size() - 1);
+      if (checkCountDistinct(mGby, rs, rGby)) {
+        LOG.info("trigger count distinct rewrite");
+        try {
+          processGroupBy(mGby, rs, rGby);
+        } catch (CloneNotSupportedException e) {
+          throw new SemanticException(e.getMessage());
+        }
+      }
+      return null;
+    }
+
+  }
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
index 7dace9076f..781e088b88 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
@@ -143,6 +143,10 @@ public void initialize(HiveConf hiveConf) {
         HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT)) {
       transformations.add(new GroupByOptimizer());
     }
+    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVECOUNTDISTINCTOPTIMIZER)
+        && (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_IN_TEST) || isTezExecEngine)) {
+      transformations.add(new CountDistinctRewriteProc());
+    }
     transformations.add(new ColumnPruner());
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_OPTIMIZE_SKEWJOIN_COMPILETIME)) {
       if (!isTezExecEngine) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java
index 38a9ef2af1..fe91ee7025 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java
@@ -367,4 +367,19 @@ public GroupByOperatorExplainVectorization getGroupByVectorization() {
     }
     return new GroupByOperatorExplainVectorization(this, vectorDesc);
   }
+
+  @Override
+  public Object clone() {
+    ArrayList<String> outputColumnNames = new ArrayList<>();
+    outputColumnNames.addAll(this.outputColumnNames);
+    ArrayList<ExprNodeDesc> keys = new ArrayList<>();
+    keys.addAll(this.keys);
+    ArrayList<AggregationDesc> aggregators = new ArrayList<>();
+    aggregators.addAll(this.aggregators);
+    List<Integer> listGroupingSets = new ArrayList<>();
+    listGroupingSets.addAll(this.listGroupingSets);
+    return new GroupByDesc(this.mode, outputColumnNames, keys, aggregators,
+        this.groupByMemoryUsage, this.memoryThreshold, listGroupingSets, this.groupingSetsPresent,
+        this.groupingSetPosition, this.isDistinct);
+  }
 }
diff --git a/ql/src/test/queries/clientpositive/count_dist_rewrite.q b/ql/src/test/queries/clientpositive/count_dist_rewrite.q
new file mode 100644
index 0000000000..0b1bc66521
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/count_dist_rewrite.q
@@ -0,0 +1,65 @@
+explain select count(distinct key) from src;
+
+select count(distinct key) from src;
+
+explain select max(key), count(distinct key) B1_CNTD from src;
+
+select max(key), count(distinct key) B1_CNTD from src;
+
+explain select max(key), count(distinct key), min(key) from src;
+
+select max(key), count(distinct key), min(key) from src;
+
+explain select max(key), count(distinct key), min(key), avg(key) from src;
+
+select max(key), count(distinct key), min(key), avg(key) from src;
+
+explain select count(1), count(distinct key) from src;
+
+select count(1), count(distinct key) from src;
+
+explain select
+  count(*) as total,
+  count(key) as not_null_total,
+  count(distinct key) as unique_days,
+  max(value) as max_ss_store_sk,
+  max(key) as max_ss_promo_sk
+from src;
+
+select
+  count(*) as
total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src; +select count(1), count(distinct key), cast(STDDEV(key) as int) from src; +select count(distinct key), count(1), cast(STDDEV(key) as int) from src; + +explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; + +SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; + +explain select max(key), count(distinct key), min(key), avg(key) from src group by value; + +select max(key), count(distinct key), min(key), avg(key) from src group by value; diff --git a/ql/src/test/results/clientpositive/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out new file mode 100644 index 0000000000..ceda918488 --- /dev/null +++ b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out @@ -0,0 +1,1151 @@ +PREHOOK: query: explain select count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: key (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + 
outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 
max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct 
key), min(key), avg(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2), avg(_col3) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string), _col3 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + 
Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + 
expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col2), count(_col0), max(_col3), max(_col4) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 
depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col3 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col0), stddev(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: bigint), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142 +PREHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142 
+PREHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: substr(value, 5) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col0), avg(_col0), max(_col0), min(_col0), std(_col0), stddev_samp(_col0), variance(_col0), var_samp(_col0) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: double), _col2 (type: struct), _col4 (type: string), _col5 (type: string), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), max(VALUE._col2), min(VALUE._col3), std(VALUE._col4), stddev_samp(VALUE._col5), variance(VALUE._col6), var_samp(VALUE._col7) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1), avg(_col2), count(_col0), max(_col3), min(_col4), std(_col5), stddev_samp(_col6), variance(_col7), var_samp(_col8) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: double), _col1 (type: struct), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: 
sum(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), max(VALUE._col3), min(VALUE._col4), std(VALUE._col5), stddev_samp(VALUE._col6), variance(VALUE._col7), var_samp(VALUE._col8) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: double), _col1 (type: double), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), UDFToInteger(_col5) (type: int), UDFToInteger(_col6) (type: int), UDFToInteger(_col7) (type: int), UDFToInteger(_col8) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +130091.0 260.182 309 98 0 142 143 20428 20469 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), count(DISTINCT key), min(key), avg(key) + keys: value (type: string), key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: string), _col4 (type: string), _col5 (type: struct) + Reduce Operator Tree: + Group By Operator 
+ aggregations: max(VALUE._col0), count(DISTINCT KEY._col1:0._col0), min(VALUE._col2), avg(VALUE._col3) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: double) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 1 0 0.0 +10 1 10 10.0 +100 1 100 100.0 +103 1 103 103.0 +104 1 104 104.0 +105 1 105 105.0 +11 1 11 11.0 +111 1 111 111.0 +113 1 113 113.0 +114 1 114 114.0 +116 1 116 116.0 +118 1 118 118.0 +119 1 119 119.0 +12 1 12 12.0 +120 1 120 120.0 +125 1 125 125.0 +126 1 126 126.0 +128 1 128 128.0 +129 1 129 129.0 +131 1 131 131.0 +133 1 133 133.0 +134 1 134 134.0 +136 1 136 136.0 +137 1 137 137.0 +138 1 138 138.0 +143 1 143 143.0 +145 1 145 145.0 +146 1 146 146.0 +149 1 149 149.0 +15 1 15 15.0 +150 1 150 150.0 +152 1 152 152.0 +153 1 153 153.0 +155 1 155 155.0 +156 1 156 156.0 +157 1 157 157.0 +158 1 158 158.0 +160 1 160 160.0 +162 1 162 162.0 +163 1 163 163.0 +164 1 164 164.0 +165 1 165 165.0 +166 1 166 166.0 +167 1 167 167.0 +168 1 168 168.0 +169 1 169 169.0 +17 1 17 17.0 +170 1 170 170.0 +172 1 172 172.0 +174 1 174 174.0 +175 1 175 175.0 +176 1 176 176.0 +177 1 177 177.0 +178 1 178 178.0 +179 1 179 179.0 +18 1 18 18.0 +180 1 180 180.0 +181 1 181 181.0 +183 1 183 183.0 +186 1 186 186.0 +187 1 187 187.0 +189 1 189 189.0 +19 1 19 19.0 +190 1 190 190.0 +191 1 191 191.0 +192 1 192 192.0 +193 1 193 193.0 +194 1 194 194.0 +195 1 195 195.0 +196 1 196 196.0 +197 1 197 197.0 +199 1 199 199.0 +2 1 2 2.0 +20 1 20 20.0 +200 1 200 200.0 +201 1 201 201.0 +202 1 202 202.0 +203 1 203 203.0 +205 1 205 205.0 +207 1 207 207.0 +208 1 208 208.0 +209 1 209 209.0 +213 1 213 213.0 +214 1 214 214.0 +216 1 216 216.0 +217 1 217 217.0 +218 1 218 218.0 +219 1 219 219.0 +221 1 221 221.0 +222 1 222 222.0 +223 1 223 223.0 +224 1 224 224.0 +226 1 226 226.0 +228 1 228 228.0 +229 1 229 229.0 +230 1 230 230.0 +233 1 233 233.0 +235 1 235 235.0 +237 1 237 237.0 +238 1 238 238.0 +239 1 239 239.0 +24 1 24 24.0 +241 1 241 241.0 +242 1 242 242.0 +244 1 244 244.0 +247 1 247 247.0 +248 1 248 248.0 +249 1 249 249.0 +252 1 252 252.0 +255 1 255 255.0 +256 1 256 256.0 +257 1 257 257.0 +258 1 258 258.0 +26 1 26 26.0 +260 1 260 260.0 +262 1 262 262.0 +263 1 263 263.0 +265 1 265 265.0 +266 1 266 266.0 +27 1 27 27.0 +272 1 272 272.0 +273 1 273 273.0 +274 1 274 274.0 +275 1 275 275.0 +277 1 277 277.0 +278 1 278 278.0 +28 1 28 28.0 +280 1 280 280.0 +281 1 281 281.0 +282 1 282 282.0 +283 1 283 283.0 +284 1 284 284.0 +285 1 285 285.0 
+286 1 286 286.0 +287 1 287 287.0 +288 1 288 288.0 +289 1 289 289.0 +291 1 291 291.0 +292 1 292 292.0 +296 1 296 296.0 +298 1 298 298.0 +30 1 30 30.0 +302 1 302 302.0 +305 1 305 305.0 +306 1 306 306.0 +307 1 307 307.0 +308 1 308 308.0 +309 1 309 309.0 +310 1 310 310.0 +311 1 311 311.0 +315 1 315 315.0 +316 1 316 316.0 +317 1 317 317.0 +318 1 318 318.0 +321 1 321 321.0 +322 1 322 322.0 +323 1 323 323.0 +325 1 325 325.0 +327 1 327 327.0 +33 1 33 33.0 +331 1 331 331.0 +332 1 332 332.0 +333 1 333 333.0 +335 1 335 335.0 +336 1 336 336.0 +338 1 338 338.0 +339 1 339 339.0 +34 1 34 34.0 +341 1 341 341.0 +342 1 342 342.0 +344 1 344 344.0 +345 1 345 345.0 +348 1 348 348.0 +35 1 35 35.0 +351 1 351 351.0 +353 1 353 353.0 +356 1 356 356.0 +360 1 360 360.0 +362 1 362 362.0 +364 1 364 364.0 +365 1 365 365.0 +366 1 366 366.0 +367 1 367 367.0 +368 1 368 368.0 +369 1 369 369.0 +37 1 37 37.0 +373 1 373 373.0 +374 1 374 374.0 +375 1 375 375.0 +377 1 377 377.0 +378 1 378 378.0 +379 1 379 379.0 +382 1 382 382.0 +384 1 384 384.0 +386 1 386 386.0 +389 1 389 389.0 +392 1 392 392.0 +393 1 393 393.0 +394 1 394 394.0 +395 1 395 395.0 +396 1 396 396.0 +397 1 397 397.0 +399 1 399 399.0 +4 1 4 4.0 +400 1 400 400.0 +401 1 401 401.0 +402 1 402 402.0 +403 1 403 403.0 +404 1 404 404.0 +406 1 406 406.0 +407 1 407 407.0 +409 1 409 409.0 +41 1 41 41.0 +411 1 411 411.0 +413 1 413 413.0 +414 1 414 414.0 +417 1 417 417.0 +418 1 418 418.0 +419 1 419 419.0 +42 1 42 42.0 +421 1 421 421.0 +424 1 424 424.0 +427 1 427 427.0 +429 1 429 429.0 +43 1 43 43.0 +430 1 430 430.0 +431 1 431 431.0 +432 1 432 432.0 +435 1 435 435.0 +436 1 436 436.0 +437 1 437 437.0 +438 1 438 438.0 +439 1 439 439.0 +44 1 44 44.0 +443 1 443 443.0 +444 1 444 444.0 +446 1 446 446.0 +448 1 448 448.0 +449 1 449 449.0 +452 1 452 452.0 +453 1 453 453.0 +454 1 454 454.0 +455 1 455 455.0 +457 1 457 457.0 +458 1 458 458.0 +459 1 459 459.0 +460 1 460 460.0 +462 1 462 462.0 +463 1 463 463.0 +466 1 466 466.0 +467 1 467 467.0 +468 1 468 468.0 +469 1 469 469.0 +47 1 47 47.0 +470 1 470 470.0 +472 1 472 472.0 +475 1 475 475.0 +477 1 477 477.0 +478 1 478 478.0 +479 1 479 479.0 +480 1 480 480.0 +481 1 481 481.0 +482 1 482 482.0 +483 1 483 483.0 +484 1 484 484.0 +485 1 485 485.0 +487 1 487 487.0 +489 1 489 489.0 +490 1 490 490.0 +491 1 491 491.0 +492 1 492 492.0 +493 1 493 493.0 +494 1 494 494.0 +495 1 495 495.0 +496 1 496 496.0 +497 1 497 497.0 +498 1 498 498.0 +5 1 5 5.0 +51 1 51 51.0 +53 1 53 53.0 +54 1 54 54.0 +57 1 57 57.0 +58 1 58 58.0 +64 1 64 64.0 +65 1 65 65.0 +66 1 66 66.0 +67 1 67 67.0 +69 1 69 69.0 +70 1 70 70.0 +72 1 72 72.0 +74 1 74 74.0 +76 1 76 76.0 +77 1 77 77.0 +78 1 78 78.0 +8 1 8 8.0 +80 1 80 80.0 +82 1 82 82.0 +83 1 83 83.0 +84 1 84 84.0 +85 1 85 85.0 +86 1 86 86.0 +87 1 87 87.0 +9 1 9 9.0 +90 1 90 90.0 +92 1 92 92.0 +95 1 95 95.0 +96 1 96 96.0 +97 1 97 97.0 +98 1 98 98.0 diff --git a/ql/src/test/results/clientpositive/groupby_sort_11.q.out b/ql/src/test/results/clientpositive/groupby_sort_11.q.out index 2b3bf4a07a..fe6bbb3bd9 100644 --- a/ql/src/test/results/clientpositive/groupby_sort_11.q.out +++ b/ql/src/test/results/clientpositive/groupby_sort_11.q.out @@ -292,7 +292,8 @@ POSTHOOK: query: EXPLAIN select count(distinct key+key) from T1 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -306,24 +307,50 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE 
Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col0) keys: _col0 (type: double) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: double) sort order: + + Map-reduce partition columns: _col0 (type: double) Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + keys: KEY._col0 (type: double) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out new file mode 100644 index 0000000000..844c833c94 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out @@ -0,0 +1,1169 @@ +PREHOOK: query: explain select count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: key (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 205 Data size: 17835 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 17835 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + 
Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 205 Data size: 17835 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(_col0) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 55555 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 55555 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 55555 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce 
Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 205 Data size: 93275 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 93275 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col3 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 205 Data size: 93275 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 205 Data size: 145755 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 145755 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 205 Data size: 145755 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2), avg(_col3) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 632 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 632 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string), _col3 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + 
Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 19475 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 19475 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 19475 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint), _col1 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + 
max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 205 Data size: 96555 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 96555 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 205 Data size: 96555 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(_col1), count(_col2), count(_col0), max(_col3), max(_col4) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: string) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as 
not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 205 Data size: 35875 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 35875 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col3 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 205 Data size: 35875 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(_col1), count(_col0), stddev(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: bigint), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: 
select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142 +PREHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142 +PREHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: substr(value, 5) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: sum(_col0), avg(_col0), max(_col0), min(_col0), std(_col0), stddev_samp(_col0), variance(_col0), var_samp(_col0) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 214 Data size: 243104 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 214 Data size: 243104 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double), _col2 (type: struct), _col4 (type: string), _col5 (type: string), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct), _col9 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), max(VALUE._col2), min(VALUE._col3), std(VALUE._col4), stddev_samp(VALUE._col5), variance(VALUE._col6), var_samp(VALUE._col7) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 214 Data size: 243104 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: sum(_col1), avg(_col2), count(_col0), max(_col3), min(_col4), std(_col5), stddev_samp(_col6), variance(_col7), var_samp(_col8) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num 
rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 960 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: double), _col1 (type: struct), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), _col5 (type: struct), _col6 (type: struct), _col7 (type: struct), _col8 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), max(VALUE._col3), min(VALUE._col4), std(VALUE._col5), stddev_samp(VALUE._col6), variance(VALUE._col7), var_samp(VALUE._col8) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: double), _col1 (type: double), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), UDFToInteger(_col5) (type: int), UDFToInteger(_col6) (type: int), UDFToInteger(_col7) (type: int), UDFToInteger(_col8) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 408 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 408 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +130091.0 260.182 309 98 0 142 143 20428 20469 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Group By 
Operator + aggregations: max(key), count(DISTINCT key), min(key), avg(key) + keys: value (type: string), key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 250 Data size: 202500 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 250 Data size: 202500 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: string), _col4 (type: string), _col5 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(DISTINCT KEY._col1:0._col0), min(VALUE._col2), avg(VALUE._col3) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 214 Data size: 101650 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: double) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 214 Data size: 82176 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 214 Data size: 82176 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 1 0 0.0 +10 1 10 10.0 +100 1 100 100.0 +103 1 103 103.0 +104 1 104 104.0 +105 1 105 105.0 +11 1 11 11.0 +111 1 111 111.0 +113 1 113 113.0 +114 1 114 114.0 +116 1 116 116.0 +118 1 118 118.0 +119 1 119 119.0 +12 1 12 12.0 +120 1 120 120.0 +125 1 125 125.0 +126 1 126 126.0 +128 1 128 128.0 +129 1 129 129.0 +131 1 131 131.0 +133 1 133 133.0 +134 1 134 134.0 +136 1 136 136.0 +137 1 137 137.0 +138 1 138 138.0 +143 1 143 143.0 +145 1 145 145.0 +146 1 146 146.0 +149 1 149 149.0 +15 1 15 15.0 +150 1 150 150.0 +152 1 152 152.0 +153 1 153 153.0 +155 1 155 155.0 +156 1 156 156.0 +157 1 157 157.0 +158 1 158 158.0 +160 1 160 160.0 +162 1 162 162.0 +163 1 163 163.0 +164 1 164 164.0 +165 1 165 165.0 +166 1 166 166.0 +167 1 167 167.0 +168 1 168 168.0 +169 1 169 169.0 +17 1 17 17.0 +170 1 170 170.0 +172 1 172 172.0 +174 1 174 174.0 +175 1 175 175.0 +176 1 176 176.0 +177 1 177 177.0 +178 1 178 178.0 +179 1 179 179.0 +18 1 18 18.0 +180 1 180 180.0 +181 1 181 181.0 +183 1 183 183.0 +186 1 186 186.0 +187 1 187 187.0 +189 1 189 189.0 +19 1 19 19.0 +190 1 190 190.0 +191 1 191 191.0 +192 1 192 192.0 +193 1 193 193.0 +194 1 194 194.0 +195 1 195 195.0 +196 1 196 196.0 +197 1 197 197.0 +199 1 199 199.0 +2 1 2 2.0 +20 1 20 20.0 +200 1 200 200.0 +201 1 201 201.0 +202 1 202 202.0 +203 1 203 203.0 +205 1 205 205.0 +207 1 207 207.0 +208 1 208 208.0 +209 1 209 209.0 +213 1 213 213.0 +214 1 214 214.0 +216 1 216 216.0 +217 1 217 217.0 +218 1 218 218.0 +219 1 
219 219.0 +221 1 221 221.0 +222 1 222 222.0 +223 1 223 223.0 +224 1 224 224.0 +226 1 226 226.0 +228 1 228 228.0 +229 1 229 229.0 +230 1 230 230.0 +233 1 233 233.0 +235 1 235 235.0 +237 1 237 237.0 +238 1 238 238.0 +239 1 239 239.0 +24 1 24 24.0 +241 1 241 241.0 +242 1 242 242.0 +244 1 244 244.0 +247 1 247 247.0 +248 1 248 248.0 +249 1 249 249.0 +252 1 252 252.0 +255 1 255 255.0 +256 1 256 256.0 +257 1 257 257.0 +258 1 258 258.0 +26 1 26 26.0 +260 1 260 260.0 +262 1 262 262.0 +263 1 263 263.0 +265 1 265 265.0 +266 1 266 266.0 +27 1 27 27.0 +272 1 272 272.0 +273 1 273 273.0 +274 1 274 274.0 +275 1 275 275.0 +277 1 277 277.0 +278 1 278 278.0 +28 1 28 28.0 +280 1 280 280.0 +281 1 281 281.0 +282 1 282 282.0 +283 1 283 283.0 +284 1 284 284.0 +285 1 285 285.0 +286 1 286 286.0 +287 1 287 287.0 +288 1 288 288.0 +289 1 289 289.0 +291 1 291 291.0 +292 1 292 292.0 +296 1 296 296.0 +298 1 298 298.0 +30 1 30 30.0 +302 1 302 302.0 +305 1 305 305.0 +306 1 306 306.0 +307 1 307 307.0 +308 1 308 308.0 +309 1 309 309.0 +310 1 310 310.0 +311 1 311 311.0 +315 1 315 315.0 +316 1 316 316.0 +317 1 317 317.0 +318 1 318 318.0 +321 1 321 321.0 +322 1 322 322.0 +323 1 323 323.0 +325 1 325 325.0 +327 1 327 327.0 +33 1 33 33.0 +331 1 331 331.0 +332 1 332 332.0 +333 1 333 333.0 +335 1 335 335.0 +336 1 336 336.0 +338 1 338 338.0 +339 1 339 339.0 +34 1 34 34.0 +341 1 341 341.0 +342 1 342 342.0 +344 1 344 344.0 +345 1 345 345.0 +348 1 348 348.0 +35 1 35 35.0 +351 1 351 351.0 +353 1 353 353.0 +356 1 356 356.0 +360 1 360 360.0 +362 1 362 362.0 +364 1 364 364.0 +365 1 365 365.0 +366 1 366 366.0 +367 1 367 367.0 +368 1 368 368.0 +369 1 369 369.0 +37 1 37 37.0 +373 1 373 373.0 +374 1 374 374.0 +375 1 375 375.0 +377 1 377 377.0 +378 1 378 378.0 +379 1 379 379.0 +382 1 382 382.0 +384 1 384 384.0 +386 1 386 386.0 +389 1 389 389.0 +392 1 392 392.0 +393 1 393 393.0 +394 1 394 394.0 +395 1 395 395.0 +396 1 396 396.0 +397 1 397 397.0 +399 1 399 399.0 +4 1 4 4.0 +400 1 400 400.0 +401 1 401 401.0 +402 1 402 402.0 +403 1 403 403.0 +404 1 404 404.0 +406 1 406 406.0 +407 1 407 407.0 +409 1 409 409.0 +41 1 41 41.0 +411 1 411 411.0 +413 1 413 413.0 +414 1 414 414.0 +417 1 417 417.0 +418 1 418 418.0 +419 1 419 419.0 +42 1 42 42.0 +421 1 421 421.0 +424 1 424 424.0 +427 1 427 427.0 +429 1 429 429.0 +43 1 43 43.0 +430 1 430 430.0 +431 1 431 431.0 +432 1 432 432.0 +435 1 435 435.0 +436 1 436 436.0 +437 1 437 437.0 +438 1 438 438.0 +439 1 439 439.0 +44 1 44 44.0 +443 1 443 443.0 +444 1 444 444.0 +446 1 446 446.0 +448 1 448 448.0 +449 1 449 449.0 +452 1 452 452.0 +453 1 453 453.0 +454 1 454 454.0 +455 1 455 455.0 +457 1 457 457.0 +458 1 458 458.0 +459 1 459 459.0 +460 1 460 460.0 +462 1 462 462.0 +463 1 463 463.0 +466 1 466 466.0 +467 1 467 467.0 +468 1 468 468.0 +469 1 469 469.0 +47 1 47 47.0 +470 1 470 470.0 +472 1 472 472.0 +475 1 475 475.0 +477 1 477 477.0 +478 1 478 478.0 +479 1 479 479.0 +480 1 480 480.0 +481 1 481 481.0 +482 1 482 482.0 +483 1 483 483.0 +484 1 484 484.0 +485 1 485 485.0 +487 1 487 487.0 +489 1 489 489.0 +490 1 490 490.0 +491 1 491 491.0 +492 1 492 492.0 +493 1 493 493.0 +494 1 494 494.0 +495 1 495 495.0 +496 1 496 496.0 +497 1 497 497.0 +498 1 498 498.0 +5 1 5 5.0 +51 1 51 51.0 +53 1 53 53.0 +54 1 54 54.0 +57 1 57 57.0 +58 1 58 58.0 +64 1 64 64.0 +65 1 65 65.0 +66 1 66 66.0 +67 1 67 67.0 +69 1 69 69.0 +70 1 70 70.0 +72 1 72 72.0 +74 1 74 74.0 +76 1 76 76.0 +77 1 77 77.0 +78 1 78 78.0 +8 1 8 8.0 +80 1 80 80.0 +82 1 82 82.0 +83 1 83 83.0 +84 1 84 84.0 +85 1 85 85.0 +86 1 86 86.0 +87 1 87 87.0 +9 1 9 9.0 +90 1 90 90.0 +92 1 92 
92.0 +95 1 95 95.0 +96 1 96 96.0 +97 1 97 97.0 +98 1 98 98.0 diff --git a/ql/src/test/results/clientpositive/nullgroup4.q.out b/ql/src/test/results/clientpositive/nullgroup4.q.out index e5a8eeee14..d4c8e6a744 100644 --- a/ql/src/test/results/clientpositive/nullgroup4.q.out +++ b/ql/src/test/results/clientpositive/nullgroup4.q.out @@ -93,7 +93,8 @@ select count(1), count(distinct x.value) from src x where x.key = 9999 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -110,25 +111,53 @@ STAGE PLANS: outputColumnNames: _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(1), count(DISTINCT _col1) + aggregations: count(1) keys: _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) sort order: + + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: bigint) Reduce Operator Tree: Group By Operator - aggregations: count(VALUE._col0), count(DISTINCT KEY._col0:0._col0) + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/perf/query16.q.out b/ql/src/test/results/clientpositive/perf/query16.q.out index cf90c0c162..7fa7a00808 100644 --- a/ql/src/test/results/clientpositive/perf/query16.q.out +++ b/ql/src/test/results/clientpositive/perf/query16.q.out @@ -1,4 +1,4 @@ -Warning: Shuffle Join MERGEJOIN[106][tables = [$hdt$_2, $hdt$_3, $hdt$_1, $hdt$_4]] in Stage 'Reducer 17' is a cross product +Warning: Shuffle Join MERGEJOIN[112][tables = [$hdt$_2, $hdt$_3, $hdt$_1, $hdt$_4]] in Stage 'Reducer 18' is a cross product PREHOOK: query: explain 
select count(distinct cs_order_number) as `order count` ,sum(cs_ext_ship_cost) as `total shipping cost` @@ -60,172 +60,180 @@ POSTHOOK: type: QUERY Plan optimized by CBO. Vertex dependency in root stage -Reducer 13 <- Map 12 (SIMPLE_EDGE) -Reducer 15 <- Map 14 (SIMPLE_EDGE), Reducer 18 (SIMPLE_EDGE) -Reducer 16 <- Reducer 15 (SIMPLE_EDGE) -Reducer 17 <- Map 14 (CUSTOM_SIMPLE_EDGE), Map 19 (CUSTOM_SIMPLE_EDGE), Map 20 (CUSTOM_SIMPLE_EDGE), Map 21 (CUSTOM_SIMPLE_EDGE) -Reducer 18 <- Reducer 17 (SIMPLE_EDGE) -Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 9 (SIMPLE_EDGE) -Reducer 3 <- Map 10 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) -Reducer 4 <- Map 11 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) -Reducer 5 <- Reducer 13 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) -Reducer 6 <- Reducer 16 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) +Reducer 14 <- Map 13 (SIMPLE_EDGE) +Reducer 16 <- Map 15 (SIMPLE_EDGE), Reducer 19 (SIMPLE_EDGE) +Reducer 17 <- Reducer 16 (SIMPLE_EDGE) +Reducer 18 <- Map 15 (CUSTOM_SIMPLE_EDGE), Map 20 (CUSTOM_SIMPLE_EDGE), Map 21 (CUSTOM_SIMPLE_EDGE), Map 22 (CUSTOM_SIMPLE_EDGE) +Reducer 19 <- Reducer 18 (SIMPLE_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 10 (SIMPLE_EDGE) +Reducer 3 <- Map 11 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Map 12 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) +Reducer 5 <- Reducer 14 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) +Reducer 6 <- Reducer 17 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) Reducer 7 <- Reducer 6 (SIMPLE_EDGE) -Reducer 8 <- Reducer 7 (SIMPLE_EDGE) +Reducer 8 <- Reducer 7 (CUSTOM_SIMPLE_EDGE) +Reducer 9 <- Reducer 8 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:100 Stage-1 - Reducer 8 + Reducer 9 File Output Operator [FS_73] Limit [LIM_72] (rows=1 width=344) Number of rows:100 Select Operator [SEL_71] (rows=1 width=344) Output:["_col0","_col1","_col2"] - <-Reducer 7 [SIMPLE_EDGE] + <-Reducer 8 [SIMPLE_EDGE] SHUFFLE [RS_70] - Group By Operator [GBY_68] (rows=1 width=344) - Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col0:0._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] - <-Reducer 6 [SIMPLE_EDGE] - SHUFFLE [RS_67] - Group By Operator [GBY_66] (rows=1395035081047425024 width=1) - Output:["_col0","_col1","_col2","_col3"],aggregations:["count(DISTINCT _col4)","sum(_col5)","sum(_col6)"],keys:_col4 - Select Operator [SEL_65] (rows=1395035081047425024 width=1) - Output:["_col4","_col5","_col6"] - Filter Operator [FIL_64] (rows=1395035081047425024 width=1) - predicate:_col16 is null - Select Operator [SEL_63] (rows=2790070162094850048 width=1) - Output:["_col4","_col5","_col6","_col16"] - Merge Join Operator [MERGEJOIN_112] (rows=2790070162094850048 width=1) - Conds:RS_60._col3, _col4=RS_61._col0, _col1(Inner),Output:["_col4","_col5","_col6","_col14"] - <-Reducer 16 [SIMPLE_EDGE] - SHUFFLE [RS_61] - PartitionCols:_col0, _col1 - Group By Operator [GBY_46] (rows=2536427365110644736 width=1) - Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 - <-Reducer 15 [SIMPLE_EDGE] - SHUFFLE [RS_45] - PartitionCols:_col0, _col1 - Group By Operator [GBY_44] (rows=5072854730221289472 width=1) - Output:["_col0","_col1"],keys:_col2, _col3 - Select Operator [SEL_43] (rows=5072854730221289472 width=1) - Output:["_col2","_col3"] - Filter Operator [FIL_42] (rows=5072854730221289472 width=1) - predicate:(_col2 <> _col0) - Merge Join Operator [MERGEJOIN_110] (rows=5072854730221289472 width=1) - Conds:RS_39._col1=RS_40._col1(Inner),Output:["_col0","_col2","_col3"] - <-Map 14 [SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_39] - PartitionCols:_col1 - Select Operator [SEL_20] 
(rows=287989836 width=135) - Output:["_col0","_col1"] - TableScan [TS_19] (rows=287989836 width=135) - default@catalog_sales,cs2,Tbl:COMPLETE,Col:NONE,Output:["cs_warehouse_sk","cs_order_number"] - <-Reducer 18 [SIMPLE_EDGE] - SHUFFLE [RS_40] - PartitionCols:_col1 - Select Operator [SEL_38] (rows=4611686018427387903 width=1) - Output:["_col0","_col1"] - Group By Operator [GBY_37] (rows=4611686018427387903 width=1) - Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 - <-Reducer 17 [SIMPLE_EDGE] - SHUFFLE [RS_36] - PartitionCols:_col0, _col1 - Group By Operator [GBY_35] (rows=9223372036854775807 width=1) - Output:["_col0","_col1"],keys:_col4, _col3 - Merge Join Operator [MERGEJOIN_106] (rows=9223372036854775807 width=1) - Conds:(Inner),(Inner),(Inner),Output:["_col3","_col4"] - <-Map 14 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_32] - Select Operator [SEL_28] (rows=287989836 width=135) - Output:["_col0","_col1"] - Please refer to the previous TableScan [TS_19] - <-Map 19 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_29] - Select Operator [SEL_22] (rows=73049 width=4) - TableScan [TS_21] (rows=73049 width=1119) - default@date_dim,date_dim,Tbl:COMPLETE,Col:COMPLETE - <-Map 20 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_30] - Select Operator [SEL_24] (rows=60 width=4) - TableScan [TS_23] (rows=60 width=2045) - default@call_center,call_center,Tbl:COMPLETE,Col:COMPLETE - <-Map 21 [CUSTOM_SIMPLE_EDGE] - PARTITION_ONLY_SHUFFLE [RS_31] - Select Operator [SEL_26] (rows=40000000 width=4) - TableScan [TS_25] (rows=40000000 width=1014) - default@customer_address,customer_address,Tbl:COMPLETE,Col:COMPLETE - <-Reducer 5 [SIMPLE_EDGE] - SHUFFLE [RS_60] - PartitionCols:_col3, _col4 - Merge Join Operator [MERGEJOIN_111] (rows=421645953 width=135) - Conds:RS_57._col4=RS_58._col0(Left Outer),Output:["_col3","_col4","_col5","_col6","_col14"] - <-Reducer 13 [SIMPLE_EDGE] - SHUFFLE [RS_58] - PartitionCols:_col0 - Select Operator [SEL_18] (rows=14399440 width=106) - Output:["_col0","_col1"] - Group By Operator [GBY_17] (rows=14399440 width=106) - Output:["_col0"],keys:KEY._col0 - <-Map 12 [SIMPLE_EDGE] - SHUFFLE [RS_16] + Group By Operator [GBY_111] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] + <-Reducer 7 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_110] + Group By Operator [GBY_109] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(_col0)","sum(_col1)","sum(_col2)"] + Group By Operator [GBY_108] (rows=1395035081047425024 width=1) + Output:["_col0","_col1","_col2"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0 + <-Reducer 6 [SIMPLE_EDGE] + SHUFFLE [RS_107] + PartitionCols:_col0 + Group By Operator [GBY_106] (rows=1395035081047425024 width=1) + Output:["_col0","_col2","_col3"],aggregations:["sum(_col5)","sum(_col6)"],keys:_col4 + Select Operator [SEL_65] (rows=1395035081047425024 width=1) + Output:["_col4","_col5","_col6"] + Filter Operator [FIL_64] (rows=1395035081047425024 width=1) + predicate:_col16 is null + Select Operator [SEL_63] (rows=2790070162094850048 width=1) + Output:["_col4","_col5","_col6","_col16"] + Merge Join Operator [MERGEJOIN_118] (rows=2790070162094850048 width=1) + Conds:RS_60._col3, _col4=RS_61._col0, _col1(Inner),Output:["_col4","_col5","_col6","_col14"] + <-Reducer 17 [SIMPLE_EDGE] + SHUFFLE [RS_61] + PartitionCols:_col0, _col1 + Group By Operator [GBY_46] (rows=2536427365110644736 width=1) + Output:["_col0","_col1"],keys:KEY._col0, 
KEY._col1 + <-Reducer 16 [SIMPLE_EDGE] + SHUFFLE [RS_45] + PartitionCols:_col0, _col1 + Group By Operator [GBY_44] (rows=5072854730221289472 width=1) + Output:["_col0","_col1"],keys:_col2, _col3 + Select Operator [SEL_43] (rows=5072854730221289472 width=1) + Output:["_col2","_col3"] + Filter Operator [FIL_42] (rows=5072854730221289472 width=1) + predicate:(_col2 <> _col0) + Merge Join Operator [MERGEJOIN_116] (rows=5072854730221289472 width=1) + Conds:RS_39._col1=RS_40._col1(Inner),Output:["_col0","_col2","_col3"] + <-Map 15 [SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_39] + PartitionCols:_col1 + Select Operator [SEL_20] (rows=287989836 width=135) + Output:["_col0","_col1"] + TableScan [TS_19] (rows=287989836 width=135) + default@catalog_sales,cs2,Tbl:COMPLETE,Col:NONE,Output:["cs_warehouse_sk","cs_order_number"] + <-Reducer 19 [SIMPLE_EDGE] + SHUFFLE [RS_40] + PartitionCols:_col1 + Select Operator [SEL_38] (rows=4611686018427387903 width=1) + Output:["_col0","_col1"] + Group By Operator [GBY_37] (rows=4611686018427387903 width=1) + Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 + <-Reducer 18 [SIMPLE_EDGE] + SHUFFLE [RS_36] + PartitionCols:_col0, _col1 + Group By Operator [GBY_35] (rows=9223372036854775807 width=1) + Output:["_col0","_col1"],keys:_col4, _col3 + Merge Join Operator [MERGEJOIN_112] (rows=9223372036854775807 width=1) + Conds:(Inner),(Inner),(Inner),Output:["_col3","_col4"] + <-Map 15 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_32] + Select Operator [SEL_28] (rows=287989836 width=135) + Output:["_col0","_col1"] + Please refer to the previous TableScan [TS_19] + <-Map 20 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_29] + Select Operator [SEL_22] (rows=73049 width=4) + TableScan [TS_21] (rows=73049 width=1119) + default@date_dim,date_dim,Tbl:COMPLETE,Col:COMPLETE + <-Map 21 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_30] + Select Operator [SEL_24] (rows=60 width=4) + TableScan [TS_23] (rows=60 width=2045) + default@call_center,call_center,Tbl:COMPLETE,Col:COMPLETE + <-Map 22 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_31] + Select Operator [SEL_26] (rows=40000000 width=4) + TableScan [TS_25] (rows=40000000 width=1014) + default@customer_address,customer_address,Tbl:COMPLETE,Col:COMPLETE + <-Reducer 5 [SIMPLE_EDGE] + SHUFFLE [RS_60] + PartitionCols:_col3, _col4 + Merge Join Operator [MERGEJOIN_117] (rows=421645953 width=135) + Conds:RS_57._col4=RS_58._col0(Left Outer),Output:["_col3","_col4","_col5","_col6","_col14"] + <-Reducer 14 [SIMPLE_EDGE] + SHUFFLE [RS_58] PartitionCols:_col0 - Group By Operator [GBY_15] (rows=28798881 width=106) - Output:["_col0"],keys:cr_order_number - Filter Operator [FIL_103] (rows=28798881 width=106) - predicate:cr_order_number is not null - TableScan [TS_12] (rows=28798881 width=106) - default@catalog_returns,cr1,Tbl:COMPLETE,Col:NONE,Output:["cr_order_number"] - <-Reducer 4 [SIMPLE_EDGE] - SHUFFLE [RS_57] - PartitionCols:_col4 - Merge Join Operator [MERGEJOIN_109] (rows=383314495 width=135) - Conds:RS_54._col2=RS_55._col0(Inner),Output:["_col3","_col4","_col5","_col6"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_55] - PartitionCols:_col0 - Select Operator [SEL_11] (rows=30 width=2045) - Output:["_col0"] - Filter Operator [FIL_102] (rows=30 width=2045) - predicate:((cc_county) IN ('Ziebach County', 'Levy County', 'Huron County', 'Franklin Parish', 'Daviess County') and cc_call_center_sk is not null) - TableScan [TS_9] (rows=60 width=2045) - 
default@call_center,call_center,Tbl:COMPLETE,Col:NONE,Output:["cc_call_center_sk","cc_county"] - <-Reducer 3 [SIMPLE_EDGE] - SHUFFLE [RS_54] - PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_108] (rows=348467716 width=135) - Conds:RS_51._col1=RS_52._col0(Inner),Output:["_col2","_col3","_col4","_col5","_col6"] - <-Map 10 [SIMPLE_EDGE] - SHUFFLE [RS_52] - PartitionCols:_col0 - Select Operator [SEL_8] (rows=20000000 width=1014) - Output:["_col0"] - Filter Operator [FIL_101] (rows=20000000 width=1014) - predicate:((ca_state = 'NY') and ca_address_sk is not null) - TableScan [TS_6] (rows=40000000 width=1014) - default@customer_address,customer_address,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_51] - PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_107] (rows=316788826 width=135) - Conds:RS_48._col0=RS_49._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5","_col6"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_48] + Select Operator [SEL_18] (rows=14399440 width=106) + Output:["_col0","_col1"] + Group By Operator [GBY_17] (rows=14399440 width=106) + Output:["_col0"],keys:KEY._col0 + <-Map 13 [SIMPLE_EDGE] + SHUFFLE [RS_16] PartitionCols:_col0 - Select Operator [SEL_2] (rows=287989836 width=135) - Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"] - Filter Operator [FIL_99] (rows=287989836 width=135) - predicate:(cs_ship_date_sk is not null and cs_ship_addr_sk is not null and cs_call_center_sk is not null) - TableScan [TS_0] (rows=287989836 width=135) - default@catalog_sales,cs1,Tbl:COMPLETE,Col:NONE,Output:["cs_ship_date_sk","cs_ship_addr_sk","cs_call_center_sk","cs_warehouse_sk","cs_order_number","cs_ext_ship_cost","cs_net_profit"] - <-Map 9 [SIMPLE_EDGE] - SHUFFLE [RS_49] - PartitionCols:_col0 - Select Operator [SEL_5] (rows=8116 width=1119) - Output:["_col0"] - Filter Operator [FIL_100] (rows=8116 width=1119) - predicate:(CAST( d_date AS TIMESTAMP) BETWEEN 2001-04-01 00:00:00.0 AND 2001-05-31 01:00:00.0 and d_date_sk is not null) - TableScan [TS_3] (rows=73049 width=1119) - default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] + Group By Operator [GBY_15] (rows=28798881 width=106) + Output:["_col0"],keys:cr_order_number + Filter Operator [FIL_103] (rows=28798881 width=106) + predicate:cr_order_number is not null + TableScan [TS_12] (rows=28798881 width=106) + default@catalog_returns,cr1,Tbl:COMPLETE,Col:NONE,Output:["cr_order_number"] + <-Reducer 4 [SIMPLE_EDGE] + SHUFFLE [RS_57] + PartitionCols:_col4 + Merge Join Operator [MERGEJOIN_115] (rows=383314495 width=135) + Conds:RS_54._col2=RS_55._col0(Inner),Output:["_col3","_col4","_col5","_col6"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_55] + PartitionCols:_col0 + Select Operator [SEL_11] (rows=30 width=2045) + Output:["_col0"] + Filter Operator [FIL_102] (rows=30 width=2045) + predicate:((cc_county) IN ('Ziebach County', 'Levy County', 'Huron County', 'Franklin Parish', 'Daviess County') and cc_call_center_sk is not null) + TableScan [TS_9] (rows=60 width=2045) + default@call_center,call_center,Tbl:COMPLETE,Col:NONE,Output:["cc_call_center_sk","cc_county"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_54] + PartitionCols:_col2 + Merge Join Operator [MERGEJOIN_114] (rows=348467716 width=135) + Conds:RS_51._col1=RS_52._col0(Inner),Output:["_col2","_col3","_col4","_col5","_col6"] + <-Map 11 [SIMPLE_EDGE] + SHUFFLE [RS_52] + PartitionCols:_col0 + Select Operator [SEL_8] (rows=20000000 width=1014) + Output:["_col0"] + Filter Operator [FIL_101] 
(rows=20000000 width=1014) + predicate:((ca_state = 'NY') and ca_address_sk is not null) + TableScan [TS_6] (rows=40000000 width=1014) + default@customer_address,customer_address,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_51] + PartitionCols:_col1 + Merge Join Operator [MERGEJOIN_113] (rows=316788826 width=135) + Conds:RS_48._col0=RS_49._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5","_col6"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_48] + PartitionCols:_col0 + Select Operator [SEL_2] (rows=287989836 width=135) + Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"] + Filter Operator [FIL_99] (rows=287989836 width=135) + predicate:(cs_ship_date_sk is not null and cs_ship_addr_sk is not null and cs_call_center_sk is not null) + TableScan [TS_0] (rows=287989836 width=135) + default@catalog_sales,cs1,Tbl:COMPLETE,Col:NONE,Output:["cs_ship_date_sk","cs_ship_addr_sk","cs_call_center_sk","cs_warehouse_sk","cs_order_number","cs_ext_ship_cost","cs_net_profit"] + <-Map 10 [SIMPLE_EDGE] + SHUFFLE [RS_49] + PartitionCols:_col0 + Select Operator [SEL_5] (rows=8116 width=1119) + Output:["_col0"] + Filter Operator [FIL_100] (rows=8116 width=1119) + predicate:(CAST( d_date AS TIMESTAMP) BETWEEN 2001-04-01 00:00:00.0 AND 2001-05-31 01:00:00.0 and d_date_sk is not null) + TableScan [TS_3] (rows=73049 width=1119) + default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] diff --git a/ql/src/test/results/clientpositive/perf/query28.q.out b/ql/src/test/results/clientpositive/perf/query28.q.out index 78129cf68b..8ada59fd00 100644 --- a/ql/src/test/results/clientpositive/perf/query28.q.out +++ b/ql/src/test/results/clientpositive/perf/query28.q.out @@ -1,4 +1,4 @@ -Warning: Shuffle Join MERGEJOIN[58][tables = [$hdt$_0, $hdt$_1, $hdt$_2, $hdt$_3, $hdt$_4, $hdt$_5]] in Stage 'Reducer 3' is a cross product +Warning: Shuffle Join MERGEJOIN[64][tables = [$hdt$_0, $hdt$_1, $hdt$_2, $hdt$_3, $hdt$_4, $hdt$_5]] in Stage 'Reducer 4' is a cross product PREHOOK: query: explain select * from (select avg(ss_list_price) B1_LP ,count(ss_list_price) B1_CNT @@ -105,40 +105,48 @@ Plan optimized by CBO. 
Vertex dependency in root stage Reducer 2 <- Map 1 (SIMPLE_EDGE) -Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 4 (CUSTOM_SIMPLE_EDGE), Reducer 5 (CUSTOM_SIMPLE_EDGE), Reducer 6 (CUSTOM_SIMPLE_EDGE), Reducer 7 (CUSTOM_SIMPLE_EDGE), Reducer 8 (CUSTOM_SIMPLE_EDGE) -Reducer 4 <- Map 1 (SIMPLE_EDGE) +Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE), Reducer 5 (CUSTOM_SIMPLE_EDGE), Reducer 6 (CUSTOM_SIMPLE_EDGE), Reducer 7 (CUSTOM_SIMPLE_EDGE), Reducer 8 (CUSTOM_SIMPLE_EDGE), Reducer 9 (CUSTOM_SIMPLE_EDGE) Reducer 5 <- Map 1 (SIMPLE_EDGE) Reducer 6 <- Map 1 (SIMPLE_EDGE) Reducer 7 <- Map 1 (SIMPLE_EDGE) Reducer 8 <- Map 1 (SIMPLE_EDGE) +Reducer 9 <- Map 1 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:100 Stage-1 - Reducer 3 + Reducer 4 File Output Operator [FS_51] - Limit [LIM_50] (rows=1 width=2497) + Limit [LIM_50] (rows=1 width=2665) Number of rows:100 - Select Operator [SEL_49] (rows=1 width=2497) + Select Operator [SEL_49] (rows=1 width=2665) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17"] - Merge Join Operator [MERGEJOIN_58] (rows=1 width=2497) + Merge Join Operator [MERGEJOIN_64] (rows=1 width=2665) Conds:(Inner),(Inner),(Inner),(Inner),(Inner),Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17"] - <-Reducer 2 [CUSTOM_SIMPLE_EDGE] + <-Reducer 3 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_42] - Group By Operator [GBY_5] (rows=1 width=416) - Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_4] - Group By Operator [GBY_3] (rows=21333171 width=88) - Output:["_col0","_col1","_col2","_col3"],aggregations:["avg(ss_list_price)","count(ss_list_price)","count(DISTINCT ss_list_price)"],keys:ss_list_price - Select Operator [SEL_2] (rows=21333171 width=88) - Output:["ss_list_price"] - Filter Operator [FIL_52] (rows=21333171 width=88) - predicate:(ss_quantity BETWEEN 0 AND 5 and (ss_list_price BETWEEN 11 AND 21 or ss_coupon_amt BETWEEN 460 AND 1460 or ss_wholesale_cost BETWEEN 14 AND 34)) - TableScan [TS_0] (rows=575995635 width=88) - default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_quantity","ss_wholesale_cost","ss_list_price","ss_coupon_amt"] - <-Reducer 4 [CUSTOM_SIMPLE_EDGE] + Group By Operator [GBY_63] (rows=1 width=584) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"] + <-Reducer 2 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_62] + Group By Operator [GBY_61] (rows=1 width=584) + Output:["_col0","_col1","_col2"],aggregations:["avg(_col1)","count(_col2)","count(_col0)"] + Group By Operator [GBY_60] (rows=21333171 width=88) + Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_59] + PartitionCols:_col0 + Group By Operator [GBY_58] (rows=21333171 width=88) + Output:["_col0","_col1","_col2"],aggregations:["avg(ss_list_price)","count(ss_list_price)"],keys:ss_list_price + Select Operator [SEL_2] (rows=21333171 width=88) + Output:["ss_list_price"] + Filter Operator [FIL_52] (rows=21333171 width=88) + predicate:(ss_quantity BETWEEN 0 AND 5 and (ss_list_price BETWEEN 11 AND 21 or ss_coupon_amt BETWEEN 460 AND 1460 or ss_wholesale_cost BETWEEN 14 AND 34)) + 
TableScan [TS_0] (rows=575995635 width=88) + default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_quantity","ss_wholesale_cost","ss_list_price","ss_coupon_amt"] + <-Reducer 5 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_43] Group By Operator [GBY_12] (rows=1 width=416) Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] @@ -151,7 +159,7 @@ Stage-0 Filter Operator [FIL_53] (rows=21333171 width=88) predicate:(ss_quantity BETWEEN 26 AND 30 and (ss_list_price BETWEEN 28 AND 38 or ss_coupon_amt BETWEEN 2513 AND 3513 or ss_wholesale_cost BETWEEN 42 AND 62)) Please refer to the previous TableScan [TS_0] - <-Reducer 5 [CUSTOM_SIMPLE_EDGE] + <-Reducer 6 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_44] Group By Operator [GBY_19] (rows=1 width=416) Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] @@ -164,7 +172,7 @@ Stage-0 Filter Operator [FIL_54] (rows=21333171 width=88) predicate:(ss_quantity BETWEEN 21 AND 25 and (ss_list_price BETWEEN 135 AND 145 or ss_coupon_amt BETWEEN 14180 AND 15180 or ss_wholesale_cost BETWEEN 38 AND 58)) Please refer to the previous TableScan [TS_0] - <-Reducer 6 [CUSTOM_SIMPLE_EDGE] + <-Reducer 7 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_45] Group By Operator [GBY_26] (rows=1 width=416) Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] @@ -177,7 +185,7 @@ Stage-0 Filter Operator [FIL_55] (rows=21333171 width=88) predicate:(ss_quantity BETWEEN 16 AND 20 and (ss_list_price BETWEEN 142 AND 152 or ss_coupon_amt BETWEEN 3054 AND 4054 or ss_wholesale_cost BETWEEN 80 AND 100)) Please refer to the previous TableScan [TS_0] - <-Reducer 7 [CUSTOM_SIMPLE_EDGE] + <-Reducer 8 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_46] Group By Operator [GBY_33] (rows=1 width=416) Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] @@ -190,7 +198,7 @@ Stage-0 Filter Operator [FIL_56] (rows=21333171 width=88) predicate:(ss_quantity BETWEEN 11 AND 15 and (ss_list_price BETWEEN 66 AND 76 or ss_coupon_amt BETWEEN 920 AND 1920 or ss_wholesale_cost BETWEEN 4 AND 24)) Please refer to the previous TableScan [TS_0] - <-Reducer 8 [CUSTOM_SIMPLE_EDGE] + <-Reducer 9 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_47] Group By Operator [GBY_40] (rows=1 width=416) Output:["_col0","_col1","_col2"],aggregations:["avg(VALUE._col0)","count(VALUE._col1)","count(DISTINCT KEY._col0:0._col0)"] diff --git a/ql/src/test/results/clientpositive/perf/query94.q.out b/ql/src/test/results/clientpositive/perf/query94.q.out index 836b16bf9f..0a0011c5d2 100644 --- a/ql/src/test/results/clientpositive/perf/query94.q.out +++ b/ql/src/test/results/clientpositive/perf/query94.q.out @@ -5,126 +5,134 @@ POSTHOOK: type: QUERY Plan optimized by CBO. 
Vertex dependency in root stage -Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 9 (SIMPLE_EDGE) -Reducer 3 <- Map 11 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) -Reducer 4 <- Map 12 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) -Reducer 5 <- Map 13 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) -Reducer 6 <- Map 14 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) +Reducer 10 <- Map 11 (SIMPLE_EDGE), Map 9 (SIMPLE_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 10 (SIMPLE_EDGE) +Reducer 3 <- Map 12 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Map 13 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) +Reducer 5 <- Map 14 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) +Reducer 6 <- Map 15 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) Reducer 7 <- Reducer 6 (SIMPLE_EDGE) -Reducer 9 <- Map 10 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) +Reducer 8 <- Reducer 7 (CUSTOM_SIMPLE_EDGE) Stage-0 Fetch Operator limit:100 Stage-1 - Reducer 7 + Reducer 8 File Output Operator [FS_51] Limit [LIM_50] (rows=1 width=344) Number of rows:100 - Group By Operator [GBY_48] (rows=1 width=344) - Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col0:0._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] - <-Reducer 6 [SIMPLE_EDGE] - SHUFFLE [RS_47] - Group By Operator [GBY_46] (rows=127554770 width=135) - Output:["_col0","_col1","_col2","_col3"],aggregations:["count(DISTINCT _col3)","sum(_col4)","sum(_col5)"],keys:_col3 - Select Operator [SEL_45] (rows=127554770 width=135) - Output:["_col3","_col4","_col5"] - Filter Operator [FIL_44] (rows=127554770 width=135) - predicate:_col12 is null - Merge Join Operator [MERGEJOIN_85] (rows=255109540 width=135) - Conds:RS_40._col3=RS_41._col0(Left Outer),Output:["_col3","_col4","_col5","_col12"] - <-Map 14 [SIMPLE_EDGE] - SHUFFLE [RS_41] - PartitionCols:_col0 - Select Operator [SEL_25] (rows=14398467 width=92) - Output:["_col0"] - Filter Operator [FIL_79] (rows=14398467 width=92) - predicate:wr_order_number is not null - TableScan [TS_23] (rows=14398467 width=92) - default@web_returns,wr1,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] - <-Reducer 5 [SIMPLE_EDGE] - SHUFFLE [RS_40] - PartitionCols:_col3 - Merge Join Operator [MERGEJOIN_84] (rows=231917759 width=135) - Conds:RS_37._col2=RS_38._col0(Inner),Output:["_col3","_col4","_col5"] - <-Map 13 [SIMPLE_EDGE] - SHUFFLE [RS_38] - PartitionCols:_col0 - Select Operator [SEL_22] (rows=42 width=1850) - Output:["_col0"] - Filter Operator [FIL_78] (rows=42 width=1850) - predicate:((web_company_name = 'pri') and web_site_sk is not null) - TableScan [TS_20] (rows=84 width=1850) - default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] - <-Reducer 4 [SIMPLE_EDGE] - SHUFFLE [RS_37] - PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_83] (rows=210834322 width=135) - Conds:RS_34._col1=RS_35._col0(Inner),Output:["_col2","_col3","_col4","_col5"] - <-Map 12 [SIMPLE_EDGE] - SHUFFLE [RS_35] - PartitionCols:_col0 - Select Operator [SEL_19] (rows=20000000 width=1014) - Output:["_col0"] - Filter Operator [FIL_77] (rows=20000000 width=1014) - predicate:((ca_state = 'TX') and ca_address_sk is not null) - TableScan [TS_17] (rows=40000000 width=1014) - default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] - <-Reducer 3 [SIMPLE_EDGE] - SHUFFLE [RS_34] - PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_82] (rows=191667562 width=135) - Conds:RS_31._col0=RS_32._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_32] - PartitionCols:_col0 - Select Operator [SEL_16] (rows=8116 width=1119) - 
Output:["_col0"] - Filter Operator [FIL_76] (rows=8116 width=1119) - predicate:(d_date BETWEEN '1999-05-01' AND '1999-07-01' and d_date_sk is not null) - TableScan [TS_14] (rows=73049 width=1119) - default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_31] - PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_81] (rows=174243235 width=135) - Conds:RS_28._col3=RS_29._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_28] - PartitionCols:_col3 - Select Operator [SEL_2] (rows=144002668 width=135) - Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - Filter Operator [FIL_73] (rows=144002668 width=135) - predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) - TableScan [TS_0] (rows=144002668 width=135) - default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] - <-Reducer 9 [SIMPLE_EDGE] - SHUFFLE [RS_29] - PartitionCols:_col0 - Group By Operator [GBY_27] (rows=158402938 width=135) - Output:["_col0"],keys:_col0 - Select Operator [SEL_13] (rows=158402938 width=135) + Group By Operator [GBY_85] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] + <-Reducer 7 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_84] + Group By Operator [GBY_83] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(_col0)","sum(_col1)","sum(_col2)"] + Group By Operator [GBY_82] (rows=127554770 width=135) + Output:["_col0","_col1","_col2"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0 + <-Reducer 6 [SIMPLE_EDGE] + SHUFFLE [RS_81] + PartitionCols:_col0 + Group By Operator [GBY_80] (rows=127554770 width=135) + Output:["_col0","_col2","_col3"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col3 + Select Operator [SEL_45] (rows=127554770 width=135) + Output:["_col3","_col4","_col5"] + Filter Operator [FIL_44] (rows=127554770 width=135) + predicate:_col12 is null + Merge Join Operator [MERGEJOIN_91] (rows=255109540 width=135) + Conds:RS_40._col3=RS_41._col0(Left Outer),Output:["_col3","_col4","_col5","_col12"] + <-Map 15 [SIMPLE_EDGE] + SHUFFLE [RS_41] + PartitionCols:_col0 + Select Operator [SEL_25] (rows=14398467 width=92) + Output:["_col0"] + Filter Operator [FIL_79] (rows=14398467 width=92) + predicate:wr_order_number is not null + TableScan [TS_23] (rows=14398467 width=92) + default@web_returns,wr1,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] + <-Reducer 5 [SIMPLE_EDGE] + SHUFFLE [RS_40] + PartitionCols:_col3 + Merge Join Operator [MERGEJOIN_90] (rows=231917759 width=135) + Conds:RS_37._col2=RS_38._col0(Inner),Output:["_col3","_col4","_col5"] + <-Map 14 [SIMPLE_EDGE] + SHUFFLE [RS_38] + PartitionCols:_col0 + Select Operator [SEL_22] (rows=42 width=1850) + Output:["_col0"] + Filter Operator [FIL_78] (rows=42 width=1850) + predicate:((web_company_name = 'pri') and web_site_sk is not null) + TableScan [TS_20] (rows=84 width=1850) + default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] + <-Reducer 4 [SIMPLE_EDGE] + SHUFFLE [RS_37] + PartitionCols:_col2 + Merge Join Operator [MERGEJOIN_89] (rows=210834322 width=135) + Conds:RS_34._col1=RS_35._col0(Inner),Output:["_col2","_col3","_col4","_col5"] + <-Map 13 [SIMPLE_EDGE] + SHUFFLE [RS_35] + PartitionCols:_col0 + Select 
Operator [SEL_19] (rows=20000000 width=1014) + Output:["_col0"] + Filter Operator [FIL_77] (rows=20000000 width=1014) + predicate:((ca_state = 'TX') and ca_address_sk is not null) + TableScan [TS_17] (rows=40000000 width=1014) + default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_34] + PartitionCols:_col1 + Merge Join Operator [MERGEJOIN_88] (rows=191667562 width=135) + Conds:RS_31._col0=RS_32._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_32] + PartitionCols:_col0 + Select Operator [SEL_16] (rows=8116 width=1119) Output:["_col0"] - Filter Operator [FIL_12] (rows=158402938 width=135) - predicate:(_col0 <> _col2) - Merge Join Operator [MERGEJOIN_80] (rows=158402938 width=135) - Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] - <-Map 10 [SIMPLE_EDGE] - SHUFFLE [RS_10] - PartitionCols:_col1 - Select Operator [SEL_8] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_75] (rows=144002668 width=135) - predicate:ws_order_number is not null - TableScan [TS_6] (rows=144002668 width=135) - default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] - <-Map 8 [SIMPLE_EDGE] - SHUFFLE [RS_9] - PartitionCols:_col1 - Select Operator [SEL_5] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_74] (rows=144002668 width=135) - predicate:ws_order_number is not null - TableScan [TS_3] (rows=144002668 width=135) - default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + Filter Operator [FIL_76] (rows=8116 width=1119) + predicate:(d_date BETWEEN '1999-05-01' AND '1999-07-01' and d_date_sk is not null) + TableScan [TS_14] (rows=73049 width=1119) + default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_31] + PartitionCols:_col0 + Merge Join Operator [MERGEJOIN_87] (rows=174243235 width=135) + Conds:RS_28._col3=RS_29._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_28] + PartitionCols:_col3 + Select Operator [SEL_2] (rows=144002668 width=135) + Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + Filter Operator [FIL_73] (rows=144002668 width=135) + predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) + TableScan [TS_0] (rows=144002668 width=135) + default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] + <-Reducer 10 [SIMPLE_EDGE] + SHUFFLE [RS_29] + PartitionCols:_col0 + Group By Operator [GBY_27] (rows=158402938 width=135) + Output:["_col0"],keys:_col0 + Select Operator [SEL_13] (rows=158402938 width=135) + Output:["_col0"] + Filter Operator [FIL_12] (rows=158402938 width=135) + predicate:(_col0 <> _col2) + Merge Join Operator [MERGEJOIN_86] (rows=158402938 width=135) + Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 11 [SIMPLE_EDGE] + SHUFFLE [RS_10] + PartitionCols:_col1 + Select Operator [SEL_8] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_75] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_6] (rows=144002668 width=135) + default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + <-Map 9 [SIMPLE_EDGE] + SHUFFLE 
[RS_9] + PartitionCols:_col1 + Select Operator [SEL_5] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_74] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_3] (rows=144002668 width=135) + default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] diff --git a/ql/src/test/results/clientpositive/perf/query95.q.out b/ql/src/test/results/clientpositive/perf/query95.q.out index fa94d0842b..469b6df7d8 100644 --- a/ql/src/test/results/clientpositive/perf/query95.q.out +++ b/ql/src/test/results/clientpositive/perf/query95.q.out @@ -5,150 +5,158 @@ POSTHOOK: type: QUERY Plan optimized by CBO. Vertex dependency in root stage -Reducer 10 <- Map 12 (SIMPLE_EDGE), Reducer 9 (SIMPLE_EDGE) -Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 10 (SIMPLE_EDGE), Reducer 8 (SIMPLE_EDGE) -Reducer 3 <- Map 13 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) -Reducer 4 <- Map 14 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) -Reducer 5 <- Map 15 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) +Reducer 10 <- Map 12 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) +Reducer 11 <- Map 13 (SIMPLE_EDGE), Reducer 10 (SIMPLE_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 11 (SIMPLE_EDGE), Reducer 9 (SIMPLE_EDGE) +Reducer 3 <- Map 14 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) +Reducer 4 <- Map 15 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE) +Reducer 5 <- Map 16 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE) Reducer 6 <- Reducer 5 (SIMPLE_EDGE) -Reducer 8 <- Map 11 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE) -Reducer 9 <- Map 11 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE) +Reducer 7 <- Reducer 6 (CUSTOM_SIMPLE_EDGE) +Reducer 9 <- Map 12 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:-1 Stage-1 - Reducer 6 + Reducer 7 File Output Operator [FS_63] - Group By Operator [GBY_61] (rows=1 width=344) - Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col0:0._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] - <-Reducer 5 [SIMPLE_EDGE] - SHUFFLE [RS_60] - Group By Operator [GBY_59] (rows=510219083 width=135) - Output:["_col0","_col1","_col2","_col3"],aggregations:["count(DISTINCT _col3)","sum(_col4)","sum(_col5)"],keys:_col3 - Merge Join Operator [MERGEJOIN_122] (rows=510219083 width=135) - Conds:RS_55._col2=RS_56._col0(Inner),Output:["_col3","_col4","_col5"] - <-Map 15 [SIMPLE_EDGE] - SHUFFLE [RS_56] + Group By Operator [GBY_121] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"] + <-Reducer 6 [CUSTOM_SIMPLE_EDGE] + PARTITION_ONLY_SHUFFLE [RS_120] + Group By Operator [GBY_119] (rows=1 width=344) + Output:["_col0","_col1","_col2"],aggregations:["count(_col0)","sum(_col1)","sum(_col2)"] + Group By Operator [GBY_118] (rows=510219083 width=135) + Output:["_col0","_col1","_col2"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0 + <-Reducer 5 [SIMPLE_EDGE] + SHUFFLE [RS_117] PartitionCols:_col0 - Select Operator [SEL_40] (rows=42 width=1850) - Output:["_col0"] - Filter Operator [FIL_115] (rows=42 width=1850) - predicate:((web_company_name = 'pri') and web_site_sk is not null) - TableScan [TS_38] (rows=84 width=1850) - default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] - <-Reducer 4 [SIMPLE_EDGE] - SHUFFLE [RS_55] - PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_121] (rows=463835520 width=135) - Conds:RS_52._col1=RS_53._col0(Inner),Output:["_col2","_col3","_col4","_col5"] - <-Map 14 [SIMPLE_EDGE] - SHUFFLE [RS_53] - PartitionCols:_col0 - Select Operator [SEL_37] 
(rows=20000000 width=1014) - Output:["_col0"] - Filter Operator [FIL_114] (rows=20000000 width=1014) - predicate:((ca_state = 'GA') and ca_address_sk is not null) - TableScan [TS_35] (rows=40000000 width=1014) - default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] - <-Reducer 3 [SIMPLE_EDGE] - SHUFFLE [RS_52] - PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_120] (rows=421668646 width=135) - Conds:RS_49._col0=RS_50._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] - <-Map 13 [SIMPLE_EDGE] - SHUFFLE [RS_50] - PartitionCols:_col0 - Select Operator [SEL_34] (rows=8116 width=1119) - Output:["_col0"] - Filter Operator [FIL_113] (rows=8116 width=1119) - predicate:(d_date BETWEEN '2002-05-01' AND '2002-06-30' and d_date_sk is not null) - TableScan [TS_32] (rows=73049 width=1119) - default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_49] - PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_119] (rows=383335125 width=135) - Conds:RS_45._col3=RS_46._col0(Left Semi),RS_45._col3=RS_47._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_45] - PartitionCols:_col3 - Select Operator [SEL_2] (rows=144002668 width=135) - Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - Filter Operator [FIL_107] (rows=144002668 width=135) - predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) - TableScan [TS_0] (rows=144002668 width=135) - default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] - <-Reducer 10 [SIMPLE_EDGE] - SHUFFLE [RS_47] - PartitionCols:_col0 - Group By Operator [GBY_44] (rows=174243235 width=135) - Output:["_col0"],keys:_col0 - Select Operator [SEL_31] (rows=174243235 width=135) + Group By Operator [GBY_116] (rows=510219083 width=135) + Output:["_col0","_col2","_col3"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col3 + Merge Join Operator [MERGEJOIN_128] (rows=510219083 width=135) + Conds:RS_55._col2=RS_56._col0(Inner),Output:["_col3","_col4","_col5"] + <-Map 16 [SIMPLE_EDGE] + SHUFFLE [RS_56] + PartitionCols:_col0 + Select Operator [SEL_40] (rows=42 width=1850) + Output:["_col0"] + Filter Operator [FIL_115] (rows=42 width=1850) + predicate:((web_company_name = 'pri') and web_site_sk is not null) + TableScan [TS_38] (rows=84 width=1850) + default@web_site,s,Tbl:COMPLETE,Col:NONE,Output:["web_site_sk","web_company_name"] + <-Reducer 4 [SIMPLE_EDGE] + SHUFFLE [RS_55] + PartitionCols:_col2 + Merge Join Operator [MERGEJOIN_127] (rows=463835520 width=135) + Conds:RS_52._col1=RS_53._col0(Inner),Output:["_col2","_col3","_col4","_col5"] + <-Map 15 [SIMPLE_EDGE] + SHUFFLE [RS_53] + PartitionCols:_col0 + Select Operator [SEL_37] (rows=20000000 width=1014) + Output:["_col0"] + Filter Operator [FIL_114] (rows=20000000 width=1014) + predicate:((ca_state = 'GA') and ca_address_sk is not null) + TableScan [TS_35] (rows=40000000 width=1014) + default@customer_address,ca,Tbl:COMPLETE,Col:NONE,Output:["ca_address_sk","ca_state"] + <-Reducer 3 [SIMPLE_EDGE] + SHUFFLE [RS_52] + PartitionCols:_col1 + Merge Join Operator [MERGEJOIN_126] (rows=421668646 width=135) + Conds:RS_49._col0=RS_50._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5"] + <-Map 14 [SIMPLE_EDGE] + SHUFFLE [RS_50] + PartitionCols:_col0 + Select Operator [SEL_34] (rows=8116 width=1119) 
Output:["_col0"] - Merge Join Operator [MERGEJOIN_118] (rows=174243235 width=135) - Conds:RS_28._col0=RS_29._col0(Inner),Output:["_col1"] - <-Map 12 [SIMPLE_EDGE] - SHUFFLE [RS_29] - PartitionCols:_col0 - Select Operator [SEL_27] (rows=14398467 width=92) + Filter Operator [FIL_113] (rows=8116 width=1119) + predicate:(d_date BETWEEN '2002-05-01' AND '2002-06-30' and d_date_sk is not null) + TableScan [TS_32] (rows=73049 width=1119) + default@date_dim,d,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_date"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_49] + PartitionCols:_col0 + Merge Join Operator [MERGEJOIN_125] (rows=383335125 width=135) + Conds:RS_45._col3=RS_46._col0(Left Semi),RS_45._col3=RS_47._col0(Left Semi),Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_45] + PartitionCols:_col3 + Select Operator [SEL_2] (rows=144002668 width=135) + Output:["_col0","_col1","_col2","_col3","_col4","_col5"] + Filter Operator [FIL_107] (rows=144002668 width=135) + predicate:(ws_ship_addr_sk is not null and ws_web_site_sk is not null and ws_ship_date_sk is not null and ws_order_number is not null) + TableScan [TS_0] (rows=144002668 width=135) + default@web_sales,ws1,Tbl:COMPLETE,Col:NONE,Output:["ws_ship_date_sk","ws_ship_addr_sk","ws_web_site_sk","ws_order_number","ws_ext_ship_cost","ws_net_profit"] + <-Reducer 11 [SIMPLE_EDGE] + SHUFFLE [RS_47] + PartitionCols:_col0 + Group By Operator [GBY_44] (rows=174243235 width=135) + Output:["_col0"],keys:_col0 + Select Operator [SEL_31] (rows=174243235 width=135) Output:["_col0"] - Filter Operator [FIL_112] (rows=14398467 width=92) - predicate:wr_order_number is not null - TableScan [TS_25] (rows=14398467 width=92) - default@web_returns,wr,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] - <-Reducer 9 [SIMPLE_EDGE] - SHUFFLE [RS_28] - PartitionCols:_col0 - Select Operator [SEL_24] (rows=158402938 width=135) + Merge Join Operator [MERGEJOIN_124] (rows=174243235 width=135) + Conds:RS_28._col0=RS_29._col0(Inner),Output:["_col1"] + <-Map 13 [SIMPLE_EDGE] + SHUFFLE [RS_29] + PartitionCols:_col0 + Select Operator [SEL_27] (rows=14398467 width=92) + Output:["_col0"] + Filter Operator [FIL_112] (rows=14398467 width=92) + predicate:wr_order_number is not null + TableScan [TS_25] (rows=14398467 width=92) + default@web_returns,wr,Tbl:COMPLETE,Col:NONE,Output:["wr_order_number"] + <-Reducer 10 [SIMPLE_EDGE] + SHUFFLE [RS_28] + PartitionCols:_col0 + Select Operator [SEL_24] (rows=158402938 width=135) + Output:["_col0"] + Filter Operator [FIL_23] (rows=158402938 width=135) + predicate:(_col0 <> _col2) + Merge Join Operator [MERGEJOIN_123] (rows=158402938 width=135) + Conds:RS_20._col1=RS_21._col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_21] + PartitionCols:_col1 + Select Operator [SEL_19] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_111] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_6] (rows=144002668 width=135) + default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + <-Map 8 [SIMPLE_EDGE] + SHUFFLE [RS_20] + PartitionCols:_col1 + Select Operator [SEL_16] (rows=144002668 width=135) + Output:["_col0","_col1"] + Filter Operator [FIL_110] (rows=144002668 width=135) + predicate:ws_order_number is not null + TableScan [TS_3] (rows=144002668 width=135) + default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] + <-Reducer 9 [SIMPLE_EDGE] + SHUFFLE [RS_46] + 
PartitionCols:_col0 + Group By Operator [GBY_42] (rows=158402938 width=135) + Output:["_col0"],keys:_col0 + Select Operator [SEL_13] (rows=158402938 width=135) Output:["_col0"] - Filter Operator [FIL_23] (rows=158402938 width=135) + Filter Operator [FIL_12] (rows=158402938 width=135) predicate:(_col0 <> _col2) - Merge Join Operator [MERGEJOIN_117] (rows=158402938 width=135) - Conds:RS_20._col1=RS_21._col1(Inner),Output:["_col0","_col1","_col2"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_21] + Merge Join Operator [MERGEJOIN_122] (rows=158402938 width=135) + Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] + <-Map 12 [SIMPLE_EDGE] + SHUFFLE [RS_10] PartitionCols:_col1 - Select Operator [SEL_19] (rows=144002668 width=135) + Select Operator [SEL_8] (rows=144002668 width=135) Output:["_col0","_col1"] - Filter Operator [FIL_111] (rows=144002668 width=135) + Filter Operator [FIL_109] (rows=144002668 width=135) predicate:ws_order_number is not null - TableScan [TS_6] (rows=144002668 width=135) - default@web_sales,ws3,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] - <-Map 7 [SIMPLE_EDGE] - SHUFFLE [RS_20] + Please refer to the previous TableScan [TS_6] + <-Map 8 [SIMPLE_EDGE] + SHUFFLE [RS_9] PartitionCols:_col1 - Select Operator [SEL_16] (rows=144002668 width=135) + Select Operator [SEL_5] (rows=144002668 width=135) Output:["_col0","_col1"] - Filter Operator [FIL_110] (rows=144002668 width=135) + Filter Operator [FIL_108] (rows=144002668 width=135) predicate:ws_order_number is not null - TableScan [TS_3] (rows=144002668 width=135) - default@web_sales,ws2,Tbl:COMPLETE,Col:NONE,Output:["ws_warehouse_sk","ws_order_number"] - <-Reducer 8 [SIMPLE_EDGE] - SHUFFLE [RS_46] - PartitionCols:_col0 - Group By Operator [GBY_42] (rows=158402938 width=135) - Output:["_col0"],keys:_col0 - Select Operator [SEL_13] (rows=158402938 width=135) - Output:["_col0"] - Filter Operator [FIL_12] (rows=158402938 width=135) - predicate:(_col0 <> _col2) - Merge Join Operator [MERGEJOIN_116] (rows=158402938 width=135) - Conds:RS_9._col1=RS_10._col1(Inner),Output:["_col0","_col1","_col2"] - <-Map 11 [SIMPLE_EDGE] - SHUFFLE [RS_10] - PartitionCols:_col1 - Select Operator [SEL_8] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_109] (rows=144002668 width=135) - predicate:ws_order_number is not null - Please refer to the previous TableScan [TS_6] - <-Map 7 [SIMPLE_EDGE] - SHUFFLE [RS_9] - PartitionCols:_col1 - Select Operator [SEL_5] (rows=144002668 width=135) - Output:["_col0","_col1"] - Filter Operator [FIL_108] (rows=144002668 width=135) - predicate:ws_order_number is not null - Please refer to the previous TableScan [TS_3] + Please refer to the previous TableScan [TS_3] diff --git a/ql/src/test/results/clientpositive/udf_count.q.out b/ql/src/test/results/clientpositive/udf_count.q.out index f60ad0485e..c3903b29e8 100644 --- a/ql/src/test/results/clientpositive/udf_count.q.out +++ b/ql/src/test/results/clientpositive/udf_count.q.out @@ -43,7 +43,8 @@ POSTHOOK: query: EXPLAIN SELECT count(DISTINCT key) FROM src POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -57,24 +58,50 @@ STAGE PLANS: outputColumnNames: key Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT key) keys: key (type: string) mode: hash - outputColumnNames: _col0, 
_col1 + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string) sort order: + + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
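
The udf_count.q.out hunk above is the smallest self-contained illustration of the rewrite: the single Group By Operator that evaluated count(DISTINCT KEY._col0:0._col0) in one reducer is split so that Stage-1's reducers first deduplicate key (keys: KEY._col0, mode: partial2, parallel across reducers) and then pre-count the deduplicated keys (count(_col0)), while the new Stage-2 merges those partial counts (count(VALUE._col0), mode: mergepartial) in a final single-reducer stage that now sees one partial row per reducer instead of every input row. As a rough sketch (not part of the patch), the rewritten plan computes the same result as the hand-written two-stage query below, assuming the standard src(key string, value string) table from the Hive test suite:

    -- Plan shape with the rewrite enabled (hive.optimize.countdistinct=true,
    -- the patch's default); setting it to false restores the old
    -- single-stage count(DISTINCT ...) plan.
    EXPLAIN SELECT count(DISTINCT key) FROM src;

    -- Hand-written equivalent of the two-stage plan: the inner query
    -- deduplicates key (GROUP BY key, parallelizable across reducers),
    -- the outer query counts the surviving rows (still a single reducer,
    -- but over at most one row per distinct key).
    SELECT count(key)
    FROM (SELECT key FROM src GROUP BY key) t;

Using count(key) rather than count(1) in the outer query keeps NULL semantics aligned: GROUP BY key emits a group for a NULL key, which count(DISTINCT key) would not have counted.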