diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 7dedd23591..ee0442811f 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1562,6 +1562,8 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
         "Whether to transform OR clauses in Filter operators into IN clauses"),
     HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 31,
         "Minimum number of OR clauses needed to transform into IN clauses"),
+    HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true,
+        "Whether to transform count distinct into two stages"),
     HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true,
         "Extract partition columns from IN clauses"),
     // Constant propagation optimizer
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index e23ef6317f..6dde189496 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -59,7 +59,8 @@ minitez.query.files=explainuser_3.q,\
   tez_union_with_udf.q
 
-minillap.shared.query.files=insert_into1.q,\
+minillap.shared.query.files=count_dist_rewrite.q,\
+  insert_into1.q,\
   insert_into2.q,\
   insert_values_orig_table.,\
   llapdecider.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
index 8b04cd44fa..d92ed4021f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
@@ -1167,8 +1167,13 @@ public boolean supportSkewJoinOptimization() {
     @SuppressWarnings("unchecked")
     T descClone = (T)conf.clone();
     // also clone the colExprMap by default
+    // we need a deep copy of the row schema and the column expression map
+    ArrayList<ColumnInfo> colInfos = new ArrayList<>();
+    colInfos.addAll(getSchema().getSignature());
+    Map<String, ExprNodeDesc> map = new HashMap<>();
+    map.putAll(getColumnExprMap());
     Operator<? extends OperatorDesc> ret = OperatorFactory.getAndMakeChild(
-        cContext, descClone, getSchema(), getColumnExprMap(), parentClones);
+        cContext, descClone, new RowSchema(colInfos), map, parentClones);
     return ret;
   }
 
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
new file mode 100644
index 0000000000..5d4a6aa9a2
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
@@ -0,0 +1,413 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+
+/**
+ * This transformation does the count distinct optimization.
+ */
+public class CountDistinctRewriteProc extends Transform {
+
+  private static final Logger LOG = LoggerFactory.getLogger(CountDistinctRewriteProc.class
+      .getName());
+
+  public CountDistinctRewriteProc() {
+  }
+
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    // process group-by pattern
+    opRules
+        .put(
+            new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%"
+                + ReduceSinkOperator.getOperatorName() + "%" + GroupByOperator.getOperatorName()
+                + "%"), getCountDistinctProc(pctx));
+
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
+    GraphWalker ogw = new DefaultGraphWalker(disp);
+
+    // Create a list of topop nodes
+    List<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(pctx.getTopOps().values());
+    ogw.startWalking(topNodes, null);
+
+    return pctx;
+  }
+
+  private NodeProcessor getDefaultProc() {
+    return new NodeProcessor() {
+      @Override
+      public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+          Object... nodeOutputs) throws SemanticException {
+        return null;
+      }
+    };
+  }
+
+  private NodeProcessor getCountDistinctProc(ParseContext pctx) {
+    return new CountDistinctProcessor(pctx);
+  }
+
+  /**
+   * CountDistinctProcessor.
+ * + */ + public class CountDistinctProcessor implements NodeProcessor { + + protected ParseContext pGraphContext; + + public CountDistinctProcessor(ParseContext pGraphContext) { + this.pGraphContext = pGraphContext; + } + + ExprNodeColumnDesc exprNodeColumnDesc = null; + int indexOfDist = -1; + + // Check if we can process it or not + protected boolean checkCountDistinct(GroupByOperator mGby, GroupByOperator rGby) { + int cntDist = 0; + ArrayList keys = mGby.getConf().getKeys(); + if (!(keys.size() == 1 && rGby.getConf().getKeys().size() == 0 && mGby.getConf() + .getOutputColumnNames().size() == mGby.getConf().getAggregators().size() + 1)) { + return false; + } + for (int pos = 0; pos < mGby.getConf().getAggregators().size(); pos++) { + AggregationDesc aggr = mGby.getConf().getAggregators().get(pos); + if (aggr.getDistinct()) { + if (cntDist != 0) { + // TODO: may be the same count(distinct key), count(distinct key) + // TODO: deal with duplicate count distinct key + return false; + } + indexOfDist = pos; + if (!(aggr.getParameters().size() == 1 + && aggr.getParameters().get(0) instanceof ExprNodeColumnDesc && mGby.getConf() + .getKeys().get(0) instanceof ExprNodeColumnDesc)) { + return false; + } else { + ExprNodeColumnDesc agg = (ExprNodeColumnDesc) aggr.getParameters().get(0); + ExprNodeColumnDesc key = (ExprNodeColumnDesc) mGby.getConf().getKeys().get(0); + if (!agg.isSame(key)) { + return false; + } + } + cntDist++; + } + } + if (cntDist != 1) { + return false; + } + return true; + } + + /* + * We will transform GB-RS-GBY to mGby1-rs1-mGby2-rs2-rGby1 + */ + @SuppressWarnings("unchecked") + protected void processGroupBy(GroupByOperator mGby, ReduceSinkOperator rs, GroupByOperator rGby) + throws SemanticException, CloneNotSupportedException { + // remove count(distinct) in map-side gby + List> parents = mGby.getParentOperators(); + List> children = rGby.getChildOperators(); + mGby.removeParents(); + rs.removeParents(); + rGby.removeParents(); + + GroupByOperator mGby1 = genMapGroupby1(mGby, indexOfDist); + ReduceSinkOperator rs1 = genReducesink1(mGby1, rs, indexOfDist); + GroupByOperator mGby2 = genMapGroupby2(rs1, mGby); + ReduceSinkOperator rs2 = genReducesink2(mGby2, rs); + GroupByOperator rGby1 = genReduceGroupby(rs2, rGby, indexOfDist); + for (Operator parent : parents) { + OperatorFactory.makeChild(parent, mGby1); + } + OperatorFactory.makeChild(mGby1, rs1); + OperatorFactory.makeChild(rs1, mGby2); + OperatorFactory.makeChild(mGby2, rs2); + OperatorFactory.makeChild(rs2, rGby1); + for (Operator child : children) { + child.removeParents(); + OperatorFactory.makeChild(rGby1, child); + } + } + + // mGby1 ---already contains group by key, we need to remove distinct column + private GroupByOperator genMapGroupby1(Operator mGby, int indexOfDist) + throws CloneNotSupportedException { + GroupByOperator mGby1 = (GroupByOperator) mGby.clone(); + String fieldString = mGby1.getConf().getOutputColumnNames().get(indexOfDist + 1); + mGby1.getColumnExprMap().remove(fieldString); + mGby1.getConf().getOutputColumnNames().remove(indexOfDist + 1); + mGby1.getConf().getAggregators().remove(indexOfDist); + mGby1.getConf().setDistinct(false); + mGby1.getSchema().getColumnNames().remove(indexOfDist + 1); + mGby1.getSchema().getSignature().remove(indexOfDist + 1); + return mGby1; + } + + // rs1 --- remove distinctColIndices, set #reducer as -1, reset keys, + // values, colexpmap and rowschema + private ReduceSinkOperator genReducesink1(GroupByOperator mGby1, + Operator rs, int indexOfDist) throws 
CloneNotSupportedException, + SemanticException { + ReduceSinkOperator rs1 = (ReduceSinkOperator) rs.clone(); + Map colExprMap = new HashMap(); + ArrayList outputKeyColumnNames = new ArrayList(); + ArrayList outputValueColumnNames = new ArrayList(); + ArrayList reduceKeys = new ArrayList(); + ArrayList reduceValues = new ArrayList(); + List internalNames = new ArrayList<>(); + for (int index = 0; index < mGby1.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby1.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + // index==0 means this is key + if (index == 0) { + reduceKeys.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index); + outputKeyColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.KEY.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } else { + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index - 1); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } + } + List> distinctColIndices = new ArrayList<>(); + rs1.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 1, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, 1, -1, + AcidUtils.Operation.NOT_ACID)); + rs1.setColumnExprMap(colExprMap); + + rs1.getSchema().getColumnNames().remove(indexOfDist + 1); + rs1.getSchema().getSignature().remove(indexOfDist + 1); + // KEY._col0:0._col0 => KEY._col0 + + for (int i = 0; i < rs1.getSchema().getSignature().size(); i++) { + rs1.getSchema().getSignature().get(i).setInternalName(internalNames.get(i)); + rs1.getSchema().getColumnNames().set(i, internalNames.get(i)); + } + return rs1; + } + + // mGby2 ---already contains key, remove distinct and change all the others + private GroupByOperator genMapGroupby2(ReduceSinkOperator rs1, + Operator mGby) throws CloneNotSupportedException, SemanticException { + GroupByOperator mGby2 = (GroupByOperator) mGby.clone(); + ArrayList rowSchema = new ArrayList<>(); + ArrayList groupByKeys = new ArrayList(); + ArrayList outputColumnNames = new ArrayList(); + Map colExprMap = new HashMap(); + + ColumnInfo exprInfo = rs1.getSchema().getSignature().get(0); + ExprNodeDesc key = new ExprNodeColumnDesc(exprInfo); + groupByKeys.add(key); + String field = SemanticAnalyzer.getColumnInternalName(0); + outputColumnNames.add(field); + ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false); + colExprMap.put(field, key); + rowSchema.add(oColInfo); + + ArrayList aggregations = new ArrayList(); + for (int index = 0; index < mGby2.getConf().getAggregators().size(); index++) { + ArrayList aggParameters = new ArrayList(); + if (index != indexOfDist) { + AggregationDesc desc = mGby2.getConf().getAggregators().get(index); + ColumnInfo paraExprInfo = null; + // for example, original it is max 0, dist 1, min 2 + // rs1's schema is key 0, max 1, min 2 + if (index < indexOfDist) { + paraExprInfo= rs1.getSchema().getSignature().get(index + 1); + } + else { + paraExprInfo= rs1.getSchema().getSignature().get(index); + } + + String paraExpression = 
paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + + // for all the other aggregations, we set the mode to PARTIAL2 + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false); + GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator(); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + aggregations.add(new AggregationDesc(desc.getGenericUDAFName(), + udaf.genericUDAFEvaluator, udaf.convertedParameters, false, amode)); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } else { + // add count(KEY._col0) to replace distinct + ColumnInfo paraExprInfo = rs1.getSchema().getSignature().get(0); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH, false); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator( + "count", aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + aggregations.add(newDesc); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + } + mGby2.getConf().setMode(GroupByDesc.Mode.PARTIAL2); + mGby2.getConf().setOutputColumnNames(outputColumnNames); + mGby2.getConf().getKeys().clear(); + mGby2.getConf().getKeys().addAll(groupByKeys); + mGby2.getConf().getAggregators().clear(); + mGby2.getConf().getAggregators().addAll(aggregations); + mGby2.getConf().setDistinct(false); + mGby2.setSchema(new RowSchema(rowSchema)); + mGby2.setColumnExprMap(colExprMap); + return mGby2; + } + + // #reducer is already 1 + private ReduceSinkOperator genReducesink2(GroupByOperator mGby2, + Operator rs) throws SemanticException, CloneNotSupportedException { + ReduceSinkOperator rs2 = (ReduceSinkOperator) rs.clone(); + Map colExprMap = new HashMap<>(); + + ArrayList outputKeyColumnNames = new ArrayList(); + ArrayList outputValueColumnNames = new ArrayList(); + ArrayList reduceValues = new ArrayList(); + for (int index = 1; index < mGby2.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index - 1); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() + "." 
+ outputColName; + colExprMap.put(internalName, exprDesc); + } + List> distinctColIndices = new ArrayList<>(); + ArrayList reduceKeys = new ArrayList<>(); + rs2.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 0, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, 0, 1, + AcidUtils.Operation.NOT_ACID)); + rs2.setColumnExprMap(colExprMap); + rs2.getSchema().getSignature().remove(0); + return rs2; + } + + // replace the distinct with the count aggregation + private GroupByOperator genReduceGroupby(ReduceSinkOperator rs2, + Operator rGby, int indexOfDist) throws SemanticException, + CloneNotSupportedException { + GroupByOperator rGby1 = (GroupByOperator) rGby.clone(); + ColumnInfo paraExprInfo = rs2.getSchema().getSignature().get(indexOfDist); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ArrayList aggParameters = new ArrayList(); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo + .getTabAlias(), paraExprInfo.getIsVirtualCol())); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator("count", + aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.MERGEPARTIAL, false); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + rGby1.getConf().getAggregators().set(indexOfDist, newDesc); + rGby1.getConf().setDistinct(false); + return rGby1; + } + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator mGby = (GroupByOperator) stack.get(stack.size() - 3); + ReduceSinkOperator rs = (ReduceSinkOperator) stack.get(stack.size() - 2); + GroupByOperator rGby = (GroupByOperator) stack.get(stack.size() - 1); + if (checkCountDistinct(mGby, rGby)) { + LOG.info("trigger distinct rewrite"); + try { + processGroupBy(mGby, rs, rGby); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e.getMessage()); + } + } + return null; + } + + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index 7dace9076f..6af8f1e504 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -72,6 +72,10 @@ public void initialize(HiveConf hiveConf) { // we are translating Calcite operators into Hive operators. transformations.add(new HiveOpConverterPostProc()); + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVECOUNTDISTINCTOPTIMIZER)) { + transformations.add(new CountDistinctRewriteProc()); + } + // Add the transformation that computes the lineage information. 
Set postExecHooks = Sets.newHashSet( Splitter.on(",").trimResults().omitEmptyStrings().split( diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java index 38a9ef2af1..fe91ee7025 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java @@ -367,4 +367,19 @@ public GroupByOperatorExplainVectorization getGroupByVectorization() { } return new GroupByOperatorExplainVectorization(this, vectorDesc); } + + @Override + public Object clone() { + ArrayList outputColumnNames = new ArrayList<>(); + outputColumnNames.addAll(this.outputColumnNames); + ArrayList keys = new ArrayList<>(); + keys.addAll(this.keys); + ArrayList aggregators = new ArrayList<>(); + aggregators.addAll(this.aggregators); + List listGroupingSets = new ArrayList<>(); + listGroupingSets.addAll(this.listGroupingSets); + return new GroupByDesc(this.mode, outputColumnNames, keys, aggregators, + this.groupByMemoryUsage, this.memoryThreshold, listGroupingSets, this.groupingSetsPresent, + this.groupingSetPosition, this.isDistinct); + } } diff --git a/ql/src/test/queries/clientpositive/count_dist_rewrite.q b/ql/src/test/queries/clientpositive/count_dist_rewrite.q new file mode 100644 index 0000000000..47b27fab2d --- /dev/null +++ b/ql/src/test/queries/clientpositive/count_dist_rewrite.q @@ -0,0 +1,35 @@ +explain select max(key), count(distinct key) B1_CNTD from src; + +select max(key), count(distinct key) B1_CNTD from src; + +explain select max(key), count(distinct key), min(key) from src; + +select max(key), count(distinct key), min(key) from src; + +explain select max(key), count(distinct key), min(key), avg(key) from src; + +select max(key), count(distinct key), min(key), avg(key) from src; + +explain select count(1), count(distinct key) from src; + +select count(1), count(distinct key) from src; + +explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +explain select count(1), count(distinct key), STDDEV(key) from src; +select count(1), count(distinct key), STDDEV(key) from src; +select count(distinct key), count(1), STDDEV(key) from src; diff --git a/ql/src/test/results/clientpositive/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out new file mode 100644 index 0000000000..f5a6321f07 --- /dev/null +++ b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out @@ -0,0 +1,531 @@ +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, 
_col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: 
COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column 
stats: NONE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 908 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 908 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(KEY._col0), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: string), _col5 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), STDDEV(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), STDDEV(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col3 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: struct) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), STDDEV(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked 
pattern was here #### +POSTHOOK: query: select count(1), count(distinct key), STDDEV(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142.92680950752384 +PREHOOK: query: select count(distinct key), count(1), STDDEV(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), STDDEV(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142.92680950752384 diff --git a/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out new file mode 100644 index 0000000000..79844be3d3 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/count_dist_rewrite.q.out @@ -0,0 +1,537 @@ +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 55555 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 55555 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 205 Data size: 57195 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 57195 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col2 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here 
#### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 205 Data size: 93275 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 93275 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col3 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 205 Data size: 94915 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 94915 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + 
Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 205 Data size: 145755 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 145755 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(KEY._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 205 Data size: 147395 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 147395 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By 
Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 205 Data size: 19475 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 19475 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 205 Data size: 21115 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 21115 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 205 Data size: 96555 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 96555 Basic stats: 
COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(KEY._col0), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 205 Data size: 98195 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 98195 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: string), _col5 (type: string) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 392 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), STDDEV(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), STDDEV(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 205 Data size: 35875 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 205 Data size: 35875 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col3 (type: struct) + Execution mode: llap + 
LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(KEY._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 205 Data size: 37515 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 205 Data size: 37515 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: struct) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), STDDEV(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key), STDDEV(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142.9268095075238 +PREHOOK: query: select count(distinct key), count(1), STDDEV(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), STDDEV(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142.92680950752379
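
A minimal sketch of how to exercise the rewrite interactively, assuming a populated src table like the one used in the q-files above; the session commands are illustrative and not part of the patch, only the hive.optimize.countdistinct property name comes from the HiveConf change:

set hive.optimize.countdistinct=true;
explain select max(key), count(distinct key) from src;
-- with the flag turned off, the optimizer should fall back to the original single-pass count distinct plan
set hive.optimize.countdistinct=false;
explain select max(key), count(distinct key) from src;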