diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
index 774fc59..c527e58 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
@@ -918,8 +918,11 @@ public static ExprNodeDesc getExprNode(Integer inputRefIndx, RelNode inputRel,
       // The following check is only a guard against failures.
       // TODO: Knowing which expr is constant in GBY's aggregation function
       // arguments could be better done using Metadata provider of Calcite.
-      if (exprs != null && index < exprs.size() && exprs.get(index) instanceof RexLiteral) {
-        ExprNodeDesc exprNodeDesc = exprConv.visitLiteral((RexLiteral) exprs.get(index));
+      // Check whether the corresponding expression in exprs is a literal.
+      if (exprs != null && index < exprs.size() && exprs.get(inputRefs.get(index)) instanceof RexLiteral) {
+        // rexInputRefs holds the ref exprs that correspond to the values in inputRefs,
+        // so inputRefs is used to map index to the matching position in exprs.
+        ExprNodeDesc exprNodeDesc = exprConv.visitLiteral((RexLiteral) exprs.get(inputRefs.get(index)));
         exprNodes.add(exprNodeDesc);
       } else {
         RexNode iRef = rexInputRefs.get(index);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveGBOpConvUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveGBOpConvUtil.java
index 25fe059..0f6c5b5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveGBOpConvUtil.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveGBOpConvUtil.java
@@ -724,7 +724,10 @@ private static OpAttr genMapSideRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException
     outputKeyColumnNames.add(udafName);
     for (int i = 0; i < gbInfo.distExprNodes.size(); i++) {
       reduceKeys.add(gbInfo.distExprNodes.get(i));
-      outputColName = SemanticAnalyzer.getColumnInternalName(i);
+      // This part of reduceKeys is later used to build column names for non-distinct
+      // aggregates whose parameters are the same as the distinct keys; that code expects
+      // the name to end in _col0, so we always append _col0 instead of _col<i>.
+      outputColName = SemanticAnalyzer.getColumnInternalName(0);
       String field = Utilities.ReduceField.KEY.toString() + "." + udafName + ":" + i + "." + outputColName;
       ColumnInfo colInfo = new ColumnInfo(field, gbInfo.distExprNodes.get(i).getTypeInfo(), null,
@@ -1014,8 +1017,8 @@ private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo,
     // --grpkey--,--distkey--,--values--
     // but distUDAF may be before/after some non-distUDAF,
     // i.e., their positions can be mixed.
-    // so we first process distUDAF and then non-distUDAF.
-    // But we need to remember the sequence of udafs.
+    // So for every UDAF parameter we first check whether it is a group by key; if not,
+    // whether it is a distinct key; if neither, it must be a value.
     List<Integer> distinctPositions = new ArrayList<>();
     Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
     for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
@@ -1025,62 +1028,28 @@ private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo,
       UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
       ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();

       ColumnInfo rsUDAFParamColInfo;
       ExprNodeDesc udafParam;
       ExprNodeDesc constantPropDistinctUDAFParam;
-      if (udafAttr.isDistinctUDAF) {
-        // udafAttr.udafParamsIndxInGBInfoDistExprs is not quite useful
-        // because distinctUDAF can also include group by key as an argument.
-        for (int j = 0; j < udafAttr.argList.size(); j++) {
-          int argPos = udafAttr.argList.get(j);
-          if (argPos < gbInfo.gbKeys.size() + distinctPositions.size()) {
-            // distinctUDAF includes group by key as an argument or reuses distinct keys.
-            rsUDAFParamColInfo = rsColInfoLst.get(argPos);
-          } else {
-            rsUDAFParamColInfo = rsColInfoLst.get(gbInfo.gbKeys.size() + distinctPositions.size());
-            distinctPositions.add(argPos);
-          }
-          String rsDistUDAFParamName = rsUDAFParamColInfo.getInternalName();
-          // TODO: verify if this is needed
-          if (lastReduceKeyColName != null) {
-            rsDistUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName
-                + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
-          }
-
-          udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsDistUDAFParamName,
-              rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol());
-          constantPropDistinctUDAFParam = SemanticAnalyzer
-              .isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(),
-                  reduceValues);
-          if (constantPropDistinctUDAFParam != null) {
-            udafParam = constantPropDistinctUDAFParam;
-          }
-          aggParameters.add(udafParam);
+      for (int j = 0; j < udafAttr.udafParams.size(); j++) {
+        int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
+        rsUDAFParamColInfo = rsColInfoLst.get(argPos);
+        String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
+
+        if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
+          rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName
+              + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
         }
-        indexToParameter.put(i, aggParameters);
-        numDistinctUDFs++;
-      }
-    }
-    for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
-      UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
-      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
-
-      ColumnInfo rsUDAFParamColInfo;
-      ExprNodeDesc udafParam;
-      ExprNodeDesc constantPropDistinctUDAFParam;
-      if (!udafAttr.isDistinctUDAF) {
-        for (int j = 0; j < udafAttr.udafParams.size(); j++) {
-          int argPos = udafAttr.argList.get(j);
-          rsUDAFParamColInfo = rsColInfoLst.get(argPos + getOffSet(distinctPositions, argPos));
-          String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
-          udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName,
-              rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol());
-          constantPropDistinctUDAFParam = SemanticAnalyzer
-              .isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(),
-                  reduceValues);
-          if (constantPropDistinctUDAFParam != null) {
-            udafParam = constantPropDistinctUDAFParam;
-          }
-          aggParameters.add(udafParam);
+        udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName,
+            rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol());
+        constantPropDistinctUDAFParam = SemanticAnalyzer
+            .isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(),
+                reduceValues);
+        if (constantPropDistinctUDAFParam != null) {
+          udafParam = constantPropDistinctUDAFParam;
         }
-        indexToParameter.put(i, aggParameters);
+        aggParameters.add(udafParam);
+      }
+      indexToParameter.put(i, aggParameters);
+      if (udafAttr.isDistinctUDAF) {
+        numDistinctUDFs++;
      }
    }
    for(int index : indexToParameter.keySet()){
@@ -1108,14 +1077,27 @@ private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo,
     return new OpAttr("", new HashSet<Integer>(), rsGB1);
   }

-  private static int getOffSet(List<Integer> distinctPositions, int pos) {
-    int ret = 0;
-    
for (int distPos : distinctPositions) { - if (distPos > pos) { - ret++; + private static int getColInfoPos(ExprNodeDesc aggExpr, GBInfo gbInfo ) { + //first see if it is gbkeys + int gbKeyIndex = ExprNodeDescUtils.indexOf(aggExpr, gbInfo.gbKeys); + if(gbKeyIndex < 0 ) { + //then check if it is distinct key + int distinctKeyIndex = ExprNodeDescUtils.indexOf(aggExpr, gbInfo.distExprNodes); + if(distinctKeyIndex < 0) { + // lastly it should be in deDupedNonDistIrefs + int deDupValIndex = ExprNodeDescUtils.indexOf(aggExpr, gbInfo.deDupedNonDistIrefs); + assert(deDupValIndex >= 0); + return gbInfo.gbKeys.size() + gbInfo.distExprNodes.size() + deDupValIndex; } + else { + //aggExpr is part of distinct key + return gbInfo.gbKeys.size() + distinctKeyIndex; + } + + } + else { + return gbKeyIndex; } - return ret; } @SuppressWarnings("unchecked") diff --git a/ql/src/test/queries/clientpositive/count.q b/ql/src/test/queries/clientpositive/count.q index 41ffaf2..2849d9a 100644 --- a/ql/src/test/queries/clientpositive/count.q +++ b/ql/src/test/queries/clientpositive/count.q @@ -21,6 +21,12 @@ select count(1), count(*), count(a), count(b), count(c), count(d), count(distinc set hive.cbo.returnpath.hiveop=true; +set hive.map.aggr=true; +--first aggregation with literal. gbinfo was generating wrong expression +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; +select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; + +set hive.map.aggr=false; explain select count(distinct b) from abcd group by a; select count(distinct b) from abcd group by a; @@ -33,4 +39,16 @@ select count(distinct b) from abcd group by c; explain select count(b), count(distinct c) from abcd group by d; select count(b), count(distinct c) from abcd group by d; +--non distinct aggregate with same column as group by key +explain select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a; +select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a; + +--non distinct aggregate with same column as distinct aggregate +explain select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a; +select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a; + +--aggregation with literal +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; +select count(1), count(*), count(a), count(b), count(c), 
count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd; + set hive.cbo.returnpath.hiveop=false; diff --git a/ql/src/test/queries/clientpositive/groupby_ppr_multi_distinct.q b/ql/src/test/queries/clientpositive/groupby_ppr_multi_distinct.q index 74bd2fd..357ab95 100644 --- a/ql/src/test/queries/clientpositive/groupby_ppr_multi_distinct.q +++ b/ql/src/test/queries/clientpositive/groupby_ppr_multi_distinct.q @@ -20,3 +20,20 @@ WHERE src.ds = '2008-04-08' GROUP BY substr(src.key,1,1); SELECT dest1.* FROM dest1; + +set hive.cbo.returnpath.hiveop=true; +EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1); + +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1); + +SELECT dest1.* FROM dest1; +set hive.cbo.returnpath.hiveop=false; diff --git a/ql/src/test/results/clientpositive/count.q.out b/ql/src/test/results/clientpositive/count.q.out index c950c5b..641da27 100644 --- a/ql/src/test/results/clientpositive/count.q.out +++ b/ql/src/test/results/clientpositive/count.q.out @@ -264,6 +264,67 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@abcd #### A masked pattern was here #### 7 7 6 6 6 7 3 3 6 7 4 5 6 6 5 6 4 5 5 5 4 +PREHOOK: query: --first aggregation with literal. gbinfo was generating wrong expression +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +POSTHOOK: query: --first aggregation with literal. 
gbinfo was generating wrong expression +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: abcd + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int), c (type: int), d (type: int) + outputColumnNames: $f1, $f2, $f3, $f4 + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1), count(), count($f1), count($f2), count($f3), count($f4), count(DISTINCT $f1), count(DISTINCT $f2), count(DISTINCT $f3), count(DISTINCT $f4), count(DISTINCT $f1, $f2), count(DISTINCT $f2, $f3), count(DISTINCT $f3, $f4), count(DISTINCT $f1, $f4), count(DISTINCT $f1, $f3), count(DISTINCT $f2, $f4), count(DISTINCT $f1, $f2, $f3), count(DISTINCT $f2, $f3, $f4), count(DISTINCT $f1, $f3, $f4), count(DISTINCT $f1, $f2, $f4), count(DISTINCT $f1, $f2, $f3, $f4) + keys: $f1 (type: int), $f2 (type: int), $f3 (type: int), $f4 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23, _col24 + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: int) + sort order: ++++ + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: _col4 (type: bigint), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: bigint), _col8 (type: bigint), _col9 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), count(VALUE._col3), count(VALUE._col4), count(VALUE._col5), count(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0), count(DISTINCT KEY._col0:2._col0), count(DISTINCT KEY._col0:3._col0), count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1), count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1), count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1), count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1), count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1), count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1), count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2), count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2), count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2), count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2), count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3) + mode: mergepartial + outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $f14, $f15, $f16, $f17, $f18, $f19, $f20 + Statistics: Num rows: 1 Data size: 336 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 336 
Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +#### A masked pattern was here #### +7 7 6 6 6 7 3 3 6 7 4 5 6 6 5 6 4 5 5 5 4 PREHOOK: query: explain select count(distinct b) from abcd group by a PREHOOK: type: QUERY POSTHOOK: query: explain select count(distinct b) from abcd group by a @@ -464,30 +525,31 @@ STAGE PLANS: Map Operator Tree: TableScan alias: abcd - Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: c (type: int), d (type: int) - outputColumnNames: c, d - Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE + expressions: b (type: int), c (type: int), d (type: int) + outputColumnNames: b, c, d + Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: d (type: int), c (type: int) sort order: ++ Map-reduce partition columns: d (type: int) - Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: b (type: int) Reduce Operator Tree: Group By Operator - aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) + aggregations: count(VALUE._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) mode: complete outputColumnNames: d, $f1, $f2 - Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: $f1 (type: bigint), $f2 (type: bigint) outputColumnNames: _o__c0, _o__c1 - Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -507,10 +569,184 @@ POSTHOOK: query: select count(b), count(distinct c) from abcd 
group by d POSTHOOK: type: QUERY POSTHOOK: Input: default@abcd #### A masked pattern was here #### -0 0 -1 1 +0 1 +1 0 1 1 1 1 1 1 1 1 1 1 +PREHOOK: query: --non distinct aggregate with same column as group by key +explain select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +PREHOOK: type: QUERY +POSTHOOK: query: --non distinct aggregate with same column as group by key +explain select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: abcd + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int), c (type: int), d (type: int), (d + d) (type: int), (d * 3) (type: int) + outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5 + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: $f0 (type: int), $f1 (type: int), $f2 (type: int) + sort order: +++ + Map-reduce partition columns: $f0 (type: int) + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: $f3 (type: int), $f4 (type: int), $f5 (type: int) + Reduce Operator Tree: + Group By Operator + aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(VALUE._col1), sum(VALUE._col2), sum(KEY._col1:0._col0), sum(KEY._col1:1._col0), sum(KEY._col0), sum(DISTINCT KEY._col1:2._col0), sum(DISTINCT KEY._col1:3._col0) + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10 + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +#### A masked pattern was here #### +POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +#### A masked pattern was here #### +10 2 2 10 20 30 1200 95 30 10 1100 +100 1 1 3 6 9 100 10 100 100 100 +12 1 2 9 18 27 100 155 24 12 100 +NULL 1 1 6 12 18 35 23 NULL NULL 35 +PREHOOK: query: --non distinct aggregate with same column as distinct aggregate +explain select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +PREHOOK: type: QUERY +POSTHOOK: query: --non distinct aggregate with same column as distinct aggregate +explain select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on 
stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: abcd + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int), c (type: int), d (type: int) + outputColumnNames: a, b, c, d + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: a (type: int), b (type: int), c (type: int) + sort order: +++ + Map-reduce partition columns: a (type: int) + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: d (type: int) + Reduce Operator Tree: + Group By Operator + aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(KEY._col1:1._col0) + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: a, $f1, $f2, $f3, $f4 + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +#### A masked pattern was here #### +POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +#### A masked pattern was here #### +10 2 2 10 95 +100 1 1 3 10 +12 1 2 9 155 +NULL 1 1 6 23 +PREHOOK: query: --aggregation with literal +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +POSTHOOK: query: --aggregation with literal +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: abcd + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int), c (type: int), d (type: int) + outputColumnNames: $f1, $f2, $f3, $f4 + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: $f1 (type: int), $f2 (type: int), $f3 (type: int), $f4 (type: int) + sort order: ++++ + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + aggregations: count(1), 
count(), count(KEY._col0:0._col0), count(KEY._col0:1._col0), count(KEY._col0:2._col0), count(KEY._col0:3._col0), count(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0), count(DISTINCT KEY._col0:2._col0), count(DISTINCT KEY._col0:3._col0), count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1), count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1), count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1), count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1), count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1), count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1), count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2), count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2), count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2), count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2), count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3) + mode: complete + outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $f14, $f15, $f16, $f17, $f18, $f19, $f20 + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +#### A masked pattern was here #### +7 7 6 6 6 7 3 3 6 7 4 5 6 6 5 6 4 5 5 5 4 diff --git a/ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out b/ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out index 33d1ed0..6595196 100644 --- a/ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out +++ b/ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out @@ -265,3 +265,258 @@ POSTHOOK: Input: default@dest1 7 6 71470.0 447 6 8 8 81524.0 595 8 9 7 92094.0 577 7 +PREHOOK: query: EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN EXTENDED +FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT 
substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Select Operator + expressions: substr(key, 1, 1) (type: string), substr(value, 5) (type: string), value (type: string) + outputColumnNames: $f0, $f1, $f2 + Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: $f0 (type: string), $f1 (type: string), $f2 (type: string) + null sort order: aaa + sort order: +++ + Map-reduce partition columns: $f0 (type: string) + Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE + tag: -1 + auto parallelism: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: hr=11 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + hr 11 + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.srcpart + numFiles 1 + numRows 500 + partition_columns ds/hr + partition_columns.types string:string + rawDataSize 5312 + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.srcpart + partition_columns ds/hr + partition_columns.types string:string + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcpart + name: default.srcpart +#### A masked pattern was here #### + Partition + base file name: hr=12 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + partition values: + ds 2008-04-08 + hr 12 + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.srcpart + numFiles 1 + numRows 500 + partition_columns ds/hr + partition_columns.types string:string + rawDataSize 5312 + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.srcpart + partition_columns ds/hr + partition_columns.types string:string + serialization.ddl struct srcpart { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcpart + name: default.srcpart + Truncated Path -> Alias: + /srcpart/ds=2008-04-08/hr=11 [src] + /srcpart/ds=2008-04-08/hr=12 [src] + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: count(DISTINCT KEY._col1:0._col0), sum(KEY._col1:0._col0), sum(DISTINCT KEY._col1:1._col0), count(DISTINCT KEY._col1:2._col0) + keys: KEY._col0 (type: string) + mode: complete + outputColumnNames: $f0, $f1, $f2, $f3, $f4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: $f0 (type: string), UDFToInteger($f1) (type: int), concat($f0, $f2) (type: string), UDFToInteger($f3) (type: int), UDFToInteger($f4) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 1 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} + bucket_count -1 + columns key,c1,c2,c3,c4 + columns.comments + columns.types string:int:string:int:int +#### A masked pattern was here #### + name default.dest1 + numFiles 1 + numRows 10 + rawDataSize 184 + serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 194 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + TotalFiles: 1 + GatherStats: true + MultiFileSpray: false + + Stage: Stage-0 + Move Operator + tables: + replace: true +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} + bucket_count -1 + columns key,c1,c2,c3,c4 + columns.comments + columns.types string:int:string:int:int +#### A masked pattern was here #### + name default.dest1 + numFiles 1 + numRows 10 + rawDataSize 184 + serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 194 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1 + + Stage: Stage-2 + Stats-Aggr Operator 
+#### A masked pattern was here #### + +PREHOOK: query: FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Output: default@dest1 +POSTHOOK: query: FROM srcpart src +INSERT OVERWRITE TABLE dest1 +SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value) +WHERE src.ds = '2008-04-08' +GROUP BY substr(src.key,1,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@dest1 +POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c3 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: dest1.c4 EXPRESSION [(srcpart)src.null, ] +POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ] +PREHOOK: query: SELECT dest1.* FROM dest1 +PREHOOK: type: QUERY +PREHOOK: Input: default@dest1 +#### A masked pattern was here #### +POSTHOOK: query: SELECT dest1.* FROM dest1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@dest1 +#### A masked pattern was here #### +0 1 00.0 0 1 +1 71 132828.0 10044 71 +2 69 251142.0 15780 69 +3 62 364008.0 20119 62 +4 74 4105526.0 30965 74 +5 6 5794.0 278 6 +6 5 6796.0 331 5 +7 6 71470.0 447 6 +8 8 81524.0 595 8 +9 7 92094.0 577 7 diff --git a/ql/src/test/results/clientpositive/spark/count.q.out b/ql/src/test/results/clientpositive/spark/count.q.out index b1ad662..06f7235 100644 --- a/ql/src/test/results/clientpositive/spark/count.q.out +++ b/ql/src/test/results/clientpositive/spark/count.q.out @@ -288,6 +288,73 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@abcd #### A masked pattern was here #### 7 7 6 6 6 7 3 3 6 7 4 5 6 6 5 6 4 5 5 5 4 +PREHOOK: query: --first aggregation with literal. gbinfo was generating wrong expression +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +POSTHOOK: query: --first aggregation with literal. 
gbinfo was generating wrong expression +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: abcd + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int), c (type: int), d (type: int) + outputColumnNames: $f1, $f2, $f3, $f4 + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1), count(), count($f1), count($f2), count($f3), count($f4), count(DISTINCT $f1), count(DISTINCT $f2), count(DISTINCT $f3), count(DISTINCT $f4), count(DISTINCT $f1, $f2), count(DISTINCT $f2, $f3), count(DISTINCT $f3, $f4), count(DISTINCT $f1, $f4), count(DISTINCT $f1, $f3), count(DISTINCT $f2, $f4), count(DISTINCT $f1, $f2, $f3), count(DISTINCT $f2, $f3, $f4), count(DISTINCT $f1, $f3, $f4), count(DISTINCT $f1, $f2, $f4), count(DISTINCT $f1, $f2, $f3, $f4) + keys: $f1 (type: int), $f2 (type: int), $f3 (type: int), $f4 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23, _col24 + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: int) + sort order: ++++ + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: _col4 (type: bigint), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: bigint), _col8 (type: bigint), _col9 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), count(VALUE._col3), count(VALUE._col4), count(VALUE._col5), count(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0), count(DISTINCT KEY._col0:2._col0), count(DISTINCT KEY._col0:3._col0), count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1), count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1), count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1), count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1), count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1), count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1), count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2), count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2), count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2), count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2), count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3) + mode: mergepartial + outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $f14, $f15, $f16, $f17, $f18, $f19, $f20 + Statistics: Num rows: 1 Data size: 
336 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 336 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +#### A masked pattern was here #### +7 7 6 6 6 7 3 3 6 7 4 5 6 6 5 6 4 5 5 5 4 PREHOOK: query: explain select count(distinct b) from abcd group by a PREHOOK: type: QUERY POSTHOOK: query: explain select count(distinct b) from abcd group by a @@ -491,10 +558,11 @@ STAGE PLANS: sort order: + Map-reduce partition columns: c (type: int) Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE + value expressions: b (type: int) Reducer 3 Reduce Operator Tree: Group By Operator - aggregations: count(KEY._col0) + aggregations: count(VALUE._col0) keys: KEY._col0 (type: int) mode: complete outputColumnNames: c, $f1 @@ -551,31 +619,32 @@ STAGE PLANS: Map Operator Tree: TableScan alias: abcd - Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: c (type: int), d (type: int) - outputColumnNames: c, d - Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE + expressions: b (type: int), c (type: int), d (type: int) + outputColumnNames: b, c, d + Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: d (type: int), c (type: int) sort order: ++ Map-reduce partition columns: d (type: int) - Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: b (type: int) Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0) + aggregations: count(VALUE._col0), count(DISTINCT KEY._col1:0._col0) keys: KEY._col0 (type: int) mode: complete outputColumnNames: d, $f1, $f2 - Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: $f1 (type: bigint), $f2 (type: bigint) outputColumnNames: _o__c0, _o__c1 - Statistics: Num rows: 4 Data size: 34 Basic stats: 
COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -595,10 +664,202 @@ POSTHOOK: query: select count(b), count(distinct c) from abcd group by d POSTHOOK: type: QUERY POSTHOOK: Input: default@abcd #### A masked pattern was here #### -0 0 -1 1 +0 1 +1 0 1 1 1 1 1 1 1 1 1 1 +PREHOOK: query: --non distinct aggregate with same column as group by key +explain select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +PREHOOK: type: QUERY +POSTHOOK: query: --non distinct aggregate with same column as group by key +explain select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 2) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: abcd + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int), c (type: int), d (type: int), (d + d) (type: int), (d * 3) (type: int) + outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5 + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: $f0 (type: int), $f1 (type: int), $f2 (type: int) + sort order: +++ + Map-reduce partition columns: $f0 (type: int) + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: $f3 (type: int), $f4 (type: int), $f5 (type: int) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(VALUE._col1), sum(VALUE._col2), sum(KEY._col1:0._col0), sum(KEY._col1:1._col0), sum(KEY._col0), sum(DISTINCT KEY._col1:2._col0), sum(DISTINCT KEY._col1:3._col0) + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10 + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +#### A masked pattern was here #### +POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a +POSTHOOK: type: QUERY +POSTHOOK: 
Input: default@abcd +#### A masked pattern was here #### +10 2 2 10 20 30 1200 95 30 10 1100 +100 1 1 3 6 9 100 10 100 100 100 +12 1 2 9 18 27 100 155 24 12 100 +NULL 1 1 6 12 18 35 23 NULL NULL 35 +PREHOOK: query: --non distinct aggregate with same column as distinct aggregate +explain select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +PREHOOK: type: QUERY +POSTHOOK: query: --non distinct aggregate with same column as distinct aggregate +explain select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 2) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: abcd + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: int), b (type: int), c (type: int), d (type: int) + outputColumnNames: a, b, c, d + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: a (type: int), b (type: int), c (type: int) + sort order: +++ + Map-reduce partition columns: a (type: int) + Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE + value expressions: d (type: int) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(KEY._col1:1._col0) + keys: KEY._col0 (type: int) + mode: complete + outputColumnNames: a, $f1, $f2, $f3, $f4 + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +PREHOOK: type: QUERY +PREHOOK: Input: default@abcd +#### A masked pattern was here #### +POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@abcd +#### A masked pattern was here #### +10 2 2 10 95 +100 1 1 3 10 +12 1 2 9 155 +NULL 1 1 6 23 +PREHOOK: query: --aggregation with literal +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd +PREHOOK: type: QUERY +POSTHOOK: query: --aggregation with literal +explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) 
from abcd
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: abcd
+                  Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: a (type: int), b (type: int), c (type: int), d (type: int)
+                    outputColumnNames: $f1, $f2, $f3, $f4
+                    Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: $f1 (type: int), $f2 (type: int), $f3 (type: int), $f4 (type: int)
+                      sort order: ++++
+                      Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(1), count(), count(KEY._col0:0._col0), count(KEY._col0:1._col0), count(KEY._col0:2._col0), count(KEY._col0:3._col0), count(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0), count(DISTINCT KEY._col0:2._col0), count(DISTINCT KEY._col0:3._col0), count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1), count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1), count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1), count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1), count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1), count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1), count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2), count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2), count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2), count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2), count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3)
+                mode: complete
+                outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $f14, $f15, $f16, $f17, $f18, $f19, $f20
+                Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+PREHOOK: type: QUERY
+PREHOOK: Input: default@abcd
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@abcd
+#### A masked pattern was here ####
+7	7	6	6	6	7	3	3	6	7	4	5	6	6	5	6	4	5	5	5	4
diff --git a/ql/src/test/results/clientpositive/spark/groupby_ppr_multi_distinct.q.out b/ql/src/test/results/clientpositive/spark/groupby_ppr_multi_distinct.q.out
index 5251241..7d2f9c3 100644
--- a/ql/src/test/results/clientpositive/spark/groupby_ppr_multi_distinct.q.out
+++ b/ql/src/test/results/clientpositive/spark/groupby_ppr_multi_distinct.q.out
@@ -271,3 +271,264 @@ POSTHOOK: Input: default@dest1
 7	6	71470.0	447	6
 8	8	81524.0	595	8
 9	7	92094.0	577	7
+PREHOOK: query: EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value)
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value)
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 2)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
+                  GatherStats: false
+                  Select Operator
+                    expressions: substr(key, 1, 1) (type: string), substr(value, 5) (type: string), value (type: string)
+                    outputColumnNames: $f0, $f1, $f2
+                    Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: $f0 (type: string), $f1 (type: string), $f2 (type: string)
+                      null sort order: aaa
+                      sort order: +++
+                      Map-reduce partition columns: $f0 (type: string)
+                      Statistics: Num rows: 1000 Data size: 10624 Basic stats: COMPLETE Column stats: NONE
+                      tag: -1
+                      auto parallelism: false
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: hr=11
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  partition values:
+                    ds 2008-04-08
+                    hr 11
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                    bucket_count -1
+                    columns key,value
+                    columns.comments 'default','default'
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.srcpart
+                    numFiles 1
+                    numRows 500
+                    partition_columns ds/hr
+                    partition_columns.types string:string
+                    rawDataSize 5312
+                    serialization.ddl struct srcpart { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count -1
+                      columns key,value
+                      columns.comments 'default','default'
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.srcpart
+                      partition_columns ds/hr
+                      partition_columns.types string:string
+                      serialization.ddl struct srcpart { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.srcpart
+                  name: default.srcpart
+#### A masked pattern was here ####
+                Partition
+                  base file name: hr=12
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  partition values:
+                    ds 2008-04-08
+                    hr 12
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                    bucket_count -1
+                    columns key,value
+                    columns.comments 'default','default'
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.srcpart
+                    numFiles 1
+                    numRows 500
+                    partition_columns ds/hr
+                    partition_columns.types string:string
+                    rawDataSize 5312
+                    serialization.ddl struct srcpart { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucket_count -1
+                      columns key,value
+                      columns.comments 'default','default'
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.srcpart
+                      partition_columns ds/hr
+                      partition_columns.types string:string
+                      serialization.ddl struct srcpart { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.srcpart
+                  name: default.srcpart
+            Truncated Path -> Alias:
+              /srcpart/ds=2008-04-08/hr=11 [src]
+              /srcpart/ds=2008-04-08/hr=12 [src]
+        Reducer 2 
+            Needs Tagging: false
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), sum(KEY._col1:0._col0), sum(DISTINCT KEY._col1:1._col0), count(DISTINCT KEY._col1:2._col0)
+                keys: KEY._col0 (type: string)
+                mode: complete
+                outputColumnNames: $f0, $f1, $f2, $f3, $f4
+                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: $f0 (type: string), UDFToInteger($f1) (type: int), concat($f0, $f2) (type: string), UDFToInteger($f3) (type: int), UDFToInteger($f4) (type: int)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    GlobalTableId: 1
+#### A masked pattern was here ####
+                    NumFilesPerFileSink: 1
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                        properties:
+                          COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                          bucket_count -1
+                          columns key,c1,c2,c3,c4
+                          columns.comments 
+                          columns.types string:int:string:int:int
+#### A masked pattern was here ####
+                          name default.dest1
+                          numFiles 2
+                          numRows 10
+                          rawDataSize 184
+                          serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4}
+                          serialization.format 1
+                          serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                          totalSize 194
+#### A masked pattern was here ####
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        name: default.dest1
+                    TotalFiles: 1
+                    GatherStats: true
+                    MultiFileSpray: false
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                bucket_count -1
+                columns key,c1,c2,c3,c4
+                columns.comments 
+                columns.types string:int:string:int:int
+#### A masked pattern was here ####
+                name default.dest1
+                numFiles 2
+                numRows 10
+                rawDataSize 184
+                serialization.ddl struct dest1 { string key, i32 c1, string c2, i32 c3, i32 c4}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 194
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.dest1
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+PREHOOK: query: FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value)
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Output: default@dest1
+POSTHOOK: query: FROM srcpart src
+INSERT OVERWRITE TABLE dest1
+SELECT substr(src.key,1,1), count(DISTINCT substr(src.value,5)), concat(substr(src.key,1,1),sum(substr(src.value,5))), sum(DISTINCT substr(src.value, 5)), count(DISTINCT src.value)
+WHERE src.ds = '2008-04-08'
+GROUP BY substr(src.key,1,1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@dest1
+POSTHOOK: Lineage: dest1.c1 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.c2 EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), (srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.c3 EXPRESSION [(srcpart)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: dest1.c4 EXPRESSION [(srcpart)src.null, ]
+POSTHOOK: Lineage: dest1.key EXPRESSION [(srcpart)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT dest1.* FROM dest1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT dest1.* FROM dest1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1
+#### A masked pattern was here ####
+0	1	00.0	0	1
+1	71	132828.0	10044	71
+2	69	251142.0	15780	69
+3	62	364008.0	20119	62
+4	74	4105526.0	30965	74
+5	6	5794.0	278	6
+6	5	6796.0	331	5
+7	6	71470.0	447	6
+8	8	81524.0	595	8
+9	7	92094.0	577	7
diff --git a/ql/src/test/results/clientpositive/tez/count.q.out b/ql/src/test/results/clientpositive/tez/count.q.out
index 9fc2c75..9dc2764 100644
--- a/ql/src/test/results/clientpositive/tez/count.q.out
+++ b/ql/src/test/results/clientpositive/tez/count.q.out
@@ -292,6 +292,74 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@abcd
 #### A masked pattern was here ####
 7	7	6	6	6	7	3	3	6	7	4	5	6	6	5	6	4	5	5	5	4
+PREHOOK: query: --first aggregation with literal. gbinfo was generating wrong expression
+explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+PREHOOK: type: QUERY
+POSTHOOK: query: --first aggregation with literal. gbinfo was generating wrong expression
+explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: abcd
+                  Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: a (type: int), b (type: int), c (type: int), d (type: int)
+                    outputColumnNames: $f1, $f2, $f3, $f4
+                    Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: count(1), count(), count($f1), count($f2), count($f3), count($f4), count(DISTINCT $f1), count(DISTINCT $f2), count(DISTINCT $f3), count(DISTINCT $f4), count(DISTINCT $f1, $f2), count(DISTINCT $f2, $f3), count(DISTINCT $f3, $f4), count(DISTINCT $f1, $f4), count(DISTINCT $f1, $f3), count(DISTINCT $f2, $f4), count(DISTINCT $f1, $f2, $f3), count(DISTINCT $f2, $f3, $f4), count(DISTINCT $f1, $f3, $f4), count(DISTINCT $f1, $f2, $f4), count(DISTINCT $f1, $f2, $f3, $f4)
+                      keys: $f1 (type: int), $f2 (type: int), $f3 (type: int), $f4 (type: int)
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23, _col24
+                      Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: int)
+                        sort order: ++++
+                        Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col4 (type: bigint), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: bigint), _col8 (type: bigint), _col9 (type: bigint)
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), count(VALUE._col3), count(VALUE._col4), count(VALUE._col5), count(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0), count(DISTINCT KEY._col0:2._col0), count(DISTINCT KEY._col0:3._col0), count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1), count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1), count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1), count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1), count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1), count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1), count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2), count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2), count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2), count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2), count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3)
+                mode: mergepartial
+                outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $f14, $f15, $f16, $f17, $f18, $f19, $f20
+                Statistics: Num rows: 1 Data size: 336 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 336 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+PREHOOK: type: QUERY
+PREHOOK: Input: default@abcd
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@abcd
+#### A masked pattern was here ####
+7	7	6	6	6	7	3	3	6	7	4	5	6	6	5	6	4	5	5	5	4
 PREHOOK: query: explain select count(distinct b) from abcd group by a
 PREHOOK: type: QUERY
 POSTHOOK: query: explain select count(distinct b) from abcd group by a
@@ -498,10 +566,11 @@ STAGE PLANS:
                 sort order: +
                 Map-reduce partition columns: c (type: int)
                 Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE
+                value expressions: b (type: int)
         Reducer 3 
             Reduce Operator Tree:
               Group By Operator
-                aggregations: count(KEY._col0)
+                aggregations: count(VALUE._col0)
                 keys: KEY._col0 (type: int)
                 mode: complete
                 outputColumnNames: c, $f1
@@ -559,31 +628,32 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: abcd
-                  Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
-                    expressions: c (type: int), d (type: int)
-                    outputColumnNames: c, d
-                    Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                    expressions: b (type: int), c (type: int), d (type: int)
+                    outputColumnNames: b, c, d
+                    Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: d (type: int), c (type: int)
                       sort order: ++
                       Map-reduce partition columns: d (type: int)
-                      Statistics: Num rows: 9 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 6 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: b (type: int)
         Reducer 2 
             Reduce Operator Tree:
               Group By Operator
-                aggregations: count(KEY._col1:0._col0), count(DISTINCT KEY._col1:0._col0)
+                aggregations: count(VALUE._col0), count(DISTINCT KEY._col1:0._col0)
                 keys: KEY._col0 (type: int)
                 mode: complete
                 outputColumnNames: d, $f1, $f2
-                Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE
                 Select Operator
                   expressions: $f1 (type: bigint), $f2 (type: bigint)
                   outputColumnNames: _o__c0, _o__c1
-                  Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE
                   File Output Operator
                     compressed: false
-                    Statistics: Num rows: 4 Data size: 34 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 3 Data size: 39 Basic stats: COMPLETE Column stats: NONE
                     table:
                         input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                         output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -603,10 +673,205 @@ POSTHOOK: query: select count(b), count(distinct c) from abcd group by d
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@abcd
 #### A masked pattern was here ####
-0	0
-1	1
+0	1
+1	0
 1	1
 1	1
 1	1
 1	1
 1	1
+PREHOOK: query: --non distinct aggregate with same column as group by key
+explain select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a
+PREHOOK: type: QUERY
+POSTHOOK: query: --non distinct aggregate with same column as group by key
+explain select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: abcd
+                  Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: a (type: int), b (type: int), c (type: int), d (type: int), (d + d) (type: int), (d * 3) (type: int)
+                    outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5
+                    Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: $f0 (type: int), $f1 (type: int), $f2 (type: int)
+                      sort order: +++
+                      Map-reduce partition columns: $f0 (type: int)
+                      Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: $f3 (type: int), $f4 (type: int), $f5 (type: int)
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(VALUE._col1), sum(VALUE._col2), sum(KEY._col1:0._col0), sum(KEY._col1:1._col0), sum(KEY._col0), sum(DISTINCT KEY._col1:2._col0), sum(DISTINCT KEY._col1:3._col0)
+                keys: KEY._col0 (type: int)
+                mode: complete
+                outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10
+                Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@abcd
+#### A masked pattern was here ####
+POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(d+d), sum(d*3), sum(b), sum(c), sum(a), sum(distinct a), sum(distinct b) from abcd group by a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@abcd
+#### A masked pattern was here ####
+10	2	2	10	20	30	1200	95	30	10	1100
+100	1	1	3	6	9	100	10	100	100	100
+12	1	2	9	18	27	100	155	24	12	100
+NULL	1	1	6	12	18	35	23	NULL	NULL	35
+PREHOOK: query: --non distinct aggregate with same column as distinct aggregate
+explain select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a
+PREHOOK: type: QUERY
+POSTHOOK: query: --non distinct aggregate with same column as distinct aggregate
+explain select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: abcd
+                  Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: a (type: int), b (type: int), c (type: int), d (type: int)
+                    outputColumnNames: a, b, c, d
+                    Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: a (type: int), b (type: int), c (type: int)
+                      sort order: +++
+                      Map-reduce partition columns: a (type: int)
+                      Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: d (type: int)
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(KEY._col1:1._col0)
+                keys: KEY._col0 (type: int)
+                mode: complete
+                outputColumnNames: a, $f1, $f2, $f3, $f4
+                Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 39 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@abcd
+#### A masked pattern was here ####
+POSTHOOK: query: select a, count(distinct b), count(distinct c), sum(d), sum(c) from abcd group by a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@abcd
+#### A masked pattern was here ####
+10	2	2	10	95
+100	1	1	3	10
+12	1	2	9	155
+NULL	1	1	6	23
+PREHOOK: query: --aggregation with literal
+explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+PREHOOK: type: QUERY
+POSTHOOK: query: --aggregation with literal
+explain select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: abcd
+                  Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: a (type: int), b (type: int), c (type: int), d (type: int)
+                    outputColumnNames: $f1, $f2, $f3, $f4
+                    Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: $f1 (type: int), $f2 (type: int), $f3 (type: int), $f4 (type: int)
+                      sort order: ++++
+                      Statistics: Num rows: 4 Data size: 78 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(1), count(), count(KEY._col0:0._col0), count(KEY._col0:1._col0), count(KEY._col0:2._col0), count(KEY._col0:3._col0), count(DISTINCT KEY._col0:0._col0), count(DISTINCT KEY._col0:1._col0), count(DISTINCT KEY._col0:2._col0), count(DISTINCT KEY._col0:3._col0), count(DISTINCT KEY._col0:4._col0, KEY._col0:4._col1), count(DISTINCT KEY._col0:5._col0, KEY._col0:5._col1), count(DISTINCT KEY._col0:6._col0, KEY._col0:6._col1), count(DISTINCT KEY._col0:7._col0, KEY._col0:7._col1), count(DISTINCT KEY._col0:8._col0, KEY._col0:8._col1), count(DISTINCT KEY._col0:9._col0, KEY._col0:9._col1), count(DISTINCT KEY._col0:10._col0, KEY._col0:10._col1, KEY._col0:10._col2), count(DISTINCT KEY._col0:11._col0, KEY._col0:11._col1, KEY._col0:11._col2), count(DISTINCT KEY._col0:12._col0, KEY._col0:12._col1, KEY._col0:12._col2), count(DISTINCT KEY._col0:13._col0, KEY._col0:13._col1, KEY._col0:13._col2), count(DISTINCT KEY._col0:14._col0, KEY._col0:14._col1, KEY._col0:14._col2, KEY._col0:14._col3)
+                mode: complete
+                outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $f14, $f15, $f16, $f17, $f18, $f19, $f20
+                Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 168 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+PREHOOK: type: QUERY
+PREHOOK: Input: default@abcd
+#### A masked pattern was here ####
+POSTHOOK: query: select count(1), count(*), count(a), count(b), count(c), count(d), count(distinct a), count(distinct b), count(distinct c), count(distinct d), count(distinct a,b), count(distinct b,c), count(distinct c,d), count(distinct a,d), count(distinct a,c), count(distinct b,d), count(distinct a,b,c), count(distinct b,c,d), count(distinct a,c,d), count(distinct a,b,d), count(distinct a,b,c,d) from abcd
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@abcd
+#### A masked pattern was here ####
+7	7	6	6	6	7	3	3	6	7	4	5	6	6	5	6	4	5	5	5	4