diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index f30152b..e4910e4 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -51,6 +51,7 @@ minitez.query.files=explainuser_3.q,\
   explainanalyze_5.q,\
   hybridgrace_hashjoin_1.q,\
   hybridgrace_hashjoin_2.q,\
+  multi_count_distinct.q,\
   tez_union_with_udf.q
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
index c243266..7c3b4b0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
@@ -31,6 +31,7 @@
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
 
 import com.google.common.collect.ImmutableList;
@@ -59,6 +60,19 @@ private HiveAggregateProjectMergeRule() {
   }
 
   @Override
+  public boolean matches(RelOptRuleCall call) {
+    final Aggregate aggregate = call.rel(0);
+    // The rule cannot be applied if there is a GroupingID call, because its
+    // value depends on the column positions, which the merge would change.
+    for (AggregateCall aggCall : aggregate.getAggCallList()) {
+      if (aggCall.getAggregation().equals(HiveGroupingID.INSTANCE)) {
+        return false;
+      }
+    }
+    return super.matches(call);
+  }
+
+  @Override
   public void onMatch(RelOptRuleCall call) {
     final HiveAggregate aggregate = call.rel(0);
     final HiveProject project = call.rel(1);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
index 7d4411a..417eda3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
@@ -16,13 +16,17 @@
  */
 package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
 
+import java.math.BigDecimal;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.calcite.plan.RelOptCluster;
 import org.apache.calcite.plan.RelOptRule;
 import org.apache.calcite.plan.RelOptRuleCall;
 import org.apache.calcite.rel.RelNode;
@@ -31,16 +35,30 @@
 import org.apache.calcite.rel.core.RelFactories;
 import org.apache.calcite.rel.metadata.RelColumnOrigin;
 import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.rel.type.RelDataType;
 import org.apache.calcite.rel.type.RelDataTypeField;
+import org.apache.calcite.rex.RexBuilder;
 import org.apache.calcite.rex.RexInputRef;
 import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.sql.type.SqlTypeName;
 import org.apache.calcite.util.ImmutableBitSet;
 import org.apache.calcite.util.Pair;
 import org.apache.calcite.util.Util;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
 import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Function;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
@@ -73,6 +91,8 @@
       HiveRelFactories.HIVE_PROJECT_FACTORY);
 
   private static RelFactories.ProjectFactory projFactory;
+
+  protected static final Logger LOG = LoggerFactory.getLogger(HiveExpandDistinctAggregatesRule.class);
 
   //~ Constructors -----------------------------------------------------------
 
@@ -82,19 +102,25 @@ public HiveExpandDistinctAggregatesRule(
     projFactory = projectFactory;
   }
 
+  RelOptCluster cluster = null;
+  RexBuilder rexBuilder = null;
+
   //~ Methods ----------------------------------------------------------------
 
   @Override
   public void onMatch(RelOptRuleCall call) {
     final Aggregate aggregate = call.rel(0);
-    if (!aggregate.containsDistinctCall()) {
+    int numCountDistinct = getNumCountDistinctCall(aggregate);
+    if (numCountDistinct == 0) {
       return;
     }
 
-    // Find all of the agg expressions. We use a LinkedHashSet to ensure
-    // determinism.
+    // Find all of the agg expressions. We use a List (for all count(distinct))
+    // as well as a Set (for all others) to ensure determinism.
     int nonDistinctCount = 0;
+    List<List<Integer>> argListList = new ArrayList<List<Integer>>();
     Set<List<Integer>> argListSets = new LinkedHashSet<List<Integer>>();
+    Set<Integer> positions = new HashSet<>();
     for (AggregateCall aggCall : aggregate.getAggCallList()) {
       if (!aggCall.isDistinct()) {
         ++nonDistinctCount;
@@ -103,11 +129,35 @@ public void onMatch(RelOptRuleCall call) {
       ArrayList<Integer> argList = new ArrayList<Integer>();
       for (Integer arg : aggCall.getArgList()) {
         argList.add(arg);
+        positions.add(arg);
       }
+      // Aggr checks for sorted argList.
+      argListList.add(argList);
       argListSets.add(argList);
     }
     Util.permAssert(argListSets.size() > 0, "containsDistinctCall lied");
 
+    if (numCountDistinct > 1 && numCountDistinct == aggregate.getAggCallList().size()
+        && aggregate.getGroupSet().isEmpty()) {
+      // positions now contains all the distinct positions, e.g., $5, $4, $6.
+      // We first sort them to form the group by set, and then look up their
+      // positions later, e.g., $4->1, $5->2, $6->3.
+      cluster = aggregate.getCluster();
+      rexBuilder = cluster.getRexBuilder();
+      RelNode converted = null;
+      List<Integer> sourceOfForCountDistinct = new ArrayList<>();
+      sourceOfForCountDistinct.addAll(positions);
+      Collections.sort(sourceOfForCountDistinct);
+      try {
+        converted = convert(aggregate, argListList, sourceOfForCountDistinct);
+      } catch (CalciteSemanticException e) {
+        LOG.debug(e.toString());
+        throw new RuntimeException(e);
+      }
+      call.transformTo(converted);
+      return;
+    }
+
     // If all of the agg expressions are distinct and have the same
     // arguments then we can use a more efficient form.
     if ((nonDistinctCount == 0) && (argListSets.size() == 1)) {
@@ -133,6 +183,156 @@ public void onMatch(RelOptRuleCall call) {
   }
 
   /**
+   * Converts an aggregate relational expression that contains only
+   * count(distinct) calls to grouping sets with count. For example,
+   * select count(distinct department_id), count(distinct gender),
+   * count(distinct education_level) from employee
+   * can be transformed to
+   * select count(case i when 1 then 1 else null end) as c0,
+   * count(case i when 2 then 1 else null end) as c1,
+   * count(case i when 4 then 1 else null end) as c2
+   * from (select grouping__id as i, department_id, gender, education_level
+   * from employee group by department_id, gender, education_level
+   * grouping sets (department_id, gender, education_level)) subq.
+   * @throws CalciteSemanticException
+   */
+  private RelNode convert(Aggregate aggregate, List<List<Integer>> argList,
+      List<Integer> sourceOfForCountDistinct) throws CalciteSemanticException {
+    // We use this map to map the position in argList to the position of its grouping set.
+    Map<Integer, Integer> map = new HashMap<>();
+    List<List<Integer>> cleanArgList = new ArrayList<>();
+    final Aggregate groupingSets = createGroupingSets(aggregate, argList, cleanArgList, map, sourceOfForCountDistinct);
+    return createCount(groupingSets, argList, cleanArgList, map, sourceOfForCountDistinct);
+  }
+
+  private long getGroupingIdValue(List<Integer> list, List<Integer> sourceOfForCountDistinct) {
+    long ind = 0;
+    for (int i : list) {
+      ind |= 1 << sourceOfForCountDistinct.indexOf(i);
+    }
+    return ind;
+  }
+
+  /**
+   * @param aggr the original aggregate
+   * @param argList the original argList in the aggregate
+   * @param cleanArgList the new argList without duplicates
+   * @param map the mapping from the original argList to the new argList
+   * @param sourceOfForCountDistinct the sorted positions of the group set
+   * @return the count aggregate rewritten over the grouping sets
+   * @throws CalciteSemanticException
+   */
+  private RelNode createCount(Aggregate aggr, List<List<Integer>> argList,
+      List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
+      List<Integer> sourceOfForCountDistinct) throws CalciteSemanticException {
+    List<RexNode> originalInputRefs = Lists.transform(aggr.getRowType().getFieldList(),
+        new Function<RelDataTypeField, RexNode>() {
+          @Override
+          public RexNode apply(RelDataTypeField input) {
+            return new RexInputRef(input.getIndex(), input.getType());
+          }
+        });
+    final List<RexNode> gbChildProjLst = Lists.newArrayList();
+    for (List<Integer> list : cleanArgList) {
+      RexNode equal = rexBuilder.makeCall(SqlStdOperatorTable.EQUALS,
+          originalInputRefs.get(originalInputRefs.size() - 1),
+          rexBuilder.makeExactLiteral(new BigDecimal(getGroupingIdValue(list, sourceOfForCountDistinct))));
+      RexNode condition = rexBuilder.makeCall(SqlStdOperatorTable.CASE, equal,
+          rexBuilder.makeExactLiteral(BigDecimal.ONE), rexBuilder.constantNull());
+      gbChildProjLst.add(condition);
+    }
+
+    // create the project before GB
+    RelNode gbInputRel = HiveProject.create(aggr, gbChildProjLst, null);
+
+    // create the aggregate
+    List<AggregateCall> aggregateCalls = Lists.newArrayList();
+    RelDataType aggFnRetType = TypeConverter.convert(TypeInfoFactory.longTypeInfo,
+        cluster.getTypeFactory());
+    for (int i = 0; i < cleanArgList.size(); i++) {
+      AggregateCall aggregateCall = HiveCalciteUtil.createSingleArgAggCall("count", cluster,
+          TypeInfoFactory.longTypeInfo, i, aggFnRetType);
+      aggregateCalls.add(aggregateCall);
+    }
+    Aggregate aggregate = new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), gbInputRel,
+        false, ImmutableBitSet.of(), null, aggregateCalls);
+
+    // create the project after GB.
+    // For those repeated values, e.g., select count(distinct x, y),
+    // count(distinct y, x), we find the correct mapping.
+    if (map.isEmpty()) {
+      return aggregate;
+    } else {
+      List<RexNode> originalAggrRefs = Lists.transform(aggregate.getRowType().getFieldList(),
+          new Function<RelDataTypeField, RexNode>() {
+            @Override
+            public RexNode apply(RelDataTypeField input) {
+              return new RexInputRef(input.getIndex(), input.getType());
+            }
+          });
+      final List<RexNode> projLst = Lists.newArrayList();
+      int index = 0;
+      for (int i = 0; i < argList.size(); i++) {
+        if (map.containsKey(i)) {
+          projLst.add(originalAggrRefs.get(map.get(i)));
+        } else {
+          projLst.add(originalAggrRefs.get(index++));
+        }
+      }
+      return HiveProject.create(aggregate, projLst, null);
+    }
+  }
+
+  /**
+   * @param aggregate the original aggregate
+   * @param argList the original argList in the aggregate
+   * @param cleanArgList the new argList without duplicates
+   * @param map the mapping from the original argList to the new argList
+   * @param sourceOfForCountDistinct the sorted positions of the group set
+   * @return the aggregate over the grouping sets, including the GroupingID call
+   */
+  private Aggregate createGroupingSets(Aggregate aggregate, List<List<Integer>> argList,
+      List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
+      List<Integer> sourceOfForCountDistinct) {
+    final ImmutableBitSet groupSet = ImmutableBitSet.of(sourceOfForCountDistinct);
+    final List<ImmutableBitSet> origGroupSets = new ArrayList<>();
+
+    for (int i = 0; i < argList.size(); i++) {
+      List<Integer> list = argList.get(i);
+      ImmutableBitSet bitSet = ImmutableBitSet.of(list);
+      int prev = origGroupSets.indexOf(bitSet);
+      if (prev == -1) {
+        origGroupSets.add(bitSet);
+        cleanArgList.add(list);
+      } else {
+        map.put(i, prev);
+      }
+    }
+    // Calcite expects the grouping sets sorted and without duplicates
+    Collections.sort(origGroupSets, ImmutableBitSet.COMPARATOR);
+
+    List<AggregateCall> aggregateCalls = new ArrayList<AggregateCall>();
+    // Create GroupingID column
+    AggregateCall aggCall = AggregateCall.create(HiveGroupingID.INSTANCE, false,
+        new ImmutableList.Builder<Integer>().build(), -1, this.cluster.getTypeFactory()
+            .createSqlType(SqlTypeName.INTEGER), HiveGroupingID.INSTANCE.getName());
+    aggregateCalls.add(aggCall);
+    return new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
+        aggregate.getInput(), true, groupSet, origGroupSets, aggregateCalls);
+  }
+
+  /**
+   * Returns the number of count(DISTINCT) aggregate calls.
+   */
+  private int getNumCountDistinctCall(Aggregate hiveAggregate) {
+    int cnt = 0;
+    for (AggregateCall aggCall : hiveAggregate.getAggCallList()) {
+      if (aggCall.isDistinct() && (aggCall.getAggregation().getName().equalsIgnoreCase("count"))) {
+        cnt++;
+      }
+    }
+    return cnt;
+  }
+
+  /**
   /**
    * Converts an aggregate relational expression that contains just one
    * distinct aggregate function (or perhaps several over the same arguments)
    * and no non-distinct aggregate functions.
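For reference, the javadoc above only shows single-column distincts. With the sorted group-by set (department_id, gender, education_level), getGroupingIdValue() assigns department_id bit 0, gender bit 1 and education_level bit 2, so a multi-column set such as {education_level, department_id} maps to grouping__id 1 + 4 = 5. A hand-written sketch of the same rewrite for the four-aggregate case exercised by the new test (not output produced by the patch, but consistent with the grouping__id values in multi_count_distinct.q.out below):

    select count(case i when 1 then 1 else null end) as c0,  -- count(distinct department_id)
           count(case i when 2 then 1 else null end) as c1,  -- count(distinct gender)
           count(case i when 4 then 1 else null end) as c2,  -- count(distinct education_level)
           count(case i when 5 then 1 else null end) as c3   -- count(distinct education_level, department_id)
    from (select grouping__id as i, department_id, gender, education_level
          from employee
          group by department_id, gender, education_level
          grouping sets (department_id, gender, education_level,
                         (education_level, department_id))) subq;
    -- expected over the test data: 4  2  3  7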
diff --git a/ql/src/test/queries/clientpositive/multi_count_distinct.q b/ql/src/test/queries/clientpositive/multi_count_distinct.q
new file mode 100644
index 0000000..855cb64
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/multi_count_distinct.q
@@ -0,0 +1,38 @@
+set hive.mapred.mode=nonstrict;
+
+drop table employee;
+
+create table employee (department_id int, gender varchar(10), education_level int);
+
+insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2);
+
+explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id) from employee;
+
+select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level) from employee;
+
+explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee;
+
+select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee;
+
+select count(case i when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null end) as c1,
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id, gender,
+education_level from employee group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level))subq;
+
+select grouping__id as i, department_id, gender, education_level from employee
+group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level, (education_level, department_id));
+
+
+
+
diff --git a/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out b/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
index 2cd758c..127478e 100644
--- a/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
@@ -241,10 +241,10 @@ STAGE PLANS:
           Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
           Select Operator
             expressions: s_store_id (type: string)
-            outputColumnNames: s_store_id
+            outputColumnNames: _col0
             Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
             Group By Operator
-              keys: s_store_id (type: string), '0' (type: string)
+              keys: _col0 (type: string), '0' (type: string)
               mode: hash
               outputColumnNames: _col0, _col1
               Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE
@@ -329,10 +329,10 @@ STAGE PLANS:
           Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
           Select Operator
             expressions: s_store_id (type: string)
-            outputColumnNames: s_store_id
+            outputColumnNames: _col0
             Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
             Group By Operator
-              keys: s_store_id (type: string), '0' (type: string)
+              keys: _col0 (type: string), '0' (type: string)
               mode: hash
               outputColumnNames: _col0, _col1
               Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE
diff --git a/ql/src/test/results/clientpositive/perf/query70.q.out b/ql/src/test/results/clientpositive/perf/query70.q.out
index 5a82cd1..611af74 100644
--- a/ql/src/test/results/clientpositive/perf/query70.q.out
+++ b/ql/src/test/results/clientpositive/perf/query70.q.out
@@ -21,46 +21,46 @@ Stage-0
     limit:100
     Stage-1
       Reducer 6
-      File Output Operator [FS_62]
-        Limit [LIM_61] (rows=100 width=88)
+      File Output Operator [FS_63]
+        Limit [LIM_62] (rows=100 width=88)
           Number of rows:100
-          Select Operator [SEL_60] (rows=1045432122 width=88)
+          Select Operator [SEL_61] (rows=1045432122 width=88)
             Output:["_col0","_col1","_col2","_col3","_col4"]
          <-Reducer 5 [SIMPLE_EDGE]
-            SHUFFLE [RS_59]
-              Select Operator [SEL_57] (rows=1045432122 width=88)
+            SHUFFLE [RS_60]
+              Select Operator [SEL_58] (rows=1045432122 width=88)
                 Output:["_col0","_col1","_col2","_col3","_col4"]
-                PTF Operator [PTF_56] (rows=1045432122 width=88)
+                PTF Operator [PTF_57] (rows=1045432122 width=88)
                   Function definitions:[{},{"name:":"windowingtablefunction","order by:":"_col4 ASC NULLS FIRST","partition by:":"_col5, CASE WHEN ((_col5 = 2)) THEN (_col0) ELSE (null) END"}]
-                  Select Operator [SEL_55] (rows=1045432122 width=88)
+                  Select Operator [SEL_56] (rows=1045432122 width=88)
                     Output:["_col0","_col1","_col4","_col5"]
                  <-Reducer 4 [SIMPLE_EDGE]
-                    SHUFFLE [RS_54]
+                    SHUFFLE [RS_55]
                       PartitionCols:_col5, CASE WHEN ((_col5 = 2)) THEN (_col0) ELSE (null) END
-                      Select Operator [SEL_53] (rows=1045432122 width=88)
+                      Select Operator [SEL_54] (rows=1045432122 width=88)
                         Output:["_col0","_col1","_col4","_col5"]
-                        Group By Operator [GBY_52] (rows=1045432122 width=88)
+                        Group By Operator [GBY_53] (rows=1045432122 width=88)
                           Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0, KEY._col1, KEY._col2
                        <-Reducer 3 [SIMPLE_EDGE]
-                          SHUFFLE [RS_51]
+                          SHUFFLE [RS_52]
                             PartitionCols:_col0, _col1, _col2
-                            Group By Operator [GBY_50] (rows=2090864244 width=88)
-                              Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(_col2)"],keys:_col6, _col7, '0'
+                            Group By Operator [GBY_51] (rows=2090864244 width=88)
+                              Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(_col2)"],keys:_col0, _col1, '0'
                               Select Operator [SEL_49] (rows=696954748 width=88)
-                                Output:["_col6","_col7","_col2"]
-                                Merge Join Operator [MERGEJOIN_91] (rows=696954748 width=88)
+                                Output:["_col0","_col1","_col2"]
+                                Merge Join Operator [MERGEJOIN_92] (rows=696954748 width=88)
                                   Conds:RS_46._col1=RS_47._col0(Inner),Output:["_col2","_col6","_col7"]
                                <-Reducer 2 [SIMPLE_EDGE]
                                   SHUFFLE [RS_46]
                                     PartitionCols:_col1
-                                    Merge Join Operator [MERGEJOIN_87] (rows=633595212 width=88)
+                                    Merge Join Operator [MERGEJOIN_88] (rows=633595212 width=88)
                                       Conds:RS_43._col0=RS_44._col0(Inner),Output:["_col1","_col2"]
                                    <-Map 1 [SIMPLE_EDGE]
                                       SHUFFLE [RS_43]
                                         PartitionCols:_col0
                                         Select Operator [SEL_2] (rows=575995635 width=88)
                                           Output:["_col0","_col1","_col2"]
-                                          Filter Operator [FIL_80] (rows=575995635 width=88)
+                                          Filter Operator [FIL_81] (rows=575995635 width=88)
                                             predicate:(ss_sold_date_sk is not null and ss_store_sk is not null)
                                             TableScan [TS_0] (rows=575995635 width=88)
                                              default@store_sales,ss,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_store_sk","ss_net_profit"]
@@ -69,21 +69,21 @@ Stage-0
                                         PartitionCols:_col0
                                         Select Operator [SEL_5] (rows=8116 width=1119)
                                           Output:["_col0"]
-                                          Filter Operator [FIL_81] (rows=8116 width=1119)
+                                          Filter Operator [FIL_82] (rows=8116 width=1119)
                                             predicate:(d_month_seq BETWEEN 1193 AND 1204 and d_date_sk is not null)
                                             TableScan [TS_3] (rows=73049 width=1119)
                                               default@date_dim,d1,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_month_seq"]
                                <-Reducer 9 [SIMPLE_EDGE]
                                   SHUFFLE [RS_47]
                                     PartitionCols:_col0
-                                    Merge Join Operator [MERGEJOIN_90] (rows=127775039 width=88)
+                                    Merge Join Operator [MERGEJOIN_91] (rows=127775039 width=88)
                                       Conds:RS_39._col2=RS_40._col0(Left Semi),Output:["_col0","_col1","_col2"]
                                    <-Map 8 [SIMPLE_EDGE]
                                       SHUFFLE [RS_39]
                                         PartitionCols:_col2
                                         Select Operator [SEL_8] (rows=1704 width=1910)
                                           Output:["_col0","_col1","_col2"]
-                                          Filter Operator [FIL_82] (rows=1704 width=1910)
+                                          Filter Operator [FIL_83] (rows=1704 width=1910)
                                             predicate:(s_store_sk is not null and s_state is not null)
                                             TableScan [TS_6] (rows=1704 width=1910)
                                               default@store,s,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_county","s_state"]
@@ -94,7 +94,7 @@ Stage-0
                                           Output:["_col0"],keys:_col0
                                           Select Operator [SEL_32] (rows=116159124 width=88)
                                             Output:["_col0"]
-                                            Filter Operator [FIL_83] (rows=116159124 width=88)
+                                            Filter Operator [FIL_84] (rows=116159124 width=88)
                                               predicate:(rank_window_0 <= 5)
                                               PTF Operator [PTF_31] (rows=348477374 width=88)
                                                 Function definitions:[{},{"name:":"windowingtablefunction","order by:":"_col1 DESC NULLS LAST","partition by:":"_col0"}]
@@ -112,28 +112,28 @@ Stage-0
                                           Output:["_col0","_col1"],aggregations:["sum(_col2)"],keys:_col6
                                           Select Operator [SEL_24] (rows=696954748 width=88)
                                             Output:["_col6","_col2"]
-                                            Merge Join Operator [MERGEJOIN_89] (rows=696954748 width=88)
+                                            Merge Join Operator [MERGEJOIN_90] (rows=696954748 width=88)
                                               Conds:RS_21._col1=RS_22._col0(Inner),Output:["_col2","_col6"]
                                            <-Map 16 [SIMPLE_EDGE]
                                               SHUFFLE [RS_22]
                                                 PartitionCols:_col0
                                                 Select Operator [SEL_17] (rows=1704 width=1910)
                                                   Output:["_col0","_col1"]
-                                                  Filter Operator [FIL_86] (rows=1704 width=1910)
+                                                  Filter Operator [FIL_87] (rows=1704 width=1910)
                                                     predicate:(s_store_sk is not null and s_state is not null)
                                                     TableScan [TS_15] (rows=1704 width=1910)
                                                       default@store,store,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_state"]
                                            <-Reducer 11 [SIMPLE_EDGE]
                                               SHUFFLE [RS_21]
                                                 PartitionCols:_col1
-                                                Merge Join Operator [MERGEJOIN_88] (rows=633595212 width=88)
+                                                Merge Join Operator [MERGEJOIN_89] (rows=633595212 width=88)
                                                   Conds:RS_18._col0=RS_19._col0(Inner),Output:["_col1","_col2"]
                                                <-Map 10 [SIMPLE_EDGE]
                                                   SHUFFLE [RS_18]
                                                     PartitionCols:_col0
                                                     Select Operator [SEL_11] (rows=575995635 width=88)
                                                       Output:["_col0","_col1","_col2"]
-                                                      Filter Operator [FIL_84] (rows=575995635 width=88)
+                                                      Filter Operator [FIL_85] (rows=575995635 width=88)
                                                         predicate:(ss_store_sk is not null and ss_sold_date_sk is not null)
                                                         TableScan [TS_9] (rows=575995635 width=88)
                                                           default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_store_sk","ss_net_profit"]
@@ -142,7 +142,7 @@ Stage-0
                                                     PartitionCols:_col0
                                                     Select Operator [SEL_14] (rows=8116 width=1119)
                                                       Output:["_col0"]
-                                                      Filter Operator [FIL_85] (rows=8116 width=1119)
+                                                      Filter Operator [FIL_86] (rows=8116 width=1119)
                                                         predicate:(d_month_seq BETWEEN 1193 AND 1204 and d_date_sk is not null)
                                                         TableScan [TS_12] (rows=73049 width=1119)
                                                           default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_month_seq"]
diff --git a/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out b/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
new file mode 100644
index 0000000..b623f1d
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
@@ -0,0 +1,193 @@
+PREHOOK: query: drop table employee
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table employee
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table employee (department_id int, gender varchar(10), education_level int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@employee
+POSTHOOK: query: create table employee (department_id int, gender varchar(10), education_level int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@employee
+PREHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@employee
+POSTHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@employee
+POSTHOOK: Lineage: employee.department_id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: employee.education_level EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ]
+POSTHOOK: Lineage: employee.gender EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+  Fetch Operator
+    limit:-1
+    Stage-1
+      Reducer 3
+      File Output Operator [FS_12]
+        Group By Operator [GBY_10] (rows=1 width=24)
+          Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"]
+        <-Reducer 2 [SIMPLE_EDGE]
+          SHUFFLE [RS_9]
+            Group By Operator [GBY_8] (rows=1 width=24)
+              Output:["_col0","_col1","_col2"],aggregations:["count(_col0)","count(_col1)","count(_col2)"]
+              Select Operator [SEL_6] (rows=13 width=5)
+                Output:["_col0","_col1","_col2"]
+                Group By Operator [GBY_5] (rows=13 width=5)
+                  Output:["_col0","_col1","_col2","_col3"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3
+                <-Map 1 [SIMPLE_EDGE]
+                  SHUFFLE [RS_4]
+                    PartitionCols:_col0, _col1, _col2, _col3
+                    Group By Operator [GBY_3] (rows=27 width=5)
+                      Output:["_col0","_col1","_col2","_col3"],keys:_col0, _col1, _col2, '0'
+                      Select Operator [SEL_1] (rows=9 width=5)
+                        Output:["_col0","_col1","_col2"]
+                        TableScan [TS_0] (rows=9 width=5)
+                          default@employee,employee,Tbl:COMPLETE,Col:NONE,Output:["department_id","gender","education_level"]
+
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3	3
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3	7
+PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+2	4	2	3	7	7
+PREHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+  Fetch Operator
+    limit:-1
+    Stage-1
+      Reducer 3
+      File Output Operator [FS_12]
+        Select Operator [SEL_11] (rows=1 width=40)
+          Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"]
+          Group By Operator [GBY_10] (rows=1 width=40)
+            Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","count(VALUE._col4)"]
+          <-Reducer 2 [SIMPLE_EDGE]
+            SHUFFLE [RS_9]
+              Group By Operator [GBY_8] (rows=1 width=40)
+                Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(_col0)","count(_col1)","count(_col2)","count(_col3)","count(_col4)"]
+                Select Operator [SEL_6] (rows=22 width=5)
+                  Output:["_col0","_col1","_col2","_col3","_col4"]
+                  Group By Operator [GBY_5] (rows=22 width=5)
+                    Output:["_col0","_col1","_col2","_col3"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3
+                  <-Map 1 [SIMPLE_EDGE]
+                    SHUFFLE [RS_4]
+                      PartitionCols:_col0, _col1, _col2, _col3
+                      Group By Operator [GBY_3] (rows=45 width=5)
+                        Output:["_col0","_col1","_col2","_col3"],keys:_col0, _col1, _col2, '0'
+                        Select Operator [SEL_1] (rows=9 width=5)
+                          Output:["_col0","_col1","_col2"]
+                          TableScan [TS_0] (rows=9 width=5)
+                            default@employee,employee,Tbl:COMPLETE,Col:NONE,Output:["gender","department_id","education_level"]
+
+PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+2	4	2	3	7	7	7
+PREHOOK: query: select count(case i when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null end) as c1,
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id, gender,
+education_level from employee group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level))subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(case i when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null end) as c1,
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id, gender,
+education_level from employee group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level))subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3
+PREHOOK: query: select grouping__id as i, department_id, gender, education_level from employee
+group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level, (education_level, department_id))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select grouping__id as i, department_id, gender, education_level from employee
+group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level, (education_level, department_id))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	NULL	NULL	1
+4	NULL	NULL	2
+4	NULL	NULL	3
+2	NULL	F	NULL
+2	NULL	M	NULL
+1	1	NULL	NULL
+5	1	NULL	1
+5	1	NULL	2
+5	1	NULL	3
+1	2	NULL	NULL
+5	2	NULL	1
+5	2	NULL	3
+1	3	NULL	NULL
+5	3	NULL	2
+1	4	NULL	NULL
+5	4	NULL	1
diff --git a/ql/src/test/results/clientpositive/vector_grouping_sets.q.out b/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
index 9e35049..aa9ee87 100644
--- a/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
+++ b/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
@@ -225,10 +225,10 @@ STAGE PLANS:
           Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
           Select Operator
             expressions: s_store_id (type: string)
-            outputColumnNames: s_store_id
+            outputColumnNames: _col0
             Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
             Group By Operator
-              keys: s_store_id (type: string), '0' (type: string)
+              keys: _col0 (type: string), '0' (type: string)
               mode: hash
               outputColumnNames: _col0, _col1
               Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE
@@ -303,10 +303,10 @@ STAGE PLANS:
           Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
           Select Operator
             expressions: s_store_id (type: string)
-            outputColumnNames: s_store_id
+            outputColumnNames: _col0
             Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
             Group By Operator
-              keys: s_store_id (type: string), '0' (type: string)
+              keys: _col0 (type: string), '0' (type: string)
              mode: hash
               outputColumnNames: _col0, _col1
               Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE
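As a closing sanity check (a sketch, not part of the patch), the rewritten aggregates in multi_count_distinct.q.out above can be cross-checked against explicit distinct subqueries over the same employee table; assuming a Hive version with cross join and derived-table support, this should return 4, 2, 3, 7, matching the masked output:

    select d.c, g.c, e.c, ed.c
    from (select count(*) as c from (select distinct department_id from employee) x) d
    cross join (select count(*) as c from (select distinct gender from employee) x) g
    cross join (select count(*) as c from (select distinct education_level from employee) x) e
    cross join (select count(*) as c
                from (select distinct education_level, department_id from employee) x) ed;
    -- expected: 4  2  3  7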