diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index be5a747..cc05ba9 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -437,7 +437,8 @@ minillap.query.files=acid_bucket_pruning.q,\ global_limit.q,\ dynamic_partition_pruning_2.q,\ tez_union_dynamic_partition.q,\ - load_fs2.q + load_fs2.q,\ + multi_count_distinct_null.q minillaplocal.query.files=acid_globallimit.q,\ acid_vectorization_missing_cols.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java index 417eda3..82624fc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java @@ -139,6 +139,7 @@ public void onMatch(RelOptRuleCall call) { if (numCountDistinct > 1 && numCountDistinct == aggregate.getAggCallList().size() && aggregate.getGroupSet().isEmpty()) { + LOG.debug("Trigger countDistinct rewrite. numCountDistinct is " + numCountDistinct); // now positions contains all the distinct positions, i.e., $5, $4, $6 // we need to first sort them as group by set // and then get their position later, i.e., $4->1, $5->2, $6->3 @@ -186,9 +187,12 @@ public void onMatch(RelOptRuleCall call) { * Converts an aggregate relational expression that contains only * count(distinct) to grouping sets with count. For example select * count(distinct department_id), count(distinct gender), count(distinct - * education_level) from employee; can be transformed to select count(case i - * when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null - * end) as c1, count(case i when 4 then 1 else null end) as c2 from (select + * education_level) from employee; can be transformed to + * select + * count(case when i=1 and department_id is not null then 1 else null end) as c0, + * count(case when i=2 and gender is not null then 1 else null end) as c1, + * count(case when i=4 and education_level is not null then 1 else null end) as c2 + * from (select * grouping__id as i, department_id, gender, education_level from employee * group by department_id, gender, education_level grouping sets * (department_id, gender, education_level))subq; @@ -230,13 +234,22 @@ public RexNode apply(RelDataTypeField input) { } }); final List<RexNode> gbChildProjLst = Lists.newArrayList(); + // for singular arg, count should not include null + // e.g., count(case when i=1 and department_id is not null then 1 else null end) as c0, + // for non-singular args, count can include null, i.e.
(,) is counted as 1 for (List list : cleanArgList) { - RexNode equal = rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, - originalInputRefs.get(originalInputRefs.size() - 1), - rexBuilder.makeExactLiteral(new BigDecimal(getGroupingIdValue(list, sourceOfForCountDistinct)))); - RexNode condition = rexBuilder.makeCall(SqlStdOperatorTable.CASE, equal, + RexNode condition = rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, originalInputRefs + .get(originalInputRefs.size() - 1), rexBuilder.makeExactLiteral(new BigDecimal( + getGroupingIdValue(list, sourceOfForCountDistinct)))); + if (list.size() == 1) { + int pos = list.get(0); + RexNode notNull = rexBuilder.makeCall(SqlStdOperatorTable.IS_NOT_NULL, + originalInputRefs.get(pos)); + condition = rexBuilder.makeCall(SqlStdOperatorTable.AND, condition, notNull); + } + RexNode when = rexBuilder.makeCall(SqlStdOperatorTable.CASE, condition, rexBuilder.makeExactLiteral(BigDecimal.ONE), rexBuilder.constantNull()); - gbChildProjLst.add(condition); + gbChildProjLst.add(when); } // create the project before GB diff --git a/ql/src/test/queries/clientpositive/multi_count_distinct_null.q b/ql/src/test/queries/clientpositive/multi_count_distinct_null.q new file mode 100644 index 0000000..442d855 --- /dev/null +++ b/ql/src/test/queries/clientpositive/multi_count_distinct_null.q @@ -0,0 +1,45 @@ +set hive.mapred.mode=nonstrict; + +drop table employee; + +create table employee (department_id int, gender varchar(10), education_level int); + +insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2),(null, 'M', 1),(null, null, 1),(null, null, null); + +explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee; + +select count(distinct gender), count(distinct department_id), count(distinct education_level) from employee; + +select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level) from employee; + +select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee; + +select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee; + +select count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id) from employee; + +select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level) from employee; + +explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee; + +select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee; + +select +count(case when i=1 and department_id is not null then 1 else null end) as c0, +count(case when i=2 and gender is not null then 1 else null end) as c1, +count(case when i=4 and education_level is not null then 1 else null end) as c2 from +(select grouping__id as 
i, department_id, gender, +education_level from employee group by department_id, gender, education_level grouping sets +(department_id, gender, education_level))subq; + +select grouping__id as i, department_id, gender, education_level from employee +group by department_id, gender, education_level grouping sets +(department_id, gender, education_level, (education_level, department_id)); + + + + diff --git a/ql/src/test/results/clientpositive/llap/multi_count_distinct_null.q.out b/ql/src/test/results/clientpositive/llap/multi_count_distinct_null.q.out new file mode 100644 index 0000000..6c13684 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/multi_count_distinct_null.q.out @@ -0,0 +1,312 @@ +PREHOOK: query: drop table employee +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table employee +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table employee (department_id int, gender varchar(10), education_level int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@employee +POSTHOOK: query: create table employee (department_id int, gender varchar(10), education_level int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@employee +PREHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2),(null, 'M', 1),(null, null, 1),(null, null, null) +PREHOOK: type: QUERY +PREHOOK: Output: default@employee +POSTHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2),(null, 'M', 1),(null, null, 1),(null, null, null) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@employee +POSTHOOK: Lineage: employee.department_id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: employee.education_level EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: employee.gender EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: employee + Statistics: Num rows: 12 Data size: 66 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: department_id (type: int), gender (type: varchar(10)), education_level (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12 Data size: 66 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: int), _col1 (type: varchar(10)), _col2 (type: int), 0 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 36 Data size: 198 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: 
varchar(10)), _col2 (type: int), _col3 (type: int) + sort order: ++++ + Map-reduce partition columns: _col0 (type: int), _col1 (type: varchar(10)), _col2 (type: int), _col3 (type: int) + Statistics: Num rows: 36 Data size: 198 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int), KEY._col1 (type: varchar(10)), KEY._col2 (type: int), KEY._col3 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 18 Data size: 99 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: CASE WHEN (((_col3 = 1) and _col0 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 4) and _col2 is not null)) THEN (1) ELSE (null) END (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 18 Data size: 99 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0), count(_col1), count(_col2) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct education_level) from employee +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct education_level) from employee +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +2 4 3 +PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level) from employee +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level) from employee +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +2 4 2 3 +PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked 
pattern was here #### +4 2 3 +PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +4 2 3 3 +PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id) from employee +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id) from employee +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +4 2 3 9 +PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level) from employee +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level) from employee +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +2 4 2 3 9 9 +PREHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: employee + Statistics: Num rows: 12 Data size: 66 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: gender (type: varchar(10)), department_id (type: int), education_level (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12 Data size: 66 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: varchar(10)), _col1 (type: int), _col2 (type: int), 0 (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 60 Data size: 330 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: varchar(10)), _col1 (type: int), _col2 (type: int), _col3 (type: int) + sort order: ++++ + Map-reduce partition columns: 
_col0 (type: varchar(10)), _col1 (type: int), _col2 (type: int), _col3 (type: int) + Statistics: Num rows: 60 Data size: 330 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: varchar(10)), KEY._col1 (type: int), KEY._col2 (type: int), KEY._col3 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 30 Data size: 165 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: CASE WHEN (((_col3 = 1) and _col0 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 4) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN ((_col3 = 6)) THEN (1) ELSE (null) END (type: int), CASE WHEN ((_col3 = 7)) THEN (1) ELSE (null) END (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 30 Data size: 165 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0), count(_col1), count(_col2), count(_col3), count(_col4) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), count(VALUE._col3), count(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: bigint), _col0 (type: bigint), _col2 (type: bigint), _col3 (type: bigint), _col3 (type: bigint), _col4 (type: bigint) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 + Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level), +count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +2 4 2 3 9 9 10 +PREHOOK: query: select +count(case when 
i=1 and department_id is not null then 1 else null end) as c0, +count(case when i=2 and gender is not null then 1 else null end) as c1, +count(case when i=4 and education_level is not null then 1 else null end) as c2 from +(select grouping__id as i, department_id, gender, +education_level from employee group by department_id, gender, education_level grouping sets +(department_id, gender, education_level))subq +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select +count(case when i=1 and department_id is not null then 1 else null end) as c0, +count(case when i=2 and gender is not null then 1 else null end) as c1, +count(case when i=4 and education_level is not null then 1 else null end) as c2 from +(select grouping__id as i, department_id, gender, +education_level from employee group by department_id, gender, education_level grouping sets +(department_id, gender, education_level))subq +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +4 2 3 +PREHOOK: query: select grouping__id as i, department_id, gender, education_level from employee +group by department_id, gender, education_level grouping sets +(department_id, gender, education_level, (education_level, department_id)) +PREHOOK: type: QUERY +PREHOOK: Input: default@employee +#### A masked pattern was here #### +POSTHOOK: query: select grouping__id as i, department_id, gender, education_level from employee +group by department_id, gender, education_level grouping sets +(department_id, gender, education_level, (education_level, department_id)) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@employee +#### A masked pattern was here #### +1 NULL NULL NULL +2 NULL NULL NULL +4 NULL NULL NULL +5 NULL NULL NULL +4 NULL NULL 1 +5 NULL NULL 1 +4 NULL NULL 2 +4 NULL NULL 3 +2 NULL F NULL +2 NULL M NULL +1 1 NULL NULL +5 1 NULL 1 +5 1 NULL 2 +5 1 NULL 3 +1 2 NULL NULL +5 2 NULL 1 +5 2 NULL 3 +1 3 NULL NULL +5 3 NULL 2 +1 4 NULL NULL +5 4 NULL 1
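
For reference, a minimal sketch (illustrative only, not part of the patch) against the same employee table, showing why the rewrite adds an is-not-null guard for single-column distincts: the grouping set on department_id alone also produces a group for the NULL department_id (visible as the i=1 row with a NULL department_id in the output above), so a count keyed on grouping__id alone would return 5, whereas count(distinct department_id) must ignore NULL and return 4, matching the expected output above.

-- Unguarded form (illustrative only): the NULL department_id group is counted, giving 5.
select count(case when i=1 then 1 else null end) as c0
from (select grouping__id as i, department_id, gender, education_level from employee
group by department_id, gender, education_level grouping sets
(department_id, gender, education_level)) subq;

-- Guarded form used by the patch: the NULL department_id group is excluded, giving 4.
select count(case when i=1 and department_id is not null then 1 else null end) as c0
from (select grouping__id as i, department_id, gender, education_level from employee
group by department_id, gender, education_level grouping sets
(department_id, gender, education_level)) subq;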