diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 48e3cc7..7f82cae 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -917,7 +917,7 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv
       //0. Distinct aggregate rewrite
       // Run this optimization early, since it is expanding the operator pipeline.
-      if (conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") &&
+      if (!conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr") &&
           conf.getBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEDISTINCTREWRITE)) {
         // Its not clear, if this rewrite is always performant on MR, since extra map phase
         // introduced for 2nd MR job may offset gains of this multi-stage aggregation.
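Note: the gate now excludes only the `mr` engine instead of allowing only `tez`, so `HIVEOPTIMIZEDISTINCTREWRITE` also fires on Spark; the `.q.out` updates below are the resulting plan changes. As background, `applyPreJoinOrderingTransforms` applies the rewrite rule through Calcite's HEP planner. A minimal sketch of that mechanism, assuming only Calcite's public `HepPlanner` API: the `applyRule` helper name is hypothetical, and the real `hepPlan` also threads a metadata provider and match-order options through.

```java
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.hep.HepPlanner;
import org.apache.calcite.plan.hep.HepProgramBuilder;
import org.apache.calcite.rel.RelNode;

final class DistinctRewriteSketch {
  // Hypothetical stand-in for CalcitePlanner#hepPlan: wrap a single rule
  // (e.g. HiveExpandDistinctAggregatesRule.INSTANCE) in a one-step HEP
  // program and rewrite the plan with it.
  static RelNode applyRule(RelNode basePlan, RelOptRule rule) {
    HepProgramBuilder program = new HepProgramBuilder();
    program.addRuleInstance(rule);
    HepPlanner planner = new HepPlanner(program.build());
    planner.setRoot(basePlan);
    return planner.findBestExp(); // plan with count(DISTINCT) expanded
  }
}
```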
diff --git a/ql/src/test/results/clientpositive/spark/auto_join32.q.out b/ql/src/test/results/clientpositive/spark/auto_join32.q.out
index e26e4a2..316792b 100644
--- a/ql/src/test/results/clientpositive/spark/auto_join32.q.out
+++ b/ql/src/test/results/clientpositive/spark/auto_join32.q.out
@@ -450,10 +450,9 @@ STAGE PLANS:
                     outputColumnNames: _col0, _col1
                     Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                     Group By Operator
-                      aggregations: count(DISTINCT _col1)
                       keys: _col0 (type: string), _col1 (type: string)
                       mode: hash
-                      outputColumnNames: _col0, _col1, _col2
+                      outputColumnNames: _col0, _col1
                       Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: string), _col1 (type: string)
@@ -463,18 +462,23 @@
         Reducer 2 
             Reduce Operator Tree:
               Group By Operator
-                aggregations: count(DISTINCT KEY._col1:0._col0)
-                keys: KEY._col0 (type: string)
+                keys: KEY._col0 (type: string), KEY._col1 (type: string)
                 mode: mergepartial
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
-                File Output Operator
-                  compressed: false
+                Group By Operator
+                  aggregations: count(_col1)
+                  keys: _col0 (type: string)
+                  mode: complete
+                  outputColumnNames: _col0, _col1
                   Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
-                  table:
-                      input format: org.apache.hadoop.mapred.TextInputFormat
-                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
-                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
 
   Stage: Stage-0
     Fetch Operator
diff --git a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
index 7101beb..f427aed 100644
--- a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
@@ -476,35 +476,38 @@ STAGE PLANS:
                    outputColumnNames: _col0, _col1
                    Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
                    Group By Operator
-                     aggregations: count(DISTINCT _col1)
                      keys: _col0 (type: tinyint), _col1 (type: double)
                      mode: hash
-                     outputColumnNames: _col0, _col1, _col2
+                     outputColumnNames: _col0, _col1
                      Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: tinyint), _col1 (type: double)
                        sort order: ++
                        Map-reduce partition columns: _col0 (type: tinyint)
                        Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
-                       TopN Hash Memory Usage: 0.3
         Reducer 2 
             Reduce Operator Tree:
               Group By Operator
-                aggregations: count(DISTINCT KEY._col1:0._col0)
-                keys: KEY._col0 (type: tinyint)
+                keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
                 mode: mergepartial
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 6144 Data size: 188618 Basic stats: COMPLETE Column stats: NONE
-                Limit
-                  Number of rows: 20
-                  Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
-                File Output Operator
-                  compressed: false
+                Group By Operator
+                  aggregations: count(_col1)
+                  keys: _col0 (type: tinyint)
+                  mode: complete
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 3072 Data size: 94309 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
                     Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
-                  table:
-                      input format: org.apache.hadoop.mapred.TextInputFormat
-                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
-                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
 
   Stage: Stage-0
     Fetch Operator
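Both `.q.out` changes above show the same shape: instead of a single `count(DISTINCT _col1)` aggregate, the inner Group By now keys on (key, value) so the shuffle deduplicates pairs, and a second complete-mode Group By counts the surviving rows per key. In `limit_pushdown.q.out` the Limit consequently moves below the outer aggregate, and the `TopN Hash Memory Usage` hint disappears from the Reduce Output Operator. A toy demonstration that the two shapes compute the same answer, using plain Java collections with made-up rows rather than Hive operators:

```java
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

public class TwoStageDistinctDemo {
  public static void main(String[] args) {
    List<Map.Entry<String, String>> rows = List.of(
        Map.entry("a", "x"), Map.entry("a", "x"),
        Map.entry("a", "y"), Map.entry("b", "x"));

    // Original plan: count(DISTINCT value) per key in one aggregate.
    Map<String, Long> direct = rows.stream().collect(
        Collectors.groupingBy(Map.Entry::getKey,
            Collectors.collectingAndThen(
                Collectors.mapping(Map.Entry::getValue, Collectors.toSet()),
                (Set<String> s) -> (long) s.size())));

    // Rewritten plan: group by (key, value) to deduplicate, then a plain
    // count per key -- the two-level Group By in the new .q.out plans.
    Map<String, Long> rewritten = rows.stream()
        .distinct() // inner group-by on (key, value)
        .collect(Collectors.groupingBy(Map.Entry::getKey,
            Collectors.counting())); // outer count per key

    System.out.println(direct);    // e.g. {a=2, b=1}
    System.out.println(rewritten); // same result
  }
}
```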
diff --git a/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out b/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out
index fecfe0a..aaea86d 100644
--- a/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out
+++ b/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out
@@ -1247,7 +1247,8 @@ STAGE PLANS:
   Stage: Stage-1
     Spark
       Edges:
-        Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1)
+        Reducer 2 <- Map 1 (GROUP, 2)
+        Reducer 3 <- Reducer 2 (GROUP, 1)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -1260,30 +1261,47 @@
                    outputColumnNames: _col0
                    Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE
                    Group By Operator
-                     aggregations: count(DISTINCT _col0)
                      keys: _col0 (type: int)
                      mode: hash
-                     outputColumnNames: _col0, _col1
+                     outputColumnNames: _col0
                      Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE
                      Reduce Output Operator
                        key expressions: _col0 (type: int)
                        sort order: +
+                       Map-reduce partition columns: _col0 (type: int)
                        Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized
         Reducer 2 
             Reduce Operator Tree:
               Group By Operator
-                aggregations: count(DISTINCT KEY._col0:0._col0)
+                keys: KEY._col0 (type: int)
                 mode: mergepartial
                 outputColumnNames: _col0
-                Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 1000 Data size: 1752000 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(_col0)
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.TextInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                       serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized
 
   Stage: Stage-0
     Fetch Operator
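For the global (no GROUP BY) `count(distinct ...)` in `vector_count_distinct.q.out`, the rewrite yields a third stage: Reducer 2 now runs with parallelism 2, finishes deduplicating the values and emits a partial count per partition, and Reducer 3 merges the partials. A small sketch of that data flow with ordinary Java collections; the two-way partitioning function and the input values are made up:

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GlobalDistinctCountDemo {
  public static void main(String[] args) {
    List<Integer> values = List.of(1, 2, 2, 3, 3, 3, 4);

    // Map side + shuffle: the group-by on the value alone (map-side hash
    // plus reducer merge) deduplicates, and "Map-reduce partition columns:
    // _col0" hash-partitions the distinct values across the two Reducer 2
    // instances.
    Map<Integer, List<Integer>> partitions = values.stream()
        .distinct()
        .collect(Collectors.groupingBy(v -> Math.floorMod(v, 2)));

    // Reducer 2: each instance emits a partial count(_col0) (hash mode).
    List<Long> partials = partitions.values().stream()
        .map(p -> (long) p.size())
        .collect(Collectors.toList());

    // Reducer 3: count in mergepartial mode reduces to summing partials.
    long total = partials.stream().mapToLong(Long::longValue).sum();
    System.out.println(total); // 4 distinct values
  }
}
```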