diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 48e3cc7..7f82cae 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -917,7 +917,7 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv //0. Distinct aggregate rewrite // Run this optimization early, since it is expanding the operator pipeline. - if (conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") && + if (!conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr") && conf.getBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEDISTINCTREWRITE)) { // Its not clear, if this rewrite is always performant on MR, since extra map phase // introduced for 2nd MR job may offset gains of this multi-stage aggregation. diff --git a/ql/src/test/results/clientpositive/spark/auto_join18.q.out b/ql/src/test/results/clientpositive/spark/auto_join18.q.out index eaef06c..8607221 100644 --- a/ql/src/test/results/clientpositive/spark/auto_join18.q.out +++ b/ql/src/test/results/clientpositive/spark/auto_join18.q.out @@ -69,10 +69,9 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: string), _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) @@ -133,17 +132,22 @@ STAGE PLANS: Reducer 6 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: string) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/spark/auto_join32.q.out b/ql/src/test/results/clientpositive/spark/auto_join32.q.out index e26e4a2..316792b 100644 --- a/ql/src/test/results/clientpositive/spark/auto_join32.q.out +++ b/ql/src/test/results/clientpositive/spark/auto_join32.q.out @@ -450,10 +450,9 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: string), _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) @@ -463,18 +462,23 @@ STAGE PLANS: Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE - File Output Operator - compressed: false + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: string) + mode: complete + outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/spark/groupby_position.q.out b/ql/src/test/results/clientpositive/spark/groupby_position.q.out index 50d707d..088428b 100644 --- a/ql/src/test/results/clientpositive/spark/groupby_position.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby_position.q.out @@ -552,7 +552,7 @@ STAGE PLANS: Stage: Stage-1 Spark Edges: - Reducer 2 <- Map 1 (GROUP, 2) + Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 2) Reducer 6 <- Map 5 (GROUP, 2) Reducer 3 <- Reducer 2 (PARTITION-LEVEL SORT, 2), Reducer 6 (PARTITION-LEVEL SORT, 2) Reducer 4 <- Reducer 3 (SORT, 1) @@ -567,17 +567,17 @@ STAGE PLANS: predicate: (((((UDFToDouble(key) > 10.0) and (UDFToDouble(key) < 20.0)) and (UDFToDouble(key) > 15.0)) and (UDFToDouble(key) < 25.0)) and key is not null) (type: boolean) Statistics: Num rows: 3 Data size: 31 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: string), value (type: string) - outputColumnNames: _col0, _col1 + expressions: key (type: string), value (type: string), substr(value, 5) (type: string) + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 3 Data size: 31 Basic stats: COMPLETE Column stats: NONE Group By Operator - keys: _col0 (type: string), _col1 (type: string) + keys: _col0 (type: string), _col1 (type: string), _col2 (type: string) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 3 Data size: 31 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: string), _col1 (type: string) - sort order: ++ + key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) Statistics: Num rows: 3 Data size: 31 Basic stats: COMPLETE Column stats: NONE Map 5 @@ -605,16 +605,25 @@ STAGE PLANS: Reducer 2 Reduce Operator Tree: Group By Operator - keys: KEY._col0 (type: string), KEY._col1 (type: string) + keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: string) mode: mergepartial - outputColumnNames: _col0, _col1 + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) + Select Operator + expressions: _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: string) + Group By Operator + keys: _col0 (type: string), _col1 (type: string) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) Reducer 3 Reduce Operator Tree: Join Operator diff --git a/ql/src/test/results/clientpositive/spark/join18.q.out b/ql/src/test/results/clientpositive/spark/join18.q.out index ed17bab..c702fdf 100644 --- a/ql/src/test/results/clientpositive/spark/join18.q.out +++ b/ql/src/test/results/clientpositive/spark/join18.q.out @@ -72,10 +72,9 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: string), _col1 (type: string) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) @@ -116,17 +115,22 @@ STAGE PLANS: Reducer 5 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: string) + keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: string) - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: bigint) + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: string) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 6 Data size: 45 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out index 7101beb..f427aed 100644 --- a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out +++ b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out @@ -476,35 +476,38 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col1) keys: _col0 (type: tinyint), _col1 (type: double) mode: hash - outputColumnNames: _col0, _col1, _col2 + outputColumnNames: _col0, _col1 Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: tinyint), _col1 (type: double) sort order: ++ Map-reduce partition columns: _col0 (type: tinyint) Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE - TopN Hash Memory Usage: 0.3 Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col1:0._col0) - keys: KEY._col0 (type: tinyint) + keys: KEY._col0 (type: tinyint), KEY._col1 (type: double) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 6144 Data size: 188618 Basic stats: COMPLETE Column stats: NONE - Limit - Number of rows: 20 - Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: tinyint) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3072 Data size: 94309 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 20 Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out b/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out index fecfe0a..aaea86d 100644 --- a/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out +++ b/ql/src/test/results/clientpositive/spark/vector_count_distinct.q.out @@ -1247,7 +1247,8 @@ STAGE PLANS: Stage: Stage-1 Spark Edges: - Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1) + Reducer 2 <- Map 1 (GROUP, 2) + Reducer 3 <- Reducer 2 (GROUP, 1) #### A masked pattern was here #### Vertices: Map 1 @@ -1260,30 +1261,47 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col0) keys: _col0 (type: int) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: int) sort order: + + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 2000 Data size: 3504000 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized Reducer 2 Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + keys: KEY._col0 (type: int) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1000 Data size: 1752000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized Stage: Stage-0 Fetch Operator