diff --git ql/src/java/org/apache/hadoop/hive/ql/hooks/ATSHook.java ql/src/java/org/apache/hadoop/hive/ql/hooks/ATSHook.java index ed18f7e..cdc929a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/hooks/ATSHook.java +++ ql/src/java/org/apache/hadoop/hive/ql/hooks/ATSHook.java @@ -19,6 +19,7 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; +import java.util.List; import java.util.concurrent.Executors; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; @@ -28,6 +29,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.QueryPlan; import org.apache.hadoop.hive.ql.exec.ExplainTask; +import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.timeline.TimelineEntity; @@ -110,7 +112,8 @@ public void run() { ExplainTask explain = new ExplainTask(); explain.initialize(hookContext.getConf(), plan, null); String query = plan.getQueryStr(); - JSONObject explainPlan = explain.getJSONPlan(null, null, plan.getRootTasks(), + List<Task<? extends Serializable>> rootTasks = plan.getRootTasks(); + JSONObject explainPlan = explain.getJSONPlan(null, null, rootTasks, plan.getFetchTask(), true, false, false); fireAndForget(hookContext.getConf(), createPreHookEvent(queryId, query, explainPlan, queryStartTime, user, numMrJobs, numTezJobs)); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/IntraQueryCorrelation.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/IntraQueryCorrelation.java index f3caed2..95c4340 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/IntraQueryCorrelation.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/IntraQueryCorrelation.java @@ -52,7 +52,7 @@ private final Set<ReduceSinkOperator> allReduceSinkOperators; // Since we merge multiple operation paths, we assign new tags to bottom layer - // ReduceSinkOperatos. 
This mapping is used to map new tags to original tags associated + // ReduceSinkOperators. This mapping is used to map new tags to original tags associated // to these bottom layer ReduceSinkOperators. private final Map<Integer, Integer> newTagToOldTag; diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/QueryPlanTreeTransformation.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/QueryPlanTreeTransformation.java index aa02a40..080725b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/QueryPlanTreeTransformation.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/QueryPlanTreeTransformation.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.DemuxDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.MuxDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; @@ -214,7 +215,7 @@ protected static void applyCorrelation( } else if (op instanceof ReduceSinkOperator){ GroupByOperator pGBYm = CorrelationUtilities.getSingleParent(op, GroupByOperator.class); - if (pGBYm != null) { + if (pGBYm != null && pGBYm.getConf().getMode() == GroupByDesc.Mode.HASH) { // We get a semi join at here. 
// This map-side GroupByOperator needs to be removed CorrelationUtilities.removeOperator( diff --git ql/src/test/queries/clientpositive/subquery_exists_having.q ql/src/test/queries/clientpositive/subquery_exists_having.q index 690aa10..39d2d17 100644 --- ql/src/test/queries/clientpositive/subquery_exists_having.q +++ ql/src/test/queries/clientpositive/subquery_exists_having.q @@ -1,4 +1,4 @@ - +set hive.optimize.correlation=false; -- no agg, corr explain @@ -22,6 +22,30 @@ having exists ) ; +set hive.optimize.correlation=true; + +-- no agg, corr +explain +select b.key, count(*) +from src b +group by b.key +having exists + (select a.key + from src a + where a.key = b.key and a.value > 'val_9' + ) +; + +select b.key, count(*) +from src b +group by b.key +having exists + (select a.key + from src a + where a.key = b.key and a.value > 'val_9' + ) +; + -- view test create view cv1 as select b.key, count(*) as c diff --git ql/src/test/queries/clientpositive/subquery_in_having.q ql/src/test/queries/clientpositive/subquery_in_having.q index 5b55b0b..fecc5f8 100644 --- ql/src/test/queries/clientpositive/subquery_in_having.q +++ ql/src/test/queries/clientpositive/subquery_in_having.q @@ -40,6 +40,8 @@ group by key, value having count(*) in (select count(*) from src s1 where s1.key > '9' and s1.value = b.value group by s1.key ) ; +set hive.optimize.correlation=false; + -- agg, non corr explain select p_mfgr, avg(p_size) @@ -53,6 +55,21 @@ having b.p_mfgr in ) ; +set hive.optimize.correlation=true; + +-- agg, non corr +explain +select p_mfgr, avg(p_size) +from part b +group by b.p_mfgr +having b.p_mfgr in + (select p_mfgr + from part + group by p_mfgr + having max(p_size) - min(p_size) < 20 + ) +; + -- join on agg select b.key, min(b.value) from src b diff --git ql/src/test/results/clientpositive/subquery_exists_having.q.out ql/src/test/results/clientpositive/subquery_exists_having.q.out index 2194143..b5b73da 100644 --- 
ql/src/test/results/clientpositive/subquery_exists_having.q.out +++ ql/src/test/results/clientpositive/subquery_exists_having.q.out @@ -153,6 +153,170 @@ POSTHOOK: Input: default@src 96 1 97 2 98 2 +PREHOOK: query: -- no agg, corr +explain +select b.key, count(*) +from src b +group by b.key +having exists + (select a.key + from src a + where a.key = b.key and a.value > 'val_9' + ) +PREHOOK: type: QUERY +POSTHOOK: query: -- no agg, corr +explain +select b.key, count(*) +from src b +group by b.key +having exists + (select a.key + from src a + where a.key = b.key and a.value > 'val_9' + ) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 58 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 29 Data size: 2906 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + TableScan + alias: a + Statistics: Num rows: 29 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((value > 'val_9') and key is not null) (type: boolean) + Statistics: Num rows: 5 Data size: 1002 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 
+ Statistics: Num rows: 5 Data size: 1002 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 1002 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 1002 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Demux Operator + Statistics: Num rows: 34 Data size: 3908 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 17 Data size: 1954 Basic stats: COMPLETE Column stats: NONE + Mux Operator + Statistics: Num rows: 51 Data size: 5862 Basic stats: COMPLETE Column stats: NONE + Join Operator + condition map: + Left Semi Join 0 to 1 + condition expressions: + 0 {KEY.reducesinkkey0} {VALUE._col0} + 1 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (1 = 1) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Mux Operator + Statistics: Num rows: 51 Data size: 5862 Basic stats: COMPLETE Column stats: NONE + Join Operator + condition map: + Left Semi Join 0 to 1 + condition expressions: + 0 
{KEY.reducesinkkey0} {VALUE._col0} + 1 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (1 = 1) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select b.key, count(*) +from src b +group by b.key +having exists + (select a.key + from src a + where a.key = b.key and a.value > 'val_9' + ) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select b.key, count(*) +from src b +group by b.key +having exists + (select a.key + from src a + where a.key = b.key and a.value > 'val_9' + ) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +90 3 +92 1 +95 2 +96 1 +97 2 +98 2 PREHOOK: query: -- view test create view cv1 as select b.key, count(*) as c diff --git ql/src/test/results/clientpositive/subquery_in_having.q.out ql/src/test/results/clientpositive/subquery_in_having.q.out index b88d717..2f0a015 100644 --- ql/src/test/results/clientpositive/subquery_in_having.q.out +++ ql/src/test/results/clientpositive/subquery_in_having.q.out @@ -582,6 +582,159 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: -- agg, non corr +explain +select p_mfgr, avg(p_size) +from part b +group by b.p_mfgr +having b.p_mfgr in + (select p_mfgr + from part + group by p_mfgr + 
having max(p_size) - min(p_size) < 20 + ) +PREHOOK: type: QUERY +POSTHOOK: query: -- agg, non corr +explain +select p_mfgr, avg(p_size) +from part b +group by b.p_mfgr +having b.p_mfgr in + (select p_mfgr + from part + group by p_mfgr + having max(p_size) - min(p_size) < 20 + ) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 30 Data size: 3173 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: p_mfgr is not null (type: boolean) + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: p_mfgr (type: string), p_size (type: int) + outputColumnNames: p_mfgr, p_size + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: avg(p_size) + keys: p_mfgr (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: struct<count:bigint,sum:double,input:int>) + TableScan + alias: part + Statistics: Num rows: 30 Data size: 3173 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: p_mfgr is not null (type: boolean) + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: p_mfgr (type: string), p_size (type: int) + outputColumnNames: p_mfgr, p_size + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(p_size), min(p_size) + keys: p_mfgr (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 
15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: int), _col2 (type: int) + Reduce Operator Tree: + Demux Operator + Statistics: Num rows: 30 Data size: 3172 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: avg(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Mux Operator + Statistics: Num rows: 20 Data size: 2114 Basic stats: COMPLETE Column stats: NONE + Join Operator + condition map: + Left Semi Join 0 to 1 + condition expressions: + 0 {KEY.reducesinkkey0} {VALUE._col0} + 1 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (1 = 1) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 15 Data size: 1586 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((_col1 - _col2) < 20) (type: boolean) + Statistics: Num 
rows: 5 Data size: 528 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 528 Basic stats: COMPLETE Column stats: NONE + Mux Operator + Statistics: Num rows: 20 Data size: 2114 Basic stats: COMPLETE Column stats: NONE + Join Operator + condition map: + Left Semi Join 0 to 1 + condition expressions: + 0 {KEY.reducesinkkey0} {VALUE._col0} + 1 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (1 = 1) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + PREHOOK: query: -- join on agg select b.key, min(b.value) from src b