diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 90cb007057..36a51e347b 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -767,6 +767,7 @@ minillaplocal.query.files=\ tez_join_tests.q,\ tez_joins_explain.q,\ tez_multi_union.q,\ + tez_dynamic_semijoin_reduction_on_aggcol.q,\ tez_nway_join.q,\ tez_schema_evolution.q,\ tez_self_join.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java index 189c68dd91..53bde2c218 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java @@ -604,11 +604,11 @@ private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContex // Create the column expr map Map colExprMap = new HashMap(); ExprNodeDesc exprNode = null; - if ( parentOfRS.getColumnExprMap() != null) { - exprNode = parentOfRS.getColumnExprMap().get(internalColName).clone(); - } else { - exprNode = new ExprNodeColumnDesc(columnInfo); + if (columnInfo == null) { + LOG.debug("No ColumnInfo found in {} for {}", parentOfRS.getOperatorId(), internalColName); + return false; } + exprNode = new ExprNodeColumnDesc(columnInfo); if (exprNode instanceof ExprNodeColumnDesc) { ExprNodeColumnDesc encd = (ExprNodeColumnDesc) exprNode; diff --git a/ql/src/test/queries/clientpositive/tez_dynamic_semijoin_reduction_on_aggcol.q b/ql/src/test/queries/clientpositive/tez_dynamic_semijoin_reduction_on_aggcol.q new file mode 100644 index 0000000000..55edc49118 --- /dev/null +++ b/ql/src/test/queries/clientpositive/tez_dynamic_semijoin_reduction_on_aggcol.q @@ -0,0 +1,17 @@ +--! 
qt:dataset:src +set hive.explain.user=false; +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.tez.bigtable.minsize.semijoin.reduction=1; +set hive.tez.min.bloom.filter.entries=1; + +create table tez_dynpart_hashjoin_on_agg(id int, outcome string, eventid int) stored as orc; +insert into tez_dynpart_hashjoin_on_agg select key, value, key from src; + +explain select a.id, b.outcome from (select id, max(eventid) as event_id_max from tez_dynpart_hashjoin_on_agg where id = 0 group by id) a +LEFT OUTER JOIN tez_dynpart_hashjoin_on_agg b +on a.event_id_max = b.eventid; + +select a.id, b.outcome from (select id, max(eventid) as event_id_max from tez_dynpart_hashjoin_on_agg where id = 0 group by id) a +LEFT OUTER JOIN tez_dynpart_hashjoin_on_agg b +on a.event_id_max = b.eventid; diff --git a/ql/src/test/results/clientpositive/llap/tez_dynamic_semijoin_reduction_on_aggcol.q.out b/ql/src/test/results/clientpositive/llap/tez_dynamic_semijoin_reduction_on_aggcol.q.out new file mode 100644 index 0000000000..3ff5912316 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/tez_dynamic_semijoin_reduction_on_aggcol.q.out @@ -0,0 +1,183 @@ +PREHOOK: query: create table tez_dynpart_hashjoin_on_agg(id int, outcome string, eventid int) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tez_dynpart_hashjoin_on_agg +POSTHOOK: query: create table tez_dynpart_hashjoin_on_agg(id int, outcome string, eventid int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tez_dynpart_hashjoin_on_agg +PREHOOK: query: insert into tez_dynpart_hashjoin_on_agg select key, value, key from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@tez_dynpart_hashjoin_on_agg +POSTHOOK: query: insert into tez_dynpart_hashjoin_on_agg select key, value, key from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src 
+POSTHOOK: Output: default@tez_dynpart_hashjoin_on_agg +POSTHOOK: Lineage: tez_dynpart_hashjoin_on_agg.eventid EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tez_dynpart_hashjoin_on_agg.id EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: tez_dynpart_hashjoin_on_agg.outcome SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: explain select a.id, b.outcome from (select id, max(eventid) as event_id_max from tez_dynpart_hashjoin_on_agg where id = 0 group by id) a +LEFT OUTER JOIN tez_dynpart_hashjoin_on_agg b +on a.event_id_max = b.eventid +PREHOOK: type: QUERY +PREHOOK: Input: default@tez_dynpart_hashjoin_on_agg +#### A masked pattern was here #### +POSTHOOK: query: explain select a.id, b.outcome from (select id, max(eventid) as event_id_max from tez_dynpart_hashjoin_on_agg where id = 0 group by id) a +LEFT OUTER JOIN tez_dynpart_hashjoin_on_agg b +on a.event_id_max = b.eventid +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tez_dynpart_hashjoin_on_agg +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Map 5 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tez_dynpart_hashjoin_on_agg + filterExpr: (id = 0) (type: boolean) + Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (id = 0) (type: boolean) + Statistics: Num rows: 2 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: eventid (type: int) + outputColumnNames: _col1 + Statistics: Num rows: 2 Data size: 
16 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(_col1) + keys: true (type: boolean) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: int) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: b + filterExpr: (eventid is not null and (eventid BETWEEN DynamicValue(RS_11_tez_dynpart_hashjoin_on_agg__col1_min) AND DynamicValue(RS_11_tez_dynpart_hashjoin_on_agg__col1_max) and in_bloom_filter(eventid, DynamicValue(RS_11_tez_dynpart_hashjoin_on_agg__col1_bloom_filter)))) (type: boolean) + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((eventid BETWEEN DynamicValue(RS_11_tez_dynpart_hashjoin_on_agg__col1_min) AND DynamicValue(RS_11_tez_dynpart_hashjoin_on_agg__col1_max) and in_bloom_filter(eventid, DynamicValue(RS_11_tez_dynpart_hashjoin_on_agg__col1_bloom_filter))) and eventid is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: outcome (type: string), eventid (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int) + sort order: + + Map-reduce partition columns: _col1 (type: int) + Statistics: Num rows: 500 Data size: 47500 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + 
Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1) + minReductionHashAggr: 0.0 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col1 (type: int) + outputColumnNames: _col1 + Statistics: Num rows: 2 Data size: 182 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 0 (type: int), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select a.id, b.outcome from (select id, max(eventid) as event_id_max from tez_dynpart_hashjoin_on_agg where id = 0 group by id) a +LEFT OUTER JOIN tez_dynpart_hashjoin_on_agg b +on a.event_id_max = b.eventid +PREHOOK: type: QUERY +PREHOOK: Input: default@tez_dynpart_hashjoin_on_agg +#### A masked pattern was here #### +POSTHOOK: query: select a.id, b.outcome from (select id, max(eventid) as event_id_max from tez_dynpart_hashjoin_on_agg where id = 0 group by id) a +LEFT OUTER JOIN tez_dynpart_hashjoin_on_agg b +on a.event_id_max = b.eventid +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tez_dynpart_hashjoin_on_agg +#### A masked pattern was here #### +0 val_0 +0 val_0 +0 val_0