diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java index 09e0fc1..e468573 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java @@ -19,9 +19,9 @@ import java.util.ArrayList; import java.util.BitSet; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -213,15 +213,16 @@ public RelOptPredicateList getPredicates(Aggregate agg, RelMetadataQuery mq) { public RelOptPredicateList getPredicates(Union union, RelMetadataQuery mq) { RexBuilder rB = union.getCluster().getRexBuilder(); - Map finalPreds = new LinkedHashMap<>(); - Map finalResidualPreds = new LinkedHashMap<>(); + Map finalPreds = new HashMap<>(); + List finalResidualPreds = new ArrayList<>(); for (int i = 0; i < union.getInputs().size(); i++) { RelNode input = union.getInputs().get(i); RelOptPredicateList info = mq.getPulledUpPredicates(input); if (info.pulledUpPredicates.isEmpty()) { return RelOptPredicateList.EMPTY; } - Map preds = new LinkedHashMap<>(); + Map preds = new HashMap<>(); + List residualPreds = new ArrayList<>(); for (RexNode pred : info.pulledUpPredicates) { final String predString = pred.toString(); if (i == 0) { @@ -231,21 +232,28 @@ public RelOptPredicateList getPredicates(Union union, RelMetadataQuery mq) { if (finalPreds.containsKey(predString)) { preds.put(predString, pred); } else { - finalResidualPreds.put(predString, pred); + residualPreds.add(pred); } } + // Add new residual preds + finalResidualPreds.add(RexUtil.composeConjunction(rB, residualPreds, false)); // Add those that are not part of the final set to residual for (Entry e : finalPreds.entrySet()) { if (!preds.containsKey(e.getKey())) { - finalResidualPreds.put(e.getKey(), e.getValue()); + // This node was in previous union inputs, but it is not in this one + for (int j = 0; j < i; j++) { + finalResidualPreds.set(j, RexUtil.composeConjunction(rB, Lists.newArrayList( + finalResidualPreds.get(j), e.getValue()), false)); + } } } + // Final preds finalPreds = preds; } List preds = new ArrayList<>(finalPreds.values()); - RexNode disjPred = RexUtil.composeDisjunction(rB, finalResidualPreds.values(), true); - if (disjPred != null) { + RexNode disjPred = RexUtil.composeDisjunction(rB, finalResidualPreds, false); + if (!disjPred.isAlwaysTrue()) { preds.add(disjPred); } return RelOptPredicateList.of(preds); diff --git ql/src/test/queries/clientpositive/union37.q ql/src/test/queries/clientpositive/union37.q new file mode 100644 index 0000000..23c130c --- /dev/null +++ ql/src/test/queries/clientpositive/union37.q @@ -0,0 +1,125 @@ +create table l_test1 (id bigint,val string,trans_date string) row format delimited fields terminated by ' ' ; +insert into l_test1 values (1, "table_1", "2016-08-11"); + +create table l_test2 (id bigint,val string,trans_date string) row format delimited fields terminated by ' ' ; +insert into l_test2 values (2, "table_2", "2016-08-11"); + +explain +select + id, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + val, + trans_date +from l_test2 ; + +select + id, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + val, + trans_date +from l_test2 ; + +explain +select + id, + 999, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + val, + trans_date +from l_test2 ; + +select + id, + 999, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + val, + trans_date +from l_test2 ; + +explain +select + id, + 999, + 666, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + 666, + val, + trans_date +from l_test2 ; + +select + id, + 999, + 666, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + 666, + val, + trans_date +from l_test2 ; + +explain +select + id, + 999, + 'table_1' , + trans_date, + '2016-11-11' +from l_test1 +union all +select + id, + 999, + val, + trans_date, + trans_date +from l_test2 ; + +select + id, + 999, + 'table_1' , + trans_date, + '2016-11-11' +from l_test1 +union all +select + id, + 999, + val, + trans_date, + trans_date +from l_test2 ; diff --git ql/src/test/results/clientpositive/union37.q.out ql/src/test/results/clientpositive/union37.q.out new file mode 100644 index 0000000..e8a6f1d --- /dev/null +++ ql/src/test/results/clientpositive/union37.q.out @@ -0,0 +1,522 @@ +PREHOOK: query: create table l_test1 (id bigint,val string,trans_date string) row format delimited fields terminated by ' ' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@l_test1 +POSTHOOK: query: create table l_test1 (id bigint,val string,trans_date string) row format delimited fields terminated by ' ' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@l_test1 +PREHOOK: query: insert into l_test1 values (1, "table_1", "2016-08-11") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@l_test1 +POSTHOOK: query: insert into l_test1 values (1, "table_1", "2016-08-11") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@l_test1 +POSTHOOK: Lineage: l_test1.id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: l_test1.trans_date SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: l_test1.val SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: create table l_test2 (id bigint,val string,trans_date string) row format delimited fields terminated by ' ' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@l_test2 +POSTHOOK: query: create table l_test2 (id bigint,val string,trans_date string) row format delimited fields terminated by ' ' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@l_test2 +PREHOOK: query: insert into l_test2 values (2, "table_2", "2016-08-11") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@l_test2 +POSTHOOK: query: insert into l_test2 values (2, "table_2", "2016-08-11") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__2 +POSTHOOK: Output: default@l_test2 +POSTHOOK: Lineage: l_test2.id EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: l_test2.trans_date SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: l_test2.val SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain +select + id, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + val, + trans_date +from l_test2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + id, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + val, + trans_date +from l_test2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: l_test1 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), 'table_1' (type: string), trans_date (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: l_test2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), val (type: string), trans_date (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + id, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + val, + trans_date +from l_test2 +PREHOOK: type: QUERY +PREHOOK: Input: default@l_test1 +PREHOOK: Input: default@l_test2 +#### A masked pattern was here #### +POSTHOOK: query: select + id, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + val, + trans_date +from l_test2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@l_test1 +POSTHOOK: Input: default@l_test2 +#### A masked pattern was here #### +1 table_1 2016-08-11 +2 table_2 2016-08-11 +PREHOOK: query: explain +select + id, + 999, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + val, + trans_date +from l_test2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + id, + 999, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + val, + trans_date +from l_test2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: l_test1 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), 'table_1' (type: string), trans_date (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), 999 (type: int), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: l_test2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), val (type: string), trans_date (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), 999 (type: int), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + id, + 999, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + val, + trans_date +from l_test2 +PREHOOK: type: QUERY +PREHOOK: Input: default@l_test1 +PREHOOK: Input: default@l_test2 +#### A masked pattern was here #### +POSTHOOK: query: select + id, + 999, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + val, + trans_date +from l_test2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@l_test1 +POSTHOOK: Input: default@l_test2 +#### A masked pattern was here #### +1 999 table_1 2016-08-11 +2 999 table_2 2016-08-11 +PREHOOK: query: explain +select + id, + 999, + 666, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + 666, + val, + trans_date +from l_test2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + id, + 999, + 666, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + 666, + val, + trans_date +from l_test2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: l_test1 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), 'table_1' (type: string), trans_date (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), 999 (type: int), 666 (type: int), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: l_test2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), val (type: string), trans_date (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), 999 (type: int), 666 (type: int), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + id, + 999, + 666, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + 666, + val, + trans_date +from l_test2 +PREHOOK: type: QUERY +PREHOOK: Input: default@l_test1 +PREHOOK: Input: default@l_test2 +#### A masked pattern was here #### +POSTHOOK: query: select + id, + 999, + 666, + 'table_1' , + trans_date +from l_test1 +union all +select + id, + 999, + 666, + val, + trans_date +from l_test2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@l_test1 +POSTHOOK: Input: default@l_test2 +#### A masked pattern was here #### +1 999 666 table_1 2016-08-11 +2 999 666 table_2 2016-08-11 +PREHOOK: query: explain +select + id, + 999, + 'table_1' , + trans_date, + '2016-11-11' +from l_test1 +union all +select + id, + 999, + val, + trans_date, + trans_date +from l_test2 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + id, + 999, + 'table_1' , + trans_date, + '2016-11-11' +from l_test1 +union all +select + id, + 999, + val, + trans_date, + trans_date +from l_test2 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: l_test1 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), 'table_1' (type: string), trans_date (type: string), '2016-11-11' (type: string) + outputColumnNames: _col0, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), 999 (type: int), _col2 (type: string), _col3 (type: string), _col4 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TableScan + alias: l_test2 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: bigint), val (type: string), trans_date (type: string), trans_date (type: string) + outputColumnNames: _col0, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Union + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), 999 (type: int), _col2 (type: string), _col3 (type: string), _col4 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + id, + 999, + 'table_1' , + trans_date, + '2016-11-11' +from l_test1 +union all +select + id, + 999, + val, + trans_date, + trans_date +from l_test2 +PREHOOK: type: QUERY +PREHOOK: Input: default@l_test1 +PREHOOK: Input: default@l_test2 +#### A masked pattern was here #### +POSTHOOK: query: select + id, + 999, + 'table_1' , + trans_date, + '2016-11-11' +from l_test1 +union all +select + id, + 999, + val, + trans_date, + trans_date +from l_test2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@l_test1 +POSTHOOK: Input: default@l_test2 +#### A masked pattern was here #### +1 999 table_1 2016-08-11 2016-11-11 +2 999 table_2 2016-08-11 2016-08-11