diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java index 85e66d5..9cec6ca 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdPredicates.java @@ -18,13 +18,17 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.stats; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import org.apache.calcite.linq4j.Ord; import org.apache.calcite.plan.RelOptPredicateList; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.core.Union; import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; import org.apache.calcite.rel.metadata.RelMdPredicates; import org.apache.calcite.rel.metadata.RelMetadataProvider; @@ -35,6 +39,7 @@ import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; import org.apache.calcite.rex.RexPermuteInputsShuttle; +import org.apache.calcite.rex.RexUtil; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.util.BuiltInMethod; import org.apache.calcite.util.ImmutableBitSet; @@ -126,4 +131,77 @@ public RelOptPredicateList getPredicates(Project project, RelMetadataQuery mq) { return RelOptPredicateList.of(projectPullUpPredicates); }
+  /**
+   * Infers predicates for a Union.
+   *
+   * <p>A predicate is pulled up unchanged only when every input of the union
+   * guarantees it; predicates of different inputs are matched by their string
+   * form. The predicates specific to an input are AND-ed together, and the
+   * per-input conjunctions are OR-ed into one extra predicate. That disjunction
+   * is emitted only when every input contributes at least one input-specific
+   * predicate; otherwise rows coming from the input that only guarantees the
+   * common predicates could violate it.
+   *
+   * <p>Example: inputs with predicates {a, b} and {a, c} yield {a, OR(b, c)},
+   * whereas inputs with predicates {a, b} and {a} yield just {a}.
+   *
+   * @param union union whose predicates are inferred
+   * @param mq metadata query used to pull up the predicates of each input
+   * @return predicates satisfied by every row of the union;
+   *         {@link RelOptPredicateList#EMPTY} if any input has no predicates
+   */
+  @Override
+  public RelOptPredicateList getPredicates(Union union, RelMetadataQuery mq) {
+    final RexBuilder rB = union.getCluster().getRexBuilder();
+
+    // Predicates of each input, keyed by their string form so that equivalent
+    // expressions pulled up from different inputs can be matched
+    final List<Map<String, RexNode>> inputPreds = new ArrayList<>();
+    // Predicates guaranteed by every input visited so far
+    final Map<String, RexNode> commonPreds = new LinkedHashMap<>();
+    for (RelNode input : union.getInputs()) {
+      final RelOptPredicateList info = mq.getPulledUpPredicates(input);
+      if (info.pulledUpPredicates.isEmpty()) {
+        // Nothing is known about this input, hence nothing holds for the union
+        return RelOptPredicateList.EMPTY;
+      }
+      final Map<String, RexNode> preds = new LinkedHashMap<>();
+      for (RexNode pred : info.pulledUpPredicates) {
+        preds.put(pred.toString(), pred);
+      }
+      if (inputPreds.isEmpty()) {
+        commonPreds.putAll(preds);
+      } else {
+        commonPreds.keySet().retainAll(preds.keySet());
+      }
+      inputPreds.add(preds);
+    }
+
+    // One disjunct per input: the conjunction of its non-common predicates
+    final Map<String, RexNode> disjuncts = new LinkedHashMap<>();
+    for (Map<String, RexNode> preds : inputPreds) {
+      final List<RexNode> residual = new ArrayList<>();
+      for (Entry<String, RexNode> e : preds.entrySet()) {
+        if (!commonPreds.containsKey(e.getKey())) {
+          residual.add(e.getValue());
+        }
+      }
+      if (residual.isEmpty()) {
+        // This input guarantees nothing beyond the common predicates, so the
+        // disjunction would be trivially true: do not emit it
+        disjuncts.clear();
+        break;
+      }
+      final RexNode conj = RexUtil.composeConjunction(rB, residual, false);
+      disjuncts.put(conj.toString(), conj);
+    }
+
+    final List<RexNode> unionPreds = new ArrayList<>(commonPreds.values());
+    final RexNode disjPred = RexUtil.composeDisjunction(rB, disjuncts.values(), true);
+    if (disjPred != null) {
+      unionPreds.add(disjPred);
+    }
+    return RelOptPredicateList.of(unionPreds);
+  }
+
 }
diff --git ql/src/test/results/clientpositive/input26.q.out ql/src/test/results/clientpositive/input26.q.out index 74035c5..70f7f65 100644 --- ql/src/test/results/clientpositive/input26.q.out +++ ql/src/test/results/clientpositive/input26.q.out @@ -44,8 +44,8 @@ STAGE PLANS: Number of rows: 5 Statistics: Num rows: 5 Data size: 50 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: _col0 (type: string), _col1 (type: string), '2008-04-08' (type: string), '11' (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 + expressions: _col0 (type: string), _col1 (type: string), '11' (type: string) + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 5 Data size: 50
Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false @@ -60,23 +60,31 @@ STAGE PLANS: TableScan Union Statistics: Num rows: 6 Data size: 50 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: _col0 (type: string), _col1 (type: string), '2008-04-08' (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 Statistics: Num rows: 6 Data size: 50 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 50 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe TableScan Union Statistics: Num rows: 6 Data size: 50 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: _col0 (type: string), _col1 (type: string), '2008-04-08' (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 Statistics: Num rows: 6 Data size: 50 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 6 Data size: 50 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-3 Map 
Reduce @@ -108,8 +116,8 @@ STAGE PLANS: Number of rows: 5 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE Select Operator - expressions: _col0 (type: string), _col1 (type: string), '2008-04-08' (type: string), '14' (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 + expressions: _col0 (type: string), _col1 (type: string), '14' (type: string) + outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE File Output Operator compressed: false diff --git ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition.q.out ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition.q.out index ce86640..faa3adb 100644 --- ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition.q.out +++ ql/src/test/results/clientpositive/llap/tez_union_dynamic_partition.q.out @@ -66,17 +66,21 @@ STAGE PLANS: alias: dummy Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: 1 (type: int), '2014' (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false + expressions: 1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), '2014' (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.partunion1 + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.partunion1 Execution mode: llap LLAP IO: no inputs Map 3 @@ -85,17 +89,21 @@ STAGE PLANS: alias: dummy Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: 2 (type: int), '2014' (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false + expressions: 2 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), '2014' (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.partunion1 + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.partunion1 Execution mode: llap LLAP IO: no inputs Union 2 diff --git ql/src/test/results/clientpositive/tez/tez_union_dynamic_partition.q.out ql/src/test/results/clientpositive/tez/tez_union_dynamic_partition.q.out index a1221a4..83c6c82 100644 --- ql/src/test/results/clientpositive/tez/tez_union_dynamic_partition.q.out +++ ql/src/test/results/clientpositive/tez/tez_union_dynamic_partition.q.out @@ -66,34 +66,42 @@ STAGE PLANS: alias: dummy Statistics: Num rows: 1 Data size: 1 
Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: 1 (type: int), '2014' (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false + expressions: 1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), '2014' (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.partunion1 + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.partunion1 Map 3 Map Operator Tree: TableScan alias: dummy Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: 2 (type: int), '2014' (type: string) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false + expressions: 2 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), '2014' (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.partunion1 + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.partunion1 Union 2 Vertex: Union 2 diff --git ql/src/test/results/clientpositive/unionall_unbalancedppd.q.out ql/src/test/results/clientpositive/unionall_unbalancedppd.q.out index 5e166e6..ba3a0b8 100644 --- ql/src/test/results/clientpositive/unionall_unbalancedppd.q.out +++ ql/src/test/results/clientpositive/unionall_unbalancedppd.q.out @@ -386,18 +386,20 @@ STAGE PLANS: predicate: (f1 = 1) (type: boolean) Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: 1 (type: int) - outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 3 Basic stats: COMPLETE Column stats: NONE Union Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: 1 (type: int) + outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe TableScan alias: union_all_bug_test_2 
Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE @@ -405,18 +407,20 @@ STAGE PLANS: predicate: (f1 = 1) (type: boolean) Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: 1 (type: int) - outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: NONE Union Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false + Select Operator + expressions: 1 (type: int) + outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 4 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator