diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java index 3e2311c..5958c36 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java @@ -47,6 +47,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.LinkedHashMultimap; import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; public class HivePreFilteringRule extends RelOptRule { @@ -148,54 +149,67 @@ public void onMatch(RelOptRuleCall call) { assert condition.getKind() == SqlKind.OR; Multimap reductionCondition = LinkedHashMultimap.create(); + // Data structure to control whether a certain reference is present in every operand + Set refsInAllOperands = null; + // 1. We extract the information necessary to create the predicate for the new // filter; currently we support comparison functions, in and between ImmutableList operands = RexUtil.flattenOr(((RexCall) condition).getOperands()); - for (RexNode operand: operands) { + for (int i = 0; i < operands.size(); i++) { + final RexNode operand = operands.get(i); + final RexNode operandCNF = RexUtil.toCnf(rexBuilder, operand); final List conjunctions = RelOptUtil.conjunctions(operandCNF); - boolean addedToReductionCondition = false; // Flag to control whether we have added a new factor - // to the reduction predicate + + Set refsInCurrentOperand = Sets.newHashSet(); for (RexNode conjunction: conjunctions) { + // We do not know what it is, we bail out for safety if (!(conjunction instanceof RexCall)) { - continue; + return new ArrayList<>(); } RexCall conjCall = (RexCall) conjunction; + RexNode ref = null; if(COMPARISON.contains(conjCall.getOperator().getKind())) { if (conjCall.operands.get(0) instanceof RexInputRef && conjCall.operands.get(1) instanceof RexLiteral) { - reductionCondition.put(conjCall.operands.get(0).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(0); } else if (conjCall.operands.get(1) instanceof RexInputRef && conjCall.operands.get(0) instanceof RexLiteral) { - reductionCondition.put(conjCall.operands.get(1).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(1); + } else { + // We do not know what it is, we bail out for safety + return new ArrayList<>(); } } else if(conjCall.getOperator().getKind().equals(SqlKind.IN)) { - reductionCondition.put(conjCall.operands.get(0).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(0); } else if(conjCall.getOperator().getKind().equals(SqlKind.BETWEEN)) { - reductionCondition.put(conjCall.operands.get(1).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(1); + } else { + // We do not know what it is, we bail out for safety + return new ArrayList<>(); } + + String stringRef = ref.toString(); + reductionCondition.put(stringRef, conjCall); + refsInCurrentOperand.add(stringRef); } - // If we did not add any factor, we can bail out - if (!addedToReductionCondition) { + // Updates the references that are present in every operand up till now + if (i == 0) { + refsInAllOperands = refsInCurrentOperand; + } else { + refsInAllOperands = Sets.intersection(refsInAllOperands, refsInCurrentOperand); + } + // If we did not add any factor or there are no common factors, we can bail out + if (refsInAllOperands.isEmpty()) { return new ArrayList<>(); } } // 2. We gather the common factors and return them List commonOperands = new ArrayList<>(); - for (Entry> pair : reductionCondition.asMap().entrySet()) { - if (pair.getValue().size() == operands.size()) { - commonOperands.add(RexUtil.composeDisjunction(rexBuilder, pair.getValue(), false)); - } + for (String ref : refsInAllOperands) { + commonOperands.add(RexUtil.composeDisjunction(rexBuilder, reductionCondition.get(ref), false)); } return commonOperands; } diff --git ql/src/test/queries/clientpositive/filter_cond_pushdown.q ql/src/test/queries/clientpositive/filter_cond_pushdown.q index 5e23b71..2425706 100644 --- ql/src/test/queries/clientpositive/filter_cond_pushdown.q +++ ql/src/test/queries/clientpositive/filter_cond_pushdown.q @@ -17,3 +17,8 @@ JOIN ( JOIN (SELECT * FROM cbo_t3 t3 WHERE c_int=1) t3 ON t2.key=t3.c_int WHERE ((t2.key=t3.key) AND (t2.c_float + t3.c_float > 2)) OR ((t2.key=t3.key) AND (t2.c_int + t3.c_int > 2))) t4 ON t1.key=t4.key; + +EXPLAIN +SELECT f.key, f.value, m.value +FROM src f JOIN src m ON(f.key = m.key AND m.value is not null AND m.value !='') +WHERE (f.value IN ('2008-04-08','2008-04-10') AND f.value IN ('2008-04-08','2008-04-09') AND m.value='2008-04-10') OR (m.value='2008-04-08'); diff --git ql/src/test/results/clientpositive/filter_cond_pushdown.q.out ql/src/test/results/clientpositive/filter_cond_pushdown.q.out index af42d5c..99eb3f7 100644 --- ql/src/test/results/clientpositive/filter_cond_pushdown.q.out +++ ql/src/test/results/clientpositive/filter_cond_pushdown.q.out @@ -380,3 +380,83 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: EXPLAIN +SELECT f.key, f.value, m.value +FROM src f JOIN src m ON(f.key = m.key AND m.value is not null AND m.value !='') +WHERE (f.value IN ('2008-04-08','2008-04-10') AND f.value IN ('2008-04-08','2008-04-09') AND m.value='2008-04-10') OR (m.value='2008-04-08') +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT f.key, f.value, m.value +FROM src f JOIN src m ON(f.key = m.key AND m.value is not null AND m.value !='') +WHERE (f.value IN ('2008-04-08','2008-04-10') AND f.value IN ('2008-04-08','2008-04-09') AND m.value='2008-04-10') OR (m.value='2008-04-08') +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: f + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + TableScan + alias: f + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((((value = '2008-04-10') or (value = '2008-04-08')) and value is not null) and (value <> '')) and key is not null) (type: boolean) + Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col1) IN ('2008-04-08', '2008-04-10') and (_col1) IN ('2008-04-08', '2008-04-09') and (_col3 = '2008-04-10')) or (_col3 = '2008-04-08')) (type: boolean) + Statistics: Num rows: 171 Data size: 1816 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 171 Data size: 1816 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 171 Data size: 1816 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +