commit 141c92dc677897257c605b7723bb2e0d09cbb59b
Author: Janaki Lahorani
Date:   Wed Jun 13 15:24:03 2018 -0700

    HIVE-19889: Do not push predicates referencing non-deterministic functions

diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveFilterProjectTransposeRule.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveFilterProjectTransposeRule.java
index af2207ff41e1b91fbd9c6f51a76da9cdb1b92934..efe20d9f6c5f4f646e5409d598faba8224b8cac1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveFilterProjectTransposeRule.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveFilterProjectTransposeRule.java
@@ -69,7 +69,13 @@ private HiveFilterProjectTransposeRule(Class filterClass,
   @Override
   public boolean matches(RelOptRuleCall call) {
     final Filter filterRel = call.rel(0);
-    RexNode condition = filterRel.getCondition();
+
+    // The condition fetched here can reference a udf that is not deterministic, but defined
+    // as part of the select list when a view is in play. But the condition after the pushdown
+    // will resolve to using the udf from the select list. The check here for deterministic filters
+    // should be based on the resolved expression. Refer to test case cbo_ppd_non_deterministic.q.
+    RexNode condition = RelOptUtil.pushPastProject(filterRel.getCondition(), call.rel(1));
+
     if (this.onlyDeterministic && !HiveCalciteUtil.isDeterministic(condition)) {
       return false;
     }
diff --git ql/src/test/queries/clientpositive/cbo_ppd_non_deterministic.q ql/src/test/queries/clientpositive/cbo_ppd_non_deterministic.q
new file mode 100644
index 0000000000000000000000000000000000000000..f1a7a63d4aff9e2e55d549ea993a612d9d75001c
--- /dev/null
+++ ql/src/test/queries/clientpositive/cbo_ppd_non_deterministic.q
@@ -0,0 +1,42 @@
+CREATE TABLE `testa`(
+  `col1` string COMMENT '',
+  `col2` string COMMENT '',
+  `col3` string COMMENT '',
+  `col4` string COMMENT '',
+  `col5` string COMMENT '')
+PARTITIONED BY (
+  `part1` string,
+  `part2` string,
+  `part3` string)
+STORED AS AVRO;
+
+insert into testA partition (part1='US', part2='ABC', part3='123')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd');
+
+insert into testA partition (part1='UK', part2='DEF', part3='123')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd');
+
+insert into testA partition (part1='US', part2='DEF', part3='200')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd');
+
+insert into testA partition (part1='CA', part2='ABC', part3='300')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd');
+
+set hive.cbo.enable=true;
+SET hive.vectorized.execution.enabled=false;
+
+explain select * from (
+select part1,randum123
+from (SELECT *, cast(rand() as double) AS randum123 FROM testA where part1='CA' and part2 = 'ABC') a
+where randum123 <= 0.5) s where s.randum123 > 0.25 limit 20;
+
+SET hive.vectorized.execution.enabled=true;
+
+explain select * from (
+select part1,randum123
+from (SELECT *, cast(rand() as double) AS randum123 FROM testA where part1='CA' and part2 = 'ABC') a
+where randum123 <= 0.5) s where s.randum123 > 0.25 limit 20;
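Why the resolved expression matters: the filter sits above a project, so its condition references projected columns (here randum123), and a bare column reference always looks deterministic on its own. Calcite's RelOptUtil.pushPastProject substitutes the project's expressions into the condition, turning randum123 <= 0.5 into rand() <= 0.5, which the determinism check can then reject. The sketch below is a minimal, self-contained model of that substitution; Expr, InputRef, and Call are toy stand-ins invented for illustration, not Calcite's RexNode API.

    // Toy model of RelOptUtil.pushPastProject + a determinism check (not Calcite's real API).
    import java.util.List;

    public class PushPastProjectSketch {

      // An expression is either a reference to a project output column or a call.
      sealed interface Expr permits InputRef, Call {
        boolean isDeterministic();
      }

      // A bare column reference carries no hint that the column is computed by rand().
      record InputRef(int index) implements Expr {
        public boolean isDeterministic() { return true; }
      }

      record Call(String op, boolean deterministicOp, List<Expr> operands) implements Expr {
        public boolean isDeterministic() {
          return deterministicOp && operands.stream().allMatch(Expr::isDeterministic);
        }
      }

      // Substitute the project's expressions for input references, the way
      // RelOptUtil.pushPastProject resolves a filter condition through a Project.
      static Expr pushPastProject(Expr condition, List<Expr> projectExprs) {
        if (condition instanceof InputRef ref) {
          return projectExprs.get(ref.index());
        }
        Call call = (Call) condition;
        return new Call(call.op(), call.deterministicOp(),
            call.operands().stream().map(e -> pushPastProject(e, projectExprs)).toList());
      }

      public static void main(String[] args) {
        // Project list: SELECT ..., rand() AS randum123  ->  column 0 = rand()
        List<Expr> projectExprs = List.of(new Call("rand", false, List.of()));

        // Filter above the project: randum123 <= 0.5, i.e. $0 <= 0.5
        Expr raw = new Call("<=", true,
            List.of(new InputRef(0), new Call("0.5", true, List.of())));

        System.out.println(raw.isDeterministic());          // true: raw check would allow the push
        System.out.println(
            pushPastProject(raw, projectExprs).isDeterministic()); // false: push is rejected
      }
    }

This mirrors what the patched matches() now does: it checks determinism of the resolved condition (rand() <= 0.5), not the raw one, and so refuses to transpose the filter past the project.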
diff --git ql/src/test/results/clientpositive/cbo_ppd_non_deterministic.q.out ql/src/test/results/clientpositive/cbo_ppd_non_deterministic.q.out
new file mode 100644
index 0000000000000000000000000000000000000000..8f00aa8e0401471527e33716a66c156e3c334cb5
--- /dev/null
+++ ql/src/test/results/clientpositive/cbo_ppd_non_deterministic.q.out
@@ -0,0 +1,195 @@
+PREHOOK: query: CREATE TABLE `testa`(
+  `col1` string COMMENT '',
+  `col2` string COMMENT '',
+  `col3` string COMMENT '',
+  `col4` string COMMENT '',
+  `col5` string COMMENT '')
+PARTITIONED BY (
+  `part1` string,
+  `part2` string,
+  `part3` string)
+STORED AS AVRO
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testa
+POSTHOOK: query: CREATE TABLE `testa`(
+  `col1` string COMMENT '',
+  `col2` string COMMENT '',
+  `col3` string COMMENT '',
+  `col4` string COMMENT '',
+  `col5` string COMMENT '')
+PARTITIONED BY (
+  `part1` string,
+  `part2` string,
+  `part3` string)
+STORED AS AVRO
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testa
+PREHOOK: query: insert into testA partition (part1='US', part2='ABC', part3='123')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@testa@part1=US/part2=ABC/part3=123
+POSTHOOK: query: insert into testA partition (part1='US', part2='ABC', part3='123')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@testa@part1=US/part2=ABC/part3=123
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=ABC,part3=123).col1 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=ABC,part3=123).col2 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=ABC,part3=123).col3 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=ABC,part3=123).col4 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=ABC,part3=123).col5 SCRIPT []
+PREHOOK: query: insert into testA partition (part1='UK', part2='DEF', part3='123')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@testa@part1=UK/part2=DEF/part3=123
+POSTHOOK: query: insert into testA partition (part1='UK', part2='DEF', part3='123')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@testa@part1=UK/part2=DEF/part3=123
+POSTHOOK: Lineage: testa PARTITION(part1=UK,part2=DEF,part3=123).col1 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=UK,part2=DEF,part3=123).col2 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=UK,part2=DEF,part3=123).col3 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=UK,part2=DEF,part3=123).col4 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=UK,part2=DEF,part3=123).col5 SCRIPT []
+PREHOOK: query: insert into testA partition (part1='US', part2='DEF', part3='200')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@testa@part1=US/part2=DEF/part3=200
+POSTHOOK: query: insert into testA partition (part1='US', part2='DEF', part3='200')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@testa@part1=US/part2=DEF/part3=200
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=DEF,part3=200).col1 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=DEF,part3=200).col2 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=DEF,part3=200).col3 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=DEF,part3=200).col4 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=US,part2=DEF,part3=200).col5 SCRIPT []
+PREHOOK: query: insert into testA partition (part1='CA', part2='ABC', part3='300')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@testa@part1=CA/part2=ABC/part3=300
+POSTHOOK: query: insert into testA partition (part1='CA', part2='ABC', part3='300')
+values ('12.34', '100', '200', '300', 'abc'),
+('12.341', '1001', '2001', '3001', 'abcd')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@testa@part1=CA/part2=ABC/part3=300
+POSTHOOK: Lineage: testa PARTITION(part1=CA,part2=ABC,part3=300).col1 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=CA,part2=ABC,part3=300).col2 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=CA,part2=ABC,part3=300).col3 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=CA,part2=ABC,part3=300).col4 SCRIPT []
+POSTHOOK: Lineage: testa PARTITION(part1=CA,part2=ABC,part3=300).col5 SCRIPT []
+PREHOOK: query: explain select * from (
+select part1,randum123
+from (SELECT *, cast(rand() as double) AS randum123 FROM testA where part1='CA' and part2 = 'ABC') a
+where randum123 <= 0.5) s where s.randum123 > 0.25 limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select * from (
+select part1,randum123
+from (SELECT *, cast(rand() as double) AS randum123 FROM testA where part1='CA' and part2 = 'ABC') a
+where randum123 <= 0.5) s where s.randum123 > 0.25 limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: testa
+            Statistics: Num rows: 2 Data size: 4580 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: rand() (type: double)
+              outputColumnNames: _col0
+              Statistics: Num rows: 2 Data size: 4580 Basic stats: COMPLETE Column stats: NONE
+              Filter Operator
+                predicate: ((_col0 <= 0.5D) and (_col0 > 0.25D)) (type: boolean)
+                Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: 'CA' (type: string), _col0 (type: double)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
+                    Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select * from (
+select part1,randum123
+from (SELECT *, cast(rand() as double) AS randum123 FROM testA where part1='CA' and part2 = 'ABC') a
+where randum123 <= 0.5) s where s.randum123 > 0.25 limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select * from (
+select part1,randum123
+from (SELECT *, cast(rand() as double) AS randum123 FROM testA where part1='CA' and part2 = 'ABC') a
+where randum123 <= 0.5) s where s.randum123 > 0.25 limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: testa
+            Statistics: Num rows: 2 Data size: 4580 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: rand() (type: double)
+              outputColumnNames: _col0
+              Statistics: Num rows: 2 Data size: 4580 Basic stats: COMPLETE Column stats: NONE
+              Filter Operator
+                predicate: ((_col0 <= 0.5D) and (_col0 > 0.25D)) (type: boolean)
+                Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: 'CA' (type: string), _col0 (type: double)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
+                    Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+      Execution mode: vectorized
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
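What the golden plans verify: in both the row-mode and the vectorized plan, the combined predicate ((_col0 <= 0.5D) and (_col0 > 0.25D)) stays in a Filter Operator above the Select Operator that computes rand(), so rand() is evaluated once per row and both bounds test the same value; the two plans differ only in the "Execution mode: vectorized" marker. Had the rule transposed the filter below the project, each pushed predicate would have re-evaluated rand() independently, silently changing the query's semantics.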