diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 1663b88..4b98230 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -18,8 +18,13 @@
 package org.apache.hadoop.hive.ql.optimizer.stats.annotation;

-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
+import java.lang.reflect.Field;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Stack;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -72,14 +77,8 @@
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;

-import java.lang.reflect.Field;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.Stack;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;

 public class StatsRulesProcFactory {
@@ -259,7 +258,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,

       // evaluate filter expression and update statistics
       long newNumRows = evaluateExpression(parentStats, pred, aspCtx,
-          neededCols, fop);
+          neededCols, fop, 0);
       Statistics st = parentStats.clone();

       if (satisfyPrecondition(parentStats)) {
@@ -297,7 +296,7 @@

     private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
         AnnotateStatsProcCtx aspCtx, List neededCols,
-        FilterOperator fop) throws CloneNotSupportedException {
+        FilterOperator fop, long evaluatedRowCount) throws CloneNotSupportedException {

       long newNumRows = 0;
       Statistics andStats = null;
@@ -316,7 +315,7 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
         // evaluate children
         for (ExprNodeDesc child : genFunc.getChildren()) {
           newNumRows = evaluateChildExpr(aspCtx.getAndExprStats(), child,
-              aspCtx, neededCols, fop);
+              aspCtx, neededCols, fop, evaluatedRowCount);
           if (satisfyPrecondition(aspCtx.getAndExprStats())) {
             updateStats(aspCtx.getAndExprStats(), newNumRows, true, fop);
           } else {
@@ -324,17 +323,25 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
           }
         }
       } else if (udf instanceof GenericUDFOPOr) {
-        // for OR condition independently compute and update stats
-        for (ExprNodeDesc child : genFunc.getChildren()) {
-          newNumRows = StatsUtils.safeAdd(
-              evaluateChildExpr(stats, child, aspCtx, neededCols, fop), newNumRows);
+        // for OR condition independently compute and update stats.
+        // reverse children to avoid left deep tree evaluation.
+        for (ExprNodeDesc child : Lists.reverse(genFunc.getChildren())) {
+          // early exit if OR evaluation yields more rows than input rows
+          if (evaluatedRowCount >= stats.getNumRows()) {
+            evaluatedRowCount = stats.getNumRows();
+          } else {
+            newNumRows = StatsUtils.safeAdd(
+                evaluateChildExpr(stats, child, aspCtx, neededCols, fop, evaluatedRowCount),
+                newNumRows);
+            evaluatedRowCount = StatsUtils.safeAdd(evaluatedRowCount, newNumRows);
+          }
         }
       } else if (udf instanceof GenericUDFOPNot) {
         newNumRows = evaluateNotExpr(stats, pred, aspCtx, neededCols, fop);
       } else {
         // single predicate condition
-        newNumRows = evaluateChildExpr(stats, pred, aspCtx, neededCols, fop);
+        newNumRows = evaluateChildExpr(stats, pred, aspCtx, neededCols, fop, evaluatedRowCount);
       }
     } else if (pred instanceof ExprNodeColumnDesc) {
@@ -381,7 +388,7 @@ private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred,
         long newNumRows = 0;
         for (ExprNodeDesc child : genFunc.getChildren()) {
           newNumRows = evaluateChildExpr(stats, child, aspCtx, neededCols,
-              fop);
+              fop, 0);
         }
         return numRows - newNumRows;
       } else if (leaf instanceof ExprNodeConstantDesc) {
@@ -439,7 +446,7 @@ private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred) {

     private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
         AnnotateStatsProcCtx aspCtx, List neededCols,
-        FilterOperator fop) throws CloneNotSupportedException {
+        FilterOperator fop, long evaluatedRowCount) throws CloneNotSupportedException {

       long numRows = stats.getNumRows();
@@ -525,7 +532,7 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
           return evaluateColEqualsNullExpr(stats, genFunc);
         } else if (udf instanceof GenericUDFOPAnd || udf instanceof GenericUDFOPOr
            || udf instanceof GenericUDFOPNot) {
-          return evaluateExpression(stats, genFunc, aspCtx, neededCols, fop);
+          return evaluateExpression(stats, genFunc, aspCtx, neededCols, fop, evaluatedRowCount);
         }
       }
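In short, the patch threads a running evaluatedRowCount through evaluateExpression and evaluateChildExpr so that evaluation of a wide OR can stop as soon as the accumulated estimate already covers every input row, and it walks the OR's children in reverse, presumably so recursion starts at the shallow end of the parser's left-deep tree. Below is a minimal standalone sketch of that short-circuit idea; Expr and estimateRows are illustrative names, not Hive's API, and the accumulation is simplified relative to the patch:

// Standalone sketch of the early-exit idea above; illustrative only.
// "Expr" and "estimateRows" are invented names, not Hive's API.
import java.util.Arrays;
import java.util.List;

class OrShortCircuitSketch {

  /** A child predicate that can report an estimated surviving row count. */
  interface Expr {
    long estimateRows(long inputRows);
  }

  /** Saturating add, standing in for StatsUtils.safeAdd. */
  static long safeAdd(long a, long b) {
    long r = a + b;
    // saturate instead of wrapping on signed overflow
    return ((a ^ r) & (b ^ r)) < 0 ? Long.MAX_VALUE : r;
  }

  /**
   * Estimate rows selected by an OR: sum the independently estimated child
   * row counts, walking the children in reverse (mirroring the patch's
   * Lists.reverse over the left-deep OR tree), and stop as soon as the
   * running total reaches the input row count, since a filter can never
   * select more rows than it was given.
   */
  static long estimateOr(List<Expr> children, long inputRows) {
    long evaluatedRowCount = 0;
    for (int i = children.size() - 1; i >= 0; i--) {
      if (evaluatedRowCount >= inputRows) {
        return inputRows; // early exit: the estimate is already capped
      }
      evaluatedRowCount = safeAdd(evaluatedRowCount, children.get(i).estimateRows(inputRows));
    }
    return Math.min(evaluatedRowCount, inputRows);
  }

  public static void main(String[] args) {
    // two disjuncts, each estimated to pass ~1/10th of 2098 input rows
    List<Expr> disjuncts = Arrays.asList(n -> n / 10, n -> n / 10);
    System.out.println(estimateOr(disjuncts, 2098L)); // prints 418
  }
}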
diff --git a/ql/src/test/queries/clientpositive/annotate_stats_deep_filters.q b/ql/src/test/queries/clientpositive/annotate_stats_deep_filters.q
new file mode 100644
index 0000000..c027532
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/annotate_stats_deep_filters.q
@@ -0,0 +1,66 @@
+create table over1k(
+t tinyint,
+si smallint,
+i int,
+b bigint,
+f float,
+d double,
+bo boolean,
+s string,
+ts timestamp,
+dec decimal(4,2),
+bin binary)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
+STORED AS TEXTFILE;
+
+load data local inpath '../../data/files/over1k' overwrite into table over1k;
+load data local inpath '../../data/files/over1k' into table over1k;
+
+analyze table over1k compute statistics;
+analyze table over1k compute statistics for columns;
+
+set hive.stats.fetch.column.stats=true;
+explain select count(*) from over1k where (
+(t=1 and si=2)
+or (t=2 and si=3)
+or (t=3 and si=4)
+or (t=4 and si=5)
+or (t=5 and si=6)
+or (t=6 and si=7)
+or (t=7 and si=8)
+or (t=9 and si=10)
+or (t=10 and si=11)
+or (t=11 and si=12)
+or (t=12 and si=13)
+or (t=13 and si=14)
+or (t=14 and si=15)
+or (t=15 and si=16)
+or (t=16 and si=17)
+or (t=17 and si=18)
+or (t=27 and si=28)
+or (t=37 and si=38)
+or (t=47 and si=48)
+or (t=52 and si=53));
+
+set hive.stats.fetch.column.stats=false;
+explain select count(*) from over1k where (
+(t=1 and si=2)
+or (t=2 and si=3)
+or (t=3 and si=4)
+or (t=4 and si=5)
+or (t=5 and si=6)
+or (t=6 and si=7)
+or (t=7 and si=8)
+or (t=9 and si=10)
+or (t=10 and si=11)
+or (t=11 and si=12)
+or (t=12 and si=13)
+or (t=13 and si=14)
+or (t=14 and si=15)
+or (t=15 and si=16)
+or (t=16 and si=17)
+or (t=17 and si=18)
+or (t=27 and si=28)
+or (t=37 and si=38)
+or (t=47 and si=48)
+or (t=52 and si=53));
\ No newline at end of file
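The matching golden file follows. Assuming the standard Hive itests setup, it can be regenerated with the usual qtest driver invocation (conventional usage, not part of this patch):

cd itests/qtest
mvn test -Dtest=TestCliDriver -Dqfile=annotate_stats_deep_filters.q -Dtest.output.overwrite=true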
diff --git a/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out b/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
new file mode 100644
index 0000000..788d6c8
--- /dev/null
+++ b/ql/src/test/results/clientpositive/annotate_stats_deep_filters.q.out
@@ -0,0 +1,244 @@
+PREHOOK: query: create table over1k(
+t tinyint,
+si smallint,
+i int,
+b bigint,
+f float,
+d double,
+bo boolean,
+s string,
+ts timestamp,
+dec decimal(4,2),
+bin binary)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@over1k
+POSTHOOK: query: create table over1k(
+t tinyint,
+si smallint,
+i int,
+b bigint,
+f float,
+d double,
+bo boolean,
+s string,
+ts timestamp,
+dec decimal(4,2),
+bin binary)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@over1k
+PREHOOK: query: load data local inpath '../../data/files/over1k' overwrite into table over1k
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@over1k
+POSTHOOK: query: load data local inpath '../../data/files/over1k' overwrite into table over1k
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@over1k
+PREHOOK: query: load data local inpath '../../data/files/over1k' into table over1k
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@over1k
+POSTHOOK: query: load data local inpath '../../data/files/over1k' into table over1k
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@over1k
+PREHOOK: query: analyze table over1k compute statistics
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over1k
+PREHOOK: Output: default@over1k
+POSTHOOK: query: analyze table over1k compute statistics
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over1k
+POSTHOOK: Output: default@over1k
+PREHOOK: query: analyze table over1k compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@over1k
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table over1k compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@over1k
+#### A masked pattern was here ####
+PREHOOK: query: explain select count(*) from over1k where (
+(t=1 and si=2)
+or (t=2 and si=3)
+or (t=3 and si=4)
+or (t=4 and si=5)
+or (t=5 and si=6)
+or (t=6 and si=7)
+or (t=7 and si=8)
+or (t=9 and si=10)
+or (t=10 and si=11)
+or (t=11 and si=12)
+or (t=12 and si=13)
+or (t=13 and si=14)
+or (t=14 and si=15)
+or (t=15 and si=16)
+or (t=16 and si=17)
+or (t=17 and si=18)
+or (t=27 and si=28)
+or (t=37 and si=38)
+or (t=47 and si=48)
+or (t=52 and si=53))
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(*) from over1k where (
+(t=1 and si=2)
+or (t=2 and si=3)
+or (t=3 and si=4)
+or (t=4 and si=5)
+or (t=5 and si=6)
+or (t=6 and si=7)
+or (t=7 and si=8)
+or (t=9 and si=10)
+or (t=10 and si=11)
+or (t=11 and si=12)
+or (t=12 and si=13)
+or (t=13 and si=14)
+or (t=14 and si=15)
+or (t=15 and si=16)
+or (t=16 and si=17)
+or (t=17 and si=18)
+or (t=27 and si=28)
+or (t=37 and si=38)
+or (t=47 and si=48)
+or (t=52 and si=53))
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: over1k
+            Statistics: Num rows: 2098 Data size: 211174 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: (((t = 1) and (si = 2)) or (((t = 2) and (si = 3)) or (((t = 3) and (si = 4)) or (((t = 4) and (si = 5)) or (((t = 5) and (si = 6)) or (((t = 6) and (si = 7)) or (((t = 7) and (si = 8)) or (((t = 9) and (si = 10)) or (((t = 10) and (si = 11)) or (((t = 11) and (si = 12)) or (((t = 12) and (si = 13)) or (((t = 13) and (si = 14)) or (((t = 14) and (si = 15)) or (((t = 15) and (si = 16)) or (((t = 16) and (si = 17)) or (((t = 17) and (si = 18)) or (((t = 27) and (si = 28)) or (((t = 37) and (si = 38)) or (((t = 47) and (si = 48)) or ((t = 52) and (si = 53))))))))))))))))))))) (type: boolean)
+              Statistics: Num rows: 280 Data size: 2232 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                Statistics: Num rows: 280 Data size: 2232 Basic stats: COMPLETE Column stats: COMPLETE
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    sort order:
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                    value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select count(*) from over1k where (
+(t=1 and si=2)
+or (t=2 and si=3)
+or (t=3 and si=4)
+or (t=4 and si=5)
+or (t=5 and si=6)
+or (t=6 and si=7)
+or (t=7 and si=8)
+or (t=9 and si=10)
+or (t=10 and si=11)
+or (t=11 and si=12)
+or (t=12 and si=13)
+or (t=13 and si=14)
+or (t=14 and si=15)
+or (t=15 and si=16)
+or (t=16 and si=17)
+or (t=17 and si=18)
+or (t=27 and si=28)
+or (t=37 and si=38)
+or (t=47 and si=48)
+or (t=52 and si=53))
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(*) from over1k where (
+(t=1 and si=2)
+or (t=2 and si=3)
+or (t=3 and si=4)
+or (t=4 and si=5)
+or (t=5 and si=6)
+or (t=6 and si=7)
+or (t=7 and si=8)
+or (t=9 and si=10)
+or (t=10 and si=11)
+or (t=11 and si=12)
+or (t=12 and si=13)
+or (t=13 and si=14)
+or (t=14 and si=15)
+or (t=15 and si=16)
+or (t=16 and si=17)
+or (t=17 and si=18)
+or (t=27 and si=28)
+or (t=37 and si=38)
+or (t=47 and si=48)
+or (t=52 and si=53))
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: over1k
+            Statistics: Num rows: 2098 Data size: 211174 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (((t = 1) and (si = 2)) or (((t = 2) and (si = 3)) or (((t = 3) and (si = 4)) or (((t = 4) and (si = 5)) or (((t = 5) and (si = 6)) or (((t = 6) and (si = 7)) or (((t = 7) and (si = 8)) or (((t = 9) and (si = 10)) or (((t = 10) and (si = 11)) or (((t = 11) and (si = 12)) or (((t = 12) and (si = 13)) or (((t = 13) and (si = 14)) or (((t = 14) and (si = 15)) or (((t = 15) and (si = 16)) or (((t = 16) and (si = 17)) or (((t = 17) and (si = 18)) or (((t = 27) and (si = 28)) or (((t = 37) and (si = 38)) or (((t = 47) and (si = 48)) or ((t = 52) and (si = 53))))))))))))))))))))) (type: boolean)
+              Statistics: Num rows: 2098 Data size: 211174 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                Statistics: Num rows: 2098 Data size: 211174 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    sort order:
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
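Reading the two plans together: with hive.stats.fetch.column.stats=true, the Filter Operator estimate drops from 2098 input rows to 280, i.e. 280 / 20 = 14 rows per disjunct on average, consistent with summing the twenty independently estimated (t, si) equality conjuncts via StatsUtils.safeAdd. With column stats off, the filter row count stays at the full 2098, since there are no NDVs to derive selectivity from. In both modes the estimate is bounded by the input row count, which is exactly the cap the new early exit exploits so that very wide OR predicates no longer require every disjunct to be evaluated.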