diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index b4e89b0..4ea125b 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1748,16 +1748,21 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
     // in the absence of column statistics, the estimated number of rows/data size that will
     // be emitted from join operator will depend on this factor
     HIVE_STATS_JOIN_FACTOR("hive.stats.join.factor", (float) 1.1,
-        "Hive/Tez optimizer estimates the data size flowing through each of the operators. JOIN operator\n" +
+        "Hive/Tez/Spark optimizer estimates the data size flowing through each of the operators. JOIN operator\n" +
         "uses column statistics to estimate the number of rows flowing out of it and hence the data size.\n" +
         "In the absence of column statistics, this factor determines the amount of rows that flows out\n" +
         "of JOIN operator."),
+    HIVE_STATS_FILTER_PREDICATE_FACTOR("hive.stats.filter.predicate.factor", (float) 0.5,
+        "Hive/Tez/Spark optimizer estimates the data size flowing through each of the operators.\n" +
+        "In the absence of column statistics, this factor determines the number of rows that flow\n" +
+        "out of a predicate expression in a filter operator."),
+
     // in the absence of uncompressed/raw data size, total file size will be used for statistics
     // annotation. But the file may be compressed, encoded and serialized which may be lesser in size
     // than the actual uncompressed/raw data size. This factor will be multiplied to file size to estimate
     // the raw data size.
     HIVE_STATS_DESERIALIZATION_FACTOR("hive.stats.deserialization.factor", (float) 1.0,
-        "Hive/Tez optimizer estimates the data size flowing through each of the operators. In the absence\n" +
+        "Hive/Tez/Spark optimizer estimates the data size flowing through each of the operators. In the absence\n" +
         "of basic statistics like number of rows and data size, file size is used to estimate the number\n" +
         "of rows and data size. Since files in tables/partitions are serialized (and optionally\n" +
         "compressed) the estimates of number of rows and data size cannot be reliably determined.\n" +
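Note on usage (not part of the patch): the new factor defaults to 0.5, which preserves the old "half the input rows" behaviour, while lowering it makes filters without column statistics look more selective to the planner. Below is a minimal sketch of reading and overriding the knob through the same HiveConf accessors the patch uses; the class name and row count are hypothetical.

    import org.apache.hadoop.hive.conf.HiveConf;

    public class FilterFactorSketch {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();

        // Default is 0.5, matching the old hard-coded numRows / 2 estimate.
        float defaultFactor = conf.getFloatVar(HiveConf.ConfVars.HIVE_STATS_FILTER_PREDICATE_FACTOR);

        // Equivalent to 'set hive.stats.filter.predicate.factor=0.25;' in a session.
        conf.setFloatVar(HiveConf.ConfVars.HIVE_STATS_FILTER_PREDICATE_FACTOR, 0.25f);

        long inputRows = 1_000_000L;
        long estimate = (long) (inputRows
            * conf.getFloatVar(HiveConf.ConfVars.HIVE_STATS_FILTER_PREDICATE_FACTOR));
        System.out.println(defaultFactor + " -> " + estimate); // prints "0.5 -> 250000"
      }
    }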
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
index f0c1e86..8581cfe 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
@@ -56,7 +56,7 @@ public ParseContext transform(ParseContext pctx) throws SemanticException {
     opRules.put(new RuleRegExp("SEL", SelectOperator.getOperatorName() + "%"),
         StatsRulesProcFactory.getSelectRule());
     opRules.put(new RuleRegExp("FIL", FilterOperator.getOperatorName() + "%"),
-        StatsRulesProcFactory.getFilterRule());
+        StatsRulesProcFactory.getFilterRule(aspCtx));
     opRules.put(new RuleRegExp("GBY", GroupByOperator.getOperatorName() + "%"),
         StatsRulesProcFactory.getGroupByRule());
     opRules.put(new RuleRegExp("JOIN", CommonJoinOperator.getOperatorName() + "%|"
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index aa1e509..d837994 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -249,13 +249,19 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
    *
    * <p>
    * Worst case: If no column statistics are available, then evaluation of predicate
-   * expression will assume worst case (i.e; half the input rows) for each of predicate expression.
+   * expression will use {@link HiveConf.ConfVars#HIVE_STATS_FILTER_PREDICATE_FACTOR} to compute
+   * the number of rows emitted for each predicate expression.
    * <p>
    * For more information, refer 'Estimating The Cost Of Operations' chapter in
    * "Database Systems: The Complete Book" by Garcia-Molina et. al.
    * <p>
    */
   public static class FilterStatsRule extends DefaultStatsRule implements NodeProcessor {
+    private final float filterFactor;
+
+    FilterStatsRule(AnnotateStatsProcCtx ctx) {
+      this.filterFactor = ctx.getConf().getFloatVar(HiveConf.ConfVars.HIVE_STATS_FILTER_PREDICATE_FACTOR);
+    }

     @Override
     public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
@@ -381,7 +387,7 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
       }

-      // if not boolean column return half the number of rows
-      return stats.getNumRows() / 2;
+      // if not boolean column, estimate output rows with the configured worst-case factor
+      return worstEstimate(stats.getNumRows());
     } else if (pred instanceof ExprNodeConstantDesc) {

       // special case for handling false constants
@@ -417,7 +423,7 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsPr
       // If column is not column reference , we bail out
       if (!(columnChild instanceof ExprNodeColumnDesc)) {
         // Default
-        return numRows / 2;
+        return worstEstimate(numRows);
       }
       columns.add(columnChild);
       final String columnName = ((ExprNodeColumnDesc)columnChild).getColumn();
@@ -426,7 +432,7 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsPr
       // in filter expression since it will be taken care by partition pruner
       if (neededCols != null && !neededCols.contains(columnName)) {
         // Default
-        return numRows / 2;
+        return worstEstimate(numRows);
       }
       columnStats.add(stats.getColumnStatisticsFromColName(columnName));
       values.add(Sets.newHashSet());
@@ -436,7 +442,7 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsPr
         // If column is not column reference , we bail out
         if (!(columnsChild instanceof ExprNodeColumnDesc)) {
           // Default
-          return numRows / 2;
+          return worstEstimate(numRows);
         }
         columns.add(columnsChild);
         final String columnName = ((ExprNodeColumnDesc)columnsChild).getColumn();
@@ -445,7 +451,7 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsPr
         // in filter expression since it will be taken care by partition pruner
         if (neededCols != null && !neededCols.contains(columnName)) {
           // Default
-          return numRows / 2;
+          return worstEstimate(numRows);
         }
         columnStats.add(stats.getColumnStatisticsFromColName(columnName));
         values.add(Sets.newHashSet());
@@ -458,7 +464,7 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsPr
       // If value is not a constant, we bail out
       if (!(child instanceof ExprNodeConstantDesc)) {
         // Default
-        return numRows / 2;
+        return worstEstimate(numRows);
       }
       if (multiColumn) {
         ExprNodeConstantDesc constantChild = (ExprNodeConstantDesc) child;
@@ -550,13 +556,13 @@ private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred,
           }
         }
-        // if not boolean column return half the number of rows
-        return numRows / 2;
+        // if not boolean column, estimate output rows with the configured worst-case factor
+        return worstEstimate(numRows);
       }
     }
   }

   // worst case
-  return numRows / 2;
+  return worstEstimate(numRows);
 }

 private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred) {
@@ -580,7 +586,7 @@ private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred) {
   }

   // worst case
-  return numRows / 2;
+  return worstEstimate(numRows);
 }

 private long evaluateNotNullExpr(Statistics parentStats, ExprNodeGenericFuncDesc pred) {
@@ -821,8 +827,7 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
       ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) child;
       GenericUDF udf = genFunc.getGenericUDF();

-      if (udf instanceof GenericUDFOPEqual ||
-          udf instanceof GenericUDFOPEqualNS) {
+      if (udf instanceof GenericUDFOPEqual) {
         String colName = null;
         boolean isConst = false;
         Object prevConst = null;
@@ -859,7 +864,7 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
           ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
           if (cs != null) {
             long dvs = cs.getCountDistint();
-            numRows = dvs == 0 ? numRows / 2 : Math.round( (double)numRows / dvs);
+            numRows = dvs == 0 ? worstEstimate(numRows) : Math.round( (double)numRows / dvs);
             return numRows;
           }
         } else if (leaf instanceof ExprNodeColumnDesc) {
@@ -879,7 +884,7 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
           ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
           if (cs != null) {
             long dvs = cs.getCountDistint();
-            numRows = dvs == 0 ? numRows / 2 : Math.round( (double)numRows / dvs);
+            numRows = dvs == 0 ? worstEstimate(numRows) : Math.round( (double)numRows / dvs);
             return numRows;
           }
         }
@@ -904,7 +909,15 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
       }

       // worst case
-      return numRows / 2;
+      return worstEstimate(numRows);
+    }
+
+    /**
+     * Given the input 'numRows', produce a number for the output rows based on
+     * the worst estimate in the case that not enough stats are available.
+     */
+    private long worstEstimate(long numRows) {
+      return (long) (numRows * filterFactor);
+    }
   }
@@ -2184,8 +2197,8 @@ public static NodeProcessor getSelectRule() {
     return new SelectStatsRule();
   }

-  public static NodeProcessor getFilterRule() {
-    return new FilterStatsRule();
+  public static NodeProcessor getFilterRule(AnnotateStatsProcCtx ctx) {
+    return new FilterStatsRule(ctx);
   }

   public static NodeProcessor getGroupByRule() {
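A review-side sketch (not part of the patch) of what worstEstimate() does to the plan: assuming, as the javadoc above states, that the factor is applied once per predicate expression lacking column statistics, a conjunction of k such predicates shrinks the estimate geometrically (numRows * factor^k), where the old code always halved per predicate. Row counts and class name here are hypothetical.

    public class WorstEstimateMath {
      public static void main(String[] args) {
        float filterFactor = 0.5f; // default of hive.stats.filter.predicate.factor
        long numRows = 1_000_000L;

        // Single predicate with no usable stats: same result as the old numRows / 2.
        System.out.println((long) (numRows * filterFactor)); // 500000

        // e.g. WHERE c1 = ? AND c2 = ? AND c3 = ? with no column stats:
        long rows = numRows;
        for (int k = 1; k <= 3; k++) {
          rows = (long) (rows * filterFactor);
          System.out.println("after predicate " + k + ": " + rows);
        }
      }
    }

One small caveat on the implementation: numRows * filterFactor promotes the long operand to float, which can round for row counts above 2^24; multiplying by (double) filterFactor inside worstEstimate() would keep full precision.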