diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index ab07fb6..aa1e509 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -71,6 +71,7 @@ import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; @@ -89,6 +90,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -354,6 +356,9 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred, } else if (udf instanceof GenericUDFIn) { // for IN clause newNumRows = evaluateInExpr(stats, pred, aspCtx, neededCols, fop); + } else if (udf instanceof GenericUDFBetween) { + // for BETWEEN clause + newNumRows = evaluateBetweenExpr(stats, pred, aspCtx, neededCols, fop); } else if (udf instanceof GenericUDFOPNot) { newNumRows = evaluateNotExpr(stats, pred, aspCtx, neededCols, fop); } else if (udf instanceof GenericUDFOPNotNull) { @@ -480,6 +485,32 @@ private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsPr return Math.round( (double)numRows * factor * inFactor); } + private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx, + List neededCols, FilterOperator fop) throws SemanticException, CloneNotSupportedException { + final ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) pred; + final boolean invert = Boolean.TRUE.equals( + ((ExprNodeConstantDesc) fd.getChildren().get(0)).getValue()); // boolean invert (not) + final ExprNodeDesc comparisonExpression = fd.getChildren().get(1); // expression + final ExprNodeDesc leftExpression = fd.getChildren().get(2); // left expression + final ExprNodeDesc rightExpression = fd.getChildren().get(3); // right expression + + // We transform the BETWEEN clause to AND clause (with NOT on top in invert is true). + // This is more straightforward, as the evaluateExpression method will deal with + // generating the final row count relying on the basic comparator evaluation methods + final ExprNodeDesc leftComparator = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + new GenericUDFOPEqualOrGreaterThan(), Lists.newArrayList(comparisonExpression, leftExpression)); + final ExprNodeDesc rightComparator = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + new GenericUDFOPEqualOrLessThan(), Lists.newArrayList(comparisonExpression, rightExpression)); + ExprNodeDesc newExpression = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + new GenericUDFOPAnd(), Lists.newArrayList(leftComparator, rightComparator)); + if (invert) { + newExpression = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + new GenericUDFOPNot(), Lists.newArrayList(newExpression)); + } + + return evaluateExpression(stats, newExpression, aspCtx, neededCols, fop, 0); + } + private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx, List neededCols, FilterOperator fop) throws CloneNotSupportedException, SemanticException { @@ -866,7 +897,8 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, } else if (udf instanceof GenericUDFOPNull) { return evaluateColEqualsNullExpr(stats, genFunc); } else if (udf instanceof GenericUDFOPAnd || udf instanceof GenericUDFOPOr - || udf instanceof GenericUDFIn || udf instanceof GenericUDFOPNot) { + || udf instanceof GenericUDFIn || udf instanceof GenericUDFBetween + || udf instanceof GenericUDFOPNot) { return evaluateExpression(stats, genFunc, aspCtx, neededCols, fop, evaluatedRowCount); } } diff --git ql/src/test/results/clientpositive/udf_between.q.out ql/src/test/results/clientpositive/udf_between.q.out index efe6615..bd9b4bb 100644 --- ql/src/test/results/clientpositive/udf_between.q.out +++ ql/src/test/results/clientpositive/udf_between.q.out @@ -25,11 +25,11 @@ STAGE PLANS: Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: (UDFToDouble(key) + 100.0) BETWEEN 100 AND 200 (type: boolean) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 55 Data size: 584 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string), value (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 55 Data size: 584 Basic stats: COMPLETE Column stats: NONE Limit Number of rows: 20 Statistics: Num rows: 20 Data size: 200 Basic stats: COMPLETE Column stats: NONE @@ -80,11 +80,11 @@ STAGE PLANS: Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator predicate: (UDFToDouble(key) + 100.0) NOT BETWEEN 100 AND 200 (type: boolean) - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 445 Data size: 4727 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string), value (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 445 Data size: 4727 Basic stats: COMPLETE Column stats: NONE Limit Number of rows: 20 Statistics: Num rows: 20 Data size: 200 Basic stats: COMPLETE Column stats: NONE