diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index f273d25..ea97e1e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -26,8 +26,6 @@ import java.util.Map.Entry; import java.util.Stack; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.exec.ColumnInfo; @@ -80,6 +78,8 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -500,6 +500,173 @@ private long getMaxNulls(Statistics stats, ExprNodeDesc pred) { return maxNoNulls; } + private long evaluateComparator(Statistics stats, ExprNodeGenericFuncDesc genFunc) { + long numRows = stats.getNumRows(); + GenericUDF udf = genFunc.getGenericUDF(); + + ExprNodeColumnDesc columnDesc; + ExprNodeConstantDesc constantDesc; + boolean upperBound; + String boundValue = null; + if (genFunc.getChildren().get(0) instanceof ExprNodeColumnDesc && + genFunc.getChildren().get(1) instanceof ExprNodeConstantDesc) { + columnDesc = (ExprNodeColumnDesc) genFunc.getChildren().get(0); + constantDesc = (ExprNodeConstantDesc) genFunc.getChildren().get(1); + if (udf instanceof GenericUDFOPEqualOrGreaterThan || + udf instanceof GenericUDFOPGreaterThan) { + boundValue = constantDesc.getValue().toString(); + upperBound = false; + } else { + boundValue = constantDesc.getValue().toString(); + upperBound = true; + } + } else if (genFunc.getChildren().get(1) instanceof ExprNodeColumnDesc && + genFunc.getChildren().get(0) instanceof ExprNodeConstantDesc) { + columnDesc = (ExprNodeColumnDesc) genFunc.getChildren().get(1); + constantDesc = (ExprNodeConstantDesc) genFunc.getChildren().get(0); + if (udf instanceof GenericUDFOPEqualOrGreaterThan || + udf instanceof GenericUDFOPGreaterThan) { + boundValue = constantDesc.getValue().toString(); + upperBound = true; + } else { + boundValue = constantDesc.getValue().toString(); + upperBound = false; + } + } else { + // default + return numRows / 3; + } + + ColStatistics cs = stats.getColumnStatisticsFromColName(columnDesc.getColumn()); + if (cs != null && cs.getRange() != null && + cs.getRange().maxValue != null && cs.getRange().minValue != null) { + String colTypeLowerCase = columnDesc.getTypeString().toLowerCase(); + try { + if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)) { + byte value = new Byte(boundValue); + byte maxValue = cs.getRange().maxValue.byteValue(); + byte minValue = cs.getRange().minValue.byteValue(); + if (upperBound) { + if (maxValue < value) { + return numRows; + } + if (minValue > value) { + return 0; + } + } else { + if (minValue > value) { + return numRows; + } + if (maxValue < value) { + return 0; + } + } + } else if (colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)) { + short value = new Short(boundValue); + short maxValue = cs.getRange().maxValue.shortValue(); + short minValue = cs.getRange().minValue.shortValue(); + if (upperBound) { + if (maxValue < value) { + return numRows; + } + if (minValue > value) { + return 0; + } + } else { + if (minValue > value) { + return numRows; + } + if (maxValue < value) { + return 0; + } + } + } else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME) || + colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) { + // Date is an integer internally + int value = new Integer(boundValue); + int maxValue = cs.getRange().maxValue.intValue(); + int minValue = cs.getRange().minValue.intValue(); + if (upperBound) { + if (maxValue < value) { + return numRows; + } + if (minValue > value) { + return 0; + } + } else { + if (minValue > value) { + return numRows; + } + if (maxValue < value) { + return 0; + } + } + } else if (colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)) { + long value = new Long(boundValue); + long maxValue = cs.getRange().maxValue.longValue(); + long minValue = cs.getRange().minValue.longValue(); + if (upperBound) { + if (maxValue < value) { + return numRows; + } + if (minValue > value) { + return 0; + } + } else { + if (minValue > value) { + return numRows; + } + if (maxValue < value) { + return 0; + } + } + } else if (colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)) { + float value = new Float(boundValue); + float maxValue = cs.getRange().maxValue.floatValue(); + float minValue = cs.getRange().minValue.floatValue(); + if (upperBound) { + if (maxValue < value) { + return numRows; + } + if (minValue > value) { + return 0; + } + } else { + if (minValue > value) { + return numRows; + } + if (maxValue < value) { + return 0; + } + } + } else if (colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME)) { + double value = new Double(boundValue); + double maxValue = cs.getRange().maxValue.doubleValue(); + double minValue = cs.getRange().minValue.doubleValue(); + if (upperBound) { + if (maxValue < value) { + return numRows; + } + if (minValue > value) { + return 0; + } + } else { + if (minValue > value) { + return numRows; + } + if (maxValue < value) { + return 0; + } + } + } + } catch (NumberFormatException nfe) { + return numRows / 3; + } + } + // default + return numRows / 3; + } + private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, AnnotateStatsProcCtx aspCtx, List neededCols, FilterOperator fop, long evaluatedRowCount) throws CloneNotSupportedException { @@ -578,9 +745,10 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, } else if (udf instanceof GenericUDFOPNotEqual) { return numRows; } else if (udf instanceof GenericUDFOPEqualOrGreaterThan - || udf instanceof GenericUDFOPEqualOrLessThan || udf instanceof GenericUDFOPGreaterThan + || udf instanceof GenericUDFOPEqualOrLessThan + || udf instanceof GenericUDFOPGreaterThan || udf instanceof GenericUDFOPLessThan) { - return numRows / 3; + return evaluateComparator(stats, genFunc); } else if (udf instanceof GenericUDFOPNotNull) { return evaluateNotNullExpr(stats, genFunc); } else if (udf instanceof GenericUDFOPNull) { diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 9d139ba..d8acf94 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -724,6 +724,8 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tab } } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) { cs.setAvgColLen(JavaDataModel.get().lengthOfDate()); + cs.setRange(csd.getDateStats().getLowValue().getDaysSinceEpoch(), + csd.getDateStats().getHighValue().getDaysSinceEpoch()); } else { // Columns statistics for complex datatypes are not supported yet return null; diff --git ql/src/test/queries/clientpositive/annotate_stats_filter.q ql/src/test/queries/clientpositive/annotate_stats_filter.q index 436c053..a352a77 100644 --- ql/src/test/queries/clientpositive/annotate_stats_filter.q +++ ql/src/test/queries/clientpositive/annotate_stats_filter.q @@ -83,9 +83,17 @@ explain select * from loc_orc where (year=2001 and year is null) or (state='CA') -- numRows: 1 rawDataSize: 102 explain select * from loc_orc where (year=2001 or year is null) and (state='CA'); --- all inequality conditions rows/3 is the rules --- numRows: 2 rawDataSize: 204 +-- inequality conditions falling out of range. total or zero (converted to one) +-- numRows: 1 rawDataSize: 102 +-- numRows: 8 rawDataSize: 804 explain select * from loc_orc where locid < 30; explain select * from loc_orc where locid > 30; explain select * from loc_orc where locid <= 30; explain select * from loc_orc where locid >= 30; + +-- all inequality conditions falling within range. rows/3 is the rules +-- numRows: 2 rawDataSize: 204 +explain select * from loc_orc where locid < 3; +explain select * from loc_orc where locid > 3; +explain select * from loc_orc where locid <= 3; +explain select * from loc_orc where locid >= 3; diff --git ql/src/test/results/clientpositive/annotate_stats_filter.q.out ql/src/test/results/clientpositive/annotate_stats_filter.q.out index b09ad03..7e697f1 100644 --- ql/src/test/results/clientpositive/annotate_stats_filter.q.out +++ ql/src/test/results/clientpositive/annotate_stats_filter.q.out @@ -856,12 +856,14 @@ STAGE PLANS: Processor Tree: ListSink -PREHOOK: query: -- all inequality conditions rows/3 is the rules --- numRows: 2 rawDataSize: 204 +PREHOOK: query: -- inequality conditions falling out of range. total or zero (converted to one) +-- numRows: 1 rawDataSize: 102 +-- numRows: 8 rawDataSize: 804 explain select * from loc_orc where locid < 30 PREHOOK: type: QUERY -POSTHOOK: query: -- all inequality conditions rows/3 is the rules --- numRows: 2 rawDataSize: 204 +POSTHOOK: query: -- inequality conditions falling out of range. total or zero (converted to one) +-- numRows: 1 rawDataSize: 102 +-- numRows: 8 rawDataSize: 804 explain select * from loc_orc where locid < 30 POSTHOOK: type: QUERY STAGE DEPENDENCIES: @@ -877,14 +879,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid < 30) (type: boolean) - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -913,14 +915,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid > 30) (type: boolean) - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -949,14 +951,14 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid <= 30) (type: boolean) - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -985,6 +987,154 @@ STAGE PLANS: Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (locid >= 30) (type: boolean) + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- all inequality conditions falling within range. rows/3 is the rules +-- numRows: 2 rawDataSize: 204 +explain select * from loc_orc where locid < 3 +PREHOOK: type: QUERY +POSTHOOK: query: -- all inequality conditions falling within range. rows/3 is the rules +-- numRows: 2 rawDataSize: 204 +explain select * from loc_orc where locid < 3 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (locid < 3) (type: boolean) + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from loc_orc where locid > 3 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from loc_orc where locid > 3 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (locid > 3) (type: boolean) + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from loc_orc where locid <= 3 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from loc_orc where locid <= 3 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (locid <= 3) (type: boolean) + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from loc_orc where locid >= 3 +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from loc_orc where locid >= 3 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (locid >= 3) (type: boolean) Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)