diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
index 475883b..d634bd7 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
@@ -219,7 +219,7 @@ public static ColumnStatisticsObj getTableColumnStatisticsObj(
       colStatsData.setBinaryStats(binaryStats);
     } else if (colType.equals("bigint") || colType.equals("int") ||
         colType.equals("smallint") || colType.equals("tinyint") ||
-        colType.equals("timestamp")) {
+        colType.equals("timestamp") || colType.equals("date")) {
       LongColumnStatsData longStats = new LongColumnStatsData();
       longStats.setNumNulls(mStatsObj.getNumNulls());
       Long longHighValue = mStatsObj.getLongHighValue();
@@ -364,7 +364,7 @@ public static ColumnStatisticsObj getPartitionColumnStatisticsObj(
       colStatsData.setBinaryStats(binaryStats);
     } else if (colType.equals("tinyint") || colType.equals("smallint") ||
         colType.equals("int") || colType.equals("bigint") ||
-        colType.equals("timestamp")) {
+        colType.equals("timestamp") || colType.equals("date")) {
       LongColumnStatsData longStats = new LongColumnStatsData();
       longStats.setNumNulls(mStatsObj.getNumNulls());
       if (mStatsObj.getLongHighValue() != null) {
@@ -440,7 +440,7 @@ public static void fillColumnStatisticsData(String colType, ColumnStatisticsData
       data.setBinaryStats(binaryStats);
     } else if (colType.equals("bigint") || colType.equals("int") ||
         colType.equals("smallint") || colType.equals("tinyint") ||
-        colType.equals("timestamp")) {
+        colType.equals("timestamp") || colType.equals("date")) {
       LongColumnStatsData longStats = new LongColumnStatsData();
       longStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
       if (lhigh != null) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsTask.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsTask.java
index 0c46b00..ec7298e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsTask.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsTask.java
@@ -198,7 +198,7 @@ private void unpackPrimitiveObject (ObjectInspector oi, Object o, String fieldNa
       String s = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
       ColumnStatisticsData statsData = new ColumnStatisticsData();
 
-      if (s.equalsIgnoreCase("long")) {
+      if (s.equalsIgnoreCase("long") || s.equalsIgnoreCase("date")) {
         LongColumnStatsData longStats = new LongColumnStatsData();
         statsData.setLongStats(longStats);
         statsObj.setStatsData(statsData);
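[Annotation] The metastore changes above reuse the existing `LongColumnStatsData` for date columns: min/max are stored as the date's day ordinal (days since 1970-01-01), so no thrift schema change is needed. A minimal sketch of that encoding, assuming the thrift-generated `org.apache.hadoop.hive.metastore.api.LongColumnStatsData` setters and the `DateWritable` class this patch also touches; the literal dates and counts are illustrative only:

```java
import java.sql.Date;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.serde2.io.DateWritable;

public class DateStatsEncodingSketch {
  public static void main(String[] args) {
    // A date is reduced to its day ordinal, so the long-based stats
    // object can hold it without any schema change.
    long lowDays = new DateWritable(Date.valueOf("2000-11-20")).getDays();  // e.g. 11281
    long highDays = new DateWritable(Date.valueOf("2010-10-29")).getDays(); // e.g. 14911

    LongColumnStatsData stats = new LongColumnStatsData();
    stats.setLowValue(lowDays);
    stats.setHighValue(highDays);
    stats.setNumNulls(0); // illustrative counts, not real data
    stats.setNumDVs(18);
    System.out.println(stats);
  }
}
```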
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java
index 363039b..b958c2d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFComputeStats.java
@@ -28,6 +28,7 @@
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
@@ -86,9 +87,11 @@ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
       return new GenericUDAFBinaryStatsEvaluator();
     case DECIMAL:
       return new GenericUDAFDecimalStatsEvaluator();
+    case DATE:
+      return new GenericUDAFDateStatsEvaluator();
     default:
       throw new UDFArgumentTypeException(0,
-          "Only integer/long/timestamp/float/double/string/binary/boolean/decimal type argument " +
+          "Only integer/long/timestamp/date/float/double/string/binary/boolean/decimal type argument " +
           "is accepted but " + parameters[0].getTypeName()
           + " is passed.");
     }
@@ -1314,4 +1317,69 @@ public void reset(AggregationBuffer agg) throws HiveException {
       ((NumericStatsAgg)agg).reset("Decimal");
     }
   }
+
+  public static class GenericUDAFDateStatsEvaluator extends GenericUDAFEvaluator {
+    // Just re-use LongStatsEvaluator, using DateWritable.getDays() as the value.
+    protected transient GenericUDAFLongStatsEvaluator longStats = new GenericUDAFLongStatsEvaluator();
+    protected transient LongWritable longValue = new LongWritable();
+    protected transient Object[] values = new Object[2];
+    private transient PrimitiveObjectInspector inputOI;
+
+    @Override
+    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+      return longStats.getNewAggregationBuffer();
+    }
+
+    @Override
+    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+      super.init(m, parameters);
+
+      // initialize input
+      if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
+        // Convert from DateWritable to long value and pass to LongStatsEvaluator
+        inputOI = (PrimitiveObjectInspector) parameters[0];
+        ObjectInspector[] longParams = new ObjectInspector[] {
+            PrimitiveObjectInspectorFactory.writableLongObjectInspector,
+            parameters[1]
+        };
+        return longStats.init(m, longParams);
+      } else {
+        return longStats.init(m, parameters);
+      }
+    }
+
+    @Override
+    public void reset(AggregationBuffer agg) throws HiveException {
+      longStats.reset(agg);
+    }
+
+    @Override
+    public void iterate(AggregationBuffer agg, Object[] parameters)
+        throws HiveException {
+      // Convert from DateWritable to long value and pass to LongStatsEvaluator
+      long daysValue =
+          ((DateObjectInspector) inputOI).getPrimitiveWritableObject(parameters[0]).getDays();
+      longValue.set(daysValue);
+      values[0] = longValue;
+      values[1] = parameters[1];
+      longStats.iterate(agg, values);
+    }
+
+    @Override
+    public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+      return longStats.terminatePartial(agg);
+    }
+
+    @Override
+    public void merge(AggregationBuffer agg, Object partial)
+        throws HiveException {
+      longStats.merge(agg, partial);
+    }
+
+    @Override
+    public Object terminate(AggregationBuffer agg) throws HiveException {
+      return longStats.terminate(agg);
+    }
+  }
 }
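[Annotation] The new `GenericUDAFDateStatsEvaluator` is an adapter: it converts each incoming date to its day ordinal in `iterate()` and forwards every other lifecycle call to the existing long evaluator unchanged. A stripped-down sketch of that wrap-and-delegate shape, with invented names (`LongMinMax`, `DateStatsAdapter`) standing in for the Hive classes:

```java
import java.sql.Date;
import java.util.concurrent.TimeUnit;

// Invented stand-in for the aggregation done by GenericUDAFLongStatsEvaluator.
class LongMinMax {
  long min = Long.MAX_VALUE;
  long max = Long.MIN_VALUE;
  void update(long v) {
    min = Math.min(min, v);
    max = Math.max(max, v);
  }
}

// Same shape as GenericUDAFDateStatsEvaluator: convert the input once in
// update(), forward everything else to the long-based delegate untouched.
public class DateStatsAdapter {
  private final LongMinMax delegate = new LongMinMax();

  public void update(Date d) {
    // Approximate conversion to days since 1970-01-01; the real
    // DateWritable.getDays() additionally corrects for the local
    // timezone offset baked into java.sql.Date millis.
    delegate.update(TimeUnit.MILLISECONDS.toDays(d.getTime()));
  }

  public long min() { return delegate.min; }
  public long max() { return delegate.max; }

  public static void main(String[] args) {
    DateStatsAdapter stats = new DateStatsAdapter();
    stats.update(Date.valueOf("2000-11-20"));
    stats.update(Date.valueOf("2010-10-29"));
    System.out.println(stats.min() + " .. " + stats.max()); // day ordinals
  }
}
```

Reusing the long evaluator this way keeps the NDV estimation and min/max logic in one place, which is also why the stats output in the test below reports `"columntype":"Long"` for a date column.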
diff --git a/ql/src/test/queries/clientpositive/compute_stats_date.q b/ql/src/test/queries/clientpositive/compute_stats_date.q
new file mode 100644
index 0000000..cf7e488
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/compute_stats_date.q
@@ -0,0 +1,24 @@
+
+create table tab_date (
+  origin_city_name string,
+  dest_city_name string,
+  fl_date date,
+  arr_delay float,
+  fl_num int
+);
+
+-- insert some data
+load data local inpath '../../data/files/flights_join.txt' overwrite into table tab_date;
+
+select count(*) from tab_date;
+
+-- compute statistical summary of data
+select compute_stats(fl_date, 16) from tab_date;
+
+explain
+analyze table tab_date compute statistics for columns fl_date;
+
+analyze table tab_date compute statistics for columns fl_date;
+
+describe formatted tab_date fl_date;
+
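[Annotation] Outside the qfile harness, the same statements the test runs can be issued programmatically. A hedged sketch against HiveServer2's standard JDBC driver (`org.apache.hive.jdbc.HiveDriver`); the connection URL is a placeholder and assumes a local HiveServer2 with the `tab_date` table already loaded:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class AnalyzeDateColumn {
  public static void main(String[] args) throws Exception {
    Class.forName("org.apache.hive.jdbc.HiveDriver"); // register the HS2 driver
    // Placeholder URL; point this at a real HiveServer2 instance.
    try (Connection conn =
             DriverManager.getConnection("jdbc:hive2://localhost:10000/default");
         Statement stmt = conn.createStatement()) {
      // Gather column statistics for the date column, as the test above does.
      stmt.execute("analyze table tab_date compute statistics for columns fl_date");
      // Read the stored stats back; min/max come out as day ordinals.
      try (ResultSet rs = stmt.executeQuery("describe formatted tab_date fl_date")) {
        while (rs.next()) {
          System.out.println(rs.getString(1));
        }
      }
    }
  }
}
```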
diff --git a/ql/src/test/results/clientpositive/compute_stats_date.q.out b/ql/src/test/results/clientpositive/compute_stats_date.q.out
new file mode 100644
index 0000000..a81f48c
--- /dev/null
+++ b/ql/src/test/results/clientpositive/compute_stats_date.q.out
@@ -0,0 +1,112 @@
+PREHOOK: query: create table tab_date (
+  origin_city_name string,
+  dest_city_name string,
+  fl_date date,
+  arr_delay float,
+  fl_num int
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tab_date
+POSTHOOK: query: create table tab_date (
+  origin_city_name string,
+  dest_city_name string,
+  fl_date date,
+  arr_delay float,
+  fl_num int
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tab_date
+PREHOOK: query: -- insert some data
+load data local inpath '../../data/files/flights_join.txt' overwrite into table tab_date
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@tab_date
+POSTHOOK: query: -- insert some data
+load data local inpath '../../data/files/flights_join.txt' overwrite into table tab_date
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@tab_date
+PREHOOK: query: select count(*) from tab_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab_date
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from tab_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab_date
+#### A masked pattern was here ####
+20
+PREHOOK: query: -- compute statistical summary of data
+select compute_stats(fl_date, 16) from tab_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab_date
+#### A masked pattern was here ####
+POSTHOOK: query: -- compute statistical summary of data
+select compute_stats(fl_date, 16) from tab_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab_date
+#### A masked pattern was here ####
+{"columntype":"Long","min":11281,"max":14911,"countnulls":0,"numdistinctvalues":18}
+PREHOOK: query: explain
+analyze table tab_date compute statistics for columns fl_date
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+analyze table tab_date compute statistics for columns fl_date
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-1 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: tab_date
+            Select Operator
+              expressions: fl_date (type: date)
+              outputColumnNames: fl_date
+              Group By Operator
+                aggregations: compute_stats(fl_date, 16)
+                mode: hash
+                outputColumnNames: _col0
+                Reduce Output Operator
+                  sort order: 
+                  value expressions: _col0 (type: struct)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: compute_stats(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-1
+    Column Stats Work
+      Column Stats Desc:
+          Columns: fl_date
+          Column Types: date
+          Table: default.tab_date
+
+PREHOOK: query: analyze table tab_date compute statistics for columns fl_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab_date
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table tab_date compute statistics for columns fl_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab_date
+#### A masked pattern was here ####
+PREHOOK: query: describe formatted tab_date fl_date
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tab_date
+POSTHOOK: query: describe formatted tab_date fl_date
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tab_date
+# col_name    data_type    min      max      num_nulls    distinct_count    avg_col_len    max_col_len    num_trues    num_falses    comment
+
+fl_date       date         11281    14911    0            18                                                                         from deserializer
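[Annotation] The min/max values in the output above are day ordinals rather than formatted dates; `DateWritable` maps them back to calendar dates. A small sketch decoding the two values from this expected output:

```java
import org.apache.hadoop.hive.serde2.io.DateWritable;

public class DecodeDateStats {
  public static void main(String[] args) {
    // Map the day ordinals reported above back to calendar dates.
    System.out.println(new DateWritable(11281).get()); // 2000-11-20
    System.out.println(new DateWritable(14911).get()); // 2010-10-29
  }
}
```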