diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index a718264..d1f717b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -30,12 +30,10 @@ import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; -import java.util.concurrent.Executors; import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.Future; - -import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -100,6 +98,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BytesWritable; import org.apache.hive.common.util.AnnotationUtils; @@ -110,6 +109,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.Lists; import com.google.common.math.LongMath; +import com.google.common.util.concurrent.ThreadFactoryBuilder; public class StatsUtils { @@ -1307,11 +1307,31 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis countDistincts = 1; } } else if (end instanceof ExprNodeGenericFuncDesc) { - - // udf projection ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end; colName = engfd.getName(); colType = engfd.getTypeString(); + + // If it is a widening cast, we do not change NDV, min, max + if (isWideningCast(engfd) && engfd.getChildren().get(0) instanceof ExprNodeColumnDesc) { + // cast on single column + ColStatistics stats = parentStats.getColumnStatisticsFromColName(engfd.getCols().get(0)); + if (stats != null) { + ColStatistics newStats; + try { + newStats = stats.clone(); + } catch (CloneNotSupportedException e) { + LOG.warn("error cloning stats, this should not happen"); + return null; + } + newStats.setColumnName(colName); + colType = colType.toLowerCase(); + newStats.setColumnType(colType); + newStats.setAvgColLen(getAvgColLenOf(conf, oi, colType)); + return newStats; + } + } + + // fallback to default countDistincts = getNDVFor(engfd, numRows, parentStats); } else if (end instanceof ExprNodeColumnListDesc) { @@ -1341,6 +1361,15 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis return colStats; } + private static boolean isWideningCast(ExprNodeGenericFuncDesc engfd) { + GenericUDF udf = engfd.getGenericUDF(); + if (!FunctionRegistry.isOpCast(udf)) { + // It is not a cast + return false; + } + return TypeInfoUtils.implicitConvertible(engfd.getChildren().get(0).getTypeInfo(), + engfd.getTypeInfo()); + } public static Long addWithExpDecay (List distinctVals) { // Exponential back-off for NDVs. diff --git ql/src/test/queries/clientpositive/annotate_stats_join_pkfk.q ql/src/test/queries/clientpositive/annotate_stats_join_pkfk.q index aa62c60..f94994a 100644 --- ql/src/test/queries/clientpositive/annotate_stats_join_pkfk.q +++ ql/src/test/queries/clientpositive/annotate_stats_join_pkfk.q @@ -69,6 +69,40 @@ create table store ) row format delimited fields terminated by '|'; +create table store_bigint +( + s_store_sk bigint, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name string, + s_street_type string, + s_suite_number string, + s_city string, + s_county string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset float, + s_tax_precentage float +) +row format delimited fields terminated by '|'; + create table customer_address ( ca_address_sk int, @@ -88,11 +122,14 @@ create table customer_address row format delimited fields terminated by '|'; load data local inpath '../../data/files/store.txt' overwrite into table store; +load data local inpath '../../data/files/store.txt' overwrite into table store_bigint; load data local inpath '../../data/files/store_sales.txt' overwrite into table store_sales; load data local inpath '../../data/files/customer_address.txt' overwrite into table customer_address; analyze table store compute statistics; analyze table store compute statistics for columns s_store_sk, s_floor_space; +analyze table store_bigint compute statistics; +analyze table store_bigint compute statistics for columns s_store_sk, s_floor_space; analyze table store_sales compute statistics; analyze table store_sales compute statistics for columns ss_store_sk, ss_addr_sk, ss_quantity; analyze table customer_address compute statistics; @@ -100,6 +137,9 @@ analyze table customer_address compute statistics for columns ca_address_sk; explain select s.s_store_sk from store s join store_sales ss on (s.s_store_sk = ss.ss_store_sk); +-- widening cast: inferred PK-FK, thus same row count as previous query +explain select s.s_store_sk from store_bigint s join store_sales ss on (s.s_store_sk = ss.ss_store_sk); + explain select s.s_store_sk from store s join store_sales ss on (s.s_store_sk = ss.ss_store_sk) where s.s_store_sk > 0; explain select s.s_store_sk from store s join store_sales ss on (s.s_store_sk = ss.ss_store_sk) where s.s_company_id > 0 and ss.ss_quantity > 10; @@ -120,4 +160,5 @@ explain select s.s_store_sk from store s join store_sales ss on (s.s_store_sk = drop table store_sales; drop table store; +drop table store_bigint; drop table customer_address; diff --git ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out index 6588db2..c581aff 100644 --- ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out +++ ql/src/test/results/clientpositive/annotate_stats_join_pkfk.q.out @@ -148,6 +148,78 @@ row format delimited fields terminated by '|' POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@store +PREHOOK: query: create table store_bigint +( + s_store_sk bigint, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name string, + s_street_type string, + s_suite_number string, + s_city string, + s_county string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset float, + s_tax_precentage float +) +row format delimited fields terminated by '|' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_bigint +POSTHOOK: query: create table store_bigint +( + s_store_sk bigint, + s_store_id string, + s_rec_start_date string, + s_rec_end_date string, + s_closed_date_sk int, + s_store_name string, + s_number_employees int, + s_floor_space int, + s_hours string, + s_manager string, + s_market_id int, + s_geography_class string, + s_market_desc string, + s_market_manager string, + s_division_id int, + s_division_name string, + s_company_id int, + s_company_name string, + s_street_number string, + s_street_name string, + s_street_type string, + s_suite_number string, + s_city string, + s_county string, + s_state string, + s_zip string, + s_country string, + s_gmt_offset float, + s_tax_precentage float +) +row format delimited fields terminated by '|' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_bigint PREHOOK: query: create table customer_address ( ca_address_sk int, @@ -196,6 +268,14 @@ POSTHOOK: query: load data local inpath '../../data/files/store.txt' overwrite i POSTHOOK: type: LOAD #### A masked pattern was here #### POSTHOOK: Output: default@store +PREHOOK: query: load data local inpath '../../data/files/store.txt' overwrite into table store_bigint +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@store_bigint +POSTHOOK: query: load data local inpath '../../data/files/store.txt' overwrite into table store_bigint +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@store_bigint PREHOOK: query: load data local inpath '../../data/files/store_sales.txt' overwrite into table store_sales PREHOOK: type: LOAD #### A masked pattern was here #### @@ -228,6 +308,22 @@ POSTHOOK: query: analyze table store compute statistics for columns s_store_sk, POSTHOOK: type: QUERY POSTHOOK: Input: default@store #### A masked pattern was here #### +PREHOOK: query: analyze table store_bigint compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@store_bigint +PREHOOK: Output: default@store_bigint +POSTHOOK: query: analyze table store_bigint compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@store_bigint +POSTHOOK: Output: default@store_bigint +PREHOOK: query: analyze table store_bigint compute statistics for columns s_store_sk, s_floor_space +PREHOOK: type: QUERY +PREHOOK: Input: default@store_bigint +#### A masked pattern was here #### +POSTHOOK: query: analyze table store_bigint compute statistics for columns s_store_sk, s_floor_space +POSTHOOK: type: QUERY +POSTHOOK: Input: default@store_bigint +#### A masked pattern was here #### PREHOOK: query: analyze table store_sales compute statistics PREHOOK: type: QUERY PREHOOK: Input: default@store_sales @@ -325,6 +421,73 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: -- widening cast: inferred PK-FK, thus same row count as previous query +explain select s.s_store_sk from store_bigint s join store_sales ss on (s.s_store_sk = ss.ss_store_sk) +PREHOOK: type: QUERY +POSTHOOK: query: -- widening cast: inferred PK-FK, thus same row count as previous query +explain select s.s_store_sk from store_bigint s join store_sales ss on (s.s_store_sk = ss.ss_store_sk) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: s + Statistics: Num rows: 12 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: s_store_sk is not null (type: boolean) + Statistics: Num rows: 12 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: s_store_sk (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 12 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 12 Data size: 96 Basic stats: COMPLETE Column stats: COMPLETE + TableScan + alias: ss + Statistics: Num rows: 1000 Data size: 3860 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ss_store_sk is not null (type: boolean) + Statistics: Num rows: 964 Data size: 3720 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ss_store_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 964 Data size: 3720 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: UDFToLong(_col0) (type: bigint) + sort order: + + Map-reduce partition columns: UDFToLong(_col0) (type: bigint) + Statistics: Num rows: 964 Data size: 3720 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 UDFToLong(_col0) (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 964 Data size: 7712 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 964 Data size: 7712 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + PREHOOK: query: explain select s.s_store_sk from store s join store_sales ss on (s.s_store_sk = ss.ss_store_sk) where s.s_store_sk > 0 PREHOOK: type: QUERY POSTHOOK: query: explain select s.s_store_sk from store s join store_sales ss on (s.s_store_sk = ss.ss_store_sk) where s.s_store_sk > 0 @@ -1057,6 +1220,14 @@ POSTHOOK: query: drop table store POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@store POSTHOOK: Output: default@store +PREHOOK: query: drop table store_bigint +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@store_bigint +PREHOOK: Output: default@store_bigint +POSTHOOK: query: drop table store_bigint +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@store_bigint +POSTHOOK: Output: default@store_bigint PREHOOK: query: drop table customer_address PREHOOK: type: DROPTABLE PREHOOK: Input: default@customer_address diff --git ql/src/test/results/clientpositive/llap/vector_char_simple.q.out ql/src/test/results/clientpositive/llap/vector_char_simple.q.out index 3dea73d..063170d 100644 --- ql/src/test/results/clientpositive/llap/vector_char_simple.q.out +++ ql/src/test/results/clientpositive/llap/vector_char_simple.q.out @@ -306,10 +306,10 @@ STAGE PLANS: Select Operator expressions: CAST( _col0 AS CHAR(12) (type: char(12)) outputColumnNames: _col0 - Statistics: Num rows: 10 Data size: 960 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 10 Data size: 960 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 768 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat diff --git ql/src/test/results/clientpositive/llap/vector_varchar_simple.q.out ql/src/test/results/clientpositive/llap/vector_varchar_simple.q.out index edb67f1..118130e 100644 --- ql/src/test/results/clientpositive/llap/vector_varchar_simple.q.out +++ ql/src/test/results/clientpositive/llap/vector_varchar_simple.q.out @@ -306,10 +306,10 @@ STAGE PLANS: Select Operator expressions: CAST( _col0 AS varchar(25)) (type: varchar(25)) outputColumnNames: _col0 - Statistics: Num rows: 10 Data size: 1090 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 872 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 10 Data size: 1090 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 872 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat diff --git ql/src/test/results/clientpositive/llap/vectorized_casts.q.out ql/src/test/results/clientpositive/llap/vectorized_casts.q.out index bcddce5..61267dd 100644 --- ql/src/test/results/clientpositive/llap/vectorized_casts.q.out +++ ql/src/test/results/clientpositive/llap/vectorized_casts.q.out @@ -176,10 +176,10 @@ STAGE PLANS: Select Operator expressions: UDFToBoolean(ctinyint) (type: boolean), UDFToBoolean(csmallint) (type: boolean), UDFToBoolean(cint) (type: boolean), UDFToBoolean(cbigint) (type: boolean), UDFToBoolean(cfloat) (type: boolean), UDFToBoolean(cdouble) (type: boolean), cboolean1 (type: boolean), UDFToBoolean((cbigint * 0)) (type: boolean), UDFToBoolean(ctimestamp1) (type: boolean), UDFToBoolean(cstring1) (type: boolean), UDFToInteger(ctinyint) (type: int), UDFToInteger(csmallint) (type: int), cint (type: int), UDFToInteger(cbigint) (type: int), UDFToInteger(cfloat) (type: int), UDFToInteger(cdouble) (type: int), UDFToInteger(cboolean1) (type: int), UDFToInteger(ctimestamp1) (type: int), UDFToInteger(cstring1) (type: int), UDFToInteger(substr(cstring1, 1, 1)) (type: int), UDFToByte(cfloat) (type: tinyint), UDFToShort(cfloat) (type: smallint), UDFToLong(cfloat) (type: bigint), UDFToDouble(ctinyint) (type: double), UDFToDouble(csmallint) (type: double), UDFToDouble(cint) (type: double), UDFToDouble(cbigint) (type: double), UDFToDouble(cfloat) (type: double), cdouble (type: double), UDFToDouble(cboolean1) (type: double), UDFToDouble(ctimestamp1) (type: double), UDFToDouble(cstring1) (type: double), UDFToDouble(substr(cstring1, 1, 1)) (type: double), UDFToFloat(cint) (type: float), UDFToFloat(cdouble) (type: float), CAST( ctinyint AS TIMESTAMP) (type: timestamp), CAST( csmallint AS TIMESTAMP) (type: timestamp), CAST( cint AS TIMESTAMP) (type: timestamp), CAST( cbigint AS TIMESTAMP) (type: timestamp), CAST( cfloat AS TIMESTAMP) (type: timestamp), CAST( cdouble AS TIMESTAMP) (type: timestamp), CAST( cboolean1 AS TIMESTAMP) (type: timestamp), CAST( (cbigint * 0) AS TIMESTAMP) (type: timestamp), ctimestamp1 (type: timestamp), CAST( cstring1 AS TIMESTAMP) (type: timestamp), CAST( substr(cstring1, 1, 1) AS TIMESTAMP) (type: timestamp), UDFToString(ctinyint) (type: string), UDFToString(csmallint) (type: string), UDFToString(cint) (type: string), UDFToString(cbigint) (type: string), UDFToString(cfloat) (type: string), UDFToString(cdouble) (type: string), UDFToString(cboolean1) (type: string), UDFToString((cbigint * 0)) (type: string), UDFToString(ctimestamp1) (type: string), cstring1 (type: string), UDFToString(CAST( cstring1 AS CHAR(10)) (type: string), UDFToString(CAST( cstring1 AS varchar(10))) (type: string), UDFToFloat(UDFToInteger(cfloat)) (type: float), UDFToDouble((cint * 2)) (type: double), UDFToString(sin(cfloat)) (type: string), (UDFToDouble(UDFToFloat(cint)) + UDFToDouble(cboolean1)) (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23, _col24, _col25, _col26, _col27, _col28, _col29, _col30, _col31, _col32, _col33, _col34, _col35, _col36, _col37, _col38, _col39, _col40, _col41, _col42, _col43, _col44, _col45, _col46, _col47, _col48, _col49, _col50, _col51, _col52, _col53, _col54, _col55, _col56, _col57, _col58, _col59, _col60, _col61 - Statistics: Num rows: 6144 Data size: 17929060 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 16117100 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 6144 Data size: 17929060 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 16117100 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat