diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java index b84ea46..cc8ae4d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java @@ -186,41 +186,23 @@ private StringBuilder genPartitionClause(Map partSpec) throws Sem } else { whereClause.append(" and "); } - whereClause.append(partKey); - whereClause.append(" = "); - if (getColTypeOf(partKey).equalsIgnoreCase("string")) { - whereClause.append("'"); - } - whereClause.append(value); - if (getColTypeOf(partKey).equalsIgnoreCase("string")) { - whereClause.append("'"); - } + whereClause.append(partKey).append(" = '").append(value).append("'"); } } - - for (FieldSchema fs : tbl.getPartitionKeys()) { - if (!aggPresent) { - aggPresent = true; - } else { - groupByClause.append(","); - } - groupByClause.append(fs.getName()); + + for (FieldSchema fs : tbl.getPartitionKeys()) { + if (!aggPresent) { + aggPresent = true; + } else { + groupByClause.append(","); + } + groupByClause.append(fs.getName()); } // attach the predicate and group by to the return clause return predPresent ? whereClause.append(groupByClause) : groupByClause; } - private String getColTypeOf (String partKey) throws SemanticException{ - - for (FieldSchema fs : tbl.getPartitionKeys()) { - if (partKey.equalsIgnoreCase(fs.getName())) { - return fs.getType(); - } - } - throw new SemanticException ("Unknown partition key : " + partKey); - } - private int getNumBitVectorsForNDVEstimation(HiveConf conf) throws SemanticException { int numBitVectors; float percentageError = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ERROR); diff --git a/ql/src/test/queries/clientpositive/columnstats_part_coltype.q b/ql/src/test/queries/clientpositive/columnstats_part_coltype.q new file mode 100644 index 0000000..bfeed89 --- /dev/null +++ b/ql/src/test/queries/clientpositive/columnstats_part_coltype.q @@ -0,0 +1,38 @@ +drop table if exists partcolstats; +create table partcolstats (key int, value string) partitioned by (ds date, hr int, part string); +insert into partcolstats partition (ds=date '2015-04-02', hr=2, part='partA') select key, value from src limit 20; +insert into partcolstats partition (ds=date '2015-04-02', hr=2, part='partB') select key, value from src limit 20; +insert into partcolstats partition (ds=date '2015-04-02', hr=3, part='partA') select key, value from src limit 30; +insert into partcolstats partition (ds=date '2015-04-03', hr=3, part='partA') select key, value from src limit 40; +insert into partcolstats partition (ds=date '2015-04-03', hr=3, part='partB') select key, value from src limit 60; + +analyze table partcolstats partition (ds=date '2015-04-02', hr=2, part='partA') compute statistics for columns; +describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partA'); +describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partA'); + +describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partB'); +describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partB'); + +analyze table partcolstats partition (ds=date '2015-04-02', hr=2, part) compute statistics for columns; +describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partB'); +describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partB'); + +describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=3, part='partA'); +describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=3, part='partA'); + +analyze table partcolstats partition (ds=date '2015-04-02', hr, part) compute statistics for columns; +describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=3, part='partA'); +describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=3, part='partA'); + +describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partA'); +describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partA'); +describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partB'); +describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partB'); + +analyze table partcolstats partition (ds, hr, part) compute statistics for columns; +describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partA'); +describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partA'); +describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partB'); +describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partB'); + +drop table partcolstats; \ No newline at end of file diff --git a/ql/src/test/results/clientpositive/columnstats_part_coltype.q.out b/ql/src/test/results/clientpositive/columnstats_part_coltype.q.out new file mode 100644 index 0000000..11788bb --- /dev/null +++ b/ql/src/test/results/clientpositive/columnstats_part_coltype.q.out @@ -0,0 +1,286 @@ +PREHOOK: query: drop table if exists partcolstats +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists partcolstats +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table partcolstats (key int, value string) partitioned by (ds date, hr int, part string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@partcolstats +POSTHOOK: query: create table partcolstats (key int, value string) partitioned by (ds date, hr int, part string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@partcolstats +PREHOOK: query: insert into partcolstats partition (ds=date '2015-04-02', hr=2, part='partA') select key, value from src limit 20 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@partcolstats@ds=2015-04-02/hr=2/part=partA +POSTHOOK: query: insert into partcolstats partition (ds=date '2015-04-02', hr=2, part='partA') select key, value from src limit 20 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@partcolstats@ds=2015-04-02/hr=2/part=partA +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-02,hr=2,part=partA).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-02,hr=2,part=partA).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert into partcolstats partition (ds=date '2015-04-02', hr=2, part='partB') select key, value from src limit 20 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@partcolstats@ds=2015-04-02/hr=2/part=partB +POSTHOOK: query: insert into partcolstats partition (ds=date '2015-04-02', hr=2, part='partB') select key, value from src limit 20 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@partcolstats@ds=2015-04-02/hr=2/part=partB +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-02,hr=2,part=partB).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-02,hr=2,part=partB).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert into partcolstats partition (ds=date '2015-04-02', hr=3, part='partA') select key, value from src limit 30 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@partcolstats@ds=2015-04-02/hr=3/part=partA +POSTHOOK: query: insert into partcolstats partition (ds=date '2015-04-02', hr=3, part='partA') select key, value from src limit 30 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@partcolstats@ds=2015-04-02/hr=3/part=partA +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-02,hr=3,part=partA).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-02,hr=3,part=partA).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert into partcolstats partition (ds=date '2015-04-03', hr=3, part='partA') select key, value from src limit 40 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@partcolstats@ds=2015-04-03/hr=3/part=partA +POSTHOOK: query: insert into partcolstats partition (ds=date '2015-04-03', hr=3, part='partA') select key, value from src limit 40 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@partcolstats@ds=2015-04-03/hr=3/part=partA +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-03,hr=3,part=partA).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-03,hr=3,part=partA).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert into partcolstats partition (ds=date '2015-04-03', hr=3, part='partB') select key, value from src limit 60 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@partcolstats@ds=2015-04-03/hr=3/part=partB +POSTHOOK: query: insert into partcolstats partition (ds=date '2015-04-03', hr=3, part='partB') select key, value from src limit 60 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@partcolstats@ds=2015-04-03/hr=3/part=partB +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-03,hr=3,part=partB).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: partcolstats PARTITION(ds=2015-04-03,hr=3,part=partB).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: analyze table partcolstats partition (ds=date '2015-04-02', hr=2, part='partA') compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@partcolstats +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +#### A masked pattern was here #### +POSTHOOK: query: analyze table partcolstats partition (ds=date '2015-04-02', hr=2, part='partA') compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@partcolstats +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +#### A masked pattern was here #### +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 27 484 0 18 from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +value string 0 18 6.8 7 from deserializer +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +key int from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +value string from deserializer +PREHOOK: query: analyze table partcolstats partition (ds=date '2015-04-02', hr=2, part) compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@partcolstats +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partB +#### A masked pattern was here #### +POSTHOOK: query: analyze table partcolstats partition (ds=date '2015-04-02', hr=2, part) compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@partcolstats +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partB +#### A masked pattern was here #### +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=2, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 27 484 0 18 from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=2, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +value string 0 18 6.8 7 from deserializer +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +key int from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +value string from deserializer +PREHOOK: query: analyze table partcolstats partition (ds=date '2015-04-02', hr, part) compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@partcolstats +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partB +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=3/part=partA +#### A masked pattern was here #### +POSTHOOK: query: analyze table partcolstats partition (ds=date '2015-04-02', hr, part) compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@partcolstats +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partB +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=3/part=partA +#### A masked pattern was here #### +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-02', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 27 495 0 28 from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-02', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +value string 0 18 6.833333333333333 7 from deserializer +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +key int from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +value string from deserializer +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +key int from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type comment + +value string from deserializer +PREHOOK: query: analyze table partcolstats partition (ds, hr, part) compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@partcolstats +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partB +PREHOOK: Input: default@partcolstats@ds=2015-04-02/hr=3/part=partA +PREHOOK: Input: default@partcolstats@ds=2015-04-03/hr=3/part=partA +PREHOOK: Input: default@partcolstats@ds=2015-04-03/hr=3/part=partB +#### A masked pattern was here #### +POSTHOOK: query: analyze table partcolstats partition (ds, hr, part) compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@partcolstats +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partA +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=2/part=partB +POSTHOOK: Input: default@partcolstats@ds=2015-04-02/hr=3/part=partA +POSTHOOK: Input: default@partcolstats@ds=2015-04-03/hr=3/part=partA +POSTHOOK: Input: default@partcolstats@ds=2015-04-03/hr=3/part=partB +#### A masked pattern was here #### +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 15 495 0 43 from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partA') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partA') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +value string 0 34 6.825 7 from deserializer +PREHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.key partition (ds=date '2015-04-03', hr=3, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +key int 15 495 0 51 from deserializer +PREHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partB') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@partcolstats +POSTHOOK: query: describe formatted partcolstats.value partition (ds=date '2015-04-03', hr=3, part='partB') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@partcolstats +# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment + +value string 0 53 6.883333333333334 7 from deserializer +PREHOOK: query: drop table partcolstats +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@partcolstats +PREHOOK: Output: default@partcolstats +POSTHOOK: query: drop table partcolstats +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@partcolstats +POSTHOOK: Output: default@partcolstats