diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index ed22dbd..0fa3d6f 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1098,7 +1098,9 @@
         "Whether queries will fail because stats cannot be collected completely accurately. \n" +
         "If this is set to true, reading/writing from/into a partition may fail because the stats\n" +
         "could not be computed accurately."),
-
+    HIVE_STATS_COLLECT_PART_LEVEL_STATS("hive.analyze.stmt.collect.partlevel.stats", true,
+        "Queries of the form 'analyze table T compute statistics for columns' should compute " +
+        "partition-level stats for a partitioned table, even when no partition spec is specified."),
     HIVE_STATS_GATHER_NUM_THREADS("hive.stats.gather.num.threads", 10,
         "Number of threads used by partialscan/noscan analyze command for partitioned tables.\n" +
         "This is applicable only for file formats that implement StatsProvidingRecordReader (like ORC)."),
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
index 24f3710..3f8648b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
@@ -27,6 +27,7 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.ql.Context;
 import org.apache.hadoop.hive.ql.ErrorMsg;
@@ -102,6 +103,10 @@ private Table getTable(ASTNode tree) throws SemanticException {
   private Map<String, String> getPartKeyValuePairsFromAST(ASTNode tree) {
     ASTNode child = ((ASTNode) tree.getChild(0).getChild(1));
     Map<String, String> partSpec = new HashMap<String, String>();
+    if (null == child) {
+      // case of analyze table T compute statistics for columns;
+      return partSpec;
+    }
     String partKey;
     String partValue;
     for (int i = 0; i < child.getChildCount(); i++) {
@@ -361,6 +366,9 @@ public ColumnStatsSemanticAnalyzer(HiveConf conf, ASTNode tree) throws SemanticE
       checkIfTemporaryTable();
       checkForPartitionColumns(colNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys()));
       validateSpecifiedColumnNames(colNames);
+      if (conf.getBoolVar(ConfVars.HIVE_STATS_COLLECT_PART_LEVEL_STATS) && tbl.isPartitioned()) {
+        isPartitionStats = true;
+      }
 
       if (isPartitionStats) {
         isTableLevel = false;
diff --git ql/src/test/queries/clientpositive/columnstats_partlvl.q ql/src/test/queries/clientpositive/columnstats_partlvl.q
index 7ec101e..82a9e0f 100644
--- ql/src/test/queries/clientpositive/columnstats_partlvl.q
+++ ql/src/test/queries/clientpositive/columnstats_partlvl.q
@@ -30,4 +30,12 @@
 explain
 analyze table Employee_Part compute statistics for columns;
 analyze table Employee_Part compute statistics for columns;
+describe formatted Employee_Part.employeeID partition(employeeSalary=2000.0);
+describe formatted Employee_Part.employeeID partition(employeeSalary=4000.0);
+
+set hive.analyze.stmt.collect.partlevel.stats=false;
+explain
+analyze table Employee_Part compute statistics for columns;
+analyze table Employee_Part compute statistics for columns;
+
 describe formatted Employee_Part.employeeID;
diff --git ql/src/test/results/clientpositive/columnstats_partlvl.q.out ql/src/test/results/clientpositive/columnstats_partlvl.q.out
index e7196d0..84633c9 100644
--- ql/src/test/results/clientpositive/columnstats_partlvl.q.out
+++ ql/src/test/results/clientpositive/columnstats_partlvl.q.out
@@ -526,6 +526,88 @@ STAGE PLANS:
           TableScan
             alias: employee_part
             Select Operator
+              expressions: employeesalary (type: double), employeeid (type: int), employeename (type: string)
+              outputColumnNames: employeesalary, employeeid, employeename
+              Group By Operator
+                aggregations: compute_stats(employeeid, 16), compute_stats(employeename, 16)
+                keys: employeesalary (type: double)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Reduce Output Operator
+                  key expressions: _col0 (type: double)
+                  sort order: +
+                  Map-reduce partition columns: _col0 (type: double)
+                  value expressions: _col1 (type: struct), _col2 (type: struct)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1)
+          keys: KEY._col0 (type: double)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Select Operator
+            expressions: _col1 (type: struct), _col2 (type: struct), _col0 (type: double)
+            outputColumnNames: _col0, _col1, _col2
+            File Output Operator
+              compressed: false
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-1
+    Column Stats Work
+      Column Stats Desc:
+          Columns: employeeid, employeename
+          Column Types: int, string
+          Table: employee_part
+
+PREHOOK: query: analyze table Employee_Part compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee_part
+PREHOOK: Input: default@employee_part@employeesalary=2000.0
+PREHOOK: Input: default@employee_part@employeesalary=4000.0
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table Employee_Part compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee_part
+POSTHOOK: Input: default@employee_part@employeesalary=2000.0
+POSTHOOK: Input: default@employee_part@employeesalary=4000.0
+#### A masked pattern was here ####
+PREHOOK: query: describe formatted Employee_Part.employeeID partition(employeeSalary=2000.0)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@employee_part
+POSTHOOK: query: describe formatted Employee_Part.employeeID partition(employeeSalary=2000.0)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@employee_part
+# col_name    data_type    min    max    num_nulls    distinct_count    avg_col_len    max_col_len    num_trues    num_falses    comment
+
+employeeID    int          16     34     1            14                                                                         from deserializer
+PREHOOK: query: describe formatted Employee_Part.employeeID partition(employeeSalary=4000.0)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@employee_part
+POSTHOOK: query: describe formatted Employee_Part.employeeID partition(employeeSalary=4000.0)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@employee_part
+# col_name    data_type    min    max    num_nulls    distinct_count    avg_col_len    max_col_len    num_trues    num_falses    comment
+
+employeeID    int          16     34     1            14                                                                         from deserializer
+PREHOOK: query: explain
+analyze table Employee_Part compute statistics for columns
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+analyze table Employee_Part compute statistics for columns
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-1 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: employee_part
+            Select Operator
               expressions: employeeid (type: int), employeename (type: string)
               outputColumnNames: employeeid, employeename
               Group By Operator