diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index cd095d6..36503fa 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -641,6 +641,9 @@ HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10), // to accurately compute statistics for GROUPBY map side parallelism needs to be known HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1), + // statistics annotation fetches column statistics for all required columns and for all + // required partitions, which can be very expensive, so fetching is disabled by default + HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false), // Concurrency HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false), diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 50e1969..200ddf6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -86,6 +86,7 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa List neededColumns = tableScanOperator.getNeededColumns(); String dbName = table.getDbName(); String tabName = table.getTableName(); + boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS); if (!table.isPartitioned()) { long nr = getNumRows(dbName, tabName); @@ -106,7 +107,10 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa stats.setNumRows(nr); stats.setDataSize(rds); - List colStats = getTableColumnStats(table, schema, neededColumns); + List colStats = Lists.newArrayList(); + if (fetchColStats) { + colStats = getTableColumnStats(table, schema, neededColumns); + } // if column stats available and if atleast one column doesn't have stats // then mark it as partial @@ -128,11 +132,8 @@ public static 
Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa } else { stats.setColumnStatsState(Statistics.State.COMPLETE); } - stats.addToColumnStats(null); - } else { - // set col stats and mark it as table level col stats - stats.addToColumnStats(colStats); } + stats.addToColumnStats(colStats); } else { // For partitioned tables, get the size of all the partitions after pruning @@ -176,7 +177,10 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa // column stats for (Partition part : partList.getNotDeniedPartns()) { - List colStats = getPartitionColumnStats(table, part, schema, neededColumns); + List colStats = Lists.newArrayList(); + if (fetchColStats) { + colStats = getPartitionColumnStats(table, part, schema, neededColumns); + } if (checkIfColStatsAvailable(colStats) && colStats.contains(null)) { stats.updateColumnStatsState(Statistics.State.PARTIAL); } else if (checkIfColStatsAvailable(colStats) && !colStats.contains(null)) { diff --git ql/src/test/queries/clientpositive/annotate_stats_filter.q ql/src/test/queries/clientpositive/annotate_stats_filter.q index 0a645a6..3f2452e 100644 --- ql/src/test/queries/clientpositive/annotate_stats_filter.q +++ ql/src/test/queries/clientpositive/annotate_stats_filter.q @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int, diff --git ql/src/test/queries/clientpositive/annotate_stats_groupby.q ql/src/test/queries/clientpositive/annotate_stats_groupby.q index cd2cbff..05cb036 100644 --- ql/src/test/queries/clientpositive/annotate_stats_groupby.q +++ ql/src/test/queries/clientpositive/annotate_stats_groupby.q @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int, diff --git ql/src/test/queries/clientpositive/annotate_stats_join.q ql/src/test/queries/clientpositive/annotate_stats_join.q index 5683498..965b0b7 100644 --- 
ql/src/test/queries/clientpositive/annotate_stats_join.q +++ ql/src/test/queries/clientpositive/annotate_stats_join.q @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists emp_staging ( lastname string, deptid int @@ -28,7 +30,6 @@ LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_stagi LOAD DATA LOCAL INPATH '../../data/files/dept.txt' OVERWRITE INTO TABLE dept_staging; LOAD DATA LOCAL INPATH '../../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging; - insert overwrite table emp_orc select * from emp_staging; insert overwrite table dept_orc select * from dept_staging; insert overwrite table loc_orc select * from loc_staging; diff --git ql/src/test/queries/clientpositive/annotate_stats_limit.q ql/src/test/queries/clientpositive/annotate_stats_limit.q index e739326..0a9f880 100644 --- ql/src/test/queries/clientpositive/annotate_stats_limit.q +++ ql/src/test/queries/clientpositive/annotate_stats_limit.q @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int, diff --git ql/src/test/queries/clientpositive/annotate_stats_part.q ql/src/test/queries/clientpositive/annotate_stats_part.q index 05f3a19..257b840 100644 --- ql/src/test/queries/clientpositive/annotate_stats_part.q +++ ql/src/test/queries/clientpositive/annotate_stats_part.q @@ -1,3 +1,8 @@ +set hive.stats.fetch.column.stats=true; +set hive.stats.autogather=false; +set hive.exec.dynamic.partition=true; +set hive.exec.dynamic.partition.mode=nonstrict; + create table if not exists loc_staging ( state string, locid int, @@ -16,10 +21,6 @@ create table if not exists loc_orc ( -- basicStatState: NONE colStatState: NONE explain extended select * from loc_orc; -set hive.stats.autogather=false; -set hive.exec.dynamic.partition=true; -set hive.exec.dynamic.partition.mode=nonstrict; - insert overwrite table loc_orc partition(year) select * from loc_staging; -- stats are disabled. 
basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL diff --git ql/src/test/queries/clientpositive/annotate_stats_select.q ql/src/test/queries/clientpositive/annotate_stats_select.q index 93492f5..0d61151 100644 --- ql/src/test/queries/clientpositive/annotate_stats_select.q +++ ql/src/test/queries/clientpositive/annotate_stats_select.q @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists alltypes ( bo1 boolean, ti1 tinyint, diff --git ql/src/test/queries/clientpositive/annotate_stats_table.q ql/src/test/queries/clientpositive/annotate_stats_table.q index 1c7d163..4140fe6 100644 --- ql/src/test/queries/clientpositive/annotate_stats_table.q +++ ql/src/test/queries/clientpositive/annotate_stats_table.q @@ -1,3 +1,6 @@ +set hive.stats.fetch.column.stats=true; +set hive.stats.autogather=false; + create table if not exists emp_staging ( lastname string, deptid int @@ -11,8 +14,6 @@ explain extended select * from emp_orc; LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging; -set hive.stats.autogather=false; - insert overwrite table emp_orc select * from emp_staging; -- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL diff --git ql/src/test/queries/clientpositive/annotate_stats_union.q ql/src/test/queries/clientpositive/annotate_stats_union.q index 726b048..586d9e1 100644 --- ql/src/test/queries/clientpositive/annotate_stats_union.q +++ ql/src/test/queries/clientpositive/annotate_stats_union.q @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int,