diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index e05513b..f3f3fd0 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -652,6 +652,9 @@
     HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10),
     // to accurately compute statistics for GROUPBY map side parallelism needs to be known
     HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1),
+    // statistics annotation fetches stats for each partition, which can be expensive. turning
+    // this off will result in basic sizes being fetched from namenode instead
+    HIVE_STATS_FETCH_PARTITION_STATS("hive.stats.fetch.partition.stats", true),
     // statistics annotation fetches column statistics for all required columns which can
     // be very expensive sometimes
     HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false),
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 62c37f4..faa2387 100644
--- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -110,6 +110,8 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa
     List<String> neededColumns = tableScanOperator.getNeededColumns();
     boolean fetchColStats =
         HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
+    boolean fetchPartStats =
+        HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_PARTITION_STATS);
     float deserFactor =
         HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
 
@@ -151,27 +153,34 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa
     } else if (partList != null) {
       // For partitioned tables, get the size of all the partitions after pruning
      // the partitions that are not required
-      List<Long> rowCounts = getBasicStatForPartitions(
-          table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
-      List<Long> dataSizes = getBasicStatForPartitions(
-          table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
+      long nr = 0;
+      long ds = 0;
 
-      long nr = getSumIgnoreNegatives(rowCounts);
-      long ds = getSumIgnoreNegatives(dataSizes);
-      if (ds <= 0) {
-        dataSizes = getBasicStatForPartitions(
-            table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
-        ds = getSumIgnoreNegatives(dataSizes);
+      List<Long> rowCounts = Lists.newArrayList();
+      List<Long> dataSizes = Lists.newArrayList();
+
+      if (fetchPartStats) {
+        rowCounts = getBasicStatForPartitions(
+            table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
+        dataSizes = getBasicStatForPartitions(
+            table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
 
-        // if data size still could not be determined, then fall back to filesytem to get file
-        // sizes
+        nr = getSumIgnoreNegatives(rowCounts);
+        ds = getSumIgnoreNegatives(dataSizes);
         if (ds <= 0) {
-          dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
+          dataSizes = getBasicStatForPartitions(
+              table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
+          ds = getSumIgnoreNegatives(dataSizes);
         }
-        ds = getSumIgnoreNegatives(dataSizes);
+      }
 
-        ds = (long) (ds * deserFactor);
+      // if data size still could not be determined, then fall back to filesystem to get file
+      // sizes
+      if (ds <= 0) {
+        dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
       }
+      ds = getSumIgnoreNegatives(dataSizes);
+      ds = (long) (ds * deserFactor);
 
       int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
       if (avgRowSize > 0) {
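For illustration, here is a minimal standalone sketch of the size-estimation order the patched block implements. It is not Hive code: the class and method names (PartitionSizeFallbackSketch, estimateDataSize, sumIgnoreNegatives) are made up for this example. A flag standing in for hive.stats.fetch.partition.stats decides whether per-partition metastore sizes are consulted at all, and whenever they come back non-positive the estimate falls back to raw file sizes, scaled by the deserialization factor, mirroring the new control flow above.

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class PartitionSizeFallbackSketch {

      // sum only the positive entries, analogous to getSumIgnoreNegatives in the patch
      static long sumIgnoreNegatives(List<Long> values) {
        long sum = 0;
        for (long v : values) {
          if (v > 0) {
            sum += v;
          }
        }
        return sum;
      }

      // fetchPartStats plays the role of hive.stats.fetch.partition.stats: when true,
      // per-partition sizes recorded in the metastore are preferred; when false, or when
      // those sizes come back non-positive, raw file sizes stand in for them. The result
      // is scaled by the deserialization factor either way.
      static long estimateDataSize(boolean fetchPartStats, List<Long> metastoreSizes,
          List<Long> fileSizes, float deserFactor) {
        long ds = 0;
        if (fetchPartStats) {
          ds = sumIgnoreNegatives(metastoreSizes);
        }
        if (ds <= 0) {
          // fall back to file sizes obtained from the filesystem (namenode)
          ds = sumIgnoreNegatives(fileSizes);
        }
        return (long) (ds * deserFactor);
      }

      public static void main(String[] args) {
        List<Long> metastoreSizes = Collections.emptyList();          // no stats recorded
        List<Long> fileSizes = Arrays.asList(128L << 20, 64L << 20);  // two partitions on disk
        System.out.println(estimateDataSize(false, metastoreSizes, fileSizes, 1.0f));
      }
    }

In an actual Hive session the new property can be toggled per query with "set hive.stats.fetch.partition.stats=false;" or set globally in hive-site.xml; with the default of true the metastore stats are tried first, as in the patched code.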