diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 84ee78f..274b150 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -655,6 +655,9 @@ // statistics annotation fetches column statistics for all required columns and for all // required partitions which can be very expensive sometimes HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false), + // statistics annotation fetches stats for each partition, which can be expensive. turning + // this off will result in basic sizes being fetched from namenode instead + HIVE_STATS_FETCH_PARTITION_STATS("hive.stats.fetch.partition.stats", true), // in the absence of table/partition stats, average row size will be used to // estimate the number of rows/data size HIVE_STATS_AVG_ROW_SIZE("hive.stats.avg.row.size", 10000), diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 384b49e..a342daf 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -109,6 +109,8 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa String tabName = table.getTableName(); boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS); + boolean fetchPartStats = + HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_PARTITION_STATS); float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR); @@ -171,33 +173,42 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa // For partitioned tables, get the size of all the partitions after pruning // the partitions that are not required + if (partList != null) { List partNames = Lists.newArrayList(); for (Partition part : 
partList.getNotDeniedPartns()) { partNames.add(part.getName()); } - List rowCounts = + long nr = 0; + long ds = 0; + + List rowCounts = Lists.newArrayList(); + List dataSizes = Lists.newArrayList(); + + if (fetchPartStats) { + rowCounts = getBasicStatForPartitions(table, partNames, StatsSetupConst.ROW_COUNT); - List dataSizes = + dataSizes = getBasicStatForPartitions(table, partNames, StatsSetupConst.RAW_DATA_SIZE); - - long nr = getSumIgnoreNegatives(rowCounts); - long ds = getSumIgnoreNegatives(dataSizes); - if (ds <= 0) { - dataSizes = getBasicStatForPartitions(table, partNames, StatsSetupConst.TOTAL_SIZE); + + nr = getSumIgnoreNegatives(rowCounts); ds = getSumIgnoreNegatives(dataSizes); - // if data size still could not be determined, then fall back to filesytem to get file - // sizes if (ds <= 0) { - dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns()); + dataSizes = getBasicStatForPartitions(table, partNames, StatsSetupConst.TOTAL_SIZE); + ds = getSumIgnoreNegatives(dataSizes); } - ds = getSumIgnoreNegatives(dataSizes); - - ds = (long) (ds * deserFactor); + } + // if data size still could not be determined, then fall back to filesystem to get file + // sizes + if (ds <= 0) { + dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns()); } + ds = getSumIgnoreNegatives(dataSizes); + ds = (long) (ds * deserFactor); + int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns); if (avgRowSize > 0) { if (LOG.isDebugEnabled()) {