NOTE(review): text ahead of the first "diff --git" line is ignored by git-apply and patch.
NOTE(review): line breaks, indentation and generic type arguments were absent from the reviewed
NOTE(review): copy of this patch and are reconstructed here; confirm them against the target
NOTE(review): file, or apply with --ignore-whitespace. The post-image id on the "index" line
NOTE(review): predates the null guard and the Javadoc added in this revision.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 17d9f2df0a..d04a3ac088 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -277,7 +277,42 @@ public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table tabl
     }
   }
 
-  private static long getNumRows(HiveConf conf, List<ColumnInfo> schema, List<String> neededColumns, Table table, long ds) {
+  /**
+   * Estimates statistics for every needed column that has no entry in {@code columnStats}
+   * and appends the estimates to {@code columnStats} in place.
+   *
+   * @param neededColumns names of the columns for which statistics are required
+   * @param columnStats statistics gathered so far; must be non-null and mutable, because it
+   *                    is iterated and the estimates are added to it
+   * @param table table passed through to {@code estimateStats}
+   * @param conf configuration passed through to {@code estimateStats}
+   * @param nr row count passed through to {@code estimateStats}
+   * @param schema schema passed through to {@code estimateStats}
+   */
+  private static void estimateStatsForMissingCols(List<String> neededColumns,
+      List<ColStatistics> columnStats, Table table, HiveConf conf, long nr,
+      List<ColumnInfo> schema) {
+    List<String> missingColStats = Lists.newArrayList();
+    for (String colName : neededColumns) {
+      boolean hasColStats = false;
+      for (ColStatistics cstats : columnStats) {
+        if (colName.equals(cstats.getColumnName())) {
+          hasColStats = true;
+          break;
+        }
+      }
+      if (!hasColStats) {
+        missingColStats.add(colName);
+      }
+    }
+    if (!missingColStats.isEmpty()) {
+      // only the columns without stats are estimated; existing entries are left untouched
+      columnStats.addAll(estimateStats(table, schema, missingColStats, conf, nr));
+    }
+  }
+
+  private static long getNumRows(HiveConf conf, List<ColumnInfo> schema, List<String> neededColumns,
+      Table table, long ds) {
     long nr = getNumRows(table);
     // number of rows -1 means that statistics from metastore is not reliable
     // and 0 means statistics gathering is disabled
@@ -322,9 +357,15 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
       List<ColStatistics> colStats = Lists.newArrayList();
       if (fetchColStats) {
         colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache);
-        if(colStats == null || colStats.size() < 1) {
-          colStats = estimateStats(table,schema,neededColumns, conf, nr);
-        }
+        if (colStats == null) {
+          // keep the null guard of the replaced code: the helper below iterates this list
+          // and appends to it, so a null result has to become an empty list first
+          colStats = Lists.newArrayList();
+        }
+        estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
+
+        // we should have stats for all columns (estimated or actual)
+        assert(neededColumns.size() == colStats.size());
         long betterDS = getDataSizeFromColumnStats(nr, colStats);
         ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
       }
@@ -457,15 +498,11 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
             aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
         if (neededColumns.size() == 0 ||
             (neededColsToRetrieve.size() > 0 && !statsRetrieved)) {
+          estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
           // There are some partitions with no state (or we didn't fetch any state).
           // Update the stats with empty list to reflect that in the
           // state/initialize structures.
 
-          if(columnStats.isEmpty()) {
-            // estimate stats
-            columnStats = estimateStats(table, schema, neededColumns, conf, nr);
-          }
-
           // add partition column stats
           addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList,
               columnStats);
@@ -482,6 +519,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
             LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns",
                 columnStats.size(), colStatsAvailable);
           }
+          estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
           addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList,
               columnStats);
           long betterDS = getDataSizeFromColumnStats(nr, columnStats);