commit eb8c9c14e147c44e0f29f0681fe6d2fe8faacdcb Author: Ivan Suller Date: Thu Jun 6 11:08:37 2019 +0200 HIVE-21842 diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index e184b9d0a4..e0f401beb6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -263,7 +263,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p // long nr = getNumRows(conf, schema, neededColumns, table, ds); long ds = basicStats.getDataSize(); long nr = basicStats.getNumRows(); - List colStats = Lists.newArrayList(); + List colStats = Collections.emptyList(); long numErasureCodedFiles = getErasureCodedFiles(table); @@ -303,11 +303,6 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p } BasicStats bbs = BasicStats.buildFrom(partStats); - List rowCounts = Lists.newArrayList(); - for (BasicStats basicStats : partStats) { - rowCounts.add(basicStats.getNumRows()); - } - long nr = bbs.getNumRows(); long ds = bbs.getDataSize(); @@ -316,7 +311,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles); if (nr == 0) { - nr=1; + nr = 1; } stats = new Statistics(nr, ds, numErasureCodedFiles); stats.setBasicStatsState(bbs.getState()); @@ -331,51 +326,12 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p List partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns); // We will retrieve stats from the metastore only for columns that are not cached - List neededColsToRetrieve; - List partitionColsToRetrieve; List columnStats = new ArrayList<>(); - if (colStatsCache != null) { - neededColsToRetrieve = new ArrayList(neededColumns.size()); - for (String colName : neededColumns) { - ColStatistics colStats = 
colStatsCache.getColStats().get(colName); - if (colStats == null) { - neededColsToRetrieve.add(colName); - if (LOG.isDebugEnabled()) { - LOG.debug("Stats for column " + colName + - " in table " + table.getCompleteName() + " could not be retrieved from cache"); - } - } else { - columnStats.add(colStats); - if (LOG.isDebugEnabled()) { - LOG.debug("Stats for column " + colName + - " in table " + table.getCompleteName() + " retrieved from cache"); - } - } - } - partitionColsToRetrieve = new ArrayList<>(partitionCols.size()); - for (String colName : partitionCols) { - ColStatistics colStats = colStatsCache.getColStats().get(colName); - if (colStats == null) { - partitionColsToRetrieve.add(colName); - if (LOG.isDebugEnabled()) { - LOG.debug("Stats for column " + colName + - " in table " + table.getCompleteName() + " could not be retrieved from cache"); - } - } else { - columnStats.add(colStats); - if (LOG.isDebugEnabled()) { - LOG.debug("Stats for column " + colName + - " in table " + table.getCompleteName() + " retrieved from cache"); - } - } - } - } else { - neededColsToRetrieve = neededColumns; - partitionColsToRetrieve = partitionCols; - } + List neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats); + List partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats); // List of partitions - List partNames = new ArrayList(partList.getNotDeniedPartns().size()); + List partNames = new ArrayList<>(partList.getNotDeniedPartns().size()); for (Partition part : partList.getNotDeniedPartns()) { partNames.add(part.getName()); } @@ -384,15 +340,14 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p // We check the sizes of neededColumns and partNames here. If either // size is 0, aggrStats is null after several retries. Thus, we can // skip the step to connect to the metastore. 
- if (fetchColStats && neededColsToRetrieve.size() > 0 && partNames.size() > 0) { + if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) { aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames, false); } boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0; - if (neededColumns.size() == 0 || - (neededColsToRetrieve.size() > 0 && !statsRetrieved)) { + if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) { estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema); // There are some partitions with no state (or we didn't fetch any state). // Update the stats with empty list to reflect that in the @@ -439,7 +394,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p } } - if(rowCounts.size() == 0 ) { + if (partStats.isEmpty()) { // all partitions are filtered by partition pruning stats.setBasicStatsState(State.COMPLETE); } @@ -459,6 +414,26 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p return stats; } + /** Splits {@code columns} by cache availability. If {@code colStatsCache} is null, returns {@code columns} itself (the same list, not a copy). Otherwise looks each name up in {@code colStatsCache.getColStats()}, appends every cache hit to the caller-supplied {@code columnStats} list (which is mutated), and returns a new list holding the names that had no cached entry; each outcome is logged at debug level. NOTE(review): the List types appear raw in this view although the loop iterates them as String; generic arguments were presumably lost in extraction, confirm against the real file. */ private static List extractColumnStates(Table table, List columns, + ColumnStatsList colStatsCache, List columnStats) { + if (colStatsCache == null) { + return columns; + } + List neededColsToRetrieve = new ArrayList<>(columns.size()); + for (String colName : columns) { + ColStatistics colStats = colStatsCache.getColStats().get(colName); + if (colStats == null) { + neededColsToRetrieve.add(colName); + LOG.debug("Stats for column {} in table {} could not be retrieved from cache", colName, + table.getCompleteName()); + } else { + columnStats.add(colStats); + LOG.debug("Stats for column {} in table {} retrieved from cache", colName, table.getCompleteName()); + } + } + return neededColsToRetrieve; + } + /** * Based on the provided column statistics and number of rows, this method infers if the column