diff --git metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
index 561f3e3..dadc6f6 100644
--- metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
+++ metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
@@ -1188,44 +1188,46 @@ public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
       LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
       return new AggrStats(new ArrayList<ColumnStatisticsObj>(), 0); // Nothing to aggregate
     }
-    long partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
+    long partsFound = 0;
     List<ColumnStatisticsObj> colStatsList;
     // Try to read from the cache first
-    if (isAggregateStatsCacheEnabled) {
+    if (isAggregateStatsCacheEnabled
+        && (partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode())) {
       AggrColStats colStatsAggrCached;
       List<ColumnStatisticsObj> colStatsAggrFromDB;
       int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
       float fpp = aggrStatsCache.getFalsePositiveProbability();
-      int partitionsRequested = partNames.size();
-      if (partitionsRequested > maxPartsPerCacheNode) {
-        colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames,
-            partsFound, useDensityFunctionForNDVEstimation);
-      } else {
-        colStatsList = new ArrayList<ColumnStatisticsObj>();
-        // Bloom filter for the new node that we will eventually add to the cache
-        BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
-        for (String colName : colNames) {
-          // Check the cache first
-          colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
-          if (colStatsAggrCached != null) {
-            colStatsList.add(colStatsAggrCached.getColStats());
-          } else {
-            List<String> colNamesForDB = new ArrayList<String>();
-            colNamesForDB.add(colName);
-            // Read aggregated stats for one column
-            colStatsAggrFromDB =
-                columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
-                    partsFound, useDensityFunctionForNDVEstimation);
-            if (!colStatsAggrFromDB.isEmpty()) {
-              ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
-              colStatsList.add(colStatsAggr);
-              // Update the cache to add this new aggregate node
-              aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
-            }
+      colStatsList = new ArrayList<ColumnStatisticsObj>();
+      // Bloom filter for the new node that we will eventually add to the cache
+      BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
+      boolean computePartsFound = true;
+      for (String colName : colNames) {
+        // Check the cache first
+        colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
+        if (colStatsAggrCached != null) {
+          colStatsList.add(colStatsAggrCached.getColStats());
+          partsFound = colStatsAggrCached.getNumPartsCached();
+        } else {
+          if (computePartsFound) {
+            partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
+            computePartsFound = false;
+          }
+          List<String> colNamesForDB = new ArrayList<String>();
+          colNamesForDB.add(colName);
+          // Read aggregated stats for one column
+          colStatsAggrFromDB =
+              columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
+                  partsFound, useDensityFunctionForNDVEstimation);
+          if (!colStatsAggrFromDB.isEmpty()) {
+            ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
+            colStatsList.add(colStatsAggr);
+            // Update the cache to add this new aggregate node
+            aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
          }
        }
      }
    } else {
+      partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
       colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames,
           partsFound, useDensityFunctionForNDVEstimation);
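The hunk above defers the expensive partsFoundForPartitions() query until a column actually misses the aggregate stats cache, and runs it at most once per call; when every column is a cache hit, the call makes no metastore round trip at all. Below is a minimal, self-contained sketch of that lazy-computation pattern. All class and method names here (LazyAggregateLookup, expensivePartsFound, aggregateFromDb) are illustrative stand-ins, not Hive APIs.

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class LazyAggregateLookup {
      private final Map<String, Long> cache = new HashMap<>();

      // Stand-in for the expensive partsFoundForPartitions() metastore query.
      private long expensivePartsFound() {
        System.out.println("expensive query executed");
        return 42L;
      }

      // Stand-in for columnStatisticsObjForPartitions() for a single column.
      private long aggregateFromDb(String col, long partsFound) {
        return partsFound + col.length();
      }

      public List<Long> statsFor(List<String> colNames) {
        List<Long> results = new ArrayList<>();
        long partsFound = 0;
        boolean computePartsFound = true; // run the expensive query at most once
        for (String col : colNames) {
          Long cached = cache.get(col);
          if (cached != null) {
            results.add(cached); // cache hit: skip the expensive query entirely
          } else {
            if (computePartsFound) { // first miss pays the cost; later misses reuse it
              partsFound = expensivePartsFound();
              computePartsFound = false;
            }
            long fromDb = aggregateFromDb(col, partsFound);
            cache.put(col, fromDb);
            results.add(fromDb);
          }
        }
        return results;
      }

      public static void main(String[] args) {
        LazyAggregateLookup lookup = new LazyAggregateLookup();
        System.out.println(lookup.statsFor(List.of("a", "bb"))); // one expensive query
        System.out.println(lookup.statsFor(List.of("a", "bb"))); // all hits: zero queries
      }
    }

The same trade-off appears in the patch's else branch: when the cache path is skipped (cache disabled, or more partitions requested than fit in a cache node), partsFound is computed eagerly because the DB aggregation needs it unconditionally.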
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
index 73ca9bf..4ebbb13 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
@@ -412,20 +412,32 @@ private int getDistinctCount(Set<Partition> partitions, String partColName) {
   public List<ColStatistics> getColStat(List<Integer> projIndxLst, boolean allowNullColumnForMissingStats) {
     List<ColStatistics> colStatsBldr = Lists.newArrayList();
-
     if (projIndxLst != null) {
-      updateColStats(new HashSet<Integer>(projIndxLst), allowNullColumnForMissingStats);
+      Set<Integer> projIndxSet = new HashSet<Integer>(projIndxLst);
       for (Integer i : projIndxLst) {
-        colStatsBldr.add(hiveColStatsMap.get(i));
+        if (hiveColStatsMap.get(i) != null) {
+          colStatsBldr.add(hiveColStatsMap.get(i));
+          projIndxSet.remove(i);
+        }
+      }
+      if (!projIndxSet.isEmpty()) {
+        updateColStats(projIndxSet, allowNullColumnForMissingStats);
+        for (Integer i : projIndxSet) {
+          colStatsBldr.add(hiveColStatsMap.get(i));
+        }
       }
     } else {
       List<Integer> pILst = new ArrayList<Integer>();
       for (Integer i = 0; i < noOfNonVirtualCols; i++) {
-        pILst.add(i);
+        if (hiveColStatsMap.get(i) == null) {
+          pILst.add(i);
+        }
       }
-      updateColStats(new HashSet<Integer>(pILst), allowNullColumnForMissingStats);
-      for (Integer pi : pILst) {
-        colStatsBldr.add(hiveColStatsMap.get(pi));
+      if (!pILst.isEmpty()) {
+        updateColStats(new HashSet<Integer>(pILst), allowNullColumnForMissingStats);
+        for (Integer pi : pILst) {
+          colStatsBldr.add(hiveColStatsMap.get(pi));
+        }
       }
     }
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 3a179a3..37baaf6 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -749,8 +749,8 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al
       return false;
     }
     VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
-    if (LOG.isInfoEnabled()) {
-      LOG.info("Vectorizer path: " + path + ", " + vectorPartDesc.toString() +
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Vectorizer path: " + path + ", " + vectorPartDesc.toString() +
          ", aliases " + aliases);
     }
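The RelOptHiveTable hunk applies the same idea at the planner level: getColStat() now serves columns already present in hiveColStatsMap from memory and calls updateColStats() only for the missing indices, in one bulk request. (Note the construction of projIndxSet is placed inside the null check here, since building a HashSet from projIndxLst before testing it for null would throw an NPE on the path the else branch is meant to handle.) A rough, self-contained sketch of the hit/miss split, using placeholder names (statsMap, fetchMissing) rather than the real Hive types:

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    public class ColumnStatsLookup {
      // Stand-in for hiveColStatsMap: column index -> cached stats.
      private final Map<Integer, String> statsMap = new HashMap<>();

      // Stand-in for updateColStats(): bulk-populates the map for the misses.
      private void fetchMissing(Set<Integer> missing) {
        System.out.println("fetching " + missing);
        for (Integer i : missing) {
          statsMap.put(i, "stats-" + i);
        }
      }

      public List<String> getColStat(List<Integer> projIndxLst) {
        List<String> result = new ArrayList<>();
        Set<Integer> missing = new HashSet<>(projIndxLst);
        for (Integer i : projIndxLst) {
          String cached = statsMap.get(i);
          if (cached != null) {
            result.add(cached); // already cached: serve from memory
            missing.remove(i);
          }
        }
        if (!missing.isEmpty()) {
          fetchMissing(missing); // one bulk fetch, misses only
          for (Integer i : missing) {
            result.add(statsMap.get(i));
          }
        }
        return result;
      }

      public static void main(String[] args) {
        ColumnStatsLookup lookup = new ColumnStatsLookup();
        System.out.println(lookup.getColStat(List.of(0, 1))); // fetches {0, 1}
        System.out.println(lookup.getColStat(List.of(1, 2))); // fetches only {2}
      }
    }

One subtlety visible in both the sketch and the patch: results are appended cache hits first and misses second, so they no longer come back in projIndxLst order; callers that rely on positional alignment with the projection list would need to reorder them.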