diff --git metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
index 561f3e3..83b6e1a 100644
--- metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
+++ metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
@@ -1188,7 +1188,7 @@ public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
       LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
       return new AggrStats(new ArrayList(), 0); // Nothing to aggregate
     }
-    long partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
+    LOG.debug("partNames size={}", partNames.size());
     List colStatsList;
     // Try to read from the cache first
     if (isAggregateStatsCacheEnabled) {
@@ -1199,7 +1199,7 @@ public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
       int partitionsRequested = partNames.size();
       if (partitionsRequested > maxPartsPerCacheNode) {
         colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames,
-            partsFound, useDensityFunctionForNDVEstimation);
+            useDensityFunctionForNDVEstimation);
       } else {
         colStatsList = new ArrayList();
         // Bloom filter for the new node that we will eventually add to the cache
@@ -1215,25 +1215,25 @@ public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
             // Read aggregated stats for one column
             colStatsAggrFromDB =
                 columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
-                    partsFound, useDensityFunctionForNDVEstimation);
+                    useDensityFunctionForNDVEstimation);
             if (!colStatsAggrFromDB.isEmpty()) {
               ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
               colStatsList.add(colStatsAggr);
               // Update the cache to add this new aggregate node
-              aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
+              aggrStatsCache.add(dbName, tableName, colName, partNames.size(), colStatsAggr, bloomFilter);
             }
           }
         }
       }
     } else {
       colStatsList =
-          columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound,
+          columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames,
               useDensityFunctionForNDVEstimation);
     }
     LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation
-        + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = "
+        + "\nparts = " + partNames.size() + "\nColumnStatisticsObj = "
         + Arrays.toString(colStatsList.toArray()));
-    return new AggrStats(colStatsList, partsFound);
+    return new AggrStats(colStatsList, partNames.size());
   }

   private BloomFilter createPartsBloomFilter(int maxPartsPerCacheNode, float fpp,
@@ -1290,15 +1290,14 @@ private long partsFoundForPartitions(final String dbName, final String tableName
   }

   private List columnStatisticsObjForPartitions(final String dbName,
-      final String tableName, final List partNames, List colNames, long partsFound,
+      final String tableName, final List partNames, final List colNames,
       final boolean useDensityFunctionForNDVEstimation) throws MetaException {
-    final boolean areAllPartsFound = (partsFound == partNames.size());
     return runBatched(colNames, new Batchable() {
       public List run(final List inputColNames) throws MetaException {
         return runBatched(partNames, new Batchable() {
           public List run(List inputPartNames) throws MetaException {
             return columnStatisticsObjForPartitionsBatch(dbName, tableName, inputPartNames,
-                inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation);
+                inputColNames, useDensityFunctionForNDVEstimation);
           }
         });
       }
@@ -1307,7 +1306,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName

   /** Should be called with the list short enough to not trip up Oracle/etc. */
   private List columnStatisticsObjForPartitionsBatch(String dbName,
-      String tableName, List partNames, List colNames, boolean areAllPartsFound,
+      String tableName, List partNames, List colNames,
       boolean useDensityFunctionForNDVEstimation) throws MetaException {
     // TODO: all the extrapolation logic should be moved out of this class,
     // only mechanical data retrieval should remain here.
@@ -1343,6 +1342,9 @@ private long partsFoundForPartitions(final String dbName, final String tableName
     ForwardQueryResult fqr = null;
     // Check if the status of all the columns of all the partitions exists
     // Extrapolation is not needed.
+    boolean areAllPartsFound = true;
+    int partsSize = partNames.size();
+    LOG.debug("parts size={}", partsSize);
     if (areAllPartsFound) {
       queryText = commonPrefix + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
           + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
@@ -1358,18 +1360,24 @@ private long partsFoundForPartitions(final String dbName, final String tableName
       end = doTrace ? System.nanoTime() : 0;
       timingTrace(doTrace, queryText, start, end);
       List list = ensureList(qResult);
-      List colStats = new ArrayList(list.size());
-      for (Object[] row : list) {
-        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
-        Deadline.checkTimeout();
+      if (list.size() != partNames.size()) {
+        areAllPartsFound = false;
+        LOG.info("Could not find all parts for columns. Need to extrapolate some of them");
+      } else {
+        List colStats = new ArrayList(list.size());
+        for (Object[] row : list) {
+          colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
+          Deadline.checkTimeout();
+        }
+        query.closeAll();
+        return colStats;
       }
-      query.closeAll();
-      return colStats;
-    } else {
+    }
+    List colStats = new ArrayList(colNames.size());
+    if (!areAllPartsFound) {
       // Extrapolation is needed for some columns.
       // In this case, at least a column status for a partition is missing.
       // We need to extrapolate this partition based on the other partitions
-      List colStats = new ArrayList(colNames.size());
       queryText = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", count(\"PARTITION_NAME\") "
          + " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
" + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")" @@ -1570,8 +1578,8 @@ private long partsFoundForPartitions(final String dbName, final String tableName Deadline.checkTimeout(); } } - return colStats; } + return colStats; } private ColumnStatisticsObj prepareCSObj (Object[] row, int i) throws MetaException { diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java index 73ca9bf..4ebbb13 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java @@ -412,20 +412,32 @@ private int getDistinctCount(Set partitions, String partColName) { public List getColStat(List projIndxLst, boolean allowNullColumnForMissingStats) { List colStatsBldr = Lists.newArrayList(); - + Set projIndxSet = new HashSet(projIndxLst); if (projIndxLst != null) { - updateColStats(new HashSet(projIndxLst), allowNullColumnForMissingStats); for (Integer i : projIndxLst) { - colStatsBldr.add(hiveColStatsMap.get(i)); + if (hiveColStatsMap.get(i) != null) { + colStatsBldr.add(hiveColStatsMap.get(i)); + projIndxSet.remove(i); + } + } + if (!projIndxSet.isEmpty()) { + updateColStats(projIndxSet, allowNullColumnForMissingStats); + for (Integer i : projIndxSet) { + colStatsBldr.add(hiveColStatsMap.get(i)); + } } } else { List pILst = new ArrayList(); for (Integer i = 0; i < noOfNonVirtualCols; i++) { - pILst.add(i); + if (hiveColStatsMap.get(i) == null) { + pILst.add(i); + } } - updateColStats(new HashSet(pILst), allowNullColumnForMissingStats); - for (Integer pi : pILst) { - colStatsBldr.add(hiveColStatsMap.get(pi)); + if (!pILst.isEmpty()) { + updateColStats(new HashSet(pILst), allowNullColumnForMissingStats); + for (Integer pi : pILst) { + colStatsBldr.add(hiveColStatsMap.get(pi)); + } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 3a179a3..37baaf6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -749,8 +749,8 @@ private boolean validateInputFormatAndSchemaEvolution(MapWork mapWork, String al return false; } VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc(); - if (LOG.isInfoEnabled()) { - LOG.info("Vectorizer path: " + path + ", " + vectorPartDesc.toString() + + if (LOG.isDebugEnabled()) { + LOG.debug("Vectorizer path: " + path + ", " + vectorPartDesc.toString() + ", aliases " + aliases); }