diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
index 38c0eed..b63c37b 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java
@@ -6105,9 +6105,12 @@ public AggrStats get_aggr_stats_for(PartitionsStatsRequest request)
       for (String colName : request.getColNames()) {
         lowerCaseColNames.add(colName.toLowerCase());
       }
-      List<String> lowerCasePartNames = new ArrayList<String>(request.getPartNames().size());
-      for (String partName : request.getPartNames()) {
-        lowerCasePartNames.add(lowerCaseConvertPartName(partName));
+      List<String> lowerCasePartNames = null;
+      if (request.getPartNames() != null) {
+        lowerCasePartNames = new ArrayList<String>(request.getPartNames().size());
+        for (String partName : request.getPartNames()) {
+          lowerCasePartNames.add(lowerCaseConvertPartName(partName));
+        }
       }
 
       AggrStats aggrStats = null;
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java
index 909d8eb..d20aefe 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/HiveMetaStoreClient.java
@@ -2345,7 +2345,7 @@ protected void drop_table_with_environment_context(String dbname, String name,
   @Override
   public AggrStats getAggrColStatsFor(String dbName, String tblName,
       List<String> colNames, List<String> partNames) throws NoSuchObjectException, MetaException, TException {
-    if (colNames.isEmpty() || partNames.isEmpty()) {
+    if (colNames.isEmpty() || (partNames != null && partNames.isEmpty())) {
       LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval on client side.");
       return new AggrStats(new ArrayList<ColumnStatisticsObj>(),0); // Nothing to aggregate
     }
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
index 9c900af..dbc84c6 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
@@ -1197,7 +1197,8 @@ public ColumnStatistics getTableStats(final String dbName, final String tableNam
   public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
       List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation)
       throws MetaException {
-    if (colNames.isEmpty() || partNames.isEmpty()) {
+    // partNames == null means all partitions should be considered while aggregating column stats.
+    if (colNames.isEmpty() || (partNames != null && partNames.isEmpty())) {
       LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
       return new AggrStats(new ArrayList<ColumnStatisticsObj>(), 0); // Nothing to aggregate
     }
@@ -1209,7 +1210,9 @@ public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
     List<ColumnStatisticsObj> colStatsAggrFromDB;
     int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
     float fpp = aggrStatsCache.getFalsePositiveProbability();
-    int partitionsRequested = partNames.size();
+    // TODO: Needs to be revisited; is setting partitionsRequested to Integer.MAX_VALUE the best
+    // way to do this?
+    int partitionsRequested = partNames == null ? Integer.MAX_VALUE : partNames.size();
     if (partitionsRequested > maxPartsPerCacheNode) {
       colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames,
           partsFound, useDensityFunctionForNDVEstimation);
@@ -1260,19 +1263,21 @@ private BloomFilter createPartsBloomFilter(int maxPartsPerCacheNode, float fpp,
 
   private long partsFoundForPartitions(final String dbName, final String tableName,
       final List<String> partNames, List<String> colNames) throws MetaException {
-    assert !colNames.isEmpty() && !partNames.isEmpty();
+    assert !colNames.isEmpty() && (partNames == null || !partNames.isEmpty());
     final boolean doTrace = LOG.isDebugEnabled();
     final String queryText0 = "select count(\"COLUMN_NAME\") from \"PART_COL_STATS\""
         + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
-        + " and \"COLUMN_NAME\" in (%1$s) and \"PARTITION_NAME\" in (%2$s)"
+        + " and \"COLUMN_NAME\" in (%1$s) "
+        + (partNames == null ? "" : " and \"PARTITION_NAME\" in (%2$s)")
         + " group by \"PARTITION_NAME\"";
     List<Long> allCounts = runBatched(colNames, new Batchable<String, Long>() {
       public List<Long> run(final List<String> inputColName) throws MetaException {
         return runBatched(partNames, new Batchable<String, Long>() {
           public List<Long> run(List<String> inputPartNames) throws MetaException {
             long partsFound = 0;
-            String queryText = String.format(queryText0,
-                makeParams(inputColName.size()), makeParams(inputPartNames.size()));
+            String queryText = inputPartNames != null ? (String.format(queryText0,
+                makeParams(inputColName.size()), makeParams(inputPartNames.size()))) :
+                (String.format(queryText0, makeParams(inputColName.size())));
             long start = doTrace ? System.nanoTime() : 0;
             Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
             try {
@@ -1305,7 +1310,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName
   private List<ColumnStatisticsObj> columnStatisticsObjForPartitions(final String dbName,
       final String tableName, final List<String> partNames, List<String> colNames, long partsFound,
       final boolean useDensityFunctionForNDVEstimation) throws MetaException {
-    final boolean areAllPartsFound = (partsFound == partNames.size());
+    final boolean areAllPartsFound = partNames == null ? true : (partsFound == partNames.size());
     return runBatched(colNames, new Batchable<String, ColumnStatisticsObj>() {
       public List<ColumnStatisticsObj> run(final List<String> inputColNames) throws MetaException {
         return runBatched(partNames, new Batchable<String, ColumnStatisticsObj>() {
@@ -1358,7 +1363,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName
     // Extrapolation is not needed.
     if (areAllPartsFound) {
       queryText = commonPrefix + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
-          + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+          + (partNames == null ? "" : " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")")
           + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
       start = doTrace ? System.nanoTime() : 0;
       query = pm.newQuery("javax.jdo.query.SQL", queryText);
@@ -1386,7 +1391,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName
       queryText = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", count(\"PARTITION_NAME\") "
           + " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
           + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
-          + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+          + (partNames == null ? "" : (" and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"))
          + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
       start = doTrace ? System.nanoTime() : 0;
       query = pm.newQuery("javax.jdo.query.SQL", queryText);
@@ -1420,8 +1425,9 @@ private long partsFoundForPartitions(final String dbName, final String tableName
       // Extrapolation is not needed for columns noExtraColumnNames
       if (noExtraColumnNames.size() != 0) {
         queryText = commonPrefix + " and \"COLUMN_NAME\" in ("
-            + makeParams(noExtraColumnNames.size()) + ")" + " and \"PARTITION_NAME\" in ("
-            + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
+            + makeParams(noExtraColumnNames.size()) + ")"
+            + (partNames == null ? "" : (" and \"PARTITION_NAME\" in ("
+            + makeParams(partNames.size()) + ")")) + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
         start = doTrace ? System.nanoTime() : 0;
         query = pm.newQuery("javax.jdo.query.SQL", queryText);
         qResult = executeWithArray(query,
@@ -1450,9 +1456,9 @@ private long partsFoundForPartitions(final String dbName, final String tableName
         Map<String, Map<Integer, Object>> sumMap = new HashMap<String, Map<Integer, Object>>();
         queryText = "select \"COLUMN_NAME\", sum(\"NUM_NULLS\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), sum(\"NUM_DISTINCTS\")"
             + " from \"PART_COL_STATS\" where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
-            + " and \"COLUMN_NAME\" in (" + makeParams(extraColumnNameTypeParts.size())
-            + ") and \"PARTITION_NAME\" in (" + makeParams(partNames.size())
-            + ") group by \"COLUMN_NAME\"";
+            + " and \"COLUMN_NAME\" in (" + makeParams(extraColumnNameTypeParts.size()) + ") "
+            + (partNames == null ? "" : (" and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ") "))
+            + " group by \"COLUMN_NAME\"";
         start = doTrace ? System.nanoTime() : 0;
         query = pm.newQuery("javax.jdo.query.SQL", queryText);
         List<String> extraColumnNames = new ArrayList<String>();
@@ -1523,13 +1529,13 @@ private long partsFoundForPartitions(final String dbName, final String tableName
             queryText = "select \"" + colStatName + "\",\"PARTITION_NAME\" from \"PART_COL_STATS\""
                 + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?"
                 + " and \"COLUMN_NAME\" = ?"
-                + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+                + (partNames == null ? "" : (" and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"))
                 + " order by \"" + colStatName + "\"";
           } else {
             queryText = "select \"" + colStatName + "\",\"PARTITION_NAME\" from \"PART_COL_STATS\""
                 + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?"
                 + " and \"COLUMN_NAME\" = ?"
-                + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+                + (partNames == null ? "" : (" and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"))
                 + " order by cast(\"" + colStatName + "\" as decimal)";
           }
           start = doTrace ? System.nanoTime() : 0;
@@ -1559,8 +1565,9 @@ private long partsFoundForPartitions(final String dbName, final String tableName
             + "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\"),"
             + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")"
             + " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?"
-            + " and \"COLUMN_NAME\" = ?" + " and \"PARTITION_NAME\" in ("
-            + makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\"";
+            + " and \"COLUMN_NAME\" = ?"
+            + (partNames == null ? "" : (" and \"PARTITION_NAME\" in ("
+            + makeParams(partNames.size()) + ")")) + " group by \"COLUMN_NAME\"";
         start = doTrace ? System.nanoTime() : 0;
         query = pm.newQuery("javax.jdo.query.SQL", queryText);
         qResult = executeWithArray(query,
@@ -1612,16 +1619,17 @@ private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i,
 
   private Object[] prepareParams(String dbName, String tableName, List<String> partNames,
       List<String> colNames) throws MetaException {
-    Object[] params = new Object[colNames.size() + partNames.size() + 2];
+    Object[] params = new Object[colNames.size() + (partNames == null ? 0 : partNames.size()) + 2];
     int paramI = 0;
     params[paramI++] = dbName;
     params[paramI++] = tableName;
     for (String colName : colNames) {
       params[paramI++] = colName;
     }
-    for (String partName : partNames) {
-      params[paramI++] = partName;
-    }
+    if (partNames != null)
+      for (String partName : partNames) {
+        params[paramI++] = partName;
+      }
 
     return params;
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
index 26e936e..69a2ac0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
@@ -237,7 +237,7 @@ private static PrunedPartitionList getAllPartsFromCacheOrServer(Table tab, Strin
     } catch (HiveException e) {
       throw new SemanticException(e);
     }
-    ppList = new PrunedPartitionList(tab, parts, null, unknownPartitions);
+    ppList = new PrunedPartitionList(tab, parts, null, unknownPartitions, true);
     if (partsCache != null) {
       partsCache.put(key, ppList);
     }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java
index da2e1e2..c072cbf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/PrunedPartitionList.java
@@ -42,6 +42,9 @@
   /** Whether there are partitions in the list that may or may not satisfy the criteria. */
   private boolean hasUnknowns;
 
+  /** Whether the list contains all of the table's partitions; may have false negatives. */
+  private boolean hasAllPartitions;
+
   public PrunedPartitionList(Table source, Set<Partition> partitions, List<String> referred,
       boolean hasUnknowns) {
     this.source = source;
@@ -50,6 +53,19 @@ public PrunedPartitionList(Table source, Set<Partition> partitions, List<String> referred,
     this.hasUnknowns = hasUnknowns;
   }
 
+  public PrunedPartitionList(Table source, Set<Partition> partitions, List<String> referred,
+      boolean hasUnknowns, boolean hasAllPartitions) {
+    this.source = source;
+    this.referred = referred;
+    this.partitions = partitions;
+    this.hasUnknowns = hasUnknowns;
+    this.hasAllPartitions = hasAllPartitions;
+  }
+
+  public boolean getHasAllPartitions() {
+    return hasAllPartitions;
+  }
+
   public Table getSourceTable() {
     return source;
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index d8acf94..966819a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -260,16 +260,19 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa
       stats.setBasicStatsState(State.PARTIAL);
     }
     if (fetchColStats) {
-      List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
-      for (Partition part : partList.getNotDeniedPartns()) {
-        partNames.add(part.getName());
+      List<String> partNames = null;
+      if (!partList.getHasAllPartitions()) {
+        partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
+        for (Partition part : partList.getNotDeniedPartns()) {
+          partNames.add(part.getName());
+        }
       }
       neededColumns = processNeededColumns(schema, neededColumns);
       AggrStats aggrStats = null;
       // We check the sizes of neededColumns and partNames here. If either
       // size is 0, aggrStats is null after several retries. Thus, we can
       // skip the step to connect to the metastore.
-      if (neededColumns.size() > 0 && partNames.size() > 0) {
+      if (neededColumns.size() > 0 && (partNames == null || partNames.size() > 0)) {
         aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(),
             neededColumns, partNames);
       }
@@ -304,7 +307,8 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa
 
       stats.addToColumnStats(columnStats);
       State colState = deriveStatType(columnStats, referencedColumns);
-      if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
+      // If partNames == null, the HMS retrieves stats for all partitions.
+      if (partNames != null && aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
         LOG.debug("Column stats requested for : " + partNames.size() + " partitions. "
             + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
         colState = State.PARTIAL;
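--
Reviewer note: the convention this patch introduces is that partNames == null now
means "aggregate over all partitions", while an empty list still short-circuits to
an empty AggrStats. Below is a minimal, self-contained sketch of that pattern, not
code from the patch: PartNamesConvention and partitionPredicate are hypothetical
names, and makeParams only imitates the private helper of the same name in
MetaStoreDirectSql.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class PartNamesConvention {
  // Emits "?,?,...,?" with n bind placeholders, imitating MetaStoreDirectSql.makeParams.
  static String makeParams(int n) {
    return String.join(",", Collections.nCopies(n, "?"));
  }

  // Hypothetical helper showing the pattern repeated throughout the patch:
  // a null partNames list adds no PARTITION_NAME filter, so the aggregation
  // query runs over every partition of the table.
  static String partitionPredicate(List<String> partNames) {
    return partNames == null
        ? ""
        : " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")";
  }

  public static void main(String[] args) {
    // Explicit list: one bind placeholder per partition name.
    System.out.println(partitionPredicate(Arrays.asList("ds=2015-01-01", "ds=2015-01-02")));
    // null: the predicate disappears entirely ("all partitions").
    System.out.println(partitionPredicate(null));
    // An empty list never reaches SQL generation: getAggrColStatsFor and
    // aggrColStatsForPartitions short-circuit and return an empty AggrStats.
  }
}

This is the convention that PrunedPartitionList.getHasAllPartitions() feeds in
StatsUtils.collectStatistics: when the pruner kept every partition, the client
passes partNames = null rather than enumerating every partition name.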