From 3598434c1d268f43d22a4f7b1b22e46012f86c6d Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Wed, 9 Aug 2017 18:42:45 -0700 Subject: [PATCH] HIVE-17285 : Fixes for bit vector retrievals and merging --- .../hadoop/hive/metastore/MetaStoreDirectSql.java | 35 +++++++--- .../hadoop/hive/metastore/MetaStoreUtils.java | 56 +++++++++++---- .../hadoop/hive/metastore/cache/CachedStore.java | 81 +++++++++------------- .../aggr/DateColumnStatsAggregator.java | 16 +++-- .../aggr/DecimalColumnStatsAggregator.java | 12 ++-- .../aggr/DoubleColumnStatsAggregator.java | 16 +++-- .../aggr/LongColumnStatsAggregator.java | 15 ++-- .../aggr/StringColumnStatsAggregator.java | 14 ++-- 8 files changed, 147 insertions(+), 98 deletions(-) diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java index 73754fff85..b3274ca6dd 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java @@ -127,7 +127,7 @@ @java.lang.annotation.Target(java.lang.annotation.ElementType.FIELD) @java.lang.annotation.Retention(java.lang.annotation.RetentionPolicy.RUNTIME) private @interface TableName {} - + // Table names with schema name, if necessary @TableName private String DBS, TBLS, PARTITIONS, DATABASE_PARAMS, PARTITION_PARAMS, SORT_COLS, SD_PARAMS, @@ -151,7 +151,7 @@ public MetaStoreDirectSql(PersistenceManager pm, Configuration conf, String sche batchSize = DatabaseProduct.needsInBatching(dbType) ? 1000 : NO_BATCHING; } this.batchSize = batchSize; - + for (java.lang.reflect.Field f : this.getClass().getDeclaredFields()) { if (f.getAnnotation(TableName.class) == null) continue; try { @@ -281,7 +281,7 @@ private boolean runTestQuery() { public String getSchema() { return schema; } - + public boolean isCompatibleDatastore() { return isCompatibleDatastore; } @@ -393,6 +393,7 @@ public Database getDatabase(String dbName) throws MetaException{ return Collections.emptyList(); } return runBatched(partNames, new Batchable() { + @Override public List run(List input) throws MetaException { String filter = "" + PARTITIONS + ".\"PART_NAME\" in (" + makeParams(input.size()) + ")"; return getPartitionsViaSqlFilterInternal(dbName, tblName, null, filter, input, @@ -415,8 +416,8 @@ public Database getDatabase(String dbName) throws MetaException{ } public static class SqlFilterForPushdown { - private List params = new ArrayList(); - private List joins = new ArrayList(); + private final List params = new ArrayList(); + private final List joins = new ArrayList(); private String filter; private Table table; } @@ -526,6 +527,7 @@ private boolean isViewTable(String dbName, String tblName) throws MetaException // Get full objects. For Oracle/etc. do it in batches. 
List result = runBatched(sqlResult, new Batchable() { + @Override public List run(List input) throws MetaException { return getPartitionsFromPartitionIds(dbNameLcase, tblNameLcase, isView, input); } @@ -949,6 +951,7 @@ static String extractSqlBlob(Object value) throws MetaException { if (value == null) return null; if (value instanceof Blob) { + //derby, oracle try { // getBytes function says: pos the ordinal position of the first byte in // the BLOB value to be extracted; the first byte is at position 1 @@ -956,7 +959,12 @@ static String extractSqlBlob(Object value) throws MetaException { } catch (SQLException e) { throw new MetaException("Encounter error while processing blob."); } - } else { + } + else if (value instanceof byte[]) { + // mysql, postgres, sql server + return new String((byte[])value); + } + else { // this may happen when enablebitvector is false LOG.debug("Expected blob type but got " + value.getClass().getName()); return null; @@ -1251,6 +1259,7 @@ public ColumnStatistics getTableStats(final String dbName, final String tableNam final String queryText0 = "select " + getStatsList(enableBitVector) + " from " + TAB_COL_STATS + " " + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\" in ("; Batchable b = new Batchable() { + @Override public List run(List input) throws MetaException { String queryText = queryText0 + makeParams(input.size()) + ")"; Object[] params = new Object[input.size() + 2]; @@ -1356,8 +1365,10 @@ private long partsFoundForPartitions(final String dbName, final String tableName + " and \"COLUMN_NAME\" in (%1$s) and \"PARTITION_NAME\" in (%2$s)" + " group by \"PARTITION_NAME\""; List allCounts = runBatched(colNames, new Batchable() { + @Override public List run(final List inputColName) throws MetaException { return runBatched(partNames, new Batchable() { + @Override public List run(List inputPartNames) throws MetaException { long partsFound = 0; String queryText = String.format(queryText0, @@ -1396,8 +1407,10 @@ private long partsFoundForPartitions(final String dbName, final String tableName final boolean useDensityFunctionForNDVEstimation, final double ndvTuner, final boolean enableBitVector) throws MetaException { final boolean areAllPartsFound = (partsFound == partNames.size()); return runBatched(colNames, new Batchable() { + @Override public List run(final List inputColNames) throws MetaException { return runBatched(partNames, new Batchable() { + @Override public List run(List inputPartNames) throws MetaException { return columnStatisticsObjForPartitionsBatch(dbName, tableName, inputPartNames, inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector); @@ -1466,7 +1479,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName String tableName, List partNames, List colNames, boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner, boolean enableBitVector) throws MetaException { if(enableBitVector) { - return aggrStatsUseJava(dbName, tableName, partNames, colNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner); + return aggrStatsUseJava(dbName, tableName, partNames, colNames, useDensityFunctionForNDVEstimation, ndvTuner); } else { return aggrStatsUseDB(dbName, tableName, partNames, colNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner); @@ -1474,14 +1487,14 @@ private long partsFoundForPartitions(final String dbName, final String tableName } private List aggrStatsUseJava(String dbName, String tableName, - List 
partNames, List colNames, boolean areAllPartsFound, + List partNames, List colNames, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { // 1. get all the stats for colNames in partNames; List partStats = getPartitionStats(dbName, tableName, partNames, colNames, true); // 2. use util function to aggr stats return MetaStoreUtils.aggrPartitionStats(partStats, dbName, tableName, partNames, colNames, - areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner); + useDensityFunctionForNDVEstimation, ndvTuner); } private List aggrStatsUseDB(String dbName, @@ -1679,7 +1692,7 @@ private long partsFoundForPartitions(final String dbName, final String tableName row[2 + colStatIndex] = null; } else { Long val = extractSqlLong(o); - row[2 + colStatIndex] = (Long) (val / sumVal * (partNames.size())); + row[2 + colStatIndex] = val / sumVal * (partNames.size()); } } else if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Min || IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Max) { @@ -1802,8 +1815,10 @@ private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i, + " " + PART_COL_STATS + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\"" + " in (%1$s) AND \"PARTITION_NAME\" in (%2$s) order by \"PARTITION_NAME\""; Batchable b = new Batchable() { + @Override public List run(final List inputColNames) throws MetaException { Batchable b2 = new Batchable() { + @Override public List run(List inputPartNames) throws MetaException { String queryText = String.format(queryText0, makeParams(inputColNames.size()), makeParams(inputPartNames.size())); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index c95749cb65..8a481265ac 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -46,11 +46,18 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.base.Predicates; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.ListUtils; @@ -1906,7 +1913,7 @@ public static void mergeColStats(ColumnStatistics csNew, ColumnStatistics csOld) // present in both, overwrite stats for columns absent in metastore and // leave alone columns stats missing from stats task. This last case may // leave stats in stale state. This will be addressed later. - LOG.debug("New ColumnStats size is {}, but old ColumnStats size is {}", + LOG.debug("New ColumnStats size is {}, but old ColumnStats size is {}", csNew.getStatsObj().size(), csOld.getStatsObjSize()); } // In this case, we have to find out which columns can be merged. 
@@ -1963,7 +1970,7 @@ public static MetaException newMetaException(String errorMessage, Exception e) { // given a list of partStats, this function will give you an aggr stats public static List aggrPartitionStats(List partStats, String dbName, String tableName, List partNames, List colNames, - boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner) + boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { // 1. group by the stats by colNames // map the colName to List @@ -1980,27 +1987,48 @@ public static MetaException newMetaException(String errorMessage, Exception e) { map.get(obj.getColName()).add(singleCS); } } - return aggrPartitionStats(map,dbName,tableName,partNames,colNames,areAllPartsFound,useDensityFunctionForNDVEstimation, ndvTuner); + return aggrPartitionStats(map, dbName, tableName, partNames, colNames, useDensityFunctionForNDVEstimation, ndvTuner); } public static List aggrPartitionStats( Map> map, String dbName, String tableName, - List partNames, List colNames, boolean areAllPartsFound, - boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { + final List partNames, List colNames, + final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException { List colStats = new ArrayList<>(); - // 2. aggr stats for each colName - // TODO: thread pool can be used to speed up the process - for (Entry> entry : map.entrySet()) { - List css = entry.getValue(); - ColumnStatsAggregator aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css - .iterator().next().getStatsObj().iterator().next().getStatsData().getSetField(), - useDensityFunctionForNDVEstimation, ndvTuner); - ColumnStatisticsObj statsObj = aggregator.aggregate(entry.getKey(), partNames, css); - colStats.add(statsObj); + // 2.
Aggregate stats for each column in a separate thread + final ExecutorService pool = Executors.newFixedThreadPool(Math.min(map.size(), 16), + new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build()); + final List> futures = Lists.newLinkedList(); + + long start = System.currentTimeMillis(); + for (final Entry> entry : map.entrySet()) { + futures.add(pool.submit(new Callable() { + @Override + public ColumnStatisticsObj call() throws Exception { + List css = entry.getValue(); + ColumnStatsAggregator aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css + .iterator().next().getStatsObj().iterator().next().getStatsData().getSetField(), + useDensityFunctionForNDVEstimation, ndvTuner); + ColumnStatisticsObj statsObj = aggregator.aggregate(entry.getKey(), partNames, css); + return statsObj; + }})); + } + pool.shutdown(); + for (Future future : futures) { + try { + colStats.add(future.get()); + } catch (InterruptedException | ExecutionException e) { + pool.shutdownNow(); + LOG.debug(e.toString()); + throw new MetaException(e.toString()); + } } + LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", + ((System.currentTimeMillis() - (double)start))/1000, Math.min(map.size(), 16)); return colStats; } + /** * Produce a hash for the storage descriptor * @param sd storage descriptor to hash diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java index ce98a6e68c..697cc2e8c1 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java @@ -81,19 +81,15 @@ import org.apache.hadoop.hive.metastore.api.UnknownDBException; import org.apache.hadoop.hive.metastore.api.UnknownPartitionException; import org.apache.hadoop.hive.metastore.api.UnknownTableException; -import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMerger; -import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMergerFactory; import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.util.StringUtils; import org.apache.hive.common.util.HiveStringUtils; import org.apache.thrift.TException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Lists; // TODO filter->expr // TODO functionCache @@ -277,6 +273,7 @@ void prewarm() throws Exception { synchronized void startCacheUpdateService() { if (cacheUpdateMaster == null) { cacheUpdateMaster = Executors.newScheduledThreadPool(1, new ThreadFactory() { + @Override public Thread newThread(Runnable r) { Thread t = Executors.defaultThreadFactory().newThread(r); t.setName("CachedStore-CacheUpdateService: Thread-" + t.getId()); @@ -321,7 +318,7 @@ void setCacheRefreshPeriod(long time) { static class CacheUpdateMasterWork implements Runnable { - private CachedStore cachedStore; + private final CachedStore cachedStore; public CacheUpdateMasterWork(CachedStore cachedStore) { this.cachedStore = cachedStore; } @@ -1540,63 +1537,51 @@ public boolean deletePartitionColumnStatistics(String dbName, String tableName, } @Override - public AggrStats get_aggr_stats_for(String dbName, String tblName, List partNames, - List colNames) throws MetaException,
NoSuchObjectException { - List colStats = new ArrayList(colNames.size()); - for (String colName : colNames) { - ColumnStatisticsObj colStat = - mergeColStatsForPartitions(HiveStringUtils.normalizeIdentifier(dbName), - HiveStringUtils.normalizeIdentifier(tblName), partNames, colName); - if (colStat == null) { - // Stop and fall back to underlying RawStore - colStats = null; - break; - } else { - colStats.add(colStat); - } - } - if (colStats == null) { - return rawStore.get_aggr_stats_for(dbName, tblName, partNames, colNames); - } else { + public AggrStats get_aggr_stats_for(String dbName, String tblName, List partNames, + List colNames) throws MetaException, NoSuchObjectException { + List colStats = mergeColStatsForPartitions( + HiveStringUtils.normalizeIdentifier(dbName), HiveStringUtils.normalizeIdentifier(tblName), + partNames, colNames); return new AggrStats(colStats, partNames.size()); - } - } - private ColumnStatisticsObj mergeColStatsForPartitions(String dbName, String tblName, - List partNames, String colName) throws MetaException { + } + + private List mergeColStatsForPartitions(String dbName, String tblName, + List partNames, List colNames) throws MetaException { final boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(getConf(), HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION); final double ndvTuner = HiveConf.getFloatVar(getConf(), HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER); Map> map = new HashMap<>(); - List list = new ArrayList<>(); - boolean areAllPartsFound = true; - for (String partName : partNames) { - String colStatsCacheKey = CacheUtils.buildKey(dbName, tblName, partNameToVals(partName), - colName); - List singleObj = new ArrayList<>(); - ColumnStatisticsObj colStatsForPart = SharedCache - .getCachedPartitionColStats(colStatsCacheKey); - if (colStatsForPart != null) { - singleObj.add(colStatsForPart); - ColumnStatisticsDesc css = new ColumnStatisticsDesc(false, dbName, tblName); - css.setPartName(partName); - list.add(new ColumnStatistics(css, singleObj)); - } else { - areAllPartsFound = false; + + for (String colName : colNames) { + List colStats = new ArrayList<>(); + for (String partName : partNames) { + String colStatsCacheKey = CacheUtils.buildKey(dbName, tblName, partNameToVals(partName), + colName); + List colStat = new ArrayList<>(); + ColumnStatisticsObj colStatsForPart = SharedCache + .getCachedPartitionColStats(colStatsCacheKey); + if (colStatsForPart != null) { + colStat.add(colStatsForPart); + ColumnStatisticsDesc csDesc = new ColumnStatisticsDesc(false, dbName, tblName); + csDesc.setPartName(partName); + colStats.add(new ColumnStatistics(csDesc, colStat)); + } else { + LOG.debug("Stats not found in CachedStore for: dbName={} tblName={} partName={} colName={}", + dbName, tblName, partName, colName); + } } + map.put(colName, colStats); } - map.put(colName, list); - List colNames = new ArrayList<>(); - colNames.add(colName); // Note that enableBitVector does not apply here because ColumnStatisticsObj // itself will tell whether // bitvector is null or not and aggr logic can automatically apply.
- return MetaStoreUtils - .aggrPartitionStats(map, dbName, tblName, partNames, colNames, areAllPartsFound, - useDensityFunctionForNDVEstimation, ndvTuner).iterator().next(); + return MetaStoreUtils.aggrPartitionStats(map, dbName, tblName, partNames, colNames, + useDensityFunctionForNDVEstimation, ndvTuner); } + @Override public long cleanupEvents() { return rawStore.cleanupEvents(); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java index 6fae3e5067..507d3eaf90 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java @@ -232,12 +232,13 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); } + LOG.info("Ndv estimation for " + colName + " is " + + columnStatisticsData.getDateStats().getNumDVs() + ". # of partitions requested: " + + partNames.size() + ". # of partitions found: " + css.size()); statsObj.setStatsData(columnStatisticsData); - LOG.debug("Ndv estimatation for " + colName + " is " - + columnStatisticsData.getDateStats().getNumDVs()); return statsObj; } - + private long diff(Date d1, Date d2) { return d1.getDaysSinceEpoch() - d2.getDaysSinceEpoch(); } @@ -264,9 +265,10 @@ public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, extractedAdjustedStatsMap.entrySet()); // get the lowValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return diff(o1.getValue().getLowValue(), o2.getValue().getLowValue()) < 0 ? -1 : 1; + return o1.getValue().getLowValue().compareTo(o2.getValue().getLowValue()); } }); double minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -286,9 +288,10 @@ public int compare(Map.Entry o1, // get the highValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return diff(o1.getValue().getHighValue(), o2.getValue().getHighValue()) < 0 ? -1 : 1; + return o1.getValue().getHighValue().compareTo(o2.getValue().getHighValue()); } }); minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -317,9 +320,10 @@ public int compare(Map.Entry o1, // get the ndv long ndv = 0; Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ?
-1 : 1; + return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); } }); long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java index c5e72ebd43..1b9594691d 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java @@ -40,7 +40,7 @@ public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implements IExtrapolatePartStatus { - + private static final Logger LOG = LoggerFactory.getLogger(DecimalColumnStatsAggregator.class); @Override @@ -253,9 +253,10 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); } + LOG.info("Ndv estimation for " + colName + " is " + + columnStatisticsData.getDecimalStats().getNumDVs() + ". # of partitions requested: " + + partNames.size() + ". # of partitions found: " + css.size()); statsObj.setStatsData(columnStatisticsData); - LOG.debug("Ndv estimatation for " + colName + " is " - + columnStatisticsData.getDecimalStats().getNumDVs()); return statsObj; } @@ -273,6 +274,7 @@ public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, extractedAdjustedStatsMap.entrySet()); // get the lowValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { return o1.getValue().getLowValue().compareTo(o2.getValue().getLowValue()); @@ -295,6 +297,7 @@ public int compare(Map.Entry o1, // get the highValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { return o1.getValue().getHighValue().compareTo(o2.getValue().getHighValue()); @@ -328,9 +331,10 @@ public int compare(Map.Entry o1, long ndvMin = 0; long ndvMax = 0; Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ? -1 : 1; + return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); } }); long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java index e55c41230d..e46b99f23f 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java @@ -228,8 +228,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); } - LOG.debug("Ndv estimatation for " + colName + " is " - + columnStatisticsData.getDoubleStats().getNumDVs()); + LOG.info("Ndv estimation for " + colName + " is " + + columnStatisticsData.getDoubleStats().getNumDVs() + ". # of partitions requested: " + + partNames.size() + ".
# of partitions found: " + css.size()); statsObj.setStatsData(columnStatisticsData); return statsObj; } @@ -248,9 +249,10 @@ public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, extractedAdjustedStatsMap.entrySet()); // get the lowValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getLowValue() < o2.getValue().getLowValue() ? -1 : 1; + return Double.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue()); } }); double minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -270,9 +272,10 @@ public int compare(Map.Entry o1, // get the highValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getHighValue() < o2.getValue().getHighValue() ? -1 : 1; + return Double.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue()); } }); minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -303,9 +306,10 @@ public int compare(Map.Entry o1, long ndvMin = 0; long ndvMax = 0; Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ? -1 : 1; + return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); } }); long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); @@ -341,5 +345,5 @@ public int compare(Map.Entry o1, extrapolateDoubleData.setNumDVs(ndv); extrapolateData.setDoubleStats(extrapolateDoubleData); } - + } diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java index 2ee09f3c47..7553dbd2f1 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java @@ -230,8 +230,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); } statsObj.setStatsData(columnStatisticsData); - LOG.debug("Ndv estimatation for " + colName + " is " - + columnStatisticsData.getLongStats().getNumDVs()); + LOG.info("Ndv estimation for " + colName + " is " + + columnStatisticsData.getLongStats().getNumDVs() + ". # of partitions requested: " + + partNames.size() + ". # of partitions found: " + css.size()); return statsObj; } @@ -249,9 +251,10 @@ public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, extractedAdjustedStatsMap.entrySet()); // get the lowValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getLowValue() < o2.getValue().getLowValue() ? -1 : 1; + return Long.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue()); } }); double minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -271,9 +274,10 @@ public int compare(Map.Entry o1, // get the highValue Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getHighValue() < o2.getValue().getHighValue() ?
-1 : 1; + return Long.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue()); } }); minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -302,9 +306,10 @@ public int compare(Map.Entry o1, // get the ndv long ndv = 0; Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ? -1 : 1; + return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); } }); long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java index 2ea2fcca05..05c6528640 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java @@ -201,8 +201,9 @@ public ColumnStatisticsObj aggregate(String colName, List partNames, extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, -1); } - LOG.debug("Ndv estimatation for " + colName + " is " - + columnStatisticsData.getStringStats().getNumDVs()); + LOG.info("Ndv estimation for " + colName + " is " + + columnStatisticsData.getStringStats().getNumDVs() + ". # of partitions requested: " + + partNames.size() + ". # of partitions found: " + css.size()); statsObj.setStatsData(columnStatisticsData); return statsObj; } @@ -221,9 +222,10 @@ public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, extractedAdjustedStatsMap.entrySet()); // get the avgLen Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getAvgColLen() < o2.getValue().getAvgColLen() ? -1 : 1; + return Double.compare(o1.getValue().getAvgColLen(), o2.getValue().getAvgColLen()); } }); double minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -243,9 +245,10 @@ public int compare(Map.Entry o1, // get the maxLen Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getMaxColLen() < o2.getValue().getMaxColLen() ? -1 : 1; + return Long.compare(o1.getValue().getMaxColLen(), o2.getValue().getMaxColLen()); } }); minInd = adjustedIndexMap.get(list.get(0).getKey()); @@ -274,9 +277,10 @@ public int compare(Map.Entry o1, // get the ndv long ndv = 0; Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { - return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ? -1 : 1; + return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); } }); minInd = adjustedIndexMap.get(list.get(0).getKey()); -- 2.11.0 (Apple Git-81)
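Note on the batching pattern in MetaStoreDirectSql: the patch routes its IN-list queries through runBatched() with anonymous Batchable subclasses, so databases that cap the number of IN parameters (batchSize is 1000 when DatabaseProduct.needsInBatching(dbType) holds, NO_BATCHING otherwise) only ever see bounded chunks. A minimal self-contained sketch of that idea follows; the generic signatures and the BATCH_SIZE constant are illustrative reconstructions, since the original type arguments were stripped from this copy of the patch.

    import java.util.ArrayList;
    import java.util.List;

    abstract class Batchable<I, R> {
      public abstract List<R> run(List<I> input) throws Exception;
    }

    final class Batching {
      // Illustrative cap; MetaStoreDirectSql uses 1000 for databases that need batching.
      private static final int BATCH_SIZE = 1000;

      static <I, R> List<R> runBatched(List<I> input, Batchable<I, R> work) throws Exception {
        if (input.size() <= BATCH_SIZE) {
          return work.run(input); // small enough, no batching needed
        }
        List<R> result = new ArrayList<>();
        for (int from = 0; from < input.size(); from += BATCH_SIZE) {
          int to = Math.min(from + BATCH_SIZE, input.size());
          // Each chunk becomes one query with at most BATCH_SIZE IN parameters.
          result.addAll(work.run(input.subList(from, to)));
        }
        return result;
      }
    }

Concatenating the per-chunk results keeps callers oblivious to whether batching happened at all.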
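Note on extractSqlBlob(): the bit vector retrieval fix hinges on the JDBC driver's return type for the stats column — Derby and Oracle hand back java.sql.Blob, while MySQL, Postgres and SQL Server hand back a raw byte[]. A standalone sketch of the same dispatch, under the assumption (as in the patch itself) that the platform default charset is acceptable for the byte-to-String conversion; passing an explicit charset would be the more defensive choice.

    import java.sql.Blob;
    import java.sql.SQLException;

    final class BlobExtractor {
      static String extractSqlBlob(Object value) throws SQLException {
        if (value == null) {
          return null;
        }
        if (value instanceof Blob) {
          // Derby, Oracle: JDBC Blob offsets are 1-based, per Blob.getBytes.
          Blob blob = (Blob) value;
          return new String(blob.getBytes(1, (int) blob.length()));
        }
        if (value instanceof byte[]) {
          // MySQL, Postgres, SQL Server: the driver already materialized the bytes.
          // Uses the platform default charset, mirroring the patch.
          return new String((byte[]) value);
        }
        // May happen when bit vectors are disabled and the column holds something else.
        return null;
      }
    }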
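Note on the parallel aggregation in aggrPartitionStats(): one Callable per column is submitted to a pool of at most 16 daemon threads, the pool is shut down, and the Futures are drained in order, with shutdownNow() on the first failure. One edge the patch does not guard: for an empty map, Math.min(map.size(), 16) is 0, and Executors.newFixedThreadPool() throws IllegalArgumentException for a non-positive thread count. The generic sketch below (names illustrative, not the Hive API) adds that guard.

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.ExecutionException;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.function.BiFunction;

    final class ParallelAggregation {
      static <K, V, R> List<R> aggregatePerKey(Map<K, V> work, BiFunction<K, V, R> aggregate)
          throws InterruptedException, ExecutionException {
        if (work.isEmpty()) {
          return Collections.emptyList(); // newFixedThreadPool(0) would throw
        }
        ExecutorService pool = Executors.newFixedThreadPool(Math.min(work.size(), 16));
        List<Future<R>> futures = new ArrayList<>();
        for (Map.Entry<K, V> e : work.entrySet()) {
          // One task per key; the real code builds and runs a ColumnStatsAggregator here.
          futures.add(pool.submit(() -> aggregate.apply(e.getKey(), e.getValue())));
        }
        pool.shutdown(); // no new tasks; already-submitted ones keep running
        List<R> results = new ArrayList<>();
        for (Future<R> f : futures) {
          try {
            results.add(f.get());
          } catch (InterruptedException | ExecutionException ex) {
            pool.shutdownNow(); // cancel outstanding work on first failure, as the patch does
            throw ex;
          }
        }
        return results;
      }
    }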
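Note on the comparator rewrites across the aggregators: the replaced form `o1 < o2 ? -1 : 1` never returns 0, so compare(x, x) == 1, which violates the Comparator contract — TimSort may reject such comparators with "Comparison method violates its general contract!" — and the relational operator also forces unboxing, where Long.compare and Double.compare return the proper -1/0/1 (and Double.compare gives NaN a defined order). A tiny demonstration:

    import java.util.Arrays;
    import java.util.Comparator;

    public class ComparatorContractDemo {
      public static void main(String[] args) {
        Comparator<Long> broken = (a, b) -> a < b ? -1 : 1; // never reports equality
        Comparator<Long> fixed = Long::compare;

        Long x = 42L;
        System.out.println(broken.compare(x, x)); // 1 -- breaks compare(x, x) == 0
        System.out.println(fixed.compare(x, x));  // 0

        Long[] vals = {3L, 1L, 3L, 2L};
        Arrays.sort(vals, fixed); // safe even with duplicate keys
        System.out.println(Arrays.toString(vals)); // [1, 2, 3, 3]
      }
    }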