diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
index b10e7b07b1..1f789fd48b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
@@ -95,9 +95,7 @@
   Map colStatsCache;
   AtomicInteger noColsMissingStats;
 
-  protected static final Logger LOG = LoggerFactory
-      .getLogger(RelOptHiveTable.class
-          .getName());
+  protected static final Logger LOG = LoggerFactory.getLogger(RelOptHiveTable.class.getName());
 
   public RelOptHiveTable(RelOptSchema calciteSchema, String qualifiedTblName, RelDataType rowType,
       Table hiveTblMetadata, List hiveNonPartitionCols,
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
index fd461ae529..6babe497b2 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
@@ -38,9 +38,16 @@
   public enum State {
     NONE, PARTIAL, COMPLETE;
 
-    boolean morePreciseThan(State other) {
+    public boolean morePreciseThan(State other) {
       return ordinal() >= other.ordinal();
     }
+
+    public State merge(State otherState) {
+      if (this == otherState) {
+        return this;
+      }
+      return PARTIAL;
+    }
   }
 
   private long numRows;
@@ -313,8 +320,7 @@ public void setRunTimeNumRows(long runTimeNumRows) {
   }
 
   public Statistics scaleToRowCount(long newRowCount, boolean downScaleOnly) {
-    Statistics ret;
-    ret = clone();
+    Statistics ret = clone();
     if (numRows == 0) {
       return ret;
     }
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
new file mode 100644
index 0000000000..ae278abf83
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
@@ -0,0 +1,321 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.stats;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.apache.hadoop.hive.ql.plan.Statistics.State;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+import com.google.common.util.concurrent.MoreExecutors;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
+public class BasicStats {
+
+  private static final Logger LOG = LoggerFactory.getLogger(BasicStats.class.getName());
+
+  public static class Factory {
+
+    private final List<IStatsEnhancer> enhancers = new LinkedList<>();
+
+    public Factory(IStatsEnhancer... enhancers) {
+      this.enhancers.addAll(Arrays.asList(enhancers));
+    }
+
+    public void addEnhancer(IStatsEnhancer enhancer) {
+      enhancers.add(enhancer);
+    }
+
+    public BasicStats build(Partish p) {
+      BasicStats ret = new BasicStats(p);
+      for (IStatsEnhancer enhancer : enhancers) {
+        ret.apply(enhancer);
+      }
+      return ret;
+    }
+
+    public List<BasicStats> buildAll(HiveConf conf, Collection<Partish> parts) {
+      LOG.info("Number of partishes : " + parts.size());
+
+      final List<BasicStats> ret = new ArrayList<>(parts.size());
+      if (parts.size() <= 1) {
+        for (Partish partish : parts) {
+          ret.add(build(partish));
+        }
+        return ret;
+      }
+
+      List<Future<BasicStats>> futures = new ArrayList<>();
+
+      int threads = conf.getIntVar(ConfVars.METASTORE_FS_HANDLER_THREADS_COUNT);
+
+      final ExecutorService pool;
+      if (threads <= 1) {
+        pool = MoreExecutors.sameThreadExecutor();
+      } else {
+        pool = Executors.newFixedThreadPool(threads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Partitions-Size-%d").build());
+      }
+
+      for (final Partish part : parts) {
+        futures.add(pool.submit(new Callable<BasicStats>() {
+          @Override
+          public BasicStats call() throws Exception {
+            return build(part);
+          }
+        }));
+      }
+
+      try {
+        for (int i = 0; i < futures.size(); i++) {
+          ret.add(i, futures.get(i).get());
+        }
+      } catch (InterruptedException | ExecutionException e) {
+        LOG.warn("Exception in processing files ", e);
+      } finally {
+        pool.shutdownNow();
+      }
+      return ret;
+    }
+  }
+
+  public static interface IStatsEnhancer {
+    void apply(BasicStats stats);
+  }
+
+  public static class SetMinRowNumber implements IStatsEnhancer {
+
+    @Override
+    public void apply(BasicStats stats) {
+      if (stats.getNumRows() == 0) {
+        stats.setNumRows(1);
+      }
+    }
+  }
+
+  public static class SetMinRowNumber01 implements IStatsEnhancer {
+
+    @Override
+    public void apply(BasicStats stats) {
+      if (stats.getNumRows() == 0 || stats.getNumRows() == -1) {
+        stats.setNumRows(1);
+      }
+    }
+  }
+
+  public static class RowNumEstimator implements IStatsEnhancer {
+
+    private long avgRowSize;
+
+    // FIXME: this is most probably broken; the call-site depends on neededColumns, which indicates that it might miscalculate the value
+    // HIVE-18108
+    public RowNumEstimator(long avgRowSize) {
+      this.avgRowSize = avgRowSize;
+      if (avgRowSize > 0) {
+        if (LOG.isDebugEnabled()) {
+ LOG.debug("Estimated average row size: " + avgRowSize); + } + } + } + + @Override + public void apply(BasicStats stats) { + // FIXME: there were different logic for part/table; merge these logics later + if (stats.partish.getPartition() == null) { + if (stats.getNumRows() < 0 && avgRowSize > 0) { + stats.setNumRows(stats.getDataSize() / avgRowSize); + } + } else { + if (avgRowSize > 0) { + long rc = stats.getNumRows(); + long s = stats.getDataSize(); + if (rc <= 0 && s > 0) { + rc = s / avgRowSize; + stats.setNumRows(rc); + } + + if (s <= 0 && rc > 0) { + s = StatsUtils.safeMult(rc, avgRowSize); + stats.setDataSize(s); + } + } + } + } + } + + public static class DataSizeEstimator implements IStatsEnhancer { + + private HiveConf conf; + private float deserFactor; + + public DataSizeEstimator(HiveConf conf) { + this.conf = conf; + deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR); + } + + @Override + public void apply(BasicStats stats) { + long ds = stats.getRawDataSize(); + if (ds <= 0) { + ds = stats.getTotalSize(); + + // if data size is still 0 then get file size + if (ds <= 0) { + Path path = stats.partish.getPath(); + try { + ds = getFileSizeForPath(path); + } catch (IOException e) { + ds = 0L; + } + } + ds = (long) (ds * deserFactor); + + stats.setDataSize(ds); + } + + } + + private long getFileSizeForPath(Path path) throws IOException { + FileSystem fs = path.getFileSystem(conf); + return fs.getContentSummary(path).getLength(); + } + + } + + private Partish partish; + + private long rowCount; + private long totalSize; + private long rawDataSize; + + private long currentNumRows; + private long currentDataSize; + private Statistics.State state; + + public BasicStats(Partish p) { + partish = p; + + rowCount = parseLong(StatsSetupConst.ROW_COUNT); + rawDataSize = parseLong(StatsSetupConst.RAW_DATA_SIZE); + totalSize = parseLong(StatsSetupConst.TOTAL_SIZE); + + currentNumRows = rowCount; + currentDataSize = rawDataSize; + + if (currentNumRows > 0) { + state = State.COMPLETE; + } else { + state = State.NONE; + } + } + + + public BasicStats(List partStats) { + partish = null; + List nrIn = Lists.newArrayList(); + List dsIn = Lists.newArrayList(); + state = (partStats.size() == 0) ? 
+    for (BasicStats ps : partStats) {
+      nrIn.add(ps.getNumRows());
+      dsIn.add(ps.getDataSize());
+
+      if (state == null) {
+        state = ps.getState();
+      } else {
+        state = state.merge(ps.getState());
+      }
+    }
+    currentNumRows = StatsUtils.getSumIgnoreNegatives(nrIn);
+    currentDataSize = StatsUtils.getSumIgnoreNegatives(dsIn);
+
+  }
+
+  public long getNumRows() {
+    return currentNumRows;
+  }
+
+  public long getDataSize() {
+    return currentDataSize;
+  }
+
+  public Statistics.State getState() {
+    return state;
+  }
+
+  void apply(IStatsEnhancer estimator) {
+    estimator.apply(this);
+  }
+
+  protected void setNumRows(long l) {
+    currentNumRows = l;
+  }
+
+  protected void setDataSize(long ds) {
+    currentDataSize = ds;
+  }
+
+  protected long getTotalSize() {
+    return totalSize;
+  }
+
+  protected long getRawDataSize() {
+    return rawDataSize;
+  }
+
+  private long parseLong(String fieldName) {
+    Map<String, String> params = partish.getPartParameters();
+    long result = -1;
+
+    if (params != null) {
+      try {
+        result = Long.parseLong(params.get(fieldName));
+      } catch (NumberFormatException e) {
+        result = -1;
+      }
+    }
+    return result;
+  }
+
+  public static BasicStats buildFrom(List<BasicStats> partStats) {
+    return new BasicStats(partStats);
+  }
+
+  @Override
+  public String toString() {
+    return String.format("BasicStats: %d, %d %s", getNumRows(), getDataSize(), getState());
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/Partish.java ql/src/java/org/apache/hadoop/hive/ql/stats/Partish.java
index 05b0474e90..4daae2ad95 100644
--- ql/src/java/org/apache/hadoop/hive/ql/stats/Partish.java
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/Partish.java
@@ -20,6 +20,7 @@
 
 import java.util.Map;
 
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.ql.io.AcidUtils;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -124,6 +125,11 @@ public String getLocation() {
     public String getSimpleName() {
       return String.format("Table %s.%s", table.getDbName(), table.getTableName());
     }
+
+    @Override
+    public Path getPath() {
+      return table.getPath();
+    }
   }
 
   static class PPart extends Partish {
@@ -181,6 +187,13 @@ public String getSimpleName() {
       return String.format("Partition %s.%s %s", table.getDbName(), table.getTableName(), partition.getSpec());
     }
 
+    @Override
+    public Path getPath() {
+      return partition.getPartitionPath();
+    }
+  }
+
+  public abstract Path getPath();
+
 }
\ No newline at end of file
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index cef87f5957..154e04b7a3 100644
--- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -69,6 +69,7 @@
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.ql.plan.Statistics.State;
+import org.apache.hadoop.hive.ql.stats.BasicStats.Factory;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
 import org.apache.hadoop.hive.ql.udf.generic.NDV;
@@ -160,116 +161,48 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
         fetchColStats, testMode);
   }
 
-  private static long getDataSize(HiveConf conf, Table table) {
-    long ds = getRawDataSize(table);
-    if (ds <= 0) {
-      ds = getTotalSize(table);
-
-      // if data size is still 0 then get file size
-      if (ds <= 0) {
-        ds = getFileSizeForTable(conf, table);
-      }
-      float deserFactor =
-          HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
-      ds = (long) (ds * deserFactor);
-    }
-
-    return ds;
-  }
-
   /**
    * Returns number of rows if it exists. Otherwise it estimates number of rows
    * based on estimated data size for both partition and non-partitioned table
    * RelOptHiveTable's getRowCount uses this.
-   *
-   * @param conf
-   * @param schema
-   * @param table
-   * @return
    */
-  public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table table,
-      PrunedPartitionList partitionList, AtomicInteger noColsMissingStats) {
-
-    boolean shouldEstimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
-
-    if(!table.isPartitioned()) {
-      //get actual number of rows from metastore
-      long nr = getNumRows(table);
+  public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table table, PrunedPartitionList partitionList, AtomicInteger noColsMissingStats) {
 
-      // log warning if row count is missing
-      if(nr <= 0) {
-        noColsMissingStats.getAndIncrement();
-      }
-
-      // if row count exists or stats aren't to be estimated return
-      // whatever we have
-      if(nr > 0 || !shouldEstimateStats) {
-        return nr;
+    List<Partish> inputs = new ArrayList<>();
+    if (table.isPartitioned()) {
+      for (Partition part : partitionList.getNotDeniedPartns()) {
+        inputs.add(Partish.buildFor(table, part));
       }
-      // go ahead with the estimation
-      long ds = getDataSize(conf, table);
-      return getNumRows(conf, schema, table, ds);
+    } else {
+      inputs.add(Partish.buildFor(table));
     }
-    else { // partitioned table
-      long nr = 0;
-      List<Long> rowCounts = Lists.newArrayList();
-      rowCounts = getBasicStatForPartitions(
-          table, partitionList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
-      nr = getSumIgnoreNegatives(rowCounts);
-
-      // log warning if row count is missing
-      if(nr <= 0) {
-        noColsMissingStats.getAndIncrement();
-      }
-      // if row count exists or stats aren't to be estimated return
-      // whatever we have
-      if(nr > 0 || !shouldEstimateStats) {
-        return nr;
-      }
-
-      // estimate row count
-      long ds = 0;
-      List<Long> dataSizes = Lists.newArrayList();
+    Factory basicStatsFactory = new BasicStats.Factory();
 
-      dataSizes = getBasicStatForPartitions(
-          table, partitionList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
-
-      ds = getSumIgnoreNegatives(dataSizes);
+    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS)) {
+      basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
+      basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
+    }
 
-      float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
+    List<BasicStats> results = new ArrayList<>();
+    for (Partish pi : inputs) {
 
-      if (ds <= 0) {
-        dataSizes = getBasicStatForPartitions(
-            table, partitionList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
-        dataSizes = safeMult(dataSizes, deserFactor);
-        ds = getSumIgnoreNegatives(dataSizes);
+      BasicStats bStats = new BasicStats(pi);
+      long nr = bStats.getNumRows();
+      // FIXME parallel!
+      // FIXME: this point will be lost after the factory; check that it's really a warning... cleanup/etc
+      if (nr <= 0) {
+        // log warning if row count is missing
+        noColsMissingStats.getAndIncrement();
+      }
+      results.add(basicStatsFactory.build(pi));
+    }
 
-      // if data size still could not be determined, then fall back to filesytem to get file
-      // sizes
-      if (ds <= 0 && shouldEstimateStats) {
-        dataSizes = getFileSizeForPartitions(conf, partitionList.getNotDeniedPartns());
-        dataSizes = safeMult(dataSizes, deserFactor);
-        ds = getSumIgnoreNegatives(dataSizes);
-      }
+    BasicStats aggregateStat = BasicStats.buildFrom(results);
 
-      int avgRowSize = estimateRowSizeFromSchema(conf, schema);
-      if (avgRowSize > 0) {
-        setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
-        nr = getSumIgnoreNegatives(rowCounts);
-        ds = getSumIgnoreNegatives(dataSizes);
+    aggregateStat.apply(new BasicStats.SetMinRowNumber01());
 
-        // number of rows -1 means that statistics from metastore is not reliable
-        if (nr <= 0) {
-          nr = ds / avgRowSize;
-        }
-      }
-      if (nr == 0) {
-        nr = 1;
-      }
-      return nr;
-    }
+    return aggregateStat.getNumRows();
   }
 
   private static void estimateStatsForMissingCols(List<String> neededColumns, List<ColStatistics> columnStats,
@@ -292,26 +225,6 @@ private static void estimateStatsForMissingCols(List neededColumns, List
     }
   }
 
-  private static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table table, long ds) {
-    long nr = getNumRows(table);
-    // number of rows -1 means that statistics from metastore is not reliable
-    // and 0 means statistics gathering is disabled
-    // estimate only if num rows is -1 since 0 could be actual number of rows
-    if (nr < 0) {
-      int avgRowSize = estimateRowSizeFromSchema(conf, schema);
-      if (avgRowSize > 0) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Estimated average row size: " + avgRowSize);
-        }
-        nr = ds / avgRowSize;
-      }
-    }
-    if(nr == 0 || nr == -1) {
-      return 1;
-    }
-    return nr;
-  }
-
   public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
       Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache,
      List<String> referencedColumns, boolean fetchColStats)
@@ -320,24 +233,33 @@ public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList pa
        referencedColumns, fetchColStats, false);
   }
 
-  private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
-      Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache,
-      List<String> referencedColumns, boolean fetchColStats, boolean failIfCacheMiss)
-      throws HiveException {
+  private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table,
+      List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache,
+      List<String> referencedColumns, boolean fetchColStats, boolean failIfCacheMiss) throws HiveException {
 
     Statistics stats = null;
 
-    float deserFactor =
-        HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
     boolean shouldEstimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
 
     if (!table.isPartitioned()) {
 
-      //getDataSize tries to estimate stats if it doesn't exist using file size
-      // we would like to avoid file system calls if it too expensive
-      long ds = shouldEstimateStats? getDataSize(conf, table): getRawDataSize(table);
-      long nr = getNumRows(conf, schema, table, ds);
+      Factory basicStatsFactory = new BasicStats.Factory();
+
+      if (shouldEstimateStats) {
+        basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
+      }
+
+      // long ds = shouldEstimateStats? getDataSize(conf, table): getRawDataSize(table);
+      basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
+      basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());
+
+      BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));
+
+      // long nr = getNumRows(conf, schema, neededColumns, table, ds);
+      long ds = basicStats.getDataSize();
+      long nr = basicStats.getNumRows();
       List<ColStatistics> colStats = Lists.newArrayList();
+
       if (fetchColStats) {
         colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache);
         if(colStats == null) {
@@ -357,59 +279,42 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p
       stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
       stats.addToColumnStats(colStats);
     } else if (partList != null) {
+
       // For partitioned tables, get the size of all the partitions after pruning
       // the partitions that are not required
-      long nr = 0;
-      long ds = 0;
-      List<Long> rowCounts = Lists.newArrayList();
-      List<Long> dataSizes = Lists.newArrayList();
+      Factory basicStatsFactory = new Factory();
+      if (shouldEstimateStats) {
+        // FIXME: misses parallelism
+        basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
+      }
 
-      rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
-      dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
+      basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
 
-      nr = getSumIgnoreNegatives(rowCounts);
-      ds = getSumIgnoreNegatives(dataSizes);
-      if (ds <= 0) {
-        dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
-        dataSizes = safeMult(dataSizes, deserFactor);
-        ds = getSumIgnoreNegatives(dataSizes);
-      }
+      List<BasicStats> partStats = new ArrayList<>();
 
-      // if data size still could not be determined, then fall back to filesytem to get file
-      // sizes
-      if (ds <= 0 && shouldEstimateStats) {
-        dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
-        dataSizes = safeMult(dataSizes, deserFactor);
-        ds = getSumIgnoreNegatives(dataSizes);
+      for (Partition p : partList.getNotDeniedPartns()) {
+        BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table, p));
+        partStats.add(basicStats);
       }
+      BasicStats bbs = BasicStats.buildFrom(partStats);
 
-      int avgRowSize = estimateRowSizeFromSchema(conf, schema);
-      if (avgRowSize > 0) {
-        setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
-        nr = getSumIgnoreNegatives(rowCounts);
-        ds = getSumIgnoreNegatives(dataSizes);
-
-        // number of rows -1 means that statistics from metastore is not reliable
-        if (nr <= 0) {
-          nr = ds / avgRowSize;
-        }
+      List<Long> rowCounts = Lists.newArrayList();
+      for (BasicStats basicStats : partStats) {
+        rowCounts.add(basicStats.getNumRows());
       }
 
-      // Minimum values
+      long nr = bbs.getNumRows();
+      long ds = bbs.getDataSize();
+
       if (nr == 0) {
-        nr = 1;
+        nr=1;
       }
       stats = new Statistics(nr, ds);
+      stats.setBasicStatsState(bbs.getState());
 
-      // if at least a partition does not contain row count then mark basic stats state as PARTIAL
-      if (containsNonPositives(rowCounts) &&
-          stats.getBasicStatsState().equals(State.COMPLETE)) {
-        stats.setBasicStatsState(State.PARTIAL);
-      }
       if (fetchColStats) {
-        List partitionCols = getPartitionColumns(
-            schema, neededColumns, referencedColumns);
+        List partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
 
         // We will retrieve stats from the metastore only for columns that are not cached
         List<String> neededColsToRetrieve;
@@ -744,26 +649,6 @@ private static Range getRangePartitionColumn(PartitionIterable partitions, Strin
     return range;
   }
 
-  private static void setUnknownRcDsToAverage(
-      List<Long> rowCounts, List<Long> dataSizes, int avgRowSize) {
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("Estimated average row size: " + avgRowSize);
-    }
-    for (int i = 0; i < rowCounts.size(); i++) {
-      long rc = rowCounts.get(i);
-      long s = dataSizes.get(i);
-      if (rc <= 0 && s > 0) {
-        rc = s / avgRowSize;
-        rowCounts.set(i, rc);
-      }
-
-      if (s <= 0 && rc > 0) {
-        s = safeMult(rc, avgRowSize);
-        dataSizes.set(i, s);
-      }
-    }
-  }
-
   public static int estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema) {
     List<String> neededColumns = new ArrayList<>();
     for (ColumnInfo ci : schema) {
@@ -836,6 +721,7 @@ public static long getFileSizeForTable(HiveConf conf, Table table) {
    *          - partition list
    * @return sizes of partitions
    */
+  @Deprecated
   public static List<Long> getFileSizeForPartitions(final HiveConf conf, List<Partition> parts) {
     LOG.info("Number of partitions : " + parts.size());
     ArrayList<Future<Long>> futures = new ArrayList<>();
@@ -876,7 +762,7 @@ public Long call() throws Exception {
     return sizes;
   }
 
-  private static boolean containsNonPositives(List<Long> vals) {
+  public static boolean containsNonPositives(List<Long> vals) {
     for (Long val : vals) {
       if (val <= 0L) {
         return true;
@@ -1054,8 +940,8 @@ else if(colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)){
       cs.setAvgColLen(getAvgColLenOf(conf,cinfo.getObjectInspector(), cinfo.getTypeName()));
     } else if (colTypeLowerCase.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
       cs.setCountDistint(2);
-      cs.setNumTrues(Math.max(1, (long)numRows/2));
-      cs.setNumFalses(Math.max(1, (long)numRows/2));
+      cs.setNumTrues(Math.max(1, numRows/2));
+      cs.setNumFalses(Math.max(1, numRows/2));
       cs.setAvgColLen(JavaDataModel.get().primitive1());
     } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME) ||
         colTypeLowerCase.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)) {
@@ -1729,12 +1615,13 @@ private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Stati
     }
     long countDistincts = ndvs.isEmpty() ? numRows : addWithExpDecay(ndvs);
     return Collections.min(Lists.newArrayList(countDistincts, udfNDV, numRows));
-  }
+  }
 
   /**
    * Get number of rows of a give table
    * @return number of rows
    */
+  @Deprecated
   public static long getNumRows(Table table) {
     return getBasicStatForTable(table, StatsSetupConst.ROW_COUNT);
   }
@@ -1763,6 +1650,7 @@ public static long getTotalSize(Table table) {
    *          - type of stats
    * @return value of stats
    */
+  @Deprecated
   public static long getBasicStatForTable(Table table, String statType) {
     Map<String, String> params = table.getParameters();
     long result = -1;
diff --git ql/src/test/org/apache/hadoop/hive/ql/stats/TestBasicStats.java ql/src/test/org/apache/hadoop/hive/ql/stats/TestBasicStats.java
new file mode 100644
index 0000000000..e1adcedefe
--- /dev/null
+++ ql/src/test/org/apache/hadoop/hive/ql/stats/TestBasicStats.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.stats;
+
+import static org.junit.Assert.assertEquals;
+import static org.mockito.Mockito.doReturn;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.plan.Statistics.State;
+import org.apache.hadoop.hive.ql.stats.BasicStats;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import com.google.common.collect.Lists;
+
+public class TestBasicStats {
+
+  public static class LocalPartishBuilder {
+    Map<String, String> params = new HashMap<>();
+
+    public LocalPartishBuilder numRows(int i) {
+      params.put(StatsSetupConst.ROW_COUNT, String.valueOf(i));
+      return this;
+    }
+
+    public LocalPartishBuilder rawDataSize(int i) {
+      params.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(i));
+      return this;
+    }
+
+    public LocalPartishBuilder totalSize(int i) {
+      params.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(i));
+      return this;
+    }
+
+    public Partish buildPartition() {
+      Partition partition = Mockito.mock(Partition.class);
+      org.apache.hadoop.hive.metastore.api.Partition tpartition = Mockito.mock(org.apache.hadoop.hive.metastore.api.Partition.class);
+      doReturn(tpartition).when(partition).getTPartition();
+      doReturn(params).when(tpartition).getParameters();
+      return Partish.buildFor(null, partition);
+    }
+  }
+
+  @Test
+  public void testDataSizeEstimator() {
+    Partish p1 = new LocalPartishBuilder().totalSize(10).buildPartition();
+
+    HiveConf conf = new HiveConf();
+    conf.setFloatVar(ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR, 13.0f);
+    BasicStats.Factory factory = new BasicStats.Factory(new BasicStats.DataSizeEstimator(conf));
+
+    BasicStats res = factory.build(p1);
+
+    assertEquals(130, res.getDataSize());
+  }
+
+  @Test
+  public void mergeWithEmpty() {
+    Partish p0 = new LocalPartishBuilder().numRows(10).rawDataSize(10).buildPartition();
+    Partish p1 = new LocalPartishBuilder().totalSize(10).buildPartition();
+
+    HiveConf conf = new HiveConf();
+    BasicStats.Factory factory = new BasicStats.Factory(new BasicStats.DataSizeEstimator(conf), new BasicStats.RowNumEstimator(1));
+
+    BasicStats bs0 = factory.build(p0);
+    BasicStats bs1 = factory.build(p1);
+
+    BasicStats res = BasicStats.buildFrom(Lists.newArrayList(bs0, bs1));
+
+    assertEquals(20, res.getNumRows());
+    assertEquals(20, res.getDataSize());
+  }
+
+  @Test
+  @Ignore("HIVE-18062 will fix this")
+  public void mergedKeepsPartialStateEvenIfValuesAreSuccessfullyEstimated() {
+    Partish p0 = new LocalPartishBuilder().numRows(10).rawDataSize(10).buildPartition();
+    Partish p1 = new LocalPartishBuilder().totalSize(10).buildPartition();
+
+    HiveConf conf = new HiveConf();
+    BasicStats.Factory factory = new BasicStats.Factory(new BasicStats.DataSizeEstimator(conf), new BasicStats.RowNumEstimator(1));
+
+    BasicStats bs0 = factory.build(p0);
+    BasicStats bs1 = factory.build(p1);
+
+    BasicStats res = BasicStats.buildFrom(Lists.newArrayList(bs0, bs1));
+
+    assertEquals(State.PARTIAL, res.getState());
+  }
+
+
+}
diff --git ql/src/test/queries/clientpositive/stats8.q ql/src/test/queries/clientpositive/stats8.q
index 46002caf4a..74471aced6 100644
--- ql/src/test/queries/clientpositive/stats8.q
+++ ql/src/test/queries/clientpositive/stats8.q
@@ -6,11 +6,13 @@ set hive.exec.dynamic.partition.mode=nonstrict;
 create table analyze_srcpart like srcpart;
 insert overwrite table analyze_srcpart partition (ds, hr) select * from srcpart where ds is not null;
 
+describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=11);
 explain analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=11) compute statistics;
 analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=11) compute statistics;
 
 describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=11);
 describe formatted analyze_srcpart;
+describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=12);
 explain analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=12) compute statistics;
 analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=12) compute statistics;
 describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=12);
diff --git ql/src/test/results/clientpositive/stats8.q.out ql/src/test/results/clientpositive/stats8.q.out
index da92f96f6a..c6216649d7 100644
--- ql/src/test/results/clientpositive/stats8.q.out
+++ ql/src/test/results/clientpositive/stats8.q.out
@@ -33,6 +33,41 @@ POSTHOOK: Lineage: analyze_srcpart PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(s
 POSTHOOK: Lineage: analyze_srcpart PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
 POSTHOOK: Lineage: analyze_srcpart PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: analyze_srcpart PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=11)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@analyze_srcpart
+POSTHOOK: query: describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=11)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@analyze_srcpart
+# col_name            	data_type           	comment
+key                 	string              	default
+value               	string              	default
+
+# Partition Information
+# col_name            	data_type           	comment
+ds                  	string
+hr                  	string
+
+# Detailed Partition Information
+Partition Value:    	[2008-04-08, 11]
+Database:           	default
+Table:              	analyze_srcpart
+#### A masked pattern was here ####
+Partition Parameters:
+	numFiles            	1
+	totalSize           	5812
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed:         	No
+Num Buckets:        	-1
+Bucket Columns:     	[]
+Sort Columns:       	[]
+Storage Desc Params:
+	serialization.format	1
 PREHOOK: query: explain analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=11) compute statistics
 PREHOOK: type: QUERY
 POSTHOOK: query: explain analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=11) compute statistics
@@ -47,7 +82,7 @@ STAGE PLANS:
     Map Operator Tree:
         TableScan
           alias: analyze_srcpart
-          Statistics: Num rows: 392 Data size: 232480 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 392 Data size: 232480 Basic stats: NONE Column stats: NONE
 
   Stage: Stage-1
     Stats Work
@@ -142,6 +177,41 @@ Bucket Columns:     	[]
 Sort Columns:       	[]
 Storage Desc Params:
 	serialization.format	1
+PREHOOK: query: describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=12)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@analyze_srcpart
+POSTHOOK: query: describe formatted analyze_srcpart PARTITION(ds='2008-04-08',hr=12)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@analyze_srcpart
+# col_name            	data_type           	comment
+key                 	string              	default
+value               	string              	default
+
+# Partition Information
+# col_name            	data_type           	comment
+ds                  	string
+hr                  	string
+
+# Detailed Partition Information
+Partition Value:    	[2008-04-08, 12]
+Database:           	default
+Table:              	analyze_srcpart
+#### A masked pattern was here ####
+Partition Parameters:
+	numFiles            	1
+	totalSize           	5812
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed:         	No
+Num Buckets:        	-1
+Bucket Columns:     	[]
+Sort Columns:       	[]
+Storage Desc Params:
+	serialization.format	1
 PREHOOK: query: explain analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=12) compute statistics
 PREHOOK: type: QUERY
 POSTHOOK: query: explain analyze table analyze_srcpart PARTITION(ds='2008-04-08',hr=12) compute statistics
@@ -156,7 +226,7 @@ STAGE PLANS:
     Map Operator Tree:
         TableScan
           alias: analyze_srcpart
-          Statistics: Num rows: 500 Data size: 5312 Basic stats: PARTIAL Column stats: NONE
+          Statistics: Num rows: 794 Data size: 179672 Basic stats: PARTIAL Column stats: NONE
 
   Stage: Stage-1
     Stats Work
@@ -226,7 +296,7 @@ STAGE PLANS:
     Map Operator Tree:
         TableScan
          alias: analyze_srcpart
-          Statistics: Num rows: 1000 Data size: 10624 Basic stats: PARTIAL Column stats: NONE
+          Statistics: Num rows: 1196 Data size: 126864 Basic stats: PARTIAL Column stats: NONE
 
   Stage: Stage-1
     Stats Work
@@ -296,7 +366,7 @@ STAGE PLANS:
     Map Operator Tree:
        TableScan
          alias: analyze_srcpart
-          Statistics: Num rows: 1500 Data size: 15936 Basic stats: PARTIAL Column stats: NONE
+          Statistics: Num rows: 1598 Data size: 74056 Basic stats: PARTIAL Column stats: NONE
 
   Stage: Stage-1
    Stats Work
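
Reviewer note, not part of the patch: a minimal sketch of how the BasicStats.Factory and enhancer classes introduced above are meant to compose; it mirrors the rewritten StatsUtils.getNumRows() path. The variables conf, schema and inputs are assumed to be in scope (a HiveConf, the table schema as List<ColumnInfo>, and one Partish per not-denied partition or one for an unpartitioned table).

// Illustrative sketch only; follows the refactored StatsUtils.getNumRows() in this patch.
BasicStats.Factory factory = new BasicStats.Factory();
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS)) {
  // Enhancers run in insertion order: estimate the data size first,
  // then derive a row count from the estimated average row size.
  factory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
  factory.addEnhancer(new BasicStats.RowNumEstimator(StatsUtils.estimateRowSizeFromSchema(conf, schema)));
}

List<BasicStats> perPart = new ArrayList<>();
for (Partish p : inputs) {
  perPart.add(factory.build(p));   // build() applies every registered enhancer to the raw metastore stats
}

BasicStats aggregate = BasicStats.buildFrom(perPart);   // sums rows/size, merges states via State.merge()
aggregate.apply(new BasicStats.SetMinRowNumber01());    // clamps a 0 or -1 row count to 1
long numRows = aggregate.getNumRows();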